用Python获取链家二手房房源数据，做可视化图分析数据_基于python的链家房源数据分析与可视化

作者：2023面试高手 | 2024-04-06 23:34:00

踩

基于python的链家房源数据分析与可视化

前言

数据采集的步骤是固定:

发送请求, 模拟浏览器对于url地址发送请求
获取数据, 获取网页数据内容 --> 请求那个链接地址, 返回服务器响应数据
解析数据, 提取我们需要的数据内容
保存数据, 保存本地文件

所需模块

win + R 输入cmd 输入安装命令 pip install 模块名 (如果你觉得安装速度比较慢, 你可以切换国内镜像源)

# 数据请求模块 第三方模块 需要安装 pip install requests
import requests
# 数据解析模块 第三方模块 需要安装 pip install parsel
import parsel
# 导入csv模块 内置模块 不需要安装
import csv  # 固定模板
# 导入pandas模块
import pandas as pd
1
2
3
4
5
6
7
8

二手房源数据获取

请求数据

# 模拟浏览器
headers = {
    # 用户代理 表示浏览器基本身份信息
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
# 请求链接
url = 'https://cs.lianjia.com/ershoufang/'
# 发送请求
response = requests.get(url=url, headers=headers)
# 输出内容 <Response [200]> 响应对象 表示请求成功
print(response)
1
2
3
4
5
6
7
8
9
10
11

解析数据

我们这次选用css选择器: 根据标签属性提取数据内容

获取所有房源所在li标签

selector = parsel.Selector(response.text)  # 选择器对象
# 获取所有房源所在li标签
lis = selector.css('.sellListContent li .info')
1
2
3

for循环遍历

for li in lis:
    title = li.css('.title a::text').get()  # 标题
    area_info = li.css('.positionInfo a::text').getall()  # 区域信息
    area_1 = area_info[0]  # 小区
    area_2 = area_info[1]  # 区域
    totalPrice = li.css('.totalPrice span::text').get()  # 总价
    unitPrice = li.css('.unitPrice span::text').get().replace('元/平', '')  # 单价
    houseInfo = li.css('.houseInfo::text').get().split(' | ')  # 房源信息
    HouseType = houseInfo[0]  # 户型
    HouseArea = houseInfo[1].replace('平米', '')  # 面积
    HouseFace = houseInfo[2]  # 朝向
    HouseInfo_1 = houseInfo[3]  # 装修
    fool = houseInfo[4]  # 楼层
    HouseInfo_2 = houseInfo[-1]  # 建筑结构
    href = li.css('.title a::attr(href)').get()  # 详情页
    dit = {
        '标题': title,
        '小区': area_1,
        '区域': area_2,
        '总价': totalPrice,
        '单价': unitPrice,
        '户型': HouseType,
        '面积': HouseArea,
        '朝向': HouseFace,
        '装修': HouseInfo_1,
        '楼层': fool,
        '年份': date,
        '建筑结构': HouseInfo_2,
        '详情页': href,
    }
    print(dit)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

保存数据

f = open('二手房.csv', mode='w', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    '标题',
    '小区',
    '区域',
    '总价',
    '单价',
    '户型',
    '面积',
    '朝向',
    '装修',
    '楼层',
    '年份',
    '建筑结构',
    '详情页',
])
csv_writer.writeheader()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

接下来就是数据可视化

二手房源户型分布

from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.faker import Faker

c = (
    Pie()
    .add(
        "",
        [
            list(z)
            for z in zip(house_type, house_num)
        ],
        center=["40%", "50%"],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="二手房源户型分布"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
    )
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
#     .render("pie_scroll_legend.html")
)
c.load_javascript()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

二手房源朝向分布

face_type = df['朝向'].value_counts().index.to_list()
face_num = df['朝向'].value_counts().to_list()
c = (
    Pie()
    .add(
        "",
        [
            list(z)
            for z in zip(face_type, face_num)
        ],
        center=["40%", "50%"],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="二手房源朝向分布"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
    )
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
#     .render("pie_scroll_legend.html")
)
c.render_notebook()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

二手房源装修分布

face_type = df['装修'].value_counts().index.to_list()
face_num = df['装修'].value_counts().to_list()
c = (
    Pie()
    .add(
        "",
        [
            list(z)
            for z in zip(face_type, face_num)
        ],
        center=["40%", "50%"],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="二手房源装修分布"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
    )
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
#     .render("pie_scroll_legend.html")
)
c.render_notebook()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

二手房源年份分布

face_type = df['年份'].value_counts().index.to_list()
face_num = df['年份'].value_counts().to_list()
c = (
    Pie()
    .add(
        "",
        [
            list(z)
            for z in zip(face_type, face_num)
        ],
        center=["40%", "50%"],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="二手房源年份分布"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
    )
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
#     .render("pie_scroll_legend.html")
)
c.render_notebook()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

二手房源建筑结构分布

face_type = df['建筑结构'].value_counts().index.to_list()
face_num = df['建筑结构'].value_counts().to_list()
c = (
    Pie()
    .add(
        "",
        [
            list(z)
            for z in zip(face_type, face_num)
        ],
        center=["40%", "50%"],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="二手房源建筑结构分布"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
    )
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
#     .render("pie_scroll_legend.html")
)
c.render_notebook()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

各大区域房价平均价

avg_salary = df.groupby('区域')['总价'].mean()
CityType = avg_salary.index.tolist()
CityNum = [int(a) for a in avg_salary.values.tolist()]
from pyecharts.charts import Bar
# 创建柱状图实例
c = (
    Bar()
    .add_xaxis(CityType)
    .add_yaxis("", CityNum)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="各大区域房价平均价"),
        visualmap_opts=opts.VisualMapOpts(
            dimension=1,
            pos_right="5%",
            max_=30,
            is_inverse=True,
        ),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=45))  # 设置X轴标签旋转角度为45度
    )
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=False),
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_="min", name="最小值"),
                opts.MarkLineItem(type_="max", name="最大值"),
                opts.MarkLineItem(type_="average", name="平均值"),
            ]
        ),
    )
)

c.render_notebook()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

各大区域房价单价平均价格

import pandas as pd
from pyecharts.charts import Bar
import pyecharts.options as opts

# 清理数据并将'单价'列转换为整数类型
df['单价'] = df['单价'].str.replace(',', '').astype(int)

# 计算平均价
avg_salary = df.groupby('区域')['单价'].mean()

# 获取城市类型和城市平均价格
CityType = avg_salary.index.tolist()
CityNum = [int(a) for a in avg_salary.values.tolist()]

# 创建柱状图实例
c = (
    Bar()
    .add_xaxis(CityType)
    .add_yaxis("", CityNum)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="各大区域房价单价平均价格"),
        visualmap_opts=opts.VisualMapOpts(
            dimension=1,
            pos_right="5%",
            max_=30,
            is_inverse=True,
        ),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=45))  # 设置X轴标签旋转角度为45度
    )
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=False),
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_="min", name="最小值"),
                opts.MarkLineItem(type_="max", name="最大值"),
                opts.MarkLineItem(type_="average", name="平均值"),
            ]
        ),
    )
)

# 在Notebook中显示柱状图
c.render_notebook()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43

适合练手的25个Python案例源码分享，总有一个你想要的

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/2023面试高手/article/detail/375036

推荐阅读

相关标签