当前位置:   article > 正文

基于python爬虫的二手房数据分析_python 链家房价预测

python 链家房价预测

使用 Python 的 lxml 库(XPath 语法)爬取链家网的二手房数据,经过数据预处理和数据清洗后,建立线性回归模型。个人兴趣,仅供参考。

  from lxml import etree
  import requests
  import csv
  from time import sleep

  # Browser-like User-Agent sent with every request.
  headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"}


  def grt_info(url):
      """Scrape one listing page and append every parsed listing to the CSV.

      Relies on the module-level ``headers`` and ``csv_writer``.

      Args:
          url: Address of one second-hand-housing result page.

      Raises:
          requests.RequestException: If the page cannot be fetched within
              the timeout.
      """
      # A timeout keeps one stalled connection from hanging the whole crawl.
      rep = requests.get(url, headers=headers, timeout=10)
      rep.encoding = 'utf-8'
      tree = etree.HTML(rep.text)  # parse the HTML
      listings = tree.xpath('//*[@id="content"]/div[1]/ul/li')
      for item in listings:
          try:
              title = item.xpath('.//div[1]/div[1]/a/text()')[0]
              address = item.xpath('.//div[1]/div[2]/div/a[2]/text()')[0]
              house_info = item.xpath('.//div[1]/div[3]/div/text()')[0]
              total_price = item.xpath('.//div[1]/div[6]/div[1]/span/text()')[0]
              unit_price = item.xpath('.//div[1]/div[6]/div[2]/span/text()')[0]
          except IndexError:
              # This <li> lacks one of the expected nodes (the XPath returned
              # an empty list); skip it instead of hiding every possible error.
              continue
          print(unit_price)
          csv_writer.writerow([title, address, house_info, total_price, unit_price])


  # The file name is lower-case so that it matches the pd.read_csv('zhejiang.csv')
  # calls used for the analysis, also on case-sensitive file systems.
  # The ``with`` block guarantees the file is flushed and closed.
  with open('zhejiang.csv', mode='w', newline='', encoding='utf-8-sig') as f:
      csv_writer = csv.writer(f)
      csv_writer.writerow(['名称', '地址', '大小', '价格', '平方价格'])
      for page in range(1, 101):
          # Result pages are addressed as .../pg1/, .../pg2/, ...
          grt_info(f'https://sx.lianjia.com/ershoufang/pg{page}/')
          # Pause between pages so the site is not hammered with requests.
          sleep(1)

 数据爬取完成后,使用 pandas 模块导入 zhejiang.csv 文件。

  import pandas as pd
  import matplotlib.pyplot as plt
  import seaborn as sns
  import warnings
  import numpy as np

  # Plot configuration: use the SimHei font (presumably so that Chinese labels
  # render) and disable the Unicode minus sign on the axes.
  plt.rcParams.update({
      'font.sans-serif': 'SimHei',
      'axes.unicode_minus': False,
  })
  # Suppress all warnings for the rest of the session.
  warnings.filterwarnings('ignore')

  # Load the scraped listings and preview the first rows.
  df1 = pd.read_csv('zhejiang.csv')
  df1.head()

.head()输出前五条数据,确保数据爬取成功。

 

  def split_data(data, sep=' | '):
      """Split *data* on *sep* and return the list of pieces.

      Args:
          data: The string to split.
          sep: Separator between the fields; defaults to ``' | '``.

      Returns:
          list[str]: The fields in their original order.
      """
      return data.split(sep)
  df = pd.read_csv('zhejiang.csv')

  # The '大小' column holds several ' | '-separated fields; split it into
  # one column per field.
  detail_columns = ['房间规格', '平方', '位置', '装修', '楼层', '样式']
  parts = df['大小'].astype(str).apply(split_data)

  # Assigning ragged lists to a fixed set of columns raises ValueError, so keep
  # only the rows that split into exactly as many fields as there are columns.
  well_formed = parts.str.len() == len(detail_columns)
  if not well_formed.all():
      print('Dropping %d rows with an unexpected number of fields'
            % (~well_formed).sum())
  details = pd.DataFrame(
      parts[well_formed].tolist(),
      columns=detail_columns,
      index=parts.index[well_formed],
  )
  df = df.loc[well_formed].join(details)
  df = df.drop(columns=['大小', '名称'])

  # Strip the '平米' unit from the area and convert it to a number;
  # values that cannot be parsed become NaN.
  area_text = df['平方'].astype(str).str.replace('平米', '', regex=False)
  df['平方'] = pd.to_numeric(area_text, errors='coerce')

  # Strip the '元/平' unit and the thousands separators from the unit price,
  # then convert it to float (raises if a value is not numeric).
  unit_price_text = (
      df['平方价格'].astype(str)
      .str.replace('元/平', '', regex=False)
      .str.replace(',', '', regex=False)
  )
  df['平方价格'] = unit_price_text.astype(float)
  df.head()

 数据拆分、清洗完毕后,通过箱型图对不同装修风格的价格分布进行可视化。

  # Draw all four decoration levels with ONE boxplot call. Four separate calls,
  # each on a single-category subset, all draw onto the same axes; depending on
  # the seaborn version the boxes can end up on top of each other, and every
  # box gets the same first palette colour. A single call with an explicit
  # ``order`` gives each level its own row and colour.
  decoration_levels = ['精装', '简装', '毛坯', '其他']
  sns.boxplot(
      x="价格",
      y="装修",
      data=df[df['装修'].isin(decoration_levels)],
      order=decoration_levels,
      orient="h",
      palette="magma",
  )

使用机器学习库对房间规格进行独热编码(one-hot),再用线性回归考察其与价格的关系。

  # LinearRegression must be imported here: the snippet uses it, and nothing
  # earlier in the walkthrough brings it into scope (NameError otherwise).
  from sklearn.linear_model import LinearRegression
  from sklearn.model_selection import train_test_split
  from sklearn.metrics import mean_squared_error, r2_score

  # Mark the categorical columns and make sure the numeric ones are numeric;
  # values that cannot be parsed become NaN.
  for column in ('房间规格', '位置', '装修'):
      df[column] = df[column].astype('category')
  for column in ('价格', '平方价格', '平方'):
      df[column] = pd.to_numeric(df[column], errors='coerce')
  # Rows without a price cannot be used as training targets.
  df = df.dropna(subset=['价格'])

  # One-hot encode the room layout ('房间规格'): these are the input features.
  df_encoded = pd.get_dummies(df['房间规格'], prefix='房间规格')
  X = df_encoded.values
  # The price is the label to predict.
  y = df['价格'].values

  # Hold out 20% of the rows for testing; fixed seed for reproducibility.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

  # Fit a linear regression and predict the held-out prices.
  model = LinearRegression()
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  # Evaluate the predictions.
  print('MSE: %.2f' % mean_squared_error(y_test, y_pred))
  print('R^2: %.2f' % r2_score(y_test, y_pred))
  # TODO: visualise predicted vs. actual prices.
  # LinearRegression must be imported here: the snippet uses it, and nothing
  # earlier in the walkthrough brings it into scope (NameError otherwise).
  from sklearn.linear_model import LinearRegression
  from sklearn.model_selection import train_test_split
  from sklearn.metrics import mean_squared_error, r2_score

  # One-hot encode the decoration level ('装修'): these are the input features.
  df_encoded = pd.get_dummies(df['装修'], prefix='装修')
  X = df_encoded.values
  # The price is the label to predict.
  y = df['价格'].values

  # Hold out 20% of the rows for testing; fixed seed for reproducibility.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

  # Fit a linear regression and predict the held-out prices.
  model = LinearRegression()
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  # Evaluate the predictions.
  print('MSE: %.2f' % mean_squared_error(y_test, y_pred))
  print('R^2: %.2f' % r2_score(y_test, y_pred))
  # TODO: visualise predicted vs. actual prices.

 最后用 drop 去掉无用的列,为二手房价建立一个简单的线性回归模型。注意:影响房价的因素有很多,本篇仅是个人学习中的一个小作品,希望大家多多包涵。

  from sklearn.linear_model import LinearRegression
  from sklearn.preprocessing import OneHotEncoder
  from sklearn.compose import ColumnTransformer
  from sklearn.model_selection import train_test_split
  from sklearn.metrics import mean_squared_error, r2_score

  # Drop the columns that are not used by the model.
  df = df.drop(columns=['地址', '位置', '楼层', '样式'])
  # LinearRegression cannot handle NaN, so discard incomplete rows.
  df = df.dropna()

  # The label.
  y = np.array(df['价格'])
  # The features must NOT contain the label itself: with
  # remainder='passthrough' every column that is not encoded is copied into X,
  # so passing the whole frame would leak '价格' into the inputs.
  # NOTE(review): '平方价格' (price per square metre) is itself derived from the
  # price, so it probably leaks the label as well — confirm whether it should
  # stay a feature.
  features = df.drop(columns=['价格'])

  # One-hot encode the two categorical columns and pass the numeric ones through.
  # sparse_threshold=0 forces a dense result; np.array() on a sparse matrix
  # would produce a useless 0-d object array.
  ct = ColumnTransformer(
      transformers=[('encoder', OneHotEncoder(), ['装修', '房间规格'])],
      remainder='passthrough',
      sparse_threshold=0,
  )
  X = np.array(ct.fit_transform(features))

  # Hold out 20% of the rows for testing; fixed seed for reproducibility.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

  # Fit a linear regression and predict the held-out prices.
  model = LinearRegression()
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  # Evaluate the predictions.
  print('MSE: %.2f' % mean_squared_error(y_test, y_pred))
  print('R^2: %.2f' % r2_score(y_test, y_pred))
  # Report the fitted coefficients and intercept.
  print('Coefficients:', model.coef_)
  print('Intercept:', model.intercept_)

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/知新_RL/article/detail/794616
推荐阅读
相关标签
  

闽ICP备14008679号