- 本文利用爬虫爬取了贝壳网上海市房屋出租数据,利用python对数据进行清洗、分析和展示,并对房屋租金进行数据挖掘建模预测。
# Core analysis and visualization packages.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import display, Image
# Configure matplotlib/seaborn for Chinese labels (SimHei) and correct
# rendering of the minus sign with a CJK font.
mpl.rcParams['font.sans-serif']=['SimHei']
mpl.rcParams['axes.unicode_minus']=False
sns.set_style({'font.sans-serif':['SimHei','Arial']})
# Render figures inline in the notebook (IPython magic).
%matplotlib inline
一、 爬虫部分
# 导入相关包
import requests
from bs4 import BeautifulSoup
import json
import re
# Build request headers and cookies to get past the site's anti-scraping
# checks; without them the server rejects the request with a 404.
r_h = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'}
# cookies: paste the raw Cookie header copied from the browser (F12 -> Network).
r_c = {}
cookies = '''F12在网络里查看'''
for pair in cookies.split('; '):
    # Split on the FIRST '=' only: cookie values may themselves contain '='.
    # partition also lets us skip malformed fragments (such as the
    # placeholder text above) instead of raising IndexError.
    name, sep, value = pair.partition('=')
    if sep:
        r_c[name] = value
# Administrative districts of Shanghai, spelled as the URL path segments
# the site uses for per-district listing pages.
district = ('jingan xuhui huangpu changning putuo pudong '
            'baoshan hongkou yangpu minhang jinshan jiading '
            'chongming fengxian songjiang qingpu').split()
# Root of every rental-listing URL; district and page number are appended.
base_url = 'https://sh.zu.ke.com/zufang/'
# For each district, fetch its first listing page and read how many result
# pages exist from the pager's data-totalpage attribute.
district_info = {}
for name in district:
    resp = requests.get(url=base_url + name + '/', headers=r_h, cookies=r_c)
    pager = BeautifulSoup(resp.text, 'lxml').find('div', class_="content__pg")
    district_info[name] = int(pager.attrs['data-totalpage'])
# Expand every district into one URL per result page: .../zufang/<district>/pg<N>/
url_list = [
    base_url + name + '/' + 'pg' + str(page) + '/'
    for name, total in district_info.items()
    for page in range(1, total + 1)
]
# 爬取网页信息函数
def data_request(urli, r_h, r_c):
    """Download one listing page and collect its raw per-listing nodes.

    Parameters
    ----------
    urli : str
        Listing-page URL, e.g. .../zufang/jingan/pg2/
    r_h, r_c : dict
        Request headers and cookies (anti-scraping).

    Returns
    -------
    list of dict, each with 'region' (district slug from the URL) and
    'info' (the BeautifulSoup node of one listing). Empty list when the
    page has no listing container (anti-scraping page or layout change).
    """
    r_i = requests.get(url=urli, headers=r_h, cookies=r_c)
    soup_i = BeautifulSoup(r_i.text, 'lxml')
    div = soup_i.find('div', class_="content__list")
    if div is None:
        # Previously this raised AttributeError on find_all; treat a missing
        # container as "no listings" so the crawl keeps going.
        return []
    # .../zufang/<district>/pg<N>/ -> <district>; hoisted out of the loop.
    region = urli.split('/')[-3]
    data_i = []
    for info in div.find_all('div', class_="content__list--item--main"):
        data_i.append({'region': region, 'info': info})
    return data_i
# Crawl every page and accumulate the raw listing records in one flat list.
data = [record
        for page_url in url_list
        for record in data_request(page_url, r_h, r_c)]
二、 数据清洗
# Cleaning function: pull the analysis fields out of one raw crawled record.
def data_clear(data_i):
    """Extract the analysis fields from one raw crawled record.

    data_i: dict with 'region' (district slug) and 'info' (BeautifulSoup
    node of one listing). Each field is located in the node's text by a
    regex keyed on the site's newline layout; spaces are stripped after.
    """
    item = {}
    item['region'] = data_i['region']
    info = data_i['info']
    # First standalone text block, e.g. "整租·XX小区 2室1厅"
    item['house_info'] = ''.join(re.findall(r'\n\n\n(.+)\n\n\n', info.get_text())).replace(' ','')
    # "district-street-community" segment before the first '/' separator
    item['location_info'] = ''.join(re.findall(r'\n\n\n(.+)\n/\n', info.get_text())).replace(' ','')
    # Number preceding the ㎡ unit
    item['area_info'] = ''.join(re.findall(r'\n/\n(.+)㎡\n', info.get_text())).replace(' ','')
    # Text before the full-width ')' e.g. "低楼层(6层"; the '(' is stripped
    item['floor_info'] = ''.join(re.findall(r'\n/\n(.+))\n', info.get_text())).replace(' ','').replace('(','')
    item['direction_info'] = ''.join(re.findall(r'\n(.+)/\n', info.get_text())).replace(' ','').replace('/','')
    item['layout_info'] = ''.join(re.findall(r'/\n(.+)\n/\n', info.get_text())).replace(' ','')
    # Monthly rent: number before '元/月'
    item['price_info'] = ''.join(re.findall(r'\n\n(.+)元/月\n', info.get_text())).replace(' ','')
    # Trailing tag block (metro / decoration / apartment keywords)
    item['other_info'] = ''.join(re.findall(r'\n \n\n\n([\s\S]*)\n\n', info.get_text())).replace(' ','')
    return item
# Run every raw record through the cleaner, producing flat field dicts.
data_c = [data_clear(raw) for raw in data]
三、 数据存储
# Persist the cleaned records in MongoDB.
import pymongo
# Connect to the local server; database 'data', collection 'beike_data'.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
db = myclient['data']
beike_data = db['beike_data']
# Bulk-insert all cleaned records in a single round trip.
beike_data.insert_many(data_c)
# 正常的采集流程应为采集一条存入一条,
# 存储应在数据采集阶段完成,本文仅为了演示mongodb大致用法,
# 读者可自行尝试采集阶段存储,方法是一致的
# OUTPUT: <pymongo.results.InsertManyResult at 0x260c3179508>
# Reload the exported records into pandas for pre-processing.
data_pre = pd.read_csv("pd_data.csv",index_col=0)
# Peek at the first rows to check the structure.
data_pre[:3]
# Drop fields not needed for the analysis (Mongo _id, redundant region slug).
data_pre = data_pre.drop(['_id', 'region'], axis=1)
# Inspect each column's dtype and missing-value count.
data_pre.info()
# OUTPUT
<class 'pandas.core.frame.DataFrame'>
Int64Index: 36622 entries, 0 to 36621
Data columns (total 8 columns):
area_info 36131 non-null object
direction_info 36499 non-null object
floor_info 34720 non-null object
house_info 36622 non-null object
layout_info 34729 non-null object
location_info 36131 non-null object
other_info 34720 non-null object
price_info 36622 non-null object
dtypes: object(8)
memory usage: 2.5+ MB
# Assemble the analysis table, one column per feature.
beike_df = pd.DataFrame()
beike_df['Area'] = data_pre['area_info']
beike_df['Price'] = data_pre['price_info']
beike_df['Direction'] = data_pre['direction_info']
beike_df['Layout'] = data_pre['layout_info']
# Split each compound column ONCE instead of re-splitting for every target
# column (the original recomputed the same split up to three times).
floor_parts = data_pre['floor_info'].str.split('楼层', expand = True)
beike_df['Relative height'] = floor_parts[0]
beike_df['Total floors'] = floor_parts[1].str[:-1]  # drops trailing '层' — verify against raw data
beike_df['Renting modes'] = data_pre['house_info'].str.split('·', expand = True)[0]
location_parts = data_pre['location_info'].str.split('-', expand = True)
beike_df['District'] = location_parts[0]
beike_df['Street'] = location_parts[1]
beike_df['Community'] = location_parts[2]
# Indicator flags: str.contains gives True/False; '+ 0' coerces to 1/0
# (missing other_info stays NaN).
beike_df['Metro'] = data_pre['other_info'].str.contains('地铁') + 0
beike_df['Decoration'] = data_pre['other_info'].str.contains('精装') + 0
beike_df['Apartment'] = data_pre['other_info'].str.contains('公寓') + 0
# Reorder columns for readability and persist to csv.
columns = ['District', 'Street', 'Community', 'Renting modes','Layout', 'Total floors', 'Relative height', 'Area', 'Direction', 'Metro','Decoration', 'Apartment', 'Price']
df = pd.DataFrame(beike_df, columns = columns)
df.to_csv("df.csv",sep=',')
四、 添加经纬度地理信息字段
# Reference centre point: the Park Hotel (上海国际饭店), here as (lng, lat).
# NOTE(review): the distance step later redefines center as (lat, lng).
center = (121.4671328,31.23570852)
# Export the place names alone; a third-party service converts them to
# Baidu-coordinate longitude/latitude (see links below).
location_info = data_pre['location_info'].str.replace('-', '')
location_info.to_csv("location_info.csv",header='location',sep=',')
https://maplocation.sjfkai.com/ 从该网站将地理信息点转化为对应的经纬度坐标(百度坐标系)
https://github.com/wandergis/coordTransform_py coordTransform_py模块将百度坐标系下的经纬度转换为国际坐标系wgs84
# 模块说明
# coord_converter.py [-h] -i INPUT -o OUTPUT -t TYPE [-n LNG_COLUMN] [-a LAT_COLUMN] [-s SKIP_INVALID_ROW]
# arguments:
# -i , --input Location of input file
# -o , --output Location of output file
# -t , --type Convert type, must be one of: g2b, b2g, w2g, g2w, b2w,
# w2b
# -n , --lng_column Column name for longitude (default: lng)
# -a , --lat_column Column name for latitude (default: lat)
# -s , --skip_invalid_row
# Whether to skip invalid row (default: False)
# Load the per-listing WGS84 coordinates produced by the external converter.
geographic_info = pd.read_csv("wgs84.csv",index_col=0)
# Combine latitude/longitude into a single (lat, lng) tuple column.
# (The original defined a helper `point(x, y)` and then shadowed it with the
# loop variable below; zip avoids both the helper and the shadowing.)
geographic_info['point'] = list(zip(geographic_info['lat'], geographic_info['lng']))
# Great-circle distance (km) from each listing to the city-centre reference
# point — a proxy for how central the rental is.
from geopy.distance import great_circle
center = (31.23570852, 121.4671328)  # (lat, lng) of the reference point
list_d = [great_circle(pt, center).km for pt in geographic_info['point']]
# FIX: the original insert(...) call was missing its closing parenthesis.
geographic_info.insert(loc=4, column='distance', value=list_d)
# Persist for later merging with the listings table.
geographic_info.to_csv("geographic_info.csv",sep=',')
五、 数据进一步处理及特征工程
df=pd.read_csv("df.csv",index_col=0)
#
# 查看数据类型及缺失情况
df.info()
#
# OUTPUT
<class 'pandas.core.frame.DataFrame'>
Int64Index: 36622 entries, 0 to 36621
Data columns (total 13 columns):
District 36131 non-null object
Street 34728 non-null object
Community 34729 non-null object
Renting modes 36622 non-null object
Layout 34729 non-null object
Total floors 34712 non-null float64
Relative height 34720 non-null object
Area 36131 non-null object
Direction 36499 non-null object
Metro 34720 non-null float64
Decoration 34720 non-null float64
Apartment 34720 non-null float64
Price 36622 non-null object
dtypes: float64(4), object(9)
memory usage: 3.9+ MB
房屋所属街区和小区,房屋装修情况等信息缺失较多,因信息无法通过填充等方式补全,后续会通过移除的方式进行处理。
# Inspect each field's value distribution to spot invalid entries
# (e.g. literal "未知"/unknown placeholders) before cleaning.
df['District'].value_counts()
df['Street'].value_counts()
df['Community'].value_counts()
df['Renting modes'].value_counts()
df['Total floors'].value_counts()
df['Relative height'].value_counts()
df['Layout'].value_counts()
# Drop any row with at least one missing field (cannot be imputed reliably).
df = df.dropna(how='any')
# Price and area are numeric quantities: cast to float.
df[['Price','Area']] = df[['Price','Area']].astype(float)
# Metro / decoration / apartment are 0/1 indicator flags: cast to int.
df[['Metro','Decoration','Apartment']] = df[['Metro','Decoration','Apartment']].astype(int)
# Remove rows whose layout is the literal "未知" (unknown).
df = df[~df['Layout'].str.contains("未知")]
df.dtypes
# OUTPUT
District object
Street object
Community object
Renting modes object
Layout object
Total floors float64
Relative height object
Area float64
Direction object
Metro int32
Decoration int32
Apartment int32
Price float64
dtype: object
- 房屋朝向
# 房屋朝向较分布不符合常理,需要进行清理
df['Direction'].value_counts()
# OUTPUT
南 25809
南北 2956
北 1980
东南 835
东 792
西 534
西南 499
西北 217
东南南 202
东北 160
南西南 69
东东南南西南西 66
南西 57
东西 41
东南北 36
东东南南北 35
东东南 31
南西北 30
东南西北 29
东东南南 21
东东南南西南北 19
东南西南 15
西南西 13
东南南西南北 8
东南南北 8
西西北 7
南西南西 6
东南南西南 6
北东北 5
西南东北 5
东南西 5
南东北 5
西南西北 5
东南南北东北 5
东西北 4
南西北北 4
西南北 4
南西南北 3
东南南西北 3
东东南北 3
南北东北 3
东南南西北北 3
西南西西北 3
东南东北 3
东南南西 3
东东南南西北北 2
东东南南西南 2
西北北 2
东东北 2
东南西南西西北东北 1
南西南西北 1
南西西北北 1
东南南西西北北 1
东北东北 1
西西北北 1
西东北 1
Name: Direction, dtype: int64
# Normalization map for the Direction field: collapse redundant compound
# orientations (e.g. "东南南" lists both 东南 and 南) down to a canonical
# combination of the four cardinal points.
cleanup_nums= {"东南南":"东南","南西南":"西南","东东南南西南西":"东西南",
               "南西":"西南","东东南南北":"东南北","东东南":"东南",
               "南西北":"西南北","东南西北":"东西南北","东东南南":"东南",
               "东东南南西南北":"东西南北","东南西南":"东西南","西南西":"西南",
               "东南南西南北":"东西南北","东南南北":"东南北","西西北":"西北",
               "南西南西":"西南","东南南西南":"东西南","西南东北":"东西南北",
               "西南西北":"西南北","南东北":"东南北","北东北":"东北",
               "东南南北东北":"东南北","东南西":"东西南","南西北北":"西南北",
               "东南东北":"东南北","南北东北":"东南北","东东南北":"东南北",
               "东南南西北北":"东西南北","南西南北":"西南北","西南西西北":"西南北",
               "东南南西北":"东西南北","东南南西":"东西南","东东北":"东北",
               "西北北":"西北","东东南南西南":"东西南","东东南南西北北":"东西南北",
               "东南南西西北北":"东西南北","南西西北北":"西南北","东北东北":"东北",
               "东南西南西西北东北":"东西南北","南西南西北":"西南北","西西北北":"西北",
               "西东北":"东西北"
               }
# Apply the map in place; unmapped values pass through unchanged.
df['Direction'].replace(cleanup_nums,inplace=True)
# 上面字典构建太繁琐,可以考虑先做一次去重,然后构建字典,会省去很多重复工作。
#for i in len(df['Direction']):
# df['Direction'][i] = ''.join(set(df['Direction'][i]))
#df['Direction']
#df['Direction'].value_counts()
#cleanup_nums构建较少字典即可
#df['Direction'].replace(cleanup_nums,inplace=True)
# 清理之后的房屋朝向信息
df['Direction'].value_counts()
# OUTPUT
南 25809
南北 2956
北 1980
东南 1089
东 792
西南 644
西 534
西北 227
东北 168
东南北 98
东西南 97
东西南北 71
西南北 51
东西 41
东西北 5
Name: Direction, dtype: int64
- 房屋面积
# Raw area distribution: box plot on top, histogram below, sharing the x axis.
df_aera = df['Area']
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)}, figsize=(16, 10))
sns.boxplot(df_aera, ax=ax_box )
sns.distplot(df_aera, bins=40, ax=ax_hist)
# Hide the box plot's x label so only the histogram carries it.
ax_box.set(xlabel='')
ax_box.set_title('房屋面积分布')
房屋面积分布为长尾分布,不符合数据建模要求的正态分布,这里先简单处理以下,主要处理思路为,将面积大于800平小于6平的明显不属于住宅性质的数据移除,对于400平以上但是只有1室的数据也做移除处理。
df = df[(df['Area'] < 800)]
df = df[~(df['Area'] < 6)]
df = df[~((df['Area'] > 400) & (df['Layout'].str.contains('1室')))]
df_aera = df['Area']
#sns直方图
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)},figsize=(16, 10))
sns.boxplot(df_aera, ax=ax_box)
sns.distplot(df_aera, bins=40, ax=ax_hist)
# Remove x axis name for the boxplot 不显示箱形图的横坐标
ax_box.set(xlabel='')
ax_box.set_title('房屋面积分布')
清理之后的数据如图,后续会进一步做处理。
- 房屋价格
f, ax_reg = plt.subplots(figsize=(16, 6))
sns.regplot(x='Area', y='Price', data=df, ax=ax_reg)
ax_reg.set_title('房屋面积与价格分布关系')
在对房屋面积做了初步清理后,房屋价格仍然会有一些离群点,后续将进行清理,主要将价格大于100000的数据移除。
# Remove extreme rent outliers (>= 100000 CNY/month).
# (The original also assigned df_price before filtering — a dead assignment
# immediately overwritten below; removed.)
df = df[df['Price'] < 100000]
df_price = df['Price']
# Cleaned rent distribution: box plot on top, histogram below.
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)},figsize=(16, 6))
sns.boxplot(df_price, ax=ax_box)
sns.distplot(df_price, bins=40, ax=ax_hist)
# Hide the box plot's x label so only the histogram carries it.
ax_box.set(xlabel='')
# FIX: the original title said 房屋面积分布 (area) but this chart shows price.
ax_box.set_title('房屋租金价格分布')
# Area-vs-price scatter with a regression line, after outlier removal.
f, ax_reg = plt.subplots(figsize=(16, 6))
sns.regplot(x='Area', y='Price', data=df, ax=ax_reg)
ax_reg.set_title('房屋面积与价格分布关系')
仍会有一些离群点,但是数据质量已经好了很多。
- 将地理信息表与房屋信息表合并
# Build a join key matching the geo table's 'region' (district+street+community).
df['region'] = df['District'] + df['Street'] + df['Community']
geo=pd.read_csv("geographic_info.csv",index_col=0)
# Left join keeps every listing even when its coordinates were not resolved.
df_geo = pd.merge(df, geo, on='region', how='left')
# Drop the join helper columns — no longer needed for analysis.
df_geo = df_geo.drop(['region', 'point'], axis=1)
# Elevator flag is absent from the crawled data; heuristic: buildings taller
# than 6 floors are assumed to have one ('+ 0' coerces bool to 0/1).
elevator = (df_geo['Total floors'] > 6) + 0
df_geo.insert(loc=4,column='Elevator',value=elevator)
# Persist the merged table.
df_geo.to_csv("df_geo.csv", sep=',')
六、 数据可视化
# 对基础字段进行数据可视化,以了解数据分布信息,对数据进行进一步处理。
df_geo = pd.read_csv("df_geo.csv",index_col=0)
# 可视化主要用交互式可视化包 bokeh 和 pyecharts
# 导入相关包
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
output_notebook()
import pyecharts.options as opts
from pyecharts.globals import ThemeType
from pyecharts.faker import Faker
from pyecharts.charts import Grid, Boxplot, Scatter, Pie, Bar
(https://bokeh.pydata.org)BokehJS 1.2.0 successfully loaded.
- 租赁方式可视化
# 租赁方式可视化
Renting_modes_group = df_geo.groupby('Renting modes')['Price'].count()
val = list(Renting_modes_group.astype('float64'))
var = list(Renting_modes_group.index)
p_group = (
Pie(opts.InitOpts(width="1000px",height="600px",theme=ThemeType.LIGHT))
.add("", [list(z) for z in zip(var, val)])
.set_global_opts(title_opts=opts.TitleOpts(title="出租形式分布"),
legend_opts=opts.LegendOpts(type_="scroll", pos_right="middle", orient="vertical"),
toolbox_opts=opts.ToolboxOpts())
.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
p_group.render_notebook()
其中合租数据共5687份,因合租数据较难推测房屋整租租金,对于研究租金整体分布无法利用,且合租占总数据的比重较少,本次分析做去除处理,后续可以单开一个专题做合租租金的分布研究。
df_geo = df_geo[~(df_geo['Renting modes'] == '合租')]
- 地理距离可视化
# Listings farther than 70 km from the centre fall outside Shanghai's
# boundary — geocoding artefacts, dropped.
df_geo = df_geo[~(df_geo['distance'] > 70)]
# Bucket distance into 0-5 / 5-10 / 10-20 / 20-70 km rings from the centre.
bins = [0, 5, 10, 20, 70]
cats = pd.cut(df_geo.distance,bins)
df_geo['cats'] = cats
# Listing counts per distance ring.
f, ax1 = plt.subplots(figsize=(16, 6))
sns.countplot(df_geo['cats'], ax=ax1, palette="husl")
ax1.set_title('距市中心不同距离出租房屋数量分布')
# Rent distribution per distance ring.
f, ax2 = plt.subplots(figsize=(16, 6))
sns.boxplot(x='cats', y='Price', data=df_geo, ax=ax2, palette="husl")
ax2.set_title('距市中心不同距离出租房屋价格分布')
plt.show()
从图中可以看出距市中心5-10公里范围内出租房屋数量最多,其次是0-5公里范围内,再偏远一些的范围房屋数量相对较少一些,说明大部分人租房范围在距市中心10公里以内。随着距市中心越来越远,出租房屋均价也越来越低,大约每个范围均价之间有2000元的差距。
- 房屋面积可视化
df_area = df_geo['Area']
df_price = df_geo['Area']
# bokeh直方图
hist, edge = np.histogram(df_area, bins=50)
p = figure(title="出租房屋面积分布",plot_width=1000, plot_height=600, toolbar_location="above",tooltips="数量: @top")
p.quad(top=hist, bottom=0, left=edge[:-1], right=edge[1:], line_color="white")
show(p)
#pyecharts直方图
hist, edge = np.histogram(df_area, bins=50)
x_data = list(edge[1:])
y_data = list(hist.astype('float64'))
bar = (
Bar(init_opts = opts.InitOpts(width="1000px",height="600px",theme=ThemeType.LIGHT))
.add_xaxis(xaxis_data=x_data)
.add_yaxis("数量", yaxis_data=y_data,category_gap=1)
.set_global_opts(title_opts=opts.TitleOpts(title="出租房屋面积分布"),
legend_opts=opts.LegendOpts(type_="scroll", pos_right="middle", orient="vertical"),
datazoom_opts=[opts.DataZoomOpts(type_="slider")], toolbox_opts=opts.ToolboxOpts(),
tooltip_opts=opts.TooltipOpts(is_show=True,axis_pointer_type= "line",trigger="item",formatter='{c}'))
.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
)
bar.render_notebook()
两种交互式的可视化比Seaborn有很多优势,可以点选查看每个条目的数量,拖动以观察感兴趣的区间。
- 各城区出租房屋租金价格及数量分布
df_geo['Persq']=df_geo['Price']/df_geo['Area']
region_group_price = df_geo.groupby('District')['Price'].count().sort_values(ascending=False).to_frame().reset_index()
region_group_Persq = df_geo.groupby('District')['Persq'].mean().sort_values(ascending=False).to_frame().reset_index()
region_group = pd.merge(region_group_price, region_group_Persq, on='District')
x_region = list(region_group['District'])
y1_region = list(region_group['Price'].astype('float64'))
y2_region = list(region_group['Persq'])
bar_group = (
Bar(opts.InitOpts(width="1000px",height="400px",theme=ThemeType.LIGHT))
.add_xaxis(x_region)
.add_yaxis("Num", y1_region, gap="5%")
.add_yaxis("Persq", y2_region, gap="5%")
.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
.set_global_opts(title_opts=opts.TitleOpts(title="各区租房数量与单位面积租金价格"),
legend_opts=opts.LegendOpts(type_="scroll", pos_right="middle", orient="vertical"),
datazoom_opts=[opts.DataZoomOpts(type_="slider")],
toolbox_opts=opts.ToolboxOpts(),
tooltip_opts=opts.TooltipOpts(is_show=True,axis_pointer_type= "line",trigger="item",formatter='{c}'))
)
bar_group.render_notebook()
租房数量与单位面积租金价格按区域分布与按距离分布的规律比较类似,市中心区域租房数量较多,单位面积租金价格也相对较高,单位面积租金价格最高的是黄浦区为158元/平方,最低的为崇明区,为25元/平方。
# Per-district price series, in the same district order as the bar chart
# above so the two charts line up.
region_group_price = df_geo.groupby('District')['Price']
box_data = []
for i in x_region:
    d = list(region_group_price.get_group(i))
    box_data.append(d)
# pyecharts box plot of the rent distribution per district.
boxplot_group = Boxplot(opts.InitOpts(width="1000px",height="400px",theme=ThemeType.LIGHT))
boxplot_group.add_xaxis(x_region)
boxplot_group.add_yaxis("Price", boxplot_group.prepare_data(box_data))
# FIX: title typo — original read "上各区" (missing 海).
boxplot_group.set_global_opts(title_opts=opts.TitleOpts(title="上海各区租金价格分布"),
                              legend_opts=opts.LegendOpts(type_="scroll", pos_right="middle", orient="vertical"),
                              datazoom_opts=[opts.DataZoomOpts(),opts.DataZoomOpts(type_="slider")],
                              toolbox_opts=opts.ToolboxOpts(),
                              tooltip_opts=opts.TooltipOpts(is_show=True,axis_pointer_type= "line",trigger="item",formatter='{c}'))
boxplot_group.render_notebook()
# seaborn 各区租金价格分布图
f, ax = plt.subplots(figsize=(16,8))
sns.boxplot(x='District', y='Price', data=df_geo, palette="Set3", linewidth=2.5, order=x_region)
ax.set_title('各区租金价格分布',fontsize=15)
ax.set_xlabel('区域')
ax.set_ylabel('房屋租金')
plt.show()
由各区租金价格分布图可知,出租房屋租金价格平均值最高的仍为黄浦区,各区都存在偏高的异常点,推测应为别墅等房间较多的房屋类型。松江、嘉定和宝山距市中心距离适中,而且价格比其他各区低,适合预算较低的上班族租住。
- 出租房屋朝向及户型分布
f, ax1 = plt.subplots(figsize=(16,25))
sns.countplot(y='Layout', data=df_geo, ax=ax1,order = df_geo['Layout'].value_counts().index)
ax1.set_title('房屋户型分布',fontsize=15)
ax1.set_xlabel('数量')
ax1.set_ylabel('户型')
f, ax2 = plt.subplots(figsize=(16,6))
sns.countplot(x='Direction', data=df_geo, ax=ax2,order = df_geo['Direction'].value_counts().index)
ax2.set_title('房屋朝向分布',fontsize=15)
ax2.set_xlabel('朝向')
ax2.set_ylabel('数量')
plt.show()
房屋的户型当中占比较多的是小户型,多房间的户型占比较少,其中一室一厅一卫的户型最多,也符合现在很多人想有自己私密空间的趋势,其次是两室一厅。房屋朝向以朝南居多,符合建筑设计常识,其中余下南北等复合朝向应为不同房间朝向的叠加,仍以朝向中含南的房屋数量最多。
- 出租房屋租金价格与电梯、地铁、装修、公寓字段之间关系
sns.catplot(x='Elevator',y='Price',hue='Metro',row='Apartment',
col='Decoration',data=df_geo,kind="box",palette="husl")
有电梯的房屋租金价格普遍比没电梯的房屋租金价格高,同样有地铁这个因素也使房屋租金变高。大部分情况下精装房屋租金价格较高,是否是公寓对房屋租金价格的影响没有明显规律。
- 出租房屋租金价格与房屋层高之间关系
f, ax1 = plt.subplots(figsize=(16, 4))
sns.countplot(df_geo['Relative height'], ax=ax1, palette="husl")
ax1.set_title('不同层高出租房屋数量分布')
f, ax2 = plt.subplots(figsize=(16, 8))
sns.boxplot(x='Relative height', y='Price', data=df_geo, ax=ax2, palette="husl")
ax2.set_title('不同层高出租房屋租金价格分布')
plt.show()
中等高度楼层出租房数量最多,低楼层出租房数量最少。平均租金价格低楼层反而最高,高楼层最低,推测因为低楼层生活便利程度较高的缘故。
- 各区出租房屋租金价格分布热力图
# 导入相关模块
from pyecharts.charts import Geo
from pyecharts.globals import ChartType
from pyecharts.globals import GeoType
# 构造数据集
region = df_geo['District'] + df_geo['Street'] + df_geo['Community']
region = region.reset_index(drop=True)
data_pair = [(region[i], df_geo.iloc[i]['Price']) for i in range(len(region))]
pieces=[
{'max': 1000,'label': '1000以下','color':'#50A3BA'}, #有上限无下限,label和color自定义
{'min': 1000, 'max': 1800,'label': '1000-1800','color':'#81AE9F'},
{'min': 1800, 'max': 2300,'label': '1800-2300','color':'#E2C568'},
{'min': 2300, 'max': 2800,'label': '2300-2800','color':'#FCF84D'},
{'min': 2800, 'max': 5000,'label': '2800-5000','color':'#E49A61'},
{'min': 5000, 'max': 8000,'label': '5000-8000','color':'#DD675E'},
{'min': 8000, 'label': '8000以上','color':'#D94E5D'}#有下限无上限
]
# 各区出租房屋租金价格分布热力图
geo = Geo(opts.InitOpts(width="600px",height="600px",theme=ThemeType.LIGHT))
geo.add_schema(maptype = "上海",emphasis_label_opts=opts.LabelOpts(is_show=True,font_size=16))
for i in range(len(region)):
geo.add_coordinate(region[i],df_geo.iloc[i]['lng'],df_geo.iloc[i]['lat'])
geo.add("",data_pair,type_=GeoType.HEATMAP)
geo.set_global_opts(
title_opts=opts.TitleOpts(title="上海房租分布"),
visualmap_opts=opts.VisualMapOpts(min_ = 0,max_ = 30000,split_number = 20,range_text=["High", "Low"],is_calculable=True,range_color=["lightskyblue", "yellow", "orangered"]),
toolbox_opts=opts.ToolboxOpts(),
tooltip_opts=opts.TooltipOpts(is_show=True)
)
geo.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
geo.render_notebook()
#geo.render("HEATMAP.html")
房屋租金价格热力图反映了一个区域租金高低以及房屋数量密集程度,pyecharts做出的热力图显示效果不是太好,因此又用pyecharts做了价格分布散点图作为补充,两张图表达的意思一致,后续又导出至tableau做了一版,显示效果较pyecharts好一些。
# 各区出租房屋租金价格分布散点图
geo = Geo(opts.InitOpts(width="600px",height="600px",theme=ThemeType.LIGHT))
geo.add_schema(maptype = "上海")
for i in range(len(region)):
geo.add_coordinate(region[i],df_geo.iloc[i]['lng'],df_geo.iloc[i]['lat'])
geo.add("",data_pair,type_=GeoType.EFFECT_SCATTER,symbol_size=5)
geo.set_global_opts(
title_opts=opts.TitleOpts(title="上海房租分布"),
visualmap_opts=opts.VisualMapOpts(is_piecewise=True, pieces=pieces),
toolbox_opts=opts.ToolboxOpts()
)
geo.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
geo.render_notebook()
#geo.render("SCATTER.html")
七、数据建模
- 特征工程
# Reload the merged dataset for modelling.
data=pd.read_csv("df_geo_df.csv",index_col=0)
# Drop identifiers and fields that duplicate other features or leak price
# information (Persq is derived from Price).
data = data.drop(['Street', 'Community', 'Renting modes', 'Total floors', 'lng', 'lat', 'cats', 'Persq'], axis=1)
# Discretize the continuous distance into the same 4 rings used in the
# analysis section, then drop the raw value.
bins = [0, 5, 10, 20, 70]
cats = pd.cut(data.distance,bins)
data['cats'] = cats
data = data.drop(['distance'], axis=1)
如果距离的每个不同取值都作为特征值,我们并不能找出距离对 Price 有什么影响,因为距离划分得太细了。因此,我们将连续数值型特征距离离散化,做分箱处理。特征离散化后,模型会更稳定,降低了模型过拟合的风险。
# The Layout string (e.g. '2室1厅1卫') cannot feed the model directly;
# extract the room / hall / bathroom counts as integer features.
# FIX: raw strings — '\d' in a plain string is an invalid escape sequence
# (SyntaxWarning on Python 3.12+, future error).
data['room_num'] = data['Layout'].str.extract(r'(^\d).*', expand=False).astype('int64')
data['hall_num'] = data['Layout'].str.extract(r'^\d.*?(\d).*', expand=False).astype('int64')
data['bathroom_num'] = data['Layout'].str.extract(r'^\d.*?\d.*?(\d).*', expand=False).astype('int64')
# This dataset is regular enough that .str[0] / .str[2] / .str[4] would also work.
# 创建新特征
# 主要思路是考虑每个租客均摊的居住面积、厅和卫生间被共享的程度,
# 处理生成一个表示居住舒适度的新字段‘convenience’,数值越大表示居住舒适度越高
data['area_per_capita'] = data['Area'] / data['room_num']
data['area_per_capita_norm'] = data[['area_per_capita']].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
data['convenience'] = data['area_per_capita_norm'] + (data['hall_num'] / data['room_num']) + (data['bathroom_num'] / data['room_num'])
# 去掉建模无关特征
data = data.drop(['Layout', 'area_per_capita', 'area_per_capita_norm'], axis=1)
# 重置索引
data = data.reset_index(drop = True)
# 将区域、相对高度和朝向特征数据由中文改为简写
district_nums = {"嘉定":"JD","奉贤":"FX","宝山":"BS","崇明":"CM",
"徐汇":"XH","普陀":"PT","杨浦":"YP","松江":"SJ",
"浦东":"PD","虹口":"HK","金山":"JS","长宁":"CN",
"闵行":"MH","青浦":"QP","静安":"JA","黄浦":"HP"}
relative_height_nums = {"高":"H","中":"M","低":"L"}
direction_nums = {"东":"E","东北":"EN","东南":"ES","东南北":"ESN",
"东西":"EW","东西北":"EWN","东西南":"EWS",
"东西南北":"EWSN","北":"N","南":"S","南北":"SN",
"西":"W","西北":"WN","西南":"WS","西南北":"WSN"}
data['District'].replace(district_nums,inplace=True)
data['Relative height'].replace(relative_height_nums,inplace=True)
data['Direction'].replace(direction_nums,inplace=True)
# 重命名字段名
data_columns = {"Relative height":"rh","cats":"distance_cat"}
data.rename(columns = data_columns,inplace = True)
data.columns = data.columns.str.lower()
# 最后查看各字段数据类型是否合理
data.info()
# OUTPUT
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28799 entries, 0 to 28798
Data columns (total 14 columns):
district 28799 non-null object
elevator 28799 non-null int64
rh 28799 non-null object
area 28799 non-null float64
direction 28799 non-null object
metro 28799 non-null int64
decoration 28799 non-null int64
apartment 28799 non-null int64
price 28799 non-null float64
distance_cat 28733 non-null category
room_num 28799 non-null int64
hall_num 28799 non-null int64
bathroom_num 28799 non-null int64
convenience 28799 non-null float64
dtypes: category(1), float64(3), int64(7), object(3)
memory usage: 2.9+ MB
# Put the target first and group related columns for easier inspection.
data_order = ['price','area','convenience','district','elevator','rh',
              'direction','metro','decoration','apartment','distance_cat',
              'room_num','hall_num','bathroom_num' ]
data = data[data_order]
# Numeric feature columns for visualization, excluding the 0/1 indicator
# flags, which are not meaningful on a box plot.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
indicator_cols = {'elevator', 'metro', 'decoration', 'apartment'}
numeric = [col for col in data.columns
           if data[col].dtype in numeric_dtypes and col not in indicator_cols]
# 数值类型数据可视化
f, ax = plt.subplots(figsize=(14, 6))
ax.set_xscale("log")
ax = sns.boxplot(data=data[numeric], orient="h", palette="husl")
ax.xaxis.grid(False)
ax.set(ylabel="Names")
ax.set(xlabel="Values")
ax.set(title="Data Distribution")
for tick in ax.xaxis.get_major_ticks():
tick.label1.set_fontproperties('stixgeneral')
# Skew correction so the numeric features better satisfy the (approximate)
# normality assumption of the downstream model.
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
# Features with skewness > 0.5 are treated as highly skewed; the target
# 'price' is excluded here because it gets log1p below.
skew_col = data[numeric].apply(lambda x: skew(x))
high_skew = skew_col[skew_col > 0.5]
high_skew_index = high_skew.index
high_skew_index = high_skew_index.drop('price')
# log-transform the target; Box-Cox (with +1 shift) each skewed feature,
# using the lambda that maximizes normality for that feature.
data['price'] = np.log1p(data['price'])
for i in high_skew_index:
    data[i] = boxcox1p(data[i], boxcox_normmax(data[i] + 1))
# 纠偏后各数值数据分布图
f, ax = plt.subplots(figsize=(14, 6))
ax.set_xscale("log")
ax = sns.boxplot(data=data[numeric], orient="h", palette="husl")
ax.xaxis.grid(False)
ax.set(ylabel="Names")
ax.set(xlabel="Values")
ax.set(title="Data Distribution")
for tick in ax.xaxis.get_major_ticks():
tick.label1.set_fontproperties('stixgeneral')
# 取出面积和价格查看数据分布
cols = ['area','price']
fig, axs = plt.subplots(ncols=1, nrows=0, figsize=(14, 14))
plt.subplots_adjust(right=1)
plt.subplots_adjust(top=1)
sns.color_palette("husl")
for i, col in enumerate(cols, 1):
plt.subplot(len(cols), 2, i)
sns.distplot(data[col], bins=30)
plt.xlabel('{}'.format(col), size=15,labelpad=12.5)
for j in range(2):
plt.tick_params(axis='x', labelsize=12)
plt.tick_params(axis='y', labelsize=12)
plt.show()
面积和价格特征经过处理后基本呈正态分布。
# One-hot encode the non-numeric features (object-dtype columns plus the
# categorical distance bin) into 0/1 indicator columns.
for cols in data.columns:
    # FIX: np.object was deprecated in NumPy 1.20 and removed in 1.24;
    # comparing the dtype against the builtin `object` is equivalent.
    if (data[cols].dtype == object) or (cols == 'distance_cat'):
        data = pd.concat((data, pd.get_dummies(data[cols], prefix=cols, dtype=np.int64)), axis=1)
        del data[cols]
data.info()
# OUTPUT
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28799 entries, 0 to 28798
Data columns (total 48 columns):
price 28799 non-null float64
area 28799 non-null float64
convenience 28799 non-null float64
elevator 28799 non-null int64
metro 28799 non-null int64
decoration 28799 non-null int64
apartment 28799 non-null int64
room_num 28799 non-null float64
hall_num 28799 non-null int64
bathroom_num 28799 non-null float64
district_BS 28799 non-null int64
district_CM 28799 non-null int64
district_CN 28799 non-null int64
district_FX 28799 non-null int64
district_HK 28799 non-null int64
district_HP 28799 non-null int64
district_JA 28799 non-null int64
district_JD 28799 non-null int64
district_JS 28799 non-null int64
district_MH 28799 non-null int64
district_PD 28799 non-null int64
district_PT 28799 non-null int64
district_QP 28799 non-null int64
district_SJ 28799 non-null int64
district_XH 28799 non-null int64
district_YP 28799 non-null int64
rh_H 28799 non-null int64
rh_L 28799 non-null int64
rh_M 28799 non-null int64
direction_E 28799 non-null int64
direction_EN 28799 non-null int64
direction_ES 28799 non-null int64
direction_ESN 28799 non-null int64
direction_EW 28799 non-null int64
direction_EWN 28799 non-null int64
direction_EWS 28799 non-null int64
direction_EWSN 28799 non-null int64
direction_N 28799 non-null int64
direction_S 28799 non-null int64
direction_SN 28799 non-null int64
direction_W 28799 non-null int64
direction_WN 28799 non-null int64
direction_WS 28799 non-null int64
direction_WSN 28799 non-null int64
distance_cat_(0, 5] 28799 non-null int64
distance_cat_(5, 10] 28799 non-null int64
distance_cat_(10, 20] 28799 non-null int64
distance_cat_(20, 70] 28799 non-null int64
dtypes: float64(5), int64(43)
memory usage: 10.5 MB
data.shape
# OUTPUT
(28799, 48)
# 对特征相关性进行可视化
colormap = plt.cm.OrRd
plt.figure(figsize=(40,40))
sns.heatmap(data.corr(),cbar=True,linewidths=0.1,vmax=1.0, square=True,
fmt='.2f',cmap=colormap, linecolor='white', annot=True)
# 数据导出备份
data.to_csv("all_data.csv", sep=',')
all_data=pd.read_csv("all_data.csv",index_col=0)
- 模型训练
# 导入相关包
# 本项目采用带剪枝的决策树来对模型进行训练
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import ShuffleSplit
plt.style.use('bmh')
import warnings
warnings.filterwarnings(action="ignore")
# 创建特征集
X = all_data.drop('price', axis=1)
X.shape
# OUTPUT
(28799, 47)
# 创建标签集
y = all_data['price']
y.shape
# OUTPUT
(28799,)
# 学习曲线 在不同的模型参数下,显示随着训练数据量的增加,模型训练集得分和验证集得分的变化,
# 本次重点关注决策树模型在不同最大深度下的表现。
# r2_score:目标变量的预测值和实际值之间的相关程度平方的百分比,数值表示该模型中目标变量中有百分之多少能够用特征来解释
# 复杂度曲线 在全部数据集训练的情况下,随着模型参数变化,模型训练集得分和验证集得分的变化,
# 本次重点关注决策树模型在不同最大深度下的表现。
由学习曲线可知,随着训练数据量增加,训练集得分减少,验证集得分增加,当数据量达到一定规模时,训练集得分与验证集得分趋于平稳,之后数据量再增加也无法提升模型的表现。当模型最大深度为11时,模型预测的偏差和方差达到了均衡。 从复杂度曲线可知,当最大深度大于11以后,模型验证集得分平稳后下降,方差越来越大,已出现过拟合趋势。 结合两个曲线结果,认为模型最大深度为11预测效果最好。
# 划分训练集和测试集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 利用网格搜索来寻找使模型性能最好的参数组合
# 通过交叉验证得到每个参数组合的得分,以确定最优的参数组合
# 寻优参数取值范围
param_grid = {
'max_depth': list(range(6, 16)),
'min_samples_split': [70, 80, 90],
'max_features': [8, 10, 12],
'min_samples_leaf':[14, 16, 18]
}
# 交叉验证
cv = ShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0)
# 定义训练模型
rf = RandomForestRegressor(random_state=42)
# 网格搜索
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
scoring='r2', cv = cv, n_jobs = 4, verbose = 1)
grid_search.fit(X_train, y_train)
# 最优参数
grid_search.best_params_
# OUTPUT
{'max_depth': 14,
'max_features': 12,
'min_samples_leaf': 14,
'min_samples_split': 70}
# Scoring helper.
def evaluate(model, test_features, test_labels):
    """Predict with `model` on the test features and print the R^2 score."""
    score = r2_score(test_labels, model.predict(test_features))
    print(f'R2_score = {score:0.4f}')
# 用最优参数对数据集进行训练并评分
best_grid = grid_search.best_estimator_
evaluate(best_grid, X_test, y_test)
# OUTPUT
R2_score = 0.8828
改进:尝试使用多种回归模型进行预测,比较计算效率及准确度,尝试模型融合。在数据集中增加新的字段如:周边生活便利程度,装修状况更细分一些(精装简装毛坯),加入房屋建成年份,水电是否民用,配套设施是否齐全等。