import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
data1 = pd.read_csv('lianjia1.csv', encoding='gbk')
data2 = pd.read_csv('lianjia2.csv', encoding='gbk')
data3 = pd.read_csv('lianjia3.csv', encoding='utf-8')
data4 = pd.read_csv('lianjia4.csv', encoding='utf-8')
data5 = pd.read_csv('lianjia5.csv', encoding='utf-8')
data6 = pd.read_csv('lianjia6.csv', encoding='utf-8')
data7 = pd.read_csv('lianjia7.csv', encoding='utf-8')
data = pd.concat([data1, data2, data3, data4, data5,data6,data7])
data.dropna(inplace=True)
np.unique(data.cjshijian.str.contains('2015-'))
array([False, True])
data = data[data.cjshijian.str.contains('2015-')]
data = data[['cjdanjia','cjxiaoqu','cjlouceng']]
data.head(2)
|
cjdanjia |
cjxiaoqu |
cjlouceng |
| 0 |
43997元/平 |
红莲北里 3室1厅 57平 |
南 北/高楼层/6层 |
| 1 |
36969元/平 |
红莲南里 1室1厅 43平 |
南/高楼层/7层 |
data['cjdanjia'] = data.cjdanjia.str.replace('元/平','').astype(np.float32).map(lambda x : round(x/10000, 2))
data = data[data.cjxiaoqu.str.split().map(len) ==3]
data = data.assign(xiaoqu= data.cjxiaoqu.map(lambda x :x.split()[0]))
data = data.assign(huxing= data.cjxiaoqu.map(lambda x :x.split()[1]))
data = data.assign(mianji= data.cjxiaoqu.map(lambda x :x.split()[2]))
del data['cjxiaoqu']
data = data.assign(chaoxiang = data.cjlouceng.map(lambda x:x.split('/')[0]))
data = data.assign(louceng = data.cjlouceng.map(lambda x:x.split('/')[1]))
del data['cjlouceng']
top15 = data.xiaoqu.value_counts()[:15].index
top15
Index(['新龙城', '北京新天地', '北京像素南区', '远洋山水', '芍药居北里', '天通西苑三区', '荣丰2008', '天通苑东一区',
'天通苑中苑', '北京像素北区', '青年汇佳园', '海特花园小区', '天通西苑二区', '东亚上北中心', '沿海赛洛城'],
dtype='object')
data = data[data.xiaoqu.isin(top15)]
data.head()
|
cjdanjia |
xiaoqu |
huxing |
mianji |
chaoxiang |
louceng |
| 31 |
6.62 |
荣丰2008 |
1室1厅 |
32平 |
南 |
低楼层 |
| 347 |
3.97 |
远洋山水 |
1室--厅 |
56平 |
东 |
中楼层 |
| 388 |
3.01 |
北京像素北区 |
2室1厅 |
57平 |
西南 |
低楼层 |
| 716 |
2.90 |
北京像素北区 |
2室1厅 |
58平 |
东北 |
中楼层 |
| 260 |
3.81 |
沿海赛洛城 |
1室1厅 |
64平 |
东 |
低楼层 |
data['mianji'] = data.mianji.str.replace('平','').astype(np.float32)
data.head(3)
|
cjdanjia |
xiaoqu |
huxing |
mianji |
chaoxiang |
louceng |
| 31 |
6.62 |
荣丰2008 |
1室1厅 |
32.0 |
南 |
低楼层 |
| 347 |
3.97 |
远洋山水 |
1室--厅 |
56.0 |
东 |
中楼层 |
| 388 |
3.01 |
北京像素北区 |
2室1厅 |
57.0 |
西南 |
低楼层 |
data.chaoxiang.unique()
array(['南', '东', '西南', '东北', '南 北', '西 南', '东 南', '东 南 北', '西北', '东 北',
'东 西', '西', '北', '西 北', '东南', '南 西', '东 南 西', '西 南 北', '南 北 西',
'东 西 北', '南 西 北', '西南 东北', '南 北 东', '暂无数据', '北 东南', '北 西南',
'东 西 南', '东 北 南'], dtype=object)
data = data[data.chaoxiang != '暂无数据']
data = data.join(pd.get_dummies(data[['xiaoqu','huxing','louceng']]))
data
|
cjdanjia |
xiaoqu |
huxing |
mianji |
chaoxiang |
louceng |
xiaoqu_东亚上北中心 |
xiaoqu_北京像素北区 |
xiaoqu_北京像素南区 |
xiaoqu_北京新天地 |
... |
huxing_4室--厅 |
huxing_4室1厅 |
huxing_4室2厅 |
huxing_5室--厅 |
huxing_5室1厅 |
huxing_5室2厅 |
louceng_中楼层 |
louceng_低楼层 |
louceng_地下室 |
louceng_高楼层 |
| 31 |
6.62 |
荣丰2008 |
1室1厅 |
32.0 |
南 |
低楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 148 |
3.86 |
芍药居北里 |
1室1厅 |
43.0 |
南 |
地下室 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
| 260 |
3.81 |
沿海赛洛城 |
1室1厅 |
64.0 |
东 |
低楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 261 |
3.32 |
沿海赛洛城 |
1室1厅 |
57.0 |
东 |
中楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 285 |
3.96 |
沿海赛洛城 |
2室2厅 |
105.0 |
南 北 |
中楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 286 |
3.50 |
沿海赛洛城 |
1室1厅 |
51.0 |
东 |
低楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 342 |
2.84 |
东亚上北中心 |
1室--厅 |
41.0 |
南 |
中楼层 |
1 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 347 |
3.97 |
远洋山水 |
1室--厅 |
56.0 |
东 |
中楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 362 |
2.99 |
新龙城 |
2室1厅 |
95.0 |
南 北 |
低楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 365 |
2.80 |
新龙城 |
2室1厅 |
103.0 |
南 北 |
高楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
| 366 |
3.00 |
新龙城 |
2室2厅 |
101.0 |
南 |
中楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 367 |
3.45 |
新龙城 |
1室1厅 |
70.0 |
南 |
低楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 368 |
2.90 |
新龙城 |
2室2厅 |
97.0 |
东 西 |
中楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 370 |
3.35 |
新龙城 |
3室2厅 |
110.0 |
南 北 |
高楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
| 371 |
2.98 |
新龙城 |
2室2厅 |
100.0 |
南 北 |
高楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
| 372 |
3.36 |
新龙城 |
2室2厅 |
86.0 |
南 西 |
中楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 374 |
3.11 |
新龙城 |
3室2厅 |
128.0 |
南 北 西 |
低楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 375 |
2.86 |
新龙城 |
2室1厅 |
108.0 |
南 北 |
中楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 376 |
2.85 |
新龙城 |
2室1厅 |
100.0 |
南 北 |
低楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 388 |
3.01 |
北京像素北区 |
2室1厅 |
57.0 |
西南 |
低楼层 |
0 |
1 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 388 |
3.01 |
北京像素北区 |
2室1厅 |
57.0 |
西南 |
低楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 388 |
2.78 |
海特花园小区 |
2室1厅 |
74.0 |
东 西 |
中楼层 |
0 |
1 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 388 |
2.78 |
海特花园小区 |
2室1厅 |
74.0 |
东 西 |
中楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 403 |
2.70 |
北京新天地 |
2室1厅 |
89.0 |
西 北 |
中楼层 |
0 |
0 |
0 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 415 |
3.76 |
北京新天地 |
2室1厅 |
95.0 |
东南 |
高楼层 |
0 |
0 |
0 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
| 416 |
3.04 |
北京新天地 |
2室1厅 |
91.0 |
南 |
高楼层 |
0 |
0 |
0 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
| 417 |
3.21 |
北京新天地 |
2室1厅 |
88.0 |
东 北 |
低楼层 |
0 |
0 |
0 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 418 |
2.90 |
北京新天地 |
3室2厅 |
127.0 |
西南 |
低楼层 |
0 |
0 |
0 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 419 |
2.74 |
北京新天地 |
1室1厅 |
68.0 |
东 北 |
低楼层 |
0 |
0 |
0 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 420 |
2.98 |
北京新天地 |
1室1厅 |
61.0 |
东 |
中楼层 |
0 |
0 |
0 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| ... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
| 55606 |
3.05 |
北京像素南区 |
2室1厅 |
55.0 |
西南 |
中楼层 |
0 |
0 |
1 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 55607 |
3.06 |
北京像素南区 |
2室1厅 |
50.0 |
北 |
中楼层 |
0 |
0 |
1 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 55608 |
3.16 |
北京像素南区 |
2室2厅 |
50.0 |
西 |
低楼层 |
0 |
0 |
1 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 55633 |
3.04 |
北京像素北区 |
2室1厅 |
56.0 |
南 西 |
高楼层 |
0 |
1 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
| 55634 |
2.94 |
北京像素南区 |
4室1厅 |
52.0 |
东北 |
低楼层 |
0 |
0 |
1 |
0 |
... |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 55635 |
2.91 |
北京像素南区 |
2室1厅 |
55.0 |
南 西 |
低楼层 |
0 |
0 |
1 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 55636 |
3.01 |
北京像素南区 |
1室1厅 |
51.0 |
北 |
低楼层 |
0 |
0 |
1 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 55637 |
3.02 |
北京像素北区 |
2室1厅 |
58.0 |
西南 |
中楼层 |
0 |
1 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 55638 |
3.27 |
北京像素北区 |
2室1厅 |
41.0 |
北 |
中楼层 |
0 |
1 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 55639 |
2.81 |
北京像素南区 |
1室1厅 |
49.0 |
北 |
中楼层 |
0 |
0 |
1 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 55641 |
2.73 |
北京像素南区 |
4室--厅 |
50.0 |
北 |
低楼层 |
0 |
0 |
1 |
0 |
... |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 55642 |
3.12 |
北京像素北区 |
1室1厅 |
40.0 |
西 北 |
中楼层 |
0 |
1 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 56119 |
2.09 |
北京像素南区 |
1室--厅 |
50.0 |
南 |
中楼层 |
0 |
0 |
1 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 56120 |
3.58 |
北京像素北区 |
2室1厅 |
60.0 |
南 北 |
中楼层 |
0 |
1 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 56237 |
3.19 |
新龙城 |
3室2厅 |
128.0 |
南 北 西 |
中楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 56238 |
2.66 |
东亚上北中心 |
1室1厅 |
57.0 |
南 |
中楼层 |
1 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 56513 |
3.64 |
沿海赛洛城 |
1室2厅 |
70.0 |
东 西 |
中楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 56514 |
3.79 |
沿海赛洛城 |
1室1厅 |
70.0 |
南 |
高楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
| 56515 |
3.16 |
沿海赛洛城 |
2室1厅 |
115.0 |
南 北 |
高楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
| 56518 |
3.60 |
沿海赛洛城 |
1室1厅 |
72.0 |
南 西 |
高楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
| 56519 |
3.23 |
沿海赛洛城 |
1室1厅 |
55.0 |
东 |
高楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
| 56520 |
3.59 |
沿海赛洛城 |
1室1厅 |
55.0 |
东 |
低楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 56521 |
3.59 |
沿海赛洛城 |
2室1厅 |
96.0 |
西南 |
中楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 56700 |
2.59 |
东亚上北中心 |
1室--厅 |
37.0 |
西 |
中楼层 |
1 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 56702 |
2.43 |
东亚上北中心 |
1室1厅 |
57.0 |
西 |
低楼层 |
1 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 56956 |
3.59 |
新龙城 |
1室1厅 |
56.0 |
北 |
低楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 56957 |
2.20 |
天通苑东一区 |
2室1厅 |
94.0 |
东南 |
中楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 56960 |
3.11 |
新龙城 |
1室2厅 |
70.0 |
南 |
低楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 56961 |
3.18 |
新龙城 |
3室2厅 |
111.0 |
南 北 |
高楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
| 56962 |
2.83 |
新龙城 |
2室2厅 |
98.0 |
东 西 |
中楼层 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
1852 rows × 42 columns
data['dong'] = (data.chaoxiang.map(lambda x : '东' in x.split())).astype(np.int32)
data['xi'] = (data.chaoxiang.map(lambda x : '西' in x.split())).astype(np.int32)
data['nan'] = (data.chaoxiang.map(lambda x : '南' in x.split())).astype(np.int32)
data['bei'] = (data.chaoxiang.map(lambda x : '北' in x.split())).astype(np.int32)
data['dongnan'] = (data.chaoxiang.map(lambda x : '东南' in x.split())).astype(np.int32)
data['xinan'] = (data.chaoxiang.map(lambda x : '西南' in x.split())).astype(np.int32)
data['dongbei'] = (data.chaoxiang.map(lambda x : '东北' in x.split())).astype(np.int32)
data['xibei'] = (data.chaoxiang.map(lambda x : '西北' in x.split())).astype(np.int32)
data.columns
Index(['cjdanjia', 'xiaoqu', 'huxing', 'mianji', 'chaoxiang', 'louceng',
'xiaoqu_东亚上北中心', 'xiaoqu_北京像素北区', 'xiaoqu_北京像素南区', 'xiaoqu_北京新天地',
'xiaoqu_天通苑东一区', 'xiaoqu_天通苑中苑', 'xiaoqu_天通西苑三区', 'xiaoqu_天通西苑二区',
'xiaoqu_新龙城', 'xiaoqu_沿海赛洛城', 'xiaoqu_海特花园小区', 'xiaoqu_芍药居北里',
'xiaoqu_荣丰2008', 'xiaoqu_远洋山水', 'xiaoqu_青年汇佳园', 'huxing_1室--厅',
'huxing_1室1厅', 'huxing_1室2厅', 'huxing_2室--厅', 'huxing_2室1厅',
'huxing_2室2厅', 'huxing_2室3厅', 'huxing_3室--厅', 'huxing_3室1厅',
'huxing_3室2厅', 'huxing_3室3厅', 'huxing_4室--厅', 'huxing_4室1厅',
'huxing_4室2厅', 'huxing_5室--厅', 'huxing_5室1厅', 'huxing_5室2厅',
'louceng_中楼层', 'louceng_低楼层', 'louceng_地下室', 'louceng_高楼层', 'dong',
'xi', 'nan', 'bei', 'dongnan', 'xinan', 'dongbei', 'xibei'],
dtype='object')
data.drop(data.columns[[1,2,4,5]], axis=1, inplace=True)
data
|
cjdanjia |
mianji |
xiaoqu_东亚上北中心 |
xiaoqu_北京像素北区 |
xiaoqu_北京像素南区 |
xiaoqu_北京新天地 |
xiaoqu_天通苑东一区 |
xiaoqu_天通苑中苑 |
xiaoqu_天通西苑三区 |
xiaoqu_天通西苑二区 |
... |
louceng_地下室 |
louceng_高楼层 |
dong |
xi |
nan |
bei |
dongnan |
xinan |
dongbei |
xibei |
| 31 |
6.62 |
32.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
| 148 |
3.86 |
43.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
| 260 |
3.81 |
64.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 261 |
3.32 |
57.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 285 |
3.96 |
105.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
| 286 |
3.50 |
51.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 342 |
2.84 |
41.0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
| 347 |
3.97 |
56.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 362 |
2.99 |
95.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
| 365 |
2.80 |
103.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
1 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
| 366 |
3.00 |
101.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
| 367 |
3.45 |
70.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
| 368 |
2.90 |
97.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
| 370 |
3.35 |
110.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
1 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
| 371 |
2.98 |
100.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
1 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
| 372 |
3.36 |
86.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
| 374 |
3.11 |
128.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
| 375 |
2.86 |
108.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
| 376 |
2.85 |
100.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
| 388 |
3.01 |
57.0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 388 |
3.01 |
57.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 388 |
2.78 |
74.0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
| 388 |
2.78 |
74.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
| 403 |
2.70 |
89.0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
0 |
0 |
| 415 |
3.76 |
95.0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
... |
0 |
1 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 416 |
3.04 |
91.0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
... |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
| 417 |
3.21 |
88.0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
| 418 |
2.90 |
127.0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 419 |
2.74 |
68.0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
| 420 |
2.98 |
61.0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| ... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
| 55606 |
3.05 |
55.0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 55607 |
3.06 |
50.0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
| 55608 |
3.16 |
50.0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
| 55633 |
3.04 |
56.0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
1 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
| 55634 |
2.94 |
52.0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
| 55635 |
2.91 |
55.0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
| 55636 |
3.01 |
51.0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
| 55637 |
3.02 |
58.0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 55638 |
3.27 |
41.0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
| 55639 |
2.81 |
49.0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
| 55641 |
2.73 |
50.0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
| 55642 |
3.12 |
40.0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
0 |
0 |
| 56119 |
2.09 |
50.0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
| 56120 |
3.58 |
60.0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
| 56237 |
3.19 |
128.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
| 56238 |
2.66 |
57.0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
| 56513 |
3.64 |
70.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
| 56514 |
3.79 |
70.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
| 56515 |
3.16 |
115.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
1 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
| 56518 |
3.60 |
72.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
1 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
| 56519 |
3.23 |
55.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 56520 |
3.59 |
55.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 56521 |
3.59 |
96.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| 56700 |
2.59 |
37.0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
| 56702 |
2.43 |
57.0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
| 56956 |
3.59 |
56.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
| 56957 |
2.20 |
94.0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 56960 |
3.11 |
70.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
| 56961 |
3.18 |
111.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
1 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
| 56962 |
2.83 |
98.0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
1852 rows × 46 columns
Y = data.cjdanjia
X = data[[x for x in data.columns if x != 'cjdanjia']]
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, Y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
model.predict(X_test)
array([3.54665262, 1.84432528, 3.51138981, 3.98215708, 3.36580034,
1.88201581, 3.21580288, 2.33208225, 2.09160868, 4.1281406 ,
2.36620194, 4.21686701, 3.59944481, 2.06453045, 4.13269 ,
1.84432576, 2.43646187, 2.88226974, 2.33208225, 3.1939061 ,
4.03979213, 2.96406722, 1.81346511, 3.06775758, 3.05491474,
3.1360286 , 3.99916963, 4.32998919, 3.33511616, 1.94227664,
3.10201174, 3.21006363, 4.68719684, 2.9850422 , 2.53152937,
2.07719992, 3.1481618 , 2.15467268, 3.08020351, 4.64958521,
2.64958863, 2.84651008, 4.57702631, 3.4841747 , 4.16648524,
2.84844517, 2.97104674, 2.61040881, 4.32998919, 2.03134972,
4.01823018, 1.97383557, 2.41625926, 4.37021131, 1.77878054,
2.41629925, 3.09124581, 2.37530854, 4.73390401, 4.30862556,
1.80473525, 3.02924878, 2.83819801, 4.91761246, 4.40271337,
1.98365363, 3.61851997, 2.8662761 , 4.97066841, 3.00969724,
2.15917709, 2.79711081, 2.82625599, 5.75809283, 2.4868289 ,
2.00149098, 4.18132244, 2.32747275, 2.76499757, 2.42745698,
2.26339241, 3.16802316, 4.35109157, 2.49178652, 2.91376021,
2.84356037, 1.49141912, 2.27554011, 2.05316992, 4.29595972,
2.87189315, 4.90403619, 5.74139584, 3.09648816, 3.58979582,
3.17806313, 1.47075586, 2.06083945, 3.08564913, 2.6397108 ,
3.06089967, 2.96597686, 2.63565871, 3.20329316, 4.04179007,
3.17552503, 3.8474278 , 3.12522532, 3.04294964, 5.65158196,
4.21936542, 2.48125583, 4.50437188, 1.70229024, 2.07846376,
2.31880324, 3.46787309, 5.21671323, 1.97186316, 3.04086916,
3.99530084, 2.15917709, 2.91376021, 3.80513772, 2.97910429,
3.13648847, 4.08180629, 2.71471393, 3.07686345, 2.27709597,
5.04433487, 5.46478765, 2.73 , 5.81495598, 3.73140732,
3.02825421, 3.2108307 , 3.42007434, 2.26268721, 2.93784626,
4.12022006, 1.98955488, 2.4914239 , 1.73998077, 2.19861149,
2.21302025, 3.84180461, 4.99191227, 3.06621723, 4.30862556,
4.28137263, 1.49141912, 2.31767349, 2.75904334, 2.95716276,
2.55429561, 3.82162929, 5.75581726, 3.57589166, 2.36620194,
2.29128132, 3.10721108, 4.56052941, 2.87053392, 4.71962696,
1.7939048 , 2.74208975, 1.76956268, 2.41138165, 2.04103672,
2.85037354, 4.00545245, 3.96797116, 4.63925095, 1.63403144,
1.78186996, 2.84171639, 2.21918606, 2.83819801, 2.48125583,
3.50032946, 3.79765893, 4.15787802, 1.8339119 , 2.53200218,
2.70243355, 3.02025366, 5.71257831, 3.05491474, 2.00068069,
3.58162687, 2.94462235, 4.03751656, 2.22435034, 2.13035956,
5.7595965 , 4.07926215, 2.6109347 , 5.14267305, 2.7644082 ,
3.96172366, 2.17426963, 2.95286696, 1.87986081, 3.01209872,
2.77455019, 2.95722048, 3.38375406, 2.52907712, 5.85097804,
4.50701389, 2.27554011, 3.0551991 , 2.79978517, 2.4868289 ,
1.98787534, 2.93035767, 3.37035682, 1.28330249, 4.02510574,
5.06547241, 2.74208975, 1.32417066, 5.88439056, 2.77462455,
3.64819662, 3.8994612 , 2.90551035, 2.96186412, 5.66651034,
3.92157605, 2.77131832, 2.22295888, 3.60834399, 4.13269 ,
2.1595355 , 1.62033836, 3.1548615 , 3.45311976, 3.41147678,
3.0673323 , 1.11023806, 4.06216025, 3.2901411 , 3.66921306,
2.8792539 , 1.72041791, 4.26206491, 3.37035682, 3.08561177,
2.89592903, 3.5091269 , 1.95705219, 3.02518262, 4.27124282,
3.80789055, 4.16719131, 3.92157605, 1.27463079, 2.84739984,
2.96000937, 1.72893733, 2.68997097, 2.9407321 , 2.74463457,
5.73803455, 4.90092813, 2.72620481, 2.32347635, 3.09648816,
4.16719131, 4.14994499, 3.09390438, 4.03503639, 3.7603951 ,
4.49211408, 2.69030036, 1.84432528, 5.7595965 , 5.75011669,
2.59299853, 1.99806239, 3.16683734, 5.40511666, 2.33422325,
3.98237992, 2.63716069, 3.08783297, 2.81769255, 4.81379465,
2.25026695, 2.43873744, 3.10066662, 4.12974402, 4.47053858,
3.24793352, 2.28662038, 2.35850716, 3.73157758, 3.16819364,
5.77248894, 2.35406875, 3.41013891, 3.35260298, 2.09703088,
1.90196082, 5.87752 , 2.12163871, 4.36219971, 2.19861149,
3.3954847 , 2.36620194, 1.91987029, 2.29150473, 3.28853057,
3.38335151, 3.54665262, 2.34046354, 1.93805355, 2.10570201,
2.8647823 , 3.07788872, 3.31856327, 3.85633902, 2.37530854,
3.58162687, 4.93991617, 4.80343526, 2.06247571, 5.42440304,
3.07274133, 2.94738254, 1.91154883, 2.94738254, 2.28922917,
2.79609067, 3.04242284, 1.5246137 , 2.97139526, 5.64210216,
1.84751037, 3.57311149, 1.80471166, 2.68997097, 4.094869 ,
2.33208225, 2.07586054, 3.02801407, 2.90429662, 2.94498387,
3.36207152, 4.73201452, 2.77131832, 4.39715539, 3.69771697,
2.5022541 , 2.76014142, 2.57145054, 3.09305673, 4.48395754,
4.01106752, 4.56801743, 4.76138471, 2.74565984, 2.8647823 ,
3.10892712, 2.70258073, 4.30908727, 3.62047718, 1.73998077,
3.71646091, 3.01384545, 3.90988081, 5.77248894, 3.27533698,
4.4754162 , 2.81673203, 2.44735236, 4.06216025, 2.53754321,
2.34421545, 3.10595175, 2.18628583, 2.45211293, 1.94762194,
2.74565984, 3.07940979, 2.64198491, 2.65769094, 2.84844517,
5.63996838, 5.78932861, 4.20061103, 2.28922917, 4.6928115 ,
2.30828472, 3.30987163, 4.17763705, 2.76829328, 3.6811981 ,
1.73399418, 2.74208975, 5.15907818, 3.19928674, 2.86081081,
2.15749238, 3.0693235 , 3.0529984 , 6.07172079, 3.39088905,
4.00798884, 3.73297978, 4.37027911, 2.99536334, 2.09703088,
3.21580346, 3.23940796, 2.12195745, 3.3954847 , 2.85612515,
1.99806239, 3.11934427, 1.7319769 , 4.96083875, 1.60069586,
2.01280414, 1.79910779, 3.23642435, 3.59944481, 3.11186286,
2.54019806, 3.07274133, 4.18387977, 3.27675105, 2.90583092,
4.25455849, 3.68171193, 4.30233474, 3.04326422, 3.54817206,
3.1360286 , 3.0551991 , 1.69541738, 4.15505812, 2.22295888,
3.02949001, 2.42432868, 3.55613242, 2.56410218, 4.00798884,
2.32747275, 2.95716276, 2.42745698, 3.14624668, 3.67333406,
2.50482301, 2.68997097, 2.49842407, 4.91045928, 2.28662038,
4.10003098, 3.5634741 , 2.83478421])
from sklearn import metrics
metrics.mean_squared_error(model.predict(X_test), Y_test)
0.30372099459200635
model.coef_
array([-1.44087637e-02, -5.88458915e-01, -3.69944601e-01, -7.10786377e-01,
-1.40959934e-01, -9.55124367e-01, -7.43602416e-01, -8.44907451e-01,
-7.22158788e-01, 7.31838479e-03, 4.11108334e-01, -5.53607152e-01,
1.27018675e+00, 2.10620937e+00, 1.06421863e+00, 7.70508539e-01,
-1.43821100e+00, -7.65648863e-01, -7.25699895e-01, -3.85104209e-01,
-3.34462568e-01, -3.18062598e-01, -3.43394946e-02, 2.77555756e-16,
-1.07187887e-02, 1.28991791e-01, 6.57366427e-01, -5.82059811e-01,
4.07375703e-01, 4.13416185e-01, 7.77156117e-16, 1.23347953e+00,
1.75367759e+00, 5.38855506e-01, 4.30840975e-01, -1.52068518e+00,
5.50988700e-01, -4.01476142e-02, -1.82192128e-01, 5.27502507e-02,
-6.74488194e-02, -2.57261956e-02, -8.27544267e-03, -3.35368041e-01,
-3.54450446e-01])