导入模块:
import pandas as pd
data=pd.read_csv('C:/Users/第四章/4.2.1 数据排序/数据排序.csv',
engine='python',encoding='utf-8')
1.排序:按年龄升序/性别降序排序
sortData=data.sort_values(
by=['年龄','性别'],
ascending=[True,False])
#查找空缺值数量并进行排序
full_null=full.sinull().sum()
full_null=full_null[full_null>0]
full_null.sort_values(inplace=True)
2.查找重复值:
#1.以整行为准查询
dIndex1=data.duplicated()
print(dIndex)
#2.以某列为准查询
dIndex2=data.duplicated(['性别'])
print(dIndex2)
print(data[data.duplicated()])
#删除重复值
cData=data.drop_duplicates()
3.缺失值处理:
#1.缺失值处理:均值填充
data['消费']=data.消费.fillna(data.消费.mean())
print(data)
#2.缺失值处理:丢失
cData=data.dropna(axis=0,how='any')
print(cData)
#3.对字符串前后的空格消除
data['name']=data['name'].str.strip()
4.字符转换:
#1.转换为字符串/数值型:astype
data['电话号码']=data.电话号码.astype(str)
data['数值型电话号码']=data.电话号码.astype(float)
#2.转换时间为年-月-日:pd.to_datetime
data['时间']=pd.to_datetime(
data.注册时间,
format='%Y/%m/%d')
print(data.时间)
#3.时间格式化:dt.strftime
data['年月']=data.时间.dt.strftime('%Y-%m')
print(data.年月)
5.拆分:
#1.字段拆分:slice,数值型要记得先转换成字符型
bands=data['tel'].str.slice(0,3)
areas=data['tel'].str.slice(3,7)
#2.按分隔符拆分:split,分割为n+1列,是否展开
newData=data['name'].str.split(' ',1,True)
#3.时间属性抽取:dt.year/month/day/weekday/hour/minute/second
data['时间.年']=data['时间'].dt.year
6.记录抽取:
#1.记录抽取:title包含关键字
fData=data[data.title.str.contains('电台',na=False)]
#2.空值抽取
fData=data[data.title.isnull()]
#3.数据范围抽取
fData=data[data.comments between(1000,10000)]
#4.时间范围抽取
from datetime import datetime
dt1=datetime(year=2015,month=1,day=1)
dt2=datetime(year=2015,month=12,day=31)
fData=data[(data.ptime>=dt1)&(data.ptime<=dt2)]
#5.组合条件抽取:且& 或| 取反~
fData=data[~data.title.str.contains('电台',na=False)]
7.随机抽样:sample(n,frac,replace,n和frec取一,默认不放回
sData=data.sample(n=3)
sData=data.sample(frac=0.2,replace=True)
8.合并:
#1.记录合并:concat根据列名合并
data=pd.concat([data1,data2,data3])
#2.字段合并:记得先转换为字符串
tel=data['band']+data['area']+data['num']
#3.字段匹配:merge,inner/outer/left/right
itemPrice=pd.merge(items,
prices,
left_on='id',
right_on='id')
10.数据标准化:
data['消费']=round(
(
data.消费-data.消费.min()
)/(
data.消费.max()-data.消费a.min()
),2
)
11.数据分组:cut(x,bins,right,labels)
data.cost.min()
data.cost.max()
bins=[0,20,40,60,80,100]
label=['0-20','20-40','40-60','60-80','80-100']
#right=False默认左闭合,label自定义列标
data['cut']=pd.cut(data.cost,bins,right=False,labels=label)
12.描述性统计:
#1.describe()/var/std/min/max/count
data.sales.describe()
data.sales.var()
#查找众数后填充
data['sales'].head()
data['sales'].value_counts()
#或者直接填充众数
data['sales']=data['sales'].fillna(data['sales'].mode()[0])
# 前向后填充
data['sales']=data['sales'].fillna(method='ffill'))
# 后向前填充
data['sales']=data['sales'].fillna(method='bfill'))
#2.分位数统计,nearest表示最近的只
data.sales.quantile(0.3,interpolation='nearest')
13.其他统计分析:
#1.分组分析:groupby(列名).agg(统计函数),as_index分组的列是否作为索引列
#按性别分组,按年龄均值统计
ga=data.groupby(
by=['gender'],as_index=False)['age'].agg('mean')
#按gender分组,并把其中值赋给age
ga=data.groupby('gender')['age'].transform(lambda x:x.median())
#2.结构分析:计算百分比
ga/ga.sum()
#3.分布分析:年龄太多需要分段统计cut
bins=[0,20,30,40,100]
ageLabels=['20岁及以下','21-30岁','31-40岁','41岁以上']
data['年龄分层']=pd.cut(
data.年龄,
bins,
labels=ageLabels)
aggResult=data.groupby(['年龄分层']('用户ID').agg('count'))
pAggResult=round(
aggResult/aggResult.sum(),
4)*100
#字符串格式化:
pAggResult.map('{:,.2f}%'.format)
#4.交叉分析pivot_table,数据透视表
#行为年龄分组,列为性别,统计ID人数count
ptResult=data.pivot_table(
values='用户ID',
index='年龄分层',
columns='性别',
aggfunc='count')
#从时间中获取天数dt.days
from datetime import datetime
data['DateDiff']=datetime.now()-data['DealDateTime']
data['DateDiff']=data['DateDiff'].dt.days
14.RFM分析:RFM=100R_S+10F_S+1*M_S
#按ID统计最近消费日期间隔
R_Agg=data.groupby(
by=['customerId'],
as_index=False)['DateDiff'].agg('min')
#按ID统计时间段消费总次数
F_Agg=data.groupby(
by=['customerId'],
as_index=False)['OrderId'].agg('count')
#按ID统计时间段消费金额总和
M_Agg=data.groupby(
by=['customerId'],
as_index=False)['Sales'].agg('sum')
#关联统计结果
aggData=R_Agg.merge(F_Agg).merge(M_Agg)
aggData.columns=['customerId','RAgg','FAgg','MAgg']
#按用户最近消费距离列从小到大求出相应的数据点
bins=aggData.RAgg.quantile(q=[0,0.2,0.4,0.6,0.8,1],interpolation='nearest')
#对得分进行赋值:从大到小
bins[0]=0
rLabels=[5,4,3,2,1]
R_S=pd.cut(aggData.RAgg,bins,labels=rLabels)
#按用户消费频次列从小到大求出相应的数据点
bins=aggData.FAgg.quantile(q=[0,0.2,0.4,0.6,0.8,1],interpolation='nearest')
#对得分进行赋值:从小到大
bins[0]=0
fLabels=[1,2,3,4,5]
F_S=pd.cut(aggData.FAgg,bins,labels=fLabels)
#按用户消费金额列从小到大求出相应的数据点
bins=aggData.MAgg.quantile(q=[0,0.2,0.4,0.6,0.8,1],interpolation='nearest')
#对得分进行赋值:从小到大
bins[0]=0
mLabels=[1,2,3,4,5]
M_S=pd.cut(aggData.MAgg,bins,labels=mLabels)
aggData['R_S']=R_S
aggData['F_S']=F_S
aggData['M_S']=M_S
#计算RFM总得分
aggData['RFM']=100*R_S.astype(int)+10*F_S.astype(int)+1*M_S.astype(int)
bins=aggData.RFM.quantile(q=[0,0.125,0.25,0.375,0.5,0.625,0.75,0.875,1],interpolation='nearest')
bins[0]=0
rfmLabels=[1,2,3,4,5,6,7,8]
aggData['level']=pd.cut(aggData.RFM,
bins,labels=rfmLabels)
aggData.groupby(
by='level')['CustomerID'].agg('count')
15.相关分析和回归分析:
#1.2.相关系数(矩阵)corr
data['人口'].corr(data['文盲率'])
corrMatrix=data[['超市购物率','网购率','文盲率','人口']].corr()
#回归分析:简单线性回归,scatter/line/bar/barth/hist/box/pie/scatter
data.plot('广告费(万元)','销售额(万元)',kind='scatter')
#估计模型参数
x=data['广告费(万元)']
y=data['销售额(万元)']
from sklearn.linear_model import LinearRegression
lrModel=LinearRegression()
lrModel.fit(x,y)
#模型参数
lrModel.coef_
lrModel.intercept_
#模型检验R^2
lrModel.score(x,y)
#模型预测:根据x预测y
lrModel.predict(pre_x)
#3.多重线性回归
x=data[['广告费','客流量']]
y=data['销售额']
lrModel=LinearRegression()
lrModel.fit(x,y)
lrModel.coef_
lrModel.intercept_
lrModel.score(x,y)
16.数据可视化:
import matplotlib.pyplot as plt
#1.散点图和颜色
plt.scatter(data['广告费'],data['购买用户数'],c='R')
#2.坐标轴名称/颜色
plt.xlabel('广告费',color='k',fontproperties=font)
plt.ylabel('购买用户数',color='k',fontproperties=font)
#3.坐标轴刻度
plt.xticks(color='k',fontproperties=fon)
#打开新画布
plt.figure()
#5.为图形增加图例
legend=plt.legend(labels=['促销','不促销'],prop=font)
#6.通过s调整marker的大小
plt.scatter(
data[data['促销']=='是']['广告费'],
data[data['促销']=='是']['购买用户数'],
c='r',marker='o',
s=data[data['促销']=='是']['渠道数']*80)
17.常见图形:
#1.矩阵图scatter
#记得先按一定标准进行分组统计均值或总数,得到aggData(含省份/月消费/月流量)
font=matplotlib.font_manager.FontProperties(fname='C:/Users/SourceHanSansCN-Light.otf',size=20)
labelFont=matplotlib.font_manager.FontProperties(fname='C:/Users/SourceHanSansCN-Light.otf',size=35)
mainColor=(91/255,155/255,213/255,1)
fontColor=(110/225,110/225,110/225,1)
fig=plt.figure()
gap=0.01
xMin=aggData['月消费(元)'].min()*(1-gap)
xMax=aggData['月消费(元)'].max()*(1+gap)
yMin=aggData['月流量(MB)'].min()*(1-gap)
yMax=aggData['月流量(MB)'].max()*(1+gap)
#设置坐标轴的范围
plt.xlim(xMin,xMax)
plt.ylim(yMin,yMax)
#去掉坐标轴的刻度
#plt.xticks([])
#plt.yticks([])
#绘制散点图
plt.scatter(aggData['月消费(元)'],aggData['月流量(MB)'],s=300,marker='o',color=mainColor)
plt.xlabel('人均月消费(元)',color=fontColor,fontproperties=labelFont)
plt.ylabel('人均月流量(MB)',color=fontColor,fontproperties=labelFont)
#绘制均值线
plt.vlines(x=aggData['月消费(元)'].mean(),ymin=yMin,ymax=yMax,linewidth=1,color=mainColor)
plt.hlines(y=aggData['月流量(MB)'].mean(),xmin=xMin,xmax=xMax,linewidth=1,color=mainColor)
#标注象限标记
plt.text(xMax-0.5,yMax-5,'I',color=fontColor,fontsize=50)
plt.text(xMin,yMax-5,'',color=fontColor,fontsize=50)
plt.text(xMin,yMin,'III',color=fontColor,fontsize=50)
plt.text(xMax-0.6,yMin,'IV',color=fontColor,fontsize=50)
#画标签
for i,r in aggData.iterrows():
plt.text(r['月消费(元)']+0.25,
r['月流量(MB)']-1,
r['省份'],
color=fontColor,
fontproperties=font)
plt.show()
#2.折线图plot
plt.plot(x,y,'-',color='r')
#3.饼图pie
#设置横纵轴相等的圆形
plt.axis('equal')
maleColor=(91/255,155/255,213/255,0.5)
femaleColor=(91/255,155/255,213/255,1)
plt.pie(x,labels=['男','女'],colors=[femaleColor,maleColor],
autopct='%.1f%%',textprops={'fontproperties':font})
#4.柱形图bar/条形图barh
plt.bar(index,result['月消费(元)'])
#5.箱线图
plt.box()