本文已参与「新人创作礼」活动,一起开启掘金创作之路。
1、sort_values、set_index、reset_index、cumsum、groupby函数的用法
import pandas as pd
#sort_values()函数是按照选中索引所在列的原素进行排序
df=pd.DataFrame({'A':[3,1,1,6,7],'B':['a','d','c','b','e'],'C':[123,343,122,978,459]})
print(df.sort_values('A'))
# 结果
'''
A B C
1 1 d 343
2 1 c 122
0 3 a 123
3 6 b 978
4 7 e 459
'''
#如果多个索引,按照索引先后顺序,依次在上一次索引排序后再对重复排序的内部进行排序
print(df.sort_values(['A','B']))
# 结果
'''
A B C
1 1 e 343
2 1 t 122
0 3 a 123
3 6 o 978
4 7 r 459
'''
# set_index,rset_index:重新设置某一列作为索引/恢复某一列的索引
df=pd.DataFrame({'A':['a','a','e','a','e'],'B':[4,6,5,7,5],'C':[i for i in range(23,28)]})
print(df.set_index('A'))
# 结果
'''
B C
A
a 4 23
a 6 24
e 5 25
a 7 26
e 5 27
'''
sets=df.set_index(['A','B'])
print(sets)
# 结果
'''
C
A B
a 4 23
6 24
e 5 25
a 7 26
e 5 27
'''
sets1=sets.reset_index(['A','B'])
print(sets1)
#结果
'''
A B C
0 a 4 23
1 a 6 24
2 e 5 25
3 a 7 26
4 e 5 27
'''
# cumsum:计算轴向元素累加和,返回由中间结果组成的数组
import numpy as np
# 是2*2*3的数组
arr = np.array([[[1,2,3],[8,9,12]],[[1,2,4],[2,4,5]]])
print(arr.cumsum(0))
# 结果
'''
[[[ 1 2 3]
[ 8 9 12]]
[[ 2 4 7]
[10 13 17]]]
'''
print(arr.cumsum(1))
# 结果
'''
[[[ 1 2 3]
[ 9 11 15]]
[[ 1 2 4]
[ 3 6 9]]]
'''
print(arr.cumsum(2))
# 结果
'''
[[[ 1 3 6]
[ 8 17 29]]
[[ 1 3 7]
[ 2 6 11]]]
'''
# groupby
df=pd.DataFrame({'A':['age','bwr','age','bwr','dfd'],'B':[1,2,1,5,6],'C':[345,23,345,35,33]})
# 分组后并没有进行计算,下面在'B'列求了平均
print(df.groupby('A')['B'].mean())
2、concat的用法
举个例子: 将如下12个月份如图一的表合并成合并成图二的表(这里我已经将inst_id设置为索引,之后都是以它连接)
图一:
图二:
import pandas as pd
import numpy as np
# 读取excel
df1 = pd.read_excel('合同1.xls')
# 将sinst_id的数据类型转化成string类型
df1['sinst_id'] = df1.sinst_id.astype(np.str)
# 将sinst_id设置成行索引
df1 = df1.set_index('sinst_id')
df2 = pd.read_excel('合同2.xls')
df2['sinst_id'] = df2.sinst_id.astype(np.str)
df2 = df2.set_index('sinst_id')
df3 = pd.read_excel('合同3.xls')
df3['sinst_id'] = df3.sinst_id.astype(np.str)
df3 = df3.set_index('sinst_id')
df4 = pd.read_excel('合同4.xls')
df4['sinst_id'] = df4.sinst_id.astype(np.str)
df4 = df4.set_index('sinst_id')
df5 = pd.read_excel('合同5.xls')
df5['sinst_id'] = df5.sinst_id.astype(np.str)
df5 = df5.set_index('sinst_id')
df6 = pd.read_excel('合同6.xls')
df6['sinst_id'] = df6.sinst_id.astype(np.str)
df6 = df6.set_index('sinst_id')
df7 = pd.read_excel('合同7.xls')
df7['sinst_id'] = df7.sinst_id.astype(np.str)
df7 = df7.set_index('sinst_id')
df8 = pd.read_excel('合同8.xls')
df8['sinst_id'] = df8.sinst_id.astype(np.str)
df8 = df8.set_index('sinst_id')
df9 = pd.read_excel('合同9.xls')
df9['sinst_id'] = df9.sinst_id.astype(np.str)
df9 = df9.set_index('sinst_id')
df10 = pd.read_excel('合同10.xls')
df10['sinst_id'] = df10.sinst_id.astype(np.str)
df10 = df10.set_index('sinst_id')
df11 = pd.read_excel('合同11.xls')
df11['sinst_id'] = df11.sinst_id.astype(np.str)
df11 = df11.set_index('sinst_id')
df12 = pd.read_excel('合同12.xls')
df12['sinst_id'] = df12.sinst_id.astype(np.str)
df12 = df12.set_index('sinst_id')
frames = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12]
# 连接表axis=1时是表横向连接,keys可以设置连接后的列索引,join为外连接
result = pd.concat(frames, axis=1, keys=['{}月'.format(i) for i in range(1, 13)], join='outer')
print(result)
# 存储进excel,index=True时,存储索引,header是存储以前的行索引
result.to_excel('合同总.xls', sheet_name='Sheet1', index=True, header=True)