pandas库的sort_values、set_index、reset_index、cumsum、groupby、concat的用法

本文已参与「新人创作礼」活动，一起开启掘金创作之路。

1、sort_values、set_index、reset_index、cumsum、groupby函数的用法

import pandas as pd


#sort_values()函数是按照选中索引所在列的原素进行排序
df=pd.DataFrame({'A':[3,1,1,6,7],'B':['a','d','c','b','e'],'C':[123,343,122,978,459]})

print(df.sort_values('A'))
# 结果
'''
   A  B    C
1  1  d  343
2  1  c  122
0  3  a  123
3  6  b  978
4  7  e  459

'''

#如果多个索引，按照索引先后顺序，依次在上一次索引排序后再对重复排序的内部进行排序
print(df.sort_values(['A','B']))
# 结果
'''
   A  B    C
1  1  e  343
2  1  t  122
0  3  a  123
3  6  o  978
4  7  r  459
'''

# set_index,rset_index：重新设置某一列作为索引/恢复某一列的索引
df=pd.DataFrame({'A':['a','a','e','a','e'],'B':[4,6,5,7,5],'C':[i for i in range(23,28)]})
print(df.set_index('A'))
# 结果
'''
   B   C
A       
a  4  23
a  6  24
e  5  25
a  7  26
e  5  27

'''
sets=df.set_index(['A','B'])
print(sets)
# 结果
'''
      C
A B    
a 4  23
  6  24
e 5  25
a 7  26
e 5  27
'''
sets1=sets.reset_index(['A','B'])
print(sets1)
#结果
'''
   A  B   C
0  a  4  23
1  a  6  24
2  e  5  25
3  a  7  26
4  e  5  27

'''

# cumsum:计算轴向元素累加和，返回由中间结果组成的数组
import numpy as np
# 是2*2*3的数组
arr  = np.array([[[1,2,3],[8,9,12]],[[1,2,4],[2,4,5]]])
print(arr.cumsum(0))
# 结果
'''
[[[ 1  2  3]
  [ 8  9 12]]

 [[ 2  4  7]
  [10 13 17]]]
'''
print(arr.cumsum(1))
# 结果
'''
[[[ 1  2  3]
  [ 9 11 15]]

 [[ 1  2  4]
  [ 3  6  9]]]
'''
print(arr.cumsum(2))
# 结果
'''
[[[ 1  3  6]
  [ 8 17 29]]

 [[ 1  3  7]
  [ 2  6 11]]]
'''

# groupby
df=pd.DataFrame({'A':['age','bwr','age','bwr','dfd'],'B':[1,2,1,5,6],'C':[345,23,345,35,33]})
# 分组后并没有进行计算，下面在'B'列求了平均
print(df.groupby('A')['B'].mean())

2、concat的用法

举个例子：将如下12个月份如图一的表合并成合并成图二的表（这里我已经将inst_id设置为索引，之后都是以它连接）

图一：

图二：

import pandas as pd
import numpy as np

# 读取excel
df1 = pd.read_excel('合同1.xls')
# 将sinst_id的数据类型转化成string类型
df1['sinst_id'] = df1.sinst_id.astype(np.str)
# 将sinst_id设置成行索引
df1 = df1.set_index('sinst_id')

df2 = pd.read_excel('合同2.xls')
df2['sinst_id'] = df2.sinst_id.astype(np.str)
df2 = df2.set_index('sinst_id')

df3 = pd.read_excel('合同3.xls')
df3['sinst_id'] = df3.sinst_id.astype(np.str)
df3 = df3.set_index('sinst_id')

df4 = pd.read_excel('合同4.xls')
df4['sinst_id'] = df4.sinst_id.astype(np.str)
df4 = df4.set_index('sinst_id')

df5 = pd.read_excel('合同5.xls')
df5['sinst_id'] = df5.sinst_id.astype(np.str)
df5 = df5.set_index('sinst_id')

df6 = pd.read_excel('合同6.xls')
df6['sinst_id'] = df6.sinst_id.astype(np.str)
df6 = df6.set_index('sinst_id')

df7 = pd.read_excel('合同7.xls')
df7['sinst_id'] = df7.sinst_id.astype(np.str)
df7 = df7.set_index('sinst_id')

df8 = pd.read_excel('合同8.xls')
df8['sinst_id'] = df8.sinst_id.astype(np.str)
df8 = df8.set_index('sinst_id')

df9 = pd.read_excel('合同9.xls')
df9['sinst_id'] = df9.sinst_id.astype(np.str)
df9 = df9.set_index('sinst_id')

df10 = pd.read_excel('合同10.xls')
df10['sinst_id'] = df10.sinst_id.astype(np.str)
df10 = df10.set_index('sinst_id')

df11 = pd.read_excel('合同11.xls')
df11['sinst_id'] = df11.sinst_id.astype(np.str)
df11 = df11.set_index('sinst_id')

df12 = pd.read_excel('合同12.xls')
df12['sinst_id'] = df12.sinst_id.astype(np.str)
df12 = df12.set_index('sinst_id')


frames = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12]

# 连接表axis=1时是表横向连接，keys可以设置连接后的列索引，join为外连接
result = pd.concat(frames, axis=1, keys=['{}月'.format(i) for i in range(1, 13)], join='outer')

print(result)

# 存储进excel，index=True时，存储索引，header是存储以前的行索引
result.to_excel('合同总.xls', sheet_name='Sheet1', index=True, header=True)