Pandas是基于numpy构建的数据分析包
Series: 类似一维数组
-
pd.Series(data, index = index) # 基于ndarray,内部的数据要整齐划一,数据类型必须相同
import pandas as pd
pd.__version__
>>>>'1.4.2'
a = pd.Series([2, 0, -4, 12])
print(a.values)
print(a.index)
>>>>[ 2 0 -4 12]
>>>>RangeIndex(start=0, stop=4, step=1)
s = pd.Series(np.random.randn(5), index=['a','b','c','d','e'])
print(s.values)
print(s.index)
s.index = ['6','2','3','4','5']
print(s)
>>>> [ 1.12122062 3.25524235 -0.39420016 0.78466055 -0.22310155]
>>>> Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
6 1.121221
2 3.255242
3 -0.394200
4 0.784661
5 -0.223102
>>>> dtype: float64
di = {'a':1,'b':2,'c':3}
tmp = pd.Series(di) # 字典作为数据源创建Series,不需要设置index
tmp
>>>>> a 1
b 2
c 3
>>>>> dtype: int64
s.describe()
>>>>>> count 5.000000
mean 0.908764
std 1.461620
min -0.394200
25% -0.223102
50% 0.784661
75% 1.121221
max 3.255242
dtype: float64
print(s1.append(s2, ignore_index=True))这样可以忽略原始Series对象中的索引
Series中的向量化操作与布尔索引
pandas向量化操作基于numpy实现,在numpy中向量化意味着并行处理
-
布尔表达式提取
-
向量进向量出
- 不像c或java一样用for循环来实现类似操作
Series里的切片操作
s = pd.Series([1,2,3,4,5], index = ['a','b','c','d','e'])
print(s[1:3])
print(s['a':'c'])
>>>>>
b 2
c 3
dtype: int64
a 1
b 2
c 3
dtype: int64

Series里的缺失值
np.nan
Series中的删除与添加操作
删除:drop()
s = pd.Series([1,2,3,4,5], index = ['a','b','c','d','e'])
print(s.drop('a')) # 返回的是删除后的新对象(副本),不会改变原series。这里传入的是 index = 'a'
print(s.drop(['a', 'c'])) # 一次drop多个
print(s) # s不会变
print(s.drop('a', inplace = True)) # inplace=True 会就地修改原series,此时方法返回 None,所以打印出来是 None
print(s) # 的确变了
>>>>>>
b 2
c 3
d 4
e 5
dtype: int64
b 2
d 4
e 5
dtype: int64
a 1
b 2
c 3
d 4
e 5
dtype: int64
None
b 2
c 3
d 4
e 5
dtype: int64
添加:append() ----- 能把一个Series对象整体追加到前一个Series对象后面(注意:append 在新版 pandas 中已弃用,建议改用 pd.concat)
s = pd.Series([1,2,3,4,5], index = ['a','b','c','d','e'])
b = pd.Series(np.random.rand(3))
s.append(b)
>>>>>>>
a 1.000000
b 2.000000
c 3.000000
d 4.000000
e 5.000000
0 0.925709
1 0.204938
2 0.070487
dtype: float64
s.append(b, ignore_index = True)
>>>>>>>
0 1.000000
1 2.000000
2 3.000000
3 4.000000
4 5.000000
5 0.925709
6 0.204938
7 0.070487
dtype: float64
Series中的name属性
s.name = 'long'
s.index.name = '标签'
print(s)
>>>>>>
标签
a 1
b 2
c 3
d 4
e 5
Name: long, dtype: int64
DataFrame:类似二维数组
DataFrame是带标签的二维数组,可以由若干个一维数组(Series)构成
df = pd.DataFrame({"no":[2,34,45,23]})
df
>>>>>>>>
no
0 2
1 34
2 45
3 23
df = pd.DataFrame({"no":[2,34,45,23], "name":['a','b','c','d'], "score":[21,23,34,45]})
df
>>>>>>
no name score
0 2 a 21
1 34 b 23
2 45 c 34
3 23 d 45
numpy的二维数组可以转化为DataFrame对象
data1 = np.random.randint(1,10,9).reshape(3,3)
df1 = pd.DataFrame(data1)
df1
>>>>>
0 1 2
0 2 4 3
1 9 6 4
2 8 9 9
df2 = pd.DataFrame(data1, columns=['three','two','one'], index = ['a', 'b', 'c'])
df2
>>>>>>
three two one
a 2 4 3
b 9 6 4
c 8 9 9
如何使用Series构成DataFrame
import numpy as np
import pandas as pd
row1 = pd.Series(np.arange(3), index=['one','two','three'])
row2 = pd.Series(np.arange(3), index=['a','b','c'])
row1.name = 'Series1'
row2.name = 'Series2'
df3 = pd.DataFrame([row1, row2])
df3
>>>>>
one two three a b c
Series1 0.0 1.0 2.0 NaN NaN NaN
Series2 NaN NaN NaN 0.0 1.0 2.0
df3.T # transpose 转置
# 等价于 df3.transpose();两者都返回转置后的结果,并不修改原 df3
>>>>
Series1 Series2
one 0.0 NaN
two 1.0 NaN
three 2.0 NaN
a NaN 0.0
b NaN 1.0
c NaN 2.0
访问DataFrame中的列与行
- 如何访问DataFrame中的一列或多列
# 如何访问DataFrame中的一列或多列
df3.columns
>>>>> Index(['one', 'two', 'three', 'a', 'b', 'c'], dtype='object')
df3.columns.values
>>>>> array(['one', 'two', 'three', 'a', 'b', 'c'], dtype=object)
df3.columns.values[0]
>>>>>'one'
df3.one
>>>>>
Series1 0.0
Series2 NaN
Name: one, dtype: float64
df3['one']
>>>>>
Series1 0.0
Series2 NaN
Name: one, dtype: float64
df3[['one','two']]
>>>>>>
one two
Series1 0.0 1.0
Series2 NaN NaN
- 如何访问DataFrame中的一行或多行
df3[:1]
>>>>>
one two three a b c
Series1 0.0 1.0 2.0 NaN NaN NaN
df3[:0]
>>>>>
one two three a b c
df3.loc['Series1']
>>>>>
one 0.0
two 1.0
three 2.0
a NaN
b NaN
c NaN
Name: Series1, dtype: float64
df3['Series1':'Series2']
>>>>>
one two three a b c
Series1 0.0 1.0 2.0 NaN NaN NaN
Series2 NaN NaN NaN 0.0 1.0 2.0
df3.loc[['Series1','Series2']]
>>>>>
one two three a b c
Series1 0.0 1.0 2.0 NaN NaN NaN
Series2 NaN NaN NaN 0.0 1.0 2.0
df3.iloc[:1,1:]
>>>>>>
two three a b c
Series1 1.0 2.0 NaN NaN NaN
df3.iloc[1]
>>>>>>>
one NaN
two NaN
three NaN
a 0.0
b 1.0
c 2.0
Name: Series2, dtype: float64
df3.iloc[0:1]
>>>>>>>
one two three a b c
Series1 0.0 1.0 2.0 NaN NaN NaN
DataFrame中的删除操作
import pandas as pd
data = {'one':[1,2,3], 'two':[4,5,6], 'three':[7,8,9]}
df3 = pd.DataFrame(data)
df3
>>>>>
one two three
0 1 4 7
1 2 5 8
2 3 6 9
c3 = df3['three']
type(c3)
>>>>>
pandas.core.series.Series
df3.drop('three', axis = 'columns') # 等价于 df3.drop('three', axis = 1)
# 返回的是删除后的新DataFrame(副本),原始DataFrame数据没有变化
>>>>>
one two
0 1 4
1 2 5
2 3 6
id(df3)
>>>>>>
140607990434448
id(df3.drop('three', axis = 'columns'))
>>>>>>>
140607990355232
df3.drop('three', axis = 1, inplace = True)
df3
>>>>>>>
one two
0 1 4
1 2 5
2 3 6
del df3['three'] # 注意:上面已用 inplace=True 删除过 'three',此处需先重新创建 df3 再执行,否则会 KeyError
df3
>>>>>>>>
one two
0 1 4
1 2 5
2 3 6
df3.drop(0, axis = 0)
>>>>>>>>
one two three
1 2 5 8
2 3 6 9
df3.drop([0,1], axis = 0)
>>>>>>>>
one two three
2 3 6 9
轴方向
column axis = 1 删第n列
row axis = 0 删第n行
DataFrame中的添加操作
import pandas as pd
from numpy.random import randint
df4 = pd.DataFrame(columns = ['attr1', 'attr2', 'attr3'])
df4
>>>>>>
attr1 attr2 attr3
for index in range(5):
    df4.loc[index] = ['name ' + str(index)] + list(randint(10, size = 2))
df4
>>>>>>>
attr1 attr2 attr3
new_row2 name5 11 22
0 name 0 8 9
1 name 1 3 9
2 name 2 0 7
3 name 3 2 5
4 name 4 1 6
df4.loc['new_row'] = 3 # 广播默默扩成三列
df4
>>>>>>>
attr1 attr2 attr3
new_row2 name5 11 22
0 name 0 8 9
1 name 1 3 9
2 name 2 0 7
3 name 3 2 5
4 name 4 1 6
new_row 3 3 3
df4.loc['new_row2'] = ['name5', 11, 22]
df4
>>>>>>>>
attr1 attr2 attr3
new_row2 name5 11 22
0 name 0 8 9
1 name 1 3 9
2 name 2 0 7
3 name 3 2 5
4 name 4 1 6
new_row 3 3 3
# append()方法将一个DataFrame对象整体追加到另一个DF
df1 = pd.DataFrame({'a':[1,2,3,4], 'b':[5,7,6,8]})
df1.append(df4)
>>>>>>>
/var/folders/9h/ls1wx4x51_n30pt4d31030z80000gn/T/ipykernel_8033/2788910709.py:4: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df1.append(df4)
a b attr1 attr2 attr3
0 1.0 5.0 NaN NaN NaN
1 2.0 7.0 NaN NaN NaN
2 3.0 6.0 NaN NaN NaN
3 4.0 8.0 NaN NaN NaN
new_row2 NaN NaN name5 11.0 22.0
0 NaN NaN name 0 8.0 9.0
1 NaN NaN name 1 3.0 9.0
2 NaN NaN name 2 0.0 7.0
3 NaN NaN name 3 2.0 5.0
4 NaN NaN name 4 1.0 6.0
new_row NaN NaN 3 3.0 3.0
df1.append(df4, ignore_index = True)
>>>>>>>>
/var/folders/9h/ls1wx4x51_n30pt4d31030z80000gn/T/ipykernel_8033/1110611899.py:1: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df1.append(df4, ignore_index = True)
a b attr1 attr2 attr3
0 1.0 5.0 NaN NaN NaN
1 2.0 7.0 NaN NaN NaN
2 3.0 6.0 NaN NaN NaN
3 4.0 8.0 NaN NaN NaN
4 NaN NaN name5 11.0 22.0
5 NaN NaN name 0 8.0 9.0
6 NaN NaN name 1 3.0 9.0
7 NaN NaN name 2 0.0 7.0
8 NaN NaN name 3 2.0 5.0
9 NaN NaN name 4 1.0 6.0
10 NaN NaN 3 3.0 3.0
# 添加一列
df1['col1'] = 1
df1
>>>>>>>>
a b col1
0 1 5 1
1 2 7 1
2 3 6 1
3 4 8 1
df1['col2'] = [1,2] # error报错, 长度不匹配。列表/数组的长度必须与行数相等;只有赋单个标量时才会广播
df8 = pd.DataFrame({'a':[1,2,3,4], 'b':[5,7,6,8]})
df9 = pd.DataFrame({'a':[5,7,6,8], 'b':[1,2,3,4]})
pd.concat([df8, df9], axis = 1) # 水平堆叠
>>>>>>>>>
a b a b
0 1 5 5 1
1 2 7 7 2
2 3 6 6 3
3 4 8 8 4
pd.concat([df8, df9], axis = 0) # 垂直堆叠
>>>>>>>>>>
a b
0 1 5
1 2 7
2 3 6
3 4 8
0 5 1
1 7 2
2 6 3
3 8 4
Panel:类似三维数组
基于Pandas的文件读取与分析
read_csv()
pd.read_csv("0901-1001.csv", index_col = 0)
# index_col=0——第一列为index值
# index_col=False——强制不使用文件中的任何列作为index,自动生成默认的 RangeIndex
# index_col 默认值(index_col = None)——由 pandas 自动推断,通常也是生成默认的 RangeIndex
DataFrame中的常用属性
dtypes 返回各个列的数据类型
import pandas as pd
df = pd.read_csv("0901-1001.csv", index_col = 0)
Last executed at 2022-11-14 17:50:16 in 1.76s
df.dtypes
>>>>>>>
Last executed at 2022-11-14 17:50:21 in 6ms
city_guid object
city_name object
create_time object
hour int64
creator object
logtime object
task_guid int64
task_type int64
task_group int64
collect_cnt int64
recommed bool
is_accept int64
recommend_station_cnt float64
recommedareaguid object
main_station_guid object
sub_station_guid object
trace_id object
biz_type int64
pt int64
is_recommend int64
dtype: object
df['is_recommend'].dtype
>>>>>>>>>>
Last executed at 2022-11-14 17:50:42 in 5ms
dtype('int64')
df.columns
>>>>>>>>>
Last executed at 2022-11-14 17:50:58 in 4ms
Index(['city_guid', 'city_name', 'create_time', 'hour', 'creator', 'logtime',
'task_guid', 'task_type', 'task_group', 'collect_cnt', 'recommed',
'is_accept', 'recommend_station_cnt', 'recommedareaguid',
'main_station_guid', 'sub_station_guid', 'trace_id', 'biz_type', 'pt',
'is_recommend'],
dtype='object')
df.axes
df.axes
>>>>>>>>>>
Last executed at 2022-11-14 17:51:04 in 4ms
[Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... 280469, 280470, 280471, 280472, 280473, 280474, 280475, 280476, 280477, 280478],
dtype='int64', length=280479),
Index(['city_guid', 'city_name', 'create_time', 'hour', 'creator', 'logtime',
'task_guid', 'task_type', 'task_group', 'collect_cnt', 'recommed',
'is_accept', 'recommend_station_cnt', 'recommedareaguid',
'main_station_guid', 'sub_station_guid', 'trace_id', 'biz_type', 'pt',
'is_recommend'],
dtype='object')]
df.ndim
>>>>>>>>
Last executed at 2022-11-14 17:51:26 in 3ms
2
df.shape
df.shape
>>>>>>>>>
Last executed at 2022-11-14 17:51:40 in 3ms
(280479, 20)
df.size
>>>>>>>>>
Last executed at 2022-11-14 17:51:56 in 6ms
5609580
df.values # 返回数值部分,类似于一个没有行标签和列标签的Numpy数组
>>>>>>>>>>>
Last executed at 2022-11-14 17:52:07 in 195ms
array([['7911A3BA9D3B464F80C7C482B0FC80B9', '九江市',
'Sun Sep 05 22:24:15 GMT+08:00 2021', ..., 2, 20210905, 0],
['39CA4DFE94E64790912B9C4EEC92F921', '蚌埠市',
'Sun Sep 05 22:25:56 GMT+08:00 2021', ..., 2, 20210905, 0],
['9CD7EAAB07E340EE98558667E7ADB614', '乌海市',
'Sun Sep 05 22:25:58 GMT+08:00 2021', ..., 2, 20210905, 0],
...,
['2C158C9739CD4546A516B1368B422673', '襄阳市',
'Tue Sep 07 17:21:14 GMT+08:00 2021', ..., 2, 20210907, 0],
['4BD0974A8F08467D9DADCCD33D2D046D', '天津市',
'Tue Sep 07 08:46:35 GMT+08:00 2021', ..., 2, 20210907, 1],
['D9DBDF2F159143778C7748C47B262BB4', '石家庄市',
'Tue Sep 07 08:48:56 GMT+08:00 2021', ..., 2, 20210907, 0]],
dtype=object)
DataFrame中的常用方法
DataFrame的条件过滤
df[df.task_type == 2005] # 取出task_type = 2005的行
DataFrame的切片操作
# 切片操作
df['task_type']
>>>>>>>
Last executed at 2022-11-15 15:48:26 in 7ms
0 2006
1 2008
2 2005
3 2005
4 2005
...
280474 2008
280475 2008
280476 2005
280477 2006
280478 2005
Name: task_type, Length: 280479, dtype: int64
df[['task_type', 'city_name']]
>>>>>>>>
Last executed at 2022-11-15 15:48:58 in 15ms
task_type city_name
0 2006 九江市
1 2008 蚌埠市
2 2005 乌海市
3 2005 乌海市
4 2005 乌海市
... ... ...
280474 2008 石家庄市
280475 2008 石家庄市
280476 2005 襄阳市
280477 2006 天津市
280478 2005 石家庄市
280479 rows × 2 columns
df[1:10][['task_type', 'city_name']]
>>>>>>>>>
Last executed at 2022-11-15 15:49:13 in 10ms
task_type city_name
1 2008 蚌埠市
2 2005 乌海市
3 2005 乌海市
4 2005 乌海市
5 2006 厦门市
6 2037 长沙市
7 2005 赣州市
8 2037 兴义市
9 2037 乌鲁木齐市
df.loc[1:10][['task_type', 'city_name']] # 注意:loc 的标签切片是闭区间,包含终点 10,与 df[1:10] 不同
>>>>>>>>>>>
Last executed at 2022-11-15 15:49:26 in 19ms
task_type city_name
1 2008 蚌埠市
2 2005 乌海市
3 2005 乌海市
4 2005 乌海市
5 2006 厦门市
6 2037 长沙市
7 2005 赣州市
8 2037 兴义市
9 2037 乌鲁木齐市
10 2008 昆明市
DataFrame的排序操作
df.sort_values(by = ['city_name', 'task_type'], ascending = [True,False]).head() # 多字段按数组顺序进行升序/降序排序
Pandas的聚合和分组操作
聚合 agg
df.task_type.agg(['min', 'max', 'mean', 'median'])
》》》》》》》
Last executed at 2022-11-15 16:05:12 in 10ms
min 2005.000000
max 2037.000000
mean 2009.801044
median 2006.000000
Name: task_type, dtype: float64
df.task_type.agg(['min', 'max', np.mean, 'median'])
》》》》》》》》
Last executed at 2022-11-15 16:05:50 in 15ms
min 2005.000000
max 2037.000000
mean 2009.801044
median 2006.000000
Name: task_type, dtype: float64
df.task_type.agg(['skew', 'kurt'])
》》》》》》》》》
Last executed at 2022-11-15 16:07:13 in 19ms
skew 2.356637
kurt 3.871395
Name: task_type, dtype: float64
df.agg({'task_type':['max', 'min'], 'task_group':['mean', 'std']})
》》》》》》》》》》
Last executed at 2022-11-15 16:08:10 in 19ms
task_type task_group
max 2037.0 NaN
mean NaN 7.301220
min 2005.0 NaN
std NaN 1.959963
分组 groupby
df.groupby(['task_type']).mean() # 分组统计
》》》》》》》
Last executed at 2022-11-15 16:10:33 in 139ms
hour task_guid task_group collect_cnt recommed is_accept recommend_station_cnt biz_type pt is_recommend
task_type
2005 23.0 2.401619e+18 8.0 0.0 0.127221 0.127221 2.221534 2.0 2.021092e+07 0.584008
2006 23.0 2.403262e+18 9.0 0.0 0.097440 0.097440 1.747522 2.0 2.021092e+07 0.542277
2008 23.0 2.403350e+18 4.0 0.0 0.070315 0.070315 3.712045 2.0 2.021092e+07 0.776563
2014 23.0 2.402599e+18 5.0 0.0 0.060964 0.060964 2.623677 2.0 2.021092e+07 0.646888
2016 23.0 2.410908e+18 5.0 0.0 0.036920 0.036920 2.584148 2.0 2.021092e+07 0.631857
2020 23.0 2.429077e+18 5.0 0.0 0.636364 0.636364 3.954545 2.0 2.021093e+07 0.818182
2031 23.0 2.406969e+18 5.0 0.0 0.047796 0.047796 2.774665 2.0 2.021092e+07 0.623215
2037 23.0 2.400964e+18 8.0 0.0 0.006265 0.006265 0.375335 2.0 2.021092e+07 0.088922
df.groupby(['task_type'])[['task_group']].mean() # 分组统计指定列
》》》》》》》》》
Last executed at 2022-11-15 16:11:27 in 24ms
task_group
task_type
2005 8
2006 9
2008 4
2014 5
2016 5
2020 5
2031 5
2037 8
df.groupby(['task_type'])[['task_group']].agg(['mean','skew']) #
分组统计指定列
》》》》》》》》》
Last executed at 2022-11-15 16:42:46 in 48ms
task_group
mean skew
task_type
2005 8 0
2006 9 0
2008 4 0
2014 5 0
2016 5 0
2020 5 0
2031 5 0
2037 8 0
透视表
# 以index字段聚合,对value字段进行aggfunc
df.pivot_table(index=['task_type'], values=['task_group'], aggfunc = [np.mean,'sum'])
>>>>>>>>>>>>>>>
Last executed at 2022-11-15 17:28:30 in 48ms
mean sum
task_group task_group
task_type
2005 8 646936
2006 9 860472
2008 4 243988
2014 5 58395
2016 5 4740
2020 5 110
2031 5 16110
2037 8 217088
类SQL操作pandas
关键字:query
df.query("task_type == 2037").head()
code = 2037
df.query("task_type == @code").head() # @引入变量