pandas02：基础

import numpy as np
import pandas as pd

pd.__version__

'1.1.5'

文件的读取和写入

文件读取

使用pandas读取csv，excel，txt文件

df_csv = pd.read_csv('data/my_csv.csv')
df_csv

	col1	col2	col3	col4	col5
0	2	a	1.4	apple	2020/1/1
1	3	b	3.4	banana	2020/1/2
2	6	c	2.5	orange	2020/1/5
3	5	d	3.2	lemon	2020/1/7

df_txt = pd.read_table('data/my_table.txt')
df_txt

	col1	col2	col3	col4
0	2	a	1.4	apple 2020/1/1
1	3	b	3.4	banana 2020/1/2
2	6	c	2.5	orange 2020/1/5
3	5	d	3.2	lemon 2020/1/7

关于xlrd打不开xlsx文件问题
- xlrd更新到了2.0.1版本，只支持.xls文件。
- 可以用openpyxl代替xlrd打开.xlsx文件，安装命令：!pip3 install openpyxl

df_excel = pd.read_excel('data/my_excel.xlsx',engine='openpyxl')
df_excel

	col1	col2	col3	col4	col5
0	2	a	1.4	apple	2020/1/1
1	3	b	3.4	banana	2020/1/2
2	6	c	2.5	orange	2020/1/5
3	5	d	3.2	lemon	2020/1/7

# header=None，第一行不作为列名
pd.read_table('data/my_table.txt',header=None)

	0	1	2	3
0	col1	col2	col3	col4
1	2	a	1.4	apple 2020/1/1
2	3	b	3.4	banana 2020/1/2
3	6	c	2.5	orange 2020/1/5
4	5	d	3.2	lemon 2020/1/7

# index_col 表示把某一列/几列作为索引
pd.read_csv('data/my_csv.csv',index_col=['col1','col2'])

		col3	col4	col5
col1	col2
2	a	1.4	apple	2020/1/1
3	b	3.4	banana	2020/1/2
6	c	2.5	orange	2020/1/5
5	d	3.2	lemon	2020/1/7

# usecols 读取列的集合，默认读取所有列
pd.read_table('data/my_table.txt',usecols=['col1','col2'])

	col1	col2
0	2	a
1	3	b
2	6	c
3	5	d

# parse_dates 需要转化为时间的列
pd.read_csv('data/my_csv.csv',parse_dates=['col5'])

	col1	col2	col3	col4	col5
0	2	a	1.4	apple	2020-01-01
1	3	b	3.4	banana	2020-01-02
2	6	c	2.5	orange	2020-01-05
3	5	d	3.2	lemon	2020-01-07

# nrows读取行
pd.read_csv('data/my_csv.csv',nrows=2)

	col1	col2	col3	col4	col5
0	2	a	1.4	apple	2020/1/1
1	3	b	3.4	banana	2020/1/2

# 读取txt文件，自定义分隔符
pd.read_table('data/my_table_special_sep.txt')

	col1 |||| col2
0	TS |||| This is an apple.
1	GQ |||| My name is Bob.
2	WT |||| Well done!
3	PT |||| May I help you?

sep传入为正则参数

# The table above uses "||||" as its delimiter. `sep` is treated as a regular
# expression here, so each pipe is escaped; a raw string keeps the backslashes
# from being read as (invalid) Python string escape sequences.
# A regex separator requires the python parsing engine to be specified.
pd.read_table('data/my_table_special_sep.txt',sep=r'\|\|\|\|',engine='python')

	col1	col2
0	TS	This is an apple.
1	GQ	My name is Bob.
2	WT	Well done!
3	PT	May I help you?

数据写入

# 当索引没有特殊意义，将index设置为False
df_csv.to_csv('data/my_csv_saved.csv',index=False)
df_csv = pd.read_csv('data/my_csv_saved.csv')
df_csv

df_excel.to_excel('data/my_excel_saved.xlsx',index=False)

# 使用to_csv保存为txt文件，可使用sep自定分隔符
df_txt.to_csv('data/my_txt_saved.txt',sep='\t',index=False)

# 将表格转为markdown/latex，可使用to_markdown/to_latex函数
# 需要安装tabulate包
!pip3 install tabulate

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting tabulate
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c4/f4/770ae9385990f5a19a91431163d262182d3203662ea2b5739d0fcfc080f1/tabulate-0.8.7-py3-none-any.whl (24 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.8.7

print(df_csv.to_markdown())

|    |   col1 | col2   |   col3 | col4   | col5     |
|---:|-------:|:-------|-------:|:-------|:---------|
|  0 |      2 | a      |    1.4 | apple  | 2020/1/1 |
|  1 |      3 | b      |    3.4 | banana | 2020/1/2 |
|  2 |      6 | c      |    2.5 | orange | 2020/1/5 |
|  3 |      5 | d      |    3.2 | lemon  | 2020/1/7 |

print(df_csv.to_latex())

\begin{tabular}{lrlrll}
\toprule
{} &  col1 & col2 &  col3 &    col4 &      col5 \\
\midrule
0 &     2 &    a &   1.4 &   apple &  2020/1/1 \\
1 &     3 &    b &   3.4 &  banana &  2020/1/2 \\
2 &     6 &    c &   2.5 &  orange &  2020/1/5 \\
3 &     5 &    d &   3.2 &   lemon &  2020/1/7 \\
\bottomrule
\end{tabular}

基本数据结构

Series，存储一维values
DataFrame，存储二维的values

Series

data，序列的值
index，索引
- 可指定它的名字，默认为空
dtype，存储类型
name，序列的名字

s = pd.Series(data = [100,'a',{'dic1':5}],
              index = pd.Index(['id1',20,'third'], name='my_idx'),
              dtype = 'object',
              name = 'my_name')
s

my_idx
id1              100
20                 a
third    {'dic1': 5}
Name: my_name, dtype: object

object：混合类型。object代表了一种混合类型，正如上面的例子中存储了整数、字符串以及Python的字典数据结构。
此外，目前 pandas把纯字符串序列也默认认为是一种object类型的序列，但它也可以用string类型存储。

# 获取属性
s.values

array([100, 'a', {'dic1': 5}], dtype=object)

s.index

Index(['id1', 20, 'third'], dtype='object', name='my_idx')

s.dtype

dtype('O')

s.name

'my_name'

# 获取序列长度
s.shape

(3,)

# 索引
s['third']

{'dic1': 5}

DataFrame

DataFrame在Series基础上增加了列索引，一个数据框可以由二维的data与行列索引来构造

data = [[1,'a',1.2],[2,'b',2.2],[3,'c',3.2]]
df = pd.DataFrame(data = data,
                  index = ['row_%d'%i for i in range(3)],
                  columns=['col_0','col_1','col_2'])
df

	col_0	col_1	col_2
row_0	1	a	1.2
row_1	2	b	2.2
row_2	3	c	3.2

# 一般采用列索引名到数据的映射来构建数据框，再加上行索引
df = pd.DataFrame(data = {'col_0':[1,2,3],'col_1':list('abc'),'col_2':[1.2,2.2,3.2]},
                  index = ['row_%d'%i for i in range(3)])
df

	col_0	col_1	col_2
row_0	1	a	1.2
row_1	2	b	2.2
row_2	3	c	3.2

# 可用[col_name] 和 [col_list]取出相应的列
df['col_0']

row_0    1
row_1    2
row_2    3
Name: col_0, dtype: int64

df[['col_1','col_2']]

	col_1	col_2
row_0	a	1.2
row_1	b	2.2
row_2	c	3.2

# 取出相应的属性
df.values

array([[1, 'a', 1.2],
       [2, 'b', 2.2],
       [3, 'c', 3.2]], dtype=object)

df.index

Index(['row_0', 'row_1', 'row_2'], dtype='object')

df.columns

Index(['col_0', 'col_1', 'col_2'], dtype='object')

df.shape

(3, 3)

df.dtypes

col_0      int64
col_1     object
col_2    float64
dtype: object

# 可通过 .T 进行DataFrame转置
df.T

	row_0	row_1	row_2
col_0	1	2	3
col_1	a	b	c
col_2	1.2	2.2	3.2

常用基本函数

# learn_pandas.csv的虚拟数据集，它记录了四所学校学生的体测个人信息。
df = pd.read_csv('data/learn_pandas.csv')
df.columns
# 上述列名依次代表学校、年级、姓名、性别、身高、体重、是否为转系生、体测场次、测试时间、1000米成绩，本章只需使用其中的前七列。

Index(['School', 'Grade', 'Name', 'Gender', 'Height', 'Weight', 'Transfer',       'Test_Number', 'Test_Date', 'Time_Record'],
      dtype='object')

df = df[df.columns[:7]]
df
# 学校、年级、姓名、性别、身高、体重、是否为转系生

	School	Grade	Name	Gender	Height	Weight	Transfer
0	Shanghai Jiao Tong University	Freshman	Gaopeng Yang	Female	158.9	46.0	N
1	Peking University	Freshman	Changqiang You	Male	166.5	70.0	N
2	Shanghai Jiao Tong University	Senior	Mei Sun	Male	188.9	89.0	N
3	Fudan University	Sophomore	Xiaojuan Sun	Female	NaN	41.0	N
4	Fudan University	Sophomore	Gaojuan You	Male	174.0	74.0	N
...	...	...	...	...	...	...	...
195	Fudan University	Junior	Xiaojuan Sun	Female	153.9	46.0	N
196	Tsinghua University	Senior	Li Zhao	Female	160.9	50.0	N
197	Shanghai Jiao Tong University	Senior	Chengqiang Chu	Female	153.9	45.0	N
198	Shanghai Jiao Tong University	Senior	Chengmei Shen	Male	175.3	71.0	N
199	Tsinghua University	Sophomore	Chunpeng Lv	Male	155.7	51.0	N

200 rows × 7 columns

汇总函数

head，tail 返回表/序列的前/后n行，n默认为5
info，信息概况
describe，数值列对应的主要统计量

df.head(2)

	School	Grade	Name	Gender	Height	Weight	Transfer
0	Shanghai Jiao Tong University	Freshman	Gaopeng Yang	Female	158.9	46.0	N
1	Peking University	Freshman	Changqiang You	Male	166.5	70.0	N

df.tail(3)

	School	Grade	Name	Gender	Height	Weight	Transfer
197	Shanghai Jiao Tong University	Senior	Chengqiang Chu	Female	153.9	45.0	N
198	Shanghai Jiao Tong University	Senior	Chengmei Shen	Male	175.3	71.0	N
199	Tsinghua University	Sophomore	Chunpeng Lv	Male	155.7	51.0	N

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   School    200 non-null    object 
 1   Grade     200 non-null    object 
 2   Name      200 non-null    object 
 3   Gender    200 non-null    object 
 4   Height    183 non-null    float64
 5   Weight    189 non-null    float64
 6   Transfer  188 non-null    object 
dtypes: float64(2), object(5)
memory usage: 11.1+ KB

df.describe()

	Height	Weight
count	183.000000	189.000000
mean	163.218033	55.015873
std	8.608879	12.824294
min	145.400000	34.000000
25%	157.150000	46.000000
50%	161.900000	51.000000
75%	167.500000	65.000000
max	193.900000	89.000000

特征统计函数

常见函数
- sum：求和, mean:平均值 , median：中位数, var：方差, std：标准差, max：最大值, min：最小值
- quantile, 返回分位数
- count, 非缺失值个数
- idxmax,最大值对应的索引

df_demo = df[['Height','Weight']]

df_demo.mean()

Height    163.218033
Weight     55.015873
dtype: float64

df_demo.max()

Height    193.9
Weight     89.0
dtype: float64

df_demo.var()

Height     74.112805
Weight    164.462513
dtype: float64

df_demo.quantile()

Height    161.9
Weight     51.0
Name: 0.5, dtype: float64

df_demo.count()

Height    183
Weight    189
dtype: int64

df_demo.idxmax()

Height    193
Weight      2
dtype: int64

上面这些所有的函数，由于操作后返回的是标量，所以又称为聚合函数，它们有一个公共参数axis ，默认为0代表逐列聚合，如果设置为1则表示逐行聚合

df_demo.mean(axis=1).head()

0    102.45
1    118.25
2    138.95
3     41.00
4    124.00
dtype: float64

唯一值函数

观察某一列的唯一值
- unique，唯一值组成的列表
- nunique，唯一值的个数
- value_counts，可以得到唯一值和其对应出现的频数
观察多列组合的唯一值
- drop_duplicates，其中的关键参数是 keep
  - 默认值 first 表示每个组合保留第一次出现的所在行
  - last 表示保留最后一次出现的所在行，
  - False 表示把所有重复组合所在的行剔除

df['School'].unique()

array(['Shanghai Jiao Tong University', 'Peking University',       'Fudan University', 'Tsinghua University'], dtype=object)

df['School'].nunique()

#  唯一值和对应出现的频数
df['School'].value_counts()

Tsinghua University              69
Shanghai Jiao Tong University    57
Fudan University                 40
Peking University                34
Name: School, dtype: int64

# 观察多列组合的唯一值
df_demo = df[['Gender','Transfer','Name']]
# 默认保留第一次出现的重复行
df_demo.drop_duplicates(['Gender','Transfer'])

	Gender	Transfer	Name
0	Female	N	Gaopeng Yang
1	Male	N	Changqiang You
12	Female	NaN	Peng You
21	Male	NaN	Xiaopeng Shen
36	Male	Y	Xiaojuan Qin
43	Female	Y	Gaoli Feng

# 保留最后一次出现的重复行
df_demo.drop_duplicates(['Gender','Transfer'],keep='last')

	Gender	Transfer	Name
147	Male	NaN	Juan You
150	Male	Y	Chengpeng You
169	Female	Y	Chengquan Qin
194	Female	NaN	Yanmei Qian
197	Female	N	Chengqiang Chu
199	Male	N	Chunpeng Lv

# 去掉所有重复行
df_demo.drop_duplicates(['Name','Gender'],keep=False).head()

	Gender	Transfer	Name
0	Female	N	Gaopeng Yang
1	Male	N	Changqiang You
2	Male	N	Mei Sun
4	Male	N	Gaojuan You
5	Female	N	Xiaoli Qian

# 在Series上面使用
df['School'].drop_duplicates()

0    Shanghai Jiao Tong University
1                Peking University
3                 Fudan University
5              Tsinghua University
Name: School, dtype: object

替换函数

一般，替换操作都是针对某一列进行的
替换函数
- 映射替换
  - replace方法，支持特殊的方向替换（通过method参数指定）
    - method='ffill'：用前面最近的一个未被替换的值进行替换
    - method='bfill'：用后面最近的一个未被替换的值进行替换
  - str.replace
  - cat.codes
- 逻辑替换
  - where，传入条件为False进行替换
  - mask，传入条件为True进行替换
- 数值替换
  - round，按照给定精度四舍五入
  - abs，取绝对值
  - clip，截断

df['Gender'].replace({'Female':0, 'Male':1}).head()

0    0
1    1
2    1
3    0
4    1
Name: Gender, dtype: int64

df['Gender'].replace([0, 1],['Female', 'Male']).head()

0    Female
1      Male
2      Male
3    Female
4      Male
Name: Gender, dtype: object

# 特殊方向替换
s = pd.Series(['a',1,'b',2,1,1,'a'])
s

0    a
1    1
2    b
3    2
4    1
5    1
6    a
dtype: object

# 替换为前一个值
s.replace([1,2],method='ffill')

0    a
1    a
2    b
3    b
4    b
5    b
6    a
dtype: object

# 替换为后一个值
s.replace([1,2],method='bfill')

0    a
1    b
2    b
3    a
4    a
5    a
6    a
dtype: object

# 逻辑替换
s = pd.Series([-1,1.2345,100,-50])
s

0     -1.0000
1      1.2345
2    100.0000
3    -50.0000
dtype: float64

# where，条件false则替换
s.where(s<0,100) # 100为指定替换值，默认替换为NaN

0     -1.0
1    100.0
2    100.0
3    -50.0
dtype: float64

# mask，条件true则替换
s.mask(s<0,99)

0     99.0000
1      1.2345
2    100.0000
3     99.0000
dtype: float64

# 传入的条件只需是与被调用的 Series 索引一致的布尔序列
s_condition= pd.Series([True,False,False,True],index=s.index)
s.mask(s_condition, 99)

0     99.0000
1      1.2345
2    100.0000
3     99.0000
dtype: float64

# 数值替换
# 四舍五入2位数
s.round(2)

0     -1.00
1      1.23
2    100.00
3    -50.00
dtype: float64

# 绝对值
s.abs()

0      1.0000
1      1.2345
2    100.0000
3     50.0000
dtype: float64

# 截断
s.clip(0,2) # 上下截断边界

0    0.0000
1    1.2345
2    2.0000
3    0.0000
dtype: float64

排序函数

排序共有2种方式
- 值排序，sort_values
- 索引排序，sort_index

# 使用set_index将年级和姓名2列作为索引
df_demo = df[['Grade','Name','Height','Weight']].set_index(['Grade','Name'])
df_demo

		Height	Weight
Grade	Name
Freshman	Gaopeng Yang	158.9	46.0
Freshman	Changqiang You	166.5	70.0
Senior	Mei Sun	188.9	89.0
Sophomore	Xiaojuan Sun	NaN	41.0
Sophomore	Gaojuan You	174.0	74.0
...	...	...	...
Junior	Xiaojuan Sun	153.9	46.0
Senior	Li Zhao	160.9	50.0
	Chengqiang Chu	153.9	45.0
	Chengmei Shen	175.3	71.0
Sophomore	Chunpeng Lv	155.7	51.0

200 rows × 2 columns

# 对身高排序，默认参数ascending=True为升序
df_demo.sort_values('Height').head()

		Height	Weight
Grade	Name
Junior	Xiaoli Chu	145.4	34.0
Senior	Gaomei Lv	147.3	34.0
Sophomore	Peng Han	147.8	34.0
Senior	Changli Lv	148.7	41.0
Sophomore	Changjuan You	150.5	40.0

# 倒序
df_demo.sort_values('Height',ascending=False).head()

		Height	Weight
Grade	Name
Senior	Xiaoqiang Qin	193.9	79.0
	Mei Sun	188.9	89.0
	Gaoli Zhao	186.5	83.0
Freshman	Qiang Han	185.3	87.0
Senior	Qiang Zheng	183.9	87.0

多排序问题
体重相同，对身高进行排序
- 保持身高降序，体重升序

# 值排序
df_demo.sort_values(['Weight','Height'],ascending=[True,False]).head()

		Height	Weight
Grade	Name
Sophomore	Peng Han	147.8	34.0
Senior	Gaomei Lv	147.3	34.0
Junior	Xiaoli Chu	145.4	34.0
Sophomore	Qiang Zhou	150.5	36.0
Freshman	Yanqiang Xu	152.4	38.0

# 索引排序
# 元素值在索引中，需要指定索引层的名字/层号，用参数level表示
# 字符串排序由字母顺序决定
df_demo.sort_index(level=['Grade','Name'],ascending=[False,True]).head()

		Height	Weight
Grade	Name
Sophomore	Changjuan You	150.5	40.0
	Changmei Xu	151.6	43.0
	Changqiang Qian	167.6	64.0
	Chengli You	164.1	57.0
	Chengqiang Lv	166.8	53.0

apply方法

apply方法常用于DataFrame的行或者列迭代
apply的参数通常是一个以序列为输入的函数

# .mean函数
df_demo = df[['Height','Weight']]
def my_mean(x):
    """Return the mean of ``x`` by delegating to its own ``mean()`` method."""
    return x.mean()

df_demo.apply(my_mean)

Height    163.218033
Weight     55.015873
dtype: float64

# 用lambda表达式简化
# 求每列数据的均值
df_demo.apply(lambda x:x.mean()) # x代表被调用df_demo逐个输入的序列

Height    163.218033
Weight     55.015873
dtype: float64

# axis = 1 跨行，水平方向
# 求每行数据的均值
df_demo.apply(lambda x:x.mean(),axis=1).head()

0    102.45
1    118.25
2    138.95
3     41.00
4    124.00
dtype: float64

mad函数返回的是一个序列中偏离该序列均值的绝对值大小的均值.
例如序列1,3,7,10中，均值为5.25，每一个元素偏离的绝对值为4.25,2.25,1.75,4.75，这个偏离序列的均值为3.25。

# 利用apply计算身高和体重的mad指标
df_demo.apply(lambda x:(x-x.mean()).abs().mean())

Height     6.707229
Weight    10.391870
dtype: float64

# mad函数
df_demo.mad()

Height     6.707229
Weight    10.391870
dtype: float64

谨慎使用 apply
得益于传入自定义函数的处理， apply 的自由度很高，但这是以性能为代价的。一般而言，使用 pandas 的内置函数处理和 apply 来处理同一个任务，其速度会相差较多，因此只有在确实存在自定义需求的情境下才考虑使用 apply 。

窗口对象

滑动窗口rolling
扩张窗口expanding
加权窗口ewm

滑窗对象

对序列使用.rolling得到滑窗对象，其最重要的参数为窗口大小window
window
- window = 数值型int --> 计算统计量的观测值的数量，即从当前元素开始（包含当前元素），向前数window个，然后再使用后面定义的函数
- window = offset时间偏移量 --> 表示时间窗的大小

# 得到滑窗对象
s = pd.Series(np.arange(1,6,1))
roller = s.rolling(window = 3)
roller

Rolling [window=3,center=False,axis=0]

# 窗口包含当前行所在的元素
roller.mean()

0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
dtype: float64

解释：

s = pd.Series([1,2,3,4,5])	--> [1,2,3,4,5]

经过：s.rolling(window=3)  -->  
		从第0个下标，往前数3个，[空，空 ，1]
		从第1个下标，往前数3个，[空，1 ， 2]
		从第2个下标，往前数3个，[1， 2， 3]
		从第3个下标，往前数3个，[2， 3， 4]
		从第4个下标，往前数3个，[3， 4， 5]
然后得到的元素，再经过统计函数mean()进行计算。
第n个元素的值将是n，n-1和n-2元素的平均值
便形成了：
0    NaN
1    NaN
2    2.0
3    3.0
4    4.0

roller.sum()

0     NaN
1     NaN
2     6.0
3     9.0
4    12.0
dtype: float64

# 滑动协方差
s1 = pd.Series([1,4,18,29])
roller.cov(s1)

0     NaN
1     NaN
2     8.5
3    12.5
4     NaN
dtype: float64

# 滑动相关系数
roller.corr(s1)

0         NaN
1         NaN
2    0.936766
3    0.997609
4         NaN
dtype: float64

# 支持roller传入相关自定义函数
roller.apply(lambda x:x.mean())

0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
dtype: float64

类滑窗函数 shift，diff，pct_change
- 公共参数为periods=n，默认为1
- shift，取向前第n个元素的值
- diff，与向前第n个数做差
- pct_change，与向前第n个元素比计算增长率
- 其中，n可以为负，表示反向类似操作

# 输出s
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

# 向前取第2个元素
s.shift(2)

0    NaN
1    NaN
2    1.0
3    2.0
4    3.0
dtype: float64

# 向前移动2，与原数据做差
s.diff(2)
# 移动之后数据   原数据    
#  NaN            1   
#  NaN            2
#  1.0            3
#  2.0            4
#  3.0            5

0    NaN
1    NaN
2    2.0
3    2.0
4    2.0
dtype: float64

# 与向前第1个元素相比，计算增长率
s.pct_change(1) # 默认为1

0         NaN
1    1.000000
2    0.500000
3    0.333333
4    0.250000
dtype: float64

s.shift(-1)

0    2.0
1    3.0
2    4.0
3    5.0
4    NaN
dtype: float64

s.diff(-2)

0   -2.0
1   -2.0
2   -2.0
3    NaN
4    NaN
dtype: float64

类滑窗函数可以用窗口大小为n+1的rolling方法等价代替

# s.shift(2)
s.rolling(3).apply(lambda x:list(x)[0])

0    NaN
1    NaN
2    1.0
3    2.0
4    3.0
dtype: float64

# s.diff(2)
s.rolling(3).apply(lambda x:list(x)[-1]-list(x)[0])

0    NaN
1    NaN
2    2.0
3    2.0
4    2.0
dtype: float64

# s.pct_change()
def my_pct(x):
    """Return the relative change from the first to the last element of ``x``.

    ``x`` is materialised into a list first so that both lookups are
    positional.
    """
    window = list(x)
    first, last = window[0], window[-1]
    return last / first - 1
s.rolling(2).apply(my_pct)

0         NaN
1    1.000000
2    0.500000
3    0.333333
4    0.250000
dtype: float64

扩张窗口

扩张窗口又称累计窗口，可以理解为一个动态长度的窗口，其窗口的大小就是从序列开始处到具体操作的对应位置，其使用的聚合函数会作用于这些逐步扩张的窗口上。具体地说，设序列为a1, a2, a3, a4，则其每个位置对应的窗口即[a1]、[a1, a2]、[a1, a2, a3]、[a1, a2, a3, a4]。

s.expanding().mean()

0    1.0
1    1.5
2    2.0
3    2.5
4    3.0
dtype: float64

s.expanding().max()

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64

s.expanding().sum()

0     1.0
1     3.0
2     6.0
3    10.0
4    15.0
dtype: float64

s.expanding().apply(lambda x:x.prod())

0      1.0
1      2.0
2      6.0
3     24.0
4    120.0
dtype: float64

参考

DataWhale组队学习：Joyful Pandas

pandas中的窗口对象

	col1 \|\|\|\| col2
0	TS \|\|\|\| This is an apple.
1	GQ \|\|\|\| My name is Bob.
2	WT \|\|\|\| Well done!
3	PT \|\|\|\| May I help you?