Pandas 操作
pandas 的索引取值我是真的愤怒
一次总结
# 行选择
In [7]: df[1:5]
Out[7]:
date c1 c2 c3
1 2012-04-11 1 16 3
2 2012-04-12 7 6 1
3 2012-04-13 2 16 7
4 2012-04-14 4 17 7
# 区块选择
In [11]: data[:7][['rnd_1', 'rnd_2']]
df[1:5] # 按行切片取值
df[['c1','c2']] # 列选择
df[1:3][['c1','c2']] # 先行后列
df[1] # 报错(KeyError):单个标量会被当作列名查找,而不是按行位置取值;取单行请用 df.iloc[1] 或 df[1:2]
# df.loc
df.loc[1:5] # loc 按索引标签切片且包含终点,因此会把标签为 5 的行也选进去;而 df[1:5] 按位置切片,不含终点
# df.iloc
# df.at
df = df.reset_index(drop=True) # 丢弃索引
#
df = pd.read_csv(path,index_col=0) # 把第一列读作索引,避免多出一列 Unnamed: 0
pandas.pydata.org/docs/user_g…
垃圾索引耽误两小时(生气
pandas.pydata.org/docs/refere…
df = pd.read_csv('out.csv',index_col=0)
label_df = pd.read_csv('label_2021.csv',index_col=0)
judge = label_df.index.isin(df.index)
label = label_df[judge]
label.shape
日期操作
# int64index to datetime
import datetime as dt
# Convert the integer YYYYMMDD index labels to datetimes. With an explicit
# ``format`` the ``infer_datetime_format`` flag is redundant, and it is
# deprecated since pandas 2.0, so it is dropped here.
x = pd.to_datetime(list(df.index), format='%Y%m%d')
# int索引取值
def get_df_from_date(df, begin_date=0, end_date=0):
    """Return the rows of ``df`` whose index lies in ``[begin_date, end_date]``.

    Both bounds are inclusive and are compared directly against the index
    values, so they must be comparable with them (the call below uses
    integer ``YYYYMMDD`` labels such as ``20130415``).

    Args:
        df: Frame to filter by its index.
        begin_date: Lower bound, inclusive.
        end_date: Upper bound, inclusive.

    Returns:
        The matching rows in their original order, with all columns kept.
    """
    labels = df.index
    # Vectorised comparison instead of a per-row Python loop. The result is a
    # boolean array, which always selects rows -- an empty Python list (the
    # old mask for an empty frame) would be read as "select no columns".
    in_range = (labels >= begin_date) & (labels <= end_date)
    return df.loc[in_range]
df2 = get_df_from_date(df,20130415,20220607)
def date_move(date, gap_days):
    """Shift a ``YYYYMMDD`` date by ``gap_days`` calendar days.

    Args:
        date: The date as an int or string in ``YYYYMMDD`` form; it is
            passed through ``str()`` before parsing.
        gap_days: Number of days to add; negative values move backwards.

    Returns:
        The shifted date as an int in ``YYYYMMDD`` form.
    """
    start = dt.datetime.strptime(str(date), '%Y%m%d')
    shifted = start + dt.timedelta(days=gap_days)
    return int(shifted.strftime("%Y%m%d"))
d = date_move(20140101,250)
groupby
并行
from joblib import Parallel, delayed
r = Parallel(n_jobs=20)(delayed(compute_zcore_from_path)(path,year_size) for path in path_list)
len(r)
pyplot 画图总结
import matplotlib.pyplot as plt
plt.plot(x,y)
plt.show()
plt.figure(figsize=(6, 6.5)) # 设置图片大小(需在 plt.plot 之前调用才会作用于该图)
进阶
plt.hist(x,bins=20) # 直方图
\