0 准备
先开始常规的准备工作。
本次学习内容主要来自kaggle中的time series系列课程,练习中使用的数据来自kaggle中的比赛:www.kaggle.com/competition…
课程很棒,对我来说收获很多,感兴趣的小伙伴们可以去看看原课程。
from pathlib import Path
from warnings import simplefilter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
simplefilter("ignore") # ignore warnings to clean up output cells
# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 4))
plt.rc(
"axes",
labelweight="bold",
labelsize="large",
titleweight="bold",
titlesize=14,
titlepad=10,
)
plot_params = dict(
color="0.75",
style=".-",
markeredgecolor="0.25",
markerfacecolor="0.25",
legend=False,
)
%config InlineBackend.figure_format = 'retina'
# Load data
data_dir = Path("../input/ts-course-data")
tunnel = pd.read_csv(data_dir / "tunnel.csv", parse_dates=["Day"])
tunnel = tunnel.set_index("Day")
tunnel = tunnel.to_period() # to_period的用法参见文章最后的附录部分
1 线性回归(Linear Regression)
1.1 数据读取和处理
读取数据并且用date作为数据的索引。
import pandas as pd
df = pd.read_csv(
"../input/ts-course-data/book_sales.csv",
index_col='Date',
parse_dates=['Date'], # 解析date列为date格式
).drop('Paperback', axis=1) # 删除掉['Paperback']列
创建时间序列和滞后特征值
import numpy as np
# 根据索引按照顺序创建序列
df['Time'] = np.arange(len(df.index))
# 创建滞后的特征值
df['Lag_1'] = df['Hardcover'].shift(1) # 向下平移一位
df = df.reindex(columns=['Hardcover', 'Lag_1']) # 重新设置索引
在图片中查看回归线的走势
fig, ax = plt.subplots()
ax = sns.regplot(x='Lag_1', y='Hardcover', data=df, ci=None, scatter_kws=dict(color='0.25'))
ax.set_aspect('equal')
ax.set_title('Lag Plot of Hardcover Sales');
1.2 回归模型创建
用时间序列来训练线性回归的模型。
from sklearn.linear_model import LinearRegression
# Training data
X = df.loc[:, ['Time']] # features
y = df.loc[:, 'NumVehicles'] # target
# Train the model
model = LinearRegression()
model.fit(X, y)
# Store the fitted values as a time series with the same time index as
# the training data
y_pred = pd.Series(model.predict(X), index=X.index)
用滞后值来训练线性回归的模型。
df['Lag_1'] = df['NumVehicles'].shift(1)
from sklearn.linear_model import LinearRegression
X = df.loc[:, ['Lag_1']]
X.dropna(inplace=True) # drop missing values in the feature set
y = df.loc[:, 'NumVehicles'] # create the target
y, X = y.align(X, join='inner') # drop corresponding values in target
model = LinearRegression()
model.fit(X, y)
y_pred = pd.Series(model.predict(X), index=X.index)
# 在图形中查看
fig, ax = plt.subplots()
ax.plot(X['Lag_1'], y, '.', color='0.25')
ax.plot(X['Lag_1'], y_pred)
ax.set_aspect('equal')
ax.set_ylabel('NumVehicles')
ax.set_xlabel('Lag_1')
ax.set_title('Lag Plot of Tunnel Traffic');
ax = y.plot(**plot_params)
ax = y_pred.plot()
2 趋势(Trend)
其实线性回归也是在描述数据在时间序列上的趋势。
2.1 移动平均线
# 首先,导入数据
data_dir = Path("../input/ts-course-data")
tunnel = pd.read_csv(data_dir / "tunnel.csv", parse_dates=["Day"])
tunnel = tunnel.set_index("Day").to_period()
# 计算移动平均线
moving_average = tunnel.rolling(
window=365, # 365-day window
center=True, # puts the average at the center of the window
min_periods=183, # choose about half the window size
).mean() # compute the mean (could also do median, std, min, max, ...)
ax = tunnel.plot(style=".", color="0.5")
moving_average.plot(
ax=ax, linewidth=3, title="Tunnel Traffic - 365-Day Moving Average", legend=False,
);
2.2 创建特征矩阵
创建特征矩阵也可以使用pandas中的get_dummy,这里我们介绍DeterministicProcess。
from statsmodels.tsa.deterministic import DeterministicProcess
dp = DeterministicProcess(
index=tunnel.index, # dates from the training data
constant=True, # dummy feature for the bias (y_intercept)
order=1, # 1 linear 2 quadric 3 cubic
drop=True, # drop terms if necessary to avoid collinearity
)
# 这里创建特征矩阵
X = dp.in_sample()
from sklearn.linear_model import LinearRegression
y = tunnel["NumVehicles"] # the target
model = LinearRegression(fit_intercept=False)
model.fit(X, y)
y_pred = pd.Series(model.predict(X), index=X.index)
在图形中查看趋势。
ax = tunnel.plot(style=".", color="0.5", title="Tunnel Traffic - Linear Trend")
_ = y_pred.plot(ax=ax, linewidth=3, label="Trend")
利用上述得到的字段和模型进行预测。
# 首先进行预测
X = dp.out_of_sample(steps=30)
y_fore = pd.Series(model.predict(X), index=X.index)
# 对于预测的结果进行图形化的展示
ax = tunnel["2005-05":].plot(title="Tunnel Traffic - Linear Trend Forecast", **plot_params)
ax = y_pred["2005-05":].plot(ax=ax, linewidth=3, label="Trend")
ax = y_fore.plot(ax=ax, linewidth=3, label="Trend Forecast", color="C3")
_ = ax.legend()
3 周期(Seasonality)
3.1 Indicators
本质上是创建one hot encoding的离散型字段。
3.2 傅立叶特征(Fourier features)
- 计算方式:
import pandas as pd
import numpy as np
def fourier_features(index, freq, order):
time = np.arange(len(index), dtype=np.float32)
k = 2 * np.pi * (1 / freq) * time
features = {}
for i in range(1, order + 1):
features.update({
f"sin_{freq}_{i}": np.sin(i * k),
f"cos_{freq}_{i}": np.cos(i * k),
})
return pd.DataFrame(features, index=index)
- 画图方法
from pathlib import Path
from warnings import simplefilter
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
simplefilter("ignore")
# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
"axes",
labelweight="bold",
labelsize="large",
titleweight="bold",
titlesize=16,
titlepad=10,
)
plot_params = dict(
color="0.75",
style=".-",
markeredgecolor="0.25",
markerfacecolor="0.25",
legend=False,
)
def seasonal_plot(X, y, period, freq, ax=None):
if ax is None:
_, ax = plt.subplots()
palette = sns.color_palette("husl", n_colors=X[period].nunique(),)
ax = sns.lineplot(
x=freq,
y=y,
hue=period,
data=X,
ci=False,
ax=ax,
palette=palette,
legend=False,
)
ax.set_title(f"Seasonal Plot ({period}/{freq})")
for line, name in zip(ax.lines, X[period].unique()):
y_ = line.get_ydata()[-1]
ax.annotate(
name,
xy=(1, y_),
xytext=(6, 0),
color=line.get_color(),
xycoords=ax.get_yaxis_transform(),
textcoords="offset points",
size=14,
va="center",
)
return ax
def plot_periodogram(ts, detrend='linear', ax=None):
from scipy.signal import periodogram
fs = pd.Timedelta("365D") / pd.Timedelta("1D")
freqencies, spectrum = periodogram(
ts,
fs=fs,
detrend=detrend,
window="boxcar",
scaling='spectrum',
)
if ax is None:
_, ax = plt.subplots()
ax.step(freqencies, spectrum, color="purple")
ax.set_xscale("log")
ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
ax.set_xticklabels(
[
"Annual (1)",
"Semiannual (2)",
"Quarterly (4)",
"Bimonthly (6)",
"Monthly (12)",
"Biweekly (26)",
"Weekly (52)",
"Semiweekly (104)",
],
rotation=30,
)
ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
ax.set_ylabel("Variance")
ax.set_title("Periodogram")
return ax
data_dir = Path("../input/ts-course-data")
tunnel = pd.read_csv(data_dir / "tunnel.csv", parse_dates=["Day"])
tunnel = tunnel.set_index("Day").to_period("D")
- 画图查看规律
X = tunnel.copy()
# days within a week
X["day"] = X.index.dayofweek # the x-axis (freq)
X["week"] = X.index.week # the seasonal period (period)
# days within a year
X["dayofyear"] = X.index.dayofyear
X["year"] = X.index.year
fig, (ax0, ax1) = plt.subplots(2, 1, figsize=(11, 6))
seasonal_plot(X, y="NumVehicles", period="week", freq="day", ax=ax0)
seasonal_plot(X, y="NumVehicles", period="year", freq="dayofyear", ax=ax1);
- 查看周期性
plot_periodogram(tunnel.NumVehicles);
- 创建特征矩阵
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
fourier = CalendarFourier(freq="A", order=10) # 10 sin/cos pairs for "A"nnual seasonality
dp = DeterministicProcess(
index=tunnel.index,
constant=True, # dummy feature for bias (y-intercept)
order=1, # trend (order 1 means linear)
seasonal=True, # weekly seasonality (indicators)
additional_terms=[fourier], # annual seasonality (fourier)
drop=True, # drop terms to avoid collinearity
)
X = dp.in_sample() # create features for dates in tunnel.index
- 创建离散特征的两种方式
# pandas
holidays = (
holidays_events
.query("locale in ['National', 'Regional']")
.loc['2017':'2017-08-15', ['description']]
.assign(description=lambda x: x.description.cat.remove_unused_categories())
)
X_holidays = pd.get_dummies(holidays)
# sk_learn
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)
X_holidays = pd.DataFrame(
ohe.fit_transform(holidays),
index=holidays.index,
columns=holidays.description.unique(),
)
- 根据创建的特征做出预测 这里需要注意,阶数越高,对之前数据的拟合就越好,但是也容易导致后续预测时出现过拟合问题。
y = tunnel["NumVehicles"]
model = LinearRegression(fit_intercept=False)
_ = model.fit(X, y)
y_pred = pd.Series(model.predict(X), index=y.index)
X_fore = dp.out_of_sample(steps=90)
y_fore = pd.Series(model.predict(X_fore), index=X_fore.index)
ax = y.plot(color='0.25', style='.', title="Tunnel Traffic - Seasonal Forecast")
ax = y_pred.plot(ax=ax, label="Seasonal")
ax = y_fore.plot(ax=ax, label="Seasonal Forecast", color='C3')
_ = ax.legend()
4 循环(Cycle)
循环和周期之间的区别在于,周期和时间之间的关系更紧密,循环和时间之间的关系比较弱,也就是说循环的规律性比周期弱。 首先,我们需要绘制延迟值。
def lagplot(x, y=None, lag=1, standardize=False, ax=None, **kwargs):
from matplotlib.offsetbox import AnchoredText
x_ = x.shift(lag)
if standardize:
x_ = (x_ - x_.mean()) / x_.std()
if y is not None:
y_ = (y - y.mean()) / y.std() if standardize else y
else:
y_ = x
corr = y_.corr(x_)
if ax is None:
fig, ax = plt.subplots()
scatter_kws = dict(
alpha=0.75,
s=3,
)
line_kws = dict(color='C3', )
ax = sns.regplot(x=x_,
y=y_,
scatter_kws=scatter_kws,
line_kws=line_kws,
lowess=True,
ax=ax,
**kwargs)
at = AnchoredText(
f"{corr:.2f}",
prop=dict(size="large"),
frameon=True,
loc="upper left",
)
at.patch.set_boxstyle("square, pad=0.0")
ax.add_artist(at)
ax.set(title=f"Lag {lag}", xlabel=x_.name, ylabel=y_.name)
return ax
def plot_lags(x, y=None, lags=6, nrows=1, lagplot_kwargs={}, **kwargs):
import math
kwargs.setdefault('nrows', nrows)
kwargs.setdefault('ncols', math.ceil(lags / nrows))
kwargs.setdefault('figsize', (kwargs['ncols'] * 2, nrows * 2 + 0.5))
fig, axs = plt.subplots(sharex=True, sharey=True, squeeze=False, **kwargs)
for ax, k in zip(fig.get_axes(), range(kwargs['nrows'] * kwargs['ncols'])):
if k + 1 <= lags:
ax = lagplot(x, y, lag=k + 1, ax=ax, **lagplot_kwargs)
ax.set_title(f"Lag {k + 1}", fontdict=dict(fontsize=14))
ax.set(xlabel="", ylabel="")
else:
ax.axis('off')
plt.setp(axs[-1, :], xlabel=x.name)
plt.setp(axs[:, 0], ylabel=y.name if y is not None else x.name)
fig.tight_layout(w_pad=0.1, h_pad=0.1)
return fig
data_dir = Path("../input/ts-course-data")
flu_trends = pd.read_csv(data_dir / "flu-trends.csv")
flu_trends.set_index(
pd.PeriodIndex(flu_trends.Week, freq="W"),
inplace=True,
)
flu_trends.drop("Week", axis=1, inplace=True)
ax = flu_trends.FluVisits.plot(title='Flu Trends', **plot_params)
_ = ax.set(ylabel="Office Visits")
_ = plot_lags(flu_trends.FluVisits, lags=12, nrows=2)
_ = plot_pacf(flu_trends.FluVisits, lags=12)
创建滞后值。
def make_lags(ts, lags):
return pd.concat(
{
f'y_lag_{i}': ts.shift(i)
for i in range(1, lags + 1)
},
axis=1)
X = make_lags(flu_trends.FluVisits, lags=4)
X = X.fillna(0.0)
做出预测。
# Create target series and data splits
y = flu_trends.FluVisits.copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=60, shuffle=False)
# Fit and predict
model = LinearRegression() # `fit_intercept=True` since we didn't use DeterministicProcess
model.fit(X_train, y_train)
y_pred = pd.Series(model.predict(X_train), index=y_train.index)
y_fore = pd.Series(model.predict(X_test), index=y_test.index)
# 绘制预测值的图形
ax = y_train.plot(**plot_params)
ax = y_test.plot(**plot_params)
ax = y_pred.plot(ax=ax)
_ = y_fore.plot(ax=ax, color='C3')
要想使得预测更加精准,就需要找出更加具有预测价值的指标。
ax = flu_trends.plot(
y=["FluCough", "FluVisits"],
secondary_y="FluCough",
)
search_terms = ["FluContagious", "FluCough", "FluFever", "InfluenzaA", "TreatFlu", "IHaveTheFlu", "OverTheCounterFlu", "HowLongFlu"]
# Create three lags for each search term
X0 = make_lags(flu_trends[search_terms], lags=3)
X0.columns = [' '.join(col).strip() for col in X0.columns.values]
X1 = make_lags(flu_trends['FluVisits'], lags=4)
X = pd.concat([X0, X1], axis=1).fillna(0.0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=60, shuffle=False)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = pd.Series(model.predict(X_train), index=y_train.index)
y_fore = pd.Series(model.predict(X_test), index=y_test.index)
ax = y_test.plot(**plot_params)
_ = y_fore.plot(ax=ax, color='C3')
5 混合模型(Hybrid Models)
线性回归适合描述线性的趋势,XGBoost适合描述关系。真实的应用场景中往往需要结合这些模型的优点,一般会用一个简单的模型(线性回归)结合一个复杂的模型(GBDT/深度学习)。 一个时间序列的数据集通常包含以下这些关系:
series = trend + seasons + cycles + error
首先,我们可能会尝试对于不同部分的数据做不同的处理,然后把两个部分的数据结合在一起。
model_1.fit(X_train_1, y_train)
y_pred_1 = model_1.predict(X_train)
model_2.fit(X_train_2, y_train - y_pred_1)
y_pred_2 = model_2.predict(X_train_2)
y_pred = y_pred_1 + y_pred_2
6 预测
7 附录
7.1 常用处理方法
7.1.1 to_period
import pandas as pd
pd.to_period('M')
'A': 年度(Year-End)'Q': 季度(Quarter-End)'M': 月份(Month-End)'W': 周(Week-End),这里的周是指星期日为一周的最后一天'D': 日期(Day)'B': 工作日(Business-Day)'H': 小时(Hour-End)'T': 分钟(Minute-End)'S': 秒(Second-End)
7.2 知识点
7.2.1 傅立叶级数(CalendarFourier)
以下是傅立叶级数的用法
from statsmodels.tsa.deterministic import CalendarFourier
# 假设 seasonal_period 是季节性周期,例如,一年有12个月
seasonal_period = 12
k = 12 # 傅里叶级数的阶数
# 创建 CalendarFourier 对象
cf = CalendarFourier(index=df.index, period=seasonal_period, K=k)
# 生成傅里叶级数特征
cf_series = cf.in_sample()