Linear Regression With Time Series
产生 Time 序列
df = tunnel.copy()
df['Time'] = np.arange(len(tunnel.index))
df.head()
使用LinearRegression模型
from sklearn.linear_model import LinearRegression
# Training data
X = df.loc[:, ['Time']] # features
y = df.loc[:, 'NumVehicles'] # target
# Train the model
model = LinearRegression()
model.fit(X, y)
# Store the fitted values as a time series with the same time index as
# the training data
y_pred = pd.Series(model.predict(X), index=X.index)
ax = y.plot(**plot_params)
ax = y_pred.plot(ax=ax, linewidth=3)
ax.set_title('Time Plot of Tunnel Traffic');
产生 Lag feature
df['Lag_1'] = df['NumVehicles'].shift(1)
df.head()
使用LinearRegression模型
from sklearn.linear_model import LinearRegression
X = df.loc[:, ['Lag_1']]
X.dropna(inplace=True) # drop missing values in the feature set
y = df.loc[:, 'NumVehicles'] # create the target
y, X = y.align(X, join='inner') # drop corresponding values in target
model = LinearRegression()
model.fit(X, y)
y_pred = pd.Series(model.predict(X), index=X.index)
ax = y.plot(**plot_params)
ax = y_pred.plot()
Trend 趋势
一个计算均值的方法:
moving_average = tunnel.rolling(
window=365, # 365-day window
center=True, # puts the average at the center of the window
min_periods=183, # choose about half the window size
).mean() # compute the mean (could also do median, std, min, max, ...)
ax = tunnel.plot(style=".", color="0.5")
moving_average.plot(
ax=ax, linewidth=3, title="Tunnel Traffic - 365-Day Moving Average", legend=False,
);
一个获取时间序列的方法:
from statsmodels.tsa.deterministic import DeterministicProcess
dp = DeterministicProcess(
index=tunnel.index, # dates from the training data
constant=True, # dummy feature for the bias (y_intercept)
order=1, # the time dummy (trend)
drop=True, # drop terms if necessary to avoid collinearity
)
# `in_sample` creates features for the dates given in the `index` argument
X = dp.in_sample()
X.head()
通过线性回归拟合,往往就是在拟合趋势(个人理解)
from sklearn.linear_model import LinearRegression
y = tunnel["NumVehicles"] # the target
# The intercept is the same as the `const` feature from
# DeterministicProcess. LinearRegression behaves badly with duplicated
# features, so we need to be sure to exclude it here.
model = LinearRegression(fit_intercept=False)
model.fit(X, y)
y_pred = pd.Series(model.predict(X), index=X.index)
ax = tunnel.plot(style=".", color="0.5", title="Tunnel Traffic - Linear Trend")
_ = y_pred.plot(ax=ax, linewidth=3, label="Trend")
X = dp.out_of_sample(steps=30)
y_fore = pd.Series(model.predict(X), index=X.index)
y_fore.head()
Seasonality 季节性序列
给 X 添加季节性faeture
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
# A 表示 每年, order 表示 几个周期,order = 10 表示按照每年 1、2、3、、、10个周期添加数据,正弦余弦对应的函数值
fourier = CalendarFourier(freq="A", order=10) # 10 sin/cos pairs for "A"nnual seasonality
# seasonal=True 表示添加季节性 feature,因为数据的 freq 是 Day,所以会按照7天为一个周期,如果数据的 freq 是 Month,就会按照12个数据(月)为一个周期。
dp = DeterministicProcess(
index=tunnel.index,
constant=True, # dummy feature for bias (y-intercept)
order=1, # trend (order 1 means linear)
seasonal=True, # weekly seasonality (indicators)
additional_terms=[fourier], # annual seasonality (fourier)
drop=True, # drop terms to avoid collinearity
)
X = dp.in_sample() # create features for dates in tunnel.index
X.head(10)
这里还是用线性模型拟合,跟通常一样,没有差,主要区别在于给数据添加周期和傅立叶标签
y = tunnel["NumVehicles"]
model = LinearRegression(fit_intercept=False)
_ = model.fit(X, y)
y_pred = pd.Series(model.predict(X), index=y.index)
X_fore = dp.out_of_sample(steps=90)
y_fore = pd.Series(model.predict(X_fore), index=X_fore.index)
ax = y.plot(color='0.25', style='.', title="Tunnel Traffic - Seasonal Forecast")
ax = y_pred.plot(ax=ax, label="Seasonal")
ax = y_fore.plot(ax=ax, label="Seasonal Forecast", color='C3')
_ = ax.legend()
Time Series as Features
就是把前面的 target 也作为 fetaure
核心代码
import pandas as pd
# Federal Reserve dataset: https://www.kaggle.com/federalreserve/interest-rates
reserve = pd.read_csv(
"../input/ts-course-data/reserve.csv",
parse_dates={'Date': ['Year', 'Month', 'Day']},
index_col='Date',
)
y = reserve.loc[:, 'Unemployment Rate'].dropna().to_period('M')
df = pd.DataFrame({
'y': y,
'y_lag_1': y.shift(1),
'y_lag_2': y.shift(2),
})
df.head()