分析目的:
根据时间序列反映出来发展过程和趋势,建立ARIMA模型,预测下一段时间可能达到的水平。
字段说明
date:时间
co2: 二氧化碳
1.导入数据
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
data = sm.datasets.co2.load_pandas()
Mauna_Loa_CO2 =pd.read_csv('./Mauna_Loa_CO2 .csv')
Mauna_Loa_CO2.head()
date | co2 | |
0 | 1958/3/29 | 316.1 |
1 | 1958/4/5 | 317.3 |
2 | 1958/4/12 | 317.6 |
3 | 1958/4/19 | 317.5 |
4 | 1958/4/26 | 316.4 |
2.数据探索
Mauna_Loa_CO2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2284 entries, 0 to 2283
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 date 2284 non-null object
1 co2 2225 non-null float64
dtypes: float64(1), object(1)
memory usage: 35.8+ KB
Mauna_Loa_CO2.isnull().any()
date False
co2 True
dtype: bool
Mauna_Loa_CO2_Miss=Mauna_Loa_CO2.fillna(method='ffill')
Mauna_Loa_CO2_Miss.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2284 entries, 0 to 2283
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 date 2284 non-null object
1 co2 2284 non-null float64
dtypes: float64(1), object(1)
memory usage: 35.8+ KB
Mauna_Loa_CO2_Miss.isnull().any()
date False
co2 False
dtype: bool
import pandas as pd
Mauna_Loa_CO2_Miss['date']=pd.to_datetime(Mauna_Loa_CO2_Miss['date'])
Mauna_Loa_CO2_Miss.set_index('date',inplace=True)
Mauna_Loa_CO2_Miss_Mon=Mauna_Loa_CO2_Miss['co2'].resample('MS').mean()
Mauna_Loa_CO2_Miss_Mon2=pd.DataFrame(Mauna_Loa_CO2_Miss_Mon)
Mauna_Loa_CO2_Miss_Mon2.head()
co2 | date |
1958-03-01 | 316.100 |
1958-04-01 | 317.200 |
1958-05-01 | 317.420 |
1958-06-01 | 317.900 |
1958-07-01 | 315.625 |
import matplotlib.pyplot as plt
Mauna_Loa_CO2_Miss.plot(figsize=(15, 6))
plt.title("CO2 Trend")
plt.xlabel("Month")
plt.ylabel("CO2")
plt.show()
from statsmodels.tsa.seasonal import seasonal_decompose
from pylab import rcParams
rcParams['figure.figsize'] = 11, 9
decomposition = sm.tsa.seasonal_decompose(Mauna_Loa_CO2_Miss_Mon2, model='additive')
fig = decomposition.plot()
plt.show()
3.序列平稳性检验
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt
def test_stationarity(timeseries):
#计算均值与方差
rolmean = timeseries.rolling(window=12).mean()
rolstd = timeseries.rolling(window=12).std()
#绘制图形:
fig = plt.figure(figsize=(12, 8))
orig = plt.plot(timeseries, color='blue',label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label = 'Rolling Std')
plt.legend(loc='best')# 使用自动最佳的图例显示位置
plt.title('Rolling Mean & Standard Deviation')
plt.xlabel("Month") #添加X轴说明
plt.ylabel("CO2") #添加Y轴说明
plt.show()#观察是否平稳
print('ADF检验结果:')
#进行ADF检验
print('Results of Dickey-Fuller Test:')
# 使用减小AIC的办法估算ADF测试所需的滞后数
dftest = adfuller(timeseries, autolag='AIC')
# 将ADF测试结果、显著性概率、所用的滞后数和所用的观测数打印出来
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
print(dfoutput)
test_stationarity(Mauna_Loa_CO2_Miss_Mon2['co2'])
ADF检验结果:
Results of Dickey-Fuller Test:
Test Statistic 2.140106
p-value 0.998829
#Lags Used 15.000000
Number of Observations Used 510.000000
Critical Value (1%) -3.443237
Critical Value (5%) -2.867224
Critical Value (10%) -2.569797
dtype: float64
4.序列变换
Mauna_Loa_CO2_Miss_Mon2['first_difference'] =Mauna_Loa_CO2_Miss_Mon2['co2'].diff(1)
test_stationarity(Mauna_Loa_CO2_Miss_Mon2['first_difference'].dropna(inplace=False))
ADF检验结果:
Results of Dickey-Fuller Test:
Test Statistic -4.824703
p-value 0.000049
#Lags Used 19.000000
Number of Observations Used 505.000000
Critical Value (1%) -3.443366
Critical Value (5%) -2.867280
Critical Value (10%) -2.569827
dtype: float64
5.白噪音检验
Mauna_Loa_CO2_Miss_Mon2['seasonal_difference'] = Mauna_Loa_CO2_Miss_Mon2['first_difference'].diff(12)
test_stationarity(Mauna_Loa_CO2_Miss_Mon2['seasonal_difference'].dropna(inplace=False))
from statsmodels.stats.diagnostic import acorr_ljungbox
print(acorr_ljungbox(Mauna_Loa_CO2_Miss_Mon2['seasonal_difference'].iloc[13:], lags=1))
6.确定参数
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(Mauna_Loa_CO2_Miss_Mon2['seasonal_difference'].iloc[13:], lags=40, ax=ax1)
#从13开始是因为做季节性差分时window是12
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(Mauna_Loa_CO2_Miss_Mon2['seasonal_difference'].iloc[13:], lags=40, ax=ax2)
import warnings
import itertools
import numpy as np
import pandas as pd
p = d = q = range(0, 2)
pdq = list(itertools.product(p, d, q))
seasonal_pdq = [(x[0], x[1], x[2], 12)
for x in list(itertools.product(p, d, q))]
warnings.filterwarnings("ignore")
# specify to ignore warning messages
AIC_Value = []
for param in pdq:
for param_seasonal in seasonal_pdq:
try:
mod = sm.tsa.statespace.SARIMAX(Mauna_Loa_CO2_Miss_Mon2['co2'],
order=param,
seasonal_order=param_seasonal,
enforce_stationarity=False,
enforce_invertibility=False)
results = mod.fit()
AIC_Value.append(results.aic)
print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic))
except:
continue
7.模型结果
mod = sm.tsa.statespace.SARIMAX(Mauna_Loa_CO2_Miss_Mon2['co2'],
order=(1, 1, 1),
seasonal_order=(1, 1, 1, 12),
enforce_stationarity=False,
enforce_invertibility=False)
results = mod.fit()
print(results.summary().tables[1])
results.plot_diagnostics(figsize=(15, 12))
plt.show()
pred = results.get_prediction(start=pd.to_datetime('1995-01-01'), dynamic=False)
pred_ci = pred.conf_int()
ax = Mauna_Loa_CO2_Miss_Mon2['1990':].plot(label='observed')
ax.set_ylim(350, 380)
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7)
ax.fill_between(pred_ci.index,
pred_ci.iloc[:, 0],
pred_ci.iloc[:, 1], color='r', alpha=.9)
ax.set_xlabel('Date')
ax.set_ylabel('CO2 Levels')
plt.legend()
plt.show()
8.总结
残差图的分布基本为标准正态分布,拟合残差基本为白噪音,其拟合效果比较好。