• Python数据分析与机器学习46-时间序列案例


    一. 消费者信心指数数据分析

    1.1 数据源介绍

    sentiment.csv
    美国消费者信心指数
    image.png

    1.2 时间序列图及差分图

    代码:

    from __future__ import absolute_import, division, print_function
    # http://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost
    import sys
    import os
    
    import pandas as pd
    import numpy as np
    
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    import statsmodels.tsa.api as smt
    
    import matplotlib.pylab as plt
    import seaborn as sns
    
    # 一些配置项
    pd.set_option('display.float_format', lambda x: '%.5f' % x) # pandas
    np.set_printoptions(precision=5, suppress=True) # numpy
    
    pd.set_option('display.max_columns', 100)
    pd.set_option('display.max_rows', 100)
    
    sns.set(style='ticks', context='poster')
    
    # 美国消费者信心指数
    Sentiment = 'E:/file/sentiment.csv'
    Sentiment = pd.read_csv(Sentiment, index_col=0, parse_dates=[0])
    
    sentiment_short = Sentiment.loc['2005':'2016']
    
    sentiment_short.plot(figsize=(12,8))
    plt.legend(bbox_to_anchor=(1.25, 0.5))
    plt.title("Consumer Sentiment")
    sns.despine()
    
    sentiment_short['diff_1'] = sentiment_short['UMCSENT'].diff(1)
    sentiment_short['diff_2'] = sentiment_short['diff_1'].diff(1)
    sentiment_short.plot(subplots=True, figsize=(18, 12))
    
    del sentiment_short['diff_2']
    del sentiment_short['diff_1']
    
    fig = plt.figure(figsize=(12,8))
    
    ax1 = fig.add_subplot(211)
    fig = sm.graphics.tsa.plot_acf(sentiment_short, lags=20,ax=ax1)
    ax1.xaxis.set_ticks_position('bottom')
    fig.tight_layout();
    
    ax2 = fig.add_subplot(212)
    fig = sm.graphics.tsa.plot_pacf(sentiment_short, lags=20, ax=ax2)
    ax2.xaxis.set_ticks_position('bottom')
    fig.tight_layout();
    
    
    # 散点图也可以表示
    
    lags=9
    
    ncols=3
    nrows=int(np.ceil(lags/ncols))
    
    fig, axes = plt.subplots(ncols=ncols, nrows=nrows, figsize=(4*ncols, 4*nrows))
    
    for ax, lag in zip(axes.flat, np.arange(1,lags+1, 1)):
        lag_str = 't-{}'.format(lag)
        X = (pd.concat([sentiment_short, sentiment_short.shift(-lag)], axis=1,
                       keys=['y'] + [lag_str]).dropna())
    
        X.plot(ax=ax, kind='scatter', y='y', x=lag_str);
        corr = X.corr().iloc[:,:].values[0][1]
        ax.set_ylabel('Original')
        ax.set_title('Lag: {} (corr={:.2f})'.format(lag_str, corr));
        ax.set_aspect('equal');
        sns.despine();
    
    fig.tight_layout();
    
    
    # 更直观一些
    def tsplot(y, lags=None, title='', figsize=(14, 8)):
        fig = plt.figure(figsize=figsize)
        layout = (2, 2)
        ts_ax = plt.subplot2grid(layout, (0, 0))
        hist_ax = plt.subplot2grid(layout, (0, 1))
        acf_ax = plt.subplot2grid(layout, (1, 0))
        pacf_ax = plt.subplot2grid(layout, (1, 1))
    
        y.plot(ax=ts_ax)
        ts_ax.set_title(title)
        y.plot(ax=hist_ax, kind='hist', bins=25)
        hist_ax.set_title('Histogram')
        smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
        smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
        [ax.set_xlim(0) for ax in [acf_ax, pacf_ax]]
        sns.despine()
        plt.tight_layout()
        return ts_ax, acf_ax, pacf_ax
    
    tsplot(sentiment_short, title='Consumer Sentiment', lags=36);
    
    plt.show()
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72
    • 73
    • 74
    • 75
    • 76
    • 77
    • 78
    • 79
    • 80
    • 81
    • 82
    • 83
    • 84
    • 85
    • 86
    • 87
    • 88
    • 89
    • 90
    • 91
    • 92
    • 93
    • 94
    • 95
    • 96
    • 97
    • 98
    • 99
    • 100
    • 101
    • 102

    测试记录:
    image.png
    image.png

    1.3 AR图

    代码:

    from __future__ import absolute_import, division, print_function
    # http://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost
    import sys
    import os
    import pandas as pd
    import numpy as np
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    import statsmodels.tsa.api as smt
    import matplotlib.pylab as plt
    import seaborn as sns
    
    # 一些配置项
    pd.set_option('display.float_format', lambda x: '%.5f' % x) # pandas
    np.set_printoptions(precision=5, suppress=True) # numpy
    
    pd.set_option('display.max_columns', 100)
    pd.set_option('display.max_rows', 100)
    
    sns.set(style='ticks', context='poster')
    
    # 读取数据
    Sentiment = 'E:/file/sentiment.csv'
    Sentiment = pd.read_csv(Sentiment, index_col=0, parse_dates=[0])
    
    sentiment_short = Sentiment.loc['2005':'2016']
    
    # 自相关图
    fig = plt.figure(figsize=(12,8))
    
    ax1 = fig.add_subplot(211)
    fig = sm.graphics.tsa.plot_acf(sentiment_short, lags=20,ax=ax1)
    ax1.xaxis.set_ticks_position('bottom')
    fig.tight_layout();
    
    ax2 = fig.add_subplot(212)
    fig = sm.graphics.tsa.plot_pacf(sentiment_short, lags=20, ax=ax2)
    ax2.xaxis.set_ticks_position('bottom')
    fig.tight_layout();
    
    plt.show()
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41

    测试记录:
    image.png

    1.4 散点图

    代码:

    from __future__ import absolute_import, division, print_function
    # http://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost
    import sys
    import os
    import pandas as pd
    import numpy as np
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    import statsmodels.tsa.api as smt
    import matplotlib.pylab as plt
    import seaborn as sns
    
    # 一些配置项
    pd.set_option('display.float_format', lambda x: '%.5f' % x) # pandas
    np.set_printoptions(precision=5, suppress=True) # numpy
    
    pd.set_option('display.max_columns', 100)
    pd.set_option('display.max_rows', 100)
    
    sns.set(style='ticks', context='poster')
    
    # 读取数据
    Sentiment = 'E:/file/sentiment.csv'
    Sentiment = pd.read_csv(Sentiment, index_col=0, parse_dates=[0])
    
    sentiment_short = Sentiment.loc['2005':'2016']
    
    # 散点图也可以表示
    lags=9
    
    ncols=3
    nrows=int(np.ceil(lags/ncols))
    
    fig, axes = plt.subplots(ncols=ncols, nrows=nrows, figsize=(4*ncols, 4*nrows))
    
    for ax, lag in zip(axes.flat, np.arange(1,lags+1, 1)):
        lag_str = 't-{}'.format(lag)
        X = (pd.concat([sentiment_short, sentiment_short.shift(-lag)], axis=1,
                       keys=['y'] + [lag_str]).dropna())
    
        X.plot(ax=ax, kind='scatter', y='y', x=lag_str);
        corr = X.corr().iloc[:,:].values[0][1]
        ax.set_ylabel('Original')
        ax.set_title('Lag: {} (corr={:.2f})'.format(lag_str, corr));
        ax.set_aspect('equal');
        sns.despine();
    
    fig.tight_layout();
    
    plt.show()
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50

    测试记录:
    image.png

    1.5 更直观的展示

    代码:

    from __future__ import absolute_import, division, print_function
    # http://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost
    import sys
    import os
    import pandas as pd
    import numpy as np
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    import statsmodels.tsa.api as smt
    import matplotlib.pylab as plt
    import seaborn as sns
    
    # 一些配置项
    pd.set_option('display.float_format', lambda x: '%.5f' % x) # pandas
    np.set_printoptions(precision=5, suppress=True) # numpy
    
    pd.set_option('display.max_columns', 100)
    pd.set_option('display.max_rows', 100)
    
    sns.set(style='ticks', context='poster')
    
    # 读取数据
    Sentiment = 'E:/file/sentiment.csv'
    Sentiment = pd.read_csv(Sentiment, index_col=0, parse_dates=[0])
    
    sentiment_short = Sentiment.loc['2005':'2016']
    
    # 更直观一些
    def tsplot(y, lags=None, title='', figsize=(14, 8)):
        fig = plt.figure(figsize=figsize)
        layout = (2, 2)
        ts_ax = plt.subplot2grid(layout, (0, 0))
        hist_ax = plt.subplot2grid(layout, (0, 1))
        acf_ax = plt.subplot2grid(layout, (1, 0))
        pacf_ax = plt.subplot2grid(layout, (1, 1))
    
        y.plot(ax=ts_ax)
        ts_ax.set_title(title)
        y.plot(ax=hist_ax, kind='hist', bins=25)
        hist_ax.set_title('Histogram')
        smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
        smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
        [ax.set_xlim(0) for ax in [acf_ax, pacf_ax]]
        sns.despine()
        plt.tight_layout()
        return ts_ax, acf_ax, pacf_ax
    
    tsplot(sentiment_short, title='Consumer Sentiment', lags=36);
    
    plt.show()
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50

    测试记录:
    image.png

    二. 参数选择

    2.1 数据源介绍

    series1.csv
    一个标准的时间序列数据
    image.png

    2.2 直观的图形化展示

    代码:

    from __future__ import absolute_import, division, print_function
    # http://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost
    import sys
    import os
    import pandas as pd
    import numpy as np
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    import statsmodels.tsa.api as smt
    
    import matplotlib.pylab as plt
    import seaborn as sns
    
    # 一些配置项
    pd.set_option('display.float_format', lambda x: '%.5f' % x) # pandas
    np.set_printoptions(precision=5, suppress=True) # numpy
    
    pd.set_option('display.max_columns', 100)
    pd.set_option('display.max_rows', 100)
    
    sns.set(style='ticks', context='poster')
    
    # 美国消费者信心指数
    filename_ts = 'E:/file/series1.csv'
    ts_df = pd.read_csv(filename_ts, index_col=0, parse_dates=[0])
    
    n_sample = ts_df.shape[0]
    
    # 划分测试集和训练集
    n_train=int(0.95*n_sample)+1
    n_forecast=n_sample-n_train
    #ts_df
    ts_train = ts_df.iloc[:n_train]['value']
    ts_test = ts_df.iloc[n_train:]['value']
    #print(ts_train.shape)
    #print(ts_test.shape)
    #print("Training Series:", "\n", ts_train.tail(), "\n")
    #print("Testing Series:", "\n", ts_test.head())
    
    
    def tsplot(y, lags=None, title='', figsize=(14, 8)):
        fig = plt.figure(figsize=figsize)
        layout = (2, 2)
        ts_ax = plt.subplot2grid(layout, (0, 0))
        hist_ax = plt.subplot2grid(layout, (0, 1))
        acf_ax = plt.subplot2grid(layout, (1, 0))
        pacf_ax = plt.subplot2grid(layout, (1, 1))
    
        y.plot(ax=ts_ax)
        ts_ax.set_title(title)
        y.plot(ax=hist_ax, kind='hist', bins=25)
        hist_ax.set_title('Histogram')
        smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
        smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
        [ax.set_xlim(0) for ax in [acf_ax, pacf_ax]]
        sns.despine()
        fig.tight_layout()
        return ts_ax, acf_ax, pacf_ax
    
    tsplot(ts_train, title='A Given Training Series', lags=20);
    
    plt.show()
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62

    测试记录:
    image.png

    2.3 热力图

    代码:

    from __future__ import absolute_import, division, print_function
    # http://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost
    import sys
    import os
    import pandas as pd
    import numpy as np
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    import statsmodels.tsa.api as smt
    import matplotlib.pylab as plt
    import seaborn as sns
    import itertools
    
    # 一些配置项
    pd.set_option('display.float_format', lambda x: '%.5f' % x) # pandas
    np.set_printoptions(precision=5, suppress=True) # numpy
    
    pd.set_option('display.max_columns', 100)
    pd.set_option('display.max_rows', 100)
    
    sns.set(style='ticks', context='poster')
    
    # 美国消费者信心指数
    filename_ts = 'E:/file/series1.csv'
    ts_df = pd.read_csv(filename_ts, index_col=0, parse_dates=[0])
    
    n_sample = ts_df.shape[0]
    
    # 划分训练集和测试集
    n_train=int(0.95*n_sample)+1
    n_forecast=n_sample-n_train
    #ts_df
    ts_train = ts_df.iloc[:n_train]['value']
    ts_test = ts_df.iloc[n_train:]['value']
    
    # 训练模型
    arima200 = sm.tsa.SARIMAX(ts_train, order=(2,0,0))
    model_results = arima200.fit()
    
    # 选择参数
    p_min = 0
    d_min = 0
    q_min = 0
    p_max = 4
    d_max = 0
    q_max = 4
    
    # Initialize a DataFrame to store the results
    results_bic = pd.DataFrame(index=['AR{}'.format(i) for i in range(p_min, p_max + 1)],
                               columns=['MA{}'.format(i) for i in range(q_min, q_max + 1)])
    
    for p, d, q in itertools.product(range(p_min, p_max + 1),
                                     range(d_min, d_max + 1),
                                     range(q_min, q_max + 1)):
        if p == 0 and d == 0 and q == 0:
            results_bic.loc['AR{}'.format(p), 'MA{}'.format(q)] = np.nan
            continue
    
        try:
            model = sm.tsa.SARIMAX(ts_train, order=(p, d, q),
                                   # enforce_stationarity=False,
                                   # enforce_invertibility=False,
                                   )
            results = model.fit()
            results_bic.loc['AR{}'.format(p), 'MA{}'.format(q)] = results.bic
        except:
            continue
    results_bic = results_bic[results_bic.columns].astype(float)
    
    fig, ax = plt.subplots(figsize=(10, 8))
    ax = sns.heatmap(results_bic,
                     mask=results_bic.isnull(),
                     ax=ax,
                     annot=True,
                     fmt='.2f',
                     );
    ax.set_title('BIC');
    
    plt.show()
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72
    • 73
    • 74
    • 75
    • 76
    • 77
    • 78
    • 79

    测试记录:
    image.png

    2.4 AIC/BIC以及残差分析 正态分布 QQ图线性

    代码:

    from __future__ import absolute_import, division, print_function
    # http://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost
    import sys
    import os
    import pandas as pd
    import numpy as np
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    import statsmodels.tsa.api as smt
    import matplotlib.pylab as plt
    import seaborn as sns
    import itertools
    
    # 一些配置项
    pd.set_option('display.float_format', lambda x: '%.5f' % x) # pandas
    np.set_printoptions(precision=5, suppress=True) # numpy
    
    pd.set_option('display.max_columns', 100)
    pd.set_option('display.max_rows', 100)
    
    sns.set(style='ticks', context='poster')
    
    # 美国消费者信心指数
    filename_ts = 'E:/file/series1.csv'
    ts_df = pd.read_csv(filename_ts, index_col=0, parse_dates=[0])
    
    n_sample = ts_df.shape[0]
    
    # 划分训练集和测试集
    n_train=int(0.95*n_sample)+1
    n_forecast=n_sample-n_train
    #ts_df
    ts_train = ts_df.iloc[:n_train]['value']
    ts_test = ts_df.iloc[n_train:]['value']
    
    # 训练模型
    arima200 = sm.tsa.SARIMAX(ts_train, order=(2,0,0))
    model_results = arima200.fit()
    
    # AIC 和 BIC
    #print(help(sm.tsa.arma_order_select_ic))
    train_results = sm.tsa.arma_order_select_ic(ts_train, ic=['aic', 'bic'], trend='n', max_ar=4, max_ma=4)
    print('AIC', train_results.aic_min_order)
    print('BIC', train_results.bic_min_order)
    
    #残差分析 正态分布 QQ图线性
    model_results.plot_diagnostics(figsize=(16, 12));
    
    plt.show()
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49

    测试记录:

    AIC (3, 3)
    BIC (1, 1)
    
    • 1
    • 2

    image.png

    参考:

    1. https://study.163.com/course/introduction.htm?courseId=1003590004#/courseDetail?tab=1
  • 相关阅读:
    CSS基础10-单行/多行文本溢出省略
    rqt_publisher报错/publisher_widget.py“, line 105, in _update_thread_run
    黑马Java热门面试题MySQL(五)
    【java进阶02:抽象类和接口】类与类之间的关系 抽象类与接口的区别
    Go 必知的10个命令
    解决SLF4J: Class path contains multiple SLF4J bindings.
    SpringBoot任务详解
    LeetCode240题:搜索二维矩阵II(python3)
    OGG-01224 Address already in use 问题
    实验5-2——网络yum源的配置
  • 原文地址:https://blog.csdn.net/u010520724/article/details/126153487