• 机器学习-数值特征


    离散值处理

    import pandas as pd
    import numpy as np
    
    • 1
    • 2
    # Load the video-game sales dataset (Latin-1 encoded) and preview rows 1-6
    # of the categorical columns we will encode below.
    vg_df = pd.read_csv('datasets/vgsales.csv', encoding = "ISO-8859-1")
    vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].iloc[1:7]
    
    • 1
    • 2
    NamePlatformYearGenrePublisher
    1Super Mario Bros.NES1985.0PlatformNintendo
    2Mario Kart WiiWii2008.0RacingNintendo
    3Wii Sports ResortWii2009.0SportsNintendo
    4Pokemon Red/Pokemon BlueGB1996.0Role-PlayingNintendo
    5TetrisGB1989.0PuzzleNintendo
    6New Super Mario Bros.DS2006.0PlatformNintendo
    # Distinct genre values (np.unique returns them sorted) — the categorical
    # vocabulary that the encoders below will map to integers.
    genres = np.unique(vg_df['Genre'])
    genres
    
    • 1
    • 2
    array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
           'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
           'Strategy'], dtype=object)
    
    • 1
    • 2
    • 3

    LabelEncoder

    from sklearn.preprocessing import LabelEncoder
    
    # Fit a LabelEncoder on Genre: each distinct genre gets an integer code,
    # assigned in sorted order of the class names (0='Action', 1='Adventure', ...).
    gle = LabelEncoder()
    genre_labels = gle.fit_transform(vg_df['Genre'])
    # Build an index -> class-name mapping from the fitted encoder for inspection.
    genre_mappings = {index: label for index, label in enumerate(gle.classes_)}
    genre_mappings
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    {0: 'Action',
     1: 'Adventure',
     2: 'Fighting',
     3: 'Misc',
     4: 'Platform',
     5: 'Puzzle',
     6: 'Racing',
     7: 'Role-Playing',
     8: 'Shooter',
     9: 'Simulation',
     10: 'Sports',
     11: 'Strategy'}
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    # Attach the integer genre codes as a new column and preview rows 1-6.
    vg_df['GenreLabel'] = genre_labels
    vg_df[['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']].iloc[1:7]
    
    • 1
    • 2
    NamePlatformYearGenreGenreLabel
    1Super Mario Bros.NES1985.0Platform4
    2Mario Kart WiiWii2008.0Racing6
    3Wii Sports ResortWii2009.0Sports10
    4Pokemon Red/Pokemon BlueGB1996.0Role-Playing7
    5TetrisGB1989.0Puzzle5
    6New Super Mario Bros.DS2006.0Platform4

    Map

    # Load the Pokemon dataset and shuffle all rows (frac=1) with a fixed seed
    # for reproducibility; reset the index after shuffling.
    poke_df = pd.read_csv('datasets/Pokemon.csv', encoding='utf-8')
    poke_df = poke_df.sample(random_state=1, frac=1).reset_index(drop=True)
    
    # Inspect the distinct generation labels.
    np.unique(poke_df['Generation'])
    
    • 1
    • 2
    • 3
    • 4
    array(['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6'], dtype=object)
    
    • 1
    # Ordinal encoding via an explicit hand-written mapping: generations have a
    # natural order, so map 'Gen 1'..'Gen 6' to 1..6 with Series.map.
    gen_ord_map = {'Gen 1': 1, 'Gen 2': 2, 'Gen 3': 3, 
                   'Gen 4': 4, 'Gen 5': 5, 'Gen 6': 6}
    
    poke_df['GenerationLabel'] = poke_df['Generation'].map(gen_ord_map)
    poke_df[['Name', 'Generation', 'GenerationLabel']].iloc[4:10]
    
    • 1
    • 2
    • 3
    • 4
    • 5
    NameGenerationGenerationLabel
    4OctilleryGen 22
    5HelioptileGen 66
    6DialgaGen 44
    7DeoxysDefense FormeGen 33
    8RapidashGen 11
    9SwannaGen 55

    One-hot Encoding

    # Preview the two categorical columns we are about to one-hot encode.
    poke_df[['Name', 'Generation', 'Legendary']].iloc[4:10]
    
    • 1
    NameGenerationLegendary
    4OctilleryGen 2False
    5HelioptileGen 6False
    6DialgaGen 4True
    7DeoxysDefense FormeGen 3True
    8RapidashGen 1False
    9SwannaGen 5False
    from sklearn.preprocessing import OneHotEncoder, LabelEncoder
    
    # transform and map pokemon generations
    gen_le = LabelEncoder()
    gen_labels = gen_le.fit_transform(poke_df['Generation'])
    poke_df['Gen_Label'] = gen_labels
    
    # transform and map pokemon legendary status
    leg_le = LabelEncoder()
    leg_labels = leg_le.fit_transform(poke_df['Legendary'])
    poke_df['Lgnd_Label'] = leg_labels
    
    # Keep a small view with both the original categories and their integer codes.
    poke_df_sub = poke_df[['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']]
    poke_df_sub.iloc[4:10]
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    NameGenerationGen_LabelLegendaryLgnd_Label
    4OctilleryGen 21False0
    5HelioptileGen 65False0
    6DialgaGen 43True1
    7DeoxysDefense FormeGen 32True1
    8RapidashGen 10False0
    9SwannaGen 54False0
    # encode generation labels using one-hot encoding scheme
    gen_ohe = OneHotEncoder()
    # fit_transform returns a sparse matrix; toarray() densifies it.
    gen_feature_arr = gen_ohe.fit_transform(poke_df[['Gen_Label']]).toarray()
    # Column names come from the LabelEncoder's classes (sorted generation names),
    # which line up with the integer codes the OHE columns were built from.
    gen_feature_labels = list(gen_le.classes_)
    print (gen_feature_labels)
    gen_features = pd.DataFrame(gen_feature_arr, columns=gen_feature_labels)
    
    # encode legendary status labels using one-hot encoding scheme
    leg_ohe = OneHotEncoder()
    leg_feature_arr = leg_ohe.fit_transform(poke_df[['Lgnd_Label']]).toarray()
    leg_feature_labels = ['Legendary_'+str(cls_label) for cls_label in leg_le.classes_]
    print (leg_feature_labels)
    leg_features = pd.DataFrame(leg_feature_arr, columns=leg_feature_labels)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    ['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6']
    ['Legendary_False', 'Legendary_True']
    
    • 1
    • 2
    # Concatenate the original view with both one-hot blocks side by side
    # (row indices align because poke_df was reset_index'ed after shuffling).
    poke_df_ohe = pd.concat([poke_df_sub, gen_features, leg_features], axis=1)
    # sum(..., []) flattens the list of column-name lists into one flat list.
    columns = sum([['Name', 'Generation', 'Gen_Label'],gen_feature_labels,
                  ['Legendary', 'Lgnd_Label'],leg_feature_labels], [])
    poke_df_ohe[columns].iloc[4:10]
    
    • 1
    • 2
    • 3
    • 4
    NameGenerationGen_LabelGen 1Gen 2Gen 3Gen 4Gen 5Gen 6LegendaryLgnd_LabelLegendary_FalseLegendary_True
    4OctilleryGen 210.01.00.00.00.00.0False01.00.0
    5HelioptileGen 650.00.00.00.00.01.0False01.00.0
    6DialgaGen 430.00.00.01.00.00.0True10.01.0
    7DeoxysDefense FormeGen 320.00.01.00.00.00.0True10.01.0
    8RapidashGen 101.00.00.00.00.00.0False01.00.0
    9SwannaGen 540.00.00.00.01.00.0False01.00.0

    Get Dummy

    # pd.get_dummies with drop_first=True gives dummy coding: k-1 columns for
    # k categories (the dropped first category is the implicit baseline).
    gen_dummy_features = pd.get_dummies(poke_df['Generation'], drop_first=True)
    pd.concat([poke_df[['Name', 'Generation']], gen_dummy_features], axis=1).iloc[4:10]
    
    • 1
    • 2
    NameGenerationGen 2Gen 3Gen 4Gen 5Gen 6
    4OctilleryGen 210000
    5HelioptileGen 600001
    6DialgaGen 400100
    7DeoxysDefense FormeGen 301000
    8RapidashGen 100000
    9SwannaGen 500010
    # Without drop_first this is plain one-hot encoding: one column per category.
    gen_onehot_features = pd.get_dummies(poke_df['Generation'])
    pd.concat([poke_df[['Name', 'Generation']], gen_onehot_features], axis=1).iloc[4:10]
    
    • 1
    • 2
    NameGenerationGen 1Gen 2Gen 3Gen 4Gen 5Gen 6
    4OctilleryGen 2010000
    5HelioptileGen 6000001
    6DialgaGen 4000100
    7DeoxysDefense FormeGen 3001000
    8RapidashGen 1100000
    9SwannaGen 5000010
    import pandas as pd
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    import numpy as np
    import scipy.stats as spstats
    
    # Notebook plotting setup: inline figures, 'classic' style, transparent
    # figure background (alpha 0), 6x4-inch figures at 100 dpi.
    %matplotlib inline
    mpl.style.reload_library()
    mpl.style.use('classic')
    mpl.rcParams['figure.facecolor'] = (1, 1, 1, 0)
    mpl.rcParams['figure.figsize'] = [6.0, 4.0]
    mpl.rcParams['figure.dpi'] = 100
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    # Reload the Pokemon dataset (unshuffled this time) for the numeric examples.
    poke_df = pd.read_csv('datasets/Pokemon.csv', encoding='utf-8')
    poke_df.head()
    
    • 1
    • 2
    #NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
    01BulbasaurGrassPoison318454949656545Gen 1False
    12IvysaurGrassPoison405606263808060Gen 1False
    23VenusaurGrassPoison52580828310010080Gen 1False
    33VenusaurMega VenusaurGrassPoison6258010012312212080Gen 1False
    44CharmanderFireNaN309395243605065Gen 1False
    # Raw numeric stat columns used as-is (no encoding needed).
    poke_df[['HP', 'Attack', 'Defense']].head()
    
    • 1
    HPAttackDefense
    0454949
    1606263
    2808283
    380100123
    4395243
    # Summary statistics (count/mean/std/quartiles) for the numeric stats.
    poke_df[['HP', 'Attack', 'Defense']].describe()
    
    • 1
    HPAttackDefense
    count800.000000800.000000800.000000
    mean69.25875079.00125073.842500
    std25.53466932.45736631.183501
    min1.0000005.0000005.000000
    25%50.00000055.00000050.000000
    50%65.00000075.00000070.000000
    75%80.000000100.00000090.000000
    max255.000000190.000000230.000000
    # Song play counts: one row per (user, song) pair with a listen_count.
    popsong_df = pd.read_csv('datasets/song_views.csv', encoding='utf-8')
    popsong_df.head(10)
    
    • 1
    • 2
    user_idsong_idtitlelisten_count
    0b6b799f34a204bd928ea014c243ddad6d0be4f8fSOBONKR12A58A7A7E0You're The One2
    1b41ead730ac14f6b6717b9cf8859d5579f3f8d4dSOBONKR12A58A7A7E0You're The One0
    24c84359a164b161496d05282707cecbd50adbfc4SOBONKR12A58A7A7E0You're The One0
    3779b5908593756abb6ff7586177c966022668b06SOBONKR12A58A7A7E0You're The One0
    4dd88ea94f605a63d9fc37a214127e3f00e85e42dSOBONKR12A58A7A7E0You're The One0
    568f0359a2f1cedb0d15c98d88017281db79f9bc6SOBONKR12A58A7A7E0You're The One0
    6116a4c95d63623a967edf2f3456c90ebbf964e6fSOBONKR12A58A7A7E0You're The One17
    745544491ccfcdc0b0803c34f201a6287ed4e30f8SOBONKR12A58A7A7E0You're The One0
    8e701a24d9b6c59f5ac37ab28462ca82470e27cfbSOBONKR12A58A7A7E0You're The One68
    9edc8b7b1fd592a3b69c3d823a742e1a064abec95SOBONKR12A58A7A7E0You're The One0

    二值特征

    # Manual binarization: any positive listen_count becomes 1 ("watched").
    # np.array(...) copies the column, so popsong_df['listen_count'] is untouched.
    watched = np.array(popsong_df['listen_count']) 
    watched[watched >= 1] = 1
    popsong_df['watched'] = watched
    popsong_df.head(10)
    
    • 1
    • 2
    • 3
    • 4
    user_idsong_idtitlelisten_countwatched
    0b6b799f34a204bd928ea014c243ddad6d0be4f8fSOBONKR12A58A7A7E0You're The One21
    1b41ead730ac14f6b6717b9cf8859d5579f3f8d4dSOBONKR12A58A7A7E0You're The One00
    24c84359a164b161496d05282707cecbd50adbfc4SOBONKR12A58A7A7E0You're The One00
    3779b5908593756abb6ff7586177c966022668b06SOBONKR12A58A7A7E0You're The One00
    4dd88ea94f605a63d9fc37a214127e3f00e85e42dSOBONKR12A58A7A7E0You're The One00
    568f0359a2f1cedb0d15c98d88017281db79f9bc6SOBONKR12A58A7A7E0You're The One00
    6116a4c95d63623a967edf2f3456c90ebbf964e6fSOBONKR12A58A7A7E0You're The One171
    745544491ccfcdc0b0803c34f201a6287ed4e30f8SOBONKR12A58A7A7E0You're The One00
    8e701a24d9b6c59f5ac37ab28462ca82470e27cfbSOBONKR12A58A7A7E0You're The One681
    9edc8b7b1fd592a3b69c3d823a742e1a064abec95SOBONKR12A58A7A7E0You're The One00
    from sklearn.preprocessing import Binarizer
    
    # Same binarization with sklearn: values > 0.9 map to 1, the rest to 0
    # (listen counts are non-negative integers, so this means "at least one listen").
    bn = Binarizer(threshold=0.9)
    # Binarizer expects 2-D input. Pass the single-column DataFrame directly
    # instead of wrapping the Series in a Python list (which relied on sklearn
    # coercing a list of Series into a 1 x n array and is rejected by modern
    # versions), then flatten the (n, 1) result back to 1-D.
    pd_watched = bn.transform(popsong_df[['listen_count']]).ravel()
    popsong_df['pd_watched'] = pd_watched
    popsong_df.head(11)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    user_idsong_idtitlelisten_countwatchedpd_watched
    0b6b799f34a204bd928ea014c243ddad6d0be4f8fSOBONKR12A58A7A7E0You're The One211
    1b41ead730ac14f6b6717b9cf8859d5579f3f8d4dSOBONKR12A58A7A7E0You're The One000
    24c84359a164b161496d05282707cecbd50adbfc4SOBONKR12A58A7A7E0You're The One000
    3779b5908593756abb6ff7586177c966022668b06SOBONKR12A58A7A7E0You're The One000
    4dd88ea94f605a63d9fc37a214127e3f00e85e42dSOBONKR12A58A7A7E0You're The One000
    568f0359a2f1cedb0d15c98d88017281db79f9bc6SOBONKR12A58A7A7E0You're The One000
    6116a4c95d63623a967edf2f3456c90ebbf964e6fSOBONKR12A58A7A7E0You're The One1711
    745544491ccfcdc0b0803c34f201a6287ed4e30f8SOBONKR12A58A7A7E0You're The One000
    8e701a24d9b6c59f5ac37ab28462ca82470e27cfbSOBONKR12A58A7A7E0You're The One6811
    9edc8b7b1fd592a3b69c3d823a742e1a064abec95SOBONKR12A58A7A7E0You're The One000
    10fb41d1c374d093ab643ef3bcd70eeb258d479076SOBONKR12A58A7A7E0You're The One111

    多项式特征

    # Two numeric features we will expand with polynomial terms below.
    atk_def = poke_df[['Attack', 'Defense']]
    atk_def.head()
    
    • 1
    • 2
    AttackDefense
    04949
    16263
    28283
    3100123
    45243
    from sklearn.preprocessing import PolynomialFeatures
    
    # Degree-2 polynomial expansion of [Attack, Defense]: output columns are
    # [a, d, a^2, a*d, d^2] (interaction term included, no bias column).
    pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
    res = pf.fit_transform(atk_def)
    res
    
    • 1
    • 2
    • 3
    • 4
    • 5
    array([[    49.,     49.,   2401.,   2401.,   2401.],
           [    62.,     63.,   3844.,   3906.,   3969.],
           [    82.,     83.,   6724.,   6806.,   6889.],
           ..., 
           [   110.,     60.,  12100.,   6600.,   3600.],
           [   160.,     60.,  25600.,   9600.,   3600.],
           [   110.,    120.,  12100.,  13200.,  14400.]])
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    # Name the expanded columns to match PolynomialFeatures' output order.
    intr_features = pd.DataFrame(res, columns=['Attack', 'Defense', 'Attack^2', 'Attack x Defense', 'Defense^2'])
    intr_features.head(5)
    
    • 1
    • 2
    AttackDefenseAttack^2Attack x DefenseDefense^2
    049.049.02401.02401.02401.0
    162.063.03844.03906.03969.0
    282.083.06724.06806.06889.0
    3100.0123.010000.012300.015129.0
    452.043.02704.02236.01849.0

    binning特征

    # FreeCodeCamp 2016 coder-survey subset: one row per respondent.
    fcc_survey_df = pd.read_csv('datasets/fcc_2016_coder_survey_subset.csv', encoding='utf-8')
    fcc_survey_df[['ID.x', 'EmploymentField', 'Age', 'Income']].head()
    
    • 1
    • 2
    ID.xEmploymentFieldAgeIncome
    0cef35615d61b202f1dc794ef2746df14office and administrative support28.032000.0
    1323e5a113644d18185c743c241407754food and beverage22.015000.0
    2b29a1027e5cd062e654a63764157461dfinance19.048000.0
    304a11e4bcb573a1261eb0d9948d32637arts, entertainment, sports, or media26.043000.0
    49368291c93d5d5f5c8cdb1a575e18beceducation20.06000.0
    # Histogram of respondent ages, before binning.
    fig, ax = plt.subplots()
    fcc_survey_df['Age'].hist(color='#A9C5D3')
    ax.set_title('Developer Age Histogram', fontsize=12)
    ax.set_xlabel('Age', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    Text(0,0.5,'Frequency')
    
    • 1

    在这里插入图片描述

    Binning based on rounding

    Age Range: Bin
    ---------------
     0 -  9  : 0
    10 - 19  : 1
    20 - 29  : 2
    30 - 39  : 3
    40 - 49  : 4
    50 - 59  : 5
    60 - 69  : 6
      ... and so on
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    # Bin ages by decade: floor(age / 10) maps 0-9 -> bin 0, 10-19 -> bin 1, etc.
    # np.floor on the Series directly yields the same float bin values; the
    # original double np.array(...) wrapping was redundant (the resulting column
    # is identical since the Series index is the default RangeIndex).
    fcc_survey_df['Age_bin_round'] = np.floor(fcc_survey_df['Age'] / 10.)
    fcc_survey_df[['ID.x', 'Age', 'Age_bin_round']].iloc[1071:1076]
    
    • 1
    • 2
    ID.xAgeAge_bin_round
    10716a02aa4618c99fdb3e24de522a09943117.01.0
    1072f0e5e47278c5f248fe861c5f7214c07a38.03.0
    10736e14f6d0779b7e424fa3fdd9e4bd3bf921.02.0
    1074c2654c07dc929cdf3dad4d1aec4ffbb353.05.0
    1075f07449fc9339b2e57703ec788623252335.03.0

    分位数切分

    # Peek at Age and Income before quantile binning.
    fcc_survey_df[['ID.x', 'Age', 'Income']].iloc[4:9]
    
    • 1
    ID.xAgeIncome
    49368291c93d5d5f5c8cdb1a575e18bec20.06000.0
    5dd0e77eab9270e4b67c19b0d6bbf621b34.040000.0
    67599c0aa0419b59fd11ffede98a3665d23.032000.0
    76dff182db452487f07a47596f314bddc35.040000.0
    89dc233f8ed1c6eb2432672ab4bb3924933.080000.0
    # Histogram of incomes — heavily right-skewed, which motivates both the
    # quantile binning and the log transform in later sections.
    fig, ax = plt.subplots()
    fcc_survey_df['Income'].hist(bins=30, color='#A9C5D3')
    ax.set_title('Developer Income Histogram', fontsize=12)
    ax.set_xlabel('Developer Income', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    Text(0,0.5,'Frequency')
    
    • 1

    在这里插入图片描述

    # Income quartile boundaries (min, 25%, 50%, 75%, max).
    quantile_list = [0, .25, .5, .75, 1.]
    quantiles = fcc_survey_df['Income'].quantile(quantile_list)
    quantiles
    
    • 1
    • 2
    • 3
    0.00      6000.0
    0.25     20000.0
    0.50     37000.0
    0.75     60000.0
    1.00    200000.0
    Name: Income, dtype: float64
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    # Redraw the income histogram with a red vertical line at each quartile.
    fig, ax = plt.subplots()
    fcc_survey_df['Income'].hist(bins=30, color='#A9C5D3')
    
    for quantile in quantiles:
        qvl = plt.axvline(quantile, color='r')
    # All the quantile lines look identical, so a single legend entry
    # (the last line handle) is enough.
    ax.legend([qvl], ['Quantiles'], fontsize=10)
    
    ax.set_title('Developer Income Histogram with Quantiles', fontsize=12)
    ax.set_xlabel('Developer Income', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    Text(0,0.5,'Frequency')
    
    • 1

    在这里插入图片描述

    # Quantile-based binning with pd.qcut: once keeping the raw interval
    # ranges, once with human-readable labels for the four quartile buckets.
    quantile_labels = ['0-25Q', '25-50Q', '50-75Q', '75-100Q']
    fcc_survey_df['Income_quantile_range'] = pd.qcut(fcc_survey_df['Income'], 
                                                     q=quantile_list)
    fcc_survey_df['Income_quantile_label'] = pd.qcut(fcc_survey_df['Income'], 
                                                     q=quantile_list, labels=quantile_labels)
    fcc_survey_df[['ID.x', 'Age', 'Income', 
                   'Income_quantile_range', 'Income_quantile_label']].iloc[4:9]
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    ID.xAgeIncomeIncome_quantile_rangeIncome_quantile_label
    49368291c93d5d5f5c8cdb1a575e18bec20.06000.0(5999.999, 20000.0]0-25Q
    5dd0e77eab9270e4b67c19b0d6bbf621b34.040000.0(37000.0, 60000.0]50-75Q
    67599c0aa0419b59fd11ffede98a3665d23.032000.0(20000.0, 37000.0]25-50Q
    76dff182db452487f07a47596f314bddc35.040000.0(37000.0, 60000.0]50-75Q
    89dc233f8ed1c6eb2432672ab4bb3924933.080000.0(60000.0, 200000.0]75-100Q

    对数变换 Box-Cox

    # Log-transform the skewed income distribution. np.log1p(x) computes
    # log(1 + x) — same values as the spelled-out np.log(1 + x) here, but
    # numerically more accurate for small x; the +1 shift keeps a zero
    # income finite.
    fcc_survey_df['Income_log'] = np.log1p(fcc_survey_df['Income'])
    fcc_survey_df[['ID.x', 'Age', 'Income', 'Income_log']].iloc[4:9]
    
    • 1
    • 2
    ID.xAgeIncomeIncome_log
    49368291c93d5d5f5c8cdb1a575e18bec20.06000.08.699681
    5dd0e77eab9270e4b67c19b0d6bbf621b34.040000.010.596660
    67599c0aa0419b59fd11ffede98a3665d23.032000.010.373522
    76dff182db452487f07a47596f314bddc35.040000.010.596660
    89dc233f8ed1c6eb2432672ab4bb3924933.080000.011.289794
    # Histogram of log-incomes with the mean marked by a red line and annotated.
    income_log_mean = np.round(np.mean(fcc_survey_df['Income_log']), 2)
    
    fig, ax = plt.subplots()
    fcc_survey_df['Income_log'].hist(bins=30, color='#A9C5D3')
    plt.axvline(income_log_mean, color='r')
    ax.set_title('Developer Income Histogram after Log Transform', fontsize=12)
    ax.set_xlabel('Developer Income (log scale)', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.text(11.5, 450, r'$\mu$='+str(income_log_mean), fontsize=10)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    Text(11.5,450,'$\\mu$=10.43')
    
    • 1

    日期相关特征

    import datetime
    import numpy as np
    import pandas as pd
    from dateutil.parser import parse
    import pytz
    
    • 1
    • 2
    • 3
    • 4
    • 5
    # Four ISO-8601 timestamps, each with a different UTC offset.
    time_stamps = ['2015-03-08 10:30:00.360000+00:00', '2017-07-13 15:45:05.755000-07:00',
                   '2012-01-20 22:30:00.254000+05:30', '2016-12-25 00:30:00.000000+10:00']
    df = pd.DataFrame(time_stamps, columns=['Time'])
    df
    
    • 1
    • 2
    • 3
    • 4

    在这里插入图片描述

    Time
    02015-03-08 10:30:00.360000+00:00
    12017-07-13 15:45:05.755000-07:00
    22012-01-20 22:30:00.254000+05:30
    32016-12-25 00:30:00.000000+10:00
    # Parse each string into a timezone-aware pandas Timestamp object and
    # store them alongside the raw strings.
    ts_objs = np.array([pd.Timestamp(item) for item in np.array(df.Time)])
    df['TS_obj'] = ts_objs
    ts_objs
    
    • 1
    • 2
    • 3
    array([Timestamp('2015-03-08 10:30:00.360000+0000', tz='UTC'),
           Timestamp('2017-07-13 15:45:05.755000-0700', tz='pytz.FixedOffset(-420)'),
           Timestamp('2012-01-20 22:30:00.254000+0530', tz='pytz.FixedOffset(330)'),
           Timestamp('2016-12-25 00:30:00+1000', tz='pytz.FixedOffset(600)')], dtype=object)
    
    • 1
    • 2
    • 3
    • 4
    # Extract calendar-level features from each Timestamp.
    df['Year'] = df['TS_obj'].apply(lambda d: d.year)
    df['Month'] = df['TS_obj'].apply(lambda d: d.month)
    df['Day'] = df['TS_obj'].apply(lambda d: d.day)
    df['DayOfWeek'] = df['TS_obj'].apply(lambda d: d.dayofweek)
    # Timestamp.weekday_name was removed in pandas 1.0; day_name() returns
    # the same weekday string ('Monday', ...).
    df['DayName'] = df['TS_obj'].apply(lambda d: d.day_name())
    df['DayOfYear'] = df['TS_obj'].apply(lambda d: d.dayofyear)
    # Timestamp.weekofyear was removed in pandas 2.0; isocalendar().week gives
    # the same ISO week number.
    df['WeekOfYear'] = df['TS_obj'].apply(lambda d: d.isocalendar().week)
    df['Quarter'] = df['TS_obj'].apply(lambda d: d.quarter)
    
    df[['Time', 'Year', 'Month', 'Day', 'Quarter', 
        'DayOfWeek', 'DayName', 'DayOfYear', 'WeekOfYear']]
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    TimeYearMonthDayQuarterDayOfWeekDayNameDayOfYearWeekOfYear
    02015-03-08 10:30:00.360000+00:0020153816Sunday6710
    12017-07-13 15:45:05.755000-07:00201771333Thursday19428
    22012-01-20 22:30:00.254000+05:30201212014Friday203
    32016-12-25 00:30:00.000000+10:002016122546Saturday36051

    时间相关特征

    # Extract time-of-day features from each Timestamp.
    df['Hour'] = df['TS_obj'].apply(lambda d: d.hour)
    df['Minute'] = df['TS_obj'].apply(lambda d: d.minute)
    df['Second'] = df['TS_obj'].apply(lambda d: d.second)
    df['MUsecond'] = df['TS_obj'].apply(lambda d: d.microsecond)   # microseconds (original comment wrongly said milliseconds)
    df['UTC_offset'] = df['TS_obj'].apply(lambda d: d.utcoffset()) # offset from UTC as a timedelta
    
    df[['Time', 'Hour', 'Minute', 'Second', 'MUsecond', 'UTC_offset']]
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    TimeHourMinuteSecondMUsecondUTC_offset
    02015-03-08 10:30:00.360000+00:001030036000000:00:00
    12017-07-13 15:45:05.755000-07:0015455755000-1 days +17:00:00
    22012-01-20 22:30:00.254000+05:302230025400005:30:00
    32016-12-25 00:30:00.000000+10:000300010:00:00

    按照早晚切分时间

    # Bucket the hour of day into named periods. pd.cut uses half-open bins
    # (-1,5], (5,11], (11,16], (16,21], (21,23], so hour 0 lands in 'Late Night'.
    hour_bins = [-1, 5, 11, 16, 21, 23]
    bin_names = ['Late Night', 'Morning', 'Afternoon', 'Evening', 'Night']
    df['TimeOfDayBin'] = pd.cut(df['Hour'], 
                                bins=hour_bins, labels=bin_names)
    df[['Time', 'Hour', 'TimeOfDayBin']]
    
    • 1
    • 2
    • 3
    • 4
    • 5
    TimeHourTimeOfDayBin
    02015-03-08 10:30:00.360000+00:0010Morning
    12017-07-13 15:45:05.755000-07:0015Afternoon
    22012-01-20 22:30:00.254000+05:3022Night
    32016-12-25 00:30:00.000000+10:000Late Night
  • 相关阅读:
    【多线程】线程池
    ChatGPT Plus暂停注册,用户激增压力太大!
    Kubernetes Prometheus 监控 Nginx
    手写JavaScript常见5种设计模式
    C++11异步任务轮子实现(header-only)
    [计算机网络实验] TCP协议
    在一张 24 GB 的消费级显卡上用 RLHF 微调 20B LLMs
    数据结构——链表
    Flink通讯模型—Akka与Actor模型
    vue-router清除url地址栏路由参数
  • 原文地址:https://blog.csdn.net/weixin_39107270/article/details/133640644