• (Transfer Learning)迁移学习在IMDB上训练情感分析模型


    1. 背景

    有些场景下,一开始数据量很小。如果只用几千条数据从头训练一个全新的深度学习文本分类模型,效果不会很好。这时你有两种选择:1. 用传统的机器学习方法训练;2. 利用迁移学习,在一个预训练模型的基础上训练。本博客教你怎么用 TensorFlow Hub 和 Keras 在少量数据上训练一个文本分类模型。

    2. 实践

    2.1. 下载IMDB 数据集,参考下面博客。

    Imdb影评的数据集介绍与下载_imdb影评数据集-CSDN博客

    2.2.  预处理数据

    将下面代码中的 imdb_raw_data_dir 替换为你本地解压后的 aclImdb 目录,并在当前目录下创建 dataset 目录用于保存输出。

    1. import numpy as np
    2. import os as os
    3. import re
    4. from sklearn.model_selection import train_test_split
    5. vocab_size = 30000
    6. maxlen = 200
    7. imdb_raw_data_dir = "/Users/harry/Documents/apps/ml/aclImdb"
    8. save_dir = "dataset"
    9. def get_data(datapath =r'D:\train_data\aclImdb\aclImdb\train' ):
    10. pos_files = os.listdir(datapath + '/pos')
    11. neg_files = os.listdir(datapath + '/neg')
    12. print(len(pos_files))
    13. print(len(neg_files))
    14. pos_all = []
    15. neg_all = []
    16. for pf, nf in zip(pos_files, neg_files):
    17. with open(datapath + '/pos' + '/' + pf, encoding='utf-8') as f:
    18. s = f.read()
    19. s = process(s)
    20. pos_all.append(s)
    21. with open(datapath + '/neg' + '/' + nf, encoding='utf-8') as f:
    22. s = f.read()
    23. s = process(s)
    24. neg_all.append(s)
    25. print(len(pos_all))
    26. # print(pos_all[0])
    27. print(len(neg_all))
    28. X_orig= np.array(pos_all + neg_all)
    29. # print(X_orig)
    30. Y_orig = np.array([1 for _ in range(len(pos_all))] + [0 for _ in range(len(neg_all))])
    31. print("X_orig:", X_orig.shape)
    32. print("Y_orig:", Y_orig.shape)
    33. return X_orig, Y_orig
    34. def generate_dataset():
    35. X_orig, Y_orig = get_data(imdb_raw_data_dir + r'/train')
    36. X_orig_test, Y_orig_test = get_data(imdb_raw_data_dir + r'/test')
    37. X_orig = np.concatenate([X_orig, X_orig_test])
    38. Y_orig = np.concatenate([Y_orig, Y_orig_test])
    39. X = X_orig
    40. Y = Y_orig
    41. np.random.seed = 1
    42. random_indexs = np.random.permutation(len(X))
    43. X = X[random_indexs]
    44. Y = Y[random_indexs]
    45. X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
    46. print("X_train:", X_train.shape)
    47. print("y_train:", y_train.shape)
    48. print("X_test:", X_test.shape)
    49. print("y_test:", y_test.shape)
    50. np.savez(save_dir + '/train_test', X_train=X_train, y_train=y_train, X_test= X_test, y_test=y_test )
    51. def rm_tags(text):
    52. re_tag = re.compile(r'<[^>]+>')
    53. return re_tag.sub(' ', text)
    54. def clean_str(string):
    55. string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    56. string = re.sub(r"\'s", " \'s", string) # it's -> it 's
    57. string = re.sub(r"\'ve", " \'ve", string) # I've -> I 've
    58. string = re.sub(r"n\'t", " n\'t", string) # doesn't -> does n't
    59. string = re.sub(r"\'re", " \'re", string) # you're -> you are
    60. string = re.sub(r"\'d", " \'d", string) # you'd -> you 'd
    61. string = re.sub(r"\'ll", " \'ll", string) # you'll -> you 'll
    62. string = re.sub(r"\'m", " \'m", string) # I'm -> I 'm
    63. string = re.sub(r",", " , ", string)
    64. string = re.sub(r"!", " ! ", string)
    65. string = re.sub(r"\(", " \( ", string)
    66. string = re.sub(r"\)", " \) ", string)
    67. string = re.sub(r"\?", " \? ", string)
    68. string = re.sub(r"\s{2,}", " ", string)
    69. return string.strip().lower()
    70. def process(text):
    71. text = clean_str(text)
    72. text = rm_tags(text)
    73. #text = text.lower()
    74. return text
if __name__ == '__main__':
    # Entry point: build dataset/train_test.npz from the raw aclImdb folder.
    generate_dataset()

    执行完后,产生train_test.npz 文件

    2.3.  训练模型

    1. 取数据集

    1. def get_dataset_to_train():
    2. train_test = np.load('dataset/train_test.npz', allow_pickle=True)
    3. x_train = train_test['X_train']
    4. y_train = train_test['y_train']
    5. x_test = train_test['X_test']
    6. y_test = train_test['y_test']
    7. return x_train, y_train, x_test, y_test

    2. 创建模型

    基于 nnlm-en-dim50/2 预训练的文本嵌入向量,在嵌入层之上加了两层全连接层。

    1. def get_model():
    2. hub_layer = hub.KerasLayer(embedding_url, input_shape=[], dtype=tf.string, trainable=True)
    3. # Build the model
    4. model = Sequential([
    5. hub_layer,
    6. Dense(16, activation='relu'),
    7. Dropout(0.5),
    8. Dense(2, activation='softmax')
    9. ])
    10. print(model.summary())
    11. model.compile(optimizer=keras.optimizers.Adam(),
    12. loss=keras.losses.SparseCategoricalCrossentropy(),
    13. metrics=[keras.metrics.SparseCategoricalAccuracy()])
    14. return model

    还可以使用来自 TFHub 的许多其他预训练文本嵌入向量:

    还有很多!在 TFHub 上查找更多文本嵌入向量模型

    3. 评估你的模型

    1. def evaluate_model(test_data, test_labels):
    2. model = load_trained_model()
    3. # Evaluate the model
    4. results = model.evaluate(test_data, test_labels, verbose=2)
    5. print("Test accuracy:", results[1])
    6. def load_trained_model():
    7. # model = get_model()
    8. # model.load_weights('./models/model_new1.h5')
    9. model = tf.keras.models.load_model('models_pb')
    10. return model

    4. 测试几个例子

    1. def predict(real_data):
    2. model = load_trained_model()
    3. probabilities = model.predict([real_data]);
    4. print("probabilities :",probabilities)
    5. result = get_label(probabilities)
    6. return result
    7. def get_label(probabilities):
    8. index = np.argmax(probabilities[0])
    9. print("index :" + str(index))
    10. result_str = index_dic.get(str(index))
    11. # result_str = list(index_dic.keys())[list(index_dic.values()).index(index)]
    12. return result_str
    13. def predict_my_module():
    14. # review = "I don't like it"
    15. # review = "this is bad movie "
    16. # review = "This is good movie"
    17. review = " this is terrible movie"
    18. # review = "This isn‘t great movie"
    19. # review = "i think this is bad movie"
    20. # review = "I'm not very disappoint for this movie"
    21. # review = "I'm not very disappoint for this movie"
    22. # review = "I am very happy for this movie"
    23. #neg:0 postive:1
    24. s = predict(review)
    25. print(s)
if __name__ == '__main__':
    # Full pipeline: load the preprocessed IMDB split, build and fine-tune
    # the model, evaluate it, then run a sample prediction.
    x_train, y_train, x_test, y_test = get_dataset_to_train()
    model = get_model()
    model = train(model, x_train, y_train, x_test, y_test)
    evaluate_model(x_test, y_test)
    predict_my_module()

    完整代码

    1. import numpy as np
    2. import tensorflow as tf
    3. from keras.models import Sequential
    4. from keras.layers import Dense, Dropout
    5. import keras as keras
    6. from keras.callbacks import EarlyStopping, ModelCheckpoint
    7. import tensorflow_hub as hub
    8. embedding_url = "https://tfhub.dev/google/nnlm-en-dim50/2"
    9. index_dic = {"0":"negative", "1": "positive"}
    10. def get_dataset_to_train():
    11. train_test = np.load('dataset/train_test.npz', allow_pickle=True)
    12. x_train = train_test['X_train']
    13. y_train = train_test['y_train']
    14. x_test = train_test['X_test']
    15. y_test = train_test['y_test']
    16. return x_train, y_train, x_test, y_test
    17. def get_model():
    18. hub_layer = hub.KerasLayer(embedding_url, input_shape=[], dtype=tf.string, trainable=True)
    19. # Build the model
    20. model = Sequential([
    21. hub_layer,
    22. Dense(16, activation='relu'),
    23. Dropout(0.5),
    24. Dense(2, activation='softmax')
    25. ])
    26. print(model.summary())
    27. model.compile(optimizer=keras.optimizers.Adam(),
    28. loss=keras.losses.SparseCategoricalCrossentropy(),
    29. metrics=[keras.metrics.SparseCategoricalAccuracy()])
    30. return model
def train(model , train_data, train_labels, test_data, test_labels):
    """Fine-tune the TF-Hub model on the IMDB split and return it.

    Trains with early stopping on validation accuracy and checkpoints the
    best full model (SavedModel format) to ./models_pb/.
    """
    # train_data, train_labels, test_data, test_labels = get_dataset_to_train()
    # Coerce every sample to a plain unicode string — the hub embedding
    # layer consumes a 1-D array of string values.
    train_data = [tf.compat.as_str(tf.compat.as_bytes(str(x))) for x in train_data]
    test_data = [tf.compat.as_str(tf.compat.as_bytes(str(x))) for x in test_data]
    train_data = np.asarray(train_data) # Convert to numpy array
    test_data = np.asarray(test_data) # Convert to numpy array
    print(train_data.shape, test_data.shape)
    # Stop once validation accuracy has not improved for 4 epochs.
    early_stop = EarlyStopping(monitor='val_sparse_categorical_accuracy', patience=4, mode='max', verbose=1)
    # Define the ModelCheckpoint callback (HDF5-weights variant kept for reference):
    # checkpoint = ModelCheckpoint( './models/model_new1.h5', monitor='val_sparse_categorical_accuracy', save_best_only=True,
    # mode='max', verbose=1)
    # Save the best full model to ./models_pb/ whenever validation accuracy improves.
    checkpoint_pb = ModelCheckpoint(filepath="./models_pb/", monitor='val_sparse_categorical_accuracy', save_weights_only=False, save_best_only=True)
    # NOTE(review): only the first 2000 samples are used — presumably to
    # demonstrate transfer learning on little data; confirm this is intended.
    history = model.fit(train_data[:2000], train_labels[:2000], epochs=45, batch_size=45, validation_data=(test_data, test_labels), shuffle=True,
                        verbose=1, callbacks=[early_stop, checkpoint_pb])
    print("history", history)
    return model
    47. def evaluate_model(test_data, test_labels):
    48. model = load_trained_model()
    49. # Evaluate the model
    50. results = model.evaluate(test_data, test_labels, verbose=2)
    51. print("Test accuracy:", results[1])
    52. def predict(real_data):
    53. model = load_trained_model()
    54. probabilities = model.predict([real_data]);
    55. print("probabilities :",probabilities)
    56. result = get_label(probabilities)
    57. return result
    58. def get_label(probabilities):
    59. index = np.argmax(probabilities[0])
    60. print("index :" + str(index))
    61. result_str = index_dic.get(str(index))
    62. # result_str = list(index_dic.keys())[list(index_dic.values()).index(index)]
    63. return result_str
    64. def load_trained_model():
    65. # model = get_model()
    66. # model.load_weights('./models/model_new1.h5')
    67. model = tf.keras.models.load_model('models_pb')
    68. return model
    69. def predict_my_module():
    70. # review = "I don't like it"
    71. # review = "this is bad movie "
    72. # review = "This is good movie"
    73. review = " this is terrible movie"
    74. # review = "This isn‘t great movie"
    75. # review = "i think this is bad movie"
    76. # review = "I'm not very disappoint for this movie"
    77. # review = "I'm not very disappoint for this movie"
    78. # review = "I am very happy for this movie"
    79. #neg:0 postive:1
    80. s = predict(review)
    81. print(s)
if __name__ == '__main__':
    # Full pipeline: load the preprocessed IMDB split, build and fine-tune
    # the model, evaluate it, then run a sample prediction.
    x_train, y_train, x_test, y_test = get_dataset_to_train()
    model = get_model()
    model = train(model, x_train, y_train, x_test, y_test)
    evaluate_model(x_test, y_test)
    predict_my_module()

  • 相关阅读:
    611. 有效三角形的个数
    No ‘Access-Control-Allow-Origin‘ header前端浏览器跨域用LiveServer处理
    python解析wirshark抓包数据
    使用JDBC连接DM8数据库(ODBC连接DM8数据库)
    Lottie动画多动图切换遇到的坑
    机器学习——seaborn实用画图方法简介
    Vue + Element UI 实现权限管理系统 前端篇(四):优化登录流程
    gitlab使用简介
    audiosever 基础知识点
    C#子线程操作UI线程更新报线程间操作无效
  • 原文地址:https://blog.csdn.net/keeppractice/article/details/134465987