Embedding:将独热编码后的类别型特征向量化,生成稠密特征
MLP:将稠密特征输入多层神经网络进行拟合,以得到最优结果

链接:https://pan.baidu.com/s/1e_98bghHp3X2hNoPoNn-sQ?pwd=ykaw
提取码:ykaw
参考:
https://andyguo.blog.csdn.net/article/details/121094641
- import tensorflow as tf
-
# Locations of the sample CSV files. Each file is fetched through
# tf.keras.utils.get_file, and the value it returns is kept as the path used
# for reading later on.
TRAIN_DATA_URL = "file:///D:/SparrowRecSys-master/src/main/resources/webroot/sampledata/modelSamples.csv"
samples_file_path = tf.keras.utils.get_file(
    fname="modelSamples.csv",
    origin=TRAIN_DATA_URL,
)

# Training split.
TRAIN_DATA_URL1 = "file:///D:/SparrowRecSys-master/src/main/resources/webroot/sampledata/trainingSamples.csv"
samples_file_path1 = tf.keras.utils.get_file(
    fname="trainingSamples.csv",
    origin=TRAIN_DATA_URL1,
)

# Test split.
TRAIN_DATA_URL2 = "file:///D:/SparrowRecSys-master/src/main/resources/webroot/sampledata/testSamples.csv"
samples_file_path2 = tf.keras.utils.get_file(
    fname="testSamples.csv",
    origin=TRAIN_DATA_URL2,
)
-
-
# Load the data
def get_dataset(file_path, batch_size=12):
    """Build a batched dataset of (features, label) pairs from a CSV file.

    Args:
        file_path: Path of the CSV file to read.
        batch_size: Number of rows per batch. Defaults to 12, the value that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        The dataset created by ``tf.data.experimental.make_csv_dataset``,
        with the ``label`` column split off as the label.
    """
    return tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=batch_size,
        label_name='label',
        # Cells containing "0" are treated as missing values.
        na_value="0",
        # One pass over the file; repetition is left to the caller.
        num_epochs=1,
        # Rows that cannot be parsed are skipped instead of aborting the read.
        ignore_errors=True)
-
# Training and test sets, each read from its own CSV file (training first).
train_dataset, test_dataset = map(
    get_dataset,
    (samples_file_path1, samples_file_path2),
)
-
-
# Categorical features: each one becomes a categorical column that is then
# wrapped in an embedding column.
# Keys involved: movieId, userId, movieGenre*, userGenre*

# Vocabulary of genre labels.
genre_vocab = ['Film-Noir', 'Action', 'Adventure', 'Horror', 'Romance', 'War', 'Comedy', 'Western', 'Documentary',
               'Sci-Fi', 'Drama', 'Thriller',
               'Crime', 'Fantasy', 'Animation', 'IMAX', 'Mystery', 'Children', 'Musical']

# Every user-preference genre slot and every movie genre slot is looked up in
# the same vocabulary list.
GENRE_FEATURES = dict.fromkeys(
    (
        'userGenre1',
        'userGenre2',
        'userGenre3',
        'userGenre4',
        'userGenre5',
        'movieGenre1',
        'movieGenre2',
        'movieGenre3',
    ),
    genre_vocab,
)

# Genre strings -> vocabulary-based categorical column -> embedding of size 10.
categorical_columns = [
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=feature, vocabulary_list=vocab),
        10)
    for feature, vocab in GENRE_FEATURES.items()
]

# User and movie ids are integer identity columns, so no vocabulary is needed.
# Movie id: categorical identity column, then embedding of size 10.
movie_col = tf.feature_column.categorical_column_with_identity(key='movieId', num_buckets=1001)
movie_emb_col = tf.feature_column.embedding_column(movie_col, 10)

# User id: categorical identity column, then an embedding layer to turn the
# sparse id into a dense vector of size 10.
user_col = tf.feature_column.categorical_column_with_identity(key='userId', num_buckets=30001)
user_emb_col = tf.feature_column.embedding_column(user_col, 10)

categorical_columns += [movie_emb_col, user_emb_col]
-
-
# Numerical features are fed to the MLP directly.
# tf.feature_column.numeric_column reads a continuous numeric value per key.
numerical_columns = [
    tf.feature_column.numeric_column(column_name)
    for column_name in (
        'releaseYear',
        'movieRatingCount',
        'movieAvgRating',
        'movieRatingStddev',
        'userRatingCount',
        'userAvgRating',
        'userRatingStddev',
    )
]
# Build the model: the numerical and categorical feature columns enter through
# a DenseFeatures layer, followed by two hidden layers and a sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.DenseFeatures(numerical_columns + categorical_columns),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Compile the model. Metrics are reported in this order: accuracy, ROC AUC,
# PR AUC.
roc_auc_metric = tf.keras.metrics.AUC(curve='ROC')
pr_auc_metric = tf.keras.metrics.AUC(curve='PR')
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', roc_auc_metric, pr_auc_metric],
)
-
-
# Train for five passes over the training set.
model.fit(train_dataset, epochs=5)

# evaluate() returns the loss followed by the metrics passed to compile().
test_loss, test_accuracy, test_roc_auc, test_pr_auc = model.evaluate(test_dataset)

print('\n\nTest Loss {}, Test Accuracy {}, Test ROC AUC {}, Test PR AUC {}'.format(test_loss, test_accuracy,test_roc_auc, test_pr_auc))

# Show predictions next to the true labels for a single test batch.
# Features and labels must come from the SAME batch: make_csv_dataset shuffles
# rows by default, so a second pass over test_dataset can yield a different
# order, and predictions from one pass would be paired with labels of other
# rows. Taking one batch also avoids loading the whole test set into memory.
for sample_features, sample_labels in test_dataset.take(1):
    predictions = model.predict(sample_features)

    for prediction, goodRating in zip(predictions, sample_labels):
        print("Predicted good rating: {:.2%}".format(prediction[0]),
              " | Actual rating label: ",
              ("Good Rating" if bool(goodRating) else "Bad Rating"))
Epoch 1/5
7403/7403 [==============================] - 308s 41ms/step - loss: 3.5653 - accuracy: 0.5772 - auc_2: 0.5816 - auc_3: 0.6283
Epoch 2/5
7403/7403 [==============================] - 312s 42ms/step - loss: 0.6243 - accuracy: 0.6718 - auc_2: 0.7176 - auc_3: 0.7395
Epoch 3/5
7403/7403 [==============================] - 335s 45ms/step - loss: 0.5597 - accuracy: 0.7137 - auc_2: 0.7770 - auc_3: 0.7984
Epoch 4/5
7403/7403 [==============================] - 296s 40ms/step - loss: 0.5215 - accuracy: 0.7422 - auc_2: 0.8135 - auc_3: 0.8336
Epoch 5/5
7403/7403 [==============================] - 277s 37ms/step - loss: 0.4978 - accuracy: 0.7605 - auc_2: 0.8332 - auc_3: 0.8548
Predicted good rating: 79.00% | Actual rating label: Bad Rating
Predicted good rating: 84.13% | Actual rating label: Good Rating
Predicted good rating: 85.21% | Actual rating label: Bad Rating
Predicted good rating: 57.63% | Actual rating label: Bad Rating
Predicted good rating: 10.32% | Actual rating label: Bad Rating
Predicted good rating: 45.51% | Actual rating label: Good Rating
Predicted good rating: 28.17% | Actual rating label: Bad Rating
Predicted good rating: 71.97% | Actual rating label: Bad Rating
Predicted good rating: 77.62% | Actual rating label: Bad Rating
Predicted good rating: 83.60% | Actual rating label: Bad Rating
Predicted good rating: 70.58% | Actual rating label: Bad Rating
Predicted good rating: 59.75% | Actual rating label: Bad Rating