# Feature encapsulation
import tensorflow as tf
from tensorflow import keras
# SparseFeat / VarLenSparseFeat / DenseFeat are the deepctr-style feature-column
# namedtuples; depending on the deepctr version the import path may be
# deepctr.feature_column or deepctr.inputs.
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat

# Maximum index (i.e., vocabulary size) for each feature.
feature_max_idx = {'user_id': 4, 'movie_id': 208, 'gender': 3, 'age': 4, 'occupation': 4, 'zip': 4}
embedding_dim = 16
# User features
user_feature_columns = [
    SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
    SparseFeat('gender', feature_max_idx['gender'], embedding_dim),
    SparseFeat('age', feature_max_idx['age'], embedding_dim),
    SparseFeat('occupation', feature_max_idx['occupation'], embedding_dim),
    SparseFeat('zip', feature_max_idx['zip'], embedding_dim),
    # The history sequence shares the movie_id embedding table via embedding_name.
    VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                embedding_name='movie_id'),
                     maxlen=50, combiner='mean', length_name='hist_len'),
]
# Item features
item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]
item_feature_name=item_feature_columns[0].name # 'movie_id'
item_vocabulary_size=item_feature_columns[0].vocabulary_size # 208
feature_columns=user_feature_columns+item_feature_columns
# Split the combined feature columns by type.
sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else []
varlen_sparse_feature_columns = list(filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if feature_columns else []
# Build one Embedding layer per embedding_name.
sparse_embedding_dic = {}
for feat in sparse_feature_columns:
    print(feat.embedding_name)
    emb = keras.layers.Embedding(feat.vocabulary_size, feat.embedding_dim,
                                 name="sparse_emb_" + feat.embedding_name,
                                 trainable=feat.trainable)
    sparse_embedding_dic[feat.embedding_name] = emb
for feat in varlen_sparse_feature_columns:
    print(feat.sparsefeat.embedding_name)
    # hist_movie_id has embedding_name='movie_id', so this overwrites the entry
    # created above: the item id and the history sequence share this one layer.
    emb = keras.layers.Embedding(feat.sparsefeat.vocabulary_size, feat.sparsefeat.embedding_dim,
                                 name="sparse_seq_emb_" + feat.sparsefeat.name,
                                 trainable=feat.sparsefeat.trainable)
    sparse_embedding_dic[feat.sparsefeat.embedding_name] = emb
print(sparse_embedding_dic)
# Output: a dict mapping each embedding_name to its Embedding layer
# (keys: user_id, gender, age, occupation, zip, movie_id; layer reprs omitted).
# Build one Input tensor per user feature.
input_features = {}
for feat in user_feature_columns:
    if isinstance(feat, SparseFeat):
        input_features[feat.name] = keras.layers.Input(shape=(1,), name=feat.name, dtype=feat.dtype)
    elif isinstance(feat, DenseFeat):
        input_features[feat.name] = keras.layers.Input(shape=(feat.dimension,), name=feat.name, dtype=feat.dtype)
    elif isinstance(feat, VarLenSparseFeat):
        input_features[feat.sparsefeat.name] = keras.layers.Input(shape=(feat.maxlen,), name=feat.sparsefeat.name,
                                                                  dtype=feat.sparsefeat.dtype)
    else:
        raise TypeError("Invalid feature column type, got %s" % type(feat))
print(input_features)
# Output: a dict mapping each feature name to its Input tensor
# (keys: user_id, gender, age, occupation, zip, hist_movie_id; tensor reprs omitted).
input_features holds the Input tensor for each feature, and sparse_embedding_dic
holds the Embedding layer for each feature. The embedded output of a single
feature is sparse_embedding_dic[embedding_name](input_features[feature_name]).
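As a quick sketch (reusing only the dicts built above), the shared embedding_name
means the history sequence is looked up in the same table as the item id,
just with a different input shape:
shared_emb = sparse_embedding_dic['movie_id']            # one table for both features
seq_vecs = shared_emb(input_features['hist_movie_id'])   # (None, 50, 16) sequence lookup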
user_sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeat), user_feature_columns)) if user_feature_columns else []
user_varlen_sparse_feature_columns = list(filter(lambda x: isinstance(x, VarLenSparseFeat), user_feature_columns)) if user_feature_columns else []
# Look up every user feature in its (possibly shared) embedding table.
embedding_dict = {}
for fc in user_sparse_feature_columns:
    feature_name = fc.name
    embedding_name = fc.embedding_name
    print(feature_name)
    embedding_dict[feature_name] = sparse_embedding_dic[embedding_name](input_features[feature_name])
# varlen_embedding_lookup
for fc in user_varlen_sparse_feature_columns:
    feature_name = fc.sparsefeat.name
    embedding_name = fc.sparsefeat.embedding_name
    print(feature_name)
    embedding_dict[feature_name] = sparse_embedding_dic[embedding_name](input_features[feature_name])
print(embedding_dict)
# Output: a dict mapping each feature name to its embedded tensor
# (user_id/gender/age/occupation/zip: (None, 1, 16); hist_movie_id: (None, 50, 16)).
modelTest=keras.models.Model(inputs=input_features["user_id"],outputs=embedding_dict["user_id"])
print(modelTest(tf.Variable([[1],[2]])))
print("\n*************************************\n")
modelTest.summary()
Tensor("model_14/sparse_emb_user_id/embedding_lookup/Identity_1:0", shape=(2, 1, 16), dtype=float32)
*************************************
Model: "model_14"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
user_id (InputLayer) [(None, 1)] 0
_________________________________________________________________
sparse_emb_user_id (Embeddin (None, 1, 16) 64
=================================================================
Total params: 64
Trainable params: 64
Non-trainable params: 0
_________________________________________________________________
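The parameter count checks out: an Embedding layer holds vocabulary_size × embedding_dim weights, here 4 × 16 = 64.
assert feature_max_idx['user_id'] * embedding_dim == 64  # 4 * 16 trainable weights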
Multi-valued (sequence) features are pooled after embedding:
class SequencePoolingLayer(keras.layers.Layer):
    """Simplified pooling: always mean-pools over the sequence axis,
    ignoring the mode/supports_masking arguments and any padding."""
    def __init__(self, mode='mean', supports_masking=False, **kwargs):
        super(SequencePoolingLayer, self).__init__(**kwargs)

    def call(self, inputs):
        return tf.expand_dims(tf.reduce_mean(inputs, axis=1), axis=1)  # (B, T, D) -> (B, 1, D)
pooling_vec_list={}
pooling_vec_list['hist_movie_id']=SequencePoolingLayer()(embedding_dict['hist_movie_id'])
pooling_vec_list['hist_movie_id']
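Note that the feature column declared length_name='hist_len', which the simplified
layer above ignores: padded positions are averaged in. A hedged sketch of a
length-aware mean (the MaskedMeanPooling name and the hist_len Input are
assumptions, not part of the original code):
class MaskedMeanPooling(keras.layers.Layer):
    def call(self, inputs):
        seq_emb, seq_len = inputs                                   # (B, T, D), (B, 1)
        mask = tf.sequence_mask(tf.cast(seq_len[:, 0], tf.int32),
                                maxlen=tf.shape(seq_emb)[1],
                                dtype=tf.float32)                   # (B, T): 1 for real steps
        summed = tf.reduce_sum(seq_emb * tf.expand_dims(mask, -1), axis=1)
        denom = tf.maximum(tf.cast(seq_len, tf.float32), 1.0)       # avoid division by zero
        return tf.expand_dims(summed / denom, axis=1)               # (B, 1, D)

# Hypothetical usage with a separate length input:
# hist_len_input = keras.layers.Input(shape=(1,), name='hist_len', dtype='int64')
# pooled = MaskedMeanPooling()([embedding_dict['hist_movie_id'], hist_len_input])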
# Collect the fixed-length embeddings plus the pooled sequence vector.
sparse_embedding_list = []
for key, value in embedding_dict.items():
    if key != "hist_movie_id":
        sparse_embedding_list.append(value)
for key, value in pooling_vec_list.items():
    sparse_embedding_list.append(value)
print(sparse_embedding_list)
# Output: a list of six (None, 1, 16) tensors (tensor reprs omitted).
# Six (None, 1, 16) tensors concatenate to (None, 1, 96) and flatten to (None, 96).
user_dnn_input = keras.layers.Flatten()(keras.layers.Concatenate(-1)(sparse_embedding_list))
user_dnn_input
# Item-tower inputs.
item_input_features = {}
item_feature_columns
for fc in item_feature_columns:
    if isinstance(fc, SparseFeat):
        item_input_features[fc.name] = keras.layers.Input(shape=(1,), name=fc.name, dtype=fc.dtype)
item_inputs_list = list(item_input_features.values())
item_inputs_list
class EmbeddingIndex(keras.layers.Layer):
    """Emits a constant index tensor [0, ..., n-1], ignoring its input;
    used to pull the full weight table out of an Embedding layer."""
    def __init__(self, index, **kwargs):
        self.index = index
        super(EmbeddingIndex, self).__init__(**kwargs)

    def build(self, input_shape):
        super(EmbeddingIndex, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, x, **kwargs):
        return tf.constant(self.index)
# Look up indices 0..207 to materialize the whole movie_id table, shape (208, 16).
item_index = EmbeddingIndex(list(range(item_vocabulary_size)))(item_input_features[item_feature_name])
item_embedding_weight = sparse_embedding_dic[item_feature_name](item_index)
item_embedding_weight
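A quick sanity check (a sketch; the model name is arbitrary): the recovered table
has shape (208, 16) no matter what batch is fed, because EmbeddingIndex ignores its input.
item_emb_model = keras.models.Model(inputs=item_inputs_list, outputs=item_embedding_weight)
print(item_emb_model(tf.constant([[0]])).shape)  # (208, 16) for any dummy movie_id batch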
The DNN tower has two fully connected layers with hidden units [64, 16]:
hidden1 = keras.layers.Dense(64, activation='relu')(user_dnn_input)
user_dnn_out = keras.layers.Dense(16, activation='relu')(hidden1)  # feed hidden1, not user_dnn_input
user_dnn_out
Since training sees only positive samples, a sampled softmax loss is used. Its three inputs are:
① the full item embedding table,
② user_dnn_out,
③ the item Input (the label index).
The wiring is as follows:
embeddings=item_embedding_weight
embeddings
inputs=user_dnn_out
inputs
label_idx=item_input_features[item_feature_name]
label_idx
class SampledSoftmaxLayer(keras.layers.Layer):
    def __init__(self, num_sampled=5, **kwargs):
        self.num_sampled = num_sampled
        super(SampledSoftmaxLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.size = input_shape[0][0]  # vocabulary size, from the embedding table's shape
        self.zero_bias = self.add_weight(shape=[self.size],
                                         initializer=keras.initializers.Zeros,
                                         dtype=tf.float32,
                                         trainable=False,
                                         name="bias")
        super(SampledSoftmaxLayer, self).build(input_shape)

    def call(self, inputs_with_label_idx, training=None, **kwargs):
        embeddings, inputs, label_idx = inputs_with_label_idx
        loss = tf.nn.sampled_softmax_loss(weights=embeddings,  # self.item_embedding
                                          biases=self.zero_bias,
                                          labels=label_idx,
                                          inputs=inputs,
                                          num_sampled=self.num_sampled,
                                          num_classes=self.size,  # self.target_song_size
                                          )
        return tf.expand_dims(loss, axis=1)

    def compute_output_shape(self, input_shape):
        return (None, 1)

    def get_config(self, ):
        config = {'num_sampled': self.num_sampled}
        base_config = super(SampledSoftmaxLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
output=SampledSoftmaxLayer()([embeddings,inputs,label_idx])
output
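At serving time the sampled loss is dropped; a hedged sketch of how every item
could be scored against the user vector (the top-k value 10 is an arbitrary assumption):
scores = tf.matmul(user_dnn_out, embeddings, transpose_b=True)  # (None, 208) logits over all items
top_items = tf.math.top_k(scores, k=10)                         # candidate recall set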
The model's inputs are the earlier user_inputs_list plus item_inputs_list, and its output is the output tensor above:
user_inputs_list = list(input_features.values())
print(user_inputs_list)
print(item_inputs_list)
model=keras.models.Model(inputs=user_inputs_list + item_inputs_list, outputs=output)
# Output: user_inputs_list holds six Input tensors
# (user_id, gender, age, occupation, zip, hist_movie_id);
# item_inputs_list holds one Input tensor (movie_id). Tensor reprs omitted.
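Because the model's output is already the per-example loss, training can use an
identity-style loss that simply averages y_pred; a hedged sketch (the optimizer
choice and the train_model_input / dummy-label names are assumptions):
model.compile(optimizer='adam',
              loss=lambda y_true, y_pred: tf.reduce_mean(y_pred))
# model.fit(train_model_input, np.zeros((num_samples, 1)), batch_size=256, epochs=1)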