            DeepFM(Deep Factorization Machine)是一种结合了深度学习和因子分解机的推荐模型。它在CTR(点击率)预测任务中表现出色,并能够有效地处理稀疏特征。

            DeepFM模型由两个部分组成:因子分解机(Factorization Machine)和深度神经网络(Deep Neural Network)。其中FM部分负责学习低阶(一阶和二阶)特征交互,深度部分负责学习高阶特征交互,两部分共享同一组特征嵌入(embedding),输出相加后得到最终的预测值。











    import numpy as np
    import torch
    import torch.optim as optim
    from torch.utils.data import DataLoader
    from torch.utils.data import sampler

    from model.DeepFM import DeepFM
    from data.dataset import CriteoDataset

    # Train/validation split over the first Num_total rows of ./data/train.txt:
    # rows [0, Num_train) are used for training and rows [Num_train, Num_total)
    # for validation (9000 / 1000 with the values below).
    Num_train = 9000
    Num_total = 10000

    # Load the data. The validation loader samples from the same file with the
    # same flags, so the dataset is parsed once and shared by both loaders
    # instead of reading train.txt a second time.
    train_data = CriteoDataset('./data', train=True)
    loader_train = DataLoader(
        train_data, batch_size=16,
        sampler=sampler.SubsetRandomSampler(range(Num_train)))
    val_data = train_data
    loader_val = DataLoader(
        val_data, batch_size=16,
        sampler=sampler.SubsetRandomSampler(range(Num_train, Num_total)))

    # One vocabulary size per field; np.loadtxt yields floats, so cast to int
    # before the sizes are handed to nn.Embedding inside DeepFM.
    feature_sizes = np.loadtxt('./data/feature_sizes.txt', delimiter=',')
    feature_sizes = [int(x) for x in feature_sizes]
    print(feature_sizes)

    model = DeepFM(feature_sizes, use_cuda=False)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.0)
    model.fit(loader_train, loader_val, optimizer, epochs=5, verbose=True)

    数据处理文件 data/dataset.py

    import torch
    from torch.utils.data import Dataset
    import pandas as pd
    import numpy as np
    import os

    # Number of leading columns in every row that hold continuous features;
    # all remaining columns are treated as categorical feature indices.
    continous_features = 13


    class CriteoDataset(Dataset):
        """
        Dataset wrapper around the processed Criteo CSV files so that they can
        be consumed through PyTorch's DataLoader.

        In train mode each item is ``(Xi, Xv, target)``; in test mode each item
        is ``(Xi, Xv)``. ``Xi`` holds per-field feature indices with shape
        ``(field_size, 1)`` and ``Xv`` holds per-field feature values with
        shape ``(field_size,)``.
        """

        def __init__(self, root, train=True):
            """
            Load ``train.txt`` or ``test.txt`` from ``root`` into memory.

            Inputs:
            - root: Directory that contains the processed data files.
            - train: If True read ``train.txt`` (last column is the target),
              otherwise read ``test.txt``.

            Raises:
            - RuntimeError: if ``root`` does not exist.
            """
            self.root = root
            self.train = train

            if not self._check_exists():
                raise RuntimeError('Dataset not found.')

            if self.train:
                data = pd.read_csv(os.path.join(root, 'train.txt'))
                self.train_data = data.iloc[:, :-1].values
                self.target = data.iloc[:, -1].values
            else:
                data = pd.read_csv(os.path.join(root, 'test.txt'))
                # NOTE(review): the last column is dropped here as well; this
                # presumably assumes test.txt has the same layout as train.txt
                # - confirm against the preprocessing script.
                self.test_data = data.iloc[:, :-1].values

        def __getitem__(self, idx):
            """Return the encoded sample at ``idx`` (with target in train mode)."""
            if self.train:
                Xi, Xv = self._encode_row(self.train_data[idx, :])
                return Xi, Xv, self.target[idx]
            # test_data is a NumPy array (``.values``), so it is indexed
            # directly; calling ``.iloc`` on it raises AttributeError.
            return self._encode_row(self.test_data[idx, :])

        @staticmethod
        def _encode_row(row):
            """
            Split one raw row into the (index, value) pair used by the model.

            Continuous fields get index 0 and keep their own value; categorical
            fields keep their index and get value 1 (one-hot). The same
            encoding is used for train and test rows so that both address the
            same embedding entries.
            """
            dense = row[:continous_features]
            sparse = row[continous_features:]

            index = np.concatenate((np.zeros_like(dense), sparse))
            value = np.concatenate((dense, np.ones_like(sparse)))

            Xi = torch.from_numpy(index.astype(np.int32)).unsqueeze(-1)
            # NOTE(review): the int32 cast truncates fractional continuous
            # values; presumably they were discretised during preprocessing.
            Xv = torch.from_numpy(value.astype(np.int32))
            return Xi, Xv

        def __len__(self):
            """Return the number of rows loaded for the active mode."""
            if self.train:
                return len(self.train_data)
            return len(self.test_data)

        def _check_exists(self):
            """Return True if the root directory exists (files are not checked)."""
            return os.path.exists(self.root)

     模型文件 model/DeepFM.py

    # -*- coding: utf-8 -*-
    """
    A PyTorch implementation of DeepFM for binary (click / no-click) prediction.
    """
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from time import time


    class DeepFM(nn.Module):
        """
        A DeepFM network trained with binary cross-entropy on logits.

        The architecture has two parts that share the same field embeddings:
        the FM part models first- and second-order feature interactions and
        the deep part stacks Linear -> BatchNorm -> Dropout layers on top of
        the concatenated embeddings. The forward pass returns one logit per
        sample (sum of both parts plus a learned bias).

        More details can be found in the paper:
        DeepFM: A Factorization-Machine based Neural Network for CTR Prediction,
        Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li, Xiuqiang He.
        """

        def __init__(self, feature_sizes, embedding_size=4,
                     hidden_dims=None, num_classes=1, dropout=None,
                     use_cuda=True, verbose=False):
            """
            Initialize a new network.

            Inputs:
            - feature_sizes: A list of integers giving the number of distinct
              values (embedding rows) of each field.
            - embedding_size: An integer giving the size of feature embeddings.
            - hidden_dims: A list of integers giving the size of each hidden
              layer. Defaults to [32, 32].
            - num_classes: An integer, stored on the instance. No layer of this
              size is built; the output is always a single logit per sample.
            - dropout: A list of dropout probabilities, one per hidden layer.
              Defaults to [0.5, 0.5].
            - use_cuda: Bool, use CUDA when it is available.
            - verbose: Bool, currently unused.

            Raises:
            - ValueError: if fewer dropout rates than hidden layers are given.
            """
            super().__init__()

            # None sentinels avoid shared mutable defaults; copying keeps the
            # module independent from lists owned by the caller.
            hidden_dims = [32, 32] if hidden_dims is None else list(hidden_dims)
            dropout = [0.5, 0.5] if dropout is None else list(dropout)
            if len(dropout) < len(hidden_dims):
                raise ValueError(
                    'dropout must provide one rate per hidden layer: got %d '
                    'rates for %d layers' % (len(dropout), len(hidden_dims)))

            self.field_size = len(feature_sizes)
            self.feature_sizes = feature_sizes
            self.embedding_size = embedding_size
            self.hidden_dims = hidden_dims
            self.num_classes = num_classes
            self.dtype = torch.long
            self.bias = torch.nn.Parameter(torch.randn(1))

            # Select the device; fit() and check_accuracy() move data to it.
            if use_cuda and torch.cuda.is_available():
                self.device = torch.device('cuda')
            else:
                self.device = torch.device('cpu')

            # FM part: one scalar weight table and one embedding table per field.
            self.fm_first_order_embeddings = nn.ModuleList(
                [nn.Embedding(feature_size, 1)
                 for feature_size in self.feature_sizes])
            self.fm_second_order_embeddings = nn.ModuleList(
                [nn.Embedding(feature_size, self.embedding_size)
                 for feature_size in self.feature_sizes])

            # Deep part. Layers are registered as linear_<i> / batchNorm_<i> /
            # dropout_<i> attributes; the names are kept so that existing
            # state_dict keys stay valid.
            all_dims = [self.field_size * self.embedding_size] + \
                self.hidden_dims + [self.num_classes]
            for i in range(1, len(self.hidden_dims) + 1):
                setattr(self, 'linear_' + str(i),
                        nn.Linear(all_dims[i - 1], all_dims[i]))
                setattr(self, 'batchNorm_' + str(i),
                        nn.BatchNorm1d(all_dims[i]))
                setattr(self, 'dropout_' + str(i),
                        nn.Dropout(dropout[i - 1]))

        def forward(self, Xi, Xv):
            """
            Forward pass of the network.

            Inputs:
            - Xi: A tensor of feature indices, shape (N, field_size, 1).
            - Xv: A tensor of feature values, shape (N, field_size) or
              (N, field_size, 1).

            Returns:
            - A tensor of shape (N,) holding one logit per sample.
            """
            # FM part: look up each field's embedding and scale it by the
            # field's value. reshape(-1, 1) lets the value broadcast over the
            # embedding dimension for both accepted Xv layouts.
            first_order_terms = []
            second_order_terms = []
            for i, (emb_first, emb_second) in enumerate(
                    zip(self.fm_first_order_embeddings,
                        self.fm_second_order_embeddings)):
                idx = Xi[:, i, :]
                val = Xv[:, i].reshape(-1, 1)
                first_order_terms.append(torch.sum(emb_first(idx), 1) * val)
                second_order_terms.append(torch.sum(emb_second(idx), 1) * val)

            fm_first_order = torch.cat(first_order_terms, 1)

            # Pairwise interactions via 0.5 * ((sum v)^2 - sum v^2).
            summed_emb = sum(second_order_terms)
            square_of_sum = summed_emb * summed_emb
            sum_of_square = sum(term * term for term in second_order_terms)
            fm_second_order = (square_of_sum - sum_of_square) * 0.5

            # Deep part, fed with the same (shared) second-order embeddings.
            # NOTE(review): there is no non-linear activation between the
            # layers; left unchanged to keep trained models reproducible.
            deep_out = torch.cat(second_order_terms, 1)
            for i in range(1, len(self.hidden_dims) + 1):
                deep_out = getattr(self, 'linear_' + str(i))(deep_out)
                deep_out = getattr(self, 'batchNorm_' + str(i))(deep_out)
                deep_out = getattr(self, 'dropout_' + str(i))(deep_out)

            # Final logit: first order + second order + deep output + bias.
            total_sum = torch.sum(fm_first_order, 1) + \
                torch.sum(fm_second_order, 1) + torch.sum(deep_out, 1) + self.bias
            return total_sum

        def fit(self, loader_train, loader_val, optimizer, epochs=100,
                verbose=False, print_every=100):
            """
            Train the model and periodically report validation accuracy.

            Inputs:
            - loader_train: Iterable yielding (xi, xv, y) training batches.
            - loader_val: Loader passed to check_accuracy() for validation.
            - optimizer: Optimizer over this model's parameters, e.g.
              "torch.optim.Adam()" or "torch.optim.SGD()".
            - epochs: Integer, number of epochs.
            - verbose: Bool, print the loss and validation accuracy.
            - print_every: Integer, report every this many iterations.
            """
            model = self.train().to(device=self.device)
            criterion = F.binary_cross_entropy_with_logits

            for _ in range(epochs):
                for t, (xi, xv, y) in enumerate(loader_train):
                    xi = xi.to(device=self.device, dtype=self.dtype)
                    xv = xv.to(device=self.device, dtype=torch.float)
                    y = y.to(device=self.device, dtype=torch.float)

                    total = model(xi, xv)
                    loss = criterion(total, y)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    if verbose and t % print_every == 0:
                        print('Iteration %d, loss = %.4f' % (t, loss.item()))
                        # check_accuracy() restores training mode on exit, so
                        # Dropout/BatchNorm keep training after this call.
                        self.check_accuracy(loader_val, model)
                        print()

        def check_accuracy(self, loader, model):
            """
            Print and return the classification accuracy of `model` on `loader`.

            Inputs:
            - loader: Loader yielding (xi, xv, y) batches; its
              ``dataset.train`` flag only selects the printed header.
            - model: The model to evaluate.

            Returns:
            - Accuracy as a float in [0, 1]; 0.0 when the loader is empty.
            """
            if loader.dataset.train:
                print('Checking accuracy on validation set')
            else:
                print('Checking accuracy on test set')

            num_correct = 0
            num_samples = 0
            # Remember the current mode: evaluation must not leave the model
            # in eval mode when it is called in the middle of training.
            was_training = model.training
            model.eval()
            try:
                with torch.no_grad():
                    for xi, xv, y in loader:
                        xi = xi.to(device=self.device, dtype=self.dtype)
                        xv = xv.to(device=self.device, dtype=torch.float)
                        y = y.to(device=self.device, dtype=torch.bool)
                        total = model(xi, xv)
                        preds = (torch.sigmoid(total) > 0.5)
                        num_correct += (preds == y).sum().item()
                        num_samples += preds.size(0)
            finally:
                model.train(was_training)

            # Guard against an empty loader instead of dividing by zero.
            acc = float(num_correct) / num_samples if num_samples else 0.0
            print('Got %d / %d correct (%.2f%%)' % (num_correct, num_samples, 100 * acc))
            return acc

     3. 项目数据见下方链接




