• python 综合练习


    条件:ML100k.data

    注意:程序需要从列表中删除元素。在遍历列表的同时对其调用 remove 并不安全:使用 for i in range(len(data)) 会出现索引越界,而使用 for i in data 虽然不会报错,却会悄悄跳过被删除元素之后的那一项,导致删除不完整。更稳妥的做法是先确定要保留的记录,再用列表推导式构建一个新列表。

    1. import pickle
    def load_ratings(path):
        """Read tab-separated integer records from *path*, one record per line.

        Each non-blank line becomes a list of ints. The rest of the script
        reads index 0 as the user id, index 1 as the film id and index 2 as
        the rating; any further columns are carried along untouched.
        """
        records = []
        with open(path, 'r') as file:
            for line in file:
                line = line.strip('\n')
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing empty line at EOF)
                    # instead of crashing on int('').
                    continue
                records.append([int(item) for item in line.split('\t')])
        return records


    def group_by_column(records, column):
        """Return a dict mapping each value found at *column* to its records."""
        groups = {}
        for record in records:
            groups.setdefault(record[column], []).append(record)
        return groups


    def drop_sparse(records, min_count=5):
        """Return the records whose user AND film both have >= *min_count* ratings.

        Counts are taken on the unfiltered input. A new list is built rather
        than calling ``records.remove`` inside a loop over ``records``:
        removing while iterating silently skips the element that follows each
        removal, so consecutive unwanted records would survive.
        """
        by_user = group_by_column(records, 0)
        by_film = group_by_column(records, 1)
        return [
            record for record in records
            if len(by_user[record[0]]) >= min_count
            and len(by_film[record[1]]) >= min_count
        ]


    def mean_rating(records):
        """Return the average of the rating field (index 2) over *records*.

        Returns 0.0 for an empty list so callers never divide by zero.
        """
        if not records:
            return 0.0
        return sum(record[2] for record in records) / len(records)


    def rating_histogram(records, levels=5):
        """Count how many records carry each rating 1..*levels*.

        Slot ``k - 1`` of the returned list holds the count for rating ``k``.
        Ratings are assumed to lie in 1..levels -- TODO confirm against the
        data file.
        """
        counts = [0] * levels
        for record in records:
            counts[record[2] - 1] += 1
        return counts


    def relabel(records):
        """Renumber users and films from 0 in order of first appearance.

        The records are modified in place (fields 0 and 1 are overwritten).
        Returns ``(user_num, item_num)``, the old-id -> new-id mappings.
        """
        user_num = {}
        item_num = {}
        for record in records:
            user, item = record[0], record[1]
            # setdefault assigns the next free index only on first sight.
            record[0] = user_num.setdefault(user, len(user_num))
            record[1] = item_num.setdefault(item, len(item_num))
        return user_num, item_num


    if __name__ == "__main__":
        data = load_ratings("ML100k.data")

        # Drop inactive users and unpopular films.
        user_dict = group_by_column(data, 0)
        film_dict = group_by_column(data, 1)
        data = drop_sparse(data)

        # Count the remaining users, films and ratings.
        user_sum = group_by_column(data, 0)
        item_sum = group_by_column(data, 1)
        print(len(user_sum))
        print(len(item_sum))
        print(len(data))

        # Fraction of all (user, film) pairs that actually have a rating.
        cells = len(user_sum) * len(item_sum)
        sparsity = len(data) / cells if cells else 0.0
        print(sparsity)

        # Average rating per user, per film (both ordered by ascending
        # original id) and over the whole data set.
        sorted_user = sorted(user_sum)
        sorted_item = sorted(item_sum)
        user_average = [mean_rating(user_sum[user]) for user in sorted_user]
        item_average = [mean_rating(item_sum[item]) for item in sorted_item]
        global_average = mean_rating(data)

        # Distribution of ratings 1..5.
        rating_num = rating_histogram(data)
        print(rating_num)

        # Renumber users and films from 0 so the largest user index is
        # len(user_sum) - 1 and the largest film index is len(item_sum) - 1.
        user_num, item_num = relabel(data)

     

  • 相关阅读:
    Ubuntu安装ufw
    前后端分离项目,vue+uni-app+php+mysql在线小说电子书阅读小程序系统 开题报告
    数据结构由中序序列和后序序列构造二叉树
    【毕业设计】电商产品评论数据分析可视化(情感分析) - python 大数据
    测开 - 项目篇 - 细节狂魔
    JavaSE 集合类详解
    【Redis入门笔记 01】redis 安装 & 配置
    ClickHouse(03)ClickHouse怎么安装和部署
    多线程的概念
    MySQL 基础笔记(2)
  • 原文地址:https://blog.csdn.net/Rhett_Butler0922/article/details/132811902