Apriori:这里做了个小优化。比如 abcde 和 abcdf 自连接得到新项集 abcdef 时,可以直接用 abcde 出现的位置和 f 出现的位置取交集。这样第 n 项集的计数可以由第 n-1 项集的位置信息和单个数字本身的位置信息计算出来,只需要保存第 n-1 项集的位置信息就可以提速。
// NOTE(review): the header names of the original #include lines were lost when
// the listing was extracted; this set is reconstructed from the identifiers the
// code uses. Confirm against the author's original if it is available.
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>
#ifdef _WIN32
#include <windows.h>  // GetTickCount
#endif
// Path of the transaction file: one transaction per line, item ids separated
// by non-digit characters.
#define DATA_NAME R"(D:\Cpan\Download\.vscode\retail.dat)"
// Capacity of the `items` array.
#define N 100000
// The two sizes below are parenthesised so that they stay a single operand when
// used inside a larger expression (e.g. 10*MAX_ITEMID_NUM).
#define MAX_RECORD_NUM (88162 + 1)
#define MAX_ITEMID_NUM (16470 + 1)
using namespace std;
-
- int recordNum = 0;
- int mp[MAX_ITEMID_NUM];
- vector<int> items[N];
-
- double support;
- void fastRead(){
- char c;
- bool lastIsNum = false;
- uint16_t num;
- FILE* fp = fopen(DATA_NAME, "r");
- while (~(c = fgetc(fp))) {
- if (c >= '0' && c <= '9') {
- if(lastIsNum){
- num *= 10;
- num += c - '0';
- }
- else{
- num = c - '0';
- }
- lastIsNum = true;
- }
- else {
- if (lastIsNum){
- items[num].push_back(recordNum);
- mp[num]++;
- }
- if (c == '\n'){
- recordNum++;
- }
- lastIsNum = false;
- }
- }
- if (lastIsNum) {
- items[num].push_back(recordNum);
- mp[num]++;
- }
- if (c != '\n') {
- recordNum++;
- }
- fclose(fp);
- }
/**
 * Returns the elements of tmp1 that also occur in tmp2.
 *
 * Both lists must be sorted in ascending order; the result keeps the order of
 * tmp1. The comparison operators of the original loop were lost in extraction
 * and are restored here. Unlike the original, an empty tmp2 is handled without
 * reading tmp2[0] or underflowing size()-1.
 *
 * @param tmp1 sorted list whose elements are filtered
 * @param tmp2 sorted list to match against
 * @return elements of tmp1 present in tmp2
 */
std::vector<int> same_number(const std::vector<int> &tmp1, const std::vector<int> &tmp2){
    std::vector<int> res;
    std::size_t ite = 0;
    // When tmp2 is exhausted no later element of tmp1 can match, so stop.
    for (std::size_t i = 0; i < tmp1.size() && ite < tmp2.size(); ++i) {
        while (ite < tmp2.size() && tmp2[ite] < tmp1[i]) {
            ++ite;
        }
        if (ite < tmp2.size() && tmp2[ite] == tmp1[i]) {
            res.push_back(tmp1[i]);
        }
    }
    return res;
}
- vector
int>> v; - vector
int>> v2; - vector
s,s2; - int total;
- void cal(){
- total=0;
- s.clear();
- v.clear();
- for(int i=0;i<=MAX_ITEMID_NUM;++i){
- if(mp[i]>=recordNum*support*0.01){
- string tri=to_string(i);
- int tmp_len=tri.size();
- for(int j=1;j<=5-tmp_len;j++){
- tri="0"+tri;
- }
- s.push_back(tri);
- v.push_back(items[i]);
- }
- }
- int now_set_level=1;
- while(1){
- total+=s.size();
- cout<<"共有"<
size()<<"个频繁"<"项集\n"; - // for(auto t:s){cout<
- // cout<
- v2.clear();
- s2.clear();
- for(int i=0;i
size();++i){ - for(int j=i+1;j
size();++j){ - if(s[i].substr(0,s[i].size()-5)==s[j].substr(0,s[i].size()-5)){
- int num_end=stol(s[j].substr(s[i].size()-5,5));
- vector<int> tmp1=v[i];
- vector<int> tmp2=items[num_end];
- vector<int> same_vector=same_number(tmp1,tmp2);
- if(same_vector.size()>=recordNum*support*0.01){
- v2.push_back(same_vector);
- string new_tmp_string=s[i]+s[j].substr(s[i].size()-5,5);
- s2.push_back(new_tmp_string);
- }
- }
- //cout<<(s[i].substr(s[i].size()-5,5))<
- }
- }
- v=v2;
- s=s2;
- if(v.size()==0){
- break;
- }
- now_set_level+=1;
- }
- cout<<"共有"<
"个频繁项集\n"; - }
- signed main(){
- cout<<"请输入置信度(单位%)\n";
- cin>>support;
- fastRead();
-
- long starttime = GetTickCount();
- cal();
-
- long endtime = GetTickCount();
- long time_cost = endtime - starttime;
- cout << "timecost " << time_cost << " ms" << endl;
- cout<<"请输入操作\n";
- cout<<" 1:更改置信度\n";
- int oper;
- cin>>oper;
- if(oper==1){
- cout<<"请输入置信度\n";
- cin>>support;
- cal();
- }
- }
Fpgrowth的算法:我没有递归建树,只建了一次树,所以速度比完整的fpgrowth要慢(当知道还要建其他树的时候,实在不知道我这屎山代码怎么再继续写下去,所以后面直接暴力了)。建了一次树后,直接拿链过去爆搜子集计数。速度主要慢在我的链最长有10左右,而fpgrowth最后剪完只有3-4;通过链获取子集的复杂度是 \(O(2^{len})\)(\(len\) 为链的长度),所以会慢。如果有一些方法能把无用节点去掉,这种做法也会快(以后有缘再回来改吧)。
// NOTE(review): the header names of the original #include lines were lost when
// the listing was extracted; this set is reconstructed from the identifiers the
// code uses. Confirm against the author's original if it is available.
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>
#ifdef _WIN32
#include <windows.h>  // GetTickCount
#endif
// Path of the transaction file: one transaction per line, item ids separated
// by non-digit characters.
#define DATA_NAME R"(D:\Cpan\Download\.vscode\retail.dat)"
// Capacity of the `items` array.
#define N 100000
// Parenthesised so that 10*MAX_ITEMID_NUM multiplies the whole value; without
// the parentheses it expanded to 10*16470 + 1.
#define MAX_RECORD_NUM (88162 + 1)
#define MAX_ITEMID_NUM (16470 + 1)
using namespace std;
-
- int recordNum = 0;
- int mp[MAX_ITEMID_NUM];
- vector<int> items[N];
-
- double support;
- void fastRead(){
- char c;
- bool lastIsNum = false;
- uint16_t num;
- FILE* fp = fopen(DATA_NAME, "r");
- while (~(c = fgetc(fp))) {
- if (c >= '0' && c <= '9') {
- if(lastIsNum){
- num *= 10;
- num += c - '0';
- }
- else{
- num = c - '0';
- }
- lastIsNum = true;
- }
- else {
- if (lastIsNum){
- items[recordNum].push_back(num);
- mp[num]++;
- }
- if (c == '\n'){
- recordNum++;
- }
- lastIsNum = false;
- }
- }
- if (lastIsNum) {
- items[recordNum].push_back(num);
- mp[num]++;
- }
- if (c != '\n') {
- recordNum++;
- }
- fclose(fp);
- }
-
- int node_number;
- vector
int>> v; - vector<int> head_table[MAX_ITEMID_NUM];
- int head_table_back[10*MAX_ITEMID_NUM];
- vector
int,int>> fp_tree[10*MAX_ITEMID_NUM]; - pair<int,int> fp_tree_value[10*MAX_ITEMID_NUM];
-
- bool cmp(int &a,int &b){
- return mp[a]>mp[b];
- }
- void build(int son,int fa,vector<int> &value,int index){
- if(index==value.size()) return;
- bool exi=0;
- for(auto t:fp_tree[son]){
- if(t.first!=fa && fp_tree_value[t.first].first==value[index]){
- fp_tree_value[t.first].second+=1;
- exi=1;
- build(t.first,son,value,index+1);
- }
- }
- if(exi==0){
- node_number+=1;
- head_table[value[index]].push_back(node_number);
- head_table_back[node_number]=value[index];
- fp_tree_value[node_number]={value[index],1};
- fp_tree[son].push_back({node_number,1});
- fp_tree[node_number].push_back({son,-1});
- build(node_number,son,value,index+1);
- }
- }
- int tmp_dp[10*MAX_ITEMID_NUM];
- void back(int number,vector<int> &res_chain){
- if(number==0 && fp_tree_value[number].second==0) return ;
- for(auto t:fp_tree[number]){
- if(t.second==-1 && fp_tree_value[t.first].second!=0){
- // cout<<"节点 "<
- // cout<<"以前是"<
- res_chain.push_back({head_table_back[t.first]});
- //cout<<"现在是"<
- back(t.first,res_chain);
- }
- }
- }
- int res[MAX_ITEMID_NUM];
- vector<int> number_item;
// Recursive worker for dfs_son. `current` is shared and restored on the way
// back up instead of being copied at every level.
static void dfs_son_collect(std::vector<std::vector<int>> &out,
                            const std::vector<int> &value,
                            std::vector<int> &current,
                            int index, int now_number){
    // Past the last element, or the size cap is reached: nothing further can
    // be added, so emit what has been chosen (if anything).
    if (index >= static_cast<int>(value.size()) || now_number > 3) {
        if (!current.empty()) {
            out.push_back(current);
        }
        return;
    }
    current.push_back(value[index]);
    dfs_son_collect(out, value, current, index + 1, now_number + 1);
    current.pop_back();
    dfs_son_collect(out, value, current, index + 1, now_number);
}

/**
 * Appends to res_son every non-empty subsequence of value[index..] that can be
 * formed by adding elements to `tmp` while now_number <= 3.
 *
 * Called with tmp empty, index 0 and now_number 1 this yields all non-empty
 * subsets of `value` with at most three elements, each in the order of
 * `value`; subsets that include an element are emitted before those that skip
 * it.
 *
 * The signature, truncated in extraction, is restored. Compared with the
 * previous version the partial subset is no longer copied on every call and
 * the recursion stops as soon as the size cap is reached; the emitted subsets
 * and their order are unchanged.
 *
 * @param res_son    output list, appended to
 * @param value      elements to choose from
 * @param tmp        elements already chosen
 * @param index      first position of `value` still to be decided
 * @param now_number 1 + number of elements chosen so far
 */
void dfs_son(std::vector<std::vector<int >> &res_son, std::vector<int> &value,
             std::vector<int> tmp, int index, int now_number){
    dfs_son_collect(res_son, value, tmp, index, now_number);
}
- signed main(){
- cout<<"请输入置信度(单位%)\n";
- cin>>support;
- fastRead();
- for(int i=0;i
- vector<int> tmp;
- for(auto t:items[i]){
- if(mp[t]>=recordNum*support*0.01){
- tmp.push_back(t);
- }
- }
- if(tmp.size()==0) continue;
- else{
- sort(tmp.begin(),tmp.end(),cmp);
- // for(auto t:tmp){cout<
- // cout<
- v.push_back(tmp);
- }
- }
- // cout<
- long starttime = GetTickCount();
- for(auto t:v){
- build(0,-1,t,0);
- }
- for(int i=0;i
- if(mp[i]>=recordNum*support*0.01){
- res[1]+=1;
- number_item.push_back(i);
- //cout<
- }
- }
-
-
- for(auto t:number_item){
- for(int i=0;i
- tmp_dp[i]=0;
- }
- map
int>,int> map_vec; - for(auto j:head_table[t]){
- //cout<
- vector<int> vt={};
- int value=fp_tree_value[j].second;
- back(j,vt);
- if(!vt.size()) continue;
- // vector
> vs; - // for(int k=0;k<(1<<(vt.size()));k++){
- // vector
resson; - // for(int o=0;o<=vt.size()-1;o++){
- // if((k>>o)&1){
- // resson.push_back(vt[o]);
- // }
- // }
- // if(resson.size()){
- // vs.push_back(resson);
- // }
- // }
- // for(auto t:vs){
- // map_vec[t]+=value;
- // }
- vector
int> > res_son={}; - vector<int> tmp2={};
- dfs_son(res_son,vt,tmp2,0,1);
- for(auto t:res_son){
- map_vec[t]+=value;
- }
- }
- for(auto t:map_vec){
- if(t.second>=recordNum*support*0.01){
- res[t.first.size()+1]+=1;
- }
- }
- }
-
- long endtime = GetTickCount();
- long time_cost = endtime - starttime;
- cout << "timecost " << time_cost << " ms" << endl;
-
- int total=0;
- for(int i=1;i<=MAX_ITEMID_NUM;++i){
- if(res[i]!=0){
- total+=res[i];
- cout<<"共有"<
"个频繁"<"项集\n"; - }
- else{
- cout<<"共有"<
"个频繁项集\n"; - break;
- }
- }
-
- }
-
相关阅读:
JavaScript 基础笔记
计算机网络——运输层作业5.2
【intent-filter】AndroidManifest中<intent-filter>标签的 部分作用
AUTOSARCAN-Tp协议
使用Scrapy的调试工具和日志系统定位并解决爬虫问题
[附源码]计算机毕业设计JAVAjsp医院药房管理系统
【附代码】使用Shapely计算多边形外扩与收缩
java计算机毕业设计基于springboo+vue的学生毕业离校系统
RESTful API — 规范概念、URI 设计、映射 HTTP 方法、API 版本管理、API 命名、统一分页、过滤、排序、 搜索功能
爬取某音乐榜单歌曲
-
原文地址:https://blog.csdn.net/m0_61735576/article/details/138074366