def split_csv(
    train_num,
    source_path=r"F:\openXBOW\openXBOW-master\Sentiment Analysis Dataset.csv",
    train_file_path=r"F:\openXBOW\openXBOW-master\Sentiment Analysis Dataset - train.csv",
    test_file_path=r"F:\openXBOW\openXBOW-master\Sentiment Analysis Dataset - test.csv",
):
    """Split a CSV dataset into a training file and a test file.

    The first ``train_num`` data rows are written to ``train_file_path`` and
    every remaining row is written to ``test_file_path``, so the two outputs
    together contain each parsed row exactly once. Rows that pandas cannot
    parse are skipped while reading.

    Args:
        train_num: Number of leading rows that go into the training file.
        source_path: CSV file to split.
        train_file_path: Destination of the training split.
        test_file_path: Destination of the test split.

    Raises:
        ValueError: If ``train_num`` is negative.
    """
    if train_num < 0:
        raise ValueError("train_num must be non-negative")

    try:
        # pandas >= 1.3 spells "skip malformed rows" as on_bad_lines="skip".
        data = pd.read_csv(source_path, on_bad_lines="skip")
    except TypeError:
        # Older pandas releases only understand the legacy keyword.
        data = pd.read_csv(source_path, error_bad_lines=False)

    # iloc slices are zero-based and end-exclusive, so the test split must
    # start at train_num itself; starting at train_num + 1 would drop a row.
    # index=False keeps the row index out of the written files.
    data.iloc[:train_num].to_csv(train_file_path, index=False)
    data.iloc[train_num:].to_csv(test_file_path, index=False)
将切分出来的训练集生成codebook,然后用于分析测试集 :
- java -Xmx12000m -jar openXBOW.jar -i "Sentiment Analysis Dataset - train.csv" -attributes ncr0 -o bowTwitter-train.arff -minTermFreq 2000 -B dictionaryTwitter
-
- java -Xmx12000m -jar openXBOW.jar -i "Sentiment Analysis Dataset - test.csv" -attributes ncr0 -o bowTwitter-test.arff -b dictionaryTwitter
-
- 解析:“-Xmx12000m”将 JVM 的最大堆空间设为 12000 MB。-minTermFreq 设置最小词频(词频低于该值的词不进入词典)。-B 将训练集生成的 codebook 保存到文件,-b 在处理测试集时加载该 codebook
使用2-gram:
- java -Xmx12000m -jar openXBOW.jar -i "Sentiment Analysis Dataset - train.csv" -attributes ncr0 -o bowTwitter-train.arff -nGram 2 -minTermFreq 2000 -B dictionaryTwitter
-
- 解析:-nGram 指定 n-gram 的 n(此处为 2,即 2-gram)
-
- java -Xmx12000m -jar openXBOW.jar -i "Sentiment Analysis Dataset - test.csv" -attributes ncr0 -o bowTwitter-test.arff -nGram 2 -b dictionaryTwitter
- IOTRAIN="-i examples/example5/audio_llds_train.csv -o examples/example5/xbow_train.arff -l examples/example5/labels_train.csv -B examples/example5/codebook"
-
- IOTEST="-i examples/example5/audio_llds_test.csv -o examples/example5/xbow_test.arff -l examples/example5/labels_test.csv -b examples/example5/codebook"
-
- # java -jar openXBOW.jar $IOTRAIN
- # 优化 :增加参数:-standardizeInput 从34%提升到73%
- java -jar openXBOW.jar $IOTRAIN -standardizeInput
-
- # 增加codebook大小:精度:0.792
- java -jar openXBOW.jar $IOTRAIN -standardizeInput -log -size 1000
-
- # 精度降低了?变成了0.354
- java -jar openXBOW.jar $IOTRAIN -standardizeInput -log -size 1000 -a 5 -attributes nt1[65]2[65]
-
- # 使用 weka判断精度:
- java -classpath "E:\Weka-3-8-5\weka.jar" weka.classifiers.functions.SMO -t examples/example5/xbow_train.arff
- # 解析:选择分类器SMO;只给 -t 而不给 -T 时,Weka 默认在该文件上进行10折交叉验证(CV)进行评估
-
- # 测试集:
- java -jar openXBOW.jar $IOTEST
-
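- # 计算测试集的精度:0.742(注意:下面的命令只给了 -t,实际是在测试集自身上做10折交叉验证;若要用训练集训练出的模型评估测试集,应使用 -t examples/example5/xbow_train.arff -T examples/example5/xbow_test.arff)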
- java -classpath "E:\Weka-3-8-5\weka.jar" weka.classifiers.functions.SMO -t examples/example5/xbow_test.arff