• 清洗文本、统计高频词、情感分析、情感分类、主题建模挖掘主题


    import pandas as pd
    import re
    import nltk
    from nltk import FreqDist
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    from nltk.tokenize import word_tokenize
    import spacy
    from spacy.lang.en.stop_words import STOP_WORDS
    from gensim.corpora import Dictionary
    from gensim.models import LdaModel

    # 下载NLTK的停用词、情感分析和词性标注所需的资源
    # Fetch the NLTK resources this script depends on: stop words, the
    # Punkt tokenizer models, and the VADER sentiment lexicon.
    for resource in ('stopwords', 'punkt', 'vader_lexicon'):
        nltk.download(resource)

    # English spaCy pipeline, used later for tokenization in topic modeling.
    nlp = spacy.load("en_core_web_sm")

    # Load the pre-tokenized review data produced by the previous step.
    df = pd.read_excel('nltk分词处理结果第二次.xlsx')

    def clean_text(text):
        """Normalize one raw content cell for downstream NLP.

        Removes HTML-like tags, collapses whitespace runs (including
        newlines) to single spaces, trims the ends, and lowercases.

        Args:
            text: raw cell value. Non-strings (e.g. the float NaN pandas
                yields for empty Excel cells) are treated as empty text
                instead of raising TypeError inside re.sub.

        Returns:
            The cleaned, lowercased string ('' for non-string input).
        """
        if not isinstance(text, str):
            # Empty Excel cells arrive as float('nan'); re.sub would raise.
            return ''
        # Strip HTML tags (non-greedy so adjacent tags don't merge).
        cleaned_text = re.sub(r'<.*?>', '', text)
        # Collapse whitespace/newline runs to one space and trim the edges.
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        return cleaned_text.lower()

    # Clean every review, keeping the raw 'content' column intact.
    df['cleaned_content'] = df['content'].map(clean_text)

    # Word-frequency analysis: flatten all cleaned documents into one
    # token list and report the ten most common tokens.
    words = [tok for doc in df['cleaned_content'] for tok in word_tokenize(doc)]
    freq_dist = FreqDist(words)
    print("词频分析结果:", freq_dist.most_common(10))

    # Sentiment analysis: score each cleaned review with VADER, keeping
    # only the 'compound' value, a single normalized score in [-1, 1].
    sia = SentimentIntensityAnalyzer()

    def _compound_score(text):
        return sia.polarity_scores(text)['compound']

    df['sentiment_score'] = df['cleaned_content'].apply(_compound_score)
    print("情感分析结果:", df['sentiment_score'])

    # Classification thresholds for the VADER compound score (range [-1, 1]).
    positive_threshold = 0.5
    negative_threshold = -0.5

    def classify_sentiment(score, pos_threshold=positive_threshold,
                           neg_threshold=negative_threshold):
        """Map a VADER compound score to a coarse sentiment label.

        Thresholds are parameters (defaulting to the module-level
        constants) instead of hidden global reads, so the function can be
        reused and tested standalone.

        Args:
            score: compound sentiment score in [-1, 1].
            pos_threshold: scores strictly above this are positive.
            neg_threshold: scores strictly below this are negative.

        Returns:
            '积极' (positive), '消极' (negative) or '中性' (neutral);
            boundary values fall into the neutral bucket.
        """
        if score > pos_threshold:
            return '积极'
        elif score < neg_threshold:
            return '消极'
        else:
            return '中性'

    # Attach a human-readable label derived from the numeric score.
    df['sentiment_category'] = df['sentiment_score'].map(classify_sentiment)

    # Show the cleaned text alongside its score and label.
    print(df[['cleaned_content', 'sentiment_score', 'sentiment_category']])


    # Topic modeling: tokenize each cleaned document with spaCy, keep
    # lowercase alphabetic tokens that are not stop words, then fit a
    # 5-topic LDA model over the resulting bag-of-words corpus.
    tokens = []
    for text in df['cleaned_content']:
        doc_tokens = []
        for token in nlp(text):
            word = token.text.lower()
            if token.is_alpha and word not in STOP_WORDS:
                doc_tokens.append(word)
        tokens.append(doc_tokens)

    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc) for doc in tokens]
    lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

    topics = lda_model.print_topics(num_words=5)
    print("主题建模结果:")
    for topic in topics:
        print(topic)
     

  • 相关阅读:
    股票交易系列 -- 动规
    线上扭蛋机小程序详解,扭蛋机带来的乐趣
    MHA实现MySQL主从的高可用
    java接口+vue后台管理+uniapp前端 移动端商城
    YoloV8改进策略:将FasterNet与YoloV8深度融合,打造更快更强的检测网络
    微服务sleuth+zipkin——链路追踪
    「Python实用秘技05」在Python中妙用短路机制
    ActiveReports.NET 17.1.X Carack
    如何使用Flask request对象处理请求
    java计算机毕业设计ETC用户自驾游推荐系统MyBatis+系统+LW文档+源码+调试部署
  • 原文地址:https://blog.csdn.net/weston95/article/details/134084078