• Python 版 HTML 正文提取 (CEPF)


    from selectolax.parser import *
    import math
    
    
    
    class CountInfo:
        """Statistics accumulated for one DOM node and its subtree.

        ContentExtractor.computeInfo fills these in while walking the tree;
        ContentExtractor.computeScore reads them to rank candidate nodes.
        """

        def __init__(self):
            # Text length overall / inside <a>, and tag count overall / <a> only.
            self.textCount = self.linkTextCount = self.tagCount = self.linkTagCount = 0
            # density: non-link text per non-link tag; densitySum: sum over children;
            # score: reserved result slot; pCount: number of <p> tags.
            self.density = self.densitySum = self.score = self.pCount = 0
            # Length of every text node found in the subtree.
            self.leafList = []

        def __str__(self) -> str:
            """Return every counter as a ``name: value`` list on one line."""
            return (
                f"textCount: {self.textCount}, "
                f"linkTextCount: {self.linkTextCount}, "
                f"tagCount: {self.tagCount}, "
                f"linkTagCount: {self.linkTagCount}, "
                f"density: {self.density}, "
                f"densitySum: {self.densitySum}, "
                f"score: {self.score}, "
                f"pCount: {self.pCount}, "
                f"leafList: {self.leafList}"
            )
    
    
    class ContentExtractor:
        """Locate the DOM node most likely to contain a page's main text.

        Every element under ``<body>`` gets a ``CountInfo`` with text/tag
        statistics; the element with the highest score (see ``computeScore``)
        is returned by ``getContentElement``.

        Typical use::

            extractor = ContentExtractor()
            extractor.reload(html)
            node = extractor.getContentElement()
        """

        def __init__(self, content=None):
            """Create an extractor, optionally loading ``content`` right away.

            Args:
                content: HTML to parse; when omitted, call ``reload`` later.
            """
            # Initialise state up front so methods called before ``reload``
            # do not fail on missing attributes.
            self.doc = None
            self.infoMap = []
            if content is not None:
                self.reload(content)

        def reload(self, content):
            """Parse ``content`` and discard statistics from any earlier document."""
            self.doc = HTMLParser(content)
            self.infoMap = []

        def clear(self):
            """Strip non-content tags from the loaded document (no-op if none)."""
            if self.doc is None:
                return
            tags = ["script", "noscript", "style", "iframe", "br"]
            self.doc.strip_tags(tags)

        def computeInfo(self, node):
            """Recursively gather statistics for ``node`` and its subtree.

            Each non-text node is also recorded in ``self.infoMap`` as
            ``{"node": node, "countInfo": info}``.

            NOTE: recursion depth equals the DOM nesting depth.

            Args:
                node: a parsed node; text nodes are those whose tag is "-text".

            Returns:
                The ``CountInfo`` describing ``node``.
            """
            if node.tag == "-text":
                leaf = CountInfo()
                # NOTE(review): text_content is documented as possibly None;
                # treat that as empty text instead of crashing on len(None).
                text_len = len(node.text_content or "")
                leaf.textCount = text_len
                leaf.leafList.append(text_len)
                return leaf

            countInfo = CountInfo()
            for child_node in node.iter(include_text=True):
                childCountInfo = self.computeInfo(child_node)
                countInfo.textCount += childCountInfo.textCount
                countInfo.linkTextCount += childCountInfo.linkTextCount
                countInfo.tagCount += childCountInfo.tagCount
                countInfo.linkTagCount += childCountInfo.linkTagCount
                countInfo.leafList.extend(childCountInfo.leafList)
                countInfo.densitySum += childCountInfo.density
                countInfo.pCount += childCountInfo.pCount

            # Count the node itself.
            countInfo.tagCount += 1
            tagname = node.tag
            if tagname == "a":
                # All text under a link counts as link text.
                countInfo.linkTextCount = countInfo.textCount
                countInfo.linkTagCount += 1
            elif tagname == "p":
                countInfo.pCount += 1

            # Density = non-link text length per non-link tag.
            pureLen = countInfo.textCount - countInfo.linkTextCount
            tag_len = countInfo.tagCount - countInfo.linkTagCount
            countInfo.density = pureLen / tag_len if pureLen and tag_len else 0

            self.infoMap.append({"node": node, "countInfo": countInfo})
            return countInfo

        def computerVar(self, data):
            """Return the population variance of ``data``.

            Special cases kept from the original: an empty sequence gives 0,
            and a single value gives half of that value rather than 0.
            """
            if not data:
                return 0
            if len(data) == 1:
                return data[0] / 2
            avg = sum(data) / len(data)
            return sum((x - avg) ** 2 for x in data) / len(data)

        def computeScore(self, countInfo):
            """Compute the ranking score for one node's statistics.

            The score is the product of: log of sqrt(variance of text-node
            lengths + 1), the summed child densities, log of the non-link
            text length + 1, and log10 of the paragraph count + 2.
            """
            sqrt = math.sqrt(self.computerVar(countInfo.leafList) + 1)
            score = (
                math.log(sqrt)
                * countInfo.densitySum
                * math.log(countInfo.textCount - countInfo.linkTextCount + 1)
                * math.log10(countInfo.pCount + 2)
            )
            return score

        def getContentElement(self):
            """Return the highest-scoring node of the loaded document.

            Returns:
                The best node; ``None`` when no node scores above zero; or
                ``""`` when there is no document or no ``<body>`` (kept for
                backward compatibility with existing callers).
            """
            self.clear()
            body = self.doc.body if self.doc is not None else None
            if not body:
                return ""

            # Start from a clean slate so repeated calls do not accumulate
            # duplicate or stale entries.
            self.infoMap = []
            self.computeInfo(body)

            content = None
            maxScore = 0
            for obj in self.infoMap:
                node = obj["node"]
                # Links and the body itself are never candidates.
                if node.tag == "a" or node.tag == "body":
                    continue
                score = self.computeScore(obj["countInfo"])
                if score > maxScore:
                    maxScore = score
                    content = node

            return content
    
    
  • 相关阅读:
    【车载开发系列】Autosar框架中的WatchDog
    MySQL的锁
    计算机毕设(附源码)JAVA-SSM基于的楼盘销售系统的设计与实现
    3.0 Python 数字类型常用操作
    java python基于Vue宠物交流网站管理系统
    9、【办公自动化】Python实现Word文件的批量操作
    Excel常用公式总结非常实用
    小程序订单中心path设置本次审核不通过,审核原因:小程序尚未发布,无法审核。
    基于PHP+MySQL托管中心管理系统的设计与实现
    ChatGPT-4 VS 文心一言4.0
  • 原文地址:https://blog.csdn.net/lkjasdgfh/article/details/139631350