• Kunpeng 的 aarch64 架构 CPU、openEuler 系统、昇腾服务器适配文档转换功能(doc 转 docx、ppt 转 pptx)


    一、安装flatpak

    sudo yum install flatpak  
    flatpak remote-add --if-not-exists flathub https://flathub.org/repo/flathub.flatpakrepo
    

    二、安装libreoffice

    flatpak install flathub org.libreoffice.LibreOffice
    

    三、使用

    对于使用 flatpak 安装的 LibreOffice,不需要手动启动或设置任何环境变量。flatpak 提供了一个沙箱化的运行环境,确保应用程序可以正常运行。
    flatpak 应用程序的可执行文件通常位于类似如下的路径:

    /var/lib/flatpak/app/org.libreoffice.LibreOffice/aarch64/stable/active/export/bin/org.libreoffice.LibreOffice
    

    只要在代码中正确指定了这个完整路径,就可以直接运行和调用 LibreOffice,而无需进行任何其他设置。

    四、示例代码-doc

    import tqdm
    import subprocess
    import os
    from dotenv import load_dotenv
    from docx.table import _Cell, Table
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P
    from docx.text.paragraph import Paragraph
    from docx import Document
    import re
    
    # Load variables from a .env file into the process environment.
    # NOTE(review): nothing in the visible code reads an environment variable
    # afterwards -- confirm whether this call is still needed.
    load_dotenv()
    # Absolute path of the LibreOffice launcher exported by the system-wide
    # flatpak installation (aarch64 build, "stable" branch). It is used as the
    # executable when spawning the headless conversion below.
    libreoffice_path = "/var/lib/flatpak/app/org.libreoffice.LibreOffice/aarch64/stable/active/export/bin/org.libreoffice.LibreOffice"
    def convert_doc_to_docx(doc_file_path):
        """Return the path of a ``.docx`` version of *doc_file_path*.

        A ``.docx`` input is returned unchanged. A ``.doc`` input is converted
        by running LibreOffice in headless mode; the converted file is written
        next to the input and its path is returned.

        Args:
            doc_file_path: Path to a ``.doc`` or ``.docx`` file.

        Returns:
            The path of the ``.docx`` file, or ``False`` when the extension is
            neither ``.doc`` nor ``.docx`` (compared case-insensitively).

        Raises:
            RuntimeError: If LibreOffice exits with a non-zero status or the
                converted file is not found afterwards.
        """
        # splitext only looks at the final extension, so names such as
        # "report.v2.doc" or files without any dot are classified correctly.
        root, ext = os.path.splitext(doc_file_path)
        ext = ext.lower()
        if ext == ".docx":
            return doc_file_path
        if ext != ".doc":
            return False

        # An empty dirname (bare relative filename) means the current directory.
        out_dir = os.path.dirname(doc_file_path) or "."

        # Run LibreOffice in headless mode to perform the conversion.
        command = [
            libreoffice_path,
            '--headless',
            '--convert-to', 'docx',
            '--outdir', out_dir,
            doc_file_path,
        ]
        result = subprocess.run(command, capture_output=True, text=True)

        if result.returncode != 0:
            raise RuntimeError(f"Failed to convert '{doc_file_path}' to DOCX.\nError: {result.stderr}")

        # Swap only the trailing extension; str.replace would also rewrite any
        # ".doc" that appears earlier in the path (e.g. in a directory name).
        docx_file_path = root + ".docx"

        # NOTE(review): LibreOffice is assumed to be able to exit with status 0
        # without producing an output file -- hence the explicit check; confirm.
        if not os.path.exists(docx_file_path):
            raise RuntimeError(
                f"Failed to convert '{doc_file_path}' to DOCX.\n"
                f"Error: output file '{docx_file_path}' was not created. {result.stderr}"
            )

        return docx_file_path
    
    def doc2text(filepath):
        """Extract the plain text of a Word document, including table cells.

        The file is first passed through ``convert_doc_to_docx`` so that
        ``.doc`` inputs are converted to ``.docx``. Paragraphs and tables are
        then visited in the order they appear in the document body.

        Args:
            filepath: Path to a ``.doc`` or ``.docx`` file.

        Returns:
            A dict with keys ``'document'`` (the extracted text), ``'metadata'``
            (the path of the ``.docx`` file that was read) and ``'format'``
            (always ``"docx_text"``).

        Raises:
            ValueError: If the file type is not supported, or if a block
                container of an unexpected type is encountered.
            RuntimeError: Propagated from ``convert_doc_to_docx`` when the
                conversion fails.
        """
        converted = convert_doc_to_docx(filepath)
        if not converted:
            # convert_doc_to_docx signals an unsupported extension with False;
            # fail here with a clear message instead of inside Document().
            raise ValueError(f"Unsupported file type (expected .doc or .docx): '{filepath}'")
        filepath = converted
        doc = Document(filepath)

        def iter_block_items(parent):
            """Yield the Paragraph and Table children of *parent* in order."""
            # The module-level ``Document`` is called as a factory above; the
            # class of the object it returns is imported here for isinstance.
            from docx.document import Document as DocumentClass
            if isinstance(parent, DocumentClass):
                parent_elm = parent.element.body
            elif isinstance(parent, _Cell):
                parent_elm = parent._tc
            else:
                raise ValueError("parse fail")

            for child in parent_elm.iterchildren():
                if isinstance(child, CT_P):
                    yield Paragraph(child, parent)
                elif isinstance(child, CT_Tbl):
                    yield Table(child, parent)

        # Collect one stripped line per paragraph and join once at the end.
        lines = []
        for block in iter_block_items(doc):
            if isinstance(block, Paragraph):
                lines.append(block.text.strip() + "\n")
            elif isinstance(block, Table):
                for row in block.rows:
                    for cell in row.cells:
                        for paragraph in cell.paragraphs:
                            lines.append(paragraph.text.strip() + "\n")
        resp = "".join(lines)

        # Collapse consecutive newlines into a single one.
        resp = re.sub(r'\n+', '\n', resp)
        # Collapse any run of five or more identical characters to one character.
        resp = re.sub(r'(.)\1{4,}', r'\1', resp)
        return {'document': resp, 'metadata': filepath, 'format': "docx_text"}
    
    if __name__ == '__main__':
        import json

        # Sample input path; point this at a real .doc/.docx file.
        file_path = "/opt/rag/data/xxx.doc"
        loader = doc2text(file_path)

        # Strip only the final extension so multi-dot names such as
        # "report.v2.doc" keep their full stem ("report.v2_docx.json").
        stem = os.path.splitext(os.path.basename(file_path))[0]
        output_path = os.path.join(os.path.dirname(file_path), stem + "_docx" + ".json")

        # Write the extraction result next to the input file as UTF-8 JSON.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(loader, f, ensure_ascii=False, indent=4)
    
    
    

    五、示例代码-ppt

    import os
    import subprocess
    from dotenv import load_dotenv
    from unstructured.chunking.title import chunk_by_title
    from unstructured.documents.elements import CompositeElement, Table
    from unstructured.partition.pptx import partition_pptx
    import json
    
    # Load variables from a .env file into the process environment.
    # NOTE(review): nothing in the visible code reads an environment variable
    # afterwards -- confirm whether this call is still needed.
    load_dotenv()
    # Absolute path of the LibreOffice launcher exported by the system-wide
    # flatpak installation (aarch64 build, "stable" branch). It is used as the
    # executable when spawning the headless conversion in convert_ppt_to_pptx.
    libreoffice_path = "/var/lib/flatpak/app/org.libreoffice.LibreOffice/aarch64/stable/active/export/bin/org.libreoffice.LibreOffice"
    
    def remove_duplicates(lst):
        """Return the items of *lst* as a new list with repeats dropped.

        The first occurrence of each item is kept and the original order is
        preserved. Items must be hashable because they are used as dict keys.
        """
        # A dict keeps its keys in insertion order, so the keys are exactly the
        # distinct items in order of first appearance.
        return list(dict.fromkeys(lst))
    
    def ppt2text(file_name: str):
        """Extract the text of a PowerPoint file as a list of title-based chunks.

        The file is first passed through ``convert_ppt_to_pptx`` so that
        ``.ppt`` inputs are converted to ``.pptx``. The presentation is then
        partitioned and chunked by title. The HTML of each table chunk is
        appended to the text chunk that precedes it; if there is none, it
        becomes an entry of its own.

        Args:
            file_name: Path to a ``.ppt`` or ``.pptx`` file.

        Returns:
            A dict with keys ``'document'`` (list of chunk texts with exact
            duplicates removed), ``'metadata'`` (the path of the ``.pptx`` file
            that was read) and ``'format'`` (always ``"pptx_text"``).

        Raises:
            ValueError: If the file type is not supported.
            RuntimeError: Propagated from ``convert_ppt_to_pptx`` when the
                conversion fails.
        """
        converted = convert_ppt_to_pptx(file_name)
        if not converted:
            # convert_ppt_to_pptx signals an unsupported extension with False;
            # fail here with a clear message instead of inside partition_pptx.
            raise ValueError(f"Unsupported file type (expected .ppt or .pptx): '{file_name}'")
        file_name = converted

        elements = partition_pptx(
            filename=file_name,
            multipage_sections=True,
            infer_table_structure=True,
            include_page_breaks=False,
        )

        chunks = chunk_by_title(
            elements=elements,
            multipage_sections=True,
            combine_text_under_n_chars=0,
            new_after_n_chars=None,
            max_characters=4096,
        )

        text_list = []
        for chunk in chunks:
            if isinstance(chunk, CompositeElement):
                text_list.append(chunk.text)
            elif isinstance(chunk, Table):
                table_html = chunk.metadata.text_as_html
                if text_list:
                    # Keep the table together with the text that precedes it.
                    text_list[-1] = text_list[-1] + "\n" + table_html
                else:
                    # A table with no preceding text starts the list.
                    # (The original read ``chunk.hunk.metadata`` here, which
                    # raised AttributeError.)
                    text_list.append(table_html)

        return {
            'document': remove_duplicates(text_list),
            'metadata': file_name,
            'format': "pptx_text",
        }
    
    def convert_ppt_to_pptx(ppt_file_path):
        """Return the path of a ``.pptx`` version of *ppt_file_path*.

        A ``.pptx`` input is returned unchanged. A ``.ppt`` input is converted
        by running LibreOffice in headless mode; the converted file is written
        next to the input and its path is returned.

        Args:
            ppt_file_path: Path to a ``.ppt`` or ``.pptx`` file.

        Returns:
            The path of the ``.pptx`` file, or ``False`` when the extension is
            neither ``.ppt`` nor ``.pptx`` (compared case-insensitively).

        Raises:
            RuntimeError: If LibreOffice exits with a non-zero status or the
                converted file is not found afterwards.
        """
        # splitext only looks at the final extension, so names such as
        # "deck.v2.ppt" or files without any dot are classified correctly.
        root, ext = os.path.splitext(ppt_file_path)
        ext = ext.lower()
        if ext == ".pptx":
            return ppt_file_path
        if ext != ".ppt":
            return False

        # An empty dirname (bare relative filename) means the current directory.
        out_dir = os.path.dirname(ppt_file_path) or "."

        # Run LibreOffice in headless mode to perform the conversion.
        command = [
            libreoffice_path,
            '--headless',
            '--convert-to', 'pptx',
            '--outdir', out_dir,
            ppt_file_path,
        ]
        result = subprocess.run(command, capture_output=True, text=True)

        if result.returncode != 0:
            raise RuntimeError(f"Failed to convert '{ppt_file_path}' to PPTX.\nError: {result.stderr}")

        # Swap only the trailing extension; str.replace would also rewrite any
        # ".ppt" that appears earlier in the path (e.g. in a directory name).
        pptx_file_path = root + ".pptx"

        # NOTE(review): LibreOffice is assumed to be able to exit with status 0
        # without producing an output file -- hence the explicit check; confirm.
        if not os.path.exists(pptx_file_path):
            raise RuntimeError(
                f"Failed to convert '{ppt_file_path}' to PPTX.\n"
                f"Error: output file '{pptx_file_path}' was not created. {result.stderr}"
            )

        return pptx_file_path
    
    if __name__ == "__main__":
        # Sample input path; point this at a real .ppt/.pptx file.
        pptx_file_path = "/opt/data/xxx.ppt"
        contents = ppt2text(pptx_file_path)

        # Strip only the final extension so multi-dot names such as
        # "deck.v2.ppt" keep their full stem ("deck.v2_ppt.json").
        stem = os.path.splitext(os.path.basename(pptx_file_path))[0]
        output_path = os.path.join(os.path.dirname(pptx_file_path), stem + "_ppt" + ".json")

        # Write the extraction result next to the input file as UTF-8 JSON.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(contents, f, ensure_ascii=False, indent=4)
    
    
  • 相关阅读:
    Elasticsearch索引数量限制
    9.吴恩达深度学习--机器翻译
    第七章节 Qt的UI界面设计详解
    接口测试——接口协议抓包分析与mock_L1
    Android frameworks学习
    如何用AR Engine开发一个虚拟形象表情包?
    优化导入大批量数据的Excel(上万行的导入)SpringBoot + Apache POI
    个人项目中用到的Flume 各组件,以及Put 事务和Take 事务介绍
    NASM汇编教程翻译02 第二讲 程序正确退出
    qml实现路径绘制且可编辑
  • 原文地址:https://blog.csdn.net/weixin_46398647/article/details/139654349