• 基于openOffice和python实现office转pdf和html示例代码


    将office文件转化为html格式或者pdf格式

    在转换之前,需要启动openOffice的服务:在openOffice目录下的命令窗口中执行 soffice -headless -accept="socket,host=127.0.0.1,port=8100;urp;" -nofirststartwizard 即可启动
    不知道如何启动的参照我的另外一篇文章

    我电脑上安装的是python3.8
    在这里插入图片描述
    python的安装,在这里我就不多说了,在座的老司机应该都熟悉了。

    准备好了环境之后,话不多说,开始编写脚本。
    脚本代码如下:

    #
    # PyODConverter (Python OpenDocument Converter) v1.1 - 2009-11-14
    #
    # This script converts a document from one office format to another by
    # connecting to an OpenOffice.org instance via Python-UNO bridge.
    #
    # Copyright (C) 2008-2009 Mirko Nasato 
    # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl-2.1.html
    # - or any later version.
    #
    # Port that DocumentConverter connects to on localhost when the caller does
    # not pass one explicitly.
    DEFAULT_OPENOFFICE_PORT = 8100
    
    import uno
    from os.path import abspath, isfile, splitext
    from com.sun.star.beans import PropertyValue
    from com.sun.star.task import ErrorCodeIOException
    from com.sun.star.connection import NoConnectException
    
    # Document family names as returned by DocumentConverter._detectFamily().
    # They are used as keys of the per-family entries in EXPORT_FILTER_MAP and
    # PAGE_STYLE_OVERRIDE_PROPERTIES.
    FAMILY_TEXT = "Text"
    FAMILY_WEB = "Web"
    FAMILY_SPREADSHEET = "Spreadsheet"
    FAMILY_PRESENTATION = "Presentation"
    FAMILY_DRAWING = "Drawing"
    
    # ---------------------#
    # Configuration Start #
    # ---------------------#
    
    # see http://wiki.services.openoffice.org/wiki/Framework/Article/Filter
    
    # most formats are auto-detected; only those requiring options are defined here
    # Extra load properties keyed by lower-case input file extension.
    # DocumentConverter.convert() merges the matching entry into the properties
    # it passes to loadComponentFromURL().
    IMPORT_FILTER_MAP = {
        "txt": {
            "FilterName": "Text (encoded)",
            "FilterOptions": "utf8"
        },
        "csv": {
            "FilterName": "Text - txt - csv (StarCalc)",
            # NOTE(review): presumably field separator 44 (','), text delimiter
            # 34 ('"') and character set 0 - confirm against the filter docs.
            "FilterOptions": "44,34,0"
        }
    }
    
    # Store properties keyed first by lower-case output file extension and then
    # by document family. DocumentConverter._getStoreProperties() reports a
    # missing extension as "unknown output format" and a missing family under a
    # known extension as "unsupported conversion".
    EXPORT_FILTER_MAP = {
        "pdf": {
            FAMILY_TEXT: {"FilterName": "writer_pdf_Export"},
            FAMILY_WEB: {"FilterName": "writer_web_pdf_Export"},
            FAMILY_SPREADSHEET: {"FilterName": "calc_pdf_Export"},
            FAMILY_PRESENTATION: {"FilterName": "impress_pdf_Export"},
            FAMILY_DRAWING: {"FilterName": "draw_pdf_Export"}
        },
        "html": {
            FAMILY_TEXT: {"FilterName": "HTML (StarWriter)"},
            FAMILY_SPREADSHEET: {"FilterName": "HTML (StarCalc)"},
            FAMILY_PRESENTATION: {"FilterName": "impress_html_Export"}
        },
        "odt": {
            FAMILY_TEXT: {"FilterName": "writer8"},
            FAMILY_WEB: {"FilterName": "writerweb8_writer"}
        },
        "doc": {
            FAMILY_TEXT: {"FilterName": "MS Word 97"}
        },
        "rtf": {
            FAMILY_TEXT: {"FilterName": "Rich Text Format"}
        },
        "txt": {
            FAMILY_TEXT: {
                "FilterName": "Text",
                "FilterOptions": "utf8"
            }
        },
        "ods": {
            FAMILY_SPREADSHEET: {"FilterName": "calc8"}
        },
        "xls": {
            FAMILY_SPREADSHEET: {"FilterName": "MS Excel 97"}
        },
        "csv": {
            FAMILY_SPREADSHEET: {
                "FilterName": "Text - txt - csv (StarCalc)",
                # NOTE(review): same option string as the csv import filter;
                # presumably separator/delimiter/charset - confirm.
                "FilterOptions": "44,34,0"
            }
        },
        "odp": {
            FAMILY_PRESENTATION: {"FilterName": "impress8"}
        },
        "ppt": {
            FAMILY_PRESENTATION: {"FilterName": "MS PowerPoint 97"}
        },
        "swf": {
            FAMILY_DRAWING: {"FilterName": "draw_flash_Export"},
            FAMILY_PRESENTATION: {"FilterName": "impress_flash_Export"}
        }
    }
    
    # Page-style properties keyed by document family.
    # DocumentConverter._overridePageStyleProperties() sets each listed property
    # on every page style of a loaded document of that family before export.
    PAGE_STYLE_OVERRIDE_PROPERTIES = {
        FAMILY_SPREADSHEET: {
            # --- Scale options: uncomment 1 of the 3 ---
            # a) 'Reduce / enlarge printout': 'Scaling factor'
            "PageScale": 100,
            # b) 'Fit print range(s) to width / height': 'Width in pages' and 'Height in pages'
            # "ScaleToPagesX": 1, "ScaleToPagesY": 1000,
            # c) 'Fit print range(s) on number of pages': 'Fit print range(s) on number of pages'
            # "ScaleToPages": 1,
            "PrintGrid": False
        }
    }
    
    
    # -------------------#
    # Configuration End #
    # -------------------#
    
    class DocumentConversionException(Exception):
        """Error raised when a document cannot be converted.

        The human-readable description is available as ``message``, as the
        first element of ``args`` and through ``str()``.
        """

        def __init__(self, message):
            """Store ``message`` as the description of the failure."""
            # Forward the message to Exception so that ``args``, ``repr()`` and
            # pickling carry it too; the original left ``args`` empty.
            Exception.__init__(self, message)
            self.message = message

        def __str__(self):
            """Return the message given at construction time."""
            return self.message
    
    
    class DocumentConverter:
        """Convert documents between office formats via a running OpenOffice.org.

        The constructor connects to an office instance over a local socket;
        ``convert()`` then loads an input file and stores it in the format
        implied by the output file's extension.
        """

        def __init__(self, port=DEFAULT_OPENOFFICE_PORT):
            """Connect to the office instance listening on localhost at ``port``.

            Raises:
                DocumentConversionException: if the connection is refused.
            """
            localContext = uno.getComponentContext()
            resolver = localContext.ServiceManager.createInstanceWithContext(
                "com.sun.star.bridge.UnoUrlResolver", localContext)
            try:
                context = resolver.resolve(
                    "uno:socket,host=localhost,port=%s;urp;StarOffice.ComponentContext" % port)
            except NoConnectException:
                raise DocumentConversionException("failed to connect to OpenOffice.org on port %s" % port)
            self.desktop = context.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", context)

        def convert(self, inputFile, outputFile):
            """Load ``inputFile`` and store it as ``outputFile``.

            The output format is chosen from the extension of ``outputFile``.

            Raises:
                DocumentConversionException: if the input cannot be loaded, the
                    document family is unknown, or the output extension is
                    unknown or unsupported for that family.
            """
            inputUrl = self._toFileUrl(inputFile)
            outputUrl = self._toFileUrl(outputFile)

            loadProperties = {"Hidden": True}
            inputExt = self._getFileExt(inputFile)
            # dict.has_key() does not exist in Python 3; ``in`` works everywhere.
            if inputExt in IMPORT_FILTER_MAP:
                loadProperties.update(IMPORT_FILTER_MAP[inputExt])

            document = self.desktop.loadComponentFromURL(inputUrl, "_blank", 0, self._toProperties(loadProperties))
            if document is None:
                # NOTE(review): the loader is documented to return null when it
                # cannot open the file; report that instead of failing later
                # with an unrelated AttributeError.
                raise DocumentConversionException("failed to load input file: '%s'" % inputFile)

            # Everything after a successful load runs under try/finally so the
            # document is closed even when family detection or filter lookup
            # raises.
            try:
                try:
                    document.refresh()
                except AttributeError:
                    # Best effort: a component without refresh() is used as loaded.
                    pass

                family = self._detectFamily(document)
                self._overridePageStyleProperties(document, family)

                outputExt = self._getFileExt(outputFile)
                storeProperties = self._getStoreProperties(document, outputExt)

                document.storeToURL(outputUrl, self._toProperties(storeProperties))
            finally:
                document.close(True)

        def _overridePageStyleProperties(self, document, family):
            """Apply the overrides configured for ``family`` to every page style.

            Does nothing when PAGE_STYLE_OVERRIDE_PROPERTIES has no entry for
            ``family``.
            """
            if family not in PAGE_STYLE_OVERRIDE_PROPERTIES:
                return
            properties = PAGE_STYLE_OVERRIDE_PROPERTIES[family]
            pageStyles = document.getStyleFamilies().getByName('PageStyles')
            for styleName in pageStyles.getElementNames():
                pageStyle = pageStyles.getByName(styleName)
                for name, value in properties.items():
                    pageStyle.setPropertyValue(name, value)

        def _getStoreProperties(self, document, outputExt):
            """Return the store properties for writing ``document`` as ``outputExt``.

            Raises:
                DocumentConversionException: if ``outputExt`` is not in
                    EXPORT_FILTER_MAP, or has no entry for the document's family.
            """
            family = self._detectFamily(document)
            try:
                propertiesByFamily = EXPORT_FILTER_MAP[outputExt]
            except KeyError:
                raise DocumentConversionException( "unknown output format: '%s'" % outputExt)
            try:
                return propertiesByFamily[family]
            except KeyError:
                raise DocumentConversionException("unsupported conversion: from '%s' to '%s'" % (family, outputExt))

        def _detectFamily(self, document):
            """Return the FAMILY_* constant for the first service ``document`` supports.

            Raises:
                DocumentConversionException: if none of the known services match.
            """
            # Order matters: the web check runs before the generic text check
            # (the latter is expected to cover TextDocument and GlobalDocument).
            servicesByPriority = (
                ("com.sun.star.text.WebDocument", FAMILY_WEB),
                ("com.sun.star.text.GenericTextDocument", FAMILY_TEXT),
                ("com.sun.star.sheet.SpreadsheetDocument", FAMILY_SPREADSHEET),
                ("com.sun.star.presentation.PresentationDocument", FAMILY_PRESENTATION),
                ("com.sun.star.drawing.DrawingDocument", FAMILY_DRAWING),
            )
            for service, family in servicesByPriority:
                if document.supportsService(service):
                    return family
            raise DocumentConversionException( "unknown document family: %s" % document)

        def _getFileExt(self, path):
            """Return the lower-cased extension of ``path`` without the dot.

            Returns an empty string when ``path`` has no extension.
            """
            # splitext() always returns a string, so no None check is needed.
            return splitext(path)[1][1:].lower()

        def _toFileUrl(self, path):
            """Return the file URL for the absolute form of ``path``."""
            return uno.systemPathToFileUrl(abspath(path))

        def _toProperties(self, dict):
            """Return a tuple of PropertyValue objects, one per item of ``dict``.

            The parameter name shadows the builtin; it is kept unchanged so the
            method's signature stays the same.
            """
            props = []
            for name, value in dict.items():
                prop = PropertyValue()
                prop.Name = name
                prop.Value = value
                props.append(prop)
            return tuple(props)
    
    
    # Command-line entry point: convert argv[1] into argv[2].
    # Exit status is 255 for wrong usage and 1 for a missing input file or a
    # failed conversion.
    if __name__ == "__main__":
        from sys import argv, exit

        if len(argv) < 3:
            # The argument placeholders were missing from the usage text,
            # leaving it without any hint of the expected arguments.
            print("USAGE: python %s <input-file> <output-file>" % argv[0])
            exit(255)
        if not isfile(argv[1]):
            print("no such input file: %s" % argv[1])
            exit(1)

        try:
            converter = DocumentConverter()
            converter.convert(argv[1], argv[2])
        except DocumentConversionException as exception:
            print("ERROR! " + str(exception))
            exit(1)
        except ErrorCodeIOException as exception:
            # Error raised by the office process; only its numeric code is shown.
            print("ERROR! ErrorCodeIOException %d" % exception.ErrCode)
            exit(1)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72
    • 73
    • 74
    • 75
    • 76
    • 77
    • 78
    • 79
    • 80
    • 81
    • 82
    • 83
    • 84
    • 85
    • 86
    • 87
    • 88
    • 89
    • 90
    • 91
    • 92
    • 93
    • 94
    • 95
    • 96
    • 97
    • 98
    • 99
    • 100
    • 101
    • 102
    • 103
    • 104
    • 105
    • 106
    • 107
    • 108
    • 109
    • 110
    • 111
    • 112
    • 113
    • 114
    • 115
    • 116
    • 117
    • 118
    • 119
    • 120
    • 121
    • 122
    • 123
    • 124
    • 125
    • 126
    • 127
    • 128
    • 129
    • 130
    • 131
    • 132
    • 133
    • 134
    • 135
    • 136
    • 137
    • 138
    • 139
    • 140
    • 141
    • 142
    • 143
    • 144
    • 145
    • 146
    • 147
    • 148
    • 149
    • 150
    • 151
    • 152
    • 153
    • 154
    • 155
    • 156
    • 157
    • 158
    • 159
    • 160
    • 161
    • 162
    • 163
    • 164
    • 165
    • 166
    • 167
    • 168
    • 169
    • 170
    • 171
    • 172
    • 173
    • 174
    • 175
    • 176
    • 177
    • 178
    • 179
    • 180
    • 181
    • 182
    • 183
    • 184
    • 185
    • 186
    • 187
    • 188
    • 189
    • 190
    • 191
    • 192
    • 193
    • 194
    • 195
    • 196
    • 197
    • 198
    • 199
    • 200
    • 201
    • 202
    • 203
    • 204
    • 205
    • 206
    • 207
    • 208
    • 209
    • 210
    • 211
    • 212
    • 213
    • 214
    • 215
    • 216
    • 217
    • 218
    • 219
    • 220
    • 221
    • 222
    • 223
    • 224
    • 225
    • 226
    • 227
    • 228
    • 229
    • 230
    • 231
    • 232

    编写完上面的脚本之后要放到
    在这里插入图片描述
    因为如果不放openOffice的目录下很多类库都引用不到,会导致程序执行不了。
    最后万事俱备,只欠东风了。
    打开cmd命令行,输入这个命令
    在这里插入图片描述
    可以把doc转为pdf,pdf文件已经生成了
    在这里插入图片描述
    输入这个命令
    在这里插入图片描述
    可以把doc转为html,html文件已经生成了
    在这里插入图片描述
    其他的office文件转pdf和html,大家可以按需自行尝试。

  • 相关阅读:
    Java代码审计安全篇-常见Java SQL注入
    微服务节流控制:Eureka中服务速率限制的精妙配置
    蓝桥杯(路径 动态规划 C++)
    怎么做好测试用例评审
    Nextcloud fpm 版在 Dokcer 下安装踩坑
    NFT 交易市场的格局之变:从一家独大到百家争鸣
    什么是TLB文件,怎样从dll文件中提取TYPEID信息?- IID
    MySQL慢查询日志
    腾讯云 BI 数据分析与可视化的快速入门指南
    [附源码]JAVA毕业设计计算机在线学习管理系统-(系统+LW)
  • 原文地址:https://blog.csdn.net/chendongpu/article/details/126720348