当前位置:   article > 正文

Python Office文档转txt_python将wps转为txt

python将wps转为txt

只能说是网上代码进行优化后的究极缝合怪

import sys
#from win32com.client import Dispatch, constants
from pptx import Presentation
from pptx.util import Cm, Pt
import codecs
import pandas as pd
import win32com.client
import textract
import docx2txt
import docx
def change_doc_to_txt(word_path, save_path):
    """Convert a Word document to plain text through Word COM automation.

    Opens ``word_path`` in a Word application instance and saves it to
    ``save_path`` using save format ``2`` (plain text).

    Args:
        word_path: Path of the document to open.
        save_path: Path of the text file to write.

    Raises:
        Whatever the COM layer raises when Word cannot open or save the
        document. The document and the Word instance are still released
        in that case.
    """
    import os  # local import: only this converter needs it

    # NOTE(review): Word appears to resolve relative paths against its own
    # working directory rather than the caller's, so pass absolute paths.
    word_path = os.path.abspath(word_path)
    save_path = os.path.abspath(save_path)

    word = win32com.client.Dispatch('Word.Application')  # start Word
    try:
        doc = word.Documents.Open(word_path)
        try:
            print('保存中。。。')
            doc.SaveAs(save_path, 2)  # 2 selects the plain-text format
        finally:
            doc.Close()
    finally:
        # Always quit, otherwise a failed conversion leaves Word running.
        word.Quit()
def change_docx_to_txt(word_path, save_path):
    """Extract the text of a .docx file into a text file.

    Body paragraphs are written first, one per line, followed by every
    table: one line per row, with the cells of a row separated by tabs.

    Args:
        word_path: Path of the .docx document to read.
        save_path: Path of the text file to write (UTF-8).
    """
    print('读取中。。。')
    doc = docx.Document(word_path)
    # Explicit UTF-8 so characters outside the platform's default code page
    # cannot make the write fail; ``with`` closes the file on errors too.
    with open(save_path, "w", encoding="utf-8") as f:
        for paragraph in doc.paragraphs:
            # Terminate each paragraph so they do not run together.
            f.write(paragraph.text + "\n")

        for table in doc.tables:
            for row in table.rows:
                f.write("\t".join(cell.text for cell in row.cells) + "\n")

def change_ppt_to_txt(word_path, save_path):
    """Extract the text of a presentation into a text file.

    Walks every shape of every slide and writes the text of each paragraph
    found in a text frame, one paragraph per line.

    The file is loaded with python-pptx's ``Presentation``.
    NOTE(review): python-pptx reads the .pptx format; a legacy .ppt file
    would presumably need the PowerPoint COM route that an earlier,
    commented-out draft of this function used. Confirm before relying on it.

    Args:
        word_path: Path of the presentation to read.
        save_path: Path of the text file to write (UTF-8).
    """
    prs = Presentation(word_path)
    # ``with`` closes the file even if a slide fails to parse.
    with open(save_path, "w", encoding="utf-8") as f:
        for slide in prs.slides:
            for shape in slide.shapes:
                if not shape.has_text_frame:  # pictures, connectors, ...
                    continue
                # Write each paragraph exactly once. Writing
                # text_frame.text as well would duplicate every line.
                for paragraph in shape.text_frame.paragraphs:
                    f.write(paragraph.text + "\n")

def change_xls_to_txt(word_path, save_path):
    """Dump a spreadsheet to a space-separated text file.

    The workbook is loaded with ``pandas.read_excel`` using its defaults and
    written back out with ``DataFrame.to_csv``, using a single space as the
    field separator and omitting the index column. The original author notes
    that both xls and xlsx inputs work.

    Args:
        word_path: Path of the spreadsheet to read.
        save_path: Path of the text file to write.
    """
    sheet = pd.read_excel(word_path)
    sheet.to_csv(path_or_buf=save_path, index=False, sep=" ")

if __name__ == '__main__':
    # Command-line entry point: <script> <file path> <type code>.
    # The output is written next to the input as "<file path>.txt".

    # Type code given on the command line -> converter function.
    converters = {
        '1': change_doc_to_txt,
        '2': change_docx_to_txt,
        '3': change_xls_to_txt,
        '4': change_ppt_to_txt,
    }

    if len(sys.argv) != 3:
        # Tell the user what is expected instead of silently doing nothing.
        print('用法: python <脚本> <文件路径> <类型: 1=doc 2=docx 3=xls 4=ppt>')
        sys.exit(1)

    read_path, file_type = sys.argv[1], sys.argv[2]
    convert = converters.get(file_type)
    if convert is None:
        # An unknown code must not be reported as a successful save.
        print('不支持的类型: ' + file_type)
        sys.exit(1)

    save_path = read_path + '.txt'
    try:
        convert(read_path, save_path)
    except Exception as e:
        # Top-level boundary: report the failure and signal it to the caller.
        print(e)
        sys.exit(1)
    print('保存成功!')

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90

代码写到这里后突然想到:这种需求难道没有人已经用现成的exe解决了吗?于是找了一下,果然找到了
巨nb的office文字提取exe

下载后直接在cmd中调用其中的doctotext.exe即可,它支持从doc、docx等各种Office文档中直接提取文字

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/从前慢现在也慢/article/detail/391422
推荐阅读
相关标签
  

闽ICP备14008679号