赞
踩
只能说是网上代码进行优化后的究极缝合怪
# Command-line helper that extracts the text of an Office document into a
# ".txt" file written next to the input.
#
# Usage: python <script> <input_path> <type>
#   type 1 = .doc (via Word COM automation, Windows only)
#   type 2 = .docx (python-docx)
#   type 3 = Excel workbook (pandas)
#   type 4 = .pptx (python-pptx)
import codecs
import os
import sys

import docx
import docx2txt
import pandas as pd
import textract
import win32com.client
from pptx import Presentation
from pptx.util import Cm, Pt

# Value passed as Word's SaveAs file format to request plain text
# (the original code used the literal 2 for this).
WD_FORMAT_TEXT = 2


def change_doc_to_txt(word_path, save_path):
    """Convert a Word document to plain text by driving Word through COM.

    Args:
        word_path: Path of the document to open.
        save_path: Path of the text file Word should write.

    Both paths are made absolute before being handed to Word, because Word
    resolves relative paths against its own working directory rather than
    the script's. The document is closed and Word is quit even when opening
    or saving fails, so no orphaned Word process is left behind.
    """
    word = win32com.client.Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(os.path.abspath(word_path))
        try:
            print('保存中。。。')
            doc.SaveAs(os.path.abspath(save_path), WD_FORMAT_TEXT)
        finally:
            doc.Close()
    finally:
        word.Quit()


def change_docx_to_txt(word_path, save_path):
    """Write the text of a .docx file to ``save_path``.

    All body paragraphs are written first, one per line, followed by every
    table: one line per row, with cells separated by tabs. The relative
    order of paragraphs and tables in the document is therefore not
    preserved (same as the original implementation).

    Args:
        word_path: Path of the .docx file to read.
        save_path: Path of the text file to create (platform default
            encoding, as before).
    """
    print('读取中。。。')
    doc = docx.Document(word_path)
    # "with" guarantees the output handle is closed even if a write fails.
    with open(save_path, "w") as f:
        for paragraph in doc.paragraphs:
            # Newline keeps the last word of one paragraph from fusing with
            # the first word of the next.
            f.write(paragraph.text + "\n")
        for table in doc.tables:
            for row in table.rows:
                f.write("\t".join(cell.text for cell in row.cells) + "\n")


def change_ppt_to_txt(word_path, save_path):
    """Write the text of every text-bearing shape of a presentation.

    The file is read with python-pptx's ``Presentation``, which handles the
    .pptx format; legacy binary .ppt files would need PowerPoint COM
    automation instead.

    Each shape's text is written exactly once, followed by a newline. The
    previous version wrote ``text_frame.text`` and then every paragraph of
    the same frame again, duplicating all content.

    Args:
        word_path: Path of the presentation to read.
        save_path: Path of the text file to create (platform default
            encoding, as before).
    """
    prs = Presentation(word_path)
    with open(save_path, "w") as f:
        for slide in prs.slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    # text_frame.text already contains all of the frame's
                    # paragraphs, so nothing else needs to be written.
                    f.write(shape.text_frame.text + "\n")


def change_xls_to_txt(word_path, save_path):
    """Dump an Excel workbook to a space-separated text file via pandas.

    ``pd.read_excel`` is called with its defaults and the resulting frame
    is written with a single space as the field separator and without the
    index column.

    Args:
        word_path: Path of the workbook to read.
        save_path: Path of the text file to create.
    """
    excel_file = pd.read_excel(word_path)
    excel_file.to_csv(save_path, sep=' ', index=False)


# Maps the command-line type code to the converter that handles it.
_CONVERTERS = {
    '1': change_doc_to_txt,
    '2': change_docx_to_txt,
    '3': change_xls_to_txt,
    '4': change_ppt_to_txt,
}


def main(argv=None):
    """Run the converter selected on the command line.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``. Expected form: ``[input_path, type_code]``.

    Returns:
        0 on success, 1 if the conversion raised, 2 on a usage error.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 2:
        print('usage: <input_path> <type: 1=doc 2=docx 3=xls 4=ppt>',
              file=sys.stderr)
        return 2

    read_path, kind = args
    converter = _CONVERTERS.get(kind)
    if converter is None:
        # Previously an unknown code fell through and still reported
        # success although nothing had been written.
        print('unknown type {!r}; expected one of 1, 2, 3, 4'.format(kind),
              file=sys.stderr)
        return 2

    save_path = read_path + '.txt'
    try:
        converter(read_path, save_path)
    except Exception as e:  # top-level boundary: report and signal failure
        print(e)
        return 1

    print('保存成功!')
    return 0


if __name__ == '__main__':
    sys.exit(main())
代码写到这里后突然想到:这种需求难道不会早就有人做成现成的exe解决了吗?于是搜了一下,果然找到了
巨nb的office文字提取exe
下载后直接在cmd里调用其中的doctotext.exe即可,它支持doc、docx等各种office文档,可以直接提取文字
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。