赞
踩
pdf,搜索相关路径下pdf文档,把pdf文档移动到指定的路径下,再对该路径下的pdf文档进行转换
缺点:无法转换带有图片的pdf文档,且转换后的文档存在格式问题
安装相应的库
1)pip install pdfminer3k 是pdfminer的Python 3端口
2)安装docx库
pip install python_docx
使用了os.walk对特定路径下的pdf文档进行查找,并对该文档进行移动
import os
import shutil
import importlib
import sys
import re


def load_file(src_dir=r'C:\Users\ALFIEL\Desktop\20190527',
              dest_dir=r'C:\Users\ALFIEL\Desktop\pdfdocement'):
    """Collect every PDF found under ``src_dir`` into ``dest_dir``.

    The source tree is walked recursively with ``os.walk``.  Each file whose
    name ends in ``.pdf`` (case-insensitive, with at least one word character
    before the dot) is moved into ``dest_dir`` and renamed to the next free
    ``<number>.pdf``.

    Args:
        src_dir: Root directory that is searched for PDF files.
        dest_dir: Directory that receives the renamed PDF files.  It is
            created on demand when the first PDF is found.

    Returns:
        None.  The function works purely through file-system side effects.
    """
    # Anchored at the end of the name so that e.g. "x.pdf.bak" is not moved.
    pdf_name = re.compile(r'\w\.pdf$', re.IGNORECASE)
    dest_key = os.path.normcase(os.path.abspath(dest_dir))

    i = 1
    for root, dirs, files in os.walk(src_dir):
        print((root, dirs, files))

        # Never descend into the destination itself; otherwise files that
        # were just moved would be picked up and renumbered again.
        dirs[:] = [
            d for d in dirs
            if os.path.normcase(os.path.abspath(os.path.join(root, d))) != dest_key
        ]

        for name in files:
            if not pdf_name.search(name):
                continue

            os.makedirs(dest_dir, exist_ok=True)

            # Skip numbers that are already taken so that running the script
            # a second time does not overwrite earlier results.
            target = os.path.join(dest_dir, str(i) + ".pdf")
            while os.path.exists(target):
                i += 1
                target = os.path.join(dest_dir, str(i) + ".pdf")

            shutil.move(os.path.join(root, name), target)
            i += 1


load_file()
# PDF converter
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from docx import Document
from pdfminer.layout import *
# NOTE(review): not referenced by any function in this script.
document = Document()
import warnings
warnings.filterwarnings("ignore")
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from urllib.request import urlopen
import pandas as pd
import os
import re

# Directory that main() and main2() scan when no directory is passed in.
PDF_DIR = r"C:\Users\ALFIEL\Desktop\pdfdocement"

# File-name filter: a word character followed by a trailing ".pdf".
_PDF_NAME = re.compile(r'\w\.pdf$', re.IGNORECASE)


def _iter_pdf_paths(top):
    """Yield the full path of every PDF file found below ``top``."""
    for root, dirs, files in os.walk(top):
        for name in files:
            if _PDF_NAME.search(name):
                yield os.path.join(root, name)


def readPDF(pdfFile):
    """Extract the text of a PDF and return it as one string.

    Args:
        pdfFile: A binary file object positioned at the start of a PDF.

    Returns:
        The text produced by pdfminer's ``TextConverter``.
    """
    # The resource manager holds resources shared between pages.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The extracted text is collected in memory.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            device.close()
        return retstr.getvalue()


def save_to_file(file_name, contents):
    """Write ``contents`` to ``file_name`` as UTF-8 text.

    Works for any plain-text target (txt, csv, ...).  The explicit
    ``encoding='utf-8'`` is required; without it a gbk encoding error is
    raised when writing.

    Args:
        file_name: Path of the file to create or overwrite.
        contents: Text to write.
    """
    with open(file_name, 'w', encoding='utf-8') as fh:
        fh.write(contents)


def main(pdf_dir=PDF_DIR):
    """Convert every PDF under ``pdf_dir`` to a text/CSV file.

    Each PDF gets its own output file next to it (same name, ``.csv``
    extension), so that one PDF no longer overwrites the result of the
    previous one.

    Args:
        pdf_dir: Directory tree that is searched for PDF files.
    """
    for pdf_path in _iter_pdf_paths(pdf_dir):
        with open(pdf_path, 'rb') as pdfFile:
            outputString = readPDF(pdfFile)
        save_to_file(os.path.splitext(pdf_path)[0] + '.csv', outputString)


def save_to_doxc(file_name, path):
    """Append the text of the PDF ``file_name`` to the file ``path``.

    Every layout object that offers ``get_text`` is written to ``path``
    followed by a newline.  Per-type object counts are printed at the end.

    Args:
        file_name: Path (or open file descriptor) of the PDF to read.
        path: Output file; text is appended using UTF-8.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # Read the PDF in binary mode; the handle is closed when the block exits.
    with open(file_name, 'rb') as fn:
        # Create a parser from the file object and an empty PDF document,
        # then connect the two.
        parser = PDFParser(fn)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Supply the initial password; none is given here.
        doc.initialize()

        # Refuse documents that do not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        resource = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resource, laparams=laparams)
        # The interpreter renders each page into the aggregator device.
        interpreter = PDFPageInterpreter(resource, device)

        num_page = num_image = num_curve = num_figure = num_TextBoxHorizontal = 0

        # Open the output once instead of once per text object.
        with open(path, 'a', encoding='utf-8') as f:
            for page in doc.get_pages():  # handle one page per iteration
                num_page += 1
                interpreter.process_page(page)
                layout = device.get_result()
                print(layout)
                for x in layout:
                    if isinstance(x, LTImage):
                        num_image += 1
                    if isinstance(x, LTCurve):
                        num_curve += 1
                    if isinstance(x, LTFigure):
                        num_figure += 1
                    if isinstance(x, LTTextBoxHorizontal):
                        num_TextBoxHorizontal += 1
                    # Save the text of any object that provides get_text.
                    if hasattr(x, "get_text"):
                        results = x.get_text()
                        print(results)
                        f.write(results)
                        f.write('\n')

    print('对象数量:\n', '页面数:%s\n' % num_page, '图片数:%s\n' % num_image,
          '曲线数:%s\n' % num_curve, '水平文本框:%s\n' % num_TextBoxHorizontal)
    print('处理完成')


def main2(pdf_dir=PDF_DIR):
    """Run ``save_to_doxc`` on every PDF under ``pdf_dir``.

    The output is written next to each PDF with the extension replaced by
    ``.doc``.

    Args:
        pdf_dir: Directory tree that is searched for PDF files.
    """
    for pdf_path in _iter_pdf_paths(pdf_dir):
        # Pass the path itself: save_to_doxc opens (and closes) the file in
        # binary mode, so no raw descriptor is left open here.
        save_to_doxc(pdf_path, path=os.path.splitext(pdf_path)[0] + '.doc')


if __name__ == '__main__':
    main2()
注意:当写入txt、csv文件时,需要把编码模式encoding设置为utf-8
还有通过调用win32接口进行doc的转换
from win32com.client import Dispatch, constants

# Value passed to Document.SaveAs to select PDF output (wdFormatPDF).
WD_FORMAT_PDF = 17


def doc2pdf(input1, output):
    """Convert a Word document to PDF through the Word COM interface.

    Args:
        input1: Path of the Word document to open (opened read-only).
        output: Path of the PDF file to write.

    Returns:
        True when the document was saved, False when opening or saving
        raised an exception (the exception is printed).
    """
    w = Dispatch('Word.Application')
    doc = None  # stays None when Documents.Open fails
    try:
        doc = w.Documents.Open(input1, ReadOnly=1)
        doc.SaveAs(output, WD_FORMAT_PDF)
        return True
    except Exception as e:
        print(e)
        return False
    finally:
        # Only close what was actually opened; calling Close on an unbound
        # name would hide the real error behind an UnboundLocalError.
        if doc is not None:
            doc.Close()
        # Release Word when no other document is open in this instance, so
        # that the process does not linger in the background.
        if w.Documents.Count == 0:
            w.Quit()


def main():
    """Convert the sample document and report the outcome."""
    input1 = r'C:\Users\ALFIEL\Desktop\pdfdocement\1.doc'
    output = r'C:\Users\ALFIEL\Desktop\pdfdocement\6.pdf'
    rc = doc2pdf(input1, output)
    if rc:
        print('转换成功')
    else:
        print('转换失败')


if __name__ == '__main__':
    main()

# In SaveAs(output, wdformat), the wdformat parameter takes the following values:
wdFormatDocument = 0
wdFormatDocument97= 0
wdFormatDocumentDefault = 16
wdFormatDOSText = 4
wdFormatDOSTextLineBreaks = 5
wdFormatEncodedText = 7
wdFormatFilteredHTML = 10
wdFormatFlatXML = 19
wdFormatFlatXMLMacroEnabled = 20
wdFormatFlatXMLTemplate = 21
wdFormatFlatXMLTemplateMacroEnabled = 22
wdFormatHTML = 8
wdFormatPDF = 17
wdFormatRTF = 6
wdFormatTemplate = 1
wdFormatTemplate97 = 1
wdFormatText = 2
wdFormatTextLineBreaks = 3
wdFormatUnicodeText = 7
wdFormatWebArchive = 9
wdFormatXML = 11
wdFormatXMLDocument = 12
wdFormatXMLDocumentMacroEnabled = 13
wdFormatXMLTemplate = 14
wdFormatXMLTemplateMacroEnabled = 15
wdFormatXPS = 18
照着字面意思应该能对应到相应的文件格式,如果你是office 2003可能支持不了这么多格式。word文件转html有两种格式可选wdFormatHTML、wdFormatFilteredHTML(对应数字8、10),区别是如果是wdFormatHTML格式的话,word文件里面的公式等ole对象将会存储成wmf格式,而选用wdFormatFilteredHTML的话公式图片将存储为gif格式,而且目测可以看出用wdFormatFilteredHTML生成的HTML明显比wdFormatHTML要干净许多。
原文链接:https://blog.csdn.net/weixin_41341221/article/details/100204128
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。