赞
踩
python处理pdf文件的所有库 , https://stackabuse.com/working-with-pdfs-in-python-reading-and-splitting-pages/
以下是我列举的几种我所使用的库的简单用法,供大家参考
安装PyPDF2 , pip install PyPDF2
import PyPDF2
from urllib.request import urlopen

# Read a local PDF with PyPDF2: print the page count, then the text of one page.
# NOTE(review): PdfFileReader / numPages / getPage / extractText are the legacy
# PyPDF2 names; confirm they still exist in the installed PyPDF2 version.
# The context manager closes the file handle even if parsing raises.
with open('D:/ltn20190716133.pdf', 'rb') as file:
    fileReader = PyPDF2.PdfFileReader(file)
    # Total number of pages in the PDF.
    print(fileReader.numPages)
    # getPage() is zero-based, so index 172 is the 173rd page of the document.
    pageObj = fileReader.getPage(172)
    print(pageObj.extractText())
安装pdfplumber, pip install pdfplumber
一些常用的方法
.extract_text() 用来提取页面中的文本,将页面的所有字符对象整理为单个字符串
.extract_words() 返回的是所有的单词及其相关信息
.extract_tables() 提取页面的表格
.to_image() 用于可视化调试时,返回PageImage类的一个实例
import pdfplumber

# Open the PDF, print how many pages it has, then print every page's text.
# The `with` block closes the PDF when the body finishes or raises.
with pdfplumber.open("D:\\com.pdf") as pdf:
    page_count = len(pdf.pages)
    print(page_count)
    for page in pdf.pages:
        # Header line carrying the page number, followed by the page's text.
        print('---------- 第[%d]页 ----------' % page.page_number)
        print(page.extract_text())
import pdfplumber

path = 'D:/ltn201904301249.pdf'
# Use the context manager so the PDF is closed even if extraction raises,
# instead of relying on a trailing pdf.close() that an exception would skip.
with pdfplumber.open(path) as pdf:
    for page in pdf.pages:
        # All text on the current page, including the text inside tables.
        print(page.extract_text())
安装PyMuPDF, pip install PyMuPDF
参考文档:https://pymupdf.readthedocs.io/en/latest/document/#document
import fitz

# Open a PDF with PyMuPDF, print the page count, one page's text, then the
# text of every page.
# NOTE(review): pageCount / getPageText / getText are the older camelCase
# PyMuPDF names; confirm they exist in the installed PyMuPDF version.
doc = fitz.open(r"D:\ltn20190906213.pdf")
try:
    print(doc.pageCount)  # total number of pages
    # Page indexes are zero-based, so 12 selects the 13th page.
    print(doc.getPageText(12))
    for page in doc:
        t = page.getText()
        print(t)
finally:
    # Release the document even if text extraction fails part-way through.
    doc.close()
pdfminer ,安装pip install pdfminer3k
from urllib.request import urlopen
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import logging
import re


def readPdf(pdf_file):
    """Extract the text of a PDF with pdfminer and return it as one string.

    Args:
        pdf_file: Binary file-like object containing the PDF, e.g. the
            response returned by ``urlopen`` or a file opened in ``'rb'`` mode.

    Returns:
        The text that ``TextConverter`` wrote into the in-memory buffer.
    """
    # NOTE(review): this assigns an attribute on the ``logging`` module itself,
    # not on a logger object; the setLevel() call below is what raises the
    # root logger's threshold to ERROR.
    logging.propagate = False
    logging.getLogger().setLevel(logging.ERROR)

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The buffer and the converter are released even when parsing raises.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr=rsrcmgr, outfp=retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr=rsrcmgr, device=device, fp=pdf_file)
        finally:
            device.close()
        return retstr.getvalue()


# NOTE(review): the host part of this URL looks truncated - confirm the address.
url = "https://www1.sehk/2019/0430/ltn201904301249.pdf"
pdf_file = urlopen(url)  # a local PDF opened with open(..., 'rb') works as well
try:
    print(pdf_file)
    content = readPdf(pdf_file)
    # Replace newlines, tabs, non-breaking spaces and numeric HTML entities
    # with a plain space.
    s = r'(\n|\r|\xa0|/s/|\t| |&#\d*;)'
    body = re.sub(s,' ',content)
    # NOTE(review): both arguments appear to be a single space, which would
    # make this replace a no-op; it may have been meant to collapse double
    # spaces - confirm against the original post.
    print(body.replace(' ',' '))
finally:
    # Close the HTTP response even if extraction or printing fails.
    pdf_file.close()
下载网页pdf文件到本地的两种方法
第一种使用requests下载
import requests

# Download a PDF over HTTP with requests and save it as aa.pdf.
url = 'https://www1.sehk/2019/0430/ltn201904301249.pdf'
r = requests.get(url)
# Fail loudly on an HTTP error status so an error page is never saved as a PDF.
r.raise_for_status()
pdf = r.content
with open('aa.pdf','wb') as f:
    f.write(pdf)
第二种使用urllib
import urllib.request
import re
import os


def getFile(url):
    """Download *url* into the current working directory.

    The local file name is the part of *url* after the last ``/``. The body is
    copied in 8192-byte chunks, and a confirmation line is printed afterwards.

    Args:
        url: Address of the file to download.
    """
    file_name = url.split('/')[-1]
    block_sz = 8192
    # Both the HTTP response and the output file are closed even if a read or
    # write fails part-way through the transfer.
    with urllib.request.urlopen(url) as u, open(file_name, 'wb') as f:
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    print("Sucessful to download" + " " + file_name)


getFile("https://www1.sehk/2019/0430/ltn201904301249.pdf")
windows下将pdf文件生成图片
"""
1. Install the library: pip install pymupdf
2. Run this script directly.
"""
import fitz

# Open the PDF file and get a document object.
doc = fitz.open(r'D:/简历/王.pdf')

rotate = 0
# A zoom factor of 2 along each axis gives an image with four times the pixels.
zoom_x = 2.0
zoom_y = 2.0
# The transform is the same for every page, so build it once before the loop.
trans = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)

for pg in range(doc.pageCount):
    page = doc[pg]
    pm = page.getPixmap(matrix=trans, alpha=False)
    # writePNG emits PNG data, so the file is named .png rather than .jpg.
    pm.writePNG('%s.png' % pg)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。