赞
踩
python读取PDF文件中文本、表格、图片
提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
基于fitz,提取文本内容(跳过图片块和高度过小的文本块)
import fitz

# Extract text from a PDF with PyMuPDF, skipping image blocks and blocks whose
# bounding box is shorter than a threshold, then print the collected text.
pdf_file = "example.pdf"

# Minimum bounding-box height (in page coordinate units) a block must have to
# be kept. This is only an example heuristic: tune it, or filter on block
# position or other attributes, to drop table cells and similar small text.
MIN_BLOCK_HEIGHT = 20

text_parts = []
# The context manager closes the document even if extraction raises.
with fitz.open(pdf_file) as pdf_document:
    for page in pdf_document:
        for block in page.get_text("blocks"):
            # Each block is a tuple:
            # (x0, y0, x1, y1, text, block_no, block_type)
            y0, y1 = block[1], block[3]
            text_block = block[4]
            block_type = block[6]

            if y1 - y0 < MIN_BLOCK_HEIGHT:  # skip small blocks
                continue
            # Use the block type (0 = text, 1 = image) rather than searching
            # the text for the word "image", which would also discard real
            # paragraphs that merely mention images.
            if block_type != 0:
                continue
            text_parts.append(text_block)

# Join once instead of growing a string inside the loop.
text = "".join(text_parts)
print(text)
基于fitz,提取图片并保存为PNG
import fitz

# Walk every page of the PDF and save each embedded image as a PNG file named
# after the page index and the 1-based image index.
doc = fitz.open("example.pdf")

for page_index, page in enumerate(doc):
    image_list = page.get_images()

    # Report how many images this page holds.
    if not image_list:
        print("No images found on page", page_index)
    else:
        print(f"Found {len(image_list)} images on page {page_index}")

    for image_index, img in enumerate(image_list, start=1):
        xref = img[0]  # first item of the image entry is its XREF
        pix = fitz.Pixmap(doc, xref)

        # CMYK: convert to RGB first
        needs_rgb_conversion = pix.n - pix.alpha > 3
        if needs_rgb_conversion:
            pix = fitz.Pixmap(fitz.csRGB, pix)

        pix.save("page_%s-image_%s.png" % (page_index, image_index))
        pix = None  # drop the reference to the Pixmap
基于fitz,提取图片并保存为PNG(与上一段代码相同)
import fitz

# Export all images found in the PDF, page by page, as PNG files.
doc = fitz.open("example.pdf")

for page_index, page in enumerate(doc):
    image_list = page.get_images()

    # Print the number of images found on the page.
    if image_list:
        print(f"Found {len(image_list)} images on page {page_index}")
    else:
        print("No images found on page", page_index)

    image_index = 0
    for img in image_list:
        image_index += 1  # image numbering starts at 1
        xref = img[0]  # XREF of the image
        pix = fitz.Pixmap(doc, xref)
        colour_components = pix.n - pix.alpha
        if colour_components > 3:
            # CMYK: convert to RGB first
            pix = fitz.Pixmap(fitz.csRGB, pix)
        target_name = "page_%s-image_%s.png" % (page_index, image_index)
        pix.save(target_name)
        pix = None  # drop the reference to the Pixmap
基于fitz,将表格数据当作文本内容抽取
import fitz
# Write the plain text of every page to output.txt, with a form feed after
# each page. Both the document and the output file are opened through context
# managers so they are closed even if text extraction raises part-way through.
with fitz.open("example.pdf") as doc, open("output.txt", "wb") as out:
    for page in doc:  # iterate the document pages
        text = page.get_text().encode("utf8")  # plain text, encoded as UTF-8
        out.write(text)  # write text of page
        out.write(bytes((12,)))  # page delimiter (form feed 0x0C)
基于pdfplumber
import pdfplumber
import pandas as pd
# Open the PDF; the context manager closes it once the tables are printed.
with pdfplumber.open("example.pdf") as pdf:
    # pdf.pages is indexed from 0, so index 1 is the second page.
    second_page = pdf.pages[1]
    # Detect the tables on the page; the result is a list of tables, each of
    # which is a list of rows.
    tables = second_page.extract_tables(table_settings={})
    for table in tables:
        # The first row supplies the column names, the rest are data rows.
        frame = pd.DataFrame(table[1:], columns=table[0])
        print(frame)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。