Python PDF 模块：用于将 PDF 转换为文本的 Python 模块

作者：算法编织者 | 2024-01-31 18:47:45

踩

# Open pdf_filename in binary read mode and pass the file object to pdfparser.
# NOTE(review): neither pdfparser nor pdf_filename is defined in the visible
# text, and the opened file is not closed here -- confirm against the full source.
parser = pdfparser(open(pdf_filename,'rb'))

def pdf_to_csv(filename):

from cStringIO import StringIO

from pdfminer.converter import LTChar, TextConverter

from pdfminer.layout import LAParams

from pdfminer.pdfparser import PDFDocument, PDFParser

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

class CsvConverter(TextConverter):
    """Text converter that writes each page's characters as ';'-separated rows.

    Characters are grouped by the integer part of their negated ``y``
    coordinate and ordered by ``x`` inside each group.
    """

    def __init__(self, *args, **kwargs):
        # Pure pass-through: every argument goes to TextConverter unchanged.
        TextConverter.__init__(self, *args, **kwargs)

    def end_page(self, i):
        """Write the characters collected for the current page to ``self.outfp``.

        ``i`` is accepted but not used by this method.
        """
        from collections import defaultdict

        # rows[row_key][x] -> encoded character text
        rows = defaultdict(dict)
        for item in self.cur_item._objs:
            if not isinstance(item, LTChar):
                continue
            # Only the last two bbox entries are used.
            # NOTE(review): presumably bbox is (x0, y0, x1, y1) -- confirm
            # against the pdfminer layout documentation.
            _, _, x_pos, y_pos = item.bbox
            # Negating y makes the ascending sort below visit larger y first.
            row = rows[int(-y_pos)]
            row[x_pos] = item._text.encode(self.codec)

        for row_key in sorted(rows):
            cells = rows[row_key]
            # One output line per row: cells in ascending x, joined by ';'.
            self.outfp.write(";".join(cells[col] for col in sorted(cells)))
            self.outfp.write("\n")

# ... the following part of the code is a remix of the

# convert() function in the pdfminer/tools/pdf2text module

rsrc = PDFResourceManager()

outfp = StringIO()

device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

# because my test documents are utf-8 (note: utf-8 is the default codec)

doc = PDFDocument()

fp = open(filename, 'rb')

parser = PDFParser(fp)

parser.set_document(doc)

doc.set_parser(parser)

doc.initialize('')

interpreter = PDFPageInterpre

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/article/detail/51359