当前位置:   article > 正文

python pdf模块_用于将PDF转换为文本的Python模块

parser = pdfparser(open(pdf_filename,'rb'))

def pdf_to_csv(filename):

from cStringIO import StringIO

from pdfminer.converter import LTChar, TextConverter

from pdfminer.layout import LAParams

from pdfminer.pdfparser import PDFDocument, PDFParser

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

class CsvConverter(TextConverter):

def __init__(self, *args, **kwargs):

TextConverter.__init__(self, *args, **kwargs)

def end_page(self, i):

from collections import defaultdict

lines = defaultdict(lambda : {})

for child in self.cur_item._objs: #

if isinstance(child, LTChar):

(_,_,x,y) = child.bbox

line = lines[int(-y)]

line[x] = child._text.encode(self.codec) #

for y in sorted(lines.keys()):

line = lines[y]

self.outfp.write(";".join(line[x] for x in sorted(line.keys())))

self.outfp.write("\n")

# ... the following part of the code is a remix of the

# convert() function in the pdfminer/tools/pdf2text module

rsrc = PDFResourceManager()

outfp = StringIO()

device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

# becuase my test documents are utf-8 (note: utf-8 is the default codec)

doc = PDFDocument()

fp = open(filename, 'rb')

parser = PDFParser(fp)

parser.set_document(doc)

doc.set_parser(parser)

doc.initialize('')

interpreter = PDFPageInterpre

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/article/detail/51359
推荐阅读
相关标签
  

闽ICP备14008679号