# [web page residue] Like
# [web page residue] Dislike
import codecs
import os
import sys

import pdfplumber
import PyPDF2


def _append_line(path, text):
    """Append ``text`` followed by a newline to the UTF-8 file at ``path``."""
    with open(path, mode='a', encoding='utf-8') as out:
        out.write(text + "\n")


def extract_content(pdf_path, version_index=9, password_index=11):
    """Extract the text of a PDF and record two of its lines.

    The text of every page is concatenated (each non-empty page followed by
    a newline) and written to ``content.txt`` in the current working
    directory, replacing whatever that file held before.  The text is then
    split into lines, and the lines at ``version_index`` and
    ``password_index`` are appended to ``version.txt`` and ``password.txt``
    respectively, one line per call.

    Args:
        pdf_path: Path of the PDF file to read.
        version_index: Zero-based index of the line appended to
            ``version.txt``.  Defaults to 9 (the tenth line).
        password_index: Zero-based index of the line appended to
            ``password.txt``.  Defaults to 11 (the twelfth line).

    Raises:
        IndexError: If the extracted text has too few lines for either
            index.  ``content.txt`` has already been written at that point;
            ``version.txt`` and ``password.txt`` are left untouched.
    """
    with pdfplumber.open(pdf_path) as pdf_file:
        page_texts = [page.extract_text() for page in pdf_file.pages]

    # Pages for which extract_text() yields nothing are skipped entirely.
    content = "".join(text + "\n" for text in page_texts if text)

    with open('content.txt', mode='w', encoding='utf-8') as out:
        out.write(content)

    # Split the in-memory text rather than re-reading content.txt: the line
    # terminator stored on disk depends on the platform, so stripping a fixed
    # "\r\n" from re-read lines only works on Windows.
    lines = content.splitlines()
    try:
        version_line = lines[version_index]
        password_line = lines[password_index]
    except IndexError:
        raise IndexError(
            f"{pdf_path!r}: extracted text has only {len(lines)} line(s); "
            f"cannot read lines {version_index} and {password_index}"
        ) from None

    _append_line('version.txt', version_line)
    _append_line('password.txt', password_line)


def strip_suffix(filename):
    """Return ``filename`` without its final extension.

    Only the last extension is removed (``"a.tar.gz"`` -> ``"a.tar"``).
    A name with no extension is returned unchanged instead of losing its
    last character.

    Args:
        filename: File name or path as a string.

    Returns:
        The name with the trailing ``.ext`` part removed, if there is one.
    """
    return os.path.splitext(filename)[0]


def pdf_files(file_dir):
    """Return the paths of the PDF files directly inside ``file_dir``.

    The directory is not searched recursively.

    Args:
        file_dir: Path of the directory to scan.

    Returns:
        A list of ``os.path.join(file_dir, name)`` paths, sorted by file
        name, for every regular file whose extension is ``.pdf`` (compared
        case-insensitively).  If ``file_dir`` is not an existing directory,
        a message is printed and an empty list is returned.
    """
    if not os.path.isdir(file_dir):
        print(f"pdf_files: not a directory: {file_dir!r}")
        return []

    # Sort the names so the result does not depend on the order in which
    # the operating system happens to list the directory.
    candidates = (
        os.path.join(file_dir, name) for name in sorted(os.listdir(file_dir))
    )
    return [
        path
        for path in candidates
        if os.path.isfile(path) and os.path.splitext(path)[1].lower() == '.pdf'
    ]


if __name__ == '__main__':
    # Directory holding the PDFs.  An optional first command-line argument
    # overrides the built-in default location.
    default_dir = r'C:\Users\m1317\Desktop\allfiles'
    file_dir = sys.argv[1] if len(sys.argv) > 1 else default_dir

    # Iterate over the result directly so the name pdf_files keeps referring
    # to the function instead of being rebound to the list it returns.
    for pdf_path in pdf_files(file_dir):
        extract_content(pdf_path)

# Copyright © 2003-2013 www.wpsshop.cn. All rights reserved.