读取 PDF 中的图片并识别内容（fitz.Pixmap）

作者：思考机器2 | 2024-01-31 19:19:15

踩

fitz.pixmap

import fitz
import time
import re
import os

from PIL import Image
from aip import AipOcr

import numpy as np


def _save_pixmap_as_png(pix, target):
    '''
    Write a pixmap to ``target`` as a PNG file.

    :param pix: the ``fitz.Pixmap`` to store
    :param target: path of the PNG file to create
    :return: None
    '''
    # PNG can only hold gray or RGB data (optionally with alpha). More than
    # three colour components therefore means CMYK, which must be converted.
    if pix.n - pix.alpha > 3:
        pix = fitz.Pixmap(fitz.csRGB, pix)
    # Newer PyMuPDF releases expose ``save``; ``writePNG`` is the legacy
    # spelling and is only used when ``save`` is not available.
    writer = getattr(pix, "save", None) or pix.writePNG
    writer(target)


def pdf2pic(path, pic_path):
    '''
    Extract the embedded images of a PDF and store each one as a PNG file.

    Every object of the document is inspected; objects whose definition
    contains both ``/Type /XObject`` and ``/Subtype /Image`` are treated as
    images and written to ``pic_path`` as ``1.png``, ``2.png``, ... in the
    order they are found. A summary (elapsed time and number of images) is
    printed once after the whole document has been scanned.

    :param path: path of the PDF file
    :param pic_path: directory the images are saved to (created if missing)
    :return: None
    '''
    t0 = time.perf_counter()
    # Patterns that identify an image object from its textual definition.
    checkXO = re.compile(r"/Type(?= */XObject)")
    checkIM = re.compile(r"/Subtype(?= */Image)")

    # Saving would fail if the target directory did not exist yet.
    if pic_path:
        os.makedirs(pic_path, exist_ok=True)

    doc = fitz.open(path)
    imgcount = 0
    try:
        lenXREF = doc.xref_length()

        # Report basic information about the PDF.
        print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))

        # Object numbers start at 1.
        for i in range(1, lenXREF):
            text = doc.xref_object(i)
            # Skip anything that is not an image XObject.
            if not (checkXO.search(text) and checkIM.search(text)):
                continue
            imgcount += 1
            pix = fitz.Pixmap(doc, i)
            _save_pixmap_as_png(pix, os.path.join(pic_path, f"{imgcount}.png"))
            # Drop the reference so the pixmap can be freed before the next one.
            pix = None
    finally:
        # Always release the document, even if extraction fails midway.
        doc.close()

    # Summary is printed once, after the scan, not once per image.
    t1 = time.perf_counter()
    print("运行时间:{}s".format(t1 - t0))
    print("提取了{}张图片".format(imgcount))

class Tap_img:
    '''
    Image helper: resizes pictures and extracts "label -> percentage" pairs
    from a picture through the ``AipOcr`` client.
    '''

    # Credentials handed to AipOcr; they are empty here and must be filled in
    # before get_file_content can talk to the OCR service.
    APP_ID = ''
    API_KEY = ''
    SECRET_KEY = ''

    def ResizeImage(self, img_path, width=500, height=900):
        '''
        Resize an image and overwrite the original file with the result.

        :param img_path: path of the image; it is both read and overwritten
        :param width: target width in pixels
        :param height: target height in pixels
        :return: None
        '''
        with Image.open(img_path) as img:
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
            out = img.resize((width, height), Image.LANCZOS)
        # Save only after the source file is closed, since the same path is rewritten.
        out.save(img_path)

    @staticmethod
    def _parse_percentages(words):
        '''
        Convert recognised text fragments into a ``{label: "NN%"}`` mapping.

        The fragments are concatenated and split at every ``%``. In each piece
        the first run of Chinese characters is the label and the first number
        (integer or decimal) is the value. Pieces lacking either are skipped.

        :param words: iterable of recognised text strings, in reading order
        :return: dict mapping each label to its percentage string
        '''
        # Whatever follows the last '%' has no percentage attached to it.
        segments = ''.join(words).split('%')[:-1]

        res_dict = {}
        for segment in segments:
            # \u4e00-\u9fa5 is the full basic CJK ideograph range.
            label = re.search(r'[\u4e00-\u9fa5]+', segment)
            number = re.search(r'\d+(?:\.\d+)?', segment)
            if label is None or number is None:
                continue
            res_dict[label.group()] = number.group() + '%'
        return res_dict

    def get_file_content(self, filepath):
        '''
        Run OCR on an image file and return the percentages found in it.

        :param filepath: path of the image file to recognise
        :return: dict mapping each Chinese label to a string such as ``"50%"``
        :raises KeyError: if the OCR response contains no ``words_result``
        '''
        client = AipOcr(self.APP_ID, self.API_KEY, self.SECRET_KEY)
        with open(filepath, 'rb') as fp:
            image = fp.read()

        # Request options. NOTE(review): the OCR API documents these keys with
        # underscores; the hyphenated spellings used before are presumably
        # ignored by the service - confirm against the SDK documentation.
        options = {
            # Ask the service to detect the image orientation.
            'detect_direction': 'true',
            'language_type': 'CHN_ENG'
        }
        result = client.general(image, options)
        if 'words_result' not in result:
            # Surface the service's own error text instead of a bare KeyError.
            raise KeyError(
                "OCR response has no 'words_result': {}".format(
                    result.get('error_msg', result)))

        content_list = [word['words'] for word in result['words_result']]
        return self._parse_percentages(content_list)


if __name__ == '__main__':
    # Step 1: pull the images out of the PDF and store them locally.
    pdf_file, image_dir = '1.pdf', './img'
    pdf2pic(pdf_file, image_dir)

    # Step 2: run OCR on one of the extracted images and show the result.
    recognizer = Tap_img()
    print(recognizer.get_file_content('img/2.png'))






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/article/detail/51615