赞
踩

采用代码、数据、结果结合的方式按操作顺序给出

此文件夹只用于存储pdf文件
提示:运行前需要先修改代码中文件的存放与读取路径。此代码会先把从 PDF 读取到的原始文本保存为中间文件,再读取该中间文件进行数据清洗和处理。
# Batch export of report figures from PDF files.
#
# Phase 1 dumps the text of the first page of every PDF found under `dir_path`
# into an intermediate "<pdf name>.csv" file. Phase 2 reads those dumps back,
# cleans them into a token table and picks the report fields out by position.
# The collected rows are written to one summary CSV in the same folder.
import os
import warnings

import pandas as pd
import pdfplumber

warnings.filterwarnings("ignore")

# Folder that holds the source PDFs. The intermediate dumps and the summary
# are written here as well. Adjust this path before running.
dir_path = r'C:\Users\24655\Desktop\python知识管理\python脚本和算法\脚本\read_pdf\数据'

# One encoding for everything this script writes and reads back, so the dump
# can always be re-read regardless of the platform's default encoding.
CSV_ENCODING = 'gbk'

# Header of the summary file, in the order produced by _extract_record().
SUMMARY_COLUMNS = ['地图', '项目类型', '车效(箱//车)', '整仓效率(箱/)', '单箱处理耗时',
                   '车数', '任务数', '出库工作站类型', '出库工作站数量', '是否有输送线',
                   '出库拣货时间(h)', '执行任务总时长(h)', '倍速']


def _find_pdfs(root):
    """Return a list of (file name, full path) for every PDF below *root*.

    Only names ending in ".pdf" are kept: the script writes its own CSV files
    into the same folder, and those must not be opened as PDFs on a re-run.
    Name and path are collected together so they can never get out of step.
    """
    found = []
    for folder, _subdirs, names in os.walk(root):
        for name in names:
            if name.lower().endswith('.pdf'):
                found.append((name, os.path.join(folder, name)))
    return found


def _raw_csv_path(pdf_name):
    """Return the path of the intermediate text dump for *pdf_name*."""
    return os.path.join(dir_path, '{}.csv'.format(pdf_name))


def _dump_first_page(pdf_path, csv_path):
    """Write the extracted text of the first page of *pdf_path* to *csv_path*."""
    with pdfplumber.open(pdf_path) as pdf:
        # Only the first page is read.
        text = pdf.pages[0].extract_text()
    with open(csv_path, 'w', encoding=CSV_ENCODING) as file:
        file.write(str(text) + '\n')


def _load_report_table(csv_path):
    """Read one dump and return its cleaned rows split into space-separated tokens.

    The first line of the dump becomes the CSV header; the code relies on that
    header containing a column named "admin".
    """
    df = pd.read_csv(csv_path, encoding=CSV_ENCODING)
    # Keep rows labelled 2..11 (label slicing is inclusive).
    df = df.loc[2:11].copy()

    # Number of tokens in the first cell of row 3, measured before the merge
    # below. Works around the PDF extraction putting "4s" either on the same
    # line or on a line of its own, which changes which rows must be dropped.
    token_count = len(df.loc[3].str.split(' ').iloc[0])

    # Row 4 is appended to row 3 in both layouts.
    df.loc[3] = df.loc[3] + ' ' + df.loc[4]
    if token_count <= 2:
        df = df.drop(index=[4, 5, 6, 9])
    else:
        df = df.drop(index=[4, 5, 8])

    # Renumber the surviving rows from 0 and split each line into columns.
    return df['admin'].reset_index(drop=True).str.split(' ', expand=True)


def _extract_record(pdf_name, table):
    """Pick the summary fields for one report out of its token table.

    Returns a list ordered like SUMMARY_COLUMNS. Positions are fixed (row,
    column) coordinates in *table*.
    """
    # Works around the PDF recognition gluing the report code and the task
    # name together: when the table has more than 14 columns, some row-3
    # fields sit one column further right.
    shift = 0 if table.shape[1] <= 14 else 1
    return [
        pdf_name,                    # map name (the PDF file name)
        table.loc[3, 2 + shift],     # project type
        table.loc[1, 0],             # per-vehicle efficiency
        table.loc[1, 1],             # whole-warehouse efficiency
        table.loc[1, 2],             # handling time per container
        table.loc[5, 0],             # number of vehicles
        # NOTE(review): unlike the other row-3 fields this column is not
        # shifted in the wide layout - confirm that is intended.
        table.loc[3, 11],            # number of tasks
        table.loc[5, 6],             # outbound workstation type
        table.loc[5, 5],             # outbound workstation count
        table.loc[5, 9],             # whether there is a conveyor line
        table.loc[3, 8 + shift],     # outbound picking time (h)
        table.loc[3, 12 + shift],    # total task execution time (h)
        table.loc[3, 3 + shift],     # speed multiplier
    ]


pdf_files = _find_pdfs(dir_path)

# Phase 1: dump the first-page text of every PDF.
for pdf_name, pdf_path in pdf_files:
    _dump_first_page(pdf_path, _raw_csv_path(pdf_name))

# Phase 2: parse every dump into one summary row.
records = [
    _extract_record(pdf_name, _load_report_table(_raw_csv_path(pdf_name)))
    for pdf_name, _pdf_path in pdf_files
]

# Building the frame once with explicit columns also keeps the empty case
# (no PDFs found) valid: the summary then contains only the header row.
df_final = pd.DataFrame(records, columns=SUMMARY_COLUMNS)
# NOTE(review): the cells are text tokens, so this ordering is lexicographic
# ("10" sorts before "2") - confirm that is acceptable.
df_final = df_final.sort_values(by='倍速', axis=0, ascending=True)
df_final.to_csv(
    os.path.join(dir_path, '{}个pdf文件数据汇总.csv'.format(len(pdf_files))),
    index=False,
    encoding=CSV_ENCODING,
)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。