Python读取PDF文件并提取关键信息到Excel中
以前做过的一个项目,需要从PDF格式的检查报告中提取关键信息到Excel中
import os
import re

import numpy as np  # not used below; kept because it is part of the original import set
import pandas as pd
import pdfplumber
from tqdm import tqdm

# Show every row/column when a DataFrame is printed while debugging.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# NOTE(review): these three paths are the placeholders used by the original
# script; point them at the real report folder and output files before running.
PDF_ROOT_DIR = r"C:\Users\XX"
RESULT_CSV_PATH = r"xx.csv"
FAILED_CSV_PATH = r"xx"

# NOTE(review): the quoting and backslashes of these patterns were lost in the
# copy this script was recovered from. They are reconstructed on the assumption
# that "s" stood for "\s" and that the repeated colons were full-width and
# half-width variants -- confirm against real report text.
_RESULT1_RE = re.compile(r"结果[::,\s]{1,}(.+?)[。]{0,}[ ,\s]")
_RESULT1_FALLBACK_RE = re.compile(r"结果[::,\s]{0,}(.+?)。")
_RESULT2_DOCTOR_RE = re.compile(
    r"(本次[\s\S]+?)[ ,\s]{0,}主[\s]{0,}治[\s]{0,}医[\s]{0,}生"
)
_RESULT2_EXAMINER_RE = re.compile(
    r"(本次[\s\S]+?)[ ,\s]{0,}检[\s]{0,}验[\s]{0,}者"
)
_UP_TO_LAST_PERIOD_RE = re.compile(r"(.+。)")
_WHITESPACE_RE = re.compile(r"\s+")


def _extract_results(text):
    """Return the two free-text diagnosis fields found in *text*.

    Args:
        text: Text of the first page of a report.

    Returns:
        A dict with the keys ``"result1"`` and ``"result2"``.

    Raises:
        ValueError: If either field cannot be located in *text*.
    """
    # Diagnosis result 1: the text following "结果"; the fallback pattern
    # accepts a result that runs up to the first full stop.
    match = _RESULT1_RE.search(text) or _RESULT1_FALLBACK_RE.search(text)
    if match is None:
        raise ValueError("no '结果' section found in the page text")
    result1 = match.group(1)

    # Diagnosis result 2: the paragraph starting with "本次", ending before
    # the "主治医生" signature or, failing that, before "检验者".
    match = _RESULT2_DOCTOR_RE.search(text)
    if match is not None:
        result2 = _WHITESPACE_RE.sub("", match.group(1))
    else:
        match = _RESULT2_EXAMINER_RE.search(text)
        if match is None:
            raise ValueError("no '本次...' paragraph found in the page text")
        compact = _WHITESPACE_RE.sub("", match.group(1))
        # Keep only the text up to the last full stop.
        trimmed = _UP_TO_LAST_PERIOD_RE.search(compact)
        if trimmed is None:
            raise ValueError("'本次...' paragraph contains no full stop")
        result2 = trimmed.group(1)

    return {"result1": result1, "result2": result2}


def pdf_to_excel(file, file_id=None):
    """Extract one report PDF into a single-row DataFrame.

    The first page's text supplies the ``result1`` and ``result2`` columns.
    The tables are stacked and pivoted so that every value of the "英文缩写"
    column becomes a column holding the matching "检测结果" value.

    Args:
        file: Path of the PDF report.
        file_id: Value stored in the ``id`` column. Defaults to the file name
            without its extension.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``id``, ``result1``,
        ``result2`` followed by one column per table row.

    Raises:
        ValueError: If the diagnosis text or the tables cannot be found.
        KeyError: If the tables lack a "检测结果" or "英文缩写" column.
    """
    if file_id is None:
        file_id = os.path.splitext(os.path.basename(file))[0]

    fields = {"id": file_id}
    page_tables = []
    # The context manager closes the PDF even when parsing raises.
    with pdfplumber.open(file) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            if page_number == 1:
                # extract_text() may yield nothing for an image-only page.
                fields.update(_extract_results(page.extract_text() or ""))
            # NOTE(review): tables are collected from every page, assuming a
            # long table may continue past page one -- confirm.
            for raw_table in page.extract_tables():
                if raw_table:
                    # The first row of each extracted table is its header.
                    page_tables.append(
                        pd.DataFrame(raw_table[1:], columns=raw_table[0])
                    )

    if not page_tables:
        raise ValueError("no table found in {}".format(file))

    table = pd.concat(page_tables, axis=0).reset_index(drop=True)
    measurements = pd.DataFrame(
        table["检测结果"].values, index=pd.Index(table["英文缩写"].values)
    )
    # Both frames have the single row label 0, so they line up side by side.
    return pd.concat([pd.DataFrame([fields]), measurements.T], axis=1)


# Collect every PDF file below the root folder.
xuejian_list = []     # full paths of the reports
xuejian_id_list = []  # report IDs (file names without extension), same order
for root, _dirs, files in os.walk(PDF_ROOT_DIR):
    for file1 in files:
        if file1.lower().endswith(".pdf"):
            xuejian_id_list.append(os.path.splitext(file1)[0])
            xuejian_list.append(os.path.join(root, file1))
num = len(xuejian_list)

xuejian_path = pd.DataFrame(None)  # rows extracted so far, one per report
false_file = []                    # reports that could not be processed
for pdf_path, pdf_id in tqdm(zip(xuejian_list, xuejian_id_list), total=num):
    try:
        pdf_data = pdf_to_excel(pdf_path, pdf_id)
        # Merge inside the try so that a report whose columns cannot be
        # aligned is recorded as failed instead of aborting the whole run.
        if not xuejian_path.empty:
            pdf_data = pd.concat([xuejian_path, pdf_data], axis=0)
        xuejian_path = pdf_data
    except Exception:
        false_file.append(pdf_path)

pdf_data1 = xuejian_path.reset_index(drop=True)
pdf_data1.to_csv(RESULT_CSV_PATH, encoding="gbk", index=False)
false_file1 = pd.DataFrame(false_file, columns=["file_name"])
false_file1.to_csv(FAILED_CSV_PATH, encoding="gbk", index=False)
print("总pdf文件数:{}".format(num))
print("已提取的pdf文件数:{}".format(len(pdf_data1)))