Python 读取 PDF 文件并提取关键信息到 Excel 中

以前做过的一个项目,需要从 PDF 格式的检查报告中提取关键信息到 Excel 中。

import numpy as np
import pandas as pd
import re
import pdfplumber
import os
from tqdm import tqdm

# Print DataFrames without truncating rows or columns (None = no limit).
# The fully qualified keys are required: short patterns such as "max_row" or
# "max_columns" also match the "styler.render.*" options in recent pandas
# versions and raise OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Directory that is searched recursively for PDF reports.
# Placeholder path - point it at the real report folder before running.
PDF_ROOT = r"C:\Users\XX"

# IDs of the reports that should additionally be tracked as a separate subset.
# The script referenced this name without ever defining it; it is empty by
# default, so no file ends up in the subset until IDs are filled in here.
shunde_id_list = []


def collect_pdf_files(root_dir, tracked_ids=()):
    """Recursively collect the PDF files below *root_dir*.

    A file counts as a PDF when its extension is ".pdf" in any letter case.
    The report ID of a file is its name without that extension, so dots
    inside the name (for example "b.data.pdf") stay part of the ID.

    Args:
        root_dir: Directory to walk. A missing directory yields no files.
        tracked_ids: Iterable of report IDs to single out as a subset.

    Returns:
        A 4-tuple of lists ``(paths, ids, tracked_paths, tracked_hits)``:
        the full paths and IDs of every PDF found (index-aligned), and the
        paths and IDs of those whose ID appears in *tracked_ids*.
    """
    wanted = set(tracked_ids)  # built once for O(1) membership tests
    paths = []
    ids = []
    tracked_paths = []
    tracked_hits = []

    for folder, _subdirs, names in os.walk(root_dir):
        for name in names:
            stem, ext = os.path.splitext(name)
            if ext.lower() != ".pdf":
                continue
            path = os.path.join(folder, name)
            paths.append(path)
            ids.append(stem)
            if stem in wanted:
                tracked_paths.append(path)
                tracked_hits.append(stem)

    return paths, ids, tracked_paths, tracked_hits


# Collect every PDF file below the configured root directory.
xuejian_list, xuejian_id_list, shunde_file, shunde_id_file = collect_pdf_files(
    PDF_ROOT, shunde_id_list
)
num = len(xuejian_list)

# Header cells of the result tables that carry the data to export.
_ABBR_COLUMN = "英文缩写"
_RESULT_COLUMN = "检测结果"

# Patterns for the free-text summary on the first page. The character classes
# keep the literal comma that the original patterns contained.
# NOTE(review): the published source lost its quotes and backslashes, so the
# accepted colon variants (ASCII and full-width) are a reconstruction -
# confirm against real reports.
_RESULT1_PRIMARY = re.compile(r"结果[::,\s]+(.+?)。*[,\s]")
_RESULT1_FALLBACK = re.compile(r"结果[::,\s]*(.+?)。")
_RESULT2_DOCTOR = re.compile(r"(本次[\s\S]+?)[,\s]*主\s*治\s*医\s*生")
_RESULT2_EXAMINER = re.compile(r"(本次[\s\S]+?)[,\s]*检\s*验\s*者")
_UP_TO_LAST_STOP = re.compile(r"(.+。)")
_WHITESPACE = re.compile(r"\s+")


def _first_page_summary(text):
    """Parse the 'result1' and 'result2' free-text fields out of *text*."""
    # Diagnosis result 1: the text following the "结果" label.
    match = _RESULT1_PRIMARY.search(text) or _RESULT1_FALLBACK.search(text)
    if match is None:
        raise ValueError("no '结果' section found on the first page")
    result1 = match.group(1)

    # Diagnosis result 2: the paragraph starting with "本次". Its layout is
    # irregular, so it is delimited by whichever signature label follows it.
    match = _RESULT2_DOCTOR.search(text)
    if match is not None:
        # Remove all whitespace; the earlier str.replace('\s', '') call only
        # looked for a literal backslash-s and therefore removed nothing.
        result2 = _WHITESPACE.sub("", match.group(1))
    else:
        match = _RESULT2_EXAMINER.search(text)
        if match is None:
            raise ValueError("no '本次...' paragraph found on the first page")
        compact = _WHITESPACE.sub("", match.group(1))
        # Keep everything up to the last full stop of the paragraph.
        sentence = _UP_TO_LAST_STOP.search(compact)
        if sentence is None:
            raise ValueError("the '本次...' paragraph contains no full stop")
        result2 = sentence.group(1)

    return {"result1": result1, "result2": result2}


def _results_row(raw_tables):
    """Turn the extracted tables into one row keyed by abbreviation."""
    frames = []
    for rows in raw_tables:
        if not rows:
            continue
        header, body = rows[0], rows[1:]
        # Tables without both required header cells cannot contribute.
        if _ABBR_COLUMN not in header or _RESULT_COLUMN not in header:
            continue
        frame = pd.DataFrame(body, columns=header)
        frames.append(frame[[_ABBR_COLUMN, _RESULT_COLUMN]])
    if not frames:
        raise ValueError(
            "no table with the columns '{}' and '{}' found".format(
                _ABBR_COLUMN, _RESULT_COLUMN
            )
        )

    table = pd.concat(frames, axis=0, ignore_index=True)
    values = pd.Series(
        table[_RESULT_COLUMN].values, index=table[_ABBR_COLUMN].values
    )
    # Drop rows without an abbreviation and keep the first value of repeated
    # abbreviations, so the resulting column labels are unique and the rows
    # of different reports can be stacked later.
    keep = values.index.notna() & ~values.index.duplicated(keep="first")
    return values[keep].to_frame().T


def pdf_to_excel(file, report_id=None):
    """Extract the key information of one PDF report as a one-row DataFrame.

    The first page supplies two free-text fields ("result1", "result2");
    every table on every page that has the header cells "英文缩写" and
    "检测结果" supplies one output column per abbreviation.

    Args:
        file: Path of the PDF report.
        report_id: Value for the "id" column. Defaults to the file name
            without its extension, so the function no longer depends on a
            module-level loop index.

    Returns:
        A DataFrame with a single row (index 0): the columns "id",
        "result1" and "result2", followed by one column per abbreviation
        holding the matching test result.

    Raises:
        ValueError: If the PDF has no pages, a summary field cannot be
            located, or no usable result table is found.
    """
    if report_id is None:
        report_id = os.path.splitext(os.path.basename(file))[0]

    # The context manager closes the PDF even when extraction fails.
    with pdfplumber.open(file) as pdf:
        pages = pdf.pages
        if not pages:
            raise ValueError("PDF has no pages: {}".format(file))
        # extract_text() may yield None for a page without a text layer.
        first_page_text = pages[0].extract_text() or ""
        # Gather the tables of all pages; previously only the last table of
        # the last page survived because the accumulation guard was always
        # False and the loop variable was overwritten on each iteration.
        raw_tables = [
            table for page in pages for table in page.extract_tables()
        ]

    summary = {"id": report_id}
    summary.update(_first_page_summary(first_page_text))
    dist_new = pd.DataFrame([summary])

    # Merge the summary fields and the table values side by side.
    return pd.concat([dist_new, _results_row(raw_tables)], axis=1)

# Output locations. Both are placeholder paths - adjust before running.
OUTPUT_CSV = r"xx.csv"
FAILED_CSV = r"xx"

extracted = []   # one single-row DataFrame per successfully parsed report
false_file = []  # paths of the reports that could not be parsed

# NOTE: the index is deliberately kept as a module-level variable named ``x``;
# code called from this loop may look up ``xuejian_id_list[x]``.
for x, pdf_path in enumerate(tqdm(xuejian_list)):
    try:
        row = pdf_to_excel(pdf_path)
        # Duplicate column labels would make the final concat fail for the
        # whole batch, so such a report is treated as a failed file instead.
        if not row.columns.is_unique:
            raise ValueError("duplicate column names in extracted data")
        extracted.append(row)
    except Exception as exc:
        # Best-effort batch run: remember the file, report why it failed and
        # continue with the next one. tqdm.write keeps the progress bar intact.
        false_file.append(pdf_path)
        tqdm.write("failed to extract {}: {!r}".format(pdf_path, exc))

# Stack all rows in one step. The previous incremental concat never ran
# because ``str(df.empty) == False`` compares a string with a bool, so only
# the last parsed report reached the output file.
if extracted:
    pdf_data1 = pd.concat(extracted, axis=0).reset_index(drop=True)
else:
    pdf_data1 = pd.DataFrame()
pdf_data1.to_csv(OUTPUT_CSV, encoding="gbk", index=False)

false_file1 = pd.DataFrame(false_file, columns=["file_name"])
false_file1.to_csv(FAILED_CSV, encoding="gbk", index=False)

print("总pdf文件数:{}".format(len(xuejian_list)))
print("已提取的pdf文件数:{}".format(len(pdf_data1)))
经验分享 程序员 微信小程序 职场和发展