Python pandas 多线程(协程)写入 Excel
不知道如何优化,写入excel文件依然很慢.
# -*- coding: utf-8 -*-
#
# python 3.6.7
# Description: Write a pandas DataFrame to one Excel sheet in row slices,
#              one slice per worker thread, each slice driven by a coroutine,
#              and print how long each phase takes.
# Author: zzq
# Date: 2020/4/27
import asyncio
import math
from datetime import datetime

import pandas as pd


def getdata(rows: int, cols: int) -> pd.DataFrame:
    """Build a ``rows`` x ``cols`` DataFrame of placeholder strings.

    The cell at row ``r``, column ``c`` holds the text ``"Row <r>Col <c>"``.

    :param rows: number of rows to generate
    :param cols: number of columns to generate
    :return: the generated DataFrame
    """
    cells = [
        ["Row %sCol %s" % (row_idx, col_idx) for col_idx in range(cols)]
        for row_idx in range(rows)
    ]
    return pd.DataFrame(cells)


async def do_work_one(name: str, data: pd.DataFrame, start_row: int,
                      writer: pd.ExcelWriter) -> None:
    """Coroutine that writes one slice of the data through ``writer``.

    The body contains no ``await``, so the write runs synchronously inside
    the calling thread's event loop.

    :param name: label of the worker, only used in the progress output
    :param data: the slice of rows to write
    :param start_row: row offset of this slice within the full data set
    :param writer: shared Excel writer that receives the slice
    """
    if start_row == 0:
        # The first slice also writes the header row (sheet row 0).
        data.to_excel(writer, startrow=start_row)
    else:
        # Later slices are shifted down by one row to leave room for the
        # header written by the first slice.
        data.to_excel(writer, startrow=start_row + 1, header=False)
    print(" %s do_work_one" % name, end="")


def task_do_work(name: str, data: pd.DataFrame, start_row: int,
                 data_size: int, writer: pd.ExcelWriter) -> pd.ExcelWriter:
    """Thread entry point: write rows ``start_row .. start_row + data_size``.

    A fresh event loop is created for the calling thread, used to run
    :func:`do_work_one` for this slice, and closed again afterwards.

    :param name: label of the worker thread
    :param data: the complete data set
    :param start_row: first row of the slice handled by this call
    :param data_size: maximum number of rows in the slice
    :param writer: shared Excel writer
    :return: the same ``writer`` that was passed in
    """
    # NOTE(review): ``writer`` is shared by all worker threads without any
    # locking; whether concurrent ``to_excel`` calls on one ExcelWriter are
    # safe depends on pandas and the Excel engine in use - confirm.
    curr_data = data.iloc[start_row:start_row + data_size, ]
    loop = asyncio.new_event_loop()
    try:
        # Run the coroutine directly: passing bare coroutines to
        # asyncio.wait() is rejected by newer Python versions, and running it
        # this way lets an exception from the write reach the caller instead
        # of being left unretrieved on a task.
        loop.run_until_complete(do_work_one(name, curr_data, start_row, writer))
    finally:
        loop.close()
    return writer


def call_back(param, result):
    """No-op placeholder for a thread-pool callback; it is not registered."""
    pass


def main() -> None:
    """Generate the sample data, write it with a thread pool, report timings."""
    # Third-party dependency that only the script entry point needs; importing
    # it here keeps the helper functions importable without it.
    import threadpool

    t0 = datetime.now()
    print("start: %s" % t0)

    num = 10
    pool = threadpool.ThreadPool(num)

    data = getdata(rows=12800, cols=50)
    t1 = datetime.now()
    print("数据生成: %s" % t1)
    print("生成数据总计耗时 %s s " % (t1 - t0))

    writer = pd.ExcelWriter("text.xlsx")
    # Split the rows into ``num`` slices of (at most) equal size.
    data_size = math.ceil(data.shape[0] / num)
    work_requests = []
    start_row = 0
    for i in range(num):
        work_requests.append(threadpool.WorkRequest(
            task_do_work,
            args=("线程-{0}".format(i), data, start_row, data_size, writer)))
        start_row += data_size
    for req in work_requests:
        pool.putRequest(req)
    pool.wait()
    print("")

    end = datetime.now()
    print("线程运行完成: %s" % end)
    print("线程运行耗时: %s" % (end - t1))

    # close() saves the workbook to disk; it is timed separately from the
    # thread phase above.
    writer.close()
    save_end = datetime.now()
    print("保存文件: %s" % save_end)
    print("保存文件耗时: %s" % (save_end - end))
    print("总计耗时 %s s" % (datetime.now() - t0))


if __name__ == "__main__":
    main()
上一篇:
通过多线程提高代码的执行效率例子
下一篇:
如何解决线程不安全问题呢?