自动识别csv文件编码并极速导入Postgresql

1,156 阅读1分钟

好不容易搜到的方法,先直接上代码:

import io
import urllib
import pandas as pd
from sqlalchemy import create_engine
import getfile_encoding as gcode
import datetime
#可将csv文件快速导入pg数据库,实测79万条9秒
def getFileChar(filename):
    f=open(filename,'rb')
    data=f.read(200)
    f.close()
    #print(chardet.detect(data))
    #print(chardet.detect(data).get('encoding'))
    return chardet.detect(data).get('encoding')
def write_to_table(df, table_name, if_exists='fail'):#append
    pwd='qais@123'
    conn_str="postgresql://qais:%s@localhost/db_qais"%urllib.parse.quote_plus(pwd)#?charset=utf8
    db_engine = create_engine(conn_str,encoding='utf-8',echo=False)#
    
    string_data_io = io.StringIO()
    df.to_csv(string_data_io, sep='|', index=False)
    pd_sql_engine = pd.io.sql.pandasSQL_builder(db_engine)
    table = pd.io.sql.SQLTable(table_name, pd_sql_engine, frame=df,
                               index=False, if_exists=if_exists,schema = 'public')#goods_code
    table.create()
    string_data_io.seek(0)
    #string_data_io.readline()  # remove header
    with db_engine.connect() as connection:
        with connection.connection.cursor() as cursor:
            copy_cmd = "COPY public.%s FROM STDIN HEADER DELIMITER '|' CSV" %table_name#goods_code
            cursor.copy_expert(copy_cmd, string_data_io)
        connection.connection.commit()
#版权声明:本文为CSDN博主「仙人掌_lz」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。
#原文链接:https://blog.csdn.net/qq_36603091/java/article/details/79587971
if __name__=='__main__':
    filename='D:/vs/tmp/数据2020-218new.csv'# test2020-218new.csv
    ecding=gcode.getFileChar(filename)
    df=pd.read_csv(filename,encoding=ecding)
    print(datetime.datetime.now())
    write_to_table(df,'testinfo','append')
    print(datetime.datetime.now())
    

如采用如下方式,速度极慢,还特别耗内存:

filename='D:/vs/tmp/test2020-218new.csv'
    ecding=gcode.getFileChar(filename)
    #engine=create_engine(**pg_config)
    #df=pd.read_csv('D:/vs/tmp/test2020-218new.csv',encoding='latin-1')
    #df=pd.read_csv('D:/vs/tmp/test2020-218new.csv',encoding='utf-8')
    df=pd.read_csv(filename,encoding=ecding)
    #print(df)
    print(datetime.datetime.now())
    pwd='qais@123'
    #pwd_new=parse.quote_plus(pwd)
    conn_str="postgresql://qais:%s@localhost/db_qais"%urllib.parse.quote_plus(pwd)#?charset=utf8
    engine=create_engine(conn_str,encoding='utf-8',echo=False)#
    
    #此方式太慢、占内存,用上面方法中的函数实现非常快

    pd.io.sql.to_sql(df,'custominfo',engine,if_exists='append',index=False,index_label=None)#,method='multi',index="False"
    engine.dispose()
    print(datetime.datetime.now())