#沸你不可#
队名:LOL玩多了起了个无畏竞巅峰
队员:@山花 @盐加泥鸭 @Asiawyz
今日分享:
如何将数据量很大的 CSV 文件转换为 Parquet 格式的文件


import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq


def write_parquet_file(
    csv_file: str = '/home/cloudam/mcule.csv',
    parquet_file: str = '/home/cloudam/mcule.parquet',
    chunksize: int = 100_000,
) -> None:
    """Convert a large tab-separated CSV file to Parquet, one chunk at a time.

    The CSV is streamed in chunks of ``chunksize`` rows so the whole file
    never has to fit in memory. The Parquet schema is inferred from the
    first chunk and every later chunk is converted using that same schema.

    Args:
        csv_file: Path of the tab-separated input file.
        parquet_file: Path of the Parquet file to create.
        chunksize: Number of CSV rows read and written per chunk.

    Note:
        If the CSV yields no chunks, no Parquet file is written.
        NOTE(review): a later chunk whose data does not fit the schema
        inferred from the first chunk may fail to convert -- confirm against
        real data, or pass explicit dtypes to ``read_csv``.
    """
    csv_stream = pd.read_csv(csv_file, sep='\t', chunksize=chunksize, low_memory=False)

    parquet_schema = None
    # Stays None until the first chunk arrives, so the cleanup below is safe
    # even when the CSV produces no chunks at all.
    parquet_writer = None
    try:
        for i, chunk in enumerate(csv_stream):
            print("Chunk", i)
            if parquet_writer is None:
                # Guess the schema of the CSV file from the first chunk
                parquet_schema = pa.Table.from_pandas(df=chunk).schema
                # Open a Parquet file for writing
                parquet_writer = pq.ParquetWriter(
                    parquet_file, parquet_schema, compression='snappy'
                )
            # Write CSV chunk to the parquet file
            table = pa.Table.from_pandas(chunk, schema=parquet_schema)
            parquet_writer.write_table(table)
    finally:
        # Close the writer even if a chunk fails mid-way, so the output
        # file handle is not leaked.
        if parquet_writer is not None:
            parquet_writer.close()
        csv_stream.close()


# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    write_parquet_file()
展开
评论