table of contents
First install pyorc to work with ORC files in Python
pip install pyorc
Official site https://pypi.org/project/pyorc/
The sample program below does two things: (1) it reads data from a CSV file and converts it to ORC, and (2) it reads the ORC file created in step 1 and converts it back to CSV. First, prepare a CSV file like the following and name it source.csv. Enclose every string value in double quotes (").
1,"Amazon","AWS",3.2
2,"MicroSoft","Azure",0.142
3,"Google","GCP",10.0
4,"SaleForce","SalesCloud",2.5
5,"Git","GitHub",0.342
import csv
import glob
import re

import pyorc
# ---------------------------------------------------------------------------
# Writing process: convert ./source.csv into ./target.orc
#
# Every cell is typed by its textual form: a double-quoted cell is a string,
# digits with a decimal point are a double, a bare run of digits is an int.
# The types found in the FIRST row define the ORC schema, with columns named
# col0, col1, ... in order.
# ---------------------------------------------------------------------------

# Compiled once instead of on every cell. Numbers may carry a leading sign.
_QUOTED_CELL = re.compile(r'^".*"$')
_DOUBLE_CELL = re.compile(r'^[+-]?\d+\.\d+$')
_INT_CELL = re.compile(r'^[+-]?\d+$')


def _parse_cell(cell):
    """Return ``(value, orc_type)`` for one raw CSV cell.

    ``orc_type`` is one of ``"string"``, ``"double"`` or ``"int"``.
    """
    if _QUOTED_CELL.match(cell):
        return cell.strip('"'), "string"
    if _DOUBLE_CELL.match(cell):
        return float(cell), "double"
    if _INT_CELL.match(cell):
        return int(cell), "int"
    # Anything else is kept verbatim as text. Dropping it (as a bare
    # if/elif chain without an else would) shifts every following value
    # one column to the left.
    return cell, "string"


# Read the source completely before opening the target, so that a missing or
# unreadable CSV does not leave a truncated target.orc behind.
with open("./source.csv", "r") as source:
    # Strip the line terminator and skip blank lines.
    lines = [text for text in (raw.strip() for raw in source) if text]

records = []
column_types = []
for row_number, line in enumerate(lines):
    # NOTE: plain split on "," - a quoted string that itself contains a
    # comma is not supported by this sample.
    parsed = [_parse_cell(cell) for cell in line.split(",")]
    # One record is packed as a tuple.
    records.append(tuple(value for value, _ in parsed))
    # Decide "first row" by position, not by comparing the row text: a later
    # row that happens to equal the first one must not touch the schema.
    if row_number == 0:
        column_types = [orc_type for _, orc_type in parsed]

if not records:
    # Without a first row there is no schema to build.
    raise ValueError("./source.csv contains no data rows")

# ORC schema string, e.g. "struct<col0:int,col1:string,...>".
header_name = "struct<{}>".format(
    ",".join(f"col{i}:{orc_type}" for i, orc_type in enumerate(column_types))
)
print(records)

with open("./target.orc", "wb") as data:
    # pyorc.Writer takes the binary file object first and the schema second.
    with pyorc.Writer(data, header_name) as writer:
        for record in records:
            writer.write(record)
# ---------------------------------------------------------------------------
# Reading process: dump ./target.orc back out as ./target.csv
# ---------------------------------------------------------------------------
with open("./target.orc", "rb") as data:
    # Iterating the reader yields the rows; column names live in the schema.
    reader = pyorc.Reader(data)
    # Schema fields, indexed by column name.
    columns = reader.schema.fields
    # Print each column name followed by the kind of its type.
    for column in columns:
        print(column)
        print(columns[column].kind)
    # newline="" lets the csv module control line endings itself.
    with open("./target.csv", "w", newline="") as f:
        # csv.writer quotes values containing commas, quotes or newlines,
        # which a plain ','.join() would write out as a corrupt row, and it
        # writes None as an empty field. Rows are streamed straight from the
        # reader instead of being collected in a list first. No header row
        # is written.
        csv.writer(f, lineterminator="\n").writerows(reader)
Library installation
pip install pandas pyarrow
Save the following script in the same directory as the target.csv file produced by step 1, then run it.
#-*- encoding:utf-8 -*-
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
# ---------------------------------------------------------------------------
# CSV to Parquet
# ---------------------------------------------------------------------------
# CSV -> DataFrame
# target.csv has no header row; without header=None pandas would consume the
# first data record as the column names.
df = pd.read_csv("./target.csv", header=None)
# Give the columns explicit string names (col0, col1, ...) rather than the
# integer labels pandas assigns by default.
df.columns = [f"col{i}" for i in range(df.shape[1])]
# DataFrame -> Arrow Table
table = pa.Table.from_pandas(df)
# Arrow Table -> Parquet
pq.write_table(table, "target.pq")

# ---------------------------------------------------------------------------
# Parquet to CSV
# ---------------------------------------------------------------------------
# Parquet -> Arrow Table
table2 = pq.read_table("target.pq")
# Arrow Table -> DataFrame
# Convert the table that was just read back from disk (table2), not the
# in-memory table built above, so the Parquet file is actually exercised.
df2 = table2.to_pandas()
# DataFrame -> CSV
# to_csv returns None when given a path, so the result is not bound to a name
# (the original bound it to `csv`, shadowing the standard-library module).
df2.to_csv("target2.csv")
Recommended Posts