ORC, Parquet file operations in Python

table of contents

ORC file operation with pyorc
Parquet file operation with pyarrow and pandas

1. ORC file operation with pyorc

First install pyorc to work with ORC files in Python

pip install pyorc

Official site https://pypi.org/project/pyorc/

The sample program below (1) reads data from a CSV file and converts it to ORC, then (2) reads the ORC file created in step 1 and converts it back to CSV. First, prepare a CSV file like the following and name it source.csv. Enclose string values in double quotes (").

1,"Amazon","AWS",3.2
2,"MicroSoft","Azure",0.142
3,"Google","GCP",10.0
4,"SaleForce","SalesCloud",2.5
5,"Git","GitHub",0.342

import pyorc
import glob
import re

"""
Writing process: convert ./source.csv into ./target.orc
"""
# Parse the CSV completely before creating the ORC file, so that a missing or
# malformed source does not leave an empty target.orc behind.
with open("./source.csv", "r") as source:
    # Strip the trailing "\n" of every line and ignore blank lines, which
    # would otherwise become empty records.
    lines = [stripped for stripped in (raw.strip() for raw in source) if stripped]

records = []
column_types = []
# NOTE: a plain split(",") does not understand commas inside quoted strings;
# such values are outside what this sample supports.
for row_number, line in enumerate(lines):
    # Decide "first row" by position, not by comparing the line text: a later
    # row with the same text as the first must not be treated as the first
    # row again.
    is_first_row = row_number == 0
    record = []
    for column in line.split(","):
        if re.match(r'^".*"$', column):
            # Quoted value -> string column
            value, orc_type = column.strip('"'), "string"
        elif re.match(r'^-?\d+\.\d+$', column):
            # Decimal number -> double column
            value, orc_type = float(column), "double"
        elif re.match(r'^-?\d+$', column):
            # Whole number -> int column
            value, orc_type = int(column), "int"
        else:
            # Anything else (empty or unquoted text) is kept as text so the
            # value stays in its column position instead of being dropped.
            value, orc_type = column, "string"
        record.append(value)
        # The ORC column types are inferred from the first row only.
        if is_first_row:
            column_types.append(orc_type)

    # One record is packed as a tuple
    records.append(tuple(record))

print(records)

if not records:
    raise ValueError("./source.csv contains no data rows")

# Build the ORC schema string, e.g. struct<col0:int,col1:string,...>
header_name = "struct<" + ",".join(
    f"col{i}:{orc_type}" for i, orc_type in enumerate(column_types)
) + ">"

with open("./target.orc", "wb") as data:
    # pyorc.Writer takes the output file object as its first argument and the
    # schema string as its second argument.
    with pyorc.Writer(data, header_name) as writer:
        for record in records:
            writer.write(record)

"""
Reading process: dump ./target.orc into ./target.csv
"""
with open("./target.orc", "rb") as orc_file:
    # Iterating the reader yields the stored rows (values only)
    reader = pyorc.Reader(orc_file)
    # Column descriptions from the ORC schema, indexed by column name
    columns = reader.schema.fields

    # Show each column name followed by the kind of its type
    for name in columns:
        print(name)
        print(columns[name].kind)

    with open("./target.csv", "w") as csv_file:
        # One comma-separated text line per ORC row; no header line is written
        # and the last line has no trailing newline.
        csv_file.write(
            "\n".join(",".join(str(value) for value in row) for row in reader)
        )

2. Parquet file operation with pyarrow and pandas

Library installation

pip install pandas pyarrow

Save the following script in the same directory as the target.csv file produced in section 1, then run it.

#-*- encoding:utf-8 -*-
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd


"""
CSV to Parquet
"""
# CSV -> DataFrame
# target.csv is written without a header line, so tell pandas not to consume
# the first data row as column names.
df = pd.read_csv("./target.csv", header=None)
# Give the columns string names (col0, col1, ...), the same naming used for
# the ORC schema in section 1.
df.columns = [f"col{i}" for i in range(len(df.columns))]

# DataFrame -> Arrow Table
table = pa.Table.from_pandas(df)

# Arrow Table -> Parquet
pq.write_table(table, "target.pq")

"""
Parquet to CSV
"""
# Parquet -> Arrow Table
table2 = pq.read_table("target.pq")

# Arrow Table -> DataFrame
# Convert the table that was just read back from the Parquet file (table2),
# not the in-memory table created above.
df2 = table2.to_pandas()

# DataFrame -> CSV
# index=False / header=False keep target2.csv in the same shape as target.csv
# (no index column, no header line). to_csv returns None when given a path,
# so the result is not assigned.
df2.to_csv("target2.csv", index=False, header=False)