A few examples of working with Apache Arrow and Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# examples of apache arrow

# Create a local ./data folder to download stock data into.
import os

cur_dir = os.getcwd()
data_dir = os.path.join(cur_dir, 'data')
# Bug fix: the original called os.makedirs('data'), which resolves relative
# to the current working directory and ignores the computed data_dir path.
# exist_ok=True also removes the racy exists()-then-create pattern.
os.makedirs(data_dir, exist_ok=True)
# pip install pandas | |
import pandas as pd | |
# conda install arrow-cpp=0.9.* -c conda-forge | |
# conda install pyarrow=0.9.* -c conda-forge | |
# or | |
# https://www.lfd.uci.edu/~gohlke/pythonlibs/#pyarrow | |
# download | |
# pip install pyarrow-0.4.1-cp36-cp36m-win_amd64.whl
# pip install -U pyarrow | |
import pyarrow as pa | |
import pyarrow.parquet as pq | |
# Parameters describing the end-of-day (EOD) dataset being fetched.
ticker = 'aapl'
eod_freq = 'daily'
eod_start_date = '20120517'
eod_end_date = '20180517'
eod_url = r'https://storage.googleapis.com/shared-data-29319031/aapl_eod_data.csv'

# Download the EOD price data into a DataFrame.
df_eod = pd.read_csv(eod_url)

# Save a local copy named after the ticker / date range / frequency.
eod_filepath = os.path.join(data_dir, f'eod_{ticker}_{eod_start_date}_{eod_end_date}_{eod_freq}.csv')
# Bug fix: index=False — the default writes the RangeIndex as an unnamed
# column, which shows up as 'Unnamed: 0' when the CSV is read back.
df_eod.to_csv(eod_filepath, index=False)
# Convert the DataFrame into an Apache Arrow Table.
eod_table = pa.Table.from_pandas(df_eod)

# Write the same table with several Parquet compression codecs so the
# resulting file sizes can be compared. 'SNAPPY' is pyarrow's default
# codec, so the .snappy.pq file matches a write with no compression arg.
codec_by_suffix = {
    '.pq': 'NONE',          # uncompressed
    '.snappy.pq': 'SNAPPY', # Snappy compression
    '.gzip.pq': 'GZIP',     # GZIP compression
    '.brotli.pq': 'BROTLI', # Brotli compression
}
for suffix, codec in codec_by_suffix.items():
    pq.write_table(eod_table, eod_filepath.replace(".csv", suffix), compression=codec)

# Read one of the Parquet files back into an Arrow Table...
new_eod_table = pq.read_table(eod_filepath.replace(".csv", ".brotli.pq"))
# ...and convert it back to a pandas DataFrame.
df_new_eod = new_eod_table.to_pandas()
# Feather: a fast on-disk format for DataFrames.
# Bug fix: write_feather/read_feather are NOT attributes of the top-level
# pyarrow namespace — they live in the pyarrow.feather submodule, so the
# original pa.write_feather / pa.read_feather calls raise AttributeError.
from pyarrow import feather

feather_filepath = eod_filepath.replace(".csv", ".f")
feather.write_feather(df_eod, feather_filepath)
fdf = feather.read_feather(feather_filepath)
# multicore read — use_threads replaces the long-removed nthreads parameter
fdf = feather.read_feather(feather_filepath, use_threads=True)

# from pandas: build an Arrow Table and inspect its schema
new_fdf = pa.Table.from_pandas(df_eod)
new_fdf.schema