Saving EOD Stock Data Using Apache Arrow: Examples

A few examples of working with Apache Arrow and Python

# examples of apache arrow
# create data folder to download stock data to
import os
cur_dir = os.getcwd()
data_dir = os.path.join(cur_dir, 'data')
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
# pip install pandas
import pandas as pd
# conda install arrow-cpp=0.9.* -c conda-forge
# conda install pyarrow=0.9.* -c conda-forge
# or
# https://www.lfd.uci.edu/~gohlke/pythonlibs/#pyarrow
# download
# pip install pyarrow-0.4.1-cp36-cp36m-win_amd64.whl
# pip install -U pyarrow
import pyarrow as pa
import pyarrow.parquet as pq
ticker = 'aapl'
eod_freq = 'daily'
eod_start_date = '20120517'
eod_end_date = '20180517'
eod_url = r'https://storage.googleapis.com/shared-data-29319031/aapl_eod_data.csv'
df_eod = pd.read_csv(eod_url)
eod_filepath = os.path.join(data_dir, f'eod_{ticker}_{eod_start_date}_{eod_end_date}_{eod_freq}.csv')
df_eod.to_csv(eod_filepath, index=False)  # index=False avoids writing an extra unnamed index column
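# quick sanity check on the downloaded data; column names depend on the
# source CSV, so only generic inspection is done here
print(df_eod.shape)
print(df_eod.head())
print(df_eod.dtypes)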
# Convert DataFrame to Apache Arrow Table
eod_table = pa.Table.from_pandas(df_eod)
# Parquet uncompressed
pq.write_table(eod_table, eod_filepath.replace(".csv",".pq"), compression='NONE')
# Parquet with Snappy compression
pq.write_table(eod_table, eod_filepath.replace(".csv",".snappy.pq"))
# Parquet with GZIP compression
pq.write_table(eod_table, eod_filepath.replace(".csv",".gzip.pq"), compression='GZIP')
# Parquet with Brotli compression
pq.write_table(eod_table, eod_filepath.replace(".csv",".brotli.pq"), compression='BROTLI')
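# compare the on-disk size of each Parquet variant written above; a quick
# sketch using os.path.getsize
for ext in (".pq", ".snappy.pq", ".gzip.pq", ".brotli.pq"):
    pq_path = eod_filepath.replace(".csv", ext)
    print(ext, os.path.getsize(pq_path), "bytes")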
# read Parquet File to dataframe
new_eod_table = pq.read_table(eod_filepath.replace(".csv",".brotli.pq"))
# Convert back to pandas
df_new_eod = new_eod_table.to_pandas()
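# sanity-check the Parquet round trip; a minimal sketch, since exact dtype
# and index handling can vary across pandas/pyarrow versions
assert df_new_eod.shape == df_eod.shape
pd.testing.assert_frame_equal(df_eod, df_new_eod, check_dtype=False)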
# Feather (the read/write functions live in the pyarrow.feather module)
import pyarrow.feather as feather
feather_filepath = eod_filepath.replace(".csv", ".f")
feather.write_feather(df_eod, feather_filepath)
fdf = feather.read_feather(feather_filepath)
# multicore read (newer pyarrow versions replace nthreads with use_threads=True)
fdf = feather.read_feather(feather_filepath, nthreads=4)
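# rough timing sketch for single- vs multi-threaded Feather reads; the
# actual speedup depends on file size, disk, and core count
import time
t0 = time.time()
feather.read_feather(feather_filepath, nthreads=1)
t1 = time.time()
feather.read_feather(feather_filepath, nthreads=4)
t2 = time.time()
print(f'nthreads=1: {t1 - t0:.4f}s, nthreads=4: {t2 - t1:.4f}s')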
# convert the pandas DataFrame back to an Arrow Table and inspect its schema
new_table = pa.Table.from_pandas(df_eod)
print(new_table.schema)
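# the schema exposes the column names, which can drive a column-selective
# Parquet read; a sketch that reads just the first column back
first_col = new_table.schema.names[0]
subset = pq.read_table(eod_filepath.replace(".csv", ".brotli.pq"), columns=[first_col])
print(subset.to_pandas().head())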
