# fpdn/docs/data/presse.parquet.sh
# Revision 7da72ab ("init"), 756 bytes.
# Build one GZIP-compressed Parquet file from the 320 gallica_presse_<n>.parquet
# shards of the PleIAs/French-PD-Newspapers dataset, using DuckDB, keeping only
# the columns listed in the SELECT below and converting `date` to DATE.
#
# Output:
#   - stdout is a terminal: the result is kept at <tmp>/presse.parquet and a
#     sample query command is printed.
#   - otherwise: the Parquet bytes are written to stdout and the file is removed.
#
# Environment:
#   TMPDIR  directory for the intermediate files; /tmp when unset or empty.
#
# Requires `duckdb` on PATH; the shards are read over HTTPS.
# Only POSIX sh constructs are used, since no interpreter is pinned by the
# visible script.
# NOTE(review): the file name suggests a data loader whose stdout is captured
# by a build tool — confirm with the caller.

set -eu

shard_count=320
base_url='https://huggingface.co/datasets/PleIAs/French-PD-Newspapers/resolve/main'

work_dir=${TMPDIR:-/tmp}
sql_file="$work_dir/presse.sql"
out_file="$work_dir/presse.parquet"
keep_output=0

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Remove intermediate files; the Parquet file survives only when the
# interactive branch below asked for it.
cleanup() {
  rm -f -- "$sql_file"
  if [ "$keep_output" -ne 1 ]; then
    rm -f -- "$out_file"
  fi
}
trap cleanup EXIT
# Turn signals into a normal exit so the EXIT trap above still runs.
trap 'exit 1' HUP INT TERM

command -v duckdb >/dev/null 2>&1 || die "duckdb not found on PATH"
[ -d "$work_dir" ] || die "temporary directory does not exist: $work_dir"

# Drop any result of a previous run so a failed build can never emit stale data.
rm -f -- "$out_file"

# The output path is embedded in a SQL string literal: double any single quote.
sql_out=$(printf '%s' "$out_file" | sed "s/'/''/g")

{
  printf '%s\n' \
    'CREATE TABLE presse AS (' \
    '  SELECT file_id, ocr, title, date, author, page_count, word_count, character_count' \
    '  FROM read_parquet(['
  i=1
  while [ "$i" -le "$shard_count" ]; do
    # Comma after every entry except the last one.
    if [ "$i" -lt "$shard_count" ]; then sep=','; else sep=''; fi
    printf "    '%s/gallica_presse_%d.parquet'%s\n" "$base_url" "$i" "$sep"
    i=$((i + 1))
  done
  printf '%s\n' \
    '  ])' \
    ');' \
    'ALTER TABLE presse ALTER COLUMN date TYPE DATE;'
  printf "COPY presse TO '%s' (FORMAT 'parquet', COMPRESSION 'GZIP');\n" "$sql_out"
} > "$sql_file"

# -bail stops at the first failing statement, so a failed ALTER cannot be
# followed by a COPY of the unconverted table. Anything duckdb prints goes to
# stderr so it cannot mix with the Parquet bytes written to stdout below.
duckdb -bail < "$sql_file" >&2 || die "duckdb failed to build $out_file"
[ -s "$out_file" ] || die "duckdb produced no output at $out_file"

# isatty: keep the file for a human, stream it for a pipe or redirect.
if [ -t 1 ]; then
  keep_output=1
  printf 'parquet file output at: %s\n' "$out_file"
  printf '%s\n' "duckdb -csv :memory: \"SELECT * FROM '$out_file'\""
else
  cat -- "$out_file"
fi