# Spaces:
# Running
# Running
# Build a compact parquet file (title, author, year) from the
# PleIAs/French-PD-Newspapers parquet shards using the DuckDB CLI.
#
# Columns listed for the source dataset:
#   file_id, ocr, title, date, author, page_count, word_count, character_count
#
# Output:
#   - stdout is a terminal: print the output path and a sample query command.
#   - stdout is redirected: stream the parquet bytes to stdout, then delete
#     the temporary file.

# Abort on the first failing command or on use of an unset variable, so a
# failed DuckDB run never falls through to cat/rm on a missing file.
set -eu

# Fall back to /tmp when TMPDIR is unset; otherwise the file would land in "/".
out="${TMPDIR:-/tmp}/presse.parquet"

# The heredoc delimiter is unquoted on purpose: the shell expands $out inside
# the SQL text. -bail stops DuckDB at the first SQL error instead of running
# the COPY after a failed CREATE. Any CLI output is sent to stderr so it can
# never be mixed into the parquet byte stream written to stdout below.
duckdb -bail <<SQL >&2
CREATE TABLE presse AS (
    SELECT title
         , author
         -- Take the first 1xxx year found in the date text and turn it into
         -- the DATE of January 1st of that year. LPAD pads the text to 10
         -- characters with zeros; presumably this makes rows without a match
         -- become 0000-01-01 (NOTE(review): confirm against the data).
         , LPAD((REGEXP_EXTRACT(date, '1[0-9][0-9][0-9]') || '-01-01'), 10, '0')::DATE AS year
    FROM read_parquet(
        -- List comprehension over shard numbers 1..320 (range end is exclusive).
        [('https://huggingface.co/datasets/PleIAs/French-PD-Newspapers/resolve/main/gallica_presse_{:d}.parquet').format(n) for n in range(1, 321)])
);
COPY presse TO '$out' (FORMAT 'parquet', COMPRESSION 'GZIP');
SQL

# Guard against a run that exited cleanly but produced no output file.
if [ ! -s "$out" ]; then
    echo "error: expected parquet output at $out was not created" >&2
    exit 1
fi

# [ -t 1 ] is true when stdout is a terminal (isatty).
if [ -t 1 ]; then
    echo "parquet file output at: $out"
    echo "duckdb -csv :memory: \"SELECT * FROM '$out'\""
else
    # stdout is a pipe or file: emit the parquet bytes and clean up.
    cat -- "$out"
    rm -f -- "$out"
fi