Spaces:
Running
Running
File size: 713 Bytes
1be7f91 7da72ab 1be7f91 6d24261 7da72ab 6d24261 7da72ab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
# Presumably the columns of the source dataset (TODO confirm against the dataset
# card); only title, author and date are read by the query below:
# file_id, ocr, title, date, author, page_count, word_count, character_count

# Fall back to /tmp when TMPDIR is unset or empty so the output never resolves
# to "/presse.parquet".
# NOTE: this path is interpolated into the SQL inside single quotes, so a TMPDIR
# containing a single quote would break the COPY statement.
out_file="${TMPDIR:-/tmp}/presse.parquet"

# Build the `presse` table from the 320 remote parquet shards (n = 1..320, the
# upper bound of range() is exclusive) and export it as a GZIP-compressed
# parquet file.
# The `year` column is a DATE: the first 1xxx four-digit match in `date` is
# suffixed with '-01-01', left-padded with '0' to 10 characters and cast.
# NOTE(review): the padding presumably turns a non-matching date into
# '0000-01-01' rather than failing the cast -- verify that this is intended.
# -bail makes duckdb stop at the first failing statement, and the exit status
# is checked so that a failed run never falls through to the output step.
if ! duckdb -bail <<SQL
CREATE TABLE presse AS (
    SELECT title
         , author
         , LPAD((REGEXP_EXTRACT(date, '1[0-9][0-9][0-9]') || '-01-01'), 10, '0')::DATE AS year
    FROM read_parquet(
        [('https://huggingface.co/datasets/PleIAs/French-PD-Newspapers/resolve/main/gallica_presse_{:d}.parquet').format(n) for n in range(1, 321)])
);
COPY presse TO '$out_file' (FORMAT 'parquet', COMPRESSION 'GZIP');
SQL
then
    echo "duckdb failed; no parquet output produced" >&2
    exit 1
fi

# When stdout is a terminal, keep the file and print where it is plus a sample
# command to query it; otherwise stream the parquet bytes to stdout and remove
# the temporary file.
if [ -t 1 ]; then
    echo "parquet file output at: $out_file"
    echo "duckdb -csv :memory: \"SELECT * FROM '$out_file'\""
else
    cat -- "$out_file"
    rm -- "$out_file"
fi
|