ola13 committed
Commit eff02e0 • Parent: a4a0058

read from s3

.ipynb_checkpoints/test-checkpoint.ipynb ADDED
@@ -0,0 +1,279 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "585da432",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/c4/20230404_102105_00007_t8w9z_3085d601-45f1-443a-b50d-8eb4812dd227\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/bigcode_python_code/20230404_102116_00007_ajvns_4e5b2899-8640-4a4c-b0cd-758662178176\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/bigcode_python_github_issues/20230404_102127_00022_yv77i_982f928f-1431-4ea7-986d-c5c5cb0f4a3f\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/bigcode_python_jupyter_markdowned_clean_dedup/20230404_102137_00026_vwcg7_3167c932-87a1-4fec-ad01-215831d0bf6e\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/books3/20230404_102143_00027_t4kwf_198fc997-b871-4e4a-b88e-3776f1cf92fe\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/gutenberg_raw/20230404_102215_00007_x3ntt_30873bfe-c94c-439a-96e2-71165570dc99\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/reddit_threaded/20230404_102241_00049_xj4uk_d7612f5a-5107-46e1-b710-47e7db95a7e6\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/enwiki_data/20230404_102246_00007_ye63c_57166ca6-f0d2-40ef-8ae7-ed4bc7ecd28d\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/s2orc_dedup/20230404_102252_00080_6ce5q_330e23f7-1270-4a52-b277-af823baf1de6\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/stackexchange2/20230404_102308_00031_qvnh6_cec28e17-f163-4a04-9fbe-dc617d9ea03e\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/commoncrawl/20230404_124237_00026_sin5w_c2e65b68-2449-47fa-be8b-a6e6e83611d0\n",
+       "Running on local URL: http://127.0.0.1:7860\n",
+       "\n",
+       "To create a public link, set `share=True` in `launch()`.\n"
+      ]
+     },
+     {
+      "data": {
+       "text/html": [
+        "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+       ],
+       "text/plain": [
+        "<IPython.core.display.HTML object>"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "import math\n",
+     "import os\n",
+     "import random\n",
+     "import uuid\n",
+     "from datetime import datetime\n",
+     "\n",
+     "import gradio as gr\n",
+     "import jsonlines\n",
+     "import pyarrow as pa\n",
+     "import s3fs\n",
+     "from datasets import Dataset\n",
+     "from huggingface_hub import HfApi\n",
+     "\n",
+     "S3 = s3fs.S3FileSystem(anon=False, key=os.getenv(\"AWS_ACCESS_KEY_ID\"), secret=os.getenv(\"AWS_SECRET_ACCESS_KEY\"))\n",
+     "\n",
+     "DEFAULT_SHUFFLE_BUFFER_SIZE_RATIO = 5\n",
+     "BASE_S3_DIR = \"s3://geclm-datasets/samples/\"\n",
+     "\n",
+     "DATASETS = [\n",
+     "    \"c4\",\n",
+     "    \"bigcode_python_code\",\n",
+     "    \"bigcode_python_github_issues\",\n",
+     "    \"bigcode_python_jupyter_markdowned_clean_dedup\",\n",
+     "    \"books3\",\n",
+     "    \"gutenberg_raw\",\n",
+     "    \"reddit_threaded\",\n",
+     "    \"enwiki_data\",\n",
+     "    \"s2orc_dedup\",\n",
+     "    \"stackexchange2\",\n",
+     "    \"commoncrawl\",\n",
+     "]\n",
+     "\n",
+     "\n",
+     "def get_parquet_lines(dataset, sample_size=100):\n",
+     "    s3_paths = S3.glob(BASE_S3_DIR + dataset + \"/*\")\n",
+     "\n",
+     "    if len(s3_paths) == 0:\n",
+     "        raise FileNotFoundError(f\"Nothing found at {BASE_S3_DIR + dataset}\")\n",
+     "\n",
+     "    print(\"Number of parquet files\", len(s3_paths))\n",
+     "    s3_path = random.choice(s3_paths)\n",
+     "    print(\"Reading\", s3_path)\n",
+     "    lines = []\n",
+     "\n",
+     "    with S3.open(s3_path) as f:\n",
+     "        pf = pa.parquet.ParquetFile(f)\n",
+     "        for ix_row_group in range(pf.metadata.num_row_groups):\n",
+     "            # We load the dataset by row group - 1000 rows at a time;\n",
+     "            # using open_input_stream would return bytes, not rows\n",
+     "            table = pf.read_row_group(ix_row_group)\n",
+     "            lines.extend(table.to_pylist())\n",
+     "\n",
+     "    random.shuffle(lines)\n",
+     "    return lines[:sample_size]\n",
+     "\n",
+     "\n",
+     "def get_local_lines(dataset):\n",
+     "    lines = []\n",
+     "    with jsonlines.open(\"data/{}_examples_with_stats.json\".format(dataset), \"r\") as f:\n",
+     "        for line in f:\n",
+     "            lines.append(line)\n",
+     "    return lines\n",
+     "\n",
+     "\n",
+     "def line_generator(lines_dict, dataset):\n",
+     "    for line in lines_dict[dataset]:\n",
+     "        yield line\n",
+     "\n",
+     "\n",
+     "# Parallelize the below\n",
+     "local_lines = {dataset: get_local_lines(dataset) for dataset in DATASETS}\n",
+     "s3_lines = {dataset: get_parquet_lines(dataset) for dataset in DATASETS}\n",
+     "\n",
+     "line_generators_local = {dataset: line_generator(local_lines, dataset) for dataset in DATASETS}\n",
+     "line_generators_s3 = {dataset: line_generator(s3_lines, dataset) for dataset in DATASETS}\n",
+     "\n",
+     "\n",
+     "def send_report(sample, dataset, reason, annotator, campaign):\n",
+     "    text = sample[\"text\"]\n",
+     "    sample.pop(\"text\")\n",
+     "\n",
+     "    sample_id = \"\"\n",
+     "    if \"id\" not in sample:\n",
+     "        if \"title\" in sample:\n",
+     "            sample_id = sample[\"title\"]\n",
+     "    else:\n",
+     "        sample_id = sample[\"id\"]\n",
+     "\n",
+     "    with jsonlines.open(\"report.jsonl\", \"w\") as f:\n",
+     "        f.write(\n",
+     "            {\n",
+     "                \"dataset\": dataset,\n",
+     "                \"docid\": sample_id,\n",
+     "                \"text\": text,\n",
+     "                \"metadata\": sample,\n",
+     "                \"reason\": reason,\n",
+     "                \"annotator\": annotator,\n",
+     "                \"campaign\": campaign,\n",
+     "                \"timestamp\": str(datetime.now()),\n",
+     "            }\n",
+     "        )\n",
+     "\n",
+     "    api = HfApi()\n",
+     "    api.upload_file(\n",
+     "        path_or_fileobj=\"report.jsonl\",\n",
+     "        path_in_repo=\"report-{}.jsonl\".format(uuid.uuid4()),\n",
+     "        repo_id=\"HuggingFaceGECLM/data_feedback\",\n",
+     "        repo_type=\"dataset\",\n",
+     "        token=os.environ.get(\"geclm_token\"),\n",
+     "    )\n",
+     "\n",
+     "\n",
+     "description = \"\"\"\n",
+     "GecLM annotations. All annotations are recorded in the [data_feedback](https://huggingface.co/datasets/HuggingFaceGECLM/data_feedback) dataset.\n",
+     "\"\"\"\n",
+     "\n",
+     "\n",
+     "if __name__ == \"__main__\":\n",
+     "    demo = gr.Blocks()\n",
+     "\n",
+     "    with demo:\n",
+     "        current_sample_state = gr.State(dict())\n",
+     "\n",
+     "        description = gr.Markdown(value=description)\n",
+     "        with gr.Row():\n",
+     "            annotator = gr.Textbox(\n",
+     "                lines=1,\n",
+     "                max_lines=1,\n",
+     "                placeholder=\"Optionally provide your name here if you'd like it to be recorded.\",\n",
+     "                label=\"Annotator\",\n",
+     "            )\n",
+     "            campaign = gr.Textbox(\n",
+     "                lines=1,\n",
+     "                max_lines=1,\n",
+     "                placeholder=\"Optionally provide the name of the annotation campaign for ease of filtering the reports.\",\n",
+     "                label=\"Annotation campaign\",\n",
+     "            )\n",
+     "        with gr.Row():\n",
+     "            dataset = gr.Dropdown(\n",
+     "                choices=DATASETS,\n",
+     "                value=\"Pick a dataset below\",\n",
+     "                label=\"Dataset\",\n",
+     "            )\n",
+     "        with gr.Row():\n",
+     "            reason_txt = gr.Textbox(\n",
+     "                label=\"Flagging reason\",\n",
+     "                placeholder=\"Provide the reason for flagging if you think the sample is bad.\",\n",
+     "                visible=False,\n",
+     "            )\n",
+     "        with gr.Row():\n",
+     "            bad_btn = gr.Button(\"Bad ❌\", visible=False)\n",
+     "            good_btn = gr.Button(\"Next ✅\", visible=False)\n",
+     "        with gr.Row():\n",
+     "            text = gr.Textbox(visible=False, label=\"Datapoint\", lines=500)\n",
+     "\n",
+     "        def next_line(dataset):\n",
+     "            next_line = next(line_generators_s3[dataset])\n",
+     "\n",
+     "            text_col = \"text\"\n",
+     "            if text_col not in next_line:\n",
+     "                text_col = \"content\"\n",
+     "            return [\n",
+     "                gr.update(value=next_line[text_col], visible=True),\n",
+     "                next_line,\n",
+     "                gr.update(visible=True),\n",
+     "                gr.update(visible=True),\n",
+     "                gr.update(visible=True),\n",
+     "            ]\n",
+     "\n",
+     "        def bad_line(current_sample, dataset, reason, annotator, campaign):\n",
+     "            send_report(current_sample, dataset, reason, annotator, campaign)\n",
+     "            next_line = next(line_generators_s3[dataset])\n",
+     "            text_col = \"text\"\n",
+     "            if text_col not in next_line:\n",
+     "                text_col = \"content\"\n",
+     "            return [\n",
+     "                next_line[text_col],\n",
+     "                gr.update(\n",
+     "                    value=\"\",\n",
+     "                    placeholder=\"Provide the reason for flagging if you think the sample is bad.\",\n",
+     "                ),\n",
+     "                next_line,\n",
+     "            ]\n",
+     "\n",
+     "        good_btn.click(\n",
+     "            next_line,\n",
+     "            inputs=dataset,\n",
+     "            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
+     "        )\n",
+     "        dataset.change(\n",
+     "            next_line,\n",
+     "            inputs=dataset,\n",
+     "            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
+     "        )\n",
+     "        bad_btn.click(\n",
+     "            bad_line,\n",
+     "            inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],\n",
+     "            outputs=[text, reason_txt, current_sample_state],\n",
+     "        )\n",
+     "\n",
+     "    demo.launch(enable_queue=False, debug=True)\n"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.9"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
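The heart of this commit is get_parquet_lines above, which samples rows directly from parquet files on S3 instead of local JSONL. Below is a condensed, standalone sketch of that pattern - a sketch under the same assumptions as the code above (the geclm-datasets bucket prefix, AWS credentials in AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY), not a drop-in replacement. Unlike the notebook, it imports pyarrow.parquet explicitly; the notebook reaches it as pa.parquet, which only works because `from datasets import Dataset` imports pyarrow.parquet as a side effect.

# Standalone sketch of the S3 parquet-sampling pattern introduced above.
import os
import random

import pyarrow.parquet as pq  # explicit import; `import pyarrow as pa` alone does not expose pa.parquet
import s3fs

BASE_S3_DIR = "s3://geclm-datasets/samples/"  # bucket prefix from the notebook

fs = s3fs.S3FileSystem(
    anon=False,
    key=os.getenv("AWS_ACCESS_KEY_ID"),
    secret=os.getenv("AWS_SECRET_ACCESS_KEY"),
)


def sample_parquet_rows(dataset, sample_size=100):
    """Pick one random parquet file under the dataset prefix and sample rows from it."""
    paths = fs.glob(BASE_S3_DIR + dataset + "/*")
    if not paths:
        raise FileNotFoundError(f"Nothing found at {BASE_S3_DIR + dataset}")

    rows = []
    with fs.open(random.choice(paths)) as f:
        pf = pq.ParquetFile(f)
        # Read one row group at a time so only ~1000 rows sit in memory at once.
        for i in range(pf.metadata.num_row_groups):
            rows.extend(pf.read_row_group(i).to_pylist())

    random.shuffle(rows)
    return rows[:sample_size]

Reading by row group is the design choice worth noting: ParquetFile.read_row_group materializes one group at a time and yields plain Python dicts via to_pylist(), whereas streaming the object with open_input_stream would hand back raw bytes rather than rows.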
Makefile ADDED
@@ -0,0 +1,43 @@
+ .PHONY: style quality
+
+ # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
+ export PYTHONPATH = src
+
+ # For later, when we have the correct paths - we will likely have to ignore venv folders.
+ check_dirs := examples tests src utils
+
+ style:
+ 	python -m black --line-length 119 --target-version py39 .
+ 	python -m isort .
+
+ quality:
+ 	python -m black --check --line-length 119 --target-version py39 .
+ 	python -m isort --check-only .
+ 	python -m flake8 --max-line-length 119 .
+
+ # Release stuff
+ pre-release:
+ 	python utils/release.py
+
+ pre-patch:
+ 	python utils/release.py --patch
+
+ post-release:
+ 	python utils/release.py --post_release
+
+ post-patch:
+ 	python utils/release.py --post_release --patch
+
+ #wheels:
+ #	python setup.py bdist_wheel && python setup.py sdist
+ #
+ #wheels_clean:
+ #	rm -rf build && rm -rf dist
+ #
+ #pypi_upload:
+ #	python -m pip install twine
+ #	twine upload dist/* -r pypi
+ #
+ #pypi_test_upload:
+ #	python -m pip install twine
+ #	twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
app.py CHANGED
@@ -1,79 +1,78 @@
- import gradio as gr
- import jsonlines
+ import math
  import os
+ import random
  import uuid
-
-
  from datetime import datetime
+
+ import gradio as gr
+ import jsonlines
+ import pyarrow as pa
+ import s3fs
+ from datasets import Dataset
  from huggingface_hub import HfApi
- from pprint import pprint

+ S3 = s3fs.S3FileSystem(anon=False, key=os.getenv("AWS_ACCESS_KEY_ID"), secret=os.getenv("AWS_SECRET_ACCESS_KEY"))

- datasets = [
-     "gutenberg_raw",
-     "stackexchange2",
+ DEFAULT_SHUFFLE_BUFFER_SIZE_RATIO = 5
+ BASE_S3_DIR = "s3://geclm-datasets/samples/"
+
+ DATASETS = [
+     "c4",
      "bigcode_python_code",
      "bigcode_python_github_issues",
-     "bigcode_python_jupyter_scripts_dedup_filtered",
+     "bigcode_python_jupyter_markdowned_clean_dedup",
      "books3",
-     "c4",
-     "s2orc_raw",
+     "gutenberg_raw",
      "reddit_threaded",
-     "cc_filtered_text",
+     "enwiki_data",
+     "s2orc_dedup",
+     "stackexchange2",
+     "commoncrawl",
  ]


- def line_generator(dataset):
-     if dataset == "gutenberg_raw":
-         with jsonlines.open("data/gutenberg_raw_examples_with_stats.json", "r") as f:
-             for line in f:
-                 yield line
-     if dataset == "stackexchange2":
-         with jsonlines.open("data/stackexchange2_examples_with_stats.json", "r") as f:
-             for line in f:
-                 yield line
-     if dataset == "bigcode_python_code":
-         with jsonlines.open(
-             "data/bigcode_python_code_examples_with_stats.json", "r"
-         ) as f:
-             for line in f:
-                 yield line
-     if dataset == "bigcode_python_github_issues":
-         with jsonlines.open(
-             "data/bigcode_python_github_issues_examples_with_stats.json", "r"
-         ) as f:
-             for line in f:
-                 yield line
-     if dataset == "bigcode_python_jupyter_scripts_dedup_filtered":
-         with jsonlines.open(
-             "data/bigcode_python_jupyter_scripts_dedup_filtered_examples_with_stats.json",
-             "r",
-         ) as f:
-             for line in f:
-                 yield line
-     if dataset == "books3":
-         with jsonlines.open("data/books3_examples_with_stats.json", "r") as f:
-             for line in f:
-                 yield line
-     if dataset == "c4":
-         with jsonlines.open("data/c4_examples_with_stats.json", "r") as f:
-             for line in f:
-                 yield line
-     if dataset == "s2orc_raw":
-         with jsonlines.open("data/s2orc_raw_examples_with_stats.json", "r") as f:
-             for line in f:
-                 yield line
-     if dataset == "reddit_threaded":
-         with jsonlines.open("data/reddit_threaded_examples_with_stats.json", "r") as f:
-             for line in f:
-                 yield line
-     if dataset == "cc_filtered_text":
-         with jsonlines.open("data/reddit_threaded_examples_with_stats.json", "r") as f:
-             for line in f:
-                 yield line
-
-
- line_generators = {dataset: line_generator(dataset) for dataset in datasets}
+ def get_parquet_lines(dataset, sample_size=100):
+     s3_paths = S3.glob(BASE_S3_DIR + dataset + "/*")
+
+     if len(s3_paths) == 0:
+         raise FileNotFoundError(f"Nothing found at {BASE_S3_DIR + dataset}")
+
+     print("Number of parquet files", len(s3_paths))
+     s3_path = random.choice(s3_paths)
+     print("Reading", s3_path)
+     lines = []
+
+     with S3.open(s3_path) as f:
+         pf = pa.parquet.ParquetFile(f)
+         for ix_row_group in range(pf.metadata.num_row_groups):
+             # We load the dataset by row group - 1000 rows at a time;
+             # using open_input_stream would return bytes, not rows
+             table = pf.read_row_group(ix_row_group)
+             lines.extend(table.to_pylist())
+
+     random.shuffle(lines)
+     return lines[:sample_size]
+
+
+ def get_local_lines(dataset):
+     lines = []
+     with jsonlines.open("data/{}_examples_with_stats.json".format(dataset), "r") as f:
+         for line in f:
+             lines.append(line)
+     return lines
+
+
+ def line_generator(lines_dict, dataset):
+     for line in lines_dict[dataset]:
+         yield line
+
+
+ # Parallelize the below
+ local_lines = {dataset: get_local_lines(dataset) for dataset in DATASETS}
+ s3_lines = {dataset: get_parquet_lines(dataset) for dataset in DATASETS}
+
+ line_generators_local = {dataset: line_generator(local_lines, dataset) for dataset in DATASETS}
+ line_generators_s3 = {dataset: line_generator(s3_lines, dataset) for dataset in DATASETS}


  def send_report(sample, dataset, reason, annotator, campaign):
@@ -138,7 +137,9 @@ if __name__ == "__main__":
              )
          with gr.Row():
              dataset = gr.Dropdown(
-                 choices=datasets, value="Pick a dataset below", label="Dataset",
+                 choices=DATASETS,
+                 value="Pick a dataset below",
+                 label="Dataset",
              )
          with gr.Row():
              reason_txt = gr.Textbox(
@@ -150,12 +151,16 @@ if __name__ == "__main__":
              bad_btn = gr.Button("Bad ❌", visible=False)
              good_btn = gr.Button("Next ✅", visible=False)
          with gr.Row():
-             text = gr.Markdown(visible=False)
+             text = gr.Textbox(visible=False, label="Datapoint", lines=500)

          def next_line(dataset):
-             next_line = next(line_generators[dataset])
+             next_line = next(line_generators_s3[dataset])
+
+             text_col = "text"
+             if text_col not in next_line:
+                 text_col = "content"
              return [
-                 gr.update(value="<pre>" + next_line["text"] + "</pre>", visible=True),
+                 gr.update(value=next_line[text_col], visible=True),
                  next_line,
                  gr.update(visible=True),
                  gr.update(visible=True),
@@ -164,9 +169,12 @@ if __name__ == "__main__":

          def bad_line(current_sample, dataset, reason, annotator, campaign):
              send_report(current_sample, dataset, reason, annotator, campaign)
-             next_line = next(line_generators[dataset])
+             next_line = next(line_generators_s3[dataset])
+             text_col = "text"
+             if text_col not in next_line:
+                 text_col = "content"
              return [
-                 "<pre>" + next_line["text"] + "</pre>",
+                 next_line[text_col],
                  gr.update(
                      value="",
                      placeholder="Provide the reason for flagging if you think the sample is bad.",
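The reporting path is untouched by this diff: send_report writes a single-record JSONL file and pushes it to the feedback dataset under a collision-free name. A minimal sketch of that flow, assuming the same repo_id and geclm_token environment variable as above; every value in the example record is hypothetical:

# Sketch of the send_report flow: one JSONL record, uploaded under a unique name.
import os
import uuid
from datetime import datetime

import jsonlines
from huggingface_hub import HfApi

record = {
    "dataset": "c4",  # hypothetical example values
    "docid": "doc-0",
    "text": "a sampled document",
    "metadata": {},
    "reason": "boilerplate",
    "annotator": "anonymous",
    "campaign": "test-campaign",
    "timestamp": str(datetime.now()),
}

with jsonlines.open("report.jsonl", "w") as f:
    f.write(record)  # one report per file keeps uploads independent

HfApi().upload_file(
    path_or_fileobj="report.jsonl",
    path_in_repo="report-{}.jsonl".format(uuid.uuid4()),  # uuid avoids name collisions
    repo_id="HuggingFaceGECLM/data_feedback",
    repo_type="dataset",
    token=os.environ.get("geclm_token"),
)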
data/{cc_filtered_text_examples_with_stats.json → commoncrawl_examples_with_stats.json} RENAMED
File without changes
data/enwiki_data_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca3d163bab055381827226140568f3bef7eaac187cebd76878e0b63e9e442356
+ size 3
data/{s2orc_raw_examples_with_stats.json → s2orc_dedup_examples_with_stats.json} RENAMED
File without changes
test.ipynb ADDED
@@ -0,0 +1,279 @@
(Same 279 lines as .ipynb_checkpoints/test-checkpoint.ipynb above; the checkpoint file is a verbatim copy of this notebook.)