ola13 committed
Commit eff02e0 • Parent: a4a0058

read from s3

.ipynb_checkpoints/test-checkpoint.ipynb ADDED
@@ -0,0 +1,279 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "585da432",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/c4/20230404_102105_00007_t8w9z_3085d601-45f1-443a-b50d-8eb4812dd227\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/bigcode_python_code/20230404_102116_00007_ajvns_4e5b2899-8640-4a4c-b0cd-758662178176\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/bigcode_python_github_issues/20230404_102127_00022_yv77i_982f928f-1431-4ea7-986d-c5c5cb0f4a3f\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/bigcode_python_jupyter_markdowned_clean_dedup/20230404_102137_00026_vwcg7_3167c932-87a1-4fec-ad01-215831d0bf6e\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/books3/20230404_102143_00027_t4kwf_198fc997-b871-4e4a-b88e-3776f1cf92fe\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/gutenberg_raw/20230404_102215_00007_x3ntt_30873bfe-c94c-439a-96e2-71165570dc99\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/reddit_threaded/20230404_102241_00049_xj4uk_d7612f5a-5107-46e1-b710-47e7db95a7e6\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/enwiki_data/20230404_102246_00007_ye63c_57166ca6-f0d2-40ef-8ae7-ed4bc7ecd28d\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/s2orc_dedup/20230404_102252_00080_6ce5q_330e23f7-1270-4a52-b277-af823baf1de6\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/stackexchange2/20230404_102308_00031_qvnh6_cec28e17-f163-4a04-9fbe-dc617d9ea03e\n",
+       "Number of parquet files 30\n",
+       "Reading geclm-datasets/samples/commoncrawl/20230404_124237_00026_sin5w_c2e65b68-2449-47fa-be8b-a6e6e83611d0\n",
+       "Running on local URL: http://127.0.0.1:7860\n",
+       "\n",
+       "To create a public link, set `share=True` in `launch()`.\n"
+      ]
+     },
+     {
+      "data": {
+       "text/html": [
+        "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+       ],
+       "text/plain": [
+        "<IPython.core.display.HTML object>"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "import math\n",
+     "import os\n",
+     "import random\n",
+     "import uuid\n",
+     "from datetime import datetime\n",
+     "\n",
+     "import gradio as gr\n",
+     "import jsonlines\n",
+     "import pyarrow as pa\n",
+     "import s3fs\n",
+     "from datasets import Dataset\n",
+     "from huggingface_hub import HfApi\n",
+     "\n",
+     "S3 = s3fs.S3FileSystem(anon=False, key=os.getenv(\"AWS_ACCESS_KEY_ID\"), secret=os.getenv(\"AWS_SECRET_ACCESS_KEY\"))\n",
+     "\n",
+     "DEFAULT_SHUFFLE_BUFFER_SIZE_RATIO = 5\n",
+     "BASE_S3_DIR = \"s3://geclm-datasets/samples/\"\n",
+     "\n",
+     "DATASETS = [\n",
+     "    \"c4\",\n",
+     "    \"bigcode_python_code\",\n",
+     "    \"bigcode_python_github_issues\",\n",
+     "    \"bigcode_python_jupyter_markdowned_clean_dedup\",\n",
+     "    \"books3\",\n",
+     "    \"gutenberg_raw\",\n",
+     "    \"reddit_threaded\",\n",
+     "    \"enwiki_data\",\n",
+     "    \"s2orc_dedup\",\n",
+     "    \"stackexchange2\",\n",
+     "    \"commoncrawl\",\n",
+     "]\n",
+     "\n",
+     "\n",
+     "def get_parquet_lines(dataset, sample_size=100):\n",
+     "    s3_paths = S3.glob(BASE_S3_DIR + dataset + \"/*\")\n",
+     "\n",
+     "    if len(s3_paths) == 0:\n",
+     "        raise FileNotFoundError(f\"Nothing found at {BASE_S3_DIR + dataset}\")\n",
+     "\n",
+     "    print(\"Number of parquet files\", len(s3_paths))\n",
+     "    s3_path = random.choice(s3_paths)\n",
+     "    print(\"Reading\", s3_path)\n",
+     "    lines = []\n",
+     "\n",
+     "    with S3.open(s3_path) as f:\n",
+     "        pf = pa.parquet.ParquetFile(f)\n",
+     "        for ix_row_group in range(pf.metadata.num_row_groups):\n",
+     "            # We load the dataset by row group - 1000 rows at a time;\n",
+     "            # using open_input_stream would return bytes, not rows\n",
+     "            table = pf.read_row_group(ix_row_group)\n",
+     "            lines.extend(table.to_pylist())\n",
+     "\n",
+     "    random.shuffle(lines)\n",
+     "    return lines[:sample_size]\n",
+     "\n",
+     "\n",
+     "def get_local_lines(dataset):\n",
+     "    lines = []\n",
+     "    with jsonlines.open(\"data/{}_examples_with_stats.json\".format(dataset), \"r\") as f:\n",
+     "        for line in f:\n",
+     "            lines.append(line)\n",
+     "    return lines\n",
+     "\n",
+     "\n",
+     "def line_generator(lines_dict, dataset):\n",
+     "    for line in lines_dict[dataset]:\n",
+     "        yield line\n",
+     "\n",
+     "\n",
+     "# Parallelize the below\n",
+     "local_lines = {dataset: get_local_lines(dataset) for dataset in DATASETS}\n",
+     "s3_lines = {dataset: get_parquet_lines(dataset) for dataset in DATASETS}\n",
+     "\n",
+     "line_generators_local = {dataset: line_generator(local_lines, dataset) for dataset in DATASETS}\n",
+     "line_generators_s3 = {dataset: line_generator(s3_lines, dataset) for dataset in DATASETS}\n",
+     "\n",
+     "\n",
+     "def send_report(sample, dataset, reason, annotator, campaign):\n",
+     "    text = sample[\"text\"]\n",
+     "    sample.pop(\"text\")\n",
+     "\n",
+     "    sample_id = \"\"\n",
+     "    if \"id\" not in sample:\n",
+     "        if \"title\" in sample:\n",
+     "            sample_id = sample[\"title\"]\n",
+     "    else:\n",
+     "        sample_id = sample[\"id\"]\n",
+     "\n",
+     "    with jsonlines.open(\"report.jsonl\", \"w\") as f:\n",
+     "        f.write(\n",
+     "            {\n",
+     "                \"dataset\": dataset,\n",
+     "                \"docid\": sample_id,\n",
+     "                \"text\": text,\n",
+     "                \"metadata\": sample,\n",
+     "                \"reason\": reason,\n",
+     "                \"annotator\": annotator,\n",
+     "                \"campaign\": campaign,\n",
+     "                \"timestamp\": str(datetime.now()),\n",
+     "            }\n",
+     "        )\n",
+     "\n",
+     "    api = HfApi()\n",
+     "    api.upload_file(\n",
+     "        path_or_fileobj=\"report.jsonl\",\n",
+     "        path_in_repo=\"report-{}.jsonl\".format(uuid.uuid4()),\n",
+     "        repo_id=\"HuggingFaceGECLM/data_feedback\",\n",
+     "        repo_type=\"dataset\",\n",
+     "        token=os.environ.get(\"geclm_token\"),\n",
+     "    )\n",
+     "\n",
+     "\n",
+     "description = \"\"\"\n",
+     "GecLM annotations. All annotations are recorded in the [data_feedback](https://huggingface.co/datasets/HuggingFaceGECLM/data_feedback) dataset.\n",
+     "\"\"\"\n",
+     "\n",
+     "\n",
+     "if __name__ == \"__main__\":\n",
+     "    demo = gr.Blocks()\n",
+     "\n",
+     "    with demo:\n",
+     "        current_sample_state = gr.State(dict())\n",
+     "\n",
+     "        description = gr.Markdown(value=description)\n",
+     "        with gr.Row():\n",
+     "            annotator = gr.Textbox(\n",
+     "                lines=1,\n",
+     "                max_lines=1,\n",
+     "                placeholder=\"Optionally provide your name here if you'd like it to be recorded.\",\n",
+     "                label=\"Annotator\",\n",
+     "            )\n",
+     "            campaign = gr.Textbox(\n",
+     "                lines=1,\n",
+     "                max_lines=1,\n",
+     "                placeholder=\"Optionally provide the name of the annotation campaign for ease of filtering the reports.\",\n",
+     "                label=\"Annotation campaign\",\n",
+     "            )\n",
+     "        with gr.Row():\n",
+     "            dataset = gr.Dropdown(\n",
+     "                choices=DATASETS,\n",
+     "                value=\"Pick a dataset below\",\n",
+     "                label=\"Dataset\",\n",
+     "            )\n",
+     "        with gr.Row():\n",
+     "            reason_txt = gr.Textbox(\n",
+     "                label=\"Flagging reason\",\n",
+     "                placeholder=\"Provide the reason for flagging if you think the sample is bad.\",\n",
+     "                visible=False,\n",
+     "            )\n",
+     "        with gr.Row():\n",
+     "            bad_btn = gr.Button(\"Bad ❌\", visible=False)\n",
+     "            good_btn = gr.Button(\"Next ✅\", visible=False)\n",
+     "        with gr.Row():\n",
+     "            text = gr.Textbox(visible=False, label=\"Datapoint\", lines=500)\n",
+     "\n",
+     "        def next_line(dataset):\n",
+     "            next_line = next(line_generators_s3[dataset])\n",
+     "\n",
+     "            text_col = \"text\"\n",
+     "            if text_col not in next_line:\n",
+     "                text_col = \"content\"\n",
+     "            return [\n",
+     "                gr.update(value=next_line[text_col], visible=True),\n",
+     "                next_line,\n",
+     "                gr.update(visible=True),\n",
+     "                gr.update(visible=True),\n",
+     "                gr.update(visible=True),\n",
+     "            ]\n",
+     "\n",
+     "        def bad_line(current_sample, dataset, reason, annotator, campaign):\n",
+     "            send_report(current_sample, dataset, reason, annotator, campaign)\n",
+     "            next_line = next(line_generators_s3[dataset])\n",
+     "            text_col = \"text\"\n",
+     "            if text_col not in next_line:\n",
+     "                text_col = \"content\"\n",
+     "            return [\n",
+     "                next_line[text_col],\n",
+     "                gr.update(\n",
+     "                    value=\"\",\n",
+     "                    placeholder=\"Provide the reason for flagging if you think the sample is bad.\",\n",
+     "                ),\n",
+     "                next_line,\n",
+     "            ]\n",
+     "\n",
+     "        good_btn.click(\n",
+     "            next_line,\n",
+     "            inputs=dataset,\n",
+     "            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
+     "        )\n",
+     "        dataset.change(\n",
+     "            next_line,\n",
+     "            inputs=dataset,\n",
+     "            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],\n",
+     "        )\n",
+     "        bad_btn.click(\n",
+     "            bad_line,\n",
+     "            inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],\n",
+     "            outputs=[text, reason_txt, current_sample_state],\n",
+     "        )\n",
+     "\n",
+     "    demo.launch(enable_queue=False, debug=True)\n"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.9"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
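The heart of this commit is get_parquet_lines above, which samples rows directly from parquet files on S3 instead of local JSONL. Below is a condensed, standalone sketch of that pattern - a sketch under the same assumptions as the code above (the geclm-datasets bucket prefix, AWS credentials in AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY), not a drop-in replacement. Unlike the notebook, it imports pyarrow.parquet explicitly; the notebook reaches it as pa.parquet, which only works because `from datasets import Dataset` imports pyarrow.parquet as a side effect.

# Standalone sketch of the S3 parquet-sampling pattern introduced above.
import os
import random

import pyarrow.parquet as pq  # explicit import; `import pyarrow as pa` alone does not expose pa.parquet
import s3fs

BASE_S3_DIR = "s3://geclm-datasets/samples/"  # bucket prefix from the notebook

fs = s3fs.S3FileSystem(
    anon=False,
    key=os.getenv("AWS_ACCESS_KEY_ID"),
    secret=os.getenv("AWS_SECRET_ACCESS_KEY"),
)


def sample_parquet_rows(dataset, sample_size=100):
    """Pick one random parquet file under the dataset prefix and sample rows from it."""
    paths = fs.glob(BASE_S3_DIR + dataset + "/*")
    if not paths:
        raise FileNotFoundError(f"Nothing found at {BASE_S3_DIR + dataset}")

    rows = []
    with fs.open(random.choice(paths)) as f:
        pf = pq.ParquetFile(f)
        # Read one row group at a time so only ~1000 rows sit in memory at once.
        for i in range(pf.metadata.num_row_groups):
            rows.extend(pf.read_row_group(i).to_pylist())

    random.shuffle(rows)
    return rows[:sample_size]

Reading by row group is the design choice worth noting: ParquetFile.read_row_group materializes one group at a time and yields plain Python dicts via to_pylist(), whereas streaming the object with open_input_stream would hand back raw bytes rather than rows.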
Makefile ADDED
@@ -0,0 +1,43 @@
+ .PHONY: style quality
+
+ # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
+ export PYTHONPATH = src
+
+ # For later, when we have the correct paths - we will likely have to ignore venv folders.
+ check_dirs := examples tests src utils
+
+ style:
+ 	python -m black --line-length 119 --target-version py39 .
+ 	python -m isort .
+
+ quality:
+ 	python -m black --check --line-length 119 --target-version py39 .
+ 	python -m isort --check-only .
+ 	python -m flake8 --max-line-length 119 .
+
+ # Release stuff
+ pre-release:
+ 	python utils/release.py
+
+ pre-patch:
+ 	python utils/release.py --patch
+
+ post-release:
+ 	python utils/release.py --post_release
+
+ post-patch:
+ 	python utils/release.py --post_release --patch
+
+ #wheels:
+ #	python setup.py bdist_wheel && python setup.py sdist
+ #
+ #wheels_clean:
+ #	rm -rf build && rm -rf dist
+ #
+ #pypi_upload:
+ #	python -m pip install twine
+ #	twine upload dist/* -r pypi
+ #
+ #pypi_test_upload:
+ #	python -m pip install twine
+ #	twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
app.py CHANGED
@@ -1,79 +1,78 @@
- import gradio as gr
- import jsonlines
+ import math
  import os
+ import random
  import uuid
-
-
  from datetime import datetime
+
+ import gradio as gr
+ import jsonlines
+ import pyarrow as pa
+ import s3fs
+ from datasets import Dataset
  from huggingface_hub import HfApi
- from pprint import pprint

+ S3 = s3fs.S3FileSystem(anon=False, key=os.getenv("AWS_ACCESS_KEY_ID"), secret=os.getenv("AWS_SECRET_ACCESS_KEY"))

- datasets = [
-     "gutenberg_raw",
-     "stackexchange2",
+ DEFAULT_SHUFFLE_BUFFER_SIZE_RATIO = 5
+ BASE_S3_DIR = "s3://geclm-datasets/samples/"
+
+ DATASETS = [
+     "c4",
      "bigcode_python_code",
      "bigcode_python_github_issues",
-     "bigcode_python_jupyter_scripts_dedup_filtered",
+     "bigcode_python_jupyter_markdowned_clean_dedup",
      "books3",
-     "c4",
-     "s2orc_raw",
+     "gutenberg_raw",
      "reddit_threaded",
-     "cc_filtered_text",
+     "enwiki_data",
+     "s2orc_dedup",
+     "stackexchange2",
+     "commoncrawl",
  ]


- def line_generator(dataset):
-     if dataset == "gutenberg_raw":
-         with jsonlines.open("data/gutenberg_raw_examples_with_stats.json", "r") as f:
-             for line in f:
-                 yield line
-     if dataset == "stackexchange2":
-         with jsonlines.open("data/stackexchange2_examples_with_stats.json", "r") as f:
-             for line in f:
-                 yield line
-     if dataset == "bigcode_python_code":
-         with jsonlines.open(
-             "data/bigcode_python_code_examples_with_stats.json", "r"
-         ) as f:
-             for line in f:
-                 yield line
-     if dataset == "bigcode_python_github_issues":
-         with jsonlines.open(
-             "data/bigcode_python_github_issues_examples_with_stats.json", "r"
-         ) as f:
-             for line in f:
-                 yield line
-     if dataset == "bigcode_python_jupyter_scripts_dedup_filtered":
-         with jsonlines.open(
-             "data/bigcode_python_jupyter_scripts_dedup_filtered_examples_with_stats.json",
-             "r",
-         ) as f:
-             for line in f:
-                 yield line
-     if dataset == "books3":
-         with jsonlines.open("data/books3_examples_with_stats.json", "r") as f:
-             for line in f:
-                 yield line
-     if dataset == "c4":
-         with jsonlines.open("data/c4_examples_with_stats.json", "r") as f:
-             for line in f:
-                 yield line
-     if dataset == "s2orc_raw":
-         with jsonlines.open("data/s2orc_raw_examples_with_stats.json", "r") as f:
-             for line in f:
-                 yield line
-     if dataset == "reddit_threaded":
-         with jsonlines.open("data/reddit_threaded_examples_with_stats.json", "r") as f:
-             for line in f:
-                 yield line
-     if dataset == "cc_filtered_text":
-         with jsonlines.open("data/reddit_threaded_examples_with_stats.json", "r") as f:
-             for line in f:
-                 yield line
-
-
- line_generators = {dataset: line_generator(dataset) for dataset in datasets}
+ def get_parquet_lines(dataset, sample_size=100):
+     s3_paths = S3.glob(BASE_S3_DIR + dataset + "/*")
+
+     if len(s3_paths) == 0:
+         raise FileNotFoundError(f"Nothing found at {BASE_S3_DIR + dataset}")
+
+     print("Number of parquet files", len(s3_paths))
+     s3_path = random.choice(s3_paths)
+     print("Reading", s3_path)
+     lines = []
+
+     with S3.open(s3_path) as f:
+         pf = pa.parquet.ParquetFile(f)
+         for ix_row_group in range(pf.metadata.num_row_groups):
+             # We load the dataset by row group - 1000 rows at a time;
+             # using open_input_stream would return bytes, not rows
+             table = pf.read_row_group(ix_row_group)
+             lines.extend(table.to_pylist())
+
+     random.shuffle(lines)
+     return lines[:sample_size]
+
+
+ def get_local_lines(dataset):
+     lines = []
+     with jsonlines.open("data/{}_examples_with_stats.json".format(dataset), "r") as f:
+         for line in f:
+             lines.append(line)
+     return lines
+
+
+ def line_generator(lines_dict, dataset):
+     for line in lines_dict[dataset]:
+         yield line
+
+
+ # Parallelize the below
+ local_lines = {dataset: get_local_lines(dataset) for dataset in DATASETS}
+ s3_lines = {dataset: get_parquet_lines(dataset) for dataset in DATASETS}
+
+ line_generators_local = {dataset: line_generator(local_lines, dataset) for dataset in DATASETS}
+ line_generators_s3 = {dataset: line_generator(s3_lines, dataset) for dataset in DATASETS}


  def send_report(sample, dataset, reason, annotator, campaign):
@@ -138,7 +137,9 @@ if __name__ == "__main__":
              )
          with gr.Row():
              dataset = gr.Dropdown(
-                 choices=datasets, value="Pick a dataset below", label="Dataset",
+                 choices=DATASETS,
+                 value="Pick a dataset below",
+                 label="Dataset",
              )
          with gr.Row():
              reason_txt = gr.Textbox(
@@ -150,12 +151,16 @@ if __name__ == "__main__":
              bad_btn = gr.Button("Bad ❌", visible=False)
              good_btn = gr.Button("Next ✅", visible=False)
          with gr.Row():
-             text = gr.Markdown(visible=False)
+             text = gr.Textbox(visible=False, label="Datapoint", lines=500)

          def next_line(dataset):
-             next_line = next(line_generators[dataset])
+             next_line = next(line_generators_s3[dataset])
+
+             text_col = "text"
+             if text_col not in next_line:
+                 text_col = "content"
              return [
-                 gr.update(value="<pre>" + next_line["text"] + "</pre>", visible=True),
+                 gr.update(value=next_line[text_col], visible=True),
                  next_line,
                  gr.update(visible=True),
                  gr.update(visible=True),
@@ -164,9 +169,12 @@ if __name__ == "__main__":

          def bad_line(current_sample, dataset, reason, annotator, campaign):
              send_report(current_sample, dataset, reason, annotator, campaign)
-             next_line = next(line_generators[dataset])
+             next_line = next(line_generators_s3[dataset])
+             text_col = "text"
+             if text_col not in next_line:
+                 text_col = "content"
              return [
-                 "<pre>" + next_line["text"] + "</pre>",
+                 next_line[text_col],
                  gr.update(
                      value="",
                      placeholder="Provide the reason for flagging if you think the sample is bad.",
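The reporting path is untouched by this diff: send_report writes a single-record JSONL file and pushes it to the feedback dataset under a collision-free name. A minimal sketch of that flow, assuming the same repo_id and geclm_token environment variable as above; every value in the example record is hypothetical:

# Sketch of the send_report flow: one JSONL record, uploaded under a unique name.
import os
import uuid
from datetime import datetime

import jsonlines
from huggingface_hub import HfApi

record = {
    "dataset": "c4",  # hypothetical example values
    "docid": "doc-0",
    "text": "a sampled document",
    "metadata": {},
    "reason": "boilerplate",
    "annotator": "anonymous",
    "campaign": "test-campaign",
    "timestamp": str(datetime.now()),
}

with jsonlines.open("report.jsonl", "w") as f:
    f.write(record)  # one report per file keeps uploads independent

HfApi().upload_file(
    path_or_fileobj="report.jsonl",
    path_in_repo="report-{}.jsonl".format(uuid.uuid4()),  # uuid avoids name collisions
    repo_id="HuggingFaceGECLM/data_feedback",
    repo_type="dataset",
    token=os.environ.get("geclm_token"),
)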
data/{cc_filtered_text_examples_with_stats.json → commoncrawl_examples_with_stats.json} RENAMED
File without changes
data/enwiki_data_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca3d163bab055381827226140568f3bef7eaac187cebd76878e0b63e9e442356
+ size 3
data/{s2orc_raw_examples_with_stats.json → s2orc_dedup_examples_with_stats.json} RENAMED
File without changes
test.ipynb ADDED
@@ -0,0 +1,279 @@
(Same 279 lines as .ipynb_checkpoints/test-checkpoint.ipynb above; the checkpoint file is a verbatim copy of this notebook.)