Spaces:

HuggingFaceGECLM
/

dataset_explorer

Runtime error

App Files Files Community

dataset_explorer / app.py

ola13

remove redundant logging

a4a0058 over 1 year ago

raw

history blame

No virus

6.3 kB

	import html
	import os
	import tempfile
	import uuid
	from datetime import datetime
	from pprint import pprint

	import gradio as gr
	import jsonlines
	from huggingface_hub import HfApi


	# Names of the corpora that can be browsed. The same names are offered as
	# the dropdown choices and used as keys of the per-dataset generators.
	datasets = [
	    "gutenberg_raw",
	    "stackexchange2",
	    "bigcode_python_code",
	    "bigcode_python_github_issues",
	    "bigcode_python_jupyter_scripts_dedup_filtered",
	    "books3",
	    "c4",
	    "s2orc_raw",
	    "reddit_threaded",
	    "cc_filtered_text",
	]


	def line_generator(dataset):
	    """Yield the example records of ``dataset`` one at a time.

	    Records are read from ``data/<dataset>_examples_with_stats.json`` with
	    ``jsonlines``. Because this is a generator function, the file is only
	    opened once the first record is requested.

	    Args:
	        dataset: Name of the dataset to read; one of the supported names
	            listed in the body.

	    Yields:
	        Each record of the file, in file order. Nothing is yielded when
	        ``dataset`` is not a supported name.
	    """
	    supported = (
	        "gutenberg_raw",
	        "stackexchange2",
	        "bigcode_python_code",
	        "bigcode_python_github_issues",
	        "bigcode_python_jupyter_scripts_dedup_filtered",
	        "books3",
	        "c4",
	        "s2orc_raw",
	        "reddit_threaded",
	        "cc_filtered_text",
	    )
	    if dataset not in supported:
	        return

	    # Every dataset uses the same file naming scheme, so the path is derived
	    # from the name. The previous copy-pasted branches read the
	    # reddit_threaded file when "cc_filtered_text" was requested.
	    path = "data/{}_examples_with_stats.json".format(dataset)
	    with jsonlines.open(path, "r") as reader:
	        for record in reader:
	            yield record


	# One module-level generator per dataset name. Each generator is created
	# here but only starts reading its file when it is first advanced.
	line_generators = {name: line_generator(name) for name in datasets}


	def send_report(sample, dataset, reason, annotator, campaign):
	    """Upload a one-record feedback report about ``sample``.

	    The report is written as a JSON Lines file and uploaded to the
	    ``HuggingFaceGECLM/data_feedback`` dataset repository under a unique
	    ``report-<uuid>.jsonl`` name, authenticated with the ``geclm_token``
	    environment variable.

	    Args:
	        sample: Mapping describing the flagged example. Must contain
	            ``"text"``. Its ``"id"`` is used as the document id, falling
	            back to ``"title"`` and then to an empty string. The mapping
	            is not modified.
	        dataset: Value stored under ``"dataset"`` in the report.
	        reason: Value stored under ``"reason"`` in the report.
	        annotator: Value stored under ``"annotator"`` in the report.
	        campaign: Value stored under ``"campaign"`` in the report.

	    Raises:
	        KeyError: If ``sample`` has no ``"text"`` entry.
	    """
	    text = sample["text"]
	    # Build the metadata from a filtered copy so the caller's sample keeps
	    # its "text" entry instead of being mutated in place.
	    metadata = {key: value for key, value in sample.items() if key != "text"}

	    if "id" in sample:
	        sample_id = sample["id"]
	    else:
	        sample_id = sample.get("title", "")

	    report = {
	        "dataset": dataset,
	        "docid": sample_id,
	        "text": text,
	        "metadata": metadata,
	        "reason": reason,
	        "annotator": annotator,
	        "campaign": campaign,
	        "timestamp": str(datetime.now()),
	    }

	    # Write into a per-call temporary directory: a fixed "report.jsonl" in
	    # the working directory could be overwritten by another report before
	    # this one has finished uploading.
	    with tempfile.TemporaryDirectory() as tmp_dir:
	        report_path = os.path.join(tmp_dir, "report.jsonl")
	        with jsonlines.open(report_path, "w") as f:
	            f.write(report)

	        api = HfApi()
	        api.upload_file(
	            path_or_fileobj=report_path,
	            path_in_repo="report-{}.jsonl".format(uuid.uuid4()),
	            repo_id="HuggingFaceGECLM/data_feedback",
	            repo_type="dataset",
	            token=os.environ.get("geclm_token"),
	        )


	# Markdown blurb passed to gr.Markdown at the top of the UI.
	description = """
	GecLM annotations. All annotations are recorded in the [data_feedback](https://huggingface.co/datasets/HuggingFaceGECLM/data_feedback) dataset.
	"""


	if __name__ == "__main__":
	    # Placeholder used when the flagging-reason box is created and again
	    # when it is reset after a report has been sent.
	    _REASON_PLACEHOLDER = (
	        "Provide the reason for flagging if you think the sample is bad."
	    )

	    demo = gr.Blocks()

	    with demo:
	        # Holds the record currently displayed so it can be reported.
	        current_sample_state = gr.State(dict())

	        description = gr.Markdown(value=description)
	        with gr.Row():
	            annotator = gr.Textbox(
	                lines=1,
	                max_lines=1,
	                placeholder="Optionally provide your name here if you'd like it to be recorded.",
	                label="Annotator",
	            )
	            campaign = gr.Textbox(
	                lines=1,
	                max_lines=1,
	                placeholder="Optionally provide the name of the annotation campaign for ease of filtering the reports.",
	                label="Annotation campaign",
	            )
	        with gr.Row():
	            dataset = gr.Dropdown(
	                choices=datasets, value="Pick a dataset below", label="Dataset",
	            )
	        with gr.Row():
	            reason_txt = gr.Textbox(
	                label="Flagging reason",
	                placeholder=_REASON_PLACEHOLDER,
	                visible=False,
	            )
	        with gr.Row():
	            bad_btn = gr.Button("Bad ❌", visible=False)
	            good_btn = gr.Button("Next ✅", visible=False)
	        with gr.Row():
	            text = gr.Markdown(visible=False)

	        def _fetch_sample(dataset):
	            """Return the next record of ``dataset``, restarting at the end.

	            The generators are module-level, so once one is exhausted every
	            later request would raise StopIteration. A fresh generator is
	            created in that case so browsing wraps around to the first
	            record. StopIteration still propagates if the restarted
	            generator yields nothing at all.
	            """
	            try:
	                return next(line_generators[dataset])
	            except StopIteration:
	                line_generators[dataset] = line_generator(dataset)
	                return next(line_generators[dataset])

	        def _render_sample(sample):
	            """Return the sample text wrapped in ``<pre>`` with markup escaped.

	            Escaping keeps ``<``, ``>`` and ``&`` in the sample from being
	            interpreted as HTML when the string is shown in the Markdown
	            component.
	            """
	            return "<pre>" + html.escape(sample["text"], quote=False) + "</pre>"

	        def next_line(dataset):
	            """Load the next sample of ``dataset`` and reveal the controls.

	            Returns values for, in order: the text view, the current-sample
	            state, the reason box, the "Next" button and the "Bad" button.
	            """
	            sample = _fetch_sample(dataset)
	            return [
	                gr.update(value=_render_sample(sample), visible=True),
	                sample,
	                gr.update(visible=True),
	                gr.update(visible=True),
	                gr.update(visible=True),
	            ]

	        def bad_line(current_sample, dataset, reason, annotator, campaign):
	            """Report the current sample, then move on to the next one.

	            Returns values for, in order: the text view, the reason box
	            (cleared) and the current-sample state.
	            """
	            send_report(current_sample, dataset, reason, annotator, campaign)
	            sample = _fetch_sample(dataset)
	            return [
	                _render_sample(sample),
	                gr.update(value="", placeholder=_REASON_PLACEHOLDER),
	                sample,
	            ]

	        good_btn.click(
	            next_line,
	            inputs=dataset,
	            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
	        )
	        dataset.change(
	            next_line,
	            inputs=dataset,
	            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
	        )
	        bad_btn.click(
	            bad_line,
	            inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],
	            outputs=[text, reason_txt, current_sample_state],
	        )

	    demo.launch(enable_queue=False, debug=True)