File size: 3,264 Bytes
215f60a a6e1ff6 0f43f50 a6e1ff6 215f60a 0f43f50 724b1ea 0f43f50 724b1ea 0f43f50 a6e1ff6 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea a6e1ff6 0112a25 215f60a 724b1ea a6e1ff6 0f43f50 a6e1ff6 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a a6e1ff6 215f60a a6e1ff6 215f60a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import json
import math
import os
import uuid
from functools import partial
import jsonlines
import streamlit as st
import streamlit.components.v1 as components
from huggingface_hub import HfApi
BAD_EXAMPLES_PATH = "bad_examples"
DATA_PATH = "data"
def report_result_dataset(dataset, docid, text, metadata, reason, annotator):
    """Write one feedback record to ``report.jsonl`` and upload it to the Hub.

    The six arguments are stored under the keys ``dataset``, ``docid``,
    ``text``, ``metadata``, ``reason`` and ``annotator``. The local file is
    opened in write mode, so it only ever holds the latest record. It is then
    uploaded to the ``HuggingFaceGECLM/data_feedback`` dataset repository
    under a new ``report-<uuid4>.jsonl`` name, using the token read from the
    ``geclm_token`` environment variable.
    """
    local_report = "report.jsonl"
    record = {
        "dataset": dataset,
        "docid": docid,
        "text": text,
        "metadata": metadata,
        "reason": reason,
        "annotator": annotator,
    }
    with jsonlines.open(local_report, "w") as writer:
        writer.write(record)

    # A random suffix keeps successive uploads from replacing each other.
    remote_name = "report-{}.jsonl".format(uuid.uuid4())
    hub = HfApi()
    hub.upload_file(
        path_or_fileobj=local_report,
        path_in_repo=remote_name,
        repo_id="HuggingFaceGECLM/data_feedback",
        repo_type="dataset",
        token=os.environ.get("geclm_token"),
    )
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        file_path: Path of the file to read; one JSON document per line.

    Returns:
        A list holding the decoded value of every non-blank line, in file
        order. An empty file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. an extra trailing newline) are not JSON documents
        # and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]
# Start at the first example when no position is stored in the session state.
if "idx" not in st.session_state:
    st.session_state["idx"] = 0
def get_next_item():
    """Advance the example index stored in the session state by one."""
    current = st.session_state.idx
    st.session_state.idx = current + 1
def save_flag_and_get_next_item(sample, issue):
    """Record ``sample`` as a bad example, report it, and move to the next one.

    The sample, with the annotator's ``issue`` added, is appended as one JSON
    line to the bad-examples file of the currently selected ``dataset`` (a
    module-level name). The same finding is then handed to
    ``report_result_dataset`` and the session index is advanced.

    Args:
        sample: Dict describing the example; must contain a ``"text"`` key.
            It is not modified.
        issue: Free-text description of the problem. ``None`` or an empty
            string is stored as the literal string ``"None"``.

    Raises:
        KeyError: If ``sample`` has no ``"text"`` entry (raised after the
            line has already been appended to the bad-examples file).
    """
    if issue is None or issue == "":
        issue = "None"

    # Build new dicts instead of adding/popping keys on the caller's object.
    flagged = {**sample, "issue": issue}
    with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
        f.write(json.dumps(flagged) + "\n")

    text = sample["text"]
    # Metadata is everything except the text itself and the issue field.
    metadata = {
        key: value for key, value in sample.items() if key not in ("text", "issue")
    }
    # Prefer an explicit id, fall back to the title, otherwise leave it empty.
    sample_id = sample.get("id", sample.get("title", ""))

    report_result_dataset(dataset, sample_id, text, str(metadata), issue, "")
    get_next_item()
# Names of the datasets offered in the sidebar selector.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]

# Let the annotator pick a dataset, then read its examples from DATA_PATH.
dataset = st.sidebar.selectbox(label="Dataset", options=datasets)
_examples_path = f"{DATA_PATH}/{dataset}_examples_with_stats.json"
data = load_jsonl(_examples_path)
# Create the bad-examples directory and this dataset's file if they do not
# exist yet, so the download button below always has a file to open.
os.makedirs(BAD_EXAMPLES_PATH, exist_ok=True)
bad_examples_file = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"
with open(bad_examples_file, "a") as f:
    pass

# Removing "idx" from the session state lets the initialisation at the top of
# the script put it back to 0 on the next run.
st.sidebar.button("Reset Index", on_click=lambda: st.session_state.__delitem__("idx"))

with open(bad_examples_file, "r+") as f:
    st.sidebar.download_button(
        "Download bad example JSON file", f, file_name=f"{dataset}_bad_examples.jsonl"
    )

# Re-opening the file in write mode truncates it to zero length.
st.sidebar.button(
    "Clear bad examples file",
    on_click=lambda: open(bad_examples_file, "w").close(),
)
def _flag_sample_with_submitted_issue(flagged_sample):
    """Form callback: flag ``flagged_sample`` with the issue text just submitted.

    The text is read from the session state at click time. A value passed
    through ``args`` would be the one captured when the form was rendered,
    i.e. before the annotator typed anything.
    """
    submitted_issue = st.session_state.get("issue_input", "")
    save_flag_and_get_next_item(flagged_sample, submitted_issue)


# Stop gracefully instead of raising IndexError when every example has been
# reviewed, the dataset is empty, or a shorter dataset was selected.
if st.session_state.idx >= len(data):
    st.info(
        "No more examples to review for this dataset. "
        "Use 'Reset Index' in the sidebar to start over."
    )
    st.stop()

with st.form(key="bad_form", clear_on_submit=True):
    sample = data[st.session_state.idx]
    text = sample["text"]
    st.text_area(f"text id: {st.session_state.idx}", text, height=500)
    # The key makes the submitted value available to the BAD callback.
    issue = st.text_input(
        "What's wrong with this example? (leave blank if example is fine)",
        key="issue_input",
    )
    good = st.form_submit_button(
        "GOOD",
        on_click=get_next_item,
    )
    bad = st.form_submit_button(
        "BAD",
        on_click=_flag_sample_with_submitted_issue,
        args=(sample,),
    )
|