Spaces:

HuggingFaceGECLM
/

random_dataset_exploration

Sleeping

File size: 3,264 Bytes

215f60a
a6e1ff6
0f43f50
 
a6e1ff6
215f60a
0f43f50
724b1ea
 
0f43f50
724b1ea
 
 
0f43f50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a6e1ff6
724b1ea
215f60a
724b1ea
 
215f60a
724b1ea
 
 
 
 
 
215f60a
724b1ea
 
215f60a
724b1ea
 
 
 
a6e1ff6
0112a25
 
215f60a
 
 
 
724b1ea
a6e1ff6
 
 
 
 
 
 
 
 
 
0f43f50
a6e1ff6
724b1ea
 
 
215f60a
 
 
 
 
 
 
 
 
 
 
 
724b1ea
215f60a
724b1ea
 
215f60a
724b1ea
 
215f60a
724b1ea
215f60a
 
 
 
724b1ea
215f60a
 
 
 
 
 
724b1ea
215f60a
724b1ea
 
 
 
215f60a
 
 
 
 
a6e1ff6
 
215f60a
 
 
a6e1ff6
215f60a

import json
import math
import os
import uuid
from functools import partial

import jsonlines
import streamlit as st
import streamlit.components.v1 as components
from huggingface_hub import HfApi

BAD_EXAMPLES_PATH = "bad_examples"
DATA_PATH = "data"


def report_result_dataset(dataset, docid, text, metadata, reason, annotator):
    """Write one feedback record to ``report.jsonl`` and upload it to the Hub.

    The record stores the six arguments under keys of the same name. The local
    file is opened in write mode, so it only ever holds the latest record. It
    is uploaded to the ``HuggingFaceGECLM/data_feedback`` dataset repo under a
    fresh ``report-<uuid4>.jsonl`` name, using the token found in the
    ``geclm_token`` environment variable.
    """
    local_report = "report.jsonl"
    record = dict(
        dataset=dataset,
        docid=docid,
        text=text,
        metadata=metadata,
        reason=reason,
        annotator=annotator,
    )

    with jsonlines.open(local_report, "w") as writer:
        writer.write(record)

    # A random name per upload keeps earlier reports in the repo untouched.
    remote_name = f"report-{uuid.uuid4()}.jsonl"
    HfApi().upload_file(
        path_or_fileobj=local_report,
        path_in_repo=remote_name,
        repo_id="HuggingFaceGECLM/data_feedback",
        repo_type="dataset",
        token=os.getenv("geclm_token"),
    )


def load_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path of the file to read; one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which differs between systems.
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (typically a trailing empty line) carry no record and
        # would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]


# Start at the first example unless session state already holds a position.
if "idx" not in st.session_state:
    st.session_state["idx"] = 0


def get_next_item():
    """Advance the example index stored in session state by one."""
    st.session_state["idx"] = st.session_state["idx"] + 1


def save_flag_and_get_next_item(sample, issue):
    """Record ``sample`` as a bad example, report it, and move to the next one.

    The sample, extended with an ``"issue"`` entry, is appended as one JSON
    line to the bad-examples file of the module-level ``dataset``. The same
    information is then handed to ``report_result_dataset`` (with an empty
    annotator) and the session index is advanced via ``get_next_item``.

    Args:
        sample: Dict describing the example; must contain a ``"text"`` entry.
            It is left unmodified.
        issue: Description of the problem. ``None`` or an empty string is
            recorded as the literal string ``"None"``.

    Raises:
        KeyError: If ``sample`` has no ``"text"`` entry.
    """
    if issue in (None, ""):
        issue = "None"

    # Work on derived dicts so the caller's sample is never mutated.
    flagged = {**sample, "issue": issue}
    with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
        f.write(json.dumps(flagged) + "\n")

    text = sample["text"]
    # Everything except the text and the issue is reported as metadata.
    metadata = {
        key: value for key, value in sample.items() if key not in ("text", "issue")
    }
    # Prefer an explicit id, fall back to the title, else report no id.
    sample_id = sample.get("id", sample.get("title", ""))

    report_result_dataset(dataset, sample_id, text, str(metadata), issue, "")

    get_next_item()


# Datasets that can be browsed; each needs a matching
# "<name>_examples_with_stats.json" file under DATA_PATH.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
dataset = st.sidebar.selectbox("Dataset", datasets)
data = load_jsonl(f"{DATA_PATH}/{dataset}_examples_with_stats.json")

# File that collects the flagged examples of the selected dataset.
bad_examples_file = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"


def _clear_file(path):
    """Truncate the file at ``path`` to zero length."""
    with open(path, "w"):
        pass


# Create the directory and the file if they do not exist yet, so the download
# button below always has something to read.
os.makedirs(BAD_EXAMPLES_PATH, exist_ok=True)
with open(bad_examples_file, "a"):
    pass

# Dropping "idx" makes the initialisation at the top of the script start over.
st.sidebar.button("Reset Index", on_click=lambda: st.session_state.__delitem__("idx"))

# The file is only read here, so read-only mode is sufficient.
with open(bad_examples_file, "r") as f:
    st.sidebar.download_button(
        "Download bad example JSON file", f, file_name=f"{dataset}_bad_examples.jsonl"
    )

# The path is bound through ``args`` so the callback clears the file of the
# dataset that was selected when the button was rendered.
st.sidebar.button(
    "Clear bad examples file",
    on_click=_clear_file,
    args=(bad_examples_file,),
)

# Session-state key of the form's free-text field. The BAD callback reads the
# submitted text through this key.
ISSUE_INPUT_KEY = "issue_text"


def _flag_sample_from_form(sample):
    """BAD submit callback: flag ``sample`` with the text typed into the form.

    The value returned by ``st.text_input`` while the form is rendered is the
    value from before the user typed anything, so passing it through ``args``
    would always record an empty issue. The submitted value is read from
    session state at callback time instead.
    """
    save_flag_and_get_next_item(sample, st.session_state.get(ISSUE_INPUT_KEY, ""))


# The index can run past the data, either after the last example or after
# switching to a shorter dataset. Stop with a message instead of an IndexError.
if st.session_state.idx >= len(data):
    st.info("No more examples in this dataset. Use 'Reset Index' to start over.")
    st.stop()

with st.form(key="bad_form", clear_on_submit=True):
    sample = data[st.session_state.idx]
    text = sample["text"]
    st.text_area(f"text id: {st.session_state.idx}", text, height=500)

    issue = st.text_input(
        "What's wrong with this example? (leave blank if example is fine)",
        key=ISSUE_INPUT_KEY,
    )

    # GOOD only moves on; BAD records the sample first and then moves on.
    good = st.form_submit_button(
        "GOOD",
        on_click=get_next_item,
    )
    bad = st.form_submit_button(
        "BAD",
        on_click=_flag_sample_from_form,
        args=(sample,),
    )