import streamlit as st
from datasets import load_dataset
import random
import json
import os
from pathlib import Path
from huggingface_hub import HfApi, CommitScheduler
import uuid
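
# Read the Hub token from the environment (e.g. a Space secret);
# HfApi falls back to locally cached credentials when it is unset.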
HF_API_KEY = os.environ.get("HF_TOKEN", None)
api = HfApi(token=HF_API_KEY)
REPO_ID = "imomayiz/darija-english"
DATASET_REPO_URL = f"https://huggingface.co/datasets/{REPO_ID}"
submissions_folder = "submissions"
# Each session writes to its own uniquely named JSONL file so that
# concurrent users never write to the same file.
submissions_file = Path(submissions_folder) / f"submissions_{uuid.uuid4()}.json"
os.makedirs(submissions_folder, exist_ok=True)
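
# CommitScheduler pushes the contents of `submissions_folder` to the dataset
# repo from a background thread; `every=1` means roughly every minute.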
scheduler = CommitScheduler(
    token=HF_API_KEY,
    hf_api=api,
    repo_id=REPO_ID,
    repo_type="dataset",
    folder_path=submissions_folder,
    path_in_repo=submissions_folder,
    every=1,
)


def load_data(repo_id):
    dataset = load_dataset(repo_id, name="sentences", split="sentences")
    return dataset
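
# For larger splits, load_data could be wrapped with @st.cache_data so that
# Streamlit reruns don't re-download the dataset on every interaction.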


def fetch_sentence(dataset, column_name="darija_ar"):
    # Get a random sentence and reset the translation inputs
    random_sentence_index = random.randint(0, len(dataset) - 1)
    random_sentence = dataset[random_sentence_index][column_name]
    st.session_state.sentence = random_sentence
    st.session_state.translation_input = ""
    st.session_state.translation_input_fr = ""
    return random_sentence


def store_submission(api: HfApi, sentence: str, translation: str, translation_fr: str):
    """
    Append the sentence and its translations to a JSON Lines file,
    holding the scheduler's lock to avoid concurrent writes
    from different users.
    """
    with scheduler.lock:
        with submissions_file.open("a", encoding="utf-8") as f:
            f.write(json.dumps({
                "darija": translation_fr,
                "eng": translation,
                "darija_ar": sentence,
            }, ensure_ascii=False))
            f.write("\n")
    st.success(
        f"""Translation submitted successfully to
        {DATASET_REPO_URL}/tree/main/{submissions_folder}"""
    )
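
# The scheduler commits these JSONL files to the dataset repo; if needed, they
# can be read back with e.g. load_dataset("json", data_files="submissions/*.json").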


# Load the dataset
dataset = load_data(REPO_ID)

if "sentence" not in st.session_state:
    st.session_state.sentence = fetch_sentence(dataset)
if "translation_input" not in st.session_state:
    st.session_state.translation_input = ""
if "translation_input_fr" not in st.session_state:
    st.session_state.translation_input_fr = ""
if "display_new" not in st.session_state:
    st.session_state.display_new = False
st.title("Translate From Arabic to English")
st.markdown(
    """This mini-app allows you to contribute to the **darija-english** dataset
    as part of the [DODa](https://darija-open-dataset.github.io/) project.
    To contribute, simply translate the given sentence from Arabic to English.
    The translated sentence will be submitted to the dataset
    [here](https://huggingface.co/datasets/imomayiz/darija-english)."""
)
st.divider()

st.write(f"""
    <div style="
        padding: 5px;
        border: 1px solid #000000;
        border-radius: 5px;
    ">
    <p style="font-size: 20px;">{st.session_state.sentence}.</p>
    </div>""", unsafe_allow_html=True)
# Display new sentence button
st.session_state.display_new = st.button(
    "New Sentence", on_click=fetch_sentence, args=(dataset,)
)
# Input field for the English translation
translation_input_placeholder = st.empty()
with translation_input_placeholder.container():
    translation_input = st.text_input(
        "Enter translation to English: ",
        st.session_state.translation_input,
    )
    st.session_state.translation_input = translation_input
# Input field for the Darija transliteration in Latin characters
translation_input_placeholder_fr = st.empty()
with translation_input_placeholder_fr.container():
    translation_input_fr = st.text_input(
        "Enter translation to darija in latin characters: ",
        st.session_state.translation_input_fr,
    )
    st.session_state.translation_input_fr = translation_input_fr
if st.button("Submit Translation"):
    # Require both translations before accepting the submission
    if (not st.session_state.translation_input_fr
            or not st.session_state.translation_input):
        st.warning("Please enter a translation before submitting.")
    else:
        store_submission(
            api,
            st.session_state.sentence,
            st.session_state.translation_input,
            st.session_state.translation_input_fr,
        )