import streamlit as st
from datasets import load_dataset
import random
import json
import os
from pathlib import Path
from huggingface_hub import HfApi, CommitScheduler
import uuid
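
# Read the Hub token from the environment (e.g. a Space secret);
# HfApi falls back to locally cached credentials when it is unset.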
HF_API_KEY = os.environ.get("HF_TOKEN", None)
api = HfApi(token=HF_API_KEY)
REPO_ID = "imomayiz/darija-english"
DATASET_REPO_URL = f"https://huggingface.co/datasets/{REPO_ID}"
submissions_folder = "submissions"
# Each session writes to its own uniquely named JSONL file so that
# concurrent users never write to the same file.
submissions_file = Path(submissions_folder) / f"submissions_{uuid.uuid4()}.json"
os.makedirs(submissions_folder, exist_ok=True)
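
# CommitScheduler pushes the contents of `submissions_folder` to the dataset
# repo from a background thread; `every=1` means roughly every minute.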
scheduler = CommitScheduler(
    token=HF_API_KEY,
    hf_api=api,
    repo_id=REPO_ID,
    repo_type="dataset",
    folder_path=submissions_folder,
    path_in_repo=submissions_folder,
    every=1,
)


def load_data(repo_id):
    dataset = load_dataset(repo_id, name="sentences", split="sentences")
    return dataset
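
# For larger splits, load_data could be wrapped with @st.cache_data so that
# Streamlit reruns don't re-download the dataset on every interaction.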


def fetch_sentence(dataset, column_name="darija_ar"):
    # Get a random sentence and reset the translation inputs
    random_sentence_index = random.randint(0, len(dataset) - 1)
    random_sentence = dataset[random_sentence_index][column_name]
    st.session_state.sentence = random_sentence
    st.session_state.translation_input = ""
    st.session_state.translation_input_fr = ""
    return random_sentence


def store_submission(api: HfApi, sentence: str, translation: str, translation_fr: str):
    """
    Append the sentence and its translations to a JSON Lines file,
    holding the scheduler's lock to avoid concurrent writes
    from different users.
    """
    with scheduler.lock:
        with submissions_file.open("a", encoding="utf-8") as f:
            f.write(json.dumps({
                "darija": translation_fr,
                "eng": translation,
                "darija_ar": sentence,
            }, ensure_ascii=False))
            f.write("\n")
    st.success(
        f"""Translation submitted successfully to
        {DATASET_REPO_URL}/tree/main/{submissions_folder}"""
    )
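
# The scheduler commits these JSONL files to the dataset repo; if needed, they
# can be read back with e.g. load_dataset("json", data_files="submissions/*.json").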


# Load the dataset
dataset = load_data(REPO_ID)

if "sentence" not in st.session_state:
    st.session_state.sentence = fetch_sentence(dataset)
if "translation_input" not in st.session_state:
    st.session_state.translation_input = ""
if "translation_input_fr" not in st.session_state:
    st.session_state.translation_input_fr = ""
if "display_new" not in st.session_state:
    st.session_state.display_new = False
st.title("Translate From Arabic to English")
st.markdown(
    """This mini-app allows you to contribute to the **darija-english** dataset
    as part of the [DODa](https://darija-open-dataset.github.io/) project.
    To contribute, simply translate the given sentence from Arabic to English.
    The translated sentence will be submitted to the dataset
    [here](https://huggingface.co/datasets/imomayiz/darija-english)."""
)
st.divider()

st.write(f"""
    <div style="
        padding: 5px;
        border: 1px solid #000000;
        border-radius: 5px;
    ">
    <p style="font-size: 20px;">{st.session_state.sentence}.</p>
    </div>""", unsafe_allow_html=True)
# Display new sentence button
st.session_state.display_new = st.button(
    "New Sentence", on_click=fetch_sentence, args=(dataset,)
)
# Input field for the English translation
translation_input_placeholder = st.empty()
with translation_input_placeholder.container():
    translation_input = st.text_input(
        "Enter translation to English: ",
        st.session_state.translation_input,
    )
    st.session_state.translation_input = translation_input
# Input field for the Darija transliteration in Latin characters
translation_input_placeholder_fr = st.empty()
with translation_input_placeholder_fr.container():
    translation_input_fr = st.text_input(
        "Enter translation to darija in latin characters: ",
        st.session_state.translation_input_fr,
    )
    st.session_state.translation_input_fr = translation_input_fr
if st.button("Submit Translation"):
    # Require both translations before accepting the submission
    if (not st.session_state.translation_input_fr
            or not st.session_state.translation_input):
        st.warning("Please enter a translation before submitting.")
    else:
        store_submission(
            api,
            st.session_state.sentence,
            st.session_state.translation_input,
            st.session_state.translation_input_fr,
        )