File size: 5,254 Bytes
9481a42
 
 
 
eeeb878
9481a42
eeeb878
 
 
 
9481a42
 
c5139c6
 
9481a42
 
 
eeeb878
 
4821540
eeeb878
2c0f00c
 
 
 
 
 
 
 
 
ae41ba2
 
2c0f00c
 
 
 
ae41ba2
9481a42
 
f249f4d
9481a42
 
 
 
 
 
 
 
0ecaa8b
 
 
 
9481a42
 
eeeb878
 
 
 
 
bcb5974
eeeb878
 
 
bcb5974
eeeb878
 
bcb5974
eeeb878
 
 
 
 
 
 
2c0f00c
 
 
 
eeeb878
2c0f00c
 
 
 
 
 
eeeb878
bcb5974
 
eeeb878
bcb5974
9481a42
 
 
 
 
 
5ef45dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eeeb878
5ef45dd
 
 
 
 
 
 
 
 
9481a42
 
5ef45dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bcb5974
eeeb878
bcb5974
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import streamlit as st
from datasets import load_dataset
import datetime as dt
import random
import json
import os
from huggingface_hub import HfApi, CommitScheduler
import uuid


# Hugging Face access token taken from the environment (None when HF_TOKEN is unset).
HF_API_KEY = os.environ.get("HF_TOKEN")

api = HfApi(token=HF_API_KEY)

# Dataset repository that receives the submissions, and its public URL.
REPO_ID = "imomayiz/darija-english"
DATASET_REPO_URL = f"https://huggingface.co/datasets/{REPO_ID}"

# Local folder holding submission files; the file name carries a fresh UUID
# every time this code runs.
submissions_folder = "submissions"
os.makedirs(submissions_folder, exist_ok=True)
submissions_file = os.path.join(
    submissions_folder,
    f"submissions_{uuid.uuid4()}.json",
)

# Scheduler that pushes the local submissions folder to the same path inside
# the dataset repository.
# NOTE(review): Streamlit re-executes the script on each interaction, so this
# presumably builds a new scheduler per rerun -- consider caching it; confirm.
scheduler = CommitScheduler(
    repo_id=REPO_ID,
    repo_type="dataset",
    folder_path=submissions_folder,
    path_in_repo=submissions_folder,
    every=1,  # presumably the commit interval in minutes -- see huggingface_hub docs
    token=HF_API_KEY,
    hf_api=api,
)

# Define the ParquetScheduler instance with your repo details
# scheduler = ParquetScheduler(repo_id=REPO_ID, 
#                              token=HF_API_KEY, every=1, 
#                              path_in_repo=submissions_folder, 
#                              repo_type="dataset")


def load_data(repo_id):
    """Load the ``sentences`` split of the ``sentences`` config of *repo_id*.

    Returns whatever ``datasets.load_dataset`` returns for that split.
    """
    repo_name = f"{repo_id}"
    return load_dataset(repo_name, name="sentences", split="sentences")

def fetch_sentence(dataset, column_name="darija_ar"):
    """Pick a random sentence from *dataset* and reset the translation inputs.

    The ``column_name`` value of a randomly chosen row is stored in
    ``st.session_state.sentence``; both translation fields in the session
    state are cleared to empty strings.

    Returns the selected sentence.
    """
    last_index = len(dataset) - 1
    chosen_row = dataset[random.randint(0, last_index)]
    picked = chosen_row[column_name]

    # Clear the previous answers so the inputs start empty for the new sentence.
    st.session_state.translation_input = ""
    st.session_state.translation_input_fr = ""
    st.session_state.sentence = picked

    return picked

def store_submission(api: HfApi, sentence: str, translation: str, translation_fr: str):
    """
    Append one submitted translation to the local JSON Lines submissions file.

    The record is written while holding ``scheduler.lock``, then a success
    message pointing at the submissions folder of the dataset repo is shown.

    Args:
        api: Not used by this function; kept so existing callers keep working.
        sentence: Source sentence, stored under the ``"darija_ar"`` key.
        translation: English translation, stored under the ``"eng"`` key.
        translation_fr: Darija in Latin characters, stored under ``"darija"``.
    """
    record = {
        "darija": translation_fr,
        "eng": translation,
        "darija_ar": sentence,
    }

    with scheduler.lock:
        # submissions_file is a plain string path (built with os.path.join),
        # which has no .open() method, so use the built-in open() instead.
        with open(submissions_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(record))
            f.write("\n")

    st.success(
        f"""Translation submitted successfully to 
        {DATASET_REPO_URL}/tree/main/{submissions_folder}"""
        )


# Load the dataset
dataset = load_data(REPO_ID)


# Seed the session state on first run. fetch_sentence() also stores the
# sentence and clears both translation fields itself.
if "sentence" not in st.session_state:
    st.session_state.sentence = fetch_sentence(dataset)
for _state_key, _state_default in (
    ("translation_input", ""),
    ("translation_input_fr", ""),
    ("display_new", False),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default

st.title("Translate From Arabic to English")

# Introductory blurb shown under the title.
_intro_markdown = """This mini-app allows you to contribute to the **darija-english** dataset 
as part of [DODa](https://darija-open-dataset.github.io/)
project. To contribute, simply translate the given sentence from Arabic to English.
The translated sentence will be submitted to the dataset 
[here](https://huggingface.co/datasets/imomayiz/darija-english)."""
st.markdown(_intro_markdown)

st.divider()

# Bordered card showing the sentence to translate (rendered as raw HTML).
_sentence_card_html = f"""
    <div style="
        padding: 5px;
        border: 1px solid #000000;
        border-radius: 5px;
    ">
        <p style="font-size: 20px;">{st.session_state.sentence}.</p>
    </div>"""
st.write(_sentence_card_html, unsafe_allow_html=True)


# Button that swaps in a new random sentence through the fetch_sentence callback.
st.session_state.display_new = st.button(
    "New Sentence",
    on_click=fetch_sentence,
    args=(dataset,),
)


# English translation input; its value is mirrored into the session state.
translation_input_placeholder = st.empty()

with translation_input_placeholder.container():
    translation_input = st.text_input(
        "Enter translation to english: ",
        value=st.session_state.translation_input,
    )
    st.session_state.translation_input = translation_input

# Latin-script darija input; its value is mirrored into the session state.
translation_input_placeholder_fr = st.empty()

with translation_input_placeholder_fr.container():
    translation_input_fr = st.text_input(
        "Enter translation to darija in latin characters: ",
        value=st.session_state.translation_input_fr,
    )
    st.session_state.translation_input_fr = translation_input_fr

if st.button("Submit Translation"):
    # Accept the submission when at least one translation field is filled in.
    # The grouping matters: `not a or b` parses as `(not a) or b`, which would
    # show the warning whenever an English translation is present.
    has_translation = bool(
        st.session_state.translation_input_fr
        or st.session_state.translation_input
    )
    if not has_translation:
        st.warning("Please enter a translation before submitting.")
    else:
        store_submission(
            api,
            st.session_state.sentence,
            st.session_state.translation_input,
            st.session_state.translation_input_fr,
        )