Ali Elfilali committed
Commit 57036fc • 1 Parent(s): 79b96ed

Upload 2 files

Files changed (2)
  1. app.py +432 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,432 @@
+ import os
+ import json
+ import time
+ import uuid
+ import random
+ import datetime
+ import tempfile
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Union
+
+ import pandas as pd
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ import streamlit as st
+ import huggingface_hub as hf
+ from huggingface_hub import HfApi, login, CommitScheduler
+ from datasets import load_dataset
+ from openai import OpenAI
+
+ # Dataset repo
+ # DATA_PATH = "Dr-En-space-test.csv"
+ # DATA_REPO = "M-A-D/dar-en-space-test"
+ DATA_REPO = "M-A-D/DarijaBridge"
+
+ api = hf.HfApi()
+ # Read the write token from the environment (e.g. a Space secret); never hardcode it.
+ access_token_write = os.environ.get("HF_TOKEN")
+ login(token=access_token_write)
+ repo_id = "M-A-D/dar-en-space-test"
+
+ st.set_page_config(layout="wide")
+ # ParquetScheduler: batches appended rows and commits them to the Hub as parquet files
+ class ParquetScheduler(CommitScheduler):
+     """
+     Usage: configure the scheduler with a repo id. Once started, you can add data to be uploaded to the Hub. One
+     `.append` call results in one row in your final dataset.
+
+     ```py
+     # Start scheduler
+     >>> scheduler = ParquetScheduler(repo_id="my-parquet-dataset")
+
+     # Append some data to be uploaded
+     >>> scheduler.append({...})
+     >>> scheduler.append({...})
+     >>> scheduler.append({...})
+     ```
+
+     The scheduler will automatically infer the schema from the data it pushes.
+     Optionally, you can manually set the schema yourself:
+
+     ```py
+     >>> scheduler = ParquetScheduler(
+     ...     repo_id="my-parquet-dataset",
+     ...     schema={
+     ...         "prompt": {"_type": "Value", "dtype": "string"},
+     ...         "negative_prompt": {"_type": "Value", "dtype": "string"},
+     ...         "guidance_scale": {"_type": "Value", "dtype": "int64"},
+     ...         "image": {"_type": "Image"},
+     ...     },
+     ... )
+     ```
+
+     See https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Value for the list of
+     possible values.
+     """
+
+     def __init__(
+         self,
+         *,
+         repo_id: str,
+         schema: Optional[Dict[str, Dict[str, str]]] = None,
+         every: Union[int, float] = 5,  # minutes between commits
+         path_in_repo: Optional[str] = "data",
+         repo_type: Optional[str] = "dataset",
+         revision: Optional[str] = None,
+         private: bool = False,
+         token: Optional[str] = None,
+         allow_patterns: Union[List[str], str, None] = None,
+         ignore_patterns: Union[List[str], str, None] = None,
+         hf_api: Optional[HfApi] = None,
+     ) -> None:
+         super().__init__(
+             repo_id=repo_id,
+             folder_path="dummy",  # not used by the scheduler
+             every=every,
+             path_in_repo=path_in_repo,
+             repo_type=repo_type,
+             revision=revision,
+             private=private,
+             token=token,
+             allow_patterns=allow_patterns,
+             ignore_patterns=ignore_patterns,
+             hf_api=hf_api,
+         )
+
+         self._rows: List[Dict[str, Any]] = []
+         self._schema = schema
+
+     def append(self, row: Dict[str, Any]) -> None:
+         """Add a new item to be uploaded."""
+         with self.lock:
+             self._rows.append(row)
+
+     def push_to_hub(self):
+         # Check for new rows to push
+         with self.lock:
+             rows = self._rows
+             self._rows = []
+         if not rows:
+             return
+         print(f"Got {len(rows)} item(s) to commit.")
+
+         # Load images + create 'features' config for datasets library
+         schema: Dict[str, Dict] = self._schema or {}
+         path_to_cleanup: List[Path] = []
+         for row in rows:
+             for key, value in row.items():
+                 # Infer schema (for `datasets` library)
+                 if key not in schema:
+                     schema[key] = _infer_schema(key, value)
+
+                 # Load binary files if necessary
+                 if schema[key]["_type"] in ("Image", "Audio"):
+                     # It's an image or audio: we load the bytes and remember to clean up the file
+                     file_path = Path(value)
+                     if file_path.is_file():
+                         row[key] = {
+                             "path": file_path.name,
+                             "bytes": file_path.read_bytes(),
+                         }
+                         path_to_cleanup.append(file_path)
+
+         # Complete rows if needed
+         for row in rows:
+             for feature in schema:
+                 if feature not in row:
+                     row[feature] = None
+
+         # Export items to Arrow format
+         table = pa.Table.from_pylist(rows)
+
+         # Add metadata (used by datasets library)
+         table = table.replace_schema_metadata(
+             {"huggingface": json.dumps({"info": {"features": schema}})}
+         )
+
+         # Write to parquet file
+         archive_file = tempfile.NamedTemporaryFile()
+         pq.write_table(table, archive_file.name)
+
+         # Upload
+         self.api.upload_file(
+             repo_id=self.repo_id,
+             repo_type=self.repo_type,
+             revision=self.revision,
+             path_in_repo=f"{uuid.uuid4()}.parquet",
+             path_or_fileobj=archive_file.name,
+         )
+         print("Commit completed.")
+
+         # Cleanup
+         archive_file.close()
+         for path in path_to_cleanup:
+             path.unlink(missing_ok=True)
+
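`push_to_hub` calls an `_infer_schema` helper that is referenced but not defined anywhere in this diff, so the scheduler would raise a `NameError` on its first commit. A minimal sketch of what it would need to return, based on how the schema dict is consumed above (the exact dtype mapping here is an assumption, not part of the commit):

```py
def _infer_schema(key: str, value: Any) -> Dict[str, str]:
    """Best-effort guess of a `datasets` feature type for one value (assumed helper)."""
    if "image" in key:
        return {"_type": "Image"}
    if "audio" in key:
        return {"_type": "Audio"}
    if isinstance(value, bool):  # check bool before int: bool subclasses int
        return {"_type": "Value", "dtype": "bool"}
    if isinstance(value, int):
        return {"_type": "Value", "dtype": "int64"}
    if isinstance(value, float):
        return {"_type": "Value", "dtype": "float64"}
    return {"_type": "Value", "dtype": "string"}  # fall back to string
```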
+ # Define the ParquetScheduler instance with your repo details
+ scheduler = ParquetScheduler(repo_id=repo_id)
+
+
+ # Append one translation row to the ParquetScheduler queue
+ def append_translation_data(original, translation, translated, corrected=False):
+     data = {
+         "original": original,
+         "translation": translation,
+         "translated": translated,
+         "corrected": corrected,
+         "timestamp": datetime.datetime.utcnow().isoformat(),
+         "id": str(uuid.uuid4()),  # Unique identifier for each translation
+     }
+     scheduler.append(data)
+
+
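For illustration only (the Darija sentence below is made up, not from the dataset), a call queues one row that the scheduler then commits on its next scheduled push:

```py
# Hypothetical example row; the scheduler batches it and uploads a parquet
# file to the Hub on its next run (every 5 minutes by default).
append_translation_data(
    original="واش كلشي مزيان؟",
    translation="Is everything okay?",
    translated=True,
)
```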
+ # Load data
+ def load_data():
+     return pd.DataFrame(load_dataset(DATA_REPO, download_mode="force_redownload", split="test"))
+
+ # def save_data(data):
+ #     data.to_csv(DATA_PATH, index=False)
+ #     # to_save = datasets.Dataset.from_pandas(data)
+ #     api.upload_file(
+ #         path_or_fileobj="./Dr-En-space-test.csv",
+ #         path_in_repo="Dr-En-space-test.csv",
+ #         repo_id=DATA_REPO,
+ #         repo_type="dataset",
+ #     )
+ #     # to_save.push_to_hub(DATA_REPO)
+
+ def skip_correction():
+     noncorrected_sentences = st.session_state.data[(st.session_state.data.translated == True) & (st.session_state.data.corrected == False)]["sentence"].tolist()
+     if noncorrected_sentences:
+         st.session_state.orig_sentence = random.choice(noncorrected_sentences)
+         # .values[0] extracts the scalar translation rather than a pandas Series
+         st.session_state.orig_translation = st.session_state.data[
+             st.session_state.data.sentence == st.session_state.orig_sentence
+         ]["translation"].values[0]
+     else:
+         st.session_state.orig_sentence = "No more sentences to be corrected"
+         st.session_state.orig_translation = "No more sentences to be corrected"
+
+ st.title("Darija Translation Corpus Collection")
+
+ st.markdown("""
+ **What This Space Is For:**
+ - **Translating Darija to English:** Add your translations here.
+ - **Correcting Translations:** Review and correct existing translations.
+ - **Using GPT-4 for Auto-Translation:** Try auto-translating Darija sentences.
+ - **Helping Develop Darija Language Resources:** Your translations make a difference.
+
+ **How to Contribute:**
+ - **Choose a Tab:** Translation, Correction, or Auto-Translate.
+ - **Add or Correct Translations:** Use the text areas to enter translations.
+ - **Save Your Work:** Click 'Save' to submit.
+
+ **Every Contribution Counts! Let's make Darija GREAT!**
+ """)
+
+ if "data" not in st.session_state:
+     st.session_state.data = load_data()
+
+ if "sentence" not in st.session_state:
+     untranslated_sentences = st.session_state.data[st.session_state.data["translated"] == False]["sentence"].tolist()
+     if untranslated_sentences:
+         st.session_state.sentence = random.choice(untranslated_sentences)
+     else:
+         st.session_state.sentence = "No more sentences to translate"
+
+ if "orig_translation" not in st.session_state:
+     pending_mask = (st.session_state.data.translated == True) & (st.session_state.data.corrected == False)
+     noncorrected_sentences = st.session_state.data[pending_mask]["sentence"].tolist()
+
+     if noncorrected_sentences:
+         st.session_state.orig_sentence = random.choice(noncorrected_sentences)
+         st.session_state.orig_translation = st.session_state.data.loc[
+             st.session_state.data.sentence == st.session_state.orig_sentence
+         ]["translation"].values[0]
+     else:
+         st.session_state.orig_sentence = "No more sentences to be corrected"
+         st.session_state.orig_translation = "No more sentences to be corrected"
+
+ if "user_translation" not in st.session_state:
+     st.session_state.user_translation = ""
+
+ # with st.sidebar:
+ #     st.subheader("About")
+ #     st.markdown("""This app is designed to collect a Darija translation corpus.""")
+
+ tab1, tab2, tab3 = st.tabs(["Translation", "Correction", "Auto-Translate"])
+
+ with tab1:
+     with st.container():
+         st.subheader("Original Text:")
+         st.write('<div style="height: 150px; overflow: auto; border: 2px solid #ddd; padding: 10px; border-radius: 5px;">{}</div>'.format(st.session_state.sentence), unsafe_allow_html=True)
+
+     st.subheader("Translation:")
+     st.session_state.user_translation = st.text_area("Enter your translation here:", value=st.session_state.user_translation)
+
+     if st.button("💾 Save"):
+         if st.session_state.user_translation:
+             # Append data to be saved
+             append_translation_data(
+                 original=st.session_state.sentence,
+                 translation=st.session_state.user_translation,
+                 translated=True,
+             )
+             st.session_state.user_translation = ""
+             # st.toast("Saved!", icon="👍")
+             st.success("Saved!")
+
+             # Update the sentence for the next iteration.
+             untranslated_sentences = st.session_state.data[st.session_state.data["translated"] == False]["sentence"].tolist()
+             if untranslated_sentences:
+                 st.session_state.sentence = random.choice(untranslated_sentences)
+             else:
+                 st.session_state.sentence = "No more sentences to translate"
+
+             time.sleep(0.5)
+             # Rerun the app
+             st.rerun()
+
+ with tab2:
+     with st.container():
+         st.subheader("Original Darija Text:")
+         st.write('<div style="height: 150px; overflow: auto; border: 2px solid #ddd; padding: 10px; border-radius: 5px;">{}</div>'.format(st.session_state.orig_sentence), unsafe_allow_html=True)
+
+     with st.container():
+         st.subheader("Original English Translation:")
+         st.write('<div style="height: 150px; overflow: auto; border: 2px solid #ddd; padding: 10px; border-radius: 5px;">{}</div>'.format(st.session_state.orig_translation), unsafe_allow_html=True)
+
+     st.subheader("Corrected English Translation:")
+     corrected_translation = st.text_area("Enter the corrected English translation here:")
+
+     if st.button("💾 Save Translation"):
+         if corrected_translation:
+             # Append data to be saved
+             append_translation_data(
+                 original=st.session_state.orig_sentence,
+                 translation=corrected_translation,
+                 translated=True,
+                 corrected=True,
+             )
+             st.success("Saved!")
+
+             # Update the sentence pair for the next iteration.
+             noncorrected_sentences = st.session_state.data[(st.session_state.data.translated == True) & (st.session_state.data.corrected == False)]["sentence"].tolist()
+             if noncorrected_sentences:
+                 st.session_state.orig_sentence = random.choice(noncorrected_sentences)
+                 # .values[0] extracts the scalar translation rather than a pandas Series
+                 st.session_state.orig_translation = st.session_state.data[
+                     st.session_state.data.sentence == st.session_state.orig_sentence
+                 ]["translation"].values[0]
+             else:
+                 st.session_state.orig_translation = "No more sentences to be corrected"
+
+             corrected_translation = ""  # Reset the input value after saving
+
+     st.button("⏩ Skip to the Next Pair", key="skip_button", on_click=skip_correction)
+
+ with tab3:
+     st.subheader("Auto-Translate")
+
+     # User input for OpenAI API key
+     openai_api_key = st.text_input("Paste your OpenAI API key:")
+
+     # Slider for the user to choose the number of samples to translate
+     num_samples = st.slider("Select the number of samples to translate", min_value=1, max_value=100, value=10)
+
+     # Estimated cost display, at a flat $0.0012 per sample (e.g. 10 samples ≈ $0.012)
+     cost = num_samples * 0.0012
+     st.write(f"The estimated cost for translating {num_samples} samples is: ${cost:.4f}")
+
+     if st.button("Do the MAGIC with Auto-Translate ✨"):
+         if openai_api_key:
+             client = OpenAI(
+                 # defaults to os.environ.get("OPENAI_API_KEY")
+                 api_key=openai_api_key,
+             )
+
+             # Get the requested number of samples from the dataset for translation
+             samples_to_translate = st.session_state.data.sample(num_samples)["sentence"].tolist()
+
+             # System prompt for the translation assistant
+             translation_system_prompt = """
+             You are a native speaker of both Moroccan Arabic (Darija) and English. You are an expert in translation from Moroccan Arabic (Darija) into English.
+             """
+
+             auto_translations = []
+
+             for sentence in samples_to_translate:
+                 # Create messages for the chat model
+                 messages = [
+                     {"role": "system", "content": translation_system_prompt},
+                     {"role": "user", "content": f"Translate the following sentence from Moroccan Arabic (Darija) to English, only return the translated sentence: '{sentence}'"},
+                 ]
+
+                 # Perform the automatic translation via the chat completions API
+                 response = client.chat.completions.create(
+                     # model="gpt-3.5-turbo",
+                     model="gpt-4-1106-preview",
+                     messages=messages,
+                 )
+
+                 # Extract the translated text (openai>=1.0 returns objects, not dicts)
+                 translated_text = response.choices[0].message.content.strip()
+
+                 # Append the translated text to the list
+                 auto_translations.append(translated_text)
+
+             # Update the dataset with auto-translations, mapping by sentence so each
+             # translation lands on its own row regardless of sampling order
+             translation_map = dict(zip(samples_to_translate, auto_translations))
+             mask = st.session_state.data["sentence"].isin(samples_to_translate)
+             st.session_state.data.loc[mask, "translation"] = st.session_state.data.loc[mask, "sentence"].map(translation_map)
+
+             # Append each auto-translated pair to be saved; left as corrected=False
+             # so the pairs stay in the human-correction queue
+             for sentence, translation in zip(samples_to_translate, auto_translations):
+                 append_translation_data(
+                     original=sentence,
+                     translation=translation,
+                     translated=True,
+                     corrected=False,
+                 )
+
+             st.success("Auto-Translations saved!")
+
+         else:
+             st.warning("Please paste your OpenAI API key.")
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ openai==1.2.2
+ huggingface_hub
+ datasets
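Note that app.py also imports streamlit, pandas, and pyarrow, which this requirements.txt does not list. Streamlit Spaces preinstall streamlit, but for a reproducible environment the file would plausibly need to be extended along these lines (unpinned versions are an assumption):

```
openai==1.2.2
huggingface_hub
datasets
streamlit
pandas
pyarrow
```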