Imane Momayiz committed
Commit d9514f5
1 Parent(s): 3c66851
Files changed (4)
  1. app.py +15 -90
  2. src/components.py +48 -130
  3. src/layout.py +5 -0
  4. src/utils.py +31 -0
app.py CHANGED
@@ -1,24 +1,17 @@
- import streamlit as st
- from datasets import load_dataset
- import datetime as dt
- import random
- import json
import os
+ import streamlit as st
from huggingface_hub import HfApi, CommitScheduler
- import uuid
-
+ from src.components import (
+     load_data, fetch_sentence, store_submission,
+     REPO_ID, submissions_folder)
+ from src.layout import INTRO_TEXT

+ # setup
HF_API_KEY = os.environ.get("HF_TOKEN", None)
-
api = HfApi(token=HF_API_KEY)
-
- REPO_ID = "imomayiz/darija-english"
- DATASET_REPO_URL = f"https://huggingface.co/datasets/{REPO_ID}"
-
- submissions_folder = "submissions"
- submissions_file = os.path.join(submissions_folder, f"submissions_{uuid.uuid4()}.json")
os.makedirs(submissions_folder, exist_ok=True)

+ # Create a commit scheduler
scheduler = CommitScheduler(
    token=HF_API_KEY,
    hf_api=api,
@@ -29,71 +22,10 @@ scheduler = CommitScheduler(
    every=1,
)

- # Define the ParquetScheduler instance with your repo details
- # scheduler = ParquetScheduler(repo_id=REPO_ID,
- # token=HF_API_KEY, every=1,
- # path_in_repo=submissions_folder,
- # repo_type="dataset")
-
-
- def load_data(repo_id):
-     dataset = load_dataset(f'{repo_id}', name='sentences', split='sentences')
-     return dataset
-
- def fetch_sentence(dataset, column_name="darija_ar"):
-
-     # Get a random sentence
-     random_sentence_index = random.randint(0, len(dataset) - 1)
-     random_sentence = dataset[random_sentence_index][column_name]
-
-     st.session_state.sentence = random_sentence
-     st.session_state.translation_input = ""
-     st.session_state.translation_input_fr = ""
-
-     return random_sentence
-
- def store_submission(api: HfApi, sentence: str, translation: str, translation_fr: str):
-     """
-     Append input/outputs and user feedback to a JSON Lines file
-     using a thread lock to avoid concurrent writes from different users.
-     """
-     ts = dt.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
-     # folder_path = "submissions"
-     # os.makedirs(folder_path, exist_ok=True)
-     # filename = os.path.join(folder_path, f"submissions_{ts}.txt")
-
-     # with open(filename, "w", encoding="utf-8") as f:
-     #     f.write(f"darija,eng,darija_ar\n{sentence},{translation},{translation_fr}")
-
-     # api.upload_file(
-     #     path_or_fileobj=filename,
-     #     path_in_repo=filename,
-     #     repo_id=REPO_ID,
-     #     repo_type="dataset",
-     # )
-
-     with scheduler.lock:
-         with open(submissions_file, "a") as f:
-             f.write(json.dumps({
-                 "darija": translation_fr,
-                 "eng": translation,
-                 "darija_ar": sentence}))
-             f.write("\n")
-
-     # scheduler.append({"darija": translation_fr,
-     #                   "eng": translation,
-     #                   "darija_ar": sentence})
-
-     st.success(
-         f"""Translation submitted successfully to
-         {DATASET_REPO_URL}/tree/main/{submissions_folder}"""
-     )
-
-
# Load the dataset
dataset = load_data(REPO_ID)

-
+ # Initialize session state
if "sentence" not in st.session_state:
    st.session_state.sentence = fetch_sentence(dataset)
if 'translation_input' not in st.session_state:
@@ -105,13 +37,7 @@ if 'display_new' not in st.session_state:

st.title("Translate From Arabic to English")

- st.markdown(
-     """This mini-app allows you to contribute to the **darija-english** dataset
-     as part of [DODa](https://darija-open-dataset.github.io/)
-     project. To contribute, simply translate the given sentence from Arabic to English.
-     The translated sentence will be submitted to the dataset
-     [here](https://huggingface.co/datasets/imomayiz/darija-english)."""
- )
+ st.markdown(INTRO_TEXT, unsafe_allow_html=True)

st.divider()

@@ -132,21 +58,20 @@ st.session_state.display_new = st.button("New Sentence",


# Input field for translation
- translation_input_placeholder = st.empty()
-
- translation_input = st.text_input("Enter translation to english: ",
-                                   st.session_state.translation_input)
+ translation_input = st.text_input(
+     "Enter translation to english: ",
+     st.session_state.translation_input
+ )
st.session_state.translation_input = translation_input

- # Input field for translation
- translation_input_placeholder_fr = st.empty()
-
+ # Input field for translation in latin characters
translation_input_fr = st.text_input(
    "Enter translation to darija in latin characters: ",
    st.session_state.translation_input_fr
)
st.session_state.translation_input_fr = translation_input_fr

+ # Submit button
if st.button("Submit Translation"):
    if st.session_state.translation_input_fr or st.session_state.translation_input:
        store_submission(api,
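The refactor keeps the `CommitScheduler` workflow from `huggingface_hub`: submissions are appended to a local JSON Lines file under `scheduler.lock`, and the scheduler pushes that folder to the dataset repo roughly every minute. A minimal sketch of the pattern, with placeholder repo and path arguments (the app's actual `CommitScheduler` arguments fall outside the hunks shown above):

```python
import json
from pathlib import Path

from huggingface_hub import CommitScheduler

# Placeholder values for illustration only; not the app's real configuration.
folder = Path("submissions")
folder.mkdir(exist_ok=True)

scheduler = CommitScheduler(
    repo_id="username/example-dataset",  # hypothetical dataset repo
    repo_type="dataset",
    folder_path=folder,          # local folder the scheduler pushes
    path_in_repo="submissions",  # destination folder inside the repo
    every=1,                     # minutes between pushes
)

def append_record(record: dict) -> None:
    # The scheduler's lock prevents writing while a push is in progress.
    with scheduler.lock:
        with (folder / "records.json").open("a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
```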
src/components.py CHANGED
@@ -1,140 +1,58 @@
- from huggingface_hub import HfApi, CommitScheduler
- from typing import Any, Dict, List, Optional, Union
+ import streamlit as st
+ import datetime as dt
+ import random
+ import json
+ import os
+ from huggingface_hub import CommitScheduler
+ from datasets import load_dataset
import uuid
- from pathlib import Path
- import json
- import tempfile
- import pyarrow as pa
- import pyarrow.parquet as pq


- # Initialize the ParquetScheduler
- class ParquetScheduler(CommitScheduler):
-     """
-     Usage: configure the scheduler with a repo id. Once started, you can add data to be uploaded to the Hub. 1 `.append`
-     call will result in 1 row in your final dataset.
-     ```py
-     # Start scheduler
-     >>> scheduler = ParquetScheduler(repo_id="my-parquet-dataset")
-     # Append some data to be uploaded
-     >>> scheduler.append({...})
-     >>> scheduler.append({...})
-     >>> scheduler.append({...})
-     ```
-     The scheduler will automatically infer the schema from the data it pushes.
-     Optionally, you can manually set the schema yourself:
-     ```py
-     >>> scheduler = ParquetScheduler(
-     ...     repo_id="my-parquet-dataset",
-     ...     schema={
-     ...         "prompt": {"_type": "Value", "dtype": "string"},
-     ...         "negative_prompt": {"_type": "Value", "dtype": "string"},
-     ...         "guidance_scale": {"_type": "Value", "dtype": "int64"},
-     ...         "image": {"_type": "Image"},
-     ...     },
-     ... )
-     See https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Value for the list of
-     possible values.
-     """
-
-     def __init__(
-         self,
-         *,
-         repo_id: str,
-         schema: Optional[Dict[str, Dict[str, str]]] = None,
-         every: Union[int, float] = 5,
-         path_in_repo: Optional[str] = "data",
-         repo_type: Optional[str] = "dataset",
-         revision: Optional[str] = None,
-         private: bool = False,
-         token: Optional[str] = None,
-         allow_patterns: Union[List[str], str, None] = None,
-         ignore_patterns: Union[List[str], str, None] = None,
-         hf_api: Optional[HfApi] = None,
-     ) -> None:
-         super().__init__(
-             repo_id=repo_id,
-             folder_path="dummy",  # not used by the scheduler
-             every=every,
-             path_in_repo=path_in_repo,
-             repo_type=repo_type,
-             revision=revision,
-             private=private,
-             token=token,
-             allow_patterns=allow_patterns,
-             ignore_patterns=ignore_patterns,
-             hf_api=hf_api,
-         )
-
-         self._rows: List[Dict[str, Any]] = []
-         self._schema = schema
-
-     def append(self, row: Dict[str, Any]) -> None:
-         """Add a new item to be uploaded."""
-         with self.lock:
-             self._rows.append(row)
-
-     def push_to_hub(self):
-         # Check for new rows to push
-         with self.lock:
-             rows = self._rows
-             self._rows = []
-         if not rows:
-             return
-         print(f"Got {len(rows)} item(s) to commit.")
+ REPO_ID = "imomayiz/darija-english"
+ DATASET_REPO_URL = f"https://huggingface.co/datasets/{REPO_ID}"

-         # Load images + create 'features' config for datasets library
-         schema: Dict[str, Dict] = self._schema or {}
-         path_to_cleanup: List[Path] = []
-         for row in rows:
-             for key, value in row.items():
-                 # Infer schema (for `datasets` library)
-                 if key not in schema:
-                     schema[key] = _infer_schema(key, value)
+ submissions_folder = "submissions"
+ submissions_file = os.path.join(submissions_folder, f"submissions_{uuid.uuid4()}.json")

-                 # Load binary files if necessary
-                 if schema[key]["_type"] in ("Image", "Audio"):
-                     # It's an image or audio: we load the bytes and remember to cleanup the file
-                     file_path = Path(value)
-                     if file_path.is_file():
-                         row[key] = {
-                             "path": file_path.name,
-                             "bytes": file_path.read_bytes(),
-                         }
-                         path_to_cleanup.append(file_path)

-         # Complete rows if needed
-         for row in rows:
-             for feature in schema:
-                 if feature not in row:
-                     row[feature] = None
+ def load_data(repo_id):
+     dataset = load_dataset(f'{repo_id}', name='sentences', split='sentences')
+     return dataset

-         # Export items to Arrow format
-         table = pa.Table.from_pylist(rows)
-
-         # Add metadata (used by datasets library)
-         table = table.replace_schema_metadata(
-             {"huggingface": json.dumps({"info": {"features": schema}})}
-         )
-
-         # Write to parquet file
-         archive_file = tempfile.NamedTemporaryFile()
-         pq.write_table(table, archive_file.name)
-
-         # Upload
-         self.api.upload_file(
-             repo_id=self.repo_id,
-             repo_type=self.repo_type,
-             revision=self.revision,
-             path_in_repo=f"{uuid.uuid4()}.parquet",
-             path_or_fileobj=archive_file.name,
-         )
-         print("Commit completed.")
-
-         # Cleanup
-         archive_file.close()
-         for path in path_to_cleanup:
-             path.unlink(missing_ok=True)
+ def fetch_sentence(dataset, column_name="darija_ar"):
+
+     # Get a random sentence
+     random_sentence_index = random.randint(0, len(dataset) - 1)
+     random_sentence = dataset[random_sentence_index][column_name]

+     st.session_state.sentence = random_sentence
+     st.session_state.translation_input = ""
+     st.session_state.translation_input_fr = ""

+     return random_sentence

+ def store_submission(
+     scheduler: CommitScheduler, sentence: str, translation: str, translation_fr: str
+ ):
+     """
+     Append input/outputs and user feedback to a JSON Lines file
+     using a thread lock to avoid concurrent writes from different users.
+     """
+     ts = dt.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
+
+     with scheduler.lock:
+         with open(submissions_file, "a") as f:
+             f.write(json.dumps({
+                 "darija": translation_fr,
+                 "eng": translation,
+                 "darija_ar": sentence,
+                 "timestamp": ts},
+                 ensure_ascii=False))
+             f.write("\n")
+
+     st.success(
+         f"""Translation submitted successfully.
+         You will see your commit in 1 minute at
+         {DATASET_REPO_URL}/tree/main/{submissions_folder}.
+         You can submit another translation or check the dataset."""
+     )
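Since `store_submission` writes JSON Lines files that the scheduler uploads under `submissions/`, they can later be read back with the `datasets` library. A sketch, assuming the uploaded files keep the `submissions/*.json` layout implied by the success message:

```python
from datasets import load_dataset

# Assumed layout: raw submission files under submissions/ in the dataset repo.
submissions = load_dataset(
    "imomayiz/darija-english",
    data_files="submissions/*.json",
    split="train",
)
print(submissions[0])  # {"darija": ..., "eng": ..., "darija_ar": ..., "timestamp": ...}
```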
src/layout.py ADDED
@@ -0,0 +1,5 @@
+ INTRO_TEXT = """This mini-app allows you to contribute to the **darija-english** dataset
+ as part of [DODa](https://darija-open-dataset.github.io/)
+ project. To contribute, simply translate the given sentence from Arabic to English.
+ The translated sentence will be submitted to the dataset
+ [here](https://huggingface.co/datasets/imomayiz/darija-english)."""
src/utils.py ADDED
@@ -0,0 +1,31 @@
+ from huggingface_hub import HfApi
+
+ def push_data_to_hf(repo_id, folder_path, path_in_repo, token=None):
+     """
+     Pushes data to a dataset on the Hugging Face Hub.
+
+     Parameters:
+     - repo_id (str): The ID of the repository on the Hugging Face Hub.
+     - folder_path (str): Local path to the folder containing the data.
+     - path_in_repo (str): Path within the repository where the data should be stored.
+     - token (str, optional): Your authentication token for the Hugging Face Hub.
+
+     Returns:
+     - str: URL of the uploaded data.
+     """
+
+     api = HfApi(token=token)
+
+     try:
+         api.upload_folder(
+             folder_path=folder_path,
+             repo_id=repo_id,
+             repo_type="dataset",
+             path_in_repo=path_in_repo,
+         )
+     except Exception as e:
+         return f"Error uploading data: {str(e)}"
+
+     url = f"https://huggingface.co/{repo_id}/raw/main/{path_in_repo}"
+
+     return f"Data successfully uploaded. Access it at: {url}"