= committed on
Commit
9b14109
·
1 Parent(s): 1daaf06

test database in huggingface

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ pages/provide_sentences.py
app.py CHANGED
@@ -74,7 +74,7 @@ st.markdown("Translation page 🔠")
74
  # max_len = models[version][translation_type]['max_len']
75
 
76
  # # let us get the best model
77
- # # @st.cache_resource
78
  # def get_modelfw_v3():
79
 
80
  # # recuperate checkpoints
 
74
  # max_len = models[version][translation_type]['max_len']
75
 
76
  # # let us get the best model
77
+ # @st.cache_resource
78
  # def get_modelfw_v3():
79
 
80
  # # recuperate checkpoints
pages/provide_sentences.py CHANGED
@@ -11,7 +11,7 @@ sentences = sentences_.copy()
11
  # get french and wolof sentences
12
  french_examples = pd.read_csv('wolof-translate/wolof_translate/data/sentences/french.csv')
13
 
14
- wolof_examples = pd.read_csv('wolof-translate/wolof_translate/data/sentences/wolof.csv')
15
 
16
  # add special characters from Wolof
17
  sp_wolof_chars = pd.read_csv('wolof-translate/wolof_translate/data/wolof_writing/wolof_special_chars.csv')
 
11
  # get french and wolof sentences
12
  french_examples = pd.read_csv('wolof-translate/wolof_translate/data/sentences/french.csv')
13
 
14
+ wolof_examples = pd.read_csv('wolof-translate/wolof_translate/data/sentences/wolof_2.csv')
15
 
16
  # add special characters from Wolof
17
  sp_wolof_chars = pd.read_csv('wolof-translate/wolof_translate/data/wolof_writing/wolof_special_chars.csv')
pages/provide_sentences_mongo.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from wolof_translate.utils.database_manager import TranslationMongoDBManager
2
+ import streamlit as st
3
+ import pandas as pd
4
+
5
+ st.markdown("Provide your own 🤗 sentences")
6
+
7
+ # let us initialize the database manager
8
+ @st.cache_resource
9
+ def get_cluster():
10
+
11
+ db_manager = TranslationMongoDBManager('mongodb+srv://oumar199:Jacksparrow360@woloftranslationcluster.u0gk7.mongodb.net/?retryWrites=true&w=majority', 'WolofTranslation')
12
+
13
+ return db_manager
14
+
15
+ db_manager = get_cluster()
16
+
17
+ # recuperate the already saved sentences (for the moment french/wolof sentences)
18
+ sentences_, deleted = db_manager.load_data_frames()
19
+
20
+ sentences = sentences_.copy()
21
+
22
+ # get french and wolof sentences
23
+ french_examples = pd.read_csv('wolof-translate/wolof_translate/data/sentences/french.csv')
24
+
25
+ wolof_examples = pd.read_csv('wolof-translate/wolof_translate/data/sentences/wolof_2.csv')
26
+
27
+ # add special characters from Wolof
28
+ sp_wolof_chars = pd.read_csv('wolof-translate/wolof_translate/data/wolof_writing/wolof_special_chars.csv')
29
+
30
+ # add definitions
31
+ sp_wolof_words = pd.read_csv('wolof-translate/wolof_translate/data/wolof_writing/definitions.csv')
32
+
33
+ sp_wolof_words.sort_values(by = ['french', 'wolof'], inplace = True)
34
+
35
+ # initialize the input texts
36
+ st.title("Provide sentences below ⤵️")
37
+
38
+ st.markdown("""---""")
39
+
40
+ # create two columns
41
+ left, right = st.columns(2)
42
+
43
+ # let us add callback functions to change the input text
44
+ def add_symbol_to_french():
45
+
46
+ st.session_state.left_sentence += st.session_state.fr_symbol
47
+
48
+ def add_symbol_to_wolof():
49
+
50
+ st.session_state.right_sentence += st.session_state.wf_symbol
51
+
52
+ def add_special_token_french():
53
+
54
+ st.session_state.left_sentence += '<mask>'
55
+
56
+ def add_special_token_wolof():
57
+
58
+ st.session_state.right_sentence += '<mask>'
59
+
60
+ def add_word_to_text():
61
+
62
+ word = st.session_state.word.split('/')[0].strip()
63
+
64
+ st.session_state.right_sentence += word
65
+
66
+ def add_french_sentence_to_text():
67
+
68
+ sentence = st.session_state.french_sentence.strip()
69
+
70
+ st.session_state.left_sentence = sentence
71
+
72
+ def add_wolof_sentence_to_text():
73
+
74
+ sentence = st.session_state.wolof_sentence.strip()
75
+
76
+ st.session_state.right_sentence = sentence
77
+
78
+ # let us create a callback which lets us add sentences to the DataFrame
79
+ def add_new_sentences():
80
+
81
+ global sentences
82
+
83
+ sentence_1 = st.session_state.left_sentence.strip()
84
+
85
+ sentence_2 = st.session_state.right_sentence.strip()
86
+
87
+ if sentence_1 == '' or sentence_2 == '':
88
+
89
+ st.warning("You didn't provide a sentence ! Please provide before submitting.", icon= "🚨")
90
+
91
+ else:
92
+
93
+
94
+
95
+ ## save the result
96
+ sentences.to_csv('wolof-translate/wolof_translate/data/sentences/wolof_french.csv', index=False)
97
+
98
+ # recuperate the already saved sentences (for the moment french/wolof)
99
+ sentences = pd.read_csv('wolof-translate/wolof_translate/data/sentences/wolof_french.csv')
100
+
101
+ # add the last position to delete and modify
102
+ st.session_state.line1 = len(sentences) - 1
103
+
104
+ st.session_state.line2 = len(sentences) - 1
105
+
106
+ # let us create a callback which lets us add sentences to the database
107
+ def add_new_sentences2():
108
+
109
+ global sentences
110
+
111
+ sentence_1 = st.session_state.left_sentence.strip()
112
+
113
+ sentence_2 = st.session_state.right_sentence.strip()
114
+
115
+ if sentence_1 == '' or sentence_2 == '':
116
+
117
+ st.warning("You didn't provide a sentence ! Please provide before submitting.", icon= "🚨")
118
+
119
+ else:
120
+
121
+ # insert the new sentences
122
+ db_manager.insert_document({
123
+ 'french': sentence_1,
124
+ 'wolof': sentence_2
125
+ })
126
+
127
+ # recuperate the already saved sentences (for the moment french/wolof)
128
+ sentences = db_manager.load_data_frames()
129
+
130
+ # clean the inputs' contents
131
+ st.session_state.left_sentence = ''
132
+ st.session_state.right_sentence = ''
133
+
134
+ # add the last position to delete and modify
135
+ st.session_state.line1 = len(sentences) - 1
136
+
137
+ st.session_state.line2 = len(sentences) - 1
138
+
139
+ def delete_line():
140
+
141
+ global sentences
142
+
143
+ number = st.session_state.line1
144
+
145
+ if number > len(sentences) - 1:
146
+
147
+ st.warning("The line that you provided does not exist !")
148
+
149
+ else:
150
+
151
+ # delete a document
152
+ db_manager.delete_document(number)
153
+
154
+ # recuperate the already saved sentences (for the moment french/wolof)
155
+ sentences = db_manager.load_data_frames()
156
+
157
+ def modify_line():
158
+
159
+ global sentences
160
+
161
+ number = st.session_state.line2
162
+
163
+ if number > len(sentences) - 1:
164
+
165
+ st.warning("The line that you provided does not exist !")
166
+
167
+ else:
168
+ st.session_state.left_sentence = sentences.loc[number, sentences.columns.tolist()[1]]
169
+
170
+ st.session_state.right_sentence = sentences.loc[number, sentences.columns.tolist()[2]]
171
+
172
+ def update_sentences():
173
+
174
+ global sentences
175
+
176
+ number = st.session_state.line2
177
+
178
+ sentence_1 = st.session_state.left_sentence.strip()
179
+
180
+ sentence_2 = st.session_state.right_sentence.strip()
181
+
182
+ if number > len(sentences) - 1:
183
+
184
+ st.warning("The line that you provided does not exist !")
185
+
186
+ elif sentence_1 == '' or sentence_2 == '':
187
+
188
+ st.warning("You didn't provide a sentence ! Please provide before submitting.", icon= "🚨")
189
+
190
+ else:
191
+
192
+ db_manager.update_document(number,
193
+ {
194
+ 'french': sentence_1,
195
+ 'wolof': sentence_2
196
+ })
197
+
198
+ # recuperate the already saved sentences (for the moment french/wolof)
199
+ sentences = db_manager.load_data_frames()
200
+
201
+ left.header("French")
202
+ left.text_area(sentences.columns.tolist()[1], key = "left_sentence")
203
+
204
+ right.header("Wolof")
205
+ right.text_area(sentences.columns.tolist()[2], key = "right_sentence")
206
+
207
+ fr_symbol = left.selectbox("French characters", key="fr_symbol", options = sp_wolof_chars['wolof_special_chars'])
208
+
209
+ wf_symbol = right.selectbox("Wolof characters", key="wf_symbol", options = sp_wolof_chars['wolof_special_chars'])
210
+
211
+ word = right.selectbox("Wolof words/Definitions", key="word", options = [sp_wolof_words.loc[i, 'wolof']+" / "+sp_wolof_words.loc[i, 'french'] for i in range(sp_wolof_words.shape[0])], on_change=add_word_to_text)
212
+
213
+ left.button("Add symbol", "add fr symbol", on_click=add_symbol_to_french)
214
+
215
+ left.button("Add mask", "french mask", on_click=add_special_token_french)
216
+
217
+ right.button("Add symbol", "add wf symbol", on_click=add_symbol_to_wolof)
218
+
219
+ right.button("Add mask", "wolof mask", on_click=add_special_token_wolof)
220
+
221
+ # add sentences at sidebar
222
+ st.sidebar.selectbox("French sentences", key="french_sentence", options = french_examples['sentences'], on_change=add_french_sentence_to_text)
223
+
224
+ st.sidebar.selectbox("Wolof sentences", key="wolof_sentence", options = wolof_examples['sentences'], on_change=add_wolof_sentence_to_text)
225
+
226
+ # add a submit button to add new sentences
227
+ st.button('Submit', 'submit_button', on_click=add_new_sentences)
228
+ st.button('Place', 'submit_button2', on_click=add_new_sentences2)
229
+ st.button('Update', 'submit_button3', on_click=update_sentences)
230
+
231
+ st.markdown("""---""")
232
+
233
+ # add delete button
234
+ number = st.number_input("Provide line to delete", key="line1", min_value=0)
235
+
236
+ st.button("Delete", 'delete_button', on_click=delete_line)
237
+
238
+ # add modify button
239
+ number = st.number_input("Provide line to modify", key="line2", min_value=0)
240
+
241
+ st.button("Modify", 'modify_button', on_click=modify_line)
242
+
243
+ # add data frame
244
+ st.dataframe(sentences.set_index('_id'), width=900)
245
+
246
+
recuperate_huggingface_data.ipynb CHANGED
@@ -19,7 +19,7 @@
19
  },
20
  {
21
  "cell_type": "code",
22
- "execution_count": 1,
23
  "metadata": {},
24
  "outputs": [],
25
  "source": [
@@ -83,6 +83,14 @@
83
  "- Wolof sentences"
84
  ]
85
  },
 
 
 
 
 
 
 
 
86
  {
87
  "cell_type": "code",
88
  "execution_count": 2,
@@ -149,7 +157,83 @@
149
  "outputs": [],
150
  "source": [
151
  "# save the sentences\n",
152
- "pd.DataFrame({'sentences': wolof_sents}).to_csv('wolof.csv', index=False)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  ]
154
  }
155
  ],
 
19
  },
20
  {
21
  "cell_type": "code",
22
+ "execution_count": 2,
23
  "metadata": {},
24
  "outputs": [],
25
  "source": [
 
83
  "- Wolof sentences"
84
  ]
85
  },
86
+ {
87
+ "attachments": {},
88
+ "cell_type": "markdown",
89
+ "metadata": {},
90
+ "source": [
91
+ "1. First one hundred"
92
+ ]
93
+ },
94
  {
95
  "cell_type": "code",
96
  "execution_count": 2,
 
157
  "outputs": [],
158
  "source": [
159
  "# save the sentences\n",
160
+ "pd.DataFrame({'sentences': wolof_sents}).to_csv('wolof_1.csv', index=False)"
161
+ ]
162
+ },
163
+ {
164
+ "attachments": {},
165
+ "cell_type": "markdown",
166
+ "metadata": {},
167
+ "source": [
168
+ "2. Second one hundred"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": 3,
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "curl = \"https://datasets-server.huggingface.co/rows?dataset=perrynelson%2Fwaxal-wolof&config=perrynelson--waxal-wolof&split=test&offset=100&limit=100\"\n",
178
+ "\n",
179
+ "def query():\n",
180
+ " \n",
181
+ " response = requests.get(curl)\n",
182
+ " return response.json()\n",
183
+ "\n",
184
+ "data = query()"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "markdown",
189
+ "metadata": {},
190
+ "source": [
191
+ "Get the transcriptions in a data frame."
192
+ ]
193
+ },
194
+ {
195
+ "cell_type": "code",
196
+ "execution_count": 4,
197
+ "metadata": {},
198
+ "outputs": [
199
+ {
200
+ "data": {
201
+ "text/plain": [
202
+ "100"
203
+ ]
204
+ },
205
+ "execution_count": 4,
206
+ "metadata": {},
207
+ "output_type": "execute_result"
208
+ }
209
+ ],
210
+ "source": [
211
+ "len(data['rows'])"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 5,
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": [
220
+ "sentences = data['rows']\n",
221
+ "\n",
222
+ "wolof_sents = []\n",
223
+ "\n",
224
+ "for i in range(len(sentences)):\n",
225
+ " \n",
226
+ " wolof_sents.append(sentences[i]['row']['transcription'])"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": 6,
232
+ "metadata": {},
233
+ "outputs": [],
234
+ "source": [
235
+ "# save the sentences\n",
236
+ "pd.DataFrame({'sentences': wolof_sents}).to_csv('wolof_2.csv', index=False)"
237
  ]
238
  }
239
  ],
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  altair<5
2
  transformers
3
  torch
4
- pandas
 
 
1
  altair<5
2
  transformers
3
  torch
4
+ pandas
5
+ wolof-translate
send_and_retrieve_data.ipynb ADDED
@@ -0,0 +1,828 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "MongoDB Cluster management\n",
9
+ "-----------------------\n",
10
+ "\n",
11
+ "Let us create some functions to send documents to (insert) and retrieve collections from our cluster database."
12
+ ]
13
+ },
14
+ {
15
+ "attachments": {},
16
+ "cell_type": "markdown",
17
+ "metadata": {},
18
+ "source": [
19
+ "We must import the pymongo client and server API modules."
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 101,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "from pymongo.mongo_client import MongoClient\n",
29
+ "from pymongo.server_api import ServerApi\n",
30
+ "import pandas as pd"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 102,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "# let us set below the cluster uri\n",
40
+ "uri = \"mongodb+srv://oumar199:Jacksparrow360@woloftranslationcluster.u0gk7.mongodb.net/?retryWrites=true&w=majority\""
41
+ ]
42
+ },
43
+ {
44
+ "attachments": {},
45
+ "cell_type": "markdown",
46
+ "metadata": {},
47
+ "source": [
48
+ "### Inserting new documents"
49
+ ]
50
+ },
51
+ {
52
+ "attachments": {},
53
+ "cell_type": "markdown",
54
+ "metadata": {},
55
+ "source": [
56
+ "Let us insert new documents to our collection."
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 103,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "# we must initialize the client\n",
66
+ "client = MongoClient(uri, server_api = ServerApi('1'))"
67
+ ]
68
+ },
69
+ {
70
+ "attachments": {},
71
+ "cell_type": "markdown",
72
+ "metadata": {},
73
+ "source": [
74
+ "Let us create a new collection."
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 104,
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "# create a database\n",
84
+ "db = client.get_database('WolofTranslation')"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 105,
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "# create a new collection for the new sentences\n",
94
+ "sentences = db.sentences"
95
+ ]
96
+ },
97
+ {
98
+ "attachments": {},
99
+ "cell_type": "markdown",
100
+ "metadata": {},
101
+ "source": [
102
+ "It is time to insert the new documents."
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": 106,
108
+ "metadata": {},
109
+ "outputs": [],
110
+ "source": [
111
+ "# recuperate the already created sentences\n",
112
+ "corpora = pd.read_csv('wolof-translate/wolof_translate/data/sentences/wolof_french.csv')"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": 107,
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": [
121
+ "# let us reset the indices\n",
122
+ "corpora.reset_index(inplace=True)"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": 108,
128
+ "metadata": {},
129
+ "outputs": [
130
+ {
131
+ "data": {
132
+ "text/html": [
133
+ "<div>\n",
134
+ "<style scoped>\n",
135
+ " .dataframe tbody tr th:only-of-type {\n",
136
+ " vertical-align: middle;\n",
137
+ " }\n",
138
+ "\n",
139
+ " .dataframe tbody tr th {\n",
140
+ " vertical-align: top;\n",
141
+ " }\n",
142
+ "\n",
143
+ " .dataframe thead th {\n",
144
+ " text-align: right;\n",
145
+ " }\n",
146
+ "</style>\n",
147
+ "<table border=\"1\" class=\"dataframe\">\n",
148
+ " <thead>\n",
149
+ " <tr style=\"text-align: right;\">\n",
150
+ " <th></th>\n",
151
+ " <th>index</th>\n",
152
+ " <th>french</th>\n",
153
+ " <th>wolof</th>\n",
154
+ " </tr>\n",
155
+ " </thead>\n",
156
+ " <tbody>\n",
157
+ " <tr>\n",
158
+ " <th>0</th>\n",
159
+ " <td>0</td>\n",
160
+ " <td>J'arrive tout de suite chez toi.</td>\n",
161
+ " <td>Léegui léegui ma egg sa kër.</td>\n",
162
+ " </tr>\n",
163
+ " <tr>\n",
164
+ " <th>1</th>\n",
165
+ " <td>1</td>\n",
166
+ " <td>J'en suis sûr, cette photo ci c'est la photo p...</td>\n",
167
+ " <td>Waaw nataal bii nataal la boob ay nit ñu baree...</td>\n",
168
+ " </tr>\n",
169
+ " <tr>\n",
170
+ " <th>2</th>\n",
171
+ " <td>2</td>\n",
172
+ " <td>Je vois devant moi une photo sur laquelle beau...</td>\n",
173
+ " <td>Nataal bii maa ngi ciy janloog haa ay nit yu b...</td>\n",
174
+ " </tr>\n",
175
+ " <tr>\n",
176
+ " <th>3</th>\n",
177
+ " <td>3</td>\n",
178
+ " <td>Ceux-ci sont des personnes qui sont sortis pou...</td>\n",
179
+ " <td>Lii, ay nit lañu yu génn di ñaxtu. Jëm yi nag ...</td>\n",
180
+ " </tr>\n",
181
+ " <tr>\n",
182
+ " <th>4</th>\n",
183
+ " <td>4</td>\n",
184
+ " <td>Salut ! Ceux-là qui ressemblent à des personne...</td>\n",
185
+ " <td>Salaawaalekum ! Ñii de, mel nañ ne, ay nit ñu ...</td>\n",
186
+ " </tr>\n",
187
+ " <tr>\n",
188
+ " <th>5</th>\n",
189
+ " <td>5</td>\n",
190
+ " <td>Cette photo ci c'est une photo sur laquelle je...</td>\n",
191
+ " <td>Nataal bi ab nataal la boo xamante yni maa ngi...</td>\n",
192
+ " </tr>\n",
193
+ " <tr>\n",
194
+ " <th>6</th>\n",
195
+ " <td>6</td>\n",
196
+ " <td>Sur la photo, ont voit des personnes qui se ré...</td>\n",
197
+ " <td>Nataal bii ñoo ngi ciy gis ay nit ñuy ñaxtu wa...</td>\n",
198
+ " </tr>\n",
199
+ " <tr>\n",
200
+ " <th>7</th>\n",
201
+ " <td>7</td>\n",
202
+ " <td>On voit sur la photo beaucoup de personnes sor...</td>\n",
203
+ " <td>Ñu gis ci nataal bi ay nit ñu bari ñu génn ci ...</td>\n",
204
+ " </tr>\n",
205
+ " <tr>\n",
206
+ " <th>8</th>\n",
207
+ " <td>8</td>\n",
208
+ " <td>C'est des poissons, oui. Ils sont de couleur b...</td>\n",
209
+ " <td>Jën la waaw, Wu am wirgo Wu baxa ak Wu xonq.</td>\n",
210
+ " </tr>\n",
211
+ " <tr>\n",
212
+ " <th>9</th>\n",
213
+ " <td>9</td>\n",
214
+ " <td>Ah sur cette photo ci cependant, il y a un poi...</td>\n",
215
+ " <td>Aah nataal bii nag, aw jën la. Jën wi mi ngi a...</td>\n",
216
+ " </tr>\n",
217
+ " </tbody>\n",
218
+ "</table>\n",
219
+ "</div>"
220
+ ],
221
+ "text/plain": [
222
+ " index french \\\n",
223
+ "0 0 J'arrive tout de suite chez toi. \n",
224
+ "1 1 J'en suis sûr, cette photo ci c'est la photo p... \n",
225
+ "2 2 Je vois devant moi une photo sur laquelle beau... \n",
226
+ "3 3 Ceux-ci sont des personnes qui sont sortis pou... \n",
227
+ "4 4 Salut ! Ceux-là qui ressemblent à des personne... \n",
228
+ "5 5 Cette photo ci c'est une photo sur laquelle je... \n",
229
+ "6 6 Sur la photo, ont voit des personnes qui se ré... \n",
230
+ "7 7 On voit sur la photo beaucoup de personnes sor... \n",
231
+ "8 8 C'est des poissons, oui. Ils sont de couleur b... \n",
232
+ "9 9 Ah sur cette photo ci cependant, il y a un poi... \n",
233
+ "\n",
234
+ " wolof \n",
235
+ "0 Léegui léegui ma egg sa kër. \n",
236
+ "1 Waaw nataal bii nataal la boob ay nit ñu baree... \n",
237
+ "2 Nataal bii maa ngi ciy janloog haa ay nit yu b... \n",
238
+ "3 Lii, ay nit lañu yu génn di ñaxtu. Jëm yi nag ... \n",
239
+ "4 Salaawaalekum ! Ñii de, mel nañ ne, ay nit ñu ... \n",
240
+ "5 Nataal bi ab nataal la boo xamante yni maa ngi... \n",
241
+ "6 Nataal bii ñoo ngi ciy gis ay nit ñuy ñaxtu wa... \n",
242
+ "7 Ñu gis ci nataal bi ay nit ñu bari ñu génn ci ... \n",
243
+ "8 Jën la waaw, Wu am wirgo Wu baxa ak Wu xonq. \n",
244
+ "9 Aah nataal bii nag, aw jën la. Jën wi mi ngi a... "
245
+ ]
246
+ },
247
+ "execution_count": 108,
248
+ "metadata": {},
249
+ "output_type": "execute_result"
250
+ }
251
+ ],
252
+ "source": [
253
+ "corpora.head(10)"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "code",
258
+ "execution_count": 109,
259
+ "metadata": {},
260
+ "outputs": [],
261
+ "source": [
262
+ "# insert the sentences\n",
263
+ "results = sentences.insert_many({\n",
264
+ " '_id': corp, # set the id\n",
265
+ " 'french': corpora.loc[corp, 'french'],\n",
266
+ " 'wolof': corpora.loc[corp, 'wolof']\n",
267
+ " } for corp in corpora.index\n",
268
+ ")"
269
+ ]
270
+ },
271
+ {
272
+ "attachments": {},
273
+ "cell_type": "markdown",
274
+ "metadata": {},
275
+ "source": [
276
+ "Let us insert the deleted sentences."
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "code",
281
+ "execution_count": 110,
282
+ "metadata": {},
283
+ "outputs": [],
284
+ "source": [
285
+ "# create a new collection named deleted.\n",
286
+ "deleted = db.deleted"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "execution_count": 111,
292
+ "metadata": {},
293
+ "outputs": [],
294
+ "source": [
295
+ "# retrieve the data frame of deleted sentences\n",
296
+ "del_corpora = pd.read_csv('wolof-translate/wolof_translate/data/sentences/deleted_lines.csv')"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "code",
301
+ "execution_count": 112,
302
+ "metadata": {},
303
+ "outputs": [],
304
+ "source": [
305
+ "# reset the indices\n",
306
+ "del_corpora.reset_index(inplace=True)"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": 113,
312
+ "metadata": {},
313
+ "outputs": [],
314
+ "source": [
315
+ "# insert the deleted sentences\n",
316
+ "results = deleted.insert_many({\n",
317
+ " '_id': corp, # set the id\n",
318
+ " 'french': corpora.loc[corp, 'french'],\n",
319
+ " 'wolof': corpora.loc[corp, 'wolof']\n",
320
+ " } for corp in del_corpora.index\n",
321
+ ")"
322
+ ]
323
+ },
324
+ {
325
+ "attachments": {},
326
+ "cell_type": "markdown",
327
+ "metadata": {},
328
+ "source": [
329
+ "### Modify sentences"
330
+ ]
331
+ },
332
+ {
333
+ "attachments": {},
334
+ "cell_type": "markdown",
335
+ "metadata": {},
336
+ "source": [
337
+ "We want to modify only one sentence at a time."
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": 114,
343
+ "metadata": {},
344
+ "outputs": [],
345
+ "source": [
346
+ "# select the id to modify\n",
347
+ "id_ = 1\n",
348
+ "\n",
349
+ "# retrieve new sentences\n",
350
+ "french = corpora.loc[id_, 'french']\n",
351
+ "wolof = corpora.loc[id_, 'wolof']"
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": 115,
357
+ "metadata": {},
358
+ "outputs": [
359
+ {
360
+ "data": {
361
+ "text/plain": [
362
+ "(\"J'en suis sûr, cette photo ci c'est la photo pris au moment où plusieurs personnes font une marche de révolte tendant leurs mains. Ceux là sont assis, ceux là sont debout entrain de marcher. On a écrit sur la photo quelque chose de bleu concernant la Casamance.\",\n",
363
+ " 'Waaw nataal bii nataal la boob ay nit ñu baree bari ñoo xam ni dañuy doxub ñaxtu ñoo ci nekk tàllal seen i loxo. Ñee sukku ñeel taxaw jodd di dox. Ñu bind ci kaw nataal bi lu xaw a baxa la bind ci laa kaasamãs.')"
364
+ ]
365
+ },
366
+ "execution_count": 115,
367
+ "metadata": {},
368
+ "output_type": "execute_result"
369
+ }
370
+ ],
371
+ "source": [
372
+ "# print the sentences\n",
373
+ "french, wolof"
374
+ ]
375
+ },
376
+ {
377
+ "cell_type": "code",
378
+ "execution_count": 118,
379
+ "metadata": {},
380
+ "outputs": [],
381
+ "source": [
382
+ "# modify the sentences at the id\n",
383
+ "results = sentences.update_one(\n",
384
+ " {\n",
385
+ " '_id': {'$eq': id_}\n",
386
+ " },\n",
387
+ " {\n",
388
+ " '$set': {\n",
389
+ " 'french': french,\n",
390
+ " 'wolof': wolof + \"--------\" # we added a modification\n",
391
+ " }\n",
392
+ " }\n",
393
+ ")"
394
+ ]
395
+ },
396
+ {
397
+ "attachments": {},
398
+ "cell_type": "markdown",
399
+ "metadata": {},
400
+ "source": [
401
+ "Let us show the first documents."
402
+ ]
403
+ },
404
+ {
405
+ "cell_type": "code",
406
+ "execution_count": 119,
407
+ "metadata": {},
408
+ "outputs": [
409
+ {
410
+ "data": {
411
+ "text/html": [
412
+ "<div>\n",
413
+ "<style scoped>\n",
414
+ " .dataframe tbody tr th:only-of-type {\n",
415
+ " vertical-align: middle;\n",
416
+ " }\n",
417
+ "\n",
418
+ " .dataframe tbody tr th {\n",
419
+ " vertical-align: top;\n",
420
+ " }\n",
421
+ "\n",
422
+ " .dataframe thead th {\n",
423
+ " text-align: right;\n",
424
+ " }\n",
425
+ "</style>\n",
426
+ "<table border=\"1\" class=\"dataframe\">\n",
427
+ " <thead>\n",
428
+ " <tr style=\"text-align: right;\">\n",
429
+ " <th></th>\n",
430
+ " <th>_id</th>\n",
431
+ " <th>french</th>\n",
432
+ " <th>wolof</th>\n",
433
+ " </tr>\n",
434
+ " </thead>\n",
435
+ " <tbody>\n",
436
+ " <tr>\n",
437
+ " <th>0</th>\n",
438
+ " <td>0</td>\n",
439
+ " <td>J'arrive tout de suite chez toi.</td>\n",
440
+ " <td>Léegui léegui ma egg sa kër.</td>\n",
441
+ " </tr>\n",
442
+ " <tr>\n",
443
+ " <th>1</th>\n",
444
+ " <td>1</td>\n",
445
+ " <td>J'en suis sûr, cette photo ci c'est la photo p...</td>\n",
446
+ " <td>Waaw nataal bii nataal la boob ay nit ñu baree...</td>\n",
447
+ " </tr>\n",
448
+ " <tr>\n",
449
+ " <th>2</th>\n",
450
+ " <td>2</td>\n",
451
+ " <td>Je vois devant moi une photo sur laquelle beau...</td>\n",
452
+ " <td>Nataal bii maa ngi ciy janloog haa ay nit yu b...</td>\n",
453
+ " </tr>\n",
454
+ " <tr>\n",
455
+ " <th>3</th>\n",
456
+ " <td>3</td>\n",
457
+ " <td>Ceux-ci sont des personnes qui sont sortis pou...</td>\n",
458
+ " <td>Lii, ay nit lañu yu génn di ñaxtu. Jëm yi nag ...</td>\n",
459
+ " </tr>\n",
460
+ " <tr>\n",
461
+ " <th>4</th>\n",
462
+ " <td>4</td>\n",
463
+ " <td>Salut ! Ceux-là qui ressemblent à des personne...</td>\n",
464
+ " <td>Salaawaalekum ! Ñii de, mel nañ ne, ay nit ñu ...</td>\n",
465
+ " </tr>\n",
466
+ " <tr>\n",
467
+ " <th>5</th>\n",
468
+ " <td>5</td>\n",
469
+ " <td>Cette photo ci c'est une photo sur laquelle je...</td>\n",
470
+ " <td>Nataal bi ab nataal la boo xamante yni maa ngi...</td>\n",
471
+ " </tr>\n",
472
+ " <tr>\n",
473
+ " <th>6</th>\n",
474
+ " <td>6</td>\n",
475
+ " <td>Sur la photo, ont voit des personnes qui se ré...</td>\n",
476
+ " <td>Nataal bii ñoo ngi ciy gis ay nit ñuy ñaxtu wa...</td>\n",
477
+ " </tr>\n",
478
+ " <tr>\n",
479
+ " <th>7</th>\n",
480
+ " <td>7</td>\n",
481
+ " <td>On voit sur la photo beaucoup de personnes sor...</td>\n",
482
+ " <td>Ñu gis ci nataal bi ay nit ñu bari ñu génn ci ...</td>\n",
483
+ " </tr>\n",
484
+ " <tr>\n",
485
+ " <th>8</th>\n",
486
+ " <td>8</td>\n",
487
+ " <td>C'est des poissons, oui. Ils sont de couleur b...</td>\n",
488
+ " <td>Jën la waaw, Wu am wirgo Wu baxa ak Wu xonq.</td>\n",
489
+ " </tr>\n",
490
+ " <tr>\n",
491
+ " <th>9</th>\n",
492
+ " <td>9</td>\n",
493
+ " <td>Ah sur cette photo ci cependant, il y a un poi...</td>\n",
494
+ " <td>Aah nataal bii nag, aw jën la. Jën wi mi ngi a...</td>\n",
495
+ " </tr>\n",
496
+ " </tbody>\n",
497
+ "</table>\n",
498
+ "</div>"
499
+ ],
500
+ "text/plain": [
501
+ " _id french \\\n",
502
+ "0 0 J'arrive tout de suite chez toi. \n",
503
+ "1 1 J'en suis sûr, cette photo ci c'est la photo p... \n",
504
+ "2 2 Je vois devant moi une photo sur laquelle beau... \n",
505
+ "3 3 Ceux-ci sont des personnes qui sont sortis pou... \n",
506
+ "4 4 Salut ! Ceux-là qui ressemblent à des personne... \n",
507
+ "5 5 Cette photo ci c'est une photo sur laquelle je... \n",
508
+ "6 6 Sur la photo, ont voit des personnes qui se ré... \n",
509
+ "7 7 On voit sur la photo beaucoup de personnes sor... \n",
510
+ "8 8 C'est des poissons, oui. Ils sont de couleur b... \n",
511
+ "9 9 Ah sur cette photo ci cependant, il y a un poi... \n",
512
+ "\n",
513
+ " wolof \n",
514
+ "0 Léegui léegui ma egg sa kër. \n",
515
+ "1 Waaw nataal bii nataal la boob ay nit ñu baree... \n",
516
+ "2 Nataal bii maa ngi ciy janloog haa ay nit yu b... \n",
517
+ "3 Lii, ay nit lañu yu génn di ñaxtu. Jëm yi nag ... \n",
518
+ "4 Salaawaalekum ! Ñii de, mel nañ ne, ay nit ñu ... \n",
519
+ "5 Nataal bi ab nataal la boo xamante yni maa ngi... \n",
520
+ "6 Nataal bii ñoo ngi ciy gis ay nit ñuy ñaxtu wa... \n",
521
+ "7 Ñu gis ci nataal bi ay nit ñu bari ñu génn ci ... \n",
522
+ "8 Jën la waaw, Wu am wirgo Wu baxa ak Wu xonq. \n",
523
+ "9 Aah nataal bii nag, aw jën la. Jën wi mi ngi a... "
524
+ ]
525
+ },
526
+ "execution_count": 119,
527
+ "metadata": {},
528
+ "output_type": "execute_result"
529
+ }
530
+ ],
531
+ "source": [
532
+ "# get the 10 first sentences into a Data Frame\n",
533
+ "pd.DataFrame(list(sentences.find().limit(10)))"
534
+ ]
535
+ },
536
+ {
537
+ "attachments": {},
538
+ "cell_type": "markdown",
539
+ "metadata": {},
540
+ "source": [
541
+ "### Delete sentences"
542
+ ]
543
+ },
544
+ {
545
+ "attachments": {},
546
+ "cell_type": "markdown",
547
+ "metadata": {},
548
+ "source": [
549
+ "We want to delete only one sentence at a time. The deleted sentences must be added into the 'deleted' collection. "
550
+ ]
551
+ },
552
+ {
553
+ "cell_type": "code",
554
+ "execution_count": null,
555
+ "metadata": {},
556
+ "outputs": [],
557
+ "source": [
558
+ "# recuperate the sentences to delete (id = 0)\n",
559
+ "id_ = 0\n",
560
+ "\n",
561
+ "del_sent = sentences.find_one(\n",
562
+ " {\n",
563
+ " '_id': {'$eq': id_}\n",
564
+ " } \n",
565
+ ")\n",
566
+ "\n",
567
+ "# delete the sentence and add it into the deleted sentences\n",
568
+ "sentences.delete_one(\n",
569
+ " {\n",
570
+ " '_id': {'$eq': del_sent['_id']}\n",
571
+ " }\n",
572
+ ")\n",
573
+ "\n",
574
+ "results = deleted.insert_one(\n",
575
+ " {\n",
576
+ " '_id': len(list(deleted.find())),\n",
577
+ " 'french': del_sent['french'],\n",
578
+ " 'wolof': del_sent['wolof']\n",
579
+ " }\n",
580
+ ")"
581
+ ]
582
+ },
583
+ {
584
+ "attachments": {},
585
+ "cell_type": "markdown",
586
+ "metadata": {},
587
+ "source": [
588
+ "### From collection to DataFrame"
589
+ ]
590
+ },
591
+ {
592
+ "attachments": {},
593
+ "cell_type": "markdown",
594
+ "metadata": {},
595
+ "source": [
596
+ "We must convert the sentences to CSV files in order to use them at the training step."
597
+ ]
598
+ },
599
+ {
600
+ "cell_type": "code",
601
+ "execution_count": 123,
602
+ "metadata": {},
603
+ "outputs": [],
604
+ "source": [
605
+ "# recuperate the new corpora\n",
606
+ "new_corpora = pd.DataFrame(list(sentences.find()))\n",
607
+ "\n",
608
+ "# recuperate the deleted sentences as a Data Frame\n",
609
+ "deleted_df = pd.DataFrame(list(deleted.find()))\n",
610
+ "\n",
611
+ "# save the data frames as csv files\n",
612
+ "new_corpora.set_index('_id', inplace=True)\n",
613
+ "\n",
614
+ "deleted_df.set_index('_id', inplace=True)\n",
615
+ "\n",
616
+ "new_corpora.to_csv('wolof-translate/wolof_translate/data/sentences/wolof_french.csv', index=False)\n",
617
+ "\n",
618
+ "deleted_df.to_csv('wolof-translate/wolof_translate/data/sentences/deleted_lines.csv', index=False)"
619
+ ]
620
+ },
621
+ {
622
+ "attachments": {},
623
+ "cell_type": "markdown",
624
+ "metadata": {},
625
+ "source": [
626
+ "### All in one"
627
+ ]
628
+ },
629
+ {
630
+ "attachments": {},
631
+ "cell_type": "markdown",
632
+ "metadata": {},
633
+ "source": [
634
+ "Let us create a class that gathers all of the methods we investigated previously."
635
+ ]
636
+ },
637
+ {
638
+ "cell_type": "code",
639
+ "execution_count": 129,
640
+ "metadata": {},
641
+ "outputs": [
642
+ {
643
+ "data": {
644
+ "text/plain": [
645
+ "{'_id': 150,\n",
646
+ " 'french': \"Sur la photo que vous m'avez envoyée, j'ai vu qu'il s'agissait de gendarmes. Des gendarmes qui portent, cependant, des... des boucliers. Des verres conçus pour les protéger. Ils sont faces au peuple s'échangeant contre eux des cailloux et des pierres.\",\n",
647
+ " 'wolof': 'Nataal bi ngeen ma yonnee, gis naa ni ay takk-der la. Takk der yoo xamantane bii nag, jël nañ loo xamantane bii mooy ay,... Ay baar. Ay verre yoo xam ne dañ kaa defar pour ñu leen di baare, ñu jàkkarlook askan wi di sànnanteek ñoom ay xeer ak ay doj.'}"
648
+ ]
649
+ },
650
+ "execution_count": 129,
651
+ "metadata": {},
652
+ "output_type": "execute_result"
653
+ }
654
+ ],
655
+ "source": [
656
+ "list(sentences.find().sort('_id', -1).limit(1))[0]"
657
+ ]
658
+ },
659
+ {
660
+ "cell_type": "code",
661
+ "execution_count": 132,
662
+ "metadata": {},
663
+ "outputs": [
664
+ {
665
+ "name": "stdout",
666
+ "output_type": "stream",
667
+ "text": [
668
+ "Overwriting wolof-translate/wolof_translate/utils/database_manager.py\n"
669
+ ]
670
+ }
671
+ ],
672
+ "source": [
673
+ "# %%writefile wolof-translate/wolof_translate/utils/database_manager.py\n",
674
+ "from pymongo.mongo_client import MongoClient\n",
675
+ "from pymongo.server_api import ServerApi\n",
676
+ "import pandas as pd\n",
677
+ "\n",
678
+ "class TranslationMongoDBManager:\n",
679
+ " \n",
680
+ " def __init__(self, uri: str, database: str):\n",
681
+ " \n",
682
+ " # recuperate the client\n",
683
+ " self.client = MongoClient(uri)\n",
684
+ " \n",
685
+ " # recuperate the database\n",
686
+ " self.db = self.client.get_database(database)\n",
687
+ " \n",
688
+ " def insert_documents(self, documents: list, collection: str = \"sentences\"):\n",
689
+ " \n",
690
+ " # insert documents inside a collection\n",
691
+ " results = self.db[collection].insert_many(documents)\n",
692
+ " \n",
693
+ " return results\n",
694
+ " \n",
695
+ " def insert_document(self, document: dict, collection: str = \"sentences\"):\n",
696
+ " \n",
697
+ " assert '_id' in document\n",
698
+ " \n",
699
+ " # get the id of the last sentence (recuperate the max id and add 1 to it)\n",
700
+ " max_id = self.get_max_id(collection)\n",
701
+ " \n",
702
+ " # add the new sentences\n",
703
+ " document['_id'] = max_id\n",
704
+ " \n",
705
+ " results = self.db[collection].insert_one(\n",
706
+ " document\n",
707
+ " )\n",
708
+ " \n",
709
+ " return results\n",
710
+ " \n",
711
+ " def update_document(self, id: int, document: dict, collection: str = \"sentences\", update_collection: str = \"updated\"):\n",
712
+ " \n",
713
+ " # recuperate the document to update\n",
714
+ " upd_sent = self.db[collection].find_one(\n",
715
+ " {\n",
716
+ " '_id': {\n",
717
+ " '$eq': id\n",
718
+ " }\n",
719
+ " }\n",
720
+ " )\n",
721
+ " \n",
722
+ " # update the document\n",
723
+ " results = self.db[collection].update_one(\n",
724
+ " {\n",
725
+ " '_id': {\n",
726
+ " '$eq': id\n",
727
+ " }\n",
728
+ " },\n",
729
+ " {\n",
730
+ " '$set': document\n",
731
+ " }\n",
732
+ " )\n",
733
+ " \n",
734
+ " # add the original sentence to the updated sentences\n",
735
+ " upd_sent['_id'] = len(list(self.db[update_collection].find()))\n",
736
+ " \n",
737
+ " results = self.db[update_collection].insert_one(\n",
738
+ " upd_sent\n",
739
+ " )\n",
740
+ " \n",
741
+ " return results\n",
742
+ " \n",
743
+ " def delete_document(self, id: int, collection: str = \"sentences\", del_collection: str = \"deleted\"):\n",
744
+ " \n",
745
+ " # recuperate the document to delete\n",
746
+ " del_sent = self.db[collection].find_one(\n",
747
+ " {\n",
748
+ " '_id': {\n",
749
+ " '$eq': id\n",
750
+ " }\n",
751
+ " }\n",
752
+ " )\n",
753
+ " \n",
754
+ " # delete the sentence\n",
755
+ " self.db[collection].delete_one(\n",
756
+ " {\n",
757
+ " '_id': {'$eq': del_sent['_id']}\n",
758
+ " }\n",
759
+ " )\n",
760
+ " \n",
761
+ " # add the sentences to the deleted sentences\n",
762
+ " del_sent['_id'] = len(list(self.db[del_collection].find()))\n",
763
+ " \n",
764
+ " results = self.db[del_collection].insert_one(\n",
765
+ " del_sent\n",
766
+ " )\n",
767
+ " \n",
768
+ " return results\n",
769
+ " \n",
770
+ " def get_max_id(self, collection):\n",
771
+ " \n",
772
+ " # recuperate the maximum id\n",
773
+ " id = list(self.db[collection].find().sort('_id', -1).limit(1))[0]['_id']\n",
774
+ " \n",
775
+ " return id\n",
776
+ " \n",
777
+ " def save_data_frames(self, sentences_path: str, deleted_path: str, collection: str = \"sentences\", del_collection: str = \"deleted\"):\n",
778
+ " \n",
779
+ " # recuperate the new corpora\n",
780
+ " new_corpora = pd.DataFrame(list(self.db[collection].find()))\n",
781
+ "\n",
782
+ " # recuperate the deleted sentences as a Data Frame\n",
783
+ " deleted_df = pd.DataFrame(list(self.db[del_collection].find()))\n",
784
+ "\n",
785
+ " # save the data frames as csv files\n",
786
+ " new_corpora.set_index('_id', inplace=True)\n",
787
+ "\n",
788
+ " deleted_df.set_index('_id', inplace=True)\n",
789
+ "\n",
790
+ " new_corpora.to_csv(sentences_path, index=False)\n",
791
+ "\n",
792
+ " deleted_df.to_csv(deleted_path, index=False)\n",
793
+ " \n",
794
+ " def load_data_frames(self, collection: str = \"sentences\", del_collection: str = \"deleted\"):\n",
795
+ " \n",
796
+ " # recuperate the new corpora\n",
797
+ " new_corpora = pd.DataFrame(list(self.db[collection].find()))\n",
798
+ "\n",
799
+ " # recuperate the deleted sentences as a Data Frame\n",
800
+ " deleted_df = pd.DataFrame(list(self.db[del_collection].find()))\n",
801
+ " \n",
802
+ " return new_corpora, deleted_df"
803
+ ]
804
+ }
805
+ ],
806
+ "metadata": {
807
+ "kernelspec": {
808
+ "display_name": "pytorch1-HleOW5am-py3.10",
809
+ "language": "python",
810
+ "name": "python3"
811
+ },
812
+ "language_info": {
813
+ "codemirror_mode": {
814
+ "name": "ipython",
815
+ "version": 3
816
+ },
817
+ "file_extension": ".py",
818
+ "mimetype": "text/x-python",
819
+ "name": "python",
820
+ "nbconvert_exporter": "python",
821
+ "pygments_lexer": "ipython3",
822
+ "version": "3.10.8"
823
+ },
824
+ "orig_nbformat": 4
825
+ },
826
+ "nbformat": 4,
827
+ "nbformat_minor": 2
828
+ }
upper_case_new_sent.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Capitalize the first letter of every new sentence in the parallel corpus.

Reads the French/Wolof sentence pairs, upper-cases the character that starts a
new sentence inside each entry, and overwrites the CSV file in place.
"""
import pandas as pd

# corpus that this script reads and then rewrites in place
CORPUS_PATH = "wolof-translate/wolof_translate/data/sentences/wolof_french.csv"

# punctuation marks that close a sentence
SENTENCE_TERMINATORS = ('?', '!', '.')


def capitalize_sentence_starts(sent):
    """Return *sent* with the first character of each new sentence upper-cased.

    A character is upper-cased when it directly follows a terminator
    ('?', '!' or '.') or when it follows a terminator plus one whitespace
    character (the usual ". x" case). The first two characters are never
    modified, exactly as before.

    The previous inline loop tested
    ``letters[-1] in T or letters[-2] in T and l.isupper()``; because ``and``
    binds tighter than ``or`` and ``.upper()`` on an already upper-case letter
    is a no-op, the letter after ". " was never capitalized.

    Non-string cells (e.g. NaN from an empty CSV field) and entries shorter
    than three characters are returned unchanged instead of raising.
    """
    if not isinstance(sent, str) or len(sent) < 3:
        return sent

    letters = [sent[0], sent[1]]
    for pos in range(2, len(sent)):
        char = sent[pos]
        prev = sent[pos - 1]
        before_prev = sent[pos - 2]
        starts_sentence = prev in SENTENCE_TERMINATORS or (
            before_prev in SENTENCE_TERMINATORS and prev.isspace()
        )
        letters.append(char.upper() if starts_sentence else char)

    return "".join(letters)


def main():
    """Rewrite the corpus file with sentence starts capitalized."""
    sentences = pd.read_csv(CORPUS_PATH)

    # only the two text columns are kept, as in the original script
    new_sents = pd.DataFrame({
        'french': [capitalize_sentence_starts(s) for s in sentences['french']],
        'wolof': [capitalize_sentence_starts(s) for s in sentences['wolof']],
    })

    new_sents.to_csv(CORPUS_PATH, index=False)


if __name__ == "__main__":
    main()
wolof-translate/wolof_translate/checkpoints/t5_base_custom_train_results_fw_v3/best_checkpoints.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c27c98fb26335a018a9e39fa61efe4b16e356439d266fee60f41e7c30ac9e95
3
+ size 801021373
wolof-translate/wolof_translate/checkpoints/t5_small_custom_train_results_wf_v3/best_checkpoints.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0427e418e75d5842f8b95ebe9025e91d6e16dd79ab7d6f5815320e239e8b350f
3
- size 180980359
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e543a1882f682a4fa84f6c6ff4d21e8afceb0cbee401bd934c7338f7515320f
3
+ size 180980103
wolof-translate/wolof_translate/data/sentences/deleted_lines.csv CHANGED
@@ -1,2 +1,2 @@
1
  french,wolof
2
- J'ai vu quelque chose qu'on achète au marché ...,Gis naa benn affaire bu ñuy ndugge marché ...
 
1
  french,wolof
2
+ J'arrive tout de suite chez toi.,Léegui léegui ma egg sa kër.
wolof-translate/wolof_translate/data/sentences/wolof_2.csv ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sentences
2
+ "nataal bii ngeen ma yónnee de, gis naa ci biir nataal bi ab buntu bu rafet ñu liggéeyee ko dénk ak weer. am na ca kaw bunt ba limiyeer buy leer"
3
+ "Nataal bii ñu ma yónnee gis naa ci ab kaadar def ay dàll, dàll yoo xam ne ay dàlli góor la, dàlli fermé yi tamit mi ngi ci."
4
+ "Lii ay bunt la yoo xam ne ""menuisier métallique"" yi ñoo koy defar. Bunt yi nag dañ leen defar ba wéer leen daldi koy ""peinture"" ñu ame wirgo wu dóomu-taal. Mu mel na nag bunt bi ñaari bunt la nga xam ne kon day mel ne bunt saal lay doon."
5
+ "Waaw nataal bii de ay bunt yu dóomu-taal moo ci nekk. Bunt yi ëe dafa dóoomu-taal waawaw! Am na ay lu mel ni ay fu ngelaw di jaar boo ko xëccee waaw! Mën nga, mën nga ko xëcc, bunt yi waaw. Du benn bunt nag."
6
+ Gis naa fi ab nataal bo xam ne ab lal la. Boo xam ne daa sokolaa am ñaari tànk. Am na wéeruwaay tamit. Loolu laa fi gis.
7
+ Nataal bii ngeen ma yónnee de gis naa ci biir nataal bi ay toogu yu am tànki weñ te toogu ya daa ñuul.
8
+ "Sama nataal bii ñu ma yónnee dafa mel ni ab sées la, sées yi ñuy tooge. Waaye nag couleur bi moom dafa bula couleur bu bula la."
9
+ "Nataal bii maa ngi ciy janook ""chaise"" bu nu liggéeye niki lu xaw a baxa. Moom toogukaay bii. Mel na ne yit ëe toogukaay la bu nu ràbbale ba fatt. Moom laay janool ci nataal bii ma janool."
10
+ "Man de Li ma gis mooy benn lakku doj la. Yaakaar naa aw xeer ñu wërële ko ay, ay kii ay ""ciment"" ak suuf ak ndox, waawaaw!"
11
+ "Waaw nataal bi de, ay xeer la yoo xam ni dañ dajaloo. Xeer yi nag Daf am am ay pax-pax ci biir, am ay cat-cat. Dafa xaw a marõ,xaw a tor lu mel ne kulëeru jën."
12
+ "Xale yuy fo la, yor seeni fowukaay. Xale yi dañuy fo, yor seeni fowukaay. Ay xale yuy fo la."
13
+ "Nataal bii ngeen ma yónnee de gis naa ci biir nataal bi jën yu mag, yuñ dagg ay dagg teg ko ci fenn."
14
+ "Lii nag nataal la, nataalu gaas bu mag. Gaas bi gaas bu toggu la bu orãs am pupu ruus ci kaw ñu bind ci bind bu weex Total."
15
+ "Nataal bii, maa ngi ciy janook ab gaas. Gaas boo xamantane bii ñoo ngi bind ci biir gaas bi Totaal. Gis naa ni gaas bu bees la bu nga xamantane bii masuñu koo jëfandikoo ndax benn kubéer bu ñuul bu nekk ci kaw."
16
+ "Waa nataal bii nag moom ab furno laa siy gis ñu teg cin ci kaw, ak ab baŋ ci wet gi, ak ak nit ku taxaw, ak ñaari nit ñu taxaw, ak ay baŋ."
17
+ "Waaw nataal bii de ngeen ma yónnee de nuru na lool furne ban, furne ban. Furne boo xam ne dañ koo defar ci ban, waaw moom laa gis ci nataal bi."
18
+ Nataal bii ngeen ma yónnee gis naa ci biir nataal bi àndub ban bu am ay bën-bën ñu tabaxe ko ban.
19
+ "Waaw ñi ngi siyaare bu baax, nataal bi maa ngi ci gis ay ay këll ak layu, ay këll ak layu yu rafet ak ñoom seen."
20
+ "Li nga ma yónnee ba tey, li ma ci gis ay jëfandikukaay la, baraada la ak mësin pour tàngalukaay ak xellikaay la."
21
+ Nataal bii ñaari tëddukaay la ñaari lal la. Benn ba teg nañ ca njegenaay lal ca darab. Benn bi ci suuf teguñu ci njegenaay kenn laluñ ci dara.
22
+ "Waaw lii lañu naan beerso. Dénk lanu ko defaree, dénk. Benn beerso bi nag ci biir lu mel ne xeetu darab bu weex moo ci nekk am benn njegeñaay. Beneen bi nag danu cee lal lu mel ne lu sokolaa ; darab bu sokolaa."
23
+ "Nataal bii de nataal la boo xam ne verre la, verre bi nag mi ngi nekk weer weer yoo xam ne dañoo tegalante, verre yi nag benn bokku ceek moroomam waaw.Verre bi am na affaire bu ko tée boo xam ne mi ngi ci suuf, waaw affaire bu ko tée."
24
+ "Waaw lii nii ab nataal bob mel na ni cemptang la boo xam ne dees koo defar, def ci ay weer. Cemptang gi itam mel na ni gis naa ci lu mel ni ki wañ... Woo xam ni dañ koo peentiir peentiir gu weex, dafal ko lu mel ni ki ak toog goo xam ni moo koy téye. Muy ñaari cemptang."
25
+ "Nataalu juróom benn fukk ak juróom ñeent day wone mburu mu tëdd mel ne mburu mi lakk bi ñor na waaye ñorul lool. Ndax am na wuute genn wàll gi dafa ñor bay bëgg a xëm, geneen wàll gi ñor rekk."
26
+ "Waaw lii nag ay néegi ñax la, néegi ñax bi ci kaw dañ kaa defar ko bam taaru lool. Bi ci des tamit ñu defar ko jël ab sars wërale ko ci kaw wutal kob buntu."
27
+ "Nataal bii ngeen ma yónnee, gis naa ci biir nataal bi basaŋ yuñ ràbb, tegele leen. Boo xoolee, biñ leen ràbbe rafet nañ lool sax."
28
+ "Waaw lii de ab ëe ay kii la ay pañe la, pañe ndugg. Am na yu mboq, am na yu baxa ñu teg ko ci kaw lal walla kaw moket moket.. Xam naa ne ku ko jox jigéén dana si ndugg bu baax."
29
+ "Lii nag dañ ciy faral di def ndugg maanaam jigéen ñi bu ñuy dem ja ba ëe, moom lañuy yor. Ñu koy woowe ci farañse pañe.. Dañu koy gàddu nag. Buñ ko gàddoo dañu ciy def lu mel niki ay ndugg daal lu mel ne jën, batañse,suppome, jën, naaje... Yooyu daal lañuy def ci biir."
30
+ Lii ay toogukaayu mbedd la jardin public. Ñu ngi jagleel ñii di rombu ñii bëgg a noppaleeku ci ab gox mën nañu ñëw toog fii toog waxtaan.
31
+ Waaw lii de ab nataal la boog. Maa ngi gis ñaari jigéen yu toog noonu yor ay leget yu dénk. Leget la yu dénk ak ay paan.
32
+ "Lii na ñuy wax ""la tour la plus haute au Sénégal"", mi ngi nekk ""Monument de la Renaissance Africaine"". Mi ngi nekk barab bi ñu bare di daje. Moo xam ñiy tàngal séen yaram, ñiy def i kõseer, moo xam ñiy def simnastig. Ñoom ñépp a fay daje. Palaas publig la boo xam ni bii ñoom ñépp a fay daje. Képp ku am xew-xew boo xam ni bii xaj na fa dangay ñów nga defe ko fa. Bu ko defee ñépp ñëw. Am na ci ñoo xam ni tàggat yaram moo leen fa aandi. Am na ñoo xam ni da ñuy jóge dëkk ak dëkk, ñów di ko wisite si. Monimaŋ bi, ñi ngi ko jëmale pàpp yaay ak doom. Doom jaa ngi joxoñ, baay ji téyee yaay ji ci ndigg li."
33
+ "Lii duggukaay bu Luga la frontière bu Luga fii ngay dugg pour ñu wan la yaw mii nii yaa ngi dëkk biñ naan Luga. Luga nag ci Senegaal la bokk, ab lii la ci Senegaal."
34
+ "Fii gis naa fi ñaari palaat yu weex. Ñaari palaat yu weex yi, benn bi am na cere. Benn bi, ñeexum kese la. Cere ji nag, ñeex na ñu ko, waaye ca wet ga gis naa ci ñaari bool yuy feeñ. Ñu ngi feeñ."
35
+ "lii daal, am taax la mu gudd mu weex. bari lool ay ubbi féexal. ci kaw, juróom benni ubbi-féexal feeñ na fi. ci suuf, juróom benni ubbi-féexal feeñ na fi. ñu lal laltu bu xonq, ay nit yu sol lu xonq séq ko. waaw noo ngi ziyaar. yaang ci jàmm?"
36
+ Waaw nataal bii nuru na ma ab li ñiy tudde ab tableau d'art mooy ñi nga xam ni dañ leen di defar daldi leen di bindaat daldi leen di dessiné mi ngi may jox benn waay it koo xam ni góor la may nirool ñu def ko bopp bi rëy lool sax gémmiñ gaa ngi ubbéeku mu am ay bëñ yu weex ca biir ay bët yu sew it bët yi tamit ubbéeku na mu am per bu ñuul kukk am ci wet gi ay lu weex nga gis doq gi mu xaw a nóox lool am ay am tamit ci ginnaaw mu mel ni dañ koo wat amaale lu ñuulaale ñu daldi ñuulal ginnaaw gi lool sax lool lool.
37
+ "Nataal bii ma jot dee, Palais lay nuru waaye Palais bi nag am na ay garde corps yoo xam ne ñi ngi koy gardé, ba noppi mu am ñaari gaynde yoo xam ni ñoo ko séq. Palais bi nag paleb Senegaal lay nuru, ba noppi mu am ay xob yu wert ba noppi Palais bi bâtiment bi daa weex tàll."
38
+ "Gis naa ni lii ab nataal la. Gis naa ci itam ay nit. Gis naa ci ku sol lu xonq. gis naa ci ku sol lu ñuul. Gis naa ci ku sol mbaxane bu ñuul ak ay dàll yu ñuul. Gis naa itam ay xob ci ginnaaw; xob yu ""vertes"". Gis naa itam ay ""grillage"" yu ñuul."
39
+ Nataal bii de ñaari président la fii nag France la mu di président Maki Sàll ak Macron ñi ngi noon di dox ci affaire bu rouge bi am na oto yu topp seen gannaaw am na ñu taxaw seen wet yor lu mel ni ay fiil ay policier lañ Maki Sàll ak Macron ku nekk sol yére bu ñuul ak dàll yu ñuul kostin.
40
+ "Ñii de ay góor lañu, ñu yéeg ci ay fas yor ay bant wutali fa ñu jëm."
41
+ "Kale ngay séen njiitu réew mi la, Maki Sàll. Mi ngi nekk noonu ca kaw ndaamar ga, yékkati ay yoxoom. Leegi nag, polise yeek sàndarm si topp ko nii, toppekat yi di ko gàrde, war seeni ndaamar ñoom it. Njiitu réew mi moo nekk nee ca kow, ci ndaamar gu ñuul gi yékkati ay yoxoom."
42
+ "Waaw nataal bi de waa ji war ab moto la. Genn góor gu yab ab moto. Am na ñu nekk ci wetam. Mu ngi sol Lu bula, lu bula..sol lu bula, moto bi am na lu weex am na lu ñuul.. Am na ñu sëgg nee sol yu xonq"
43
+ "Ñii moom tamit ay takk-der nañ. Ñi ñi ngi bàyyikoo ci sàndarmëri nasiyonaal. Ñoom nag seen njariñ mooy di aar kaaraange njiitu réew mi. Ñu leen di tudde ci tubaab ""garde rouge, garde rapprochée ou garde présidentielle"". Ñoom la ñuy dénk wàllu kaaraange njiitu réew mi."
44
+ "Nataal bi ab takk-der la waaw, yéeg ci benn móto waaw. Ci gannaawam yeneen takk-der yi ñi ngi taxaw di ko xool."
45
+ "Ñenti alkaati, ñaar ñi yor seen bat ñow di àtte ñaar ñu nekk di xeex tëdd ci suuf. Mu am benn waay bu ta... Mu am benn taatu garab bu nekk ca wet ga, am benn waay bu ñëw."
46
+ "Nataal bii maa ngi ci gis ay takk-der, ñu jàpp kenn ci doomi ndawi réewum Senegaal, di ko bunduxataal, di ko mettital, di ko dóor ba xaw koo tëddal ci suuf."
47
+ "Lii nag ay pólisee yu jigéeñ la. Ëe bari nañu lool, ñu ràngoo ay fetal, sol ay gā yu weex, sol ay mbaxana, ëe, mbaxana mi tamit am na ay rëdd yu weex ci kaw, ak ay yére yu ñuul, jiital lu weex."
48
+ aaa mooy li ñu waxoon rekk ñii ay policiers lañ ñoo xam ni da ñu ànd ak seen seef. ñoom la ñu gis ñu nekk taxaw nii. seen seef bi nag moo nekk nee. xay na nag da ñoo war a taxaw rekk coster les bras nii ci li ñu man a gis nii. mu nekk loo xam ne daal lu kii la daal.
49
+ nataal bii Capitaine Séydinaa Omar Ture la. moo nekkoon section de recherche ñu radier ko.
50
+ "nataal bii mu ngi nekk ak nit, nit ku xees, ku toog ci baŋ bu ñuul wéerukaayu baŋ bi, ci fi ngay tegee sa bopp ni am ay bind yu weex. moom nag mu ngi tegle ñaari loxoom yi tiimale ko ci sikim bi. nit ki nak mu ngi sol yéreeg takk-der. maanaam yére bu am..."
51
+ "Waaw lii de comme ni ma la ko waxe woon rekk, ceebu jën la, ceebu jën, ñu teg ko ciii, ci ci ci ci ci palaat bu weex, teg lu mel ni ay ñambaan, daldi koy teg ci kow. Waaw boo xoolee ceeb bi, am na benn kuddu bu ñuul bu koy cubb, bu cubb ci kow nii, bu cubb cubb ñambaan ci kow."
52
+ Juróom-ñetteelu nataal bi seetlu naa ci cere joo xam ne dañ koo defar. Biñ ko defaree bam noppi mu àndak ñeex moo xam ne moom lañ koy siife.
53
+ Ba tey ñoo ngi gis ci nataal bii ay soldat soldat ñu ngi tollu ci fukkak ñaar ba fukkak ñett ci ay soldat soldat ya nga xam ne ñoom ñu ngi nekk ca palais ba kenn du laal palais bi ñoom nag duñ ree duñ kaf duñ muu dañuy mel ni ñu amul xol seen ginnaaw ñoo ngi gis buntu palais bi buntu palais bi rafet lool am benn tapis bu xonq am na ñaari gaynde ci ginnaaw am na ñeenti drapeau Senegaal ñoom nag ñoom ñoom noom waa Soldaar yi am na ñaari soldaar yoo xam ni ñi ngi sol seen i ay yére yu bula ak seen i tubéy yu ñuul bu fekkee ñi ci des nag ñoom ñoo bokk lenn lañ sol ñii sol seenum mbaxana mu xonq am lu noir am lu ñuul ak seen i tubéy yu ñuul yor seen i ay yar ñee sol seen i gã ngir mën a gardé suñu suñu njiit bi.
54
+ "Waaw ba tey lii de gis naa ne ag daamar, daamar gu taxaw la, ay sàndarma ñi ngi ci biir daamar ga, am na benn sàndarma bu nekk ca kanam. Waaw loolu de laa gis."
55
+ "Waaw nataal bii tamit de gis naa ci benn bool bu weex ñu def ci caakiri def rasin ca kaw, teg ko nii ci suuf lool daal laa gis."
56
+ "Kii de sennas ngóor la su taxaw si benn tali daldi yor aw xeer, takk masque. Xeer wi mu jëmale ko ca way-takk-der ya."
57
+ "Nataal bii maa ngi ciy gis ay takk-der yu yor ay ngànnaay takk ay casque. Am na it góor gu tëdd nii, ñu mel ni ñu jëmsi ci moom."
58
+ "Nataal bii nag, ay nit lañ yu taxaw di ñaxtu, maanaam di manifestewu, di jël ay xeef di leen sànnanteek takk-der yi. Am na nag ku nekk ci kanam nii solul maska, yor ay xeer di leen sànni. Mi ngi sol siletmaa bu weex ak tubéy ju ñuul. Mi ngi takk nag mbub lu xonq ci digg ginaaw gi."
59
+ "Nataal bii maa ngi ciy gis ku yor raaya Senegaal. Daa mel ni ab nag doxu-ñaxtu la doonoon. Dagg lu mel ne tàndarma, garabu koko bu xonq, teg ko ci digg tali bi. Garabu koko. Ñii yore ""velo"", ñii dox ak seen tànk."
60
+ "Waaw lii nag ab alkaati bu yor ngànnaayam, takk ndiggam di dàq ndaw sii di dem."
61
+ "Gis naa fi ay takk-der yoo xam ni ñoo ngi solu seen i tenue, uuf seen i baaraas yoo xam ni dañuy def manifestation pour aar seen bopp ci baaraas bi."
62
+ "Nataal bi ngeen ma yonnee, gis naa ni ay takk-der la. Takk der yoo xamantane bii nag, jël nañ loo xamantane bii mooy ay,... Ay baar. Ay verre yoo xam ne dañ kaa defar pour ñu leen di baare, ñu jàkkarlook askan wi di sànnanteek ñoom ay xeer ak ay doj."
63
+ "Waa ñii la nuy tuddee waay-takk -der yi. Ñii la ñuy tuddee sunu takk-der yi, ñooy waay takk-der yi. Pólise yi, ñoo taxaw nii daal dee takk ab... Daa dee sol benn mbaxanaam, daa di takk ab deram ci ndig li. Der gi nag dafa xaw a xonq tuuti. Mu ngi sol moom lu bula. Waaw... Daa di taxaw ci benn daamaar, mooy benn woto rekk. Benn woto la taxaw."
64
+ "Lii de ab nataal la. Boo xoolee yaa ngi gis lu mel ne ay kàdd lañ si def, ay matt yun dagg moom la gën a niru.Waaw! Boo xoole am na ay wirgo yu nëtëx yuy feeñ ci ginnaaw, wirgo yu nëtëx yuy feeñ ci ginnaaw. Xaw naa niru ab ndawal, xaw naa niroo itam ab ndawal."
65
+ Màndarga bi de; uhuum! dafa mel ne ni ma ko gise yàpp la. Xaw ma yàpp nag xaw ma yàppu xar lay doon waaye yàpp la. Teg ko ci kayit ; ci kayit lañ ko laxas. Waawaw! Tabarkalla! Maasàllaa! yàpp bi moom neex naa defar lool de.
66
+ "Haa lii de moom, nataal bii may séen moom koŋ fuule laa ko teg. Koŋ Fuule de moom ku ko def suppu-kànja moom, ku ko togg suppu kànja bu neex, def sa sippax ak sa yépp di nga am suppu kànja bu neex boo tegal jaxate-jaxate yi."
67
+ "Lii de lanuy tuddee jën, keccax, keccax. Ñu defar ko ba mu pare def si nataal."
68
+ "Lii ma gis nii de : aw yàpp wun làkk la. Waaye nag sooy xool jëmm yàpp wi day mel ni ginaar lay niru. Ñu ngi ko def ci aw kayit, dunq wi dangay gis ay karawi karaw"
69
+ "Nataal bii ma jot nag, lii nag daa ndiru yàpp. Yàpp wum mën a donn nag, waaye yàpp woo xam ni daal dañ mel ni ku ko làkk daal lay xaw a ndiru. xam naa ñi daal yàpp lañ koy wax."
70
+ "Waaw foto bi de, ñu ngi ciy gis benn palaat bu weex boo xam ne si dafa def laax. Laaxu araw. Laax yi nag nu ngi ko cifee cif boo xam ne sii nag cifaayu nëtëri la..."
71
+ "Ëe waaw bii ay takk-der la. Takk-der yi ñu jiital Guy Maris Saañaa. Moom lañu jiital nii, jàpp ko di ko jiital di ko yóbb waaw. Am na taskatu-xibaar bu nekk nële ca wet ga sol yére bu weex, bu bula ak tubéy bu ñuul."
72
+ Nataal bii day wane ndungu siin bu nekk ca Ndakaaru ñu koy woowe rëbes. Booy xool ci buntu bi am na ay nit yu fa taxaw ak ay alkaati.
73
+ "Lii ab nataal la buy wone benn béréb bu tudd ""Maison d'arrêt de de correction bu rëbës"". Rëbës nag fii ci Senegaal la nekk. Gis nanu ci bunt bu wert ak ay alkaati yu taxaw ci bunt bi. Gis nanu fi ay nit ñu bari. Ñenn ñi sol yére yu bula, ñeneen ci ñu sol yére yu ñuul."
74
+ "Maa ngi gis ci nataal bi benn buntu, benn buntu bu pharmacie bu nice bu vert mooy yooyu di nekk ci buntu muy benn gendarmerie..."
75
+ Waaw mbokk mi nataal bii de bi ma xoolee gis naa ci ay góor ñu taxaw ñu yékkati seen i loxo jàkkarloo ak stade bi gis naa ci ku yékkati drapeau Senegaal bi ca kaw.
76
+ Lii ab barabu bu xalaatukaayu xam-xam la. Ñu gis ay nit ñu toog ci seeni buro di sëg ci seeni ordinaatëer. Buro yi nag ñu ngi am ay kulëer yu baxa ak ay làmp yu tàkk ci kow.
77
+ "Nataal bii nag moom, ag jigéen a ci nekk yor ag cameraa teg ko ci bët yi, di xool talal loxo bi. Am na lu ci nekk lu mel ni sãru lënd bulo am ay... Ay làmp ci koñ yi."
78
+ "Waaw nataal bii, nataal la boob as ndaw moo fi nekk jël ay am mbir teg ko ci ay gëtam yékkati ab yoxoom. Am na tamit mbir mu baxa loo xam ne mi ngi ko wër mel ni ag lënd."
79
+ "Nataal bi ngeen ma yónnee mooy juróom fukkeelu nataal bi ak ñaar moom ngeen ma yónneewoon léegi nii. Tontu naa ci waaye li ma ci gis mooy am jumtukaay la boo xam ne ci wàllu ""internet"" la jëm te jumtukaay boobu te moom lañ naan ""réseau"" bi nga xam ne mooy tigo te mu soppeeku nekk free moom la. Am na am mbind mu xonq ca kaw. Moom lu ñuul la dafa ñuul, moom jumtukaay bi am lu xonq ci kaw, am tamit ay bësukaay."
80
+ "Nataal bii nag gis nan ci ndox maanaam dex . Nit ñi toog ci digg dex gi, ëe... Di naan. Seeni, ëe... Di noos, di noos. Gis nan si ay góor ak ay jigéen. Gis nan sii... benn góor moomit moom mu tée doomam di ko nax yaayam mu ngi leen di, mu ngi leen di nataal. Mu leen di dóor nataal. Moom mu ngi sol yére bu ñuul ni ibaadu yi di soloo. Ñi ci des ñoom , ñu ngi toog di waxtaan di naan ndox."
81
+ "Waaw nataal bi ñi ngi ci gis ay jëmmu yu toog, ñu tiim seen ay jumtukaay, jumtukaay ak ñoom seen."
82
+ "Nataal bii ngeen ma yónnee de centre bu mag a mag la boo xam ne jigéen ñi ak góor ñi ñépp ñoo ngi cib liggéey, ku nekk yaa ngi tiim sab ordinateur di liggéey."
83
+ "Nataal bii maa ngi ciy gis ab ordinatéer manaam ab nosukaay, nosukaay laa ci gis mooy ordinatéer moom laa ci gis ñu taal ko. Moom rekk laa ci gis."
84
+ "Li ñi jot a gis ab ""tampon"" la walla sax ñu naan ko ""cachet"". ""Cachet"" nag dafa nekk loo xamente ni moom lañii jël ngir màndargaal yoo xamente ne ay nit ñoo ko bokk walla sax foo xamante ne kenn nit moo ko moom. Loolu nag ""tampon"" bu nekk am na fu mu jóge am na lu muy wone ci bataaxal buñ ko def"
85
+ "Lii ab torlu la, maanaam njëfandikukaayu torlu la. Ag jëfekaay la boo xam ni bii mën. nga cee def sab torlu. Torlu nag mooy màndarga ginnaaw boo bindee ab bataaxal ci misaal, ca mujj ga nga def ci sa torlu. Loolu day wone ni yow kat bataaxal bii yaa ko bind. Walla bu fekkee da ngaa séq ak nit, nga jël sa torlu def ca, day wone ni yow kat yaa séq ak nit ki lii ci ca jëmm. Kon torlu lu am a am solo la. Lii nag ab jëfandikukaat la boo xam ni mën nga cee def sa torlu bopp Boo xam ni da ngay bës rek my daldi tàmpe ci kayit gi, daldi ci sax. Mu nekk sa màndarga."
86
+ Waaw nataal bii dafay wone ab masin bu ndaw boo xam ni dan cii bind waxtu . Masin bi daa mel ni xool waxtu moo ko tax a jóg. Waayit mënees na ko jifandikoo xéy na ci generen waaw..
87
+ "Benn montar la rekk, benn montar. Witëer taransiis moo jot, waaw witëer taransiis moo jot, benn montar la bu weex. Benn montar bu weex la, waaw."
88
+ "Waaw nataal bii de jumtukaay la. Jumtukaay bi melo bi dafa xonq, ci biir ëe lu xonq li lu weex moo nekk ci biir. Am na ay joxoñ yu ñuul waaw. Mu mel ne ""façon"" tele."
89
+ "Gis naa fi benn affaire bu am ñaari couleur. Couleur bu xonq ak couleur bu weex. Ci suuf ñi ngi ci bind presse, mbind mu ñuul presse."
90
+ "Waaw lii nag jumtukaay la bu xereñ boo xam ni ni mën nan cee nattee, xaymaa mbir mu mu mën a doon ; lu mel ne cee, ceeb walla dugub walla suukër, walla lu ni mel mën ka cee natt ba xayma ab tolluwaayam."
91
+ "Waaw nataal bii ag xarala la goo xamente ne ""écran"" la bu mag ci saalu sinemaa, nit ñi toog di seetaan, nit ñu bari. Am na ku toog ci puus-puus, am ñu taxaw. Am na xale bu ci nekk."
92
+ Lii de ab jollasu bu ñu tijji. Ki ko tijji nag nuru na ab defarkat ngir saytu lan mooy sikk si.
93
+ "Lii ab palaas bu nu tëj la aa, dessin nanu si benn nit muy niroo ak nit ku weex, takk ay weer ag genn jollasu ci wetu ndeyjoor gi. Palaas bi nag palaasu defarkatu telefon la may, defarkatu jollasu la may nirool. Waa ñu peinture ko peinture bu ñuul, peinture boo xam ne ay kulëer la. Mu mel ne gis naa si ñuul, gis naa si kulëer bu xonq, gis naa si tamit kulëer bu soon."
94
+ Nataal bii gis naa ci luñ bind télécentre......
95
+ "Nataal bii ab tele moo ci nekk bu ànd ak rojo. Mi ngi am ci digg bi lu verte. Am na ay biton yu nekk ci wet gi, ñaari biton. Am na tamit ab ""baffle"" bu nekk ci suuf. Am na tamit ñaari NTN yu taxaw."
96
+ Waaw liñ seetlu si nataal bi nag ab radio la radio bi nag am na ñaari xëccukaay kaay si kaw loolu lañ seetlu si nataal bi ab radio la daal.
97
+ "Lii ngeen gis nag ci nataal bi palaa la boo xam ne saa senegaal yi ñoo ëpp li ñu koy togg muy domodaa. Am na limoŋ ci kaw, am na kaani, am na bulet, am na pombiteer, am na karoot, am na suppome. Lu weex li nag mooy ñànkatañ bi."
98
+ Kii senn waay la di yor ub telefon. Moom nag mi ngi ko téye ak ñaari yoxoom di ci fo.
99
+ "Lii benn nit la boo xam ne dafa yor lu ñuy fowe foyukaay, foyukaay la mi ngi tuddu PES"
100
+ Nataal bii de ag jollasu la gu xaw a ñuul am ay bitoŋ ñaari mbind am mbind mu ñuul ci kaw. Mu ngi nekk ñaari jollasu daal yu tege ci suuf. Am na lu mel ni ag kees gu tege ci wetam gu ñuul.
101
+ "Waaw lii mën nanu wax ay jumtykaay la ; jumtukaay yu bees yi boo xam ne soo tàngee danga koy taal nga sedd guyy waaw. Ci ay jumtukaay yu bees la, ba mu xewee sax yàggul."
wolof-translate/wolof_translate/data/sentences/wolof_french.csv CHANGED
The diff for this file is too large to render. See raw diff
 
wolof-translate/wolof_translate/utils/__init__.py ADDED
File without changes
wolof-translate/wolof_translate/utils/database_manager.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pymongo.mongo_client import MongoClient
2
+ from pymongo.server_api import ServerApi
3
+ import pandas as pd
4
+
5
class TranslationMongoDBManager:
    """Thin helper around a MongoDB database that stores a translation corpus.

    Live sentences are kept in one collection (``"sentences"`` by default).
    Sentences removed through :meth:`delete_document` are copied to a second
    collection (``"deleted"`` by default) instead of being discarded.

    Documents inserted through :meth:`insert_document` receive an integer
    ``_id`` equal to the largest existing ``_id`` plus one (``0`` for an
    empty collection).

    NOTE: ids are allocated with a read followed by a write, so two writers
    inserting at the same moment can compute the same ``_id``.
    """

    def __init__(self, uri: str, database: str):
        """Connect to the MongoDB deployment and select a database.

        Args:
            uri: Connection string handed to ``MongoClient``.
            database: Name of the database to work on.
        """
        # keep the client so the connection stays reachable from the manager
        self.client = MongoClient(uri)

        # database handle used by every other method
        self.db = self.client.get_database(database)

    def _newest(self, collection: str):
        """Return the document with the largest ``_id``, or ``None`` if empty."""
        # -1 is pymongo's DESCENDING sort order
        return self.db[collection].find_one(sort=[('_id', -1)])

    def _next_id(self, collection: str) -> int:
        """Return the next free integer ``_id`` of ``collection`` (0 if empty)."""
        newest = self._newest(collection)

        return 0 if newest is None else newest['_id'] + 1

    def insert_documents(self, documents: list, collection: str = "sentences"):
        """Insert several documents at once.

        The documents are passed to ``insert_many`` unchanged; no ``_id`` is
        assigned here.

        Args:
            documents: Documents to insert.
            collection: Name of the target collection.

        Returns:
            The value returned by ``insert_many``.
        """
        return self.db[collection].insert_many(documents)

    def insert_document(self, document: dict, collection: str = "sentences"):
        """Append one document, giving it the next free integer ``_id``.

        Any ``_id`` already present in ``document`` is overwritten, and the
        dictionary passed by the caller is updated in place with the id that
        was assigned.

        Args:
            document: Document to insert.
            collection: Name of the target collection.

        Returns:
            The value returned by ``insert_one``.
        """
        # max id + 1 (0 for an empty collection); re-using the max id itself
        # would collide with the document that already owns it
        document['_id'] = self._next_id(collection)

        return self.db[collection].insert_one(document)

    def update_document(self, id: int, document: dict, collection: str = "sentences"):
        """Overwrite fields of the document whose ``_id`` equals ``id``.

        Args:
            id: ``_id`` of the document to modify.
            document: Field/value pairs applied with ``$set``; fields that are
                not listed are left untouched.
            collection: Name of the collection holding the document.

        Returns:
            The value returned by ``update_one``.
        """
        selector = {'_id': {'$eq': id}}

        return self.db[collection].update_one(selector, {'$set': document})

    def delete_document(self, id: int, document: dict, collection: str = "sentences", del_collection: str = "deleted"):
        """Move the document identified by ``id`` into the archive collection.

        The document is first copied into ``del_collection`` under a fresh
        ``_id`` and only then removed from ``collection``, so a failure while
        archiving never loses the sentence.

        Args:
            id: ``_id`` of the document to remove.
            document: Unused; kept so existing callers keep working.
            collection: Name of the collection the document lives in.
            del_collection: Name of the collection receiving the archived copy.

        Returns:
            The value returned by ``insert_one`` on the archive collection.

        Raises:
            KeyError: If ``collection`` holds no document with this ``_id``.
        """
        selector = {'_id': {'$eq': id}}

        # recuperate the document to delete
        found = self.db[collection].find_one(selector)

        if found is None:
            raise KeyError(f"no document with _id {id!r} in collection {collection!r}")

        # archived copy gets its own id inside the archive collection
        archived = dict(found)
        archived['_id'] = self._next_id(del_collection)

        results = self.db[del_collection].insert_one(archived)

        # the copy is safely stored: the original can now be removed
        self.db[collection].delete_one(selector)

        return results

    def get_max_id(self, collection):
        """Return the largest ``_id`` stored in ``collection``.

        Args:
            collection: Name of the collection to inspect.

        Raises:
            IndexError: If the collection contains no document.
        """
        newest = self._newest(collection)

        if newest is None:
            raise IndexError(f"collection {collection!r} is empty")

        return newest['_id']

    def save_data_frames(self, sentences_path: str, deleted_path: str, collection: str = "sentences", del_collection: str = "deleted"):
        """Dump both collections to CSV files, without the ``_id`` column.

        Args:
            sentences_path: Destination of the CSV built from ``collection``.
            deleted_path: Destination of the CSV built from ``del_collection``.
            collection: Name of the collection with the live sentences.
            del_collection: Name of the collection with the archived sentences.
        """
        new_corpora, deleted_df = self.load_data_frames(collection, del_collection)

        # an empty collection yields a frame with no '_id' column at all,
        # hence errors='ignore'
        new_corpora.drop(columns='_id', errors='ignore').to_csv(sentences_path, index=False)

        deleted_df.drop(columns='_id', errors='ignore').to_csv(deleted_path, index=False)

    def load_data_frames(self, collection: str = "sentences", del_collection: str = "deleted"):
        """Load both collections as pandas data frames.

        Args:
            collection: Name of the collection with the live sentences.
            del_collection: Name of the collection with the archived sentences.

        Returns:
            A ``(sentences, deleted)`` tuple of data frames, one row per
            document, ``_id`` column included.
        """
        # recuperate the new corpora
        new_corpora = pd.DataFrame(list(self.db[collection].find()))

        # recuperate the deleted sentences as a Data Frame
        deleted_df = pd.DataFrame(list(self.db[del_collection].find()))

        return new_corpora, deleted_df