Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -40,6 +40,7 @@ st.sidebar.write("""
|
|
40 |
6. **Translation Tips:**
|
41 |
- English as the target language gives the best results.
|
42 |
- You can also do inter-language translation i.e yoruba to igbo
|
|
|
43 |
7. **Performance Note:**
|
44 |
- The model's performance varies due to its size and training data. It performs best on text generation and translation.
|
45 |
- For other tasks, try multiple times if model's output is not optimal (This is due to the generator's sampling parameter settings).
|
@@ -90,7 +91,7 @@ st.title("SabiYarn-125M : Generates text in multiple Nigerian languages.")
|
|
90 |
st.write("**Supported Languages: English, Yoruba, Igbo, Hausa, Pidgin, Efik, Urhobo, Fulfulde, Fulah. \nResults may not be coherent for less represented languages (i.e Efik, \
|
91 |
Urhobo, Fulfulde, Fulah).**")
|
92 |
st.write("**It takes a while (~25s) to return an output on the first 'generate' click. Avg response time: 1-2s on GPU, 40s on CPU**")
|
93 |
-
st.write("**Model outputs 80 tokens as default. Adjust in the side bar
|
94 |
st.write("**For convenience, you can use chatgpt to provide input text and translate/evaluate model output.**")
|
95 |
st.write("-" * 50)
|
96 |
popular_topics = [
|
@@ -101,6 +102,12 @@ popular_topics = [
|
|
101 |
"Philosophy", "Religion", "Society", "World"
|
102 |
]
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
async def assign_topic(generated_text, topic_list=popular_topics):
|
105 |
lower_generated_text = generated_text.lower()
|
106 |
for topic in topic_list:
|
@@ -113,7 +120,7 @@ def count_sentences(text):
|
|
113 |
sentences = re.split(r'[.!?]+\s*', text.strip())
|
114 |
# Filter out any empty strings from the resulting list
|
115 |
sentences = [sentence for sentence in sentences if sentence]
|
116 |
-
return len(sentences)
|
117 |
|
118 |
def wrap_text(text, task_value):
|
119 |
tasks = ["<classify>", "<prompt>", "<clean>", "<title>", "<diacritize>", "<translate>"]
|
@@ -170,8 +177,8 @@ sample_texts = {
|
|
170 |
"Translate 'Often, all Yoruba children...' to Yoruba": "Often, all Yoruba children take pride in speaking the Yoruba language.",
|
171 |
"Classify the sentiment": "Anyi na-echefu oke ike.",
|
172 |
"what is the topic of this text": "Africa Free Trade Zone: Kò sí ìdènà láti kó ọjà láti orílẹ̀èdè kan sí òmíràn",
|
173 |
-
|
174 |
-
|
175 |
"headline of this text": '** Sylvain Itté French ambassador don comot Niger Republic **. Sylvain Itté, di French ambassador for Niger don comot Niamey and currently e dey for flight from Ndjamena to Paris. Sylvain Itté, di French ambassador for Niger don comot Niamey very early dis morning and currently e dey for flight from Ndjamena to Paris.\n\nDi military detain Bazoum and im family for di presidential palace. Niger na former French colony, and France still get 1,500 sojas for di African country.\n\n"France don decide to withdraw dia ambassador. In di next hours our ambassador and several diplomats go return to France," Oga Macron tok.\n\nE add say di military co-operation dey "over" and French troops go leave in "di months to come".\n\n"Dis Sunday we celebrate one new step towards di sovereignty of Niger," di junta tok, for one statement wey AFP news agency quote.\n\nDi decision by Paris dey come afta months of hostility and protest against di presence of French for di kontri, wit regular demonstrations for di capital Niamey.\n\nDi move don scata France operations against Islamist militants for di wider Sahel region and Paris influence for there. But oga Macron tok say "putschists no go hold France hostage,"'
|
176 |
}
|
177 |
instruction_wrap = {
|
@@ -280,7 +287,7 @@ if st.button("Generate"):
|
|
280 |
|
281 |
|
282 |
# generated_text = re.split(r"\|(end_f_text|end_of_text|end_ofext|end_of_text_||end_of_te|end_oftext)|:|`", generated_text)[0]
|
283 |
-
generated_text = re.sub(r"\|(end_f_text|end_of_text|end_ofext|end_of_text_|end_of_te|end_o|end_of_tet|end_oftext)
|
284 |
generated_text = generated_text.strip("\n")
|
285 |
# print("Generated text: ", generated_text)
|
286 |
|
@@ -295,17 +302,18 @@ if st.button("Generate"):
|
|
295 |
elif task == "Topic Classification" or "<topic>" in wrapped_input:
|
296 |
generated_text = generated_text[:15]
|
297 |
# print("split", generated_text.split(" ")[0], re.split(r"\.|\n|\*\*|\*", generated_text)[0], generated_text.split(" "))
|
298 |
-
generated_text = re.split(r"\.|\n|\*\*|\*", generated_text)[0]
|
299 |
generated_text = asyncio.run(assign_topic(generated_text))
|
300 |
|
301 |
elif task == "Translation" or "<translate>" in wrapped_input:
|
302 |
# print("split for translation: ", n_sentences, re.split(r"\.|\n", generated_text)[:n_sentences])
|
303 |
-
n_sentences = count_sentences(initial_input)
|
304 |
-
|
|
|
305 |
|
306 |
elif task == "Question Generation" or "Question Generation:" in sample_text:
|
307 |
if "?" in generated_text:
|
308 |
-
generated_text = "?".join(re.split(r"\?", generated_text)[:-1]) + "?"
|
309 |
|
310 |
|
311 |
full_output = st.empty()
|
|
|
40 |
6. **Translation Tips:**
|
41 |
- English as the target language gives the best results.
|
42 |
- You can also do inter-language translation i.e yoruba to igbo
|
43 |
+
- Use sentences instead of words for better results.
|
44 |
7. **Performance Note:**
|
45 |
- The model's performance varies due to its size and training data. It performs best on text generation and translation.
|
46 |
- For other tasks, try multiple times if model's output is not optimal (This is due to the generator's sampling parameter settings).
|
|
|
91 |
st.write("**Supported Languages: English, Yoruba, Igbo, Hausa, Pidgin, Efik, Urhobo, Fulfulde, Fulah. \nResults may not be coherent for less represented languages (i.e Efik, \
|
92 |
Urhobo, Fulfulde, Fulah).**")
|
93 |
st.write("**It takes a while (~25s) to return an output on the first 'generate' click. Avg response time: 1-2s on GPU, 40s on CPU**")
|
94 |
+
st.write("**Model outputs 80 tokens as default. Adjust in the side bar (longer inputs/tokens will increase response time). MAX TOKENS=1024**")
|
95 |
st.write("**For convenience, you can use chatgpt to provide input text and translate/evaluate model output.**")
|
96 |
st.write("-" * 50)
|
97 |
popular_topics = [
|
|
|
102 |
"Philosophy", "Religion", "Society", "World"
|
103 |
]
|
104 |
|
105 |
+
def extract_answer(text):
    """Return the leading part of *text* up to the first run-on boundary.

    A run-on boundary is a lowercase letter immediately followed by an
    uppercase letter (e.g. ``"PositiveNegative"``), which is taken to mean
    that a second answer was glued onto the first without a space.  The
    lowercase letter is kept; everything from the uppercase letter onward
    is dropped.

    Case is determined with ``str.islower`` / ``str.isupper`` rather than
    the ASCII-only classes ``[a-z]`` / ``[A-Z]`` so that letters such as
    ``ọ``, ``ẹ`` or ``à`` are recognised.  Combining marks (for example a
    tone mark stored as a separate code point after its base letter) are
    skipped when looking for the preceding letter and stay attached to it
    in the returned text.

    Args:
        text: The string to trim.

    Returns:
        The prefix of ``text`` ending just before the first uppercase
        letter that directly follows a lowercase letter, or ``text``
        unchanged when no such boundary exists.
    """
    import unicodedata

    previous_base = ""  # "".islower() is False, so index 0 never matches
    for index, char in enumerate(text):
        if unicodedata.combining(char):
            # A combining mark belongs to the preceding base letter; it
            # must not hide that letter's case from the boundary check.
            continue
        if previous_base.islower() and char.isupper():
            return text[:index]
        previous_base = char
    return text
|
110 |
+
|
111 |
async def assign_topic(generated_text, topic_list=popular_topics):
|
112 |
lower_generated_text = generated_text.lower()
|
113 |
for topic in topic_list:
|
|
|
120 |
sentences = re.split(r'[.!?]+\s*', text.strip())
|
121 |
# Filter out any empty strings from the resulting list
|
122 |
sentences = [sentence for sentence in sentences if sentence]
|
123 |
+
return len(sentences), sentences
|
124 |
|
125 |
def wrap_text(text, task_value):
|
126 |
tasks = ["<classify>", "<prompt>", "<clean>", "<title>", "<diacritize>", "<translate>"]
|
|
|
177 |
"Translate 'Often, all Yoruba children...' to Yoruba": "Often, all Yoruba children take pride in speaking the Yoruba language.",
|
178 |
"Classify the sentiment": "Anyi na-echefu oke ike.",
|
179 |
"what is the topic of this text": "Africa Free Trade Zone: Kò sí ìdènà láti kó ọjà láti orílẹ̀èdè kan sí òmíràn",
|
180 |
+
"diacritize this text: ": "E sun, Alaga, fun ise amalayi ti e n se ni Naijiria. E maa ba a lo, egbon!",
|
181 |
+
"clean this text": "Abin mamaki ne aikin da shugabaZn HNajeriya ybake yi. kCiF 39gaba Tda haRkGa sir!",
|
182 |
"headline of this text": '** Sylvain Itté French ambassador don comot Niger Republic **. Sylvain Itté, di French ambassador for Niger don comot Niamey and currently e dey for flight from Ndjamena to Paris. Sylvain Itté, di French ambassador for Niger don comot Niamey very early dis morning and currently e dey for flight from Ndjamena to Paris.\n\nDi military detain Bazoum and im family for di presidential palace. Niger na former French colony, and France still get 1,500 sojas for di African country.\n\n"France don decide to withdraw dia ambassador. In di next hours our ambassador and several diplomats go return to France," Oga Macron tok.\n\nE add say di military co-operation dey "over" and French troops go leave in "di months to come".\n\n"Dis Sunday we celebrate one new step towards di sovereignty of Niger," di junta tok, for one statement wey AFP news agency quote.\n\nDi decision by Paris dey come afta months of hostility and protest against di presence of French for di kontri, wit regular demonstrations for di capital Niamey.\n\nDi move don scata France operations against Islamist militants for di wider Sahel region and Paris influence for there. But oga Macron tok say "putschists no go hold France hostage,"'
|
183 |
}
|
184 |
instruction_wrap = {
|
|
|
287 |
|
288 |
|
289 |
# generated_text = re.split(r"\|(end_f_text|end_of_text|end_ofext|end_of_text_||end_of_te|end_oftext)|:|`", generated_text)[0]
|
290 |
+
generated_text = re.sub(r"\|(end_f_text|end_of_text|end_ofext|end_of_text_|end_of_te|end_o|end_of_tet|end_oftext)|:|`", "", generated_text)
|
291 |
generated_text = generated_text.strip("\n")
|
292 |
# print("Generated text: ", generated_text)
|
293 |
|
|
|
302 |
elif task == "Topic Classification" or "<topic>" in wrapped_input:
|
303 |
generated_text = generated_text[:15]
|
304 |
# print("split", generated_text.split(" ")[0], re.split(r"\.|\n|\*\*|\*", generated_text)[0], generated_text.split(" "))
|
305 |
+
generated_text = re.split(r"\.|\n|\*\*|\*", generated_text)[0]
|
306 |
generated_text = asyncio.run(assign_topic(generated_text))
|
307 |
|
308 |
elif task == "Translation" or "<translate>" in wrapped_input:
|
309 |
# print("split for translation: ", n_sentences, re.split(r"\.|\n", generated_text)[:n_sentences])
|
310 |
+
n_sentences, split_= count_sentences(initial_input)
|
311 |
+
print(n_sentences,split_)
|
312 |
+
generated_text = ". ".join(re.split(r"\.|\n", generated_text)[:n_sentences]) + "."
|
313 |
|
314 |
elif task == "Question Generation" or "Question Generation:" in sample_text:
|
315 |
if "?" in generated_text:
|
316 |
+
generated_text = "? ".join(re.split(r"\?", generated_text)[:-1]) + "?"
|
317 |
|
318 |
|
319 |
full_output = st.empty()
|