Spaces:
Build error
Build error
plot per language
Browse files
app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from ctypes.wintypes import LANGID
|
|
|
2 |
from email.policy import default
|
3 |
import pycountry
|
4 |
import os
|
@@ -179,7 +180,16 @@ def get_metadata_json(path):
|
|
179 |
except Exception:
|
180 |
return []
|
181 |
|
182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
repo.git_pull()
|
184 |
REPOSITORY_DATA_DIR = os.path.join(REPOSITORY_DIR,'data')
|
185 |
repo_recordings = [os.path.join(REPOSITORY_DATA_DIR,f.name) for f in os.scandir(REPOSITORY_DATA_DIR)] if os.path.isdir(REPOSITORY_DATA_DIR) else []
|
@@ -188,29 +198,8 @@ def show_records():
|
|
188 |
audio_repo = [a.replace('data/data/','https://huggingface.co/datasets/chrisjay/crowd-speech-africa/resolve/main/data/') for a in audio_repo]
|
189 |
metadata_all = [get_metadata_json(os.path.join(f,'metadata.jsonl')) for f in repo_recordings]
|
190 |
metadata_all = [m for m in metadata_all if m!=[]]
|
191 |
-
|
192 |
-
|
193 |
-
langs=[m['language_name'] for m in metadata_all]
|
194 |
-
lang_dict = Counter(langs)
|
195 |
-
lang_dict.update({'All others':0})
|
196 |
-
all_langs = list(lang_dict.keys())
|
197 |
-
langs_count = [lang_dict[k] for k in all_langs]
|
198 |
-
y_pos = np.arange(len(all_langs))
|
199 |
-
plt.barh(all_langs, langs_count)
|
200 |
-
plt.ylabel("Language")
|
201 |
-
plt.xlabel('Number of audio samples')
|
202 |
-
plt.title('Distribution of audio samples over languages')
|
203 |
-
|
204 |
-
#audios = [a for a in audios_all]
|
205 |
-
#texts = [m['text'] for m in metadata_all]
|
206 |
-
#numbers = [m['number'] for m in metadata_all]
|
207 |
|
208 |
-
html = f"""<div class="infoPoint">
|
209 |
-
<h1> Hooray! We have collected {len(metadata_all)} samples!</h1>
|
210 |
-
"""
|
211 |
-
|
212 |
-
return html,plt
|
213 |
-
|
214 |
|
215 |
|
216 |
def display_records():
|
@@ -315,9 +304,62 @@ with block:
|
|
315 |
</div>
|
316 |
""")
|
317 |
plot = gr.Plot(type="matplotlib")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
319 |
#listen = gr.Button("Listen")
|
320 |
listen_tab.select(show_records,inputs=[],outputs=[display_html,plot])
|
|
|
|
|
|
|
|
|
|
|
|
|
321 |
gr.Markdown(ARTICLE)
|
322 |
|
323 |
block.launch()
|
|
|
1 |
from ctypes.wintypes import LANGID
|
2 |
+
from curses import meta
|
3 |
from email.policy import default
|
4 |
import pycountry
|
5 |
import os
|
|
|
180 |
except Exception:
|
181 |
return []
|
182 |
|
183 |
+
|
184 |
+
def plot_bar(value,name,x_name,y_name,title):
    """Draw a horizontal bar chart on its own figure and return that figure.

    Args:
        value: Bar lengths, one per entry of ``name``.
        name: Bar labels, placed along the y axis.
        x_name: Label for the x axis.
        y_name: Label for the y axis.
        title: Chart title.

    Returns:
        The ``matplotlib.figure.Figure`` that holds the chart.
    """
    # Use a dedicated figure per call. Drawing through the global ``plt``
    # state puts every chart on the same current axes, so bars from earlier
    # calls leak into later plots and two charts returned together are
    # really one and the same object.
    fig, ax = plt.subplots()
    ax.barh(name, value)
    ax.set_ylabel(y_name)
    ax.set_xlabel(x_name)
    ax.set_title(title)
    return fig
|
191 |
+
|
192 |
+
def get_metadata_of_dataset():
|
193 |
repo.git_pull()
|
194 |
REPOSITORY_DATA_DIR = os.path.join(REPOSITORY_DIR,'data')
|
195 |
repo_recordings = [os.path.join(REPOSITORY_DATA_DIR,f.name) for f in os.scandir(REPOSITORY_DATA_DIR)] if os.path.isdir(REPOSITORY_DATA_DIR) else []
|
|
|
198 |
audio_repo = [a.replace('data/data/','https://huggingface.co/datasets/chrisjay/crowd-speech-africa/resolve/main/data/') for a in audio_repo]
|
199 |
metadata_all = [get_metadata_json(os.path.join(f,'metadata.jsonl')) for f in repo_recordings]
|
200 |
metadata_all = [m for m in metadata_all if m!=[]]
|
201 |
+
return metadata_all
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
|
204 |
|
205 |
def display_records():
|
|
|
304 |
</div>
|
305 |
""")
|
306 |
plot = gr.Plot(type="matplotlib")
|
307 |
+
metadata_all = get_metadata_of_dataset()
|
308 |
+
|
309 |
+
def show_records():
    """Build the summary HTML and the per-language bar chart.

    Reads ``metadata_all`` from the enclosing scope; each entry is indexed
    with the ``'language_name'`` key.

    Returns:
        A ``(html, plot)`` pair: an HTML snippet reporting how many samples
        are in ``metadata_all``, and whatever ``plot_bar`` returns for the
        language distribution.
    """
    lang_dict = Counter(m['language_name'] for m in metadata_all)
    # Updating with a zero count only registers the 'All others' key (as an
    # empty bar); it does not change any existing count.
    lang_dict.update({'All others':0})
    all_langs = list(lang_dict.keys())
    langs_count = [lang_dict[k] for k in all_langs]
    plt_ = plot_bar(langs_count,all_langs,'Number of audio samples',"Language",'Distribution of audio samples over languages')
    # The wrapper <div> is closed here so the fragment is well-formed HTML.
    html = f"""<div class="infoPoint">
<h1> Hooray! We have collected {len(metadata_all)} samples!</h1>
</div>
"""

    return html,plt_
|
323 |
+
|
324 |
+
|
325 |
+
|
326 |
+
def _make_language_plotter(language, digits_name, digits_count, gender_name, gender_count):
    """Return a zero-argument callback that plots one language's charts.

    The values are bound when this factory is called. A closure defined
    directly inside the loop would look its variables up only when the
    callback fires, i.e. after the loop has finished, so every row would
    plot the data of the last language.
    """
    def plot_metadata_for_language():
        plt_digits = plot_bar(digits_count,digits_name,'Number of audio samples',"Digit",f"Distribution of audio samples over digits for {language.upper()} ")
        # This title used to say "over digits", copied from the chart above.
        plt_gender = plot_bar(gender_count,gender_name,'Number of audio samples',"Gender",f"Distribution of audio samples over genders for {language.upper()}")
        return plt_digits, plt_gender

    return plot_metadata_for_language


# One row per language, holding a digit-distribution plot and a
# gender-distribution plot for that language.
languages = list(Counter(m['language_name'] for m in metadata_all))
for language in languages:
    with gr.Row() as row_lang:
        metadata_for_language = [m for m in metadata_all if m['language_name']==language]
        digits_for_language = [m['number'] for m in metadata_for_language]
        # An empty gender string is reported under an explicit label.
        gender_for_language = [m['gender'] if m['gender']!="" else 'Not given' for m in metadata_for_language]

        digits_dict = Counter(digits_for_language)
        gender_dict = Counter(gender_for_language)

        # keys() and values() iterate in the same order, so names and counts
        # stay aligned.
        digits_name_for_language = list(digits_dict.keys())
        digits_count_for_language = list(digits_dict.values())

        gender_name_for_language = list(gender_dict.keys())
        gender_count_for_language = list(gender_dict.values())

        plot_digits = gr.Plot(type="matplotlib")
        plot_gender = gr.Plot(type="matplotlib")

        # NOTE(review): confirm that gr.Row exposes a `select` event in the
        # Gradio version this Space pins.
        row_lang.select(
            _make_language_plotter(
                language,
                digits_name_for_language,
                digits_count_for_language,
                gender_name_for_language,
                gender_count_for_language,
            ),
            inputs=[],
            outputs=[plot_digits,plot_gender],
        )
|
353 |
+
|
354 |
+
|
355 |
#listen = gr.Button("Listen")
|
356 |
listen_tab.select(show_records,inputs=[],outputs=[display_html,plot])
|
357 |
+
|
358 |
+
|
359 |
+
# TODO: build a list of the languages (lang).
|
360 |
+
# We want the digit distribution and the gender distribution for each language.
|
361 |
+
# for l in range(0, len(lang), 4):
|
362 |
+
#     with gr.Row(): ...
|
363 |
gr.Markdown(ARTICLE)
|
364 |
|
365 |
block.launch()
|
data
CHANGED
@@ -1 +1 @@
|
|
1 |
-
Subproject commit
|
|
|
1 |
+
Subproject commit af4ec56533825ccc0877c32d8ad73301181e8e98
|