# NOTE: removed non-code scrape residue (byte-size header, git-blame hash gutter,
# and line-number gutter) that would break the Python file if left in place.
import gradio as gr
import subprocess,os
from datasets import load_dataset, Audio
import datas,ctcalign,graph
from numpy import random
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
def setup():
    """Download and build Google's REAPER pitch tracker into ./REAPER/build.

    Runs shell tools (wget/unzip/cmake/make) via subprocess and prints their
    stdout so progress is visible in the hosting logs. Returns to the original
    working directory before returning.
    """
    cwd = subprocess.run(["pwd"], capture_output=True, text=True)
    print('PWD::', cwd.stdout)
    # fetch the REAPER source archive from GitHub
    fetched = subprocess.run(["wget", "https://github.com/google/REAPER/archive/refs/heads/master.zip"], capture_output=True, text=True)
    print(fetched.stdout)
    # unpack, normalize the directory name, and discard the archive
    subprocess.run(["unzip", "./master.zip"])
    subprocess.run(["mv", "REAPER-master", "REAPER"])
    subprocess.run(["rm", "./master.zip"])
    # out-of-source cmake build in REAPER/build
    os.chdir('./REAPER')
    subprocess.run(["mkdir", "build"])
    os.chdir('./build')
    configured = subprocess.run(["cmake", ".."], capture_output=True, text=True)
    print(configured.stdout)
    built = subprocess.run(["make"], capture_output=True, text=True)
    print(built.stdout)
    os.chdir('../..')
    listing = subprocess.run(["ls", "-la"], capture_output=True, text=True)
    print('LS::', listing.stdout)
#print('about to setup')
# Build REAPER at import time so the pitch-tracker binary exists before the UI launches.
setup()
# return the whole corpus as a state
# display some of it
# (because gradio pagination is currently broken)
# and reset all filter menus
# return [ds,databrowser,gmenu,amenu,dmenu]
def pick_lang(langname):
    """Load the corpus for the selected language and reset all filter menus.

    Returns a tuple for the outputs [ds, databrowser, gmenu, amenu, dmenu]:
    the full corpus DataFrame (held as gradio State), a 15-row preview with
    non-display columns dropped (gradio pagination is currently broken, so
    only a sample is shown), the reset gender value, an update for the age
    choices, and an update toggling dialect-menu visibility (only Faroese
    has dialect annotation).

    Raises:
        ValueError: if langname is not "Icelandic" or "Faroese".
    """
    if langname == "Icelandic":
        df = datas.ds_i
        ages = ["all", '18-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90']
        diaVis = False
    elif langname == "Faroese":
        df = datas.ds_f
        ages = ["all", '15-35', '36-60', '61+']
        diaVis = True
    else:
        # Previously fell through with df/ages/diaVis unbound -> opaque NameError.
        raise ValueError(f"Unknown language: {langname!r}")
    dfd = df.drop(columns=['audio', 'speaker_id', 'duration'])
    return (df, dfd[:15], "all", gr.update(choices=ages, value="all"), gr.update(visible=diaVis, value="all"))
def apply_filters(df, langname, gender, age, dia):
    """Filter the corpus and return (filtered df, preview of up to 15 rows).

    Restarts from the unfiltered corpus of the selected language so that
    successive "Apply filters" clicks never compound. A menu value of "all"
    disables the corresponding filter: dialect is matched case-insensitively,
    gender by prefix (so "f" matches "female"), and age exactly. The preview
    drops the non-display columns.
    """
    # reload the pristine corpus for the chosen language before filtering
    if langname == "Icelandic":
        df = datas.ds_i
    elif langname == "Faroese":
        df = datas.ds_f
    selected = df
    if dia != "all":
        selected = selected[selected.dialect.str.lower() == dia.lower()]
    if gender != "all":
        selected = selected[selected.gender.str.startswith(gender)]
    if age != "all":
        selected = selected[selected.age == age]
    preview = selected.drop(columns=['audio', 'speaker_id', 'duration'])
    return (selected, preview[:min(15, len(preview))])
def f1(langname, ds):
    """Pick one random utterance from the corpus state and graph its prosody.

    Samples a single row from ds (so it may come from anywhere in the
    filtered corpus, not only the visible preview rows), then returns a tuple
    for the outputs [pl1, audio1, ainfo]: the aligned prosody figure, the
    path to the audio file, and a markdown info line (id, gender, age, plus
    dialect for Faroese).

    Raises:
        ValueError: if langname is not "Icelandic" or "Faroese".
    """
    if langname == "Icelandic":
        lang_aligner = datas.a_i
    elif langname == "Faroese":
        lang_aligner = datas.a_f
    else:
        # Previously left lang_aligner unbound -> opaque NameError at the call below.
        raise ValueError(f"Unknown language: {langname!r}")
    ex = ds.sample()
    sound_path = ex['audio'].iloc[0]['path']
    transcript = ex['normalized_text'].iloc[0]
    rec_info = f"{ex['audio_id'].iloc[0]}, {ex['gender'].iloc[0]}, {ex['age'].iloc[0]}"
    if langname == "Faroese":
        rec_info += f", {ex['dialect'].iloc[0]}"
    return (graph.align_and_graph(sound_path, transcript, lang_aligner), sound_path, rec_info)
# ---------------------------------------------------------------------------
# Gradio UI: instructions, language picker, filter menus, corpus preview
# table, the audio/plot panel, callback wiring, and the ABOUT text.
# NOTE(review): uses gradio 3.x-era API (DataFrame max_rows /
# overflow_row_behaviour, Button.style) — do not upgrade gradio casually.
# ---------------------------------------------------------------------------
bl = gr.Blocks()
with bl:
    gr.Markdown(
    """
    # Demo under construction
    ### 1. Choose a language to load
    ### 2. See a small sample of the selected corpus
    ### 3. Click the button below to view time-aligned prosody information for a random example
    """ )
    with gr.Row():
        langmenu = gr.Dropdown(["Faroese", "Icelandic"], label="Language")#, info="Loading the dataset takes some time")
        gr.Markdown(
        """
        Pitch is shown in dark blue and loudness is the light orange line. The pitch estimation, and the time-alignment of words to audio, are completely automated and there will be some inaccuracy.
        The random example can be from the whole corpus, not necessarily one of the visible rows. More information below.
        """ )
    # full corpus DataFrame held between callbacks (filters and sampling read it)
    ds = gr.State()
    with gr.Row():
        gmenu = gr.Dropdown(["all", "f", "m"], label="Gender", value="all")
        amenu = gr.Dropdown(["all"], label="Age", value="all")
        # dialect menu only shown for Faroese (pick_lang toggles visibility)
        dmenu = gr.Dropdown(["all", "Norðuroyggjar (inklusive of Eiði, Gjógv og Funningur)",
            'Norðurstreymoy/Eysturoy (exclusive of Eiði, Gjógv og Funningur)',
            'Vágar', 'Sandoy', 'Suðuroy', 'Suðurstreymoy'], label="Dialect", value = "all", visible = False)
        btn0 = gr.Button(value="Apply filters")
    with gr.Row():
        databrowser = gr.DataFrame(wrap=True, max_rows=50, interactive=False, overflow_row_behaviour='paginate')
    with gr.Row():
        with gr.Column(scale=1):
            btn1 = gr.Button(value="CLICK HERE")
            btn1.style(size="lg",full_width=True)
        with gr.Column(scale=4):
            audio1 = gr.Audio(interactive=False)
            ainfo = gr.Markdown("""
            Audio file info
            """)
    pl1 = gr.Plot()
    # when user selects a language,
    # reset the dataset
    # display some data from it
    # and reset all filter menus
    langmenu.change(pick_lang,langmenu,[ds,databrowser,gmenu,amenu,dmenu])
    # filter the current data and change the state based on this
    # and display the update
    btn0.click(apply_filters,[ds,langmenu,gmenu,amenu,dmenu],[ds,databrowser])
    # sample one random utterance and render its prosody graph + audio + info
    btn1.click(f1, [langmenu,ds], [pl1,audio1,ainfo])
    gr.Markdown(
    """
    # ABOUT
    The Icelandic corpus is [samromur-asr](https://huggingface.co/datasets/language-and-voice-lab/samromur_asr), and Faroese uses [ravnursson-asr](https://huggingface.co/datasets/carlosdanielhernandezmena/ravnursson_asr).
    ### Forced alignment
    The prosody graphs are marked with time-alignments for the words found by [CTC decoding](https://pytorch.org/audio/main/tutorials/forced_alignment_tutorial.html). This uses wav2vec-2.0 based models ([Faroese](https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h), [Icelandic](https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h)) and tends to be more robust than Montreal Forced Aligner.
    However, this aligner does not contain any phoneme representation, and therefore, segment alignments are for orthographic characters rather than phonemes. Especially in languages with shallow orthography, these letter alignments probably indicate something about the timing of sounds in a word, but the exact durations should not be taken too seriously especially in cases like doubled or silent letters.
    ### Pitch tracking (F0 estimation)
    Estimated pitch is shown in blue on the graphs, as tracked by [REAPER](https://github.com/google/REAPER).
    ### Intensity
    The orange line is root mean squared energy, which reflects loudness and is also a good indication of syllable placement, as it should correspond to vowels and similar sounds.
    This is a work-in-progress basic demo for automatic prosodic annotation in Faroese and Icelandic.
    So far, you cannot select or upload your own choice of sentence for analysis, nor search the corpora. Also, it does not display well when the sentence is too long. In that case, or if there are serious errors in the automated analyses, try another random sentence.
    Contact caitlinr@ru.is / https://github.com/catiR/ when things break, or with ideas/suggestions about how to apply this. Unfortunately I am not a web/interface designer so this is not going to look nice or be user friendly, I only do speech processing.
    The source code is available under the Files tab at the top of the Space.
    """
    )
if __name__ == "__main__":
    bl.launch()