import gradio as gr
import subprocess, os
from datasets import load_dataset, Audio
import datas, ctcalign, graph
from numpy import random


import matplotlib
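# use the non-interactive Agg backend so figures can be rendered without a display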
matplotlib.use('Agg')
import matplotlib.pyplot as plt


def setup():
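    """Download and compile Google's REAPER pitch tracker so it can be called later for F0 estimation."""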
    r0 = subprocess.run(["pwd"], capture_output=True, text=True)
    print('PWD::', r0.stdout)
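    # download the REAPER source from GitHub and unpack it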
    r1 = subprocess.run(["wget", "https://github.com/google/REAPER/archive/refs/heads/master.zip"], capture_output=True, text=True)
    print(r1.stdout)
    subprocess.run(["unzip", "./master.zip"])
    subprocess.run(["mv", "REAPER-master", "REAPER"])
    subprocess.run(["rm", "./master.zip"])
    os.chdir('./REAPER')
    subprocess.run(["mkdir", "build"])
    os.chdir('./build')
    r2 = subprocess.run(["cmake", ".."], capture_output=True, text=True)
    print(r2.stdout)
    r3 = subprocess.run(["make"], capture_output=True, text=True)
    print(r3.stdout)
    
    os.chdir('../..')
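    # back in the app root; list the files to confirm the build is in place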
    r9 = subprocess.run(["ls", "-la"], capture_output=True, text=True)
    print('LS::', r9.stdout)

                        
#print('about to setup')
setup()

# return the whole corpus as a state
# display some of it
# (because gradio pagination is currently broken)
# and reset all filter menus
# return [ds,databrowser,gmenu,amenu,dmenu]
def pick_lang(langname):
    if langname=="Icelandic":
        df = datas.ds_i
        ages = ["all", '18-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90']
        diaVis = False
        
    elif langname =="Faroese":
        df = datas.ds_f
        ages = ["all", '15-35', '36-60', '61+']
        diaVis = True

    dfd = df.drop(columns=['audio', 'speaker_id','duration'])
    return (df, dfd[:15], "all", gr.update(choices=ages,value="all"),gr.update(visible=diaVis,value="all"))



def apply_filters(df,langname,gender,age,dia):
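    """Re-load the full corpus for the chosen language and apply the gender, age, and dialect filters from scratch."""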

    if langname=="Icelandic":
        df = datas.ds_i
    elif langname =="Faroese":
        df = datas.ds_f
        if dia != "all":
            df = df[df.dialect.str.lower() == dia.lower()]
            
    if gender != "all":
        df = df[df.gender.str.startswith(gender)]

    if age != "all":
        df = df[df.age == age]


    dfd = df.drop(columns=['audio', 'speaker_id','duration'])
    return (df,dfd[:min(15,len(dfd))])


def f1(langname,ds):
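    """Sample one random recording, run forced alignment, and return the prosody plot, audio path, and recording info."""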
    if langname=="Icelandic":
        lang_aligner = datas.a_i
    elif langname =="Faroese":
        lang_aligner = datas.a_f

    
    ex = ds.sample()
    sound_path = ex['audio'].iloc[0]['path']
    transcript = ex['normalized_text'].iloc[0]

    rec_info = f"{ex['audio_id'].iloc[0]}, {ex['gender'].iloc[0]}, {ex['age'].iloc[0]}"
    if langname =="Faroese":
        rec_info += f", {ex['dialect'].iloc[0]}"
    return (graph.align_and_graph(sound_path,transcript,lang_aligner),sound_path,rec_info)



bl = gr.Blocks()

with bl:
    gr.Markdown(
    """
    # Demo under construction
    ### 1. Choose a language to load
    ### 2. See a small sample of the selected corpus
    ### 3. Click the button below to view time-aligned prosody information for a random example
    """ )
    with gr.Row():
        langmenu = gr.Dropdown(["Faroese", "Icelandic"], label="Language")#, info="Loading the dataset takes some time")
        gr.Markdown(
        """
        Pitch is shown as a dark blue line and loudness as a light orange line. Both the pitch estimation and the time-alignment of words to audio are fully automated, so there will be some inaccuracy.
        The random example may come from anywhere in the corpus, not necessarily one of the visible rows. More information below.
        """ )

    # holds the currently selected (and filtered) corpus as a pandas DataFrame
    ds = gr.State()

    # filter controls: gender, age, and (for Faroese only) dialect
    with gr.Row():
        gmenu = gr.Dropdown(["all", "f", "m"], label="Gender", value="all")
        amenu = gr.Dropdown(["all"], label="Age", value="all")
        dmenu = gr.Dropdown(["all", "Norðuroyggjar (inklusive of Eiði, Gjógv og Funningur)",
                             'Norðurstreymoy/Eysturoy (exclusive of Eiði, Gjógv og Funningur)',
                             'Vágar', 'Sandoy', 'Suðuroy', 'Suðurstreymoy'], label="Dialect", value = "all", visible = False)
        btn0 = gr.Button(value="Apply filters")


    with gr.Row():
        databrowser = gr.DataFrame(wrap=True, max_rows=50, interactive=False, overflow_row_behaviour='paginate')


    with gr.Row():
        with gr.Column(scale=1):
            btn1 = gr.Button(value="CLICK HERE")
            btn1.style(size="lg",full_width=True)
        with gr.Column(scale=4):
            audio1 = gr.Audio(interactive=False)
            ainfo = gr.Markdown(""" 
            Audio file info
            """)

    # prosody plot: pitch and RMS energy with word alignments
    pl1 = gr.Plot()

    
    # when user selects a language,
    # reset the dataset
    # display some data from it
    # and reset all filter menus
    langmenu.change(pick_lang,langmenu,[ds,databrowser,gmenu,amenu,dmenu])

    # filter the current data and change the state based on this
    # and display the update
    btn0.click(apply_filters,[ds,langmenu,gmenu,amenu,dmenu],[ds,databrowser])
    
    
    # plot prosody for a random example and load its audio and metadata
    btn1.click(f1, [langmenu,ds], [pl1,audio1,ainfo])



    gr.Markdown(
        """
    # ABOUT

    The Icelandic corpus is [samromur-asr](https://huggingface.co/datasets/language-and-voice-lab/samromur_asr) and the Faroese corpus is [ravnursson-asr](https://huggingface.co/datasets/carlosdanielhernandezmena/ravnursson_asr).

    ### Forced alignment
    The prosody graphs are marked with time-alignments for the words found by [CTC decoding](https://pytorch.org/audio/main/tutorials/forced_alignment_tutorial.html). This uses wav2vec-2.0 based models ([Faroese](https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h), [Icelandic](https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h)) and tends to be more robust than Montreal Forced Aligner.
    However, this aligner has no phoneme representation, so segment alignments are for orthographic characters rather than phonemes. In languages with shallow orthography, these letter alignments still indicate something about the timing of sounds within a word, but the exact durations should not be taken too seriously, especially for doubled or silent letters.

    ### Pitch tracking (F0 estimation)
    Estimated pitch is shown in blue on the graphs, as tracked by [REAPER](https://github.com/google/REAPER).

    ### Intensity
    The orange line is root-mean-square (RMS) energy, which reflects loudness and is also a good indicator of syllable placement, since its peaks should correspond to vowels and similar sounds.

    This is a basic, work-in-progress demo of automatic prosodic annotation for Faroese and Icelandic.
    So far, you cannot select or upload your own sentence for analysis, nor search the corpora. The graph also does not display well when the sentence is very long; in that case, or if there are serious errors in the automated analyses, try another random sentence.
    Contact caitlinr@ru.is / https://github.com/catiR/ when things break, or with ideas and suggestions for how to apply this. Unfortunately I am not a web/interface designer, so this will not look nice or be especially user friendly; I only do speech processing.
    The source code is available under the Files tab at the top of the Space.
    """
    )


if __name__ == "__main__":
    bl.launch()