FahadAlam committed
Commit c5abae9 · 1 Parent(s): 7faa719

Update app.py

Files changed (1): app.py (+52 −13)
app.py CHANGED
@@ -1,17 +1,56 @@
- # load the pipeline from Hugging Face Hub
  import gradio as gr
  from pyannote.audio import Pipeline
- pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")
-
- def speakerDiarization(audio):
-     # apply the pipeline to an audio file
-     diarization = pipeline(audio)
-     # dump the diarization output to disk using RTTM format
-     return diarization
-
- gr.Interface(speakerDiarization,
-              inputs=gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Please record your voice"),
-              outputs=gr.outputs.Image(label="Output Text"),
-              title="ASR using Wav2Vec 2.0",
-              description="This application displays transcribed text for given audio input",
-              examples=[["TestAudio1.wav"]], theme="grass").launch()
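Aside: the removed comment above promises an RTTM dump, but `speakerDiarization` only returned the pyannote `Annotation` object, which `gr.outputs.Image` could not render. A minimal sketch of what actually writing the RTTM file would have looked like; the `diarize_to_rttm` helper and its default output path are illustrative, not part of this commit:

```python
from pyannote.audio import Pipeline

pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")

def diarize_to_rttm(audio_path, rttm_path="output.rttm"):
    # run diarization, then serialize the result with pyannote's RTTM writer
    diarization = pipeline(audio_path)
    with open(rttm_path, "w") as f:
        diarization.write_rttm(f)
    return rttm_path
```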
  import gradio as gr
  from pyannote.audio import Pipeline
+ from transformers import pipeline
+
+ # speech-to-text pipeline (Wav2Vec2) that can return word-level timestamps
+ asr = pipeline(
+     "automatic-speech-recognition",
+     model="facebook/wav2vec2-large-960h-lv60-self",
+     feature_extractor="facebook/wav2vec2-large-960h-lv60-self",
+ )
+ # pyannote pipeline that splits the audio into per-speaker turns
+ speaker_segmentation = Pipeline.from_pretrained("pyannote/speaker-segmentation")
+
+ def segmentation(audio):
+     speaker_output = speaker_segmentation(audio)
+     text_output = asr(audio, return_timestamps="word")
+
+     full_text = text_output['text'].lower()
+     chunks = text_output['chunks']
+
+     diarized_output = ""
+     i = 0
+     for turn, _, speaker in speaker_output.itertracks(yield_label=True):
+         diarized = ""
+         # a word belongs to the current turn while its end timestamp
+         # does not run past the end of the turn
+         while i < len(chunks) and chunks[i]['timestamp'][1] <= turn.end:
+             diarized += chunks[i]['text'].lower() + ' '
+             i += 1
+
+         if diarized != "":
+             diarized_output += "{}: ''{}'' from {:.3f}-{:.3f}\n".format(speaker, diarized, turn.start, turn.end)
+
+     return diarized_output, full_text
+
+ title = "Speech Recognition with Speaker Segmentation"
+ description = "Speaker diarization is the task of attributing the parts of an audio recording to the individual speakers. This space distinguishes the speakers with speaker segmentation and transcribes their speech with speech-to-text for a given input audio file. The pre-trained models used are Pyannote [1] for the speaker segmentation and Wav2Vec2 [2] for the automatic speech recognition."
+ article = "<p style='text-align: center'><a href='https://github.com/pyannote/pyannote-audio' target='_blank'>[1] Pyannote - Speaker Segmentation model (GitHub repo)</a></p>"
+ article += "<p style='text-align: center'><a href='https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#wav2vec-20' target='_blank'>[2] Facebook Wav2Vec2 (GitHub repo)</a></p>"
+ article += "<p style='text-align: center'>Audio File Sources: <a href='https://www.youtube.com/watch?v=DYu_bGbZiiQ&t=132s' target='_blank'>1</a> <a href='https://www.youtube.com/watch?v=DDjWTWHHkpk&t=29s' target='_blank'>2</a> <a href='https://www.youtube.com/watch?v=G2xWg2ckKHI&t=24s' target='_blank'>3</a> <a href='https://www.youtube.com/watch?v=sCcv9uqSBU0&t=32s' target='_blank'>4</a> <a href='https://www.youtube.com/watch?v=K1hlp0DCE_8&t=71s' target='_blank'>5</a></p>"
+
+ inputs = gr.inputs.Audio(source="upload", type="filepath", label="Upload your audio file here:")
+ outputs = [gr.outputs.Textbox(type="auto", label="Diarized Output"),
+            gr.outputs.Textbox(type="auto", label="Full ASR Text for comparison")]
+ examples = [["meeting_audio.wav"],
+             ["noisy_london_interview.wav"],
+             ["clean_london_interview.wav"],
+             ["podcast_audio.wav"],
+             ["air_traffic_control_audio.wav"]]
+
+ app = gr.Interface(fn=segmentation,
+                    inputs=inputs,
+                    outputs=outputs,
+                    examples=examples,
+                    title=title,
+                    description=description,
+                    article=article,
+                    allow_flagging=False)
+ app.launch()
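For reference, a minimal self-contained sketch of how the word-to-speaker alignment in `segmentation` behaves. The word chunks and speaker turns below are invented stand-ins for the real `asr(..., return_timestamps="word")["chunks"]` output and the triples yielded by pyannote's `itertracks(yield_label=True)`:

```python
# toy ASR chunks: word text plus (start, end) timestamps in seconds
chunks = [
    {"text": "HELLO", "timestamp": (0.10, 0.45)},
    {"text": "THERE", "timestamp": (0.50, 0.90)},
    {"text": "HI", "timestamp": (1.20, 1.40)},
]
# toy speaker turns: (label, start, end)
turns = [("SPEAKER_00", 0.0, 1.0), ("SPEAKER_01", 1.0, 1.6)]

i, diarized_output = 0, ""
for speaker, start, end in turns:
    diarized = ""
    # consume words until one ends after the current turn does
    while i < len(chunks) and chunks[i]["timestamp"][1] <= end:
        diarized += chunks[i]["text"].lower() + " "
        i += 1
    if diarized != "":
        diarized_output += "{}: ''{}'' from {:.3f}-{:.3f}\n".format(speaker, diarized, start, end)

print(diarized_output)
# SPEAKER_00: ''hello there '' from 0.000-1.000
# SPEAKER_01: ''hi '' from 1.000-1.600
```

Because the index `i` only ever advances, a word whose end timestamp falls after the final turn's end is dropped rather than attributed to any speaker; that is a property of the loop itself, not something this commit guards against.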