peb-peb committed on
Commit
f78137c
·
1 Parent(s): ce8df04

add audio to text conversion

Browse files
Files changed (2) hide show
  1. app.py +20 -122
  2. transcribe.py +95 -0
app.py CHANGED
@@ -1,121 +1,26 @@
1
- # import whisper
2
  import gradio as gr
3
- import datetime
4
-
5
- import subprocess
6
- import wave
7
- import contextlib
8
-
9
- # import torch
10
- # import pyannote.audio
11
- # from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
12
- # from pyannote.audio import Audio
13
- # from pyannote.core import Segment
14
- # from sklearn.cluster import AgglomerativeClustering
15
- # import numpy as np
16
-
17
- # model = whisper.load_model("large-v2")
18
- # embedding_model = PretrainedSpeakerEmbedding(
19
- # "speechbrain/spkrec-ecapa-voxceleb",
20
- # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
21
- # )
22
-
23
- # def transcribe(audio, num_speakers):
24
- # path, error = convert_to_wav(audio)
25
- # if error is not None:
26
- # return error
27
-
28
- # duration = get_duration(path)
29
- # if duration > 4 * 60 * 60:
30
- # return "Audio duration too long"
31
-
32
- # result = model.transcribe(path)
33
- # segments = result["segments"]
34
-
35
- # num_speakers = min(max(round(num_speakers), 1), len(segments))
36
- # if len(segments) == 1:
37
- # segments[0]['speaker'] = 'SPEAKER 1'
38
- # else:
39
- # embeddings = make_embeddings(path, segments, duration)
40
- # add_speaker_labels(segments, embeddings, num_speakers)
41
- # output = get_output(segments)
42
- # return output
43
-
44
- # def convert_to_wav(path):
45
- # if path[-3:] != 'wav':
46
- # new_path = '.'.join(path.split('.')[:-1]) + '.wav'
47
- # try:
48
- # subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
49
- # except:
50
- # return path, 'Error: Could not convert file to .wav'
51
- # path = new_path
52
- # return path, None
53
-
54
- # def get_duration(path):
55
- # with contextlib.closing(wave.open(path,'r')) as f:
56
- # frames = f.getnframes()
57
- # rate = f.getframerate()
58
- # return frames / float(rate)
59
-
60
- # def make_embeddings(path, segments, duration):
61
- # embeddings = np.zeros(shape=(len(segments), 192))
62
- # for i, segment in enumerate(segments):
63
- # embeddings[i] = segment_embedding(path, segment, duration)
64
- # return np.nan_to_num(embeddings)
65
-
66
- # audio = Audio()
67
-
68
- # def segment_embedding(path, segment, duration):
69
- # start = segment["start"]
70
- # # Whisper overshoots the end timestamp in the last segment
71
- # end = min(duration, segment["end"])
72
- # clip = Segment(start, end)
73
- # waveform, sample_rate = audio.crop(path, clip)
74
- # return embedding_model(waveform[None])
75
-
76
- # def add_speaker_labels(segments, embeddings, num_speakers):
77
- # clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
78
- # labels = clustering.labels_
79
- # for i in range(len(segments)):
80
- # segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
81
-
82
- # def time(secs):
83
- # return datetime.timedelta(seconds=round(secs))
84
-
85
- # def get_output(segments):
86
- # output = ''
87
- # for (i, segment) in enumerate(segments):
88
- # if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
89
- # if i != 0:
90
- # output += '\n\n'
91
- # output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
92
- # output += segment["text"][1:] + ' '
93
- # return output
94
-
95
- s = ""
96
-
97
- def greet1(name):
98
- global s
99
- s = "modified"
100
- return "Hello " + name + "!"
101
-
102
-
103
- def greet2(name):
104
- return "Hi " + name + "!" + " " + s
105
-
106
-
107
- def greet3(name):
108
- return "Hola " + name + "!"
109
-
110
  with gr.Blocks() as demo:
111
  with gr.Box():
112
  with gr.Row():
113
  with gr.Column():
114
  audio_file = gr.File(label="Upload a Audio file (.wav)", file_count=1)
115
- # name = gr.Textbox(label="Name", placeholder="Name") # TODO: remove
116
  number_of_speakers = gr.Number(label="Number of Speakers", value=2)
117
  with gr.Row():
118
- btn_clear = gr.Button(value="Clear")
119
  btn_submit = gr.Button(value="Submit")
120
  with gr.Column():
121
  title = gr.Textbox(label="Title", placeholder="Title for Conversation")
@@ -123,22 +28,15 @@ with gr.Blocks() as demo:
123
  sentiment_analysis = gr.Textbox(label="Sentiment Analysis", placeholder="Sentiment Analysis for Conversation")
124
  quality = gr.Textbox(label="Quality of Conversation", placeholder="Quality of Conversation")
125
  detailed_summary = gr.Textbox(label="Detailed Summary", placeholder="Detailed Summary for Conversation")
 
126
  gr.Markdown("## Examples")
127
  gr.Examples(
128
  examples=[
129
- [
130
- "Harsh",
131
- 2,
132
- ],
133
- [
134
- "Rahul",
135
- 2,
136
- ],
137
  ],
138
- inputs=[title],
139
- outputs=[short_summary],
140
- fn=greet1,
141
- cache_examples=True,
142
  )
143
  gr.Markdown(
144
  """
 
 
1
  import gradio as gr
2
+ from transcribe import transcribe
3
+
4
def main(audio_file, number_of_speakers):
    """Run the audio-to-text step and return the five UI output values.

    Args:
        audio_file: The uploaded audio input, forwarded unchanged to
            ``transcribe``.
        number_of_speakers: The requested speaker count, forwarded unchanged
            to ``transcribe``.

    Returns:
        A 5-tuple ``(title, short_summary, sentiment_analysis, quality,
        detailed_summary)``. All five are currently hard-coded placeholder
        strings; the transcript is only printed, not yet used to derive them.
    """
    # Audio to Text Converter
    transcript = transcribe(audio_file, number_of_speakers)
    print(transcript)

    # Placeholder outputs, in the order the UI expects them:
    # title, short summary, sentiment analysis, quality, detailed summary.
    return "ss", "dsa", "gyn", "dsdww", "jbjbjbjs"
14
+
15
+ # UI Interface on the Hugging Face Page
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  with gr.Blocks() as demo:
17
  with gr.Box():
18
  with gr.Row():
19
  with gr.Column():
20
  audio_file = gr.File(label="Upload a Audio file (.wav)", file_count=1)
 
21
  number_of_speakers = gr.Number(label="Number of Speakers", value=2)
22
  with gr.Row():
23
+ btn_clear = gr.ClearButton(value="Clear", components=[audio_file, number_of_speakers])
24
  btn_submit = gr.Button(value="Submit")
25
  with gr.Column():
26
  title = gr.Textbox(label="Title", placeholder="Title for Conversation")
 
28
  sentiment_analysis = gr.Textbox(label="Sentiment Analysis", placeholder="Sentiment Analysis for Conversation")
29
  quality = gr.Textbox(label="Quality of Conversation", placeholder="Quality of Conversation")
30
  detailed_summary = gr.Textbox(label="Detailed Summary", placeholder="Detailed Summary for Conversation")
31
+ btn_submit.click(fn=main, inputs=[audio_file, number_of_speakers], outputs=[title, short_summary, sentiment_analysis, quality, detailed_summary])
32
  gr.Markdown("## Examples")
33
  gr.Examples(
34
  examples=[
35
+ ["./examples/sample4.wav", 2],
 
 
 
 
 
 
 
36
  ],
37
+ inputs=[audio_file, number_of_speakers],
38
+ outputs=[title, short_summary, sentiment_analysis, quality, detailed_summary],
39
+ fn=main,
 
40
  )
41
  gr.Markdown(
42
  """
transcribe.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import whisper
2
+ import datetime
3
+ import subprocess
4
+ import wave
5
+ import contextlib
6
+
7
+
8
+ import torch
9
+ import pyannote.audio
10
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
11
+ from pyannote.audio import Audio
12
+ from pyannote.core import Segment
13
+ from sklearn.cluster import AgglomerativeClustering
14
+ import numpy as np
15
+
16
+ model = whisper.load_model("large-v2")
17
+ embedding_model = PretrainedSpeakerEmbedding(
18
+ "speechbrain/spkrec-ecapa-voxceleb",
19
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
20
+ )
21
+
22
def transcribe(audio, num_speakers):
    """Transcribe an audio file and label each segment with a speaker.

    Args:
        audio: Path to the audio file, or an object that exposes the path
            through a ``name`` attribute (e.g. an uploaded temp-file wrapper).
        num_speakers: Requested number of speakers. It is rounded and clamped
            to the range ``[1, number of transcribed segments]``.

    Returns:
        The formatted transcript produced by ``get_output``, an empty string
        when no segments were transcribed, or an error message string when
        the file could not be converted or is longer than four hours.
    """
    # NOTE(review): Gradio 3.x ``gr.File`` inputs appear to arrive as a
    # temp-file wrapper rather than a path string -- confirm against the
    # installed Gradio version. Plain strings pass through unchanged.
    audio = getattr(audio, "name", audio)

    path, error = convert_to_wav(audio)
    if error is not None:
        return error

    duration = get_duration(path)
    if duration > 4 * 60 * 60:
        return "Audio duration too long"

    result = model.transcribe(path)
    segments = result["segments"]
    if not segments:
        # Nothing was transcribed. Without this guard the clamp below would
        # yield 0 speakers and clustering would be asked to fit zero samples.
        return ''

    num_speakers = min(max(round(num_speakers), 1), len(segments))
    if len(segments) == 1:
        # A single segment needs no clustering.
        segments[0]['speaker'] = 'SPEAKER 1'
    else:
        embeddings = make_embeddings(path, segments, duration)
        add_speaker_labels(segments, embeddings, num_speakers)
    return get_output(segments)
42
+
43
def convert_to_wav(path):
    """Make sure the audio at *path* is available as a ``.wav`` file.

    Files whose extension is already ``.wav`` (case-insensitive) are returned
    untouched. Anything else is converted with ``ffmpeg`` into a sibling file
    that has the same name and a ``.wav`` extension.

    Args:
        path: Path of the input audio file.

    Returns:
        ``(wav_path, None)`` on success, or ``(path, error_message)`` when
        ffmpeg could not be started or exited with a non-zero status.
    """
    # Function-scope import keeps this block self-contained.
    import os

    # splitext only looks at the final path component, so dots in directory
    # names and files without any extension are handled correctly.
    root, extension = os.path.splitext(path)
    if extension.lower() == '.wav':
        return path, None

    new_path = root + '.wav'
    try:
        # ffmpeg applies options to the file that follows them, so ``-y``
        # (overwrite without asking) is placed before the output path.
        # check=True turns a failed conversion into an exception instead of
        # a silently ignored exit status.
        subprocess.run(['ffmpeg', '-y', '-i', path, new_path], check=True)
    except (OSError, subprocess.CalledProcessError):
        return path, 'Error: Could not convert file to .wav'
    return new_path, None
52
+
53
def get_duration(path):
    """Return the duration, in seconds, of the WAV file at *path*.

    The duration is the frame count divided by the frame rate, both read
    from the file header via the ``wave`` module.
    """
    wav_file = wave.open(path, 'r')
    with contextlib.closing(wav_file):
        frame_count = wav_file.getnframes()
        frame_rate = wav_file.getframerate()
    return frame_count / float(frame_rate)
58
+
59
def make_embeddings(path, segments, duration):
    """Build one embedding row per transcript segment.

    Args:
        path: Path of the audio file the segments were transcribed from.
        segments: Sequence of segment records; each is passed to
            ``segment_embedding``.
        duration: Total audio duration, forwarded to ``segment_embedding``.

    Returns:
        An array of shape ``(len(segments), 192)`` with NaNs replaced via
        ``np.nan_to_num``.
    """
    matrix = np.zeros(shape=(len(segments), 192))
    for row_index, entry in enumerate(segments):
        matrix[row_index] = segment_embedding(path, entry, duration)
    cleaned = np.nan_to_num(matrix)
    return cleaned
64
+
65
+ audio = Audio()
66
+
67
def segment_embedding(path, segment, duration):
    """Compute the speaker embedding for one transcript segment.

    Args:
        path: Path of the audio file to crop from.
        segment: Mapping with ``"start"`` and ``"end"`` entries.
        duration: Total audio duration, used as an upper bound for the end.

    Returns:
        Whatever ``embedding_model`` returns for the cropped waveform.
    """
    # Whisper overshoots the end timestamp in the last segment
    clipped_end = min(duration, segment["end"])
    window = Segment(segment["start"], clipped_end)
    # crop() also returns a second value that is not needed here.
    cropped, _ = audio.crop(path, window)
    # Index with None to add a leading axis before calling the model.
    return embedding_model(cropped[None])
74
+
75
def add_speaker_labels(segments, embeddings, num_speakers):
    """Cluster the embeddings and write a speaker label into each segment.

    Each segment gets a ``"speaker"`` entry of the form ``'SPEAKER <n>'``,
    where ``<n>`` is the segment's cluster label plus one. The segments are
    modified in place; nothing is returned.
    """
    fitted = AgglomerativeClustering(num_speakers).fit(embeddings)
    cluster_ids = fitted.labels_
    for index, entry in enumerate(segments):
        # Add one so the displayed speaker number is one higher than the label.
        entry["speaker"] = 'SPEAKER ' + str(cluster_ids[index] + 1)
81
+
82
def time(secs):
    """Return *secs* as a ``timedelta`` rounded to the nearest whole second."""
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)
85
+
86
def get_output(segments):
    """Format labelled segments into a speaker-by-speaker transcript string.

    A header line ``'<speaker> <start time>'`` is emitted for the first
    segment and whenever the speaker differs from the previous segment's;
    later headers are preceded by a blank line. Each segment's text is
    appended with its first character dropped and a trailing space added.
    """
    pieces = []
    for position, entry in enumerate(segments):
        speaker = entry["speaker"]
        if position == 0:
            pieces.append(speaker + ' ' + str(time(entry["start"])) + '\n')
        elif segments[position - 1]["speaker"] != speaker:
            # Speaker change: separate from the previous turn with a blank line.
            pieces.append('\n\n')
            pieces.append(speaker + ' ' + str(time(entry["start"])) + '\n')
        # The first character of the text is skipped, as in the original
        # (presumably a leading space from the transcriber -- TODO confirm).
        pieces.append(entry["text"][1:] + ' ')
    return ''.join(pieces)