add audio to text conversion
- app.py         +20 -122
- transcribe.py  +95 -0
app.py CHANGED
@@ -1,121 +1,26 @@
-# import whisper
 import gradio as gr
-import
-
-
-
-
-
-
-
-
-
-
-
-
-
-# model = whisper.load_model("large-v2")
-# embedding_model = PretrainedSpeakerEmbedding(
-#     "speechbrain/spkrec-ecapa-voxceleb",
-#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-# )
-
-# def transcribe(audio, num_speakers):
-#     path, error = convert_to_wav(audio)
-#     if error is not None:
-#         return error
-
-#     duration = get_duration(path)
-#     if duration > 4 * 60 * 60:
-#         return "Audio duration too long"
-
-#     result = model.transcribe(path)
-#     segments = result["segments"]
-
-#     num_speakers = min(max(round(num_speakers), 1), len(segments))
-#     if len(segments) == 1:
-#         segments[0]['speaker'] = 'SPEAKER 1'
-#     else:
-#         embeddings = make_embeddings(path, segments, duration)
-#         add_speaker_labels(segments, embeddings, num_speakers)
-#     output = get_output(segments)
-#     return output
-
-# def convert_to_wav(path):
-#     if path[-3:] != 'wav':
-#         new_path = '.'.join(path.split('.')[:-1]) + '.wav'
-#         try:
-#             subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
-#         except:
-#             return path, 'Error: Could not convert file to .wav'
-#         path = new_path
-#     return path, None
-
-# def get_duration(path):
-#     with contextlib.closing(wave.open(path,'r')) as f:
-#         frames = f.getnframes()
-#         rate = f.getframerate()
-#         return frames / float(rate)
-
-# def make_embeddings(path, segments, duration):
-#     embeddings = np.zeros(shape=(len(segments), 192))
-#     for i, segment in enumerate(segments):
-#         embeddings[i] = segment_embedding(path, segment, duration)
-#     return np.nan_to_num(embeddings)
-
-# audio = Audio()
-
-# def segment_embedding(path, segment, duration):
-#     start = segment["start"]
-#     # Whisper overshoots the end timestamp in the last segment
-#     end = min(duration, segment["end"])
-#     clip = Segment(start, end)
-#     waveform, sample_rate = audio.crop(path, clip)
-#     return embedding_model(waveform[None])
-
-# def add_speaker_labels(segments, embeddings, num_speakers):
-#     clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
-#     labels = clustering.labels_
-#     for i in range(len(segments)):
-#         segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
-
-# def time(secs):
-#     return datetime.timedelta(seconds=round(secs))
-
-# def get_output(segments):
-#     output = ''
-#     for (i, segment) in enumerate(segments):
-#         if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
-#             if i != 0:
-#                 output += '\n\n'
-#             output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
-#         output += segment["text"][1:] + ' '
-#     return output
-
-s = ""
-
-def greet1(name):
-    global s
-    s = "modified"
-    return "Hello " + name + "!"
-
-
-def greet2(name):
-    return "Hi " + name + "!" + " " + s
-
-
-def greet3(name):
-    return "Hola " + name + "!"
-
+from transcribe import transcribe
+
+def main(audio_file, number_of_speakers):
+    # Audio to Text Converter
+    text_data = transcribe(audio_file, number_of_speakers)
+    print(text_data)
+    title = "ss"
+    short_summary = "dsa"
+    sentiment_analysis = "gyn"
+    quality = "dsdww"
+    detailed_summary = "jbjbjbjs"
+    return title, short_summary, sentiment_analysis, quality, detailed_summary
+
+# UI Interface on the Hugging Face Page
 with gr.Blocks() as demo:
     with gr.Box():
         with gr.Row():
             with gr.Column():
                 audio_file = gr.File(label="Upload a Audio file (.wav)", file_count=1)
-                # name = gr.Textbox(label="Name", placeholder="Name") # TODO: remove
                 number_of_speakers = gr.Number(label="Number of Speakers", value=2)
                 with gr.Row():
-                    btn_clear = gr.
+                    btn_clear = gr.ClearButton(value="Clear", components=[audio_file, number_of_speakers])
                     btn_submit = gr.Button(value="Submit")
             with gr.Column():
                 title = gr.Textbox(label="Title", placeholder="Title for Conversation")
@@ -123,22 +28,15 @@ with gr.Blocks() as demo:
                 sentiment_analysis = gr.Textbox(label="Sentiment Analysis", placeholder="Sentiment Analysis for Conversation")
                 quality = gr.Textbox(label="Quality of Conversation", placeholder="Quality of Conversation")
                 detailed_summary = gr.Textbox(label="Detailed Summary", placeholder="Detailed Summary for Conversation")
+    btn_submit.click(fn=main, inputs=[audio_file, number_of_speakers], outputs=[title, short_summary, sentiment_analysis, quality, detailed_summary])
     gr.Markdown("## Examples")
     gr.Examples(
         examples=[
-            [
-                "Harsh",
-                2,
-            ],
-            [
-                "Rahul",
-                2,
-            ],
+            ["./examples/sample4.wav", 2],
         ],
-        inputs=[
-        outputs=[short_summary],
-        fn=
-        cache_examples=True,
+        inputs=[audio_file, number_of_speakers],
+        outputs=[title, short_summary, sentiment_analysis, quality, detailed_summary],
+        fn=main,
     )
     gr.Markdown(
         """
transcribe.py ADDED
@@ -0,0 +1,95 @@
+import whisper
+import datetime
+import subprocess
+import wave
+import contextlib
+
+
+import torch
+import pyannote.audio
+from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+from pyannote.audio import Audio
+from pyannote.core import Segment
+from sklearn.cluster import AgglomerativeClustering
+import numpy as np
+
+model = whisper.load_model("large-v2")
+embedding_model = PretrainedSpeakerEmbedding(
+    "speechbrain/spkrec-ecapa-voxceleb",
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+)
+
+def transcribe(audio, num_speakers):
+    path, error = convert_to_wav(audio)
+    if error is not None:
+        return error
+
+    duration = get_duration(path)
+    if duration > 4 * 60 * 60:
+        return "Audio duration too long"
+
+    result = model.transcribe(path)
+    segments = result["segments"]
+
+    num_speakers = min(max(round(num_speakers), 1), len(segments))
+    if len(segments) == 1:
+        segments[0]['speaker'] = 'SPEAKER 1'
+    else:
+        embeddings = make_embeddings(path, segments, duration)
+        add_speaker_labels(segments, embeddings, num_speakers)
+    output = get_output(segments)
+    return output
+
+def convert_to_wav(path):
+    if path[-3:] != 'wav':
+        new_path = '.'.join(path.split('.')[:-1]) + '.wav'
+        try:
+            subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
+        except:
+            return path, 'Error: Could not convert file to .wav'
+        path = new_path
+    return path, None
+
+def get_duration(path):
+    with contextlib.closing(wave.open(path,'r')) as f:
+        frames = f.getnframes()
+        rate = f.getframerate()
+        return frames / float(rate)
+
+def make_embeddings(path, segments, duration):
+    embeddings = np.zeros(shape=(len(segments), 192))
+    for i, segment in enumerate(segments):
+        embeddings[i] = segment_embedding(path, segment, duration)
+    return np.nan_to_num(embeddings)
+
+audio = Audio()
+
+def segment_embedding(path, segment, duration):
+    start = segment["start"]
+    # Whisper overshoots the end timestamp in the last segment
+    end = min(duration, segment["end"])
+    clip = Segment(start, end)
+    waveform, sample_rate = audio.crop(path, clip)
+    return embedding_model(waveform[None])
+
+def add_speaker_labels(segments, embeddings, num_speakers):
+    """Add speaker labels"""
+    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+    labels = clustering.labels_
+    for i in range(len(segments)):
+        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+def time(secs):
+    """Function to return time delta"""
+    return datetime.timedelta(seconds=round(secs))
+
+def get_output(segments):
+    """Format and generate the output string"""
+    output = ''
+    for (i, segment) in enumerate(segments):
+        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+            if i != 0:
+                output += '\n\n'
+            output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n'
+        output += segment["text"][1:] + ' '
+    return output
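The new transcribe.py pulls in whisper, pyannote.audio, the speechbrain embedding checkpoint, scikit-learn, and ffmpeg, and exposes a single transcribe(audio, num_speakers) entry point. A minimal sketch of calling it outside the Gradio UI, reusing the sample path wired into gr.Examples; the __main__ guard is illustrative and not part of the commit:

# Hypothetical standalone run of the new module -- importing it loads
# whisper "large-v2" and the speechbrain embedding model up front.
from transcribe import transcribe

if __name__ == "__main__":
    text = transcribe("./examples/sample4.wav", 2)  # path taken from the gr.Examples entry
    print(text)  # speaker-labelled transcript, e.g. "SPEAKER 1 0:00:00 ..."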