rishiraj committed on
Commit
a647c50
1 Parent(s): 698d07a

add audio extractor

__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,146 @@
+ import spaces
+ import gradio as gr
+ import base64
+ import librosa
+ from extractors.asrdiarization.asr_extractor import ASRExtractorConfig, ASRExtractor
+ from indexify_extractor_sdk import Content
+
+ MAX_AUDIO_MINUTES = 60  # won't try to transcribe if longer than this
+
+ asr_extractor = ASRExtractor()
+
+ def check_audio(audio_filepath):
+     """
+     Do not convert the audio; just raise an error if it is too long.
+     """
+     data, sr = librosa.load(audio_filepath, sr=None, mono=True)
+     duration = librosa.get_duration(y=data, sr=sr)
+
+     if duration / 60.0 > MAX_AUDIO_MINUTES:
+         raise gr.Error(
+             f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio. "
+             "If you wish, you may trim the audio using the Audio viewer in Step 1 "
+             "(click on the scissors icon to start trimming audio)."
+         )
+
+     return audio_filepath
+
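+ # transcribe() base64-encodes the uploaded file, wraps it in an Indexify Content
+ # object (the input format ASRExtractor.extract expects), runs the extractor, and
+ # decodes the first returned Content back into text for display.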
+ @spaces.GPU
+ def transcribe(audio_filepath, task, batch_size, chunk_length_s, sampling_rate, language, num_speakers, min_speakers, max_speakers, assisted):
+     if audio_filepath is None:
+         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
+
+     audio_filepath = check_audio(audio_filepath)
+
+     with open(audio_filepath, "rb") as f:
+         converted_audio_filepath = base64.b64encode(f.read()).decode("utf-8")
+
+     content = Content(content_type="audio/mpeg", data=converted_audio_filepath)
+     config = ASRExtractorConfig(task=task, batch_size=batch_size, chunk_length_s=chunk_length_s, sampling_rate=sampling_rate, language=language, num_speakers=num_speakers, min_speakers=min_speakers, max_speakers=max_speakers, assisted=assisted)
+
+     result = asr_extractor.extract(content, config)
+     text_content = next(item.data.decode('utf-8') for item in result)
+
+     return text_content
+
+ with gr.Blocks(
+     title="ASR + diarization + speculative decoding with Indexify"
+ ) as audio_demo:
+
+     gr.HTML("<h1 style='text-align: center'>ASR + diarization + speculative decoding with Indexify</h1>")
+     gr.HTML("<p style='text-align: center'>Indexify is a scalable, real-time and continuous indexing and structured extraction engine for unstructured data, for building generative AI applications</p>")
+     gr.HTML("<h3 style='text-align: center'>If you like this demo, please ⭐ Star us on <a href='https://github.com/tensorlakeai/indexify' target='_blank'>GitHub</a>!</h3>")
+
+     with gr.Row():
+         with gr.Column():
+             gr.HTML(
+                 "<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
+
+                 "<p style='color: #A0A0A0;'>Use this demo for audio files up to 60 minutes long. "
+                 "You can transcribe longer files and try various other extractors locally with "
+                 "<a href='https://getindexify.io/'>Indexify</a>.</p>"
+             )
+
+             audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
+
+             gr.HTML("<p><b>Step 2:</b> Choose the parameters or leave the defaults.</p>")
+
+             task = gr.Dropdown(
+                 choices=["transcribe", "translate"],
+                 value="transcribe",
+                 info="passed to the ASR pipeline",
+                 label="Task:"
+             )
+
+         with gr.Column():
+             batch_size = gr.Number(
+                 value=24,
+                 info="for assisted generation the `batch_size` must be set to 1",
+                 label="Batch Size:"
+             )
+             chunk_length_s = gr.Number(
+                 value=30,
+                 info="passed to the ASR pipeline",
+                 label="Chunk Length:"
+             )
+             sampling_rate = gr.Number(
+                 value=16000,
+                 info="`sampling_rate` indicates the sampling rate of the audio to process and is used for preprocessing",
+                 label="Sampling Rate:"
+             )
+             language = gr.Dropdown(
+                 choices=['english', 'chinese', 'german', 'spanish', 'russian', 'korean', 'french', 'japanese', 'portuguese', 'turkish', 'polish', 'catalan', 'dutch', 'arabic', 'swedish', 'italian', 'indonesian', 'hindi', 'finnish', 'vietnamese', 'hebrew', 'ukrainian', 'greek', 'malay', 'czech', 'romanian', 'danish', 'hungarian', 'tamil', 'norwegian', 'thai', 'urdu', 'croatian', 'bulgarian', 'lithuanian', 'latin', 'maori', 'malayalam', 'welsh', 'slovak', 'telugu', 'persian', 'latvian', 'bengali', 'serbian', 'azerbaijani', 'slovenian', 'kannada', 'estonian', 'macedonian', 'breton', 'basque', 'icelandic', 'armenian', 'nepali', 'mongolian', 'bosnian', 'kazakh', 'albanian', 'swahili', 'galician', 'marathi', 'punjabi', 'sinhala', 'khmer', 'shona', 'yoruba', 'somali', 'afrikaans', 'occitan', 'georgian', 'belarusian', 'tajik', 'sindhi', 'gujarati', 'amharic', 'yiddish', 'lao', 'uzbek', 'faroese', 'haitian creole', 'pashto', 'turkmen', 'nynorsk', 'maltese', 'sanskrit', 'luxembourgish', 'myanmar', 'tibetan', 'tagalog', 'malagasy', 'assamese', 'tatar', 'hawaiian', 'lingala', 'hausa', 'bashkir', 'javanese', 'sundanese', 'cantonese', 'burmese', 'valencian', 'flemish', 'haitian', 'letzeburgesch', 'pushto', 'panjabi', 'moldavian', 'moldovan', 'sinhalese', 'castilian', 'mandarin'],
+                 info="passed to the ASR pipeline",
+                 label="Language:"
+             )
+             num_speakers = gr.Number(
+                 info="passed to the diarization pipeline",
+                 label="Number of Speakers:"
+             )
+             min_speakers = gr.Number(
+                 info="passed to the diarization pipeline",
+                 label="Minimum Speakers:"
+             )
+             max_speakers = gr.Number(
+                 info="passed to the diarization pipeline",
+                 label="Maximum Speakers:"
+             )
+             assisted = gr.Checkbox(
+                 value=False,
+                 info="the `assisted` flag tells the pipeline whether to use speculative decoding",
+                 label="Assisted?",
+             )
+
+         with gr.Column():
+
+             gr.HTML("<p><b>Step 3:</b> Run the extractor.</p>")
+
+             go_button = gr.Button(
+                 value="Run extractor",
+                 variant="primary",  # make "primary" so it stands out (default is "secondary")
+             )
+
+             model_output_text_box = gr.Textbox(
+                 label="Extractor Output",
+                 elem_id="model_output_text_box",
+             )
+
+     with gr.Row():
+
+         gr.HTML(
+             "<p style='text-align: center'>"
+             "Developed with 🫶 by <a href='https://getindexify.io/' target='_blank'>Indexify</a> | "
+             "a <a href='https://www.tensorlake.ai/' target='_blank'>Tensorlake</a> product"
+             "</p>"
+         )
+
+     go_button.click(
+         fn=transcribe,
+         inputs=[audio_file, task, batch_size, chunk_length_s, sampling_rate, language, num_speakers, min_speakers, max_speakers, assisted],
+         outputs=[model_output_text_box]
+     )
+
+ demo = gr.TabbedInterface([audio_demo], ["Audio Extraction"], theme=gr.themes.Soft())
+
+ demo.queue()
+ demo.launch()
extractors/__init__.py ADDED
File without changes
extractors/asrdiarization/__init__.py ADDED
File without changes
extractors/asrdiarization/asr_extractor.py ADDED
@@ -0,0 +1,134 @@
+ import logging
+ import torch
+ import base64
+ import os
+
+ from indexify_extractor_sdk import Content, Extractor, Feature
+ from pyannote.audio import Pipeline
+ from transformers import pipeline, AutoModelForCausalLM
+ from .diarization_utils import diarize
+ from huggingface_hub import HfApi
+ from starlette.exceptions import HTTPException
+
+ from pydantic import BaseModel
+ from pydantic_settings import BaseSettings
+ from typing import Optional, Literal, List, Union
+
+ logger = logging.getLogger(__name__)
+ token = os.getenv('HF_TOKEN')
+
+ class ModelSettings(BaseSettings):
+     asr_model: str = "openai/whisper-large-v3"
+     assistant_model: Optional[str] = "distil-whisper/distil-large-v3"
+     diarization_model: Optional[str] = "pyannote/speaker-diarization-3.1"
+     hf_token: Optional[str] = token
+
+ model_settings = ModelSettings()
+
+ class ASRExtractorConfig(BaseModel):
+     task: Literal["transcribe", "translate"] = "transcribe"
+     batch_size: int = 24
+     assisted: bool = False
+     chunk_length_s: int = 30
+     sampling_rate: int = 16000
+     language: Optional[str] = None
+     num_speakers: Optional[int] = None
+     min_speakers: Optional[int] = None
+     max_speakers: Optional[int] = None
+
+ class ASRExtractor(Extractor):
+     name = "tensorlake/asrdiarization"
+     description = "Powerful ASR + diarization + speculative decoding."
+     system_dependencies = ["ffmpeg"]
+     input_mime_types = ["audio", "audio/mpeg"]
+
+     def __init__(self):
+         super(ASRExtractor, self).__init__()
+
+         device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+         logger.info(f"Using device: {device.type}")
+         torch_dtype = torch.float32 if device.type == "cpu" else torch.float16
+
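+         # optional draft model used as `assistant_model` for speculative decoding
+         # (only passed to the ASR pipeline when the `assisted` flag is set)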
+         self.assistant_model = AutoModelForCausalLM.from_pretrained(
+             model_settings.assistant_model,
+             torch_dtype=torch_dtype,
+             low_cpu_mem_usage=True,
+             use_safetensors=True
+         ) if model_settings.assistant_model else None
+
+         if self.assistant_model:
+             self.assistant_model.to(device)
+
+         self.asr_pipeline = pipeline(
+             "automatic-speech-recognition",
+             model=model_settings.asr_model,
+             torch_dtype=torch_dtype,
+             device=device
+         )
+
+         if model_settings.diarization_model:
+             # the diarization pipeline doesn't raise if the token is missing, so validate it up front
+             HfApi().whoami(model_settings.hf_token)
+             self.diarization_pipeline = Pipeline.from_pretrained(
+                 checkpoint_path=model_settings.diarization_model,
+                 use_auth_token=model_settings.hf_token,
+             )
+             self.diarization_pipeline.to(device)
+         else:
+             self.diarization_pipeline = None
+
+     def extract(self, content: Content, params: ASRExtractorConfig) -> List[Union[Feature, Content]]:
+         file = base64.b64decode(content.data)
+         logger.info(f"inference params: {params}")
+
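+         # speculative decoding: pass the draft model only when `assisted` is requested
+         # (in that case the batch size should be 1, as noted in the UI)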
+         generate_kwargs = {
+             "task": params.task,
+             "language": params.language,
+             "assistant_model": self.assistant_model if params.assisted else None
+         }
+
+         try:
+             asr_outputs = self.asr_pipeline(
+                 file,
+                 chunk_length_s=params.chunk_length_s,
+                 batch_size=params.batch_size,
+                 generate_kwargs=generate_kwargs,
+                 return_timestamps=True,
+             )
+         except RuntimeError as e:
+             logger.error(f"ASR inference error: {str(e)}")
+             raise HTTPException(status_code=400, detail=f"ASR inference error: {str(e)}")
+         except Exception as e:
+             logger.error(f"Unknown error during ASR inference: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Unknown error during ASR inference: {str(e)}")
+
+         if self.diarization_pipeline:
+             try:
+                 transcript = diarize(self.diarization_pipeline, file, params, asr_outputs)
+             except RuntimeError as e:
+                 logger.error(f"Diarization inference error: {str(e)}")
+                 raise HTTPException(status_code=400, detail=f"Diarization inference error: {str(e)}")
+             except Exception as e:
+                 logger.error(f"Unknown error during diarization: {str(e)}")
+                 raise HTTPException(status_code=500, detail=f"Unknown error during diarization: {str(e)}")
+         else:
+             transcript = []
+
+         feature = Feature.metadata(value={"chunks": asr_outputs["chunks"], "text": asr_outputs["text"]})
+         return [Content.from_text(str(transcript), features=[feature])]
+
+     def sample_input(self) -> Content:
+         filepath = "sample.mp3"
+         with open(filepath, 'rb') as f:
+             audio_encoded = base64.b64encode(f.read()).decode("utf-8")
+         return Content(content_type="audio/mpeg", data=audio_encoded)
+
+ if __name__ == "__main__":
+     filepath = "sample.mp3"
+     with open(filepath, 'rb') as f:
+         audio_encoded = base64.b64encode(f.read()).decode("utf-8")
+     data = Content(content_type="audio/mpeg", data=audio_encoded)
+     params = ASRExtractorConfig(batch_size=24)
+     extractor = ASRExtractor()
+     results = extractor.extract(data, params=params)
+     print(results)
extractors/asrdiarization/diarization_utils.py ADDED
@@ -0,0 +1,141 @@
+ import torch
+ import numpy as np
+ from torchaudio import functional as F
+ from transformers.pipelines.audio_utils import ffmpeg_read
+ from starlette.exceptions import HTTPException
+ import sys
+
+ # Code from insanely-fast-whisper:
+ # https://github.com/Vaibhavs10/insanely-fast-whisper
+
+ import logging
+ logger = logging.getLogger(__name__)
+
+ def preprocess_inputs(inputs, sampling_rate):
+     inputs = ffmpeg_read(inputs, sampling_rate)
+
+     if sampling_rate != 16000:
+         inputs = F.resample(
+             torch.from_numpy(inputs), sampling_rate, 16000
+         ).numpy()
+
+     if len(inputs.shape) != 1:
+         logger.error(f"Diarization pipeline expects single-channel audio, received {inputs.shape}")
+         raise HTTPException(
+             status_code=400,
+             detail=f"Diarization pipeline expects single-channel audio, received {inputs.shape}"
+         )
+
+     # diarization model expects float32 torch tensor of shape `(channels, seq_len)`
+     diarizer_inputs = torch.from_numpy(inputs).float()
+     diarizer_inputs = diarizer_inputs.unsqueeze(0)
+
+     return inputs, diarizer_inputs
+
+
+ def diarize_audio(diarizer_inputs, diarization_pipeline, parameters):
+     diarization = diarization_pipeline(
+         {"waveform": diarizer_inputs, "sample_rate": parameters.sampling_rate},
+         num_speakers=parameters.num_speakers,
+         min_speakers=parameters.min_speakers,
+         max_speakers=parameters.max_speakers,
+     )
+
+     segments = []
+     for segment, track, label in diarization.itertracks(yield_label=True):
+         segments.append(
+             {
+                 "segment": {"start": segment.start, "end": segment.end},
+                 "track": track,
+                 "label": label,
+             }
+         )
+
+     # diarizer output may contain consecutive segments from the same speaker (e.g. {(0 -> 1, speaker_1), (1 -> 1.5, speaker_1), ...})
+     # we combine these segments to give overall timestamps for each speaker's turn (e.g. {(0 -> 1.5, speaker_1), ...})
+     new_segments = []
+     prev_segment = cur_segment = segments[0]
+
+     for i in range(1, len(segments)):
+         cur_segment = segments[i]
+
+         # check if we have changed speaker ("label")
+         if cur_segment["label"] != prev_segment["label"] and i < len(segments):
+             # add the start/end times for the super-segment to the new list
+             new_segments.append(
+                 {
+                     "segment": {
+                         "start": prev_segment["segment"]["start"],
+                         "end": cur_segment["segment"]["start"],
+                     },
+                     "speaker": prev_segment["label"],
+                 }
+             )
+             prev_segment = segments[i]
+
+     # add the last segment(s) if there was no speaker change
+     new_segments.append(
+         {
+             "segment": {
+                 "start": prev_segment["segment"]["start"],
+                 "end": cur_segment["segment"]["end"],
+             },
+             "speaker": prev_segment["label"],
+         }
+     )
+
+     return new_segments
+
+
+ def post_process_segments_and_transcripts(new_segments, transcript, group_by_speaker) -> list:
+     # get the end timestamps for each chunk from the ASR output
+     end_timestamps = np.array(
+         [chunk["timestamp"][-1] if chunk["timestamp"][-1] is not None else sys.float_info.max for chunk in transcript])
+     segmented_preds = []
+
+     # align the diarizer timestamps and the ASR timestamps
+     for segment in new_segments:
+         # get the diarizer end timestamp
+         end_time = segment["segment"]["end"]
+         # find the ASR end timestamp that is closest to the diarizer's end timestamp and cut the transcript to here
+         upto_idx = np.argmin(np.abs(end_timestamps - end_time))
+
+         if group_by_speaker:
+             segmented_preds.append(
+                 {
+                     "speaker": segment["speaker"],
+                     "text": "".join(
+                         [chunk["text"] for chunk in transcript[: upto_idx + 1]]
+                     ),
+                     "timestamp": (
+                         transcript[0]["timestamp"][0],
+                         transcript[upto_idx]["timestamp"][1],
+                     ),
+                 }
+             )
+         else:
+             for i in range(upto_idx + 1):
+                 segmented_preds.append({"speaker": segment["speaker"], **transcript[i]})
+
+         # crop the transcripts and timestamp lists according to the latest timestamp (for faster argmin)
+         transcript = transcript[upto_idx + 1:]
+         end_timestamps = end_timestamps[upto_idx + 1:]
+
+         if len(end_timestamps) == 0:
+             break
+
+     return segmented_preds
+
+
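+ # Run the diarization pipeline on the (resampled) audio and attach a speaker label
+ # to each ASR chunk by aligning diarizer segment end times with ASR chunk end times.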
+ def diarize(diarization_pipeline, file, parameters, asr_outputs):
+     _, diarizer_inputs = preprocess_inputs(file, parameters.sampling_rate)
+
+     segments = diarize_audio(
+         diarizer_inputs,
+         diarization_pipeline,
+         parameters
+     )
+
+     return post_process_segments_and_transcripts(
+         segments, asr_outputs["chunks"], group_by_speaker=False
+     )
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ indexify-extractor-sdk
+ accelerate==0.27.2
+ pyannote-audio==3.1.1
+ transformers==4.40.2
+ numpy==1.26.4
+ torchaudio==2.2.0
+ pydantic==2.6.3
+ pydantic-settings==2.2.1
+ librosa==0.10.2
+ torch==2.2.0
+ bitsandbytes
+ peft