import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForSpeechSeq2Seq
import gradio as gr
import librosa
# Determine the device
if torch.cuda.is_available():            # NVIDIA CUDA
    device = torch.device("cuda")
elif torch.backends.mps.is_available():  # Apple Silicon (MPS)
    device = torch.device("mps")
else:                                    # CPU fallback
    device = torch.device("cpu")
# Load the speech-to-text processor and model (Whisper)
stt_processor = AutoProcessor.from_pretrained("openai/whisper-large", language="en")
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")
# Move the model to the device
stt_model.to(device)
def transcribe_audio(audio_path: str) -> str:
    """Transcribe an audio file to English text with Whisper."""
    try:
        # Whisper expects 16 kHz mono audio
        audio, sr = librosa.load(audio_path, sr=16000)
        inputs = stt_processor(audio, sampling_rate=16000, return_tensors="pt")
        input_features = inputs.input_features.to(device)
        with torch.no_grad():
            # Force English transcription at generation time
            predicted_ids = stt_model.generate(input_features, language="en")
        transcript = stt_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        return transcript
    except Exception as e:
        return f"Error during transcription: {str(e)}"
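# A minimal long-form sketch (an assumption, not part of the original app):
# Whisper's feature extractor truncates input to 30 seconds, so
# transcribe_audio() above only covers the start of longer recordings.
# The transformers ASR pipeline can chunk long audio automatically.
def transcribe_long_audio(audio_path: str) -> str:
    """Illustrative alternative for recordings longer than 30 seconds."""
    from transformers import pipeline
    stt_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large",
        chunk_length_s=30,  # split the audio into ~30 s windows
        device=device,
    )
    return stt_pipeline(audio_path)["text"]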
def extract_action_items(transcript: str) -> str:
    """
    Extracts action items from a transcript using the Qwen2-VL-7B-Instruct model.
    See the example code in the model card: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
    """
    model_id = "Qwen/Qwen2-VL-7B-Instruct"
    try:
        # Note: the model is reloaded on every call; see the cached-loader
        # sketch after this function for a cheaper alternative.
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            # attn_implementation="flash_attention_2"
        )
        # Default processor
        processor = AutoProcessor.from_pretrained(model_id)
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Infer the action items from the following meeting transcript
and list them as a bulleted list in the format:\n- [item short title]: [item description]
The [item short title] should be a short phrase that summarizes the action item.
The [item description] should be a longer description of the action item.
TRANSCRIPT:
{transcript}
""",
                    }
                ],
            }
        ]
        # Prepare inputs for inference
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = processor(
            text=[text],
            padding=True,
            return_tensors="pt",
        )
        # With device_map="auto" the model may not sit on the global `device`,
        # so move the inputs to wherever the model actually is
        inputs = inputs.to(model.device)
        # Extract action items
        generated_ids = model.generate(**inputs, max_new_tokens=128)
        # Drop the prompt tokens so only the generated completion is decoded
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        # batch_decode returns a list; the Gradio Textbox expects a string
        return output_text[0]
    except Exception as e:
        return f"Error during action item extraction: {str(e)}"
def transcribe_and_extract_action_items(audio_path):
    transcript = transcribe_audio(audio_path)
    # Skip the LLM call if transcription already failed
    if transcript.startswith("Error during transcription"):
        return transcript, ""
    action_items_text = extract_action_items(transcript)
    return transcript, action_items_text
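# Quick smoke test of the full pipeline ("meeting.wav" is a hypothetical
# local file path, not part of the original app):
# transcript, items = transcribe_and_extract_action_items("meeting.wav")
# print(transcript, items, sep="\n---\n")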
##################################################
# Gradio Interface
##################################################
# Define the Gradio interface components
input_audio = gr.Audio(
    type="filepath",
    label="Upload or Record Audio"
)
output_transcript = gr.Textbox(
    label="Transcript",
    lines=10,
    placeholder="The transcribed text will appear here..."
)
output_action_items = gr.Textbox(
    label="Action Items",
    lines=10,
    placeholder="Extracted action items will appear here..."
)
# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe_and_extract_action_items,
    inputs=input_audio,
    outputs=[output_transcript, output_action_items],
    title="Audio Transcription and Action Item Extraction",
    description=(
        "Upload or record an audio clip. The system will transcribe the audio "
        "and extract actionable items from the transcript."
    ),
    theme="default"
)
# Launch the interface
if __name__ == "__main__":
    interface.launch()
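# On Hugging Face Spaces the app above is served automatically. When running
# locally, interface.launch(share=True) also creates a temporary public URL
# (standard Gradio behaviour).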