import torch
from transformers import (
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
)
import gradio as gr
import librosa

# Determine the device
if torch.cuda.is_available():
    device = torch.device("cuda")   # NVIDIA CUDA GPU
elif torch.backends.mps.is_available():
    device = torch.device("mps")    # Apple Silicon (MPS)
else:
    device = torch.device("cpu")    # fallback to CPU

# Load the speech-to-text processor and model
stt_processor = AutoProcessor.from_pretrained("openai/whisper-large", language="en")
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")

# Move the model to the device
stt_model.to(device)


def transcribe_audio(audio_path: str) -> str:
    """Transcribe an audio file to English text with Whisper."""
    try:
        # Whisper expects 16 kHz mono audio
        audio, _ = librosa.load(audio_path, sr=16000)
        inputs = stt_processor(audio, sampling_rate=16000, return_tensors="pt")
        input_features = inputs.input_features.to(device)
        with torch.no_grad():
            predicted_ids = stt_model.generate(input_features, language="en")
        transcript = stt_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        return transcript
    except Exception as e:
        return f"Error during transcription: {str(e)}"


def extract_action_items(transcript: str) -> str:
    """
    Extracts action items from a transcript using the Qwen2-VL-7B-Instruct model.
    See the example code in the model card:
    https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
    """
    model_id = "Qwen/Qwen2-VL-7B-Instruct"
    try:
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            # attn_implementation="flash_attention_2"
        )

        # Default processor
        processor = AutoProcessor.from_pretrained(model_id)

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Infer the action items from the following meeting transcript and list them as a bulleted list in the format:
- [item short title]: [item description]
The [item short title] should be a short phrase that summarizes the action item.
The [item description] should be a longer description of the action item.

TRANSCRIPT:
{transcript}
""",
                    }
                ],
            }
        ]

        # Preparation for inference
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = processor(
            text=[text],
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)

        # Extract action items
        generated_ids = model.generate(**inputs, max_new_tokens=128)
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return output_text[0]
    except Exception as e:
        return f"Error during action item extraction: {str(e)}"


def transcribe_and_extract_action_items(audio_path):
    transcript = transcribe_audio(audio_path)
    action_items_text = extract_action_items(transcript)
    return transcript, action_items_text


##################################################
# Gradio Interface
##################################################

# Define the Gradio interface components
input_audio = gr.Audio(
    type="filepath",
    label="Upload or Record Audio",
)
output_transcript = gr.Textbox(
    label="Transcript",
    lines=10,
    placeholder="The transcribed text will appear here...",
)
output_action_items = gr.Textbox(
    label="Action Items",
    lines=10,
    placeholder="Extracted action items will appear here...",
)
# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe_and_extract_action_items,
    inputs=input_audio,
    outputs=[output_transcript, output_action_items],
    title="Audio Transcription and Action Item Extraction",
    description=(
        "Upload or record an audio clip. The system will transcribe the audio "
        "and extract actionable items from the transcript."
    ),
    theme="default",
)

# Launch the interface
if __name__ == "__main__":
    interface.launch()