import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForSpeechSeq2Seq
import gradio as gr
import librosa
# Determine the device
if torch.cuda.is_available():            # NVIDIA CUDA
    device = torch.device("cuda")
elif torch.backends.mps.is_available():  # Apple Silicon (MPS)
    device = torch.device("mps")
else:                                    # CPU fallback
    device = torch.device("cpu")
# Load the speech-to-text processor and model (Whisper)
stt_processor = AutoProcessor.from_pretrained("openai/whisper-large", language="en")
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")
# Move the model to the device
stt_model.to(device)
def transcribe_audio(audio_path: str) -> str:
    """Transcribe an audio file to English text with Whisper."""
    try:
        # Whisper expects 16 kHz mono audio
        audio, sr = librosa.load(audio_path, sr=16000)
        inputs = stt_processor(audio, sampling_rate=16000, return_tensors="pt")
        input_features = inputs.input_features.to(device)
        with torch.no_grad():
            # Force English transcription at generation time
            predicted_ids = stt_model.generate(input_features, language="en")
        transcript = stt_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        return transcript
    except Exception as e:
        return f"Error during transcription: {str(e)}"
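# A minimal long-form sketch (an assumption, not part of the original app):
# Whisper's feature extractor truncates input to 30 seconds, so
# transcribe_audio() above only covers the start of longer recordings.
# The transformers ASR pipeline can chunk long audio automatically.
def transcribe_long_audio(audio_path: str) -> str:
    """Illustrative alternative for recordings longer than 30 seconds."""
    from transformers import pipeline
    stt_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large",
        chunk_length_s=30,  # split the audio into ~30 s windows
        device=device,
    )
    return stt_pipeline(audio_path)["text"]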
def extract_action_items(transcript: str) -> str:
    """
    Extracts action items from a transcript using the Qwen2-VL-7B-Instruct model.
    See the example code in the model card: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
    """
    model_id = "Qwen/Qwen2-VL-7B-Instruct"
    try:
        # Note: the model is reloaded on every call; see the cached-loader
        # sketch after this function for a cheaper alternative.
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            # attn_implementation="flash_attention_2"
        )
        # Default processor
        processor = AutoProcessor.from_pretrained(model_id)
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Infer the action items from the following meeting transcript
and list them as a bulleted list in the format:\n- [item short title]: [item description]
The [item short title] should be a short phrase that summarizes the action item.
The [item description] should be a longer description of the action item.
TRANSCRIPT:
{transcript}
""",
                    }
                ],
            }
        ]
        # Prepare inputs for inference
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = processor(
            text=[text],
            padding=True,
            return_tensors="pt",
        )
        # With device_map="auto" the model may not sit on the global `device`,
        # so move the inputs to wherever the model actually is
        inputs = inputs.to(model.device)
        # Extract action items
        generated_ids = model.generate(**inputs, max_new_tokens=128)
        # Drop the prompt tokens so only the generated completion is decoded
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        # batch_decode returns a list; the Gradio Textbox expects a string
        return output_text[0]
    except Exception as e:
        return f"Error during action item extraction: {str(e)}"
def transcribe_and_extract_action_items(audio_path):
    transcript = transcribe_audio(audio_path)
    # Skip the LLM call if transcription already failed
    if transcript.startswith("Error during transcription"):
        return transcript, ""
    action_items_text = extract_action_items(transcript)
    return transcript, action_items_text
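# Quick smoke test of the full pipeline ("meeting.wav" is a hypothetical
# local file path, not part of the original app):
# transcript, items = transcribe_and_extract_action_items("meeting.wav")
# print(transcript, items, sep="\n---\n")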
##################################################
# Gradio Interface
##################################################
# Define the Gradio interface components
input_audio = gr.Audio(
    type="filepath",
    label="Upload or Record Audio"
)
output_transcript = gr.Textbox(
    label="Transcript",
    lines=10,
    placeholder="The transcribed text will appear here..."
)
output_action_items = gr.Textbox(
    label="Action Items",
    lines=10,
    placeholder="Extracted action items will appear here..."
)
# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe_and_extract_action_items,
    inputs=input_audio,
    outputs=[output_transcript, output_action_items],
    title="Audio Transcription and Action Item Extraction",
    description=(
        "Upload or record an audio clip. The system will transcribe the audio "
        "and extract actionable items from the transcript."
    ),
    theme="default"
)
# Launch the interface
if __name__ == "__main__":
    interface.launch()
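# On Hugging Face Spaces the app above is served automatically. When running
# locally, interface.launch(share=True) also creates a temporary public URL
# (standard Gradio behaviour).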