import torch
from transformers import (
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
)
import gradio as gr
import librosa

# Determine the device
if torch.cuda.is_available():
    device = torch.device("cuda")   # NVIDIA CUDA GPU
elif torch.backends.mps.is_available():
    device = torch.device("mps")    # Apple Silicon (MPS)
else:
    device = torch.device("cpu")    # fallback to CPU

# Load the speech-to-text processor and model
stt_processor = AutoProcessor.from_pretrained("openai/whisper-large", language="en")
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")

# Move the model to the device
stt_model.to(device)


def transcribe_audio(audio_path: str) -> str:
    """Transcribe an audio file to English text with Whisper."""
    try:
        # Whisper expects 16 kHz mono audio
        audio, _ = librosa.load(audio_path, sr=16000)
        inputs = stt_processor(audio, sampling_rate=16000, return_tensors="pt")
        input_features = inputs.input_features.to(device)
        with torch.no_grad():
            predicted_ids = stt_model.generate(input_features, language="en")
        transcript = stt_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        return transcript
    except Exception as e:
        return f"Error during transcription: {str(e)}"


def extract_action_items(transcript: str) -> str:
    """
    Extracts action items from a transcript using the Qwen2-VL-7B-Instruct model.
    See the example code in the model card:
    https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
    """
    model_id = "Qwen/Qwen2-VL-7B-Instruct"
    try:
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            # attn_implementation="flash_attention_2"
        )

        # Default processor
        processor = AutoProcessor.from_pretrained(model_id)

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Infer the action items from the following meeting transcript and list them as a bulleted list in the format:
- [item short title]: [item description]
The [item short title] should be a short phrase that summarizes the action item.
The [item description] should be a longer description of the action item.

TRANSCRIPT:
{transcript}
""",
                    }
                ],
            }
        ]

        # Preparation for inference
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = processor(
            text=[text],
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)

        # Extract action items
        generated_ids = model.generate(**inputs, max_new_tokens=128)
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return output_text[0]
    except Exception as e:
        return f"Error during action item extraction: {str(e)}"


def transcribe_and_extract_action_items(audio_path):
    transcript = transcribe_audio(audio_path)
    action_items_text = extract_action_items(transcript)
    return transcript, action_items_text


##################################################
# Gradio Interface
##################################################

# Define the Gradio interface components
input_audio = gr.Audio(
    type="filepath",
    label="Upload or Record Audio",
)
output_transcript = gr.Textbox(
    label="Transcript",
    lines=10,
    placeholder="The transcribed text will appear here...",
)
output_action_items = gr.Textbox(
    label="Action Items",
    lines=10,
    placeholder="Extracted action items will appear here...",
)
# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe_and_extract_action_items,
    inputs=input_audio,
    outputs=[output_transcript, output_action_items],
    title="Audio Transcription and Action Item Extraction",
    description=(
        "Upload or record an audio clip. The system will transcribe the audio "
        "and extract actionable items from the transcript."
    ),
    theme="default",
)

# Launch the interface
if __name__ == "__main__":
    interface.launch()