import gradio as gr
from PIL import Image
import os
import random
from transformers import pipeline
from difflib import SequenceMatcher
import logging

# Pool of file names from the "assets" directory that have not been shown yet;
# open_image() removes names from it and refills it once it is empty.
all_images = os.listdir("assets")
# File name of the image currently on screen. Set by open_image() and read by
# transcribe() to derive the expected answer.
current_image = None
# Speech-recognition pipeline used by transcribe(); created once at import time.
pipe = pipeline(task="automatic-speech-recognition", model="DurreSudoku/whisper-small-sv")  # change to "your-username/the-name-you-picked"

def test_func():
    """Return "Test successful" immediately followed by a random integer.

    The integer is drawn uniformly from 1 to 100 (inclusive), so repeated
    calls usually produce different strings. Useful as a stand-in click
    handler for checking that the UI wiring works.
    """
    return f"Test successful{random.randint(1, 100)}"


def empty_string():
    """Return an empty string (used to blank the answer box)."""
    return str()


def open_image():
    """Pick a random, not-yet-shown image from ``assets`` and return it opened.

    Side effects:
        * Removes the chosen file name from the module-level ``all_images``
          pool, reloading the pool from the ``assets`` directory first if it
          has run empty.
        * Stores the chosen file name in the module-level ``current_image``.

    Returns:
        The object produced by ``Image.open`` for the chosen file.
    """
    global all_images, current_image

    # Every image has been shown once: start a fresh round.
    if not all_images:
        all_images = os.listdir("assets")

    chosen_name = random.choice(all_images)
    all_images.remove(chosen_name)

    # Record the selection before opening the file so the expected answer is
    # already in place when the image is returned.
    current_image = chosen_name

    image_path = os.path.join("assets", chosen_name)
    return Image.open(image_path)


def transcribe(audio_input):
    """Transcribe a recorded answer and grade it against the current image.

    The expected answer is the file name stored in ``current_image`` with its
    extension removed. The transcription has commas, periods, exclamation
    marks and question marks stripped, is lower-cased, and has its whitespace
    collapsed; it is then compared case-insensitively with the expected
    answer.

    Args:
        audio_input: Value handed straight to the speech-recognition
            ``pipe``; the UI supplies it from the audio recorder.

    Returns:
        A feedback message for the user:

        * an error hint if the pipeline raised,
        * ``"Correct! ..."`` if the whole transcription is at least 75%
          similar to the answer or one of its words equals the answer,
        * ``"The answer is ... I heard <word>."`` if a single word is at
          least 75% similar to the answer,
        * ``"The correct answer is ... I heard <transcription>."`` otherwise.
    """
    try:
        transcribed_audio = pipe(audio_input)["text"]
    except Exception as e:
        # UI boundary: log the full traceback and show a friendly hint
        # instead of surfacing the error to the user.
        logging.exception(e)
        return "Encountered an error. Are you sure that you recorded audio before submitting?"

    # Drop the punctuation in a single pass, then normalise case.
    punctuation = str.maketrans("", "", ",.!?")
    normalized = transcribed_audio.translate(punctuation).lower()

    # split() without arguments discards leading/trailing/repeated whitespace,
    # so no empty "words" are produced and the echoed text is clean.
    text_list = normalized.split()
    transcribed_audio = " ".join(text_list)

    # splitext handles any extension, not only ".png".
    correct_answer = os.path.splitext(current_image)[0]
    # The transcription was lower-cased, so the comparison target must be too;
    # the original spelling is kept for display.
    expected = correct_answer.lower()

    whole_ratio = SequenceMatcher(None, transcribed_audio, expected).ratio()
    if whole_ratio >= 0.75 or expected in text_list:
        return f"Correct! The answer is {correct_answer}."

    # Check for partial match, in case the model mistakes a letter or two.
    for text in text_list:
        if SequenceMatcher(None, text, expected).ratio() >= 0.75:
            return f"The answer is {correct_answer}. I heard {text}."

    # If no match is found.
    return f"The correct answer is {correct_answer}. I heard {transcribed_audio}."


# Build the UI: instructions on top, recorder and image side by side, the
# feedback box below them, and the two action buttons at the bottom.
with gr.Blocks(title="Interactive Language Learning") as demo:
    with gr.Row():
        # Introductory text shown at the top of the page.
        gr.Markdown(
    """
    # Interactive Language Learning Prototype
    
    Hello!
    
    This is a prototype app that is meant to help you learn some basic Swedish words. Observe the image, 
    record a one word answer and press the "Submit Answer" button! For a new image, press the "New Image" button.
    """)
    with gr.Row():
        with gr.Column():
            # Microphone recorder; its value is the input to transcribe().
            audio = gr.Audio(sources="microphone", type="filepath", label="Record your answer here")
        with gr.Column():
            # open_image() is called once here, while the layout is built, to
            # supply the first image (and set current_image).
            image = gr.Image(value=open_image(),type="pil", interactive=False)
    with gr.Row():
        # Read-only box that shows the message returned by transcribe().
        answer_box = gr.Text(label="Answer appears here", interactive=False)
    with gr.Row():
        with gr.Column():
            process_input = gr.Button("Submit Answer")
            # Grade the recording and write the feedback into the answer box.
            process_input.click(fn=transcribe, inputs=audio, outputs=answer_box)
            # process_input.click(fn=test_func, inputs=None, outputs=answer_box)
        with gr.Column():
            refresh = gr.Button("New Image")
            # Two handlers on the same button: one swaps in a new image, the
            # other clears the previous feedback.
            refresh.click(fn=open_image, inputs=None, outputs=image)
            refresh.click(fn=empty_string, inputs=None, outputs=answer_box)
# Start the app once the layout and event handlers are defined.
demo.launch(debug=True)