"""Gradio prototype for practising basic Swedish words by voice.

A random image from the ``assets`` directory is shown, the user records a
spoken answer, and a speech-recognition pipeline transcribes it so that it can
be compared with the image's file name (the file name without its extension is
treated as the expected word).
"""

import logging
import os
import random
from difflib import SequenceMatcher

import gradio as gr
from PIL import Image
from transformers import pipeline

# Directory that holds the quiz images.
ASSETS_DIR = "assets"
# Minimum SequenceMatcher ratio for a transcription to count as a match.
MATCH_THRESHOLD = 0.75
# Punctuation stripped from the transcription before it is compared.
_PUNCTUATION_TABLE = str.maketrans("", "", ",.!?")

# Images that have not been shown yet; refilled by open_image() when empty.
all_images = os.listdir(ASSETS_DIR)
# File name of the image currently on display (set by open_image()).
# NOTE(review): this is process-wide state; if several users share one running
# app they would presumably overwrite each other's image — confirm, and
# consider per-session gr.State.
current_image = None

pipe = pipeline(
    task="automatic-speech-recognition",
    model="DurreSudoku/whisper-small-sv",  # change to "your-username/the-name-you-picked"
)


def test_func():
    """Return ``"Test successful"`` followed by a random integer in [1, 100].

    Debug helper that does not use the speech-recognition pipeline.
    """
    return "Test successful" + str(random.randint(1, 100))


def empty_string():
    """Return an empty string (used to clear the answer box)."""
    return ""


def open_image():
    """Pick a random image that has not been shown yet and return it.

    The chosen file name is removed from ``all_images`` and stored in
    ``current_image``. When every image has been shown, the list is reloaded
    from the assets directory.

    Returns:
        The opened ``PIL.Image.Image``.

    Raises:
        IndexError: If the assets directory contains no files.
    """
    global all_images
    global current_image
    if not all_images:
        all_images = os.listdir(ASSETS_DIR)
    img_name = random.choice(all_images)
    all_images.remove(img_name)
    current_image = img_name
    return Image.open(os.path.join(ASSETS_DIR, img_name))


def _normalize(text):
    """Strip punctuation and surrounding whitespace from *text*, lower-cased."""
    return text.translate(_PUNCTUATION_TABLE).lower().strip()


def _similarity(first, second):
    """Return the SequenceMatcher similarity ratio of two strings."""
    return SequenceMatcher(None, first, second).ratio()


def _grade_answer(heard, correct_answer):
    """Build the feedback message for a normalized transcription."""
    words = heard.split()
    # Either the whole utterance is close enough, or one word matches exactly.
    if _similarity(heard, correct_answer) >= MATCH_THRESHOLD or correct_answer in words:
        return f"Correct! The answer is {correct_answer}."
    # Check for a partial match, in case the model mistakes a letter or two.
    for word in words:
        if _similarity(word, correct_answer) >= MATCH_THRESHOLD:
            return f"The answer is {correct_answer}. I heard {word}."
    # No match was found.
    return f"The correct answer is {correct_answer}. I heard {heard}."


def transcribe(audio_input):
    """Transcribe the recording and compare it with the current image's name.

    Args:
        audio_input: Value handed over by the audio component (configured with
            ``type="filepath"``); passed straight to the pipeline.

    Returns:
        A feedback message for the user. If the pipeline raises, the exception
        is logged and an error message is returned instead.
    """
    try:
        transcribed_audio = pipe(audio_input)["text"]
    except Exception as e:
        # UI boundary: log the failure and tell the user instead of crashing.
        logging.exception(e)
        return "Encountered an error. Are you sure that you recorded audio before submitting?"

    # Drop the extension whatever it is, not just ".png".
    correct_answer = os.path.splitext(current_image)[0]
    return _grade_answer(_normalize(transcribed_audio), correct_answer)


with gr.Blocks(title="Interactive Language Learning") as demo:
    with gr.Row():
        gr.Markdown(
            """
            # Interactive Language Learning Prototype
            Hello! This is a prototype app that is meant to help you learn some basic Swedish words.
            Observe the image, record a one word answer and press the "Submit Answer" button!
            For a new image, press the "New Image" button.
            """)
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(sources="microphone", type="filepath", label="Record your answer here")
        with gr.Column():
            # open_image() runs once here so an image is shown on first load.
            image = gr.Image(value=open_image(), type="pil", interactive=False)
    with gr.Row():
        answer_box = gr.Text(label="Answer appears here", interactive=False)
    with gr.Row():
        with gr.Column():
            process_input = gr.Button("Submit Answer")
            process_input.click(fn=transcribe, inputs=audio, outputs=answer_box)
        with gr.Column():
            refresh = gr.Button("New Image")
            # A new image also clears the previous feedback.
            refresh.click(fn=open_image, inputs=None, outputs=image)
            refresh.click(fn=empty_string, inputs=None, outputs=answer_box)

demo.launch(debug=True)