from transformers import pipeline, WhisperModel
import gradio as gr
import pandas as pd
import string
# Cache for the most recent transcription: `current_audio` holds the input
# that was last transcribed and `segments` holds the pipeline output for it.
current_audio = None
segments = {}

# Speech-recognition pipeline for the fine-tuned Italian Whisper checkpoint.
# return_timestamps="word" is requested so that per-word timestamps are
# available to the search code that reads the result's "chunks" entries.
pipe = pipeline(
    model="matteocirca/whisper-small-it-2",
    return_timestamps="word",
)
# Translation table that deletes ASCII punctuation; built once at import time
# instead of on every word comparison.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _normalize_word(text):
    """Return *text* lower-cased with ASCII punctuation and spaces removed."""
    return text.translate(_PUNCTUATION_TABLE).replace(" ", "").lower()


def audio2segments(audio, word):
    """Transcribe *audio* and report every timestamp at which *word* occurs.

    The pipeline output for the most recently processed input is cached in
    the module-level ``segments`` / ``current_audio`` pair, so searching the
    same recording for several words only runs the model once.

    Args:
        audio: The audio input handed to ``pipe`` (the interface in this file
            is configured with ``type="filepath"``), or ``None`` when no
            audio was provided.
        word: The word to search for. It is compared against each transcribed
            chunk after both sides are lower-cased and stripped of ASCII
            punctuation and spaces.

    Returns:
        A ``(transcription, report)`` tuple of strings. ``transcription`` is
        ``segments["text"]`` (empty when there is no audio). ``report`` is
        either a short notice (no audio / no word) or a pipe-separated
        listing with one ``index | start | end |`` row per occurrence.
    """
    global segments, current_audio

    if audio is None:
        # Without audio there is nothing to transcribe; previously this path
        # fell through to pipe(None).
        return "", "No Audio inserted!\n"

    # Re-run the model only for a new input (or if the cache was emptied).
    if audio != current_audio or not segments:
        segments = pipe(audio)
        current_audio = audio

    # Normalise the query the same way as the chunk text; otherwise a query
    # such as "Ciao" or "ciao," could never equal the normalised chunk.
    query = _normalize_word(word) if word else ""
    if not query:
        return segments["text"], "\nNo Word inserted!\n"

    matches = [
        chunk["timestamp"]
        for chunk in segments["chunks"]
        if _normalize_word(chunk["text"]) == query
    ]

    rows = [
        f"{index} | {stamp[0]} | {stamp[1]} |\n"
        for index, stamp in enumerate(matches)
    ]
    report = "Occurrence n° | Start | End |\n" + "".join(rows) + "\n"
    return segments["text"], report
def clear():
    """Discard the cached transcription.

    Resets the module-level ``segments`` and ``current_audio`` so that the
    next transcription request runs the model again instead of reusing (or
    indexing into) stale cached output.
    """
    # `global` is required: without it the assignments would only create
    # function-local names and leave the module-level cache untouched.
    global segments, current_audio
    segments = {}
    current_audio = None
# Audio widget accepting either an uploaded file or a microphone recording;
# type="filepath" is the format in which the value reaches the callback.
audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath")

# Two inputs (audio + the word to search for) map to audio2segments' two
# parameters; its two return values feed the text and html outputs in order.
iface = gr.Interface(
    fn=audio2segments,
    inputs=[audio_input, "text"],
    outputs=["text", "html"],
    title="Whisper Small Italian",
    description="Realtime demo for Italian speech recognition using a fine-tuned Whisper small model.",
)

# Start the web server for the demo.
iface.launch()