# NOTE: page-capture residue ("Spaces: Sleeping"); kept only as a comment, not part of the program.
import os
import shutil
import uuid
from pathlib import Path

import gradio as gr
import Levenshtein
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Build the Wav2Vec2 processor/model pair; intended to be called once at startup.
def load_model():
    """
    Create the Wav2Vec2 processor and CTC model from one pretrained checkpoint.

    Returns:
        tuple: ``(processor, model)``, both obtained via ``from_pretrained``
        using the same checkpoint identifier.
    """
    checkpoint = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
    # Tuple elements are evaluated left to right: processor first, then model.
    return (
        Wav2Vec2Processor.from_pretrained(checkpoint),
        Wav2Vec2ForCTC.from_pretrained(checkpoint),
    )
# Shared module-level instances: created at import time and reused by every
# transcription request instead of being reloaded per call.
processor, model = load_model()
def save_audio(audio_data, folder="recorded_audios"):
    """
    Move a recorded audio file into ``folder`` under a unique ``.wav`` name.

    Args:
        audio_data (str): Path of the existing audio file to move.
        folder (str): Destination directory; created (with parents) if missing.

    Returns:
        str: Path of the file inside ``folder``.

    Raises:
        OSError: If the source file does not exist or cannot be moved.
    """
    # Ensure the destination folder exists before moving anything into it.
    Path(folder).mkdir(parents=True, exist_ok=True)

    # A random UUID keeps repeated or concurrent recordings from colliding.
    file_path = os.path.join(folder, f"{uuid.uuid4()}.wav")

    # shutil.move falls back to copy-and-delete when the source and the
    # destination live on different filesystems (e.g. a temp directory on a
    # separate mount); a plain os.rename raises OSError in that situation.
    shutil.move(audio_data, file_path)
    return file_path
def transcribe_audio(audio_file_path):
    """
    Produce a text transcription of an audio file with the shared Wav2Vec2 model.

    Args:
        audio_file_path (str): Path to the audio file.

    Returns:
        str: The decoded transcription, stripped of surrounding whitespace.
    """
    # Load the audio at a 16 kHz sampling rate.
    waveform, rate = librosa.load(audio_file_path, sr=16000)

    features = processor(
        waveform,
        sampling_rate=rate,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        scores = model(features.input_values).logits

    # Take the highest-scoring token id along the last axis, then decode.
    token_ids = scores.argmax(dim=-1)
    decoded = processor.batch_decode(token_ids)
    return decoded[0].strip()
def levenshtein_similarity(transcription1, transcription2):
    """
    Calculate the normalized Levenshtein similarity between two transcriptions.

    Args:
        transcription1 (str): The first transcription.
        transcription2 (str): The second transcription.

    Returns:
        float: ``1 - distance / max_len``, where ``max_len`` is the length of
        the longer transcription. 1 indicates identical transcriptions. Two
        empty transcriptions are identical and yield 1.0.
    """
    max_len = max(len(transcription1), len(transcription2))
    # Both strings empty: they are identical, and dividing by max_len would
    # raise ZeroDivisionError (e.g. when both recordings decode to nothing).
    if max_len == 0:
        return 1.0

    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len  # Normalize to get similarity score
def evaluate_audio_similarity(original_audio_path, user_audio_path):
    """
    Transcribe two audio files and score how similar the transcriptions are.

    Args:
        original_audio_path (str): Path to the original audio file.
        user_audio_path (str): Path to the user's audio file.

    Returns:
        tuple: ``(original transcription, user transcription, similarity)``
        where the similarity comes from ``levenshtein_similarity``.
    """
    # Transcribe in a fixed order: the original first, then the user's audio.
    reference_text, attempt_text = (
        transcribe_audio(path)
        for path in (original_audio_path, user_audio_path)
    )
    score = levenshtein_similarity(reference_text, attempt_text)
    return reference_text, attempt_text, score
def perform_testing(original_audio, user_audio):
    """
    Compare an original recording with a user's recording and build a report.

    Args:
        original_audio (str | None): Path to the original audio, or None.
        user_audio (str | None): Path to the user's audio, or None.

    Returns:
        dict: Either a single ``"Error"`` entry when an input is missing, or
        both transcriptions, the similarity score and a feedback message.
    """
    # Debugging: log what was received for each input, original first.
    for label, received in (("Original", original_audio), ("User", user_audio)):
        if received is None:
            print(f"{label} audio is None")
        else:
            print(f"{label} audio path: {received}")

    # Guard clause: both recordings are required before doing any work.
    if any(received is None for received in (original_audio, user_audio)):
        return {"Error": "Please provide both original and user audio."}

    # Persist both recordings (original first), then compare them.
    stored_original = save_audio(original_audio)
    stored_user = save_audio(user_audio)
    reference_text, attempt_text, score = evaluate_audio_similarity(
        stored_original, stored_user
    )

    report = {
        "Original Transcription": reference_text,
        "User Transcription": attempt_text,
        "Levenshtein Similarity Score": score,
    }
    # Scores strictly above 0.8 are treated as a likely-correct pronunciation.
    report["Feedback"] = (
        "The pronunciation is likely correct based on transcription similarity."
        if score > 0.8
        else "The pronunciation may be incorrect based on transcription similarity."
    )
    return report
# Assemble the Gradio interface used to record and compare two audio clips.
def gradio_app():
    """
    Build the Blocks UI: two audio inputs, a JSON output and a trigger button.

    Returns:
        gr.Blocks: The assembled (not yet launched) app.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Audio Transcription and Similarity Checker")

        # Both inputs hand file paths (not raw samples) to the callback.
        reference_input = gr.Audio(label="Record Original Audio", type="filepath")
        attempt_input = gr.Audio(label="Record User Audio", type="filepath")
        report_view = gr.JSON(label="Output")

        # Clicking the button runs the comparison and shows its result dict.
        gr.Button("Perform Testing").click(
            perform_testing,
            inputs=[reference_input, attempt_input],
            outputs=report_view,
        )
    return demo
# Build the interface once at module level (exposed as `demo`) and start serving it.
demo = gradio_app()
demo.launch()