Hammad712 committed
Commit f19273b
Parent: f58b608

Update app.py

Files changed (1)
  1. app.py +109 -101
app.py CHANGED
@@ -1,35 +1,70 @@
 import gradio as gr
-import requests
+import torch
+import librosa
+import os
+import uuid
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import Levenshtein
-import numpy as np
-from transformers import pipeline
+from pathlib import Path
 
-# Function to securely load the Hugging Face API token
-def load_hf_token():
-    # Replace this with your actual Hugging Face API token
-    return "your_huggingface_api_token"
+# Load the processor and model for Wav2Vec2 once
+def load_model():
+    MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
+    processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
+    model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
+    return processor, model
 
-# Function to query the Hugging Face Inference API
-def transcribe_audio_hf(audio):
+processor, model = load_model()
+
+def save_audio(audio_data, folder="recorded_audios"):
     """
-    Transcribes speech from an audio file using the Hugging Face Inference API.
+    Saves the recorded audio data to a file in the specified folder.
+
     Args:
-        audio (numpy.array): Audio data as a numpy array.
+        audio_data (str): The file path of the audio file.
+        folder (str): The directory where the audio file will be saved.
+
+    Returns:
+        str: The file path of the saved audio file.
+    """
+    # Ensure the folder exists
+    Path(folder).mkdir(parents=True, exist_ok=True)
+
+    # Generate a unique filename
+    filename = f"{uuid.uuid4()}.wav"
+    file_path = os.path.join(folder, filename)
+
+    # Move the audio file to the desired folder
+    os.rename(audio_data, file_path)
+
+    return file_path
+
+def transcribe_audio(audio_file_path):
+    """
+    Transcribes speech from an audio file using a pretrained Wav2Vec2 model.
+
+    Args:
+        audio_file_path (str): Path to the audio file.
+
     Returns:
         str: The transcription of the speech in the audio file.
     """
-    API_URL = "https://api-inference.huggingface.co/models/jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
-    headers = {"Authorization": f"Bearer {load_hf_token()}"}
-    response = requests.post(API_URL, headers=headers, data=audio.tobytes())
-    return response.json().get("text", "").strip()
+    speech_array, sampling_rate = librosa.load(audio_file_path, sr=16000)
+    input_values = processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True).input_values
+    with torch.no_grad():
+        logits = model(input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = processor.batch_decode(predicted_ids)[0].strip()
+    return transcription
 
-# Function to calculate Levenshtein similarity
 def levenshtein_similarity(transcription1, transcription2):
     """
     Calculate the Levenshtein similarity between two transcriptions.
+
     Args:
         transcription1 (str): The first transcription.
         transcription2 (str): The second transcription.
+
     Returns:
         float: A normalized similarity score between 0 and 1, where 1 indicates identical transcriptions.
     """
@@ -37,99 +72,72 @@ def levenshtein_similarity(transcription1, transcription2):
     max_len = max(len(transcription1), len(transcription2))
     return 1 - distance / max_len  # Normalize to get similarity score
 
-# Function to evaluate audio similarity
-def evaluate_audio_similarity(original_audio, user_audio):
+def evaluate_audio_similarity(original_audio_path, user_audio_path):
     """
     Compares the similarity between the transcription of an original audio file and a user's audio file.
+
     Args:
-        original_audio (numpy.array): Original audio data.
-        user_audio (numpy.array): User's audio data.
+        original_audio_path (str): Path to the original audio file.
+        user_audio_path (str): Path to the user's audio file.
+
     Returns:
         tuple: Transcriptions and Levenshtein similarity score.
     """
-    transcription_original = transcribe_audio_hf(original_audio)
-    transcription_user = transcribe_audio_hf(user_audio)
-    similarity_score = levenshtein_similarity(transcription_original, transcription_user)
-    return transcription_original, transcription_user, similarity_score
-
-# Set up the Whisper ASR model for full-context and streaming ASR
-whisper_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
-
-# Full-context ASR function
-def full_context_asr(audio):
-    sr, y = audio
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-    return whisper_transcriber({"sampling_rate": sr, "raw": y})["text"]
-
-# Streaming ASR function
-def streaming_asr(stream, new_chunk):
-    sr, y = new_chunk
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-
-    if stream is not None:
-        stream = np.concatenate([stream, y])
+    transcription_original = transcribe_audio(original_audio_path)
+    transcription_user = transcribe_audio(user_audio_path)
+    similarity_score_levenshtein = levenshtein_similarity(transcription_original, transcription_user)
+    return transcription_original, transcription_user, similarity_score_levenshtein
+
+def perform_testing(original_audio, user_audio):
+    # Debugging: Check if audio data is received
+    if original_audio is None:
+        print("Original audio is None")
     else:
-        stream = y
-
-    return stream, whisper_transcriber({"sampling_rate": sr, "raw": stream})["text"]
+        print(f"Original audio path: {original_audio}")
 
-# Define Gradio interface for full-context ASR
-def gradio_full_context_interface(audio):
-    if audio is not None:
-        transcription = full_context_asr(audio)
-        return transcription
+    if user_audio is None:
+        print("User audio is None")
     else:
-        return "Please provide an audio file."
+        print(f"User audio path: {user_audio}")
 
-# Define Gradio interface for audio similarity checking
-def gradio_similarity_interface(original_audio, user_audio):
-    if original_audio is not None and user_audio is not None:
-        transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(original_audio, user_audio)
-
-        result = {
-            "Original Transcription": transcription_original,
-            "User Transcription": transcription_user,
-            "Levenshtein Similarity Score": similarity_score,
-        }
-
-        if similarity_score > 0.8:  # Adjust the threshold as needed
-            result["Feedback"] = "The pronunciation is likely correct based on transcription similarity."
-        else:
-            result["Feedback"] = "The pronunciation may be incorrect based on transcription similarity."
-
-        return result
+    if original_audio is None or user_audio is None:
+        return {"Error": "Please provide both original and user audio."}
+
+    # Save the recorded audio files
+    original_audio_path = save_audio(original_audio)
+    user_audio_path = save_audio(user_audio)
+
+    transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(original_audio_path, user_audio_path)
+
+    result = {
+        "Original Transcription": transcription_original,
+        "User Transcription": transcription_user,
+        "Levenshtein Similarity Score": similarity_score,
+    }
+
+    if similarity_score > 0.8:
+        result["Feedback"] = "The pronunciation is likely correct based on transcription similarity."
     else:
-        return "Please provide both original and user audio for comparison."
-
-# Create Gradio app for full-context ASR
-full_context_demo = gr.Interface(
-    fn=gradio_full_context_interface,
-    inputs=gr.Audio(type="numpy", source="microphone"),
-    outputs="text",
-    title="Full-Context ASR Demo"
-)
-
-# Create Gradio app for streaming ASR
-streaming_demo = gr.Interface(
-    fn=streaming_asr,
-    inputs=["state", gr.Audio(type="numpy", streaming=True)],
-    outputs=["state", "text"],
-    live=True,
-    title="Streaming ASR Demo"
-)
-
-# Create Gradio app for audio similarity checking
-similarity_demo = gr.Interface(
-    fn=gradio_similarity_interface,
-    inputs=[
-        gr.Audio(type="numpy", label="Original Audio"),
-        gr.Audio(type="numpy", label="User Audio")
-    ],
-    outputs="json",
-    title="Audio Transcription and Similarity Checker"
-)
-
-# Launch all three demos
-gr.TabbedInterface([full_context_demo, streaming_demo, similarity_demo], ["Full-Context ASR", "Streaming ASR", "Similarity Checker"]).launch()
+        result["Feedback"] = "The pronunciation may be incorrect based on transcription similarity."
+
+    return result
+
+# Define the Gradio app for recording and processing audio
+def gradio_app():
+    with gr.Blocks() as demo:
+        gr.Markdown("# Audio Transcription and Similarity Checker")
+
+        original_audio = gr.Audio(label="Record Original Audio", type="filepath")
+        user_audio = gr.Audio(label="Record User Audio", type="filepath")
+
+        result_output = gr.JSON(label="Output")
+
+        # Button to perform the testing
+        test_button = gr.Button("Perform Testing")
+        test_button.click(perform_testing, inputs=[original_audio, user_audio], outputs=result_output)
+
+    return demo
+
+# Launch the Gradio app
+demo = gradio_app()
+demo.launch()
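
For reference, the feedback threshold kept across both versions of the file reduces to a normalized edit distance. A minimal sketch of that scoring path, assuming the python-Levenshtein package; the two strings below are hypothetical stand-ins for real model transcriptions:

    import Levenshtein

    def levenshtein_similarity(transcription1, transcription2):
        # Same normalization as in app.py: 1.0 means identical strings
        distance = Levenshtein.distance(transcription1, transcription2)
        max_len = max(len(transcription1), len(transcription2))
        return 1 - distance / max_len

    # Hypothetical transcriptions of the original and user recordings
    reference = "السلام عليكم"
    attempt = "السلام عليك"

    score = levenshtein_similarity(reference, attempt)
    print(f"similarity = {score:.2f}")  # app.py reports > 0.8 as likely correct

Here one deleted character out of twelve gives a score of about 0.92, so the app would report the pronunciation as likely correct.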