alessandro trinca tornidor committed
Commit 0700cb3 · 1 Parent(s): 2f5403b

doc: add/update docstring and typing hints
Files changed:
- AIModels.py +64 -5
- ModelInterfaces.py +4 -0
- RuleBasedModels.py +51 -0
- WordMatching.py +52 -1
- WordMetrics.py +32 -20
- faster_whisper_wrapper.py +28 -5
- lambdaChangeModel.py +3 -1
- lambdaGetSample.py +31 -13
- lambdaSpeechToScore.py +12 -12
- models.py +11 -0
- pronunciationTrainer.py +135 -15
- typing_hints.py +13 -0
- utilsFileIO.py +4 -4
- whisper_wrapper.py +19 -2
AIModels.py
CHANGED
@@ -8,22 +8,50 @@ class NeuralASR(ModelInterfaces.IASRModel):
     audio_transcript = None

     def __init__(self, model: torch.nn.Module, decoder) -> None:
+        """
+        Initialize the NeuralASR (Audio Speech Recognition) model.
+
+        Args:
+            model (torch.nn.Module): The neural network model for ASR.
+            decoder: The decoder to convert CTC outputs to transcripts.
+        """
         super().__init__()
         self.model = model
         self.decoder = decoder  # Decoder from CTC-outputs to transcripts

     def getTranscript(self) -> str:
+        """
+        Get the transcript of the processed audio.
+
+        Returns:
+            str: The audio transcript.
+
+        Raises:
+            AssertionError: If the audio has not been processed.
+        """
         assert self.audio_transcript is not None, 'Can get audio transcripts without having processed the audio'
         return self.audio_transcript

     def getWordLocations(self) -> list:
+        """
+        Get the word locations from the processed audio.
+
+        Returns:
+            list: A list of word locations in samples.
+
+        Raises:
+            AssertionError: If the audio has not been processed.
+        """
         assert self.word_locations_in_samples is not None, 'Can get word locations without having processed the audio'
         return self.word_locations_in_samples

-    def processAudio(self, audio: torch.Tensor):
+    def processAudio(self, audio: torch.Tensor) -> None:
+        """
+        Process the audio to generate transcripts and word locations.
+
+        Args:
+            audio (torch.Tensor): The input audio tensor.
+        """
         audio_length_in_samples = audio.shape[1]
         with torch.inference_mode():
             nn_output = self.model(audio)
@@ -34,11 +62,27 @@ class NeuralASR(ModelInterfaces.IASRModel):

 class NeuralTTS(ModelInterfaces.ITextToSpeechModel):
     def __init__(self, model: torch.nn.Module, sampling_rate: int) -> None:
+        """
+        Initialize the NeuralTTS (Text to Speech) model.
+
+        Args:
+            model (torch.nn.Module): The neural network model for TTS.
+            sampling_rate (int): The sampling rate for the audio.
+        """
         super().__init__()
         self.model = model
         self.sampling_rate = sampling_rate

     def getAudioFromSentence(self, sentence: str) -> np.array:
+        """
+        Generate audio from a given sentence.
+
+        Args:
+            sentence (str): The input sentence.
+
+        Returns:
+            np.array: The generated audio as a numpy array.
+        """
         with torch.inference_mode():
             audio_transcript = self.model.apply_tts(texts=[sentence],
                                                     sample_rate=self.sampling_rate)[0]
@@ -48,12 +92,27 @@ class NeuralTTS(ModelInterfaces.ITextToSpeechModel):

 class NeuralTranslator(ModelInterfaces.ITranslationModel):
     def __init__(self, model: torch.nn.Module, tokenizer) -> None:
+        """
+        Initialize the NeuralTranslator model.
+
+        Args:
+            model (torch.nn.Module): The neural network model for translation.
+            tokenizer: The tokenizer for text processing.
+        """
         super().__init__()
         self.model = model
         self.tokenizer = tokenizer

     def translateSentence(self, sentence: str) -> str:
+        """
+        Translate a given sentence to the target language.
+
+        Args:
+            sentence (str): The input sentence.
+
+        Returns:
+            str: The translated sentence.
+        """
         tokenized_text = self.tokenizer(sentence, return_tensors='pt')
         translation = self.model.generate(**tokenized_text)
         translated_text = self.tokenizer.batch_decode(
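A minimal usage sketch of the documented NeuralASR wrapper. The Silero torch.hub entry point below is an assumption about how the model and decoder are obtained; the diff itself does not show the loading code.

import torch
from AIModels import NeuralASR

# Assumption: the Silero STT hub entry point returns (model, decoder, utils).
model, decoder, utils = torch.hub.load('snakers4/silero-models', 'silero_stt', language='en')
asr = NeuralASR(model, decoder)

audio = torch.zeros((1, 16000))  # placeholder waveform: one second of silence
asr.processAudio(audio)          # runs the model and the CTC decoder
print(asr.getTranscript())
print(asr.getWordLocations())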
ModelInterfaces.py
CHANGED
@@ -4,6 +4,7 @@ import numpy as np


 class IASRModel(metaclass=abc.ABCMeta):
+    """Automatic Speech Recognition Model Interface"""
     @classmethod
     def __subclasshook__(cls, subclass):
         return (hasattr(subclass, 'getTranscript') and
@@ -30,6 +31,7 @@ class IASRModel(metaclass=abc.ABCMeta):


 class ITranslationModel(metaclass=abc.ABCMeta):
+    """Translation model"""
     @classmethod
     def __subclasshook__(cls, subclass):
         return (hasattr(subclass, 'translateSentence') and
@@ -42,6+44,7 @@ class ITranslationModel(metaclass=abc.ABCMeta):


 class ITextToSpeechModel(metaclass=abc.ABCMeta):
+    """Text to Speech model"""
     @classmethod
     def __subclasshook__(cls, subclass):
         return (hasattr(subclass, 'getAudioFromSentence') and
@@ -54,6 +57,7 @@ class ITextToSpeechModel(metaclass=abc.ABCMeta):


 class ITextToPhonemModel(metaclass=abc.ABCMeta):
+    """Text to Phonem model, needed to evaluate the correctness of speech pronunciation"""
     @classmethod
     def __subclasshook__(cls, subclass):
         return (hasattr(subclass, 'convertToPhonem') and
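Because these interfaces use abc.ABCMeta with __subclasshook__, any class exposing the required methods is treated as a virtual subclass. A small sketch (the DummyConverter class is hypothetical, not part of the repository):

import ModelInterfaces

class DummyConverter:
    # No explicit inheritance: the hook only checks that the required attribute exists.
    def convertToPhonem(self, sentence: str) -> str:
        return sentence

print(issubclass(DummyConverter, ModelInterfaces.ITextToPhonemModel))  # True, via the structural hasattr check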
RuleBasedModels.py
CHANGED
@@ -4,8 +4,22 @@ import numpy as np
 import epitran
 import eng_to_ipa

+from constants import app_logger
+

 def get_phonem_converter(language: str):
+    """
+    Get the phoneme converter for the specified language.
+
+    Args:
+        language (str): The language code (e.g., 'de' for German, 'en' for English).
+
+    Returns:
+        ModelInterfaces.ITextToPhonemModel: The phoneme converter for the specified language.
+
+    Raises:
+        ValueError: If the language is not implemented.
+    """
     if language == 'de':
         phonem_converter = EpitranPhonemConverter(
             epitran.Epitran('deu-Latn'))
@@ -17,24 +31,61 @@ def get_phonem_converter(language: str):
     return phonem_converter

 class EpitranPhonemConverter(ModelInterfaces.ITextToPhonemModel):
+    """
+    A phoneme converter using the Epitran library for transliteration.
+    """
     word_locations_in_samples = None
     audio_transcript = None

     def __init__(self, epitran_model) -> None:
+        """
+        Initialize the EpitranPhonemConverter with an Epitran model.
+
+        Args:
+            epitran_model: The Epitran model for transliteration.
+        """
         super().__init__()
         self.epitran_model = epitran_model

     def convertToPhonem(self, sentence: str) -> str:
+        """
+        Convert a sentence to its phoneme representation.
+
+        Args:
+            sentence (str): The input sentence.
+
+        Returns:
+            str: The phoneme representation of the sentence.
+        """
+        app_logger.debug(f'starting EpitranPhonemConverter.convertToPhonem for sentence/token "{sentence}"...')
         phonem_representation = self.epitran_model.transliterate(sentence)
+        app_logger.debug(f'EpitranPhonemConverter: got phonem_representation for sentence/token "{sentence}"!')
         return phonem_representation


 class EngPhonemConverter(ModelInterfaces.ITextToPhonemModel):
+    """
+    A phoneme converter for English using the eng_to_ipa library.
+    """

     def __init__(self,) -> None:
+        """
+        Initialize the EngPhonemConverter.
+        """
         super().__init__()

     def convertToPhonem(self, sentence: str) -> str:
+        """
+        Convert a sentence to its phoneme representation.
+
+        Args:
+            sentence (str): The input sentence.
+
+        Returns:
+            str: The phoneme representation of the sentence.
+        """
+        app_logger.debug(f'starting EngPhonemConverter.convertToPhonem for sentence/token "{sentence}"...')
         phonem_representation = eng_to_ipa.convert(sentence)
         phonem_representation = phonem_representation.replace('*','')
+        app_logger.debug(f'EngPhonemConverter: got phonem_representation for sentence/token "{sentence}"!')
         return phonem_representation
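A short usage sketch of the phoneme converters documented above; the IPA string in the comment is only indicative of what eng_to_ipa produces.

from RuleBasedModels import get_phonem_converter

converter = get_phonem_converter('en')            # EngPhonemConverter backed by eng_to_ipa
print(converter.convertToPhonem('Hello world'))   # roughly 'hɛˈloʊ wərld'
# get_phonem_converter('de') returns the Epitran-based converter;
# any other language code raises ValueError.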
WordMatching.py
CHANGED
@@ -14,6 +14,16 @@ TIME_THRESHOLD_MAPPING = 5.0


 def get_word_distance_matrix(words_estimated: list, words_real: list) -> np.ndarray:
+    """
+    Calculate the word distance matrix using Levenshtein distance.
+
+    Args:
+        words_estimated (list): List of estimated words.
+        words_real (list): List of real words.
+
+    Returns:
+        np.ndarray: The word distance matrix.
+    """
     number_of_real_words = len(words_real)
     number_of_estimated_words = len(words_estimated)

@@ -32,6 +42,15 @@ def get_word_distance_matrix(words_estimated: list, words_real: list) -> np.ndarray:


 def get_best_path_from_distance_matrix(word_distance_matrix):
+    """
+    Get the best path from the word distance matrix using constraint programming.
+
+    Args:
+        word_distance_matrix (np.ndarray): The word distance matrix.
+
+    Returns:
+        np.ndarray: The best path indices.
+    """
     modelCpp = cp_model.CpModel()

     number_of_real_words = word_distance_matrix.shape[1]
@@ -86,6 +105,17 @@ def get_best_path_from_distance_matrix(word_distance_matrix):


 def get_resulting_string(mapped_indices: np.ndarray, words_estimated: list, words_real: list) -> Tuple[List, List]:
+    """
+    Get the resulting string and indices from the mapped indices.
+
+    Args:
+        mapped_indices (np.ndarray): The mapped indices.
+        words_estimated (list): List of estimated words.
+        words_real (list): List of real words.
+
+    Returns:
+        Tuple[List, List]: The mapped words and their indices.
+    """
     mapped_words = []
     mapped_words_indices = []
     WORD_NOT_FOUND_TOKEN = '-'
@@ -128,6 +158,17 @@ def get_resulting_string(mapped_indices: np.ndarray, words_estimated: list, words_real: list) -> Tuple[List, List]:


 def get_best_mapped_words(words_estimated: list | str, words_real: list | str, use_dtw:bool = False) -> tuple[list, list]:
+    """
+    Get the best mapped words using either DTW or constraint programming.
+
+    Args:
+        words_estimated (list | str): List of estimated words or a single estimated word.
+        words_real (list | str): List of real words or a single real word.
+        use_dtw (bool, optional): Whether to use DTW for mapping. Defaults to False.
+
+    Returns:
+        tuple[list, list]: The mapped words and their indices.
+    """
     app_logger.info(f"words_estimated: '{words_estimated}', words_real: '{words_real}', use_dtw:{use_dtw}.")
     word_distance_matrix = get_word_distance_matrix(
         words_estimated, words_real)
@@ -175,7 +216,17 @@ def get_best_mapped_words(words_estimated: list | str, words_real: list | str, use_dtw:bool = False) -> tuple[list, list]:
     # return mapped_words, mapped_words_indices


-def getWhichLettersWereTranscribedCorrectly(real_word, transcribed_word):
+def getWhichLettersWereTranscribedCorrectly(real_word: str, transcribed_word: list) -> list:
+    """
+    Determine which letters were transcribed correctly.
+
+    Args:
+        real_word (str): The real word.
+        transcribed_word (str): The transcribed word.
+
+    Returns:
+        list: A list indicating whether each letter was transcribed correctly (1 for correct, 0 for incorrect).
+    """
     is_leter_correct = [None] * len(real_word)
     for idx, letter in enumerate(real_word):
         letter = letter.lower()
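A minimal sketch of the word-mapping helpers; the values in the comments are illustrative, not taken from the diff.

from WordMatching import get_best_mapped_words, getWhichLettersWereTranscribedCorrectly

words_real = ['the', 'cat', 'sat']
words_estimated = ['the', 'cap', 'sat']            # the recognizer misheard "cat"
mapped_words, mapped_indices = get_best_mapped_words(words_estimated, words_real)
print(mapped_words, mapped_indices)                 # e.g. ['the', 'cap', 'sat'] aligned to the real words
print(getWhichLettersWereTranscribedCorrectly('cat', mapped_words[1]))  # e.g. [1, 1, 0]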
WordMetrics.py
CHANGED
@@ -1,10 +1,12 @@
 import numpy as np

+
 # ref from https://gitlab.com/-/snippets/1948157
 # For some variants, look here https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python

-# Pure python
-def edit_distance_python2(a, b):
+# Pure/numpy python
+def edit_distance_python2(a: str, b: str) -> np.ndarray | int:
+    """A pure python levenshtein distance implementation"""
     # This version is commutative, so as an optimization we force |a|>=|b|
     if len(a) < len(b):
         return edit_distance_python(b, a)
@@ -12,44 +14,54 @@ def edit_distance_python2(a, b):
         return len(a)
     # Only two rows are really needed: the one currently filled in, and the previous
     distances = []
-    distances.append([i for i in range(len(b)+1)])
-    distances.append([0 for _ in range(len(b)+1)])
+    distances.append([i for i in range(len(b) + 1)])
+    distances.append([0 for _ in range(len(b) + 1)])
     # We can prefill the first row:
     costs = [0 for _ in range(3)]
     for i, a_token in enumerate(a, start=1):
         distances[1][0] += 1  # Deals with the first column.
         for j, b_token in enumerate(b, start=1):
-            costs[0] = distances[1][j-1] + 1
+            costs[0] = distances[1][j - 1] + 1
             costs[1] = distances[0][j] + 1
-            costs[2] = distances[0][j-1] + (0 if a_token == b_token else 1)
+            costs[2] = distances[0][j - 1] + (0 if a_token == b_token else 1)
             distances[1][j] = min(costs)
         # Move to the next row:
         distances[0][:] = distances[1][:]
     return distances[1][len(b)]

+
 #https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/
-def edit_distance_python(seq1, seq2):
+def edit_distance_python(seq1: str, seq2: str) -> np.ndarray:
+    """A levenshtein distance implementation.
+
+    Args:
+        seq1 (str): First sequence.
+        seq2 (str): Second sequence.
+
+    Returns:
+        np.ndarray: The levenshtein distance between the two sequences.
+    """
     size_x = len(seq1) + 1
     size_y = len(seq2) + 1
+    matrix = np.zeros((size_x, size_y))
     for x in range(size_x):
+        matrix[x, 0] = x
     for y in range(size_y):
+        matrix[0, y] = y

     for x in range(1, size_x):
         for y in range(1, size_y):
+            if seq1[x - 1] == seq2[y - 1]:
+                matrix[x, y] = min(
+                    matrix[x - 1, y] + 1,
+                    matrix[x - 1, y - 1],
+                    matrix[x, y - 1] + 1
                 )
             else:
+                matrix[x, y] = min(
+                    matrix[x - 1, y] + 1,
+                    matrix[x - 1, y - 1] + 1,
+                    matrix[x, y - 1] + 1
                 )
     #print (matrix)
+    return matrix[size_x - 1, size_y - 1]
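A worked example for the edit-distance helpers above (my own illustration, not part of the commit):

from WordMetrics import edit_distance_python

# "kitten" -> "sitting" needs three edits: k->s, e->i, append g.
print(edit_distance_python('kitten', 'sitting'))   # 3.0 (a float, since the numpy matrix is float64)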
faster_whisper_wrapper.py
CHANGED
@@ -3,10 +3,12 @@ from typing import Union
 import numpy as np
 import onnxruntime
 import torch
-from faster_whisper import WhisperModel
+from faster_whisper import WhisperModel, transcribe

 from ModelInterfaces import IASRModel
 from constants import sample_rate_resample, app_logger, IS_TESTING, DEVICE
+from typing_hints import ParsedWordInfo
+

 device = onnxruntime.get_device()
 device = "cpu" if IS_TESTING or device.lower() == DEVICE.lower() else device
@@ -15,7 +17,16 @@ device_compute = "int8_float16" if device == "cuda" else "int8"
 app_logger.info(f"device: {device}, device_compute: {device_compute} #")


-def parse_word_info(word_info, sample_rate):
+def parse_word_info(word_info: transcribe.Word, sample_rate: int) -> ParsedWordInfo:
+    """Parse a word info object from WhisperModel into a dictionary with start and end timestamps.
+
+    Args:
+        word_info (transcribe.Word): Word object from WhisperModel.transcribe module
+        sample_rate (int): Sample rate of the audio
+
+    Returns:
+        ParsedWordInfo: Dictionary with the current single word, start_ts and end_ts keys
+    """
     start_ts = float(word_info.start) * sample_rate
     end_ts = float(word_info.end) * sample_rate
     word = word_info.word
@@ -23,14 +34,24 @@


 class FasterWhisperASRModel(IASRModel):
+    """Faster Whisper ASR model wrapper class. This class is used to transcribe audio and store the transcript and word locations."""
+    def __init__(self, model_name:str="base", language:str=None):
         self.asr = WhisperModel(model_name, device=device, compute_type=device_compute)
         self._transcript = ""
         self._word_locations = []
         self.sample_rate = sample_rate_resample
         self.language = language

-    def processAudio(self, audio:Union[np.ndarray, torch.Tensor]):
+    def processAudio(self, audio:Union[np.ndarray, torch.Tensor]) -> None:
+        """Transcribe audio and store the transcript and word locations updating self._transcript and self._word_locations,
+        get these values using getTranscript() and getWordLocations() respectively.
+
+        Args:
+            audio (np.ndarray or torch.Tensor): Audio samples to transcribe.
+
+        Returns:
+            None
+        """
         # 'audio' can be a path to a file or a numpy array of audio samples.
         if isinstance(audio, torch.Tensor):
             audio = audio.detach().cpu().numpy()
@@ -50,7 +71,9 @@ class FasterWhisperASRModel(IASRModel):
         self._transcript = " ".join(transcript)

     def getTranscript(self) -> str:
+        """Get the transcript of the audio."""
         return self._transcript

-    def getWordLocations(self) -> list:
+    def getWordLocations(self) -> list[ParsedWordInfo]:
+        """Get a list of ParsedWordInfo"""
         return self._word_locations
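A hedged usage sketch of the faster-whisper wrapper; it assumes a 'base' checkpoint can be loaded or downloaded on the current device.

import numpy as np
from faster_whisper_wrapper import FasterWhisperASRModel

asr = FasterWhisperASRModel(model_name='base', language='en')
samples = np.zeros(16000, dtype=np.float32)   # placeholder audio: one second of silence at 16 kHz
asr.processAudio(samples)
print(asr.getTranscript())
print(asr.getWordLocations())                  # list of ParsedWordInfo dicts (word, start_ts, end_ts)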
lambdaChangeModel.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+from typing import Any

 import pronunciationTrainer

@@ -6,7 +7,8 @@ import pronunciationTrainer
 trainer_SST_lambda = {'de': pronunciationTrainer.getTrainer("de"), 'en': pronunciationTrainer.getTrainer("en")}


-def lambda_handler(event, context):
+def lambda_handler(event: dict[str], context: Any) -> str:
+    """Lambda handler to change the model used by the pronunciation trainer (Currently not used)"""
     data = json.loads(event['body'])
     model_name = data['modelName']
     trainer_SST_lambda["de"] = pronunciationTrainer.getTrainer("de", model_name=model_name)
lambdaGetSample.py
CHANGED
@@ -5,26 +5,44 @@ import pandas as pd

 import RuleBasedModels
 from constants import app_logger
+from typing_hints import Category


 class TextDataset:
+    """Sentences dataset."""
+    def __init__(self, table: pd.DataFrame, language: str):
         self.table_dataframe = table
         self.language = language

-    def __getitem__(self, idx):
+    def __getitem__(self, idx) -> list[str]:
         line = [self.table_dataframe['sentence'].iloc[idx]]
         return line

-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.table_dataframe)

+    def get_category_from_df(self, category_value:Category) -> pd.DataFrame:
+        """Filter the sentence dataframe by category returning
+
+        Args:
+            category_value (int): The category value to filter the dataframe.
+
+        Returns:
+            pd.DataFrame: The filtered dataframe.
+        """
         selector = self.table_dataframe["category"] == category_value
         df_by_category = self.table_dataframe[selector]
         return df_by_category

+    def get_random_sample_from_df(self, category_value:Category) -> list[str]:
+        """Get a random sentence from the category filtered dataframe.
+
+        Args:
+            category_value (int): The category value to filter the dataframe.
+
+        Returns:
+            list: A list with the selected sentence.
+        """
         app_logger.info(f"language={self.language}, category_value={category_value}.")
         choice = self.table_dataframe.sample(n=1)
         if category_value !=0:
@@ -49,11 +67,11 @@ for lang in available_languages:
 lambda_translate_new_sample = False


-def lambda_handler(event, context):
+def lambda_handler(event: dict[str], context) -> str:
     """
     lambda handler to return a random text sample from the dataset.

+    Args:
         event (dict): The event data passed to the Lambda function.
         context (dict): The context in which the Lambda function is called.

@@ -87,20 +105,20 @@ def lambda_handler(event, context):
         raise ex


+def get_random_selection(language: str, category_value: Category) -> str:
     """
     Get a random text sample from the dataset.

+    Args:
         language (str): The language code.
+        category_value (int): The category value to filter the dataset.

     Returns:
         str: The selected text sample.
     """
     lambda_df_lang = lambda_database[language]
+    current_transcript = lambda_df_lang.get_random_sample_from_df(category_value)
+    app_logger.info(f"category_value={category_value}, language={language}, current_transcript={current_transcript}.")
     return current_transcript[0]


@@ -121,7 +139,7 @@ def get_enriched_dataframe_csv(
     """
     Read a csv dataframe adding a 'category' column.

+    Args:
         language (str): The language code (e.g. "de" for German).
         custom_dataframe_csv_filename_no_ext (str): The csv dataframe without extension.
         custom_folder (Path): The folder containing the csv dataframe.
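A small sketch of the TextDataset API documented above. The dataframe content is invented for illustration; the real module builds its datasets from CSV files with 'sentence' and 'category' columns at import time.

import pandas as pd
from lambdaGetSample import TextDataset

table = pd.DataFrame({
    'sentence': ['Guten Morgen!', 'Das ist ein Haus.', 'Wie geht es dir?'],
    'category': [1, 2, 1],
})
dataset = TextDataset(table, language='de')
print(len(dataset), dataset[0])
print(dataset.get_random_sample_from_df(1))   # one sentence drawn from category 1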
lambdaSpeechToScore.py
CHANGED
@@ -24,16 +24,16 @@ trainer_SST_lambda = {'de': pronunciationTrainer.getTrainer("de"), 'en': pronunc
 transform = Resample(orig_freq=sample_rate_start, new_freq=sample_rate_resample)


+def lambda_handler(event: dict[str], context: Any) -> str:
     """
     Lambda handler for speech-to-score.

+    Args:
         event (Dict[str, Any]): The event data containing the request body.
         context (Any): The context in which the lambda function is executed.

     Returns:
+        str: The json response containing the speech-to-score results.
     """
     body = event['body']
     data = json.loads(body)
@@ -67,7 +67,7 @@ def get_speech_to_score_dict(
     """
     Process the audio file and return a dictionary with speech-to-score results.

+    Args:
         use_dtw:
         real_text (str): The text to be matched with the audio.
         file_bytes_or_audiotmpfile (str | bytes | dict): The audio file in bytes or a temporary file.
@@ -184,7 +184,7 @@ def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str |
     """
     Process the audio file and return a tuple with speech-to-score results.

+    Args:
         real_text (str): The text to be matched with the audio.
         file_bytes_or_audiotmpfile (str | dict): The audio file in bytes or a temporary file.
         language (str): The language of the audio.
@@ -227,7 +227,7 @@ def soundfile_write(audiofile: str | Path, data: np.ndarray, samplerate: int) ->
     """
     Write audio data to a file using soundfile.

+    Args:
         audiofile (str | Path): The path to the audio file.
         data (np.ndarray): The audio data to write.
         samplerate (int): The sample rate of the audio data.
@@ -243,7 +243,7 @@ def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> tuple[str
     """
     Get the selected word, its audio file, and duration from the recognition output.

+    Args:
         idx_recorded_word (int): The index of the recorded word.
         raw_json_output (str): The JSON output from the recognition process.

@@ -267,7 +267,7 @@ def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], e
     """
     Split the audio file into segments based on start and end times.

+    Args:
         audiotmpfile (str | Path): The path to the audio file.
         start_time (list[float]): The start times of the segments.
         end_time (list[float]): The end times of the segments.
@@ -296,7 +296,7 @@ def get_file_with_custom_suffix(basefile: str | Path, custom_suffix: str) -> Pat
     """
     Generate a file path with a custom suffix.

+    Args:
         basefile (str | Path): The base file path.
         custom_suffix (str): The custom suffix to add to the file name.

@@ -315,7 +315,7 @@ def calc_start_end(sr_native: int, time_position: float, n_channels: int) -> int
     """
     Calculate the start or end position in samples.

+    Args:
         sr_native (int): The native sample rate.
         time_position (float): The time position in seconds.
         n_channels (int): The number of audio channels.
@@ -330,7 +330,7 @@ def soundfile_load(path: str | Path, offset: float = 0.0, duration: float = None
     """
     Load an audio buffer using soundfile.

+    Args:
         path (str | Path): The path to the audio file.
         offset (float): The offset in seconds to start reading the file.
         duration (float): The duration in seconds to read from the file.
@@ -369,7 +369,7 @@ def audioread_load(path: str | Path, offset: float = 0.0, duration: float = None
     """
     This loads one block at a time, and then concatenates the results.

+    Args:
         path (str | Path): The path to the audio file.
         offset (float): The offset in seconds to start reading the file.
         duration (float): The duration in seconds to read from the file.
models.py
CHANGED
@@ -19,6 +19,16 @@ default_speaker_dict = {


 def getASRModel(language: str, model_name: str = MODEL_NAME_DEFAULT) -> IASRModel:
+    """Wrapper function to get the ASR model based on the model name and language.
+    Currently supported models are 'whisper', 'faster_whisper', and 'silero'.
+
+    Args:
+        language: str: The language of the model.
+        model_name: str: The name of the model to use. Default is 'whisper'.
+
+    Returns:
+        IASRModel: The ASR model instance.
+    """
     models_dict = {
         "whisper": __get_model_whisper,
         "faster_whisper": __get_model_faster_whisper,
@@ -63,6 +73,7 @@ def __eval_apply_neural_asr(model: nn.Module, decoder: Decoder, language: str):


 def getTranslationModel(language: str) -> nn.Module:
+    """Wrapper function to get the translation model based on the language."""
     from transformers import AutoTokenizer
     from transformers import AutoModelForSeq2SeqLM
     if language == 'de':
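Usage sketch for the model factory (a guess at typical use; the chosen checkpoint is downloaded on first call):

from models import getASRModel

asr_model = getASRModel('en', model_name='faster_whisper')
print(type(asr_model).__name__)   # e.g. FasterWhisperASRModel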
pronunciationTrainer.py
CHANGED
@@ -13,26 +13,25 @@ import models as mo
 from constants import app_logger, MODEL_NAME_DEFAULT, sample_rate_resample


-        phonem_converter = RuleBasedModels.EpitranPhonemConverter(epitran.Epitran('deu-Latn'))
-    elif language == 'en':
-        phonem_converter = RuleBasedModels.EngPhonemConverter()
-    else:
-        raise ValueError(f"Language '{language}' not implemented")
-    trainer = PronunciationTrainer(asr_model, phonem_converter)
-
-    return trainer
+def preprocessAudioStandalone(audio: torch.tensor) -> torch.tensor:
+    """
+    Preprocess the audio by normalizing it.
+
+    Args:
+        audio (torch.tensor): The input audio tensor.
+
+    Returns:
+        torch.tensor: The normalized audio tensor.
+    """
     audio = audio-torch.mean(audio)
     audio = audio/torch.max(torch.abs(audio))
     return audio


 class PronunciationTrainer:
+    """
+    A class to train and evaluate pronunciation accuracy using ASR and phoneme conversion models.
+    """
     current_transcript: str
     current_ipa: str

@@ -46,11 +45,26 @@ class PronunciationTrainer:
     sampling_rate = sample_rate_resample

     def __init__(self, asr_model: mi.IASRModel, word_to_ipa_coverter: mi.ITextToPhonemModel) -> None:
+        """
+        Initialize the PronunciationTrainer with ASR and phoneme conversion models.
+
+        Args:
+            asr_model (mi.IASRModel): The ASR model to use.
+            word_to_ipa_coverter (mi.ITextToPhonemModel): The phoneme conversion model to use.
+        """
         self.asr_model = asr_model
         self.ipa_converter = word_to_ipa_coverter

-    def getTranscriptAndWordsLocations(self, audio_length_in_samples: int):
+    def getTranscriptAndWordsLocations(self, audio_length_in_samples: int) -> tuple[str, list]:
+        """
+        Get the transcript and word locations from the ASR model.
+
+        Args:
+            audio_length_in_samples (int): The length of the audio in samples.
+
+        Returns:
+            tuple: A tuple containing the audio transcript and word locations in samples.
+        """
         audio_transcript = self.asr_model.getTranscript()
         word_locations_in_samples = self.asr_model.getWordLocations()

@@ -77,8 +91,17 @@ class PronunciationTrainer:

     ##################### ASR Functions ###########################

-    def processAudioForGivenText(self, recordedAudio: torch.Tensor = None, real_text=None):
+    def processAudioForGivenText(self, recordedAudio: torch.Tensor = None, real_text=None) -> dict:
+        """
+        Process the recorded audio and evaluate pronunciation accuracy.
+
+        Args:
+            recordedAudio (torch.Tensor, optional): The recorded audio tensor. Defaults to None.
+            real_text (str, optional): The real text to compare against. Defaults to None.
+
+        Returns:
+            dict: A dictionary containing the evaluation results.
+        """
         start = time.time()
         recording_transcript, recording_ipa, word_locations = self.getAudioTranscript(
             recordedAudio)
@@ -108,7 +131,16 @@ class PronunciationTrainer:

         return result

-    def getAudioTranscript(self, recordedAudio: torch.Tensor = None):
+    def getAudioTranscript(self, recordedAudio: torch.Tensor = None) -> tuple[str | list]:
+        """
+        Get the transcript and IPA representation of the recorded audio.
+
+        Args:
+            recordedAudio (torch.Tensor, optional): The recorded audio tensor. Defaults to None.
+
+        Returns:
+            tuple: A tuple containing the transcript, IPA representation, and word locations.
+        """
         current_recorded_audio = recordedAudio

         current_recorded_audio = self.preprocessAudio(
@@ -124,6 +156,16 @@ class PronunciationTrainer:
         return current_recorded_transcript, current_recorded_ipa, current_recorded_word_locations

     def getWordLocationsFromRecordInSeconds(self, word_locations, mapped_words_indices) -> list:
+        """
+        Get the start and end times of words in the recorded audio in seconds.
+
+        Args:
+            word_locations (list): The word locations in samples.
+            mapped_words_indices (list): The indices of the mapped words.
+
+        Returns:
+            list: A list containing the start and end times of words in seconds.
+        """
         app_logger.info(f"len_list: word_locations:{len(word_locations)}, mapped_words_indices:{len(mapped_words_indices)}, {len(word_locations) == len(mapped_words_indices)}...")
         start_time = []
         end_time = []
@@ -138,6 +180,16 @@ class PronunciationTrainer:

     ##################### Evaluation Functions ###########################
     def matchSampleAndRecordedWords(self, real_text, recorded_transcript):
+        """
+        Match the real text with the recorded transcript and get the IPA representations.
+
+        Args:
+            real_text (str): The real text to compare against.
+            recorded_transcript (str): The recorded transcript.
+
+        Returns:
+            tuple: A tuple containing the matched words, IPA representations, and mapped word indices.
+        """
         words_estimated = recorded_transcript.split()

         try:
@@ -160,6 +212,15 @@ class PronunciationTrainer:
         return real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices

     def getPronunciationAccuracy(self, real_and_transcribed_words_ipa) -> float:
+        """
+        Calculate the pronunciation accuracy based on the IPA representations.
+
+        Args:
+            real_and_transcribed_words_ipa (list): A list of tuples containing the real and transcribed IPA representations.
+
+        Returns:
+            float: The percentage of correct pronunciations.
+        """
         total_mismatches = 0.
         number_of_phonemes = 0.
         current_words_pronunciation_accuracy = []
@@ -181,9 +242,27 @@ class PronunciationTrainer:
         return np.round(percentage_of_correct_pronunciations), current_words_pronunciation_accuracy

     def removePunctuation(self, word: str) -> str:
+        """
+        Remove punctuation from a word.
+
+        Args:
+            word (str): The input word.
+
+        Returns:
+            str: The word without punctuation.
+        """
         return ''.join([char for char in word if char not in punctuation])

     def getWordsPronunciationCategory(self, accuracies) -> list:
+        """
+        Get the pronunciation category for each word based on accuracy.
+
+        Args:
+            accuracies (list): A list of pronunciation accuracies.
+
+        Returns:
+            list: A list of pronunciation categories.
+        """
         categories = []

         for accuracy in accuracies:
@@ -193,7 +272,48 @@ class PronunciationTrainer:
         return categories

     def getPronunciationCategoryFromAccuracy(self, accuracy) -> int:
+        """
+        Get the pronunciation category based on accuracy.
+
+        Args:
+            accuracy (float): The pronunciation accuracy.
+
+        Returns:
+            int: The pronunciation category.
+        """
         return np.argmin(abs(self.categories_thresholds-accuracy))

     def preprocessAudio(self, audio: torch.tensor) -> torch.tensor:
+        """
+        Preprocess the audio by normalizing it.
+
+        Args:
+            audio (torch.tensor): The input audio tensor.
+
+        Returns:
+            torch.tensor: The normalized audio tensor.
+        """
         return preprocessAudioStandalone(audio)
+
+
+def getTrainer(language: str, model_name: str = MODEL_NAME_DEFAULT) -> PronunciationTrainer:
+    """
+    Get a PronunciationTrainer instance for the specified language and model.
+
+    Args:
+        language (str): The language of the model.
+        model_name (str, optional): The name of the model. Defaults to MODEL_NAME_DEFAULT.
+
+    Returns:
+        PronunciationTrainer: An instance of PronunciationTrainer.
+    """
+    asr_model = mo.getASRModel(language, model_name=model_name)
+    if language == 'de':
+        phonem_converter = RuleBasedModels.EpitranPhonemConverter(epitran.Epitran('deu-Latn'))
+    elif language == 'en':
+        phonem_converter = RuleBasedModels.EngPhonemConverter()
+    else:
+        raise ValueError(f"Language '{language}' not implemented")
+    trainer = PronunciationTrainer(asr_model, phonem_converter)
+
+    return trainer
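End-to-end usage sketch of the refactored getTrainer; the exact keys of the result dictionary are not shown in this diff, so the final print is only indicative.

import torch
from pronunciationTrainer import getTrainer

trainer = getTrainer('en')                       # English ASR model + EngPhonemConverter
recording = torch.zeros((1, 16000))              # placeholder waveform standing in for a real recording
result = trainer.processAudioForGivenText(recording, real_text='hello world')
print(result)                                    # dict with transcript, IPA and accuracy information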
typing_hints.py
ADDED
@@ -0,0 +1,13 @@
+from typing import Annotated, Optional, TypeAlias, TypedDict
+
+import annotated_types
+
+
+Category: TypeAlias = Annotated[int, annotated_types.Ge(0), annotated_types.Le(4)]
+PositiveFloat: TypeAlias = Annotated[float, annotated_types.Ge(0)]
+
+
+class ParsedWordInfo(TypedDict):
+    word: str
+    start_ts: PositiveFloat
+    end_ts: PositiveFloat
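A short note on these aliases: the annotated_types constraints document the intended ranges but are not enforced at runtime unless a validator such as pydantic consumes them. Minimal sketch:

from typing_hints import Category, ParsedWordInfo

word: ParsedWordInfo = {'word': 'hello', 'start_ts': 1230.0, 'end_ts': 4567.0}
category: Category = 1          # documented range is 0..4, but plain Python will not reject 5
print(word, category)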
utilsFileIO.py
CHANGED
@@ -1,5 +1,6 @@
 import string
 import random
+from flask import Response

 from constants import ALLOWED_ORIGIN

@@ -11,14 +12,13 @@ headers = {
 }


-def generateRandomString(str_length: int = 20):
+def generateRandomString(str_length: int = 20) -> str:
     # printing lowercase
     letters = string.ascii_lowercase
     return ''.join(random.choice(letters) for i in range(str_length))


-def return_response(body, mimetype="application/json", status=200):
-    from flask import Response
+def return_response(body, mimetype="application/json", status=200) -> Response:
     return Response(
         response=body,
         status=status,
@@ -27,5 +27,5 @@ def return_response(body, mimetype="application/json", status=200):
     )


-def return_response_ok(body, mimetype="application/json"):
+def return_response_ok(body, mimetype="application/json") -> Response:
     return return_response(body, mimetype, 200)
whisper_wrapper.py
CHANGED
@@ -8,7 +8,12 @@ from ModelInterfaces import IASRModel
 from constants import sample_rate_resample, app_logger


-def parse_word_info(word_info, sample_rate):
+def parse_word_info(word_info: dict, sample_rate: int) -> dict:
+    """Parse a word info object from WhisperModel into a dictionary with start and end timestamps.
+
+    Args:
+        word_info (dict): Word dictionary object
+    """
     word = word_info["word"]
     start_ts = float(word_info["start"]) * sample_rate
     end_ts = float(word_info["end"]) * sample_rate
@@ -16,6 +21,7 @@ def parse_word_info(word_info, sample_rate):


 class WhisperASRModel(IASRModel):
+    """Whisper ASR model wrapper class. This class is used to transcribe audio and store the transcript and word locations."""
     def __init__(self, model_name="base", language=None):
         self.asr = whisper.load_model(model_name)
         self._transcript = ""
@@ -24,6 +30,15 @@ class WhisperASRModel(IASRModel):
         self.language = language

     def processAudio(self, audio:Union[np.ndarray, torch.Tensor]):
+        """Transcribe audio and store the transcript and word locations updating self._transcript and self._word_locations,
+        get these values using getTranscript() and getWordLocations() respectively.
+
+        Args:
+            audio (np.ndarray or torch.Tensor): Audio samples to transcribe.
+
+        Returns:
+            None
+        """
         # 'audio' can be a path to a file or a numpy array of audio samples.
         if isinstance(audio, torch.Tensor):
             audio = audio.detach().cpu().numpy()
@@ -41,7 +56,9 @@ class WhisperASRModel(IASRModel):
         app_logger.info(f"elaborated segment {segment['id']}/{len_segments-1}: type={type(segment)}, len(words):{len(words)}, text:{segment['text']} #")

     def getTranscript(self) -> str:
+        """Get the transcript of the audio."""
         return self._transcript

-    def getWordLocations(self) -> list:
+    def getWordLocations(self) -> list[dict]:
+        """Get the word locations of the audio."""
         return self._word_locations