MERaLiON
/

MERaLiON-AudioLLM-Whisper-SEA-LION

@@ -48,7 +48,6 @@ class MERaLiONProcessor(ProcessorMixin):
     tokenizer_class = "GemmaTokenizer"
     valid_kwargs = [
         "fixed_speech_embeds_length",
-        "speech_signature",
         "speech_token_index",
         "time_duration_limit",
         "do_normalize"
@@ -59,13 +58,11 @@ class MERaLiONProcessor(ProcessorMixin):
         feature_extractor=None,
         tokenizer=None,
         fixed_speech_embeds_length=100,
-        speech_signature="<SpeechHere>",
         speech_token_index=255999,
         time_duration_limit=-1,
         do_normalize=True
     ):
         self.fixed_speech_embeds_length = fixed_speech_embeds_length
-        self.speech_signature = speech_signature
         self.speech_token_index = speech_token_index
         self.time_duration_limit = time_duration_limit
         self.do_normalize = do_normalize
@@ -74,12 +71,12 @@ class MERaLiONProcessor(ProcessorMixin):
         self.speech_token = self.tokenizer.added_tokens_decoder[self.speech_token_index].content
-    def _process_text(self, text, speech_signature):
         target_string = self.speech_token * self.fixed_speech_embeds_length
         if isinstance(text, list) or isinstance(text, tuple):
-            pieces = [item.replace(speech_signature, target_string) for item in text]
             return pieces
-        return text.replace(speech_signature, target_string)
     def _slice_audios(self, audios, time_duration_limit, sampling_rate):
         if time_duration_limit <= 0:
@@ -101,7 +98,6 @@ class MERaLiONProcessor(ProcessorMixin):
         audios: Union[np.ndarray, List[np.ndarray]] = None,
         padding: Union[bool, str, PaddingStrategy] = True,
         sampling_rate: Optional[int] = None,
-        speech_signature: Optional[str] = None,
         time_duration_limit: Optional[int] = None,
         do_normalize: Optional[bool] = None,
         **kwargs,
@@ -131,8 +127,6 @@ class MERaLiONProcessor(ProcessorMixin):
                   lengths).
             sampling_rate (`int`, defaults to 16000):
                 The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
-            speech_signature (`str`, defaults to `<SpeechHere>`):
-                The special string marking the location of speech tokens.
             time_duration_limit (`int`, defaults -1):
                 The max input time duration in seconds.
             do_normalize (`bool`, defaults to `True`):
@@ -144,8 +138,6 @@ class MERaLiONProcessor(ProcessorMixin):
             raise ValueError("You need to specify either a `text` input to process.")
         if sampling_rate is None:
             sampling_rate = self.feature_extractor.sampling_rate
-        if speech_signature is None:
-            speech_signature = self.speech_signature
         if time_duration_limit is None:
             time_duration_limit = self.time_duration_limit
         if do_normalize is None:
@@ -153,7 +145,7 @@ class MERaLiONProcessor(ProcessorMixin):
         inputs_dict = {}
-        text = self._process_text(text, speech_signature)
         text_input = self.tokenizer(
             text=text,

     tokenizer_class = "GemmaTokenizer"
     valid_kwargs = [
         "fixed_speech_embeds_length",
         "speech_token_index",
         "time_duration_limit",
         "do_normalize"
         feature_extractor=None,
         tokenizer=None,
         fixed_speech_embeds_length=100,
         speech_token_index=255999,
         time_duration_limit=-1,
         do_normalize=True
     ):
         self.fixed_speech_embeds_length = fixed_speech_embeds_length
         self.speech_token_index = speech_token_index
         self.time_duration_limit = time_duration_limit
         self.do_normalize = do_normalize
         self.speech_token = self.tokenizer.added_tokens_decoder[self.speech_token_index].content
+    def _process_text(self, text):
         target_string = self.speech_token * self.fixed_speech_embeds_length
         if isinstance(text, list) or isinstance(text, tuple):
+            pieces = [item.replace(self.speech_token, target_string) for item in text]
             return pieces
+        return text.replace(self.speech_token, target_string)
     def _slice_audios(self, audios, time_duration_limit, sampling_rate):
         if time_duration_limit <= 0:
         audios: Union[np.ndarray, List[np.ndarray]] = None,
         padding: Union[bool, str, PaddingStrategy] = True,
         sampling_rate: Optional[int] = None,
         time_duration_limit: Optional[int] = None,
         do_normalize: Optional[bool] = None,
         **kwargs,
                   lengths).
             sampling_rate (`int`, defaults to 16000):
                 The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
             time_duration_limit (`int`, defaults -1):
                 The max input time duration in seconds.
             do_normalize (`bool`, defaults to `True`):
             raise ValueError("You need to specify either a `text` input to process.")
         if sampling_rate is None:
             sampling_rate = self.feature_extractor.sampling_rate
         if time_duration_limit is None:
             time_duration_limit = self.time_duration_limit
         if do_normalize is None:
         inputs_dict = {}
+        text = self._process_text(text)
         text_input = self.tokenizer(
             text=text,

processor_config.json CHANGED Viewed

@@ -5,7 +5,6 @@
   "do_normalize": true,
   "fixed_speech_embeds_length": 100,
   "processor_class": "MERaLiONProcessor",
-  "speech_signature": "<SpeechHere>",
   "speech_token_index": 255999,
   "time_duration_limit": -1
 }

   "do_normalize": true,
   "fixed_speech_embeds_length": 100,
   "processor_class": "MERaLiONProcessor",
   "speech_token_index": 255999,
   "time_duration_limit": -1
 }

tokenizer_config.json CHANGED Viewed

@@ -1987,7 +1987,7 @@
       "special": false
     },
     "255999": {
-      "content": "<speech_token>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,

       "special": false
     },
     "255999": {
+      "content": "<unused99>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,