Yingxu He committed on
Commit
2072c9a
1 Parent(s): 30951e5

Upload processor

Browse files
processing_meralion.py CHANGED
@@ -48,7 +48,6 @@ class MERaLiONProcessor(ProcessorMixin):
48
  tokenizer_class = "GemmaTokenizer"
49
  valid_kwargs = [
50
  "fixed_speech_embeds_length",
51
- "speech_signature",
52
  "speech_token_index",
53
  "time_duration_limit",
54
  "do_normalize"
@@ -59,13 +58,11 @@ class MERaLiONProcessor(ProcessorMixin):
59
  feature_extractor=None,
60
  tokenizer=None,
61
  fixed_speech_embeds_length=100,
62
- speech_signature="<SpeechHere>",
63
  speech_token_index=255999,
64
  time_duration_limit=-1,
65
  do_normalize=True
66
  ):
67
  self.fixed_speech_embeds_length = fixed_speech_embeds_length
68
- self.speech_signature = speech_signature
69
  self.speech_token_index = speech_token_index
70
  self.time_duration_limit = time_duration_limit
71
  self.do_normalize = do_normalize
@@ -74,12 +71,12 @@ class MERaLiONProcessor(ProcessorMixin):
74
 
75
  self.speech_token = self.tokenizer.added_tokens_decoder[self.speech_token_index].content
76
 
77
- def _process_text(self, text, speech_signature):
78
  target_string = self.speech_token * self.fixed_speech_embeds_length
79
  if isinstance(text, list) or isinstance(text, tuple):
80
- pieces = [item.replace(speech_signature, target_string) for item in text]
81
  return pieces
82
- return text.replace(speech_signature, target_string)
83
 
84
  def _slice_audios(self, audios, time_duration_limit, sampling_rate):
85
  if time_duration_limit <= 0:
@@ -101,7 +98,6 @@ class MERaLiONProcessor(ProcessorMixin):
101
  audios: Union[np.ndarray, List[np.ndarray]] = None,
102
  padding: Union[bool, str, PaddingStrategy] = True,
103
  sampling_rate: Optional[int] = None,
104
- speech_signature: Optional[str] = None,
105
  time_duration_limit: Optional[int] = None,
106
  do_normalize: Optional[bool] = None,
107
  **kwargs,
@@ -131,8 +127,6 @@ class MERaLiONProcessor(ProcessorMixin):
131
  lengths).
132
  sampling_rate (`int`, defaults to 16000):
133
  The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
134
- speech_signature (`str`, defaults to `<SpeechHere>`):
135
- The special string marking the location of speech tokens.
136
  time_duration_limit (`int`, defaults -1):
137
  The max input time duration in seconds.
138
  do_normalize (`bool`, defaults to `True`):
@@ -144,8 +138,6 @@ class MERaLiONProcessor(ProcessorMixin):
144
  raise ValueError("You need to specify either a `text` input to process.")
145
  if sampling_rate is None:
146
  sampling_rate = self.feature_extractor.sampling_rate
147
- if speech_signature is None:
148
- speech_signature = self.speech_signature
149
  if time_duration_limit is None:
150
  time_duration_limit = self.time_duration_limit
151
  if do_normalize is None:
@@ -153,7 +145,7 @@ class MERaLiONProcessor(ProcessorMixin):
153
 
154
  inputs_dict = {}
155
 
156
- text = self._process_text(text, speech_signature)
157
 
158
  text_input = self.tokenizer(
159
  text=text,
 
48
  tokenizer_class = "GemmaTokenizer"
49
  valid_kwargs = [
50
  "fixed_speech_embeds_length",
 
51
  "speech_token_index",
52
  "time_duration_limit",
53
  "do_normalize"
 
58
  feature_extractor=None,
59
  tokenizer=None,
60
  fixed_speech_embeds_length=100,
 
61
  speech_token_index=255999,
62
  time_duration_limit=-1,
63
  do_normalize=True
64
  ):
65
  self.fixed_speech_embeds_length = fixed_speech_embeds_length
 
66
  self.speech_token_index = speech_token_index
67
  self.time_duration_limit = time_duration_limit
68
  self.do_normalize = do_normalize
 
71
 
72
  self.speech_token = self.tokenizer.added_tokens_decoder[self.speech_token_index].content
73
 
74
+ def _process_text(self, text):
75
  target_string = self.speech_token * self.fixed_speech_embeds_length
76
  if isinstance(text, list) or isinstance(text, tuple):
77
+ pieces = [item.replace(self.speech_token, target_string) for item in text]
78
  return pieces
79
+ return text.replace(self.speech_token, target_string)
80
 
81
  def _slice_audios(self, audios, time_duration_limit, sampling_rate):
82
  if time_duration_limit <= 0:
 
98
  audios: Union[np.ndarray, List[np.ndarray]] = None,
99
  padding: Union[bool, str, PaddingStrategy] = True,
100
  sampling_rate: Optional[int] = None,
 
101
  time_duration_limit: Optional[int] = None,
102
  do_normalize: Optional[bool] = None,
103
  **kwargs,
 
127
  lengths).
128
  sampling_rate (`int`, defaults to 16000):
129
  The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
 
 
130
  time_duration_limit (`int`, defaults -1):
131
  The max input time duration in seconds.
132
  do_normalize (`bool`, defaults to `True`):
 
138
  raise ValueError("You need to specify either a `text` input to process.")
139
  if sampling_rate is None:
140
  sampling_rate = self.feature_extractor.sampling_rate
 
 
141
  if time_duration_limit is None:
142
  time_duration_limit = self.time_duration_limit
143
  if do_normalize is None:
 
145
 
146
  inputs_dict = {}
147
 
148
+ text = self._process_text(text)
149
 
150
  text_input = self.tokenizer(
151
  text=text,
processor_config.json CHANGED
@@ -5,7 +5,6 @@
5
  "do_normalize": true,
6
  "fixed_speech_embeds_length": 100,
7
  "processor_class": "MERaLiONProcessor",
8
- "speech_signature": "<SpeechHere>",
9
  "speech_token_index": 255999,
10
  "time_duration_limit": -1
11
  }
 
5
  "do_normalize": true,
6
  "fixed_speech_embeds_length": 100,
7
  "processor_class": "MERaLiONProcessor",
 
8
  "speech_token_index": 255999,
9
  "time_duration_limit": -1
10
  }
tokenizer_config.json CHANGED
@@ -1987,7 +1987,7 @@
1987
  "special": false
1988
  },
1989
  "255999": {
1990
- "content": "<speech_token>",
1991
  "lstrip": false,
1992
  "normalized": false,
1993
  "rstrip": false,
 
1987
  "special": false
1988
  },
1989
  "255999": {
1990
+ "content": "<unused99>",
1991
  "lstrip": false,
1992
  "normalized": false,
1993
  "rstrip": false,