gorkemgoknar committed on
Commit
0569c34
1 Parent(s): 9505bbe

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +77 -6
README.md CHANGED
@@ -99,34 +99,105 @@ import torchaudio
99
  from datasets import load_dataset, load_metric
100
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
101
  import re
 
 
 
 
 
 
 
 
102
  test_dataset = load_dataset("common_voice", "tr", split="test")
103
  wer = load_metric("wer")
104
  processor = Wav2Vec2Processor.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
105
  model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
106
  model.to("cuda")
107
- # Note: Not ignoring "'" on this one
108
- chars_to_ignore_regex = """[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]"""
109
 
110
- resampler = torchaudio.transforms.Resample(48_000, 16_000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  # Preprocessing the datasets.
112
  # We need to read the audio files as arrays
113
  def speech_file_to_array_fn(batch):
114
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
115
- speech_array, sampling_rate = torchaudio.load(batch["path"])
116
- batch["speech"] = resampler(speech_array).squeeze().numpy()
 
117
  return batch
118
  test_dataset = test_dataset.map(speech_file_to_array_fn)
119
  # Preprocessing the datasets.
120
  # We need to read the audio files as arrays
121
  def evaluate(batch):
 
122
  inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
123
  with torch.no_grad():
124
  logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
125
  pred_ids = torch.argmax(logits, dim=-1)
126
  batch["pred_strings"] = processor.batch_decode(pred_ids)
127
  return batch
128
- result = test_dataset.map(evaluate, batched=True, batch_size=8)
 
 
 
 
129
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 
130
  ```
131
  **Test Result**: TBD %
132
  ## Training
 
99
  from datasets import load_dataset, load_metric
100
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
101
  import re
102
+ import torch
103
+ import pydub
104
+ from pydub.utils import mediainfo
105
+ import array
106
+ from pydub import AudioSegment
107
+ from pydub.utils import get_array_type
108
+ import numpy as np
109
+
110
  test_dataset = load_dataset("common_voice", "tr", split="test")
111
  wer = load_metric("wer")
112
  processor = Wav2Vec2Processor.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
113
  model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
114
  model.to("cuda")
 
 
115
 
116
+ #Note: Not ignoring "'" on this one
117
+ #Note: Not ignoring "'" on this one
118
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'
119
+
120
+ #resampler = torchaudio.transforms.Resample(48_000, 16_000)
121
+ #using custom load and transformer for audio -> see audio_resampler
122
+ new_sample_rate = 16000
123
+
124
+
125
+
126
+ import torchaudio
127
+ import torch
128
+ import pydub
129
+ import array
130
+ import numpy as np
131
+
132
def audio_resampler(batch, new_sample_rate=16000):
    """Decode one audio file with pydub and attach its samples to ``batch``.

    pydub is used instead of torchaudio/librosa because, per the original
    author's note, mp3 loading did not work on Windows without compiling
    extra libraries.

    Args:
        batch: Mapping with a "path" entry naming the audio file to decode.
        new_sample_rate: Frame rate (Hz) the audio is converted to before the
            samples are extracted.

    Returns:
        The same ``batch`` object, with "speech" set to a NumPy array holding
        the samples of the first channel and "sampling_rate" set to
        ``new_sample_rate``.
    """
    sound = pydub.AudioSegment.from_file(file=batch["path"])
    sound = sound.set_frame_rate(new_sample_rate)

    # Only the first channel is kept; any further channels are discarded.
    left = sound.split_to_mono()[0]

    # get_array_of_samples() is pydub's public accessor for the raw samples;
    # it picks the array type code from the segment's sample width, which is
    # what the hand-rolled array.array(..., left._data) call did before.
    batch["speech"] = np.array(left.get_array_of_samples())
    batch["sampling_rate"] = new_sample_rate

    return batch
156
+
157
+
158
def remove_special_characters(batch, ignore_regex=None):
    """Normalise ``batch["sentence"]`` before it is compared with predictions.

    The cleanup steps run in this order:

    1. Subtitle timestamps such as ``00:01:01`` or ``00:01:01,33`` are
       replaced by a single space.
    2. Bracketed all-caps annotations such as ``[AÇIKLAMA]`` are removed.
    3. An ellipsis that directly follows a word is collapsed to one dot.
    4. Every character matched by ``ignore_regex`` is removed, the text is
       lower-cased and one trailing space is appended.

    Args:
        batch: Mapping with a "sentence" string; it is updated in place.
        ignore_regex: Pattern of characters to strip in step 4. When omitted,
            the module-level ``chars_to_ignore_regex`` is used.

    Returns:
        The same ``batch`` object with the cleaned "sentence".
    """
    if ignore_regex is None:
        ignore_regex = chars_to_ignore_regex

    text = batch["sentence"]

    # Raw strings are required here: in a normal string literal '\b' is the
    # backspace character, so the word-boundary anchors would never match.
    text = re.sub(r'\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', text)

    # Include the Turkish capitals so annotations like [AÇIKLAMA] are caught;
    # a plain [A-Z] class stops at the first non-ASCII letter.
    text = re.sub(r'\[[A-ZÇĞİÖŞÜ]+\]', '', text)

    # Collapse "word..." to "word."; Turkish letters count as word letters.
    text = re.sub(r'([a-zA-ZçğıöşüÇĞİÖŞÜ]+)\.\.\.', r'\1.', text)

    # Standard ignore list, then lower-case and add the trailing space.
    batch["sentence"] = re.sub(ignore_regex, '', text).lower() + " "

    return batch
174
+
175
  # Preprocessing the datasets.
176
  # We need to read the aduio files as arrays
177
  def speech_file_to_array_fn(batch):
178
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
179
+ ##speech_array, sampling_rate = torchaudio.load(batch["path"])
180
+ ##load and conversion done in resampler , takes and returns batch
181
+ batch = audio_resampler(batch, new_sample_rate = new_sample_rate)
182
  return batch
183
  test_dataset = test_dataset.map(speech_file_to_array_fn)
184
  # Preprocessing the datasets.
185
  # We need to read the aduio files as arrays
186
  def evaluate(batch):
187
+
188
  inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
189
  with torch.no_grad():
190
  logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
191
  pred_ids = torch.argmax(logits, dim=-1)
192
  batch["pred_strings"] = processor.batch_decode(pred_ids)
193
  return batch
194
+
195
+ print("EVALUATING:")
196
+
197
+ ##for 8GB RAM on GPU best is batch_size 2 for windows, 4 may fit in linux only
198
+ result = test_dataset.map(evaluate, batched=True, batch_size=2)
199
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
200
+
201
  ```
202
  **Test Result**: TBD %
203
  ## Training