j-tobias committed on
Commit
db6e0bb
·
1 Parent(s): 15f66cd

added new model

Browse files
Files changed (3) hide show
  1. app.py +2 -2
  2. cards.txt +11 -0
  3. processing.py +11 -0
app.py CHANGED
@@ -26,7 +26,7 @@ login(hf_token)
26
 
27
 
28
  # GENERAL OPTIONS FOR MODELS AND DATASETS
29
- MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
30
  DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recording/Sample"]
31
 
32
  # HELPER FUNCTIONS
@@ -43,7 +43,7 @@ def get_card(selected_model:str)->str:
43
  if "ID: "+selected_model in card:
44
  return card
45
 
46
- return "Unknown Model"
47
 
48
  def is_own(selected_option):
49
  """
 
26
 
27
 
28
  # GENERAL OPTIONS FOR MODELS AND DATASETS
29
+ MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2","facebook/hf-seamless-m4t-medium"]
30
  DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recording/Sample"]
31
 
32
  # HELPER FUNCTIONS
 
43
  if "ID: "+selected_model in card:
44
  return card
45
 
46
+ return "## Unknown Model"
47
 
48
  def is_own(selected_option):
49
  """
cards.txt CHANGED
@@ -34,4 +34,15 @@
34
  - Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
35
  - Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
36
 
 
 
 
 
 
 
 
 
 
 
 
37
  (evaluating this model might take a while due to its size)
 
34
  - Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
35
  - Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
36
 
37
+ (evaluating this model might take a while due to its size)
38
+ @@
39
+ #### HF Seamless M4T Medium
40
+ - ID: facebook/hf-seamless-m4t-medium
41
+ - Hugging Face: [model](https://huggingface.co/facebook/hf-seamless-m4t-medium)
42
+ - Creator: facebook
43
+ - Finetuned: No
44
+ - Model Size: 1.2 B Parameters
45
+ - Model Paper: [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf)
46
+ - Training Data: ?
47
+
48
  (evaluating this model might take a while due to its size)
processing.py CHANGED
@@ -2,6 +2,7 @@
2
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
3
  from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
4
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 
5
 
6
  # Import Libraries to access Datasets
7
  from datasets import load_dataset
@@ -251,6 +252,9 @@ def load_model(model_id:str):
251
  processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
252
  model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
253
  model.config.forced_decoder_ids = None
 
 
 
254
  else: # In case no model has been selected the Whipser-Tiny.En is selected - just for completeness
255
  model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
256
  processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
@@ -291,6 +295,13 @@ def model_compute(model, processor, sample, model_id):
291
  transcription = processor.tokenizer.normalize(transcription[0])
292
  print("TRANSCRIPTION Whisper Large v2: ", transcription)
293
  return transcription
 
 
 
 
 
 
 
294
  else: # In case no model has been selected the Whipser-Tiny.En is selected - just for completeness
295
  sample = sample["audio"]
296
  input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
 
2
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
3
  from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
4
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
5
+ from transformers import AutoProcessor, SeamlessM4TModel
6
 
7
  # Import Libraries to access Datasets
8
  from datasets import load_dataset
 
252
  processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
253
  model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
254
  model.config.forced_decoder_ids = None
255
+ elif model_id == "facebook/hf-seamless-m4t-medium":
256
+ processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
257
+ model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
258
  else: # In case no model has been selected the Whipser-Tiny.En is selected - just for completeness
259
  model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
260
  processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
 
295
  transcription = processor.tokenizer.normalize(transcription[0])
296
  print("TRANSCRIPTION Whisper Large v2: ", transcription)
297
  return transcription
298
+ elif model_id == "facebook/hf-seamless-m4t-medium":
299
+ sample = sample["audio"]
300
+ input_data = processor(audios=sample["array"], return_tensors="pt")
301
+ output_tokens = model.generate(**input_data, tgt_lang="eng", generate_speech=False)
302
+ print(output_tokens)
303
+ transcription = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
304
+ return transcription
305
  else: # In case no model has been selected the Whipser-Tiny.En is selected - just for completeness
306
  sample = sample["audio"]
307
  input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features