Spaces:

mms-meta
/

MMS

Running on L4

App Files Files Community

vineelpratap committed on Aug 9, 2024

Commit

65d863f

verified ·

1 Parent(s): 69b07b9

Update asr_lm_eng.py

Browse files

Files changed (1) hide show

asr_lm_eng.py +48 -63

asr_lm_eng.py CHANGED Viewed

@@ -21,54 +21,56 @@ processor = AutoProcessor.from_pretrained(MODEL_ID)
 model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
-# lm_decoding_config = {}
-# lm_decoding_configfile = hf_hub_download(
-#     repo_id="facebook/mms-cclms",
-#     filename="decoding_config.json",
-#     subfolder="mms-1b-all",
-# )
-# with open(lm_decoding_configfile) as f:
-#     lm_decoding_config = json.loads(f.read())
-# # allow language model decoding for "eng"
-# decoding_config = lm_decoding_config["eng"]
-# lm_file = hf_hub_download(
-#     repo_id="facebook/mms-cclms",
-#     filename=decoding_config["lmfile"].rsplit("/", 1)[1],
-#     subfolder=decoding_config["lmfile"].rsplit("/", 1)[0],
-# )
-# token_file = hf_hub_download(
-#     repo_id="facebook/mms-cclms",
-#     filename=decoding_config["tokensfile"].rsplit("/", 1)[1],
-#     subfolder=decoding_config["tokensfile"].rsplit("/", 1)[0],
-# )
-# lexicon_file = None
-# if decoding_config["lexiconfile"] is not None:
-#     lexicon_file = hf_hub_download(
-#         repo_id="facebook/mms-cclms",
-#         filename=decoding_config["lexiconfile"].rsplit("/", 1)[1],
-#         subfolder=decoding_config["lexiconfile"].rsplit("/", 1)[0],
-#     )
-# beam_search_decoder = ctc_decoder(
-#     lexicon=lexicon_file,
-#     tokens=token_file,
-#     lm=lm_file,
-#     nbest=1,
-#     beam_size=500,
-#     beam_size_token=50,
-#     lm_weight=float(decoding_config["lmweight"]),
-#     word_score=float(decoding_config["wordscore"]),
-#     sil_score=float(decoding_config["silweight"]),
-#     blank_token="<s>",
-# )
 def transcribe(audio_data=None, lang="eng (English)"):
     if not audio_data:
         return "<<ERROR: Empty Audio Input>>"
@@ -113,24 +115,7 @@ def transcribe(audio_data=None, lang="eng (English)"):
     with torch.no_grad():
         outputs = model(**inputs).logits
-    if lang_code != "eng" or True:
-        ids = torch.argmax(outputs, dim=-1)[0]
-        transcription = processor.decode(ids)
-    else:
-        assert False
-        # beam_search_result = beam_search_decoder(outputs.to("cpu"))
-        # transcription = " ".join(beam_search_result[0][0].words).strip()
     return transcription
-ASR_EXAMPLES = [
-    ["upload/english.mp3", "eng (English)"],
-    # ["upload/tamil.mp3", "tam (Tamil)"],
-    # ["upload/burmese.mp3",  "mya (Burmese)"],
-]
-ASR_NOTE = """
-The above demo doesn't use beam-search decoding using a language model.
-Checkout the instructions [here](https://huggingface.co/facebook/mms-1b-all) on how to run LM decoding for better accuracy.
-"""

 model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
+lm_decoding_config = {}
+lm_decoding_configfile = hf_hub_download(
+    repo_id="facebook/mms-cclms",
+    filename="decoding_config.json",
+    subfolder="mms-1b-all",
+)
+with open(lm_decoding_configfile) as f:
+    lm_decoding_config = json.loads(f.read())
+# allow language model decoding for "eng"
+decoding_config = lm_decoding_config["eng"]
+lm_file = hf_hub_download(
+    repo_id="facebook/mms-cclms",
+    filename=decoding_config["lmfile"].rsplit("/", 1)[1],
+    subfolder=decoding_config["lmfile"].rsplit("/", 1)[0],
+)
+token_file = hf_hub_download(
+    repo_id="facebook/mms-cclms",
+    filename=decoding_config["tokensfile"].rsplit("/", 1)[1],
+    subfolder=decoding_config["tokensfile"].rsplit("/", 1)[0],
+)
+lexicon_file = None
+if decoding_config["lexiconfile"] is not None:
+    lexicon_file = hf_hub_download(
+        repo_id="facebook/mms-cclms",
+        filename=decoding_config["lexiconfile"].rsplit("/", 1)[1],
+        subfolder=decoding_config["lexiconfile"].rsplit("/", 1)[0],
+    )
+beam_search_decoder = ctc_decoder(
+    lexicon=lexicon_file,
+    tokens=token_file,
+    lm=lm_file,
+    nbest=1,
+    beam_size=500,
+    beam_size_token=50,
+    lm_weight=float(decoding_config["lmweight"]),
+    word_score=float(decoding_config["wordscore"]),
+    sil_score=float(decoding_config["silweight"]),
+    blank_token="<s>",
+)
 def transcribe(audio_data=None, lang="eng (English)"):
+    assert lang.startswith("eng")
     if not audio_data:
         return "<<ERROR: Empty Audio Input>>"
     with torch.no_grad():
         outputs = model(**inputs).logits
+    beam_search_result = beam_search_decoder(outputs.to("cpu"))
+    transcription = " ".join(beam_search_result[0][0].words).strip()
     return transcription