Spaces:

jiuuee
/

my-alexa

Runtime error

App Files Community

jiuuee committed on May 2, 2024

Commit

3bdfcbc

verified ·

1 Parent(s): e6d8983

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -57

app.py CHANGED Viewed

@@ -18,37 +18,12 @@ import uuid
 import torch
-from nemo.collections.asr.models import ASRModel
-from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
-from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
 SAMPLE_RATE = 16000 # Hz
 MAX_AUDIO_SECS = 30 # wont try to transcribe if longer than this
-model = ASRModel.from_pretrained("nvidia/canary-1b")
-model.eval()
-# make sure beam size always 1 for consistency
-model.change_decoding_strategy(None)
-decoding_cfg = model.cfg.decoding
-decoding_cfg.beam.beam_size = 1
-model.change_decoding_strategy(decoding_cfg)
-# setup for buffered inference
-model.cfg.preprocessor.dither = 0.0
-model.cfg.preprocessor.pad_to = 0
-feature_stride = model.cfg.preprocessor['window_stride']
-model_stride_in_secs = feature_stride * 8 # 8 = model stride, which is 8 for FastConformer
-frame_asr = FrameBatchMultiTaskAED(
-	asr_model=model,
-	frame_len=40.0,
-	total_buffer=40.0,
-	batch_size=16,
-)
-amp_dtype = torch.float16
 def convert_audio(audio_filepath, tmpdir, utt_id):
 	"""
@@ -78,50 +53,36 @@ def convert_audio(audio_filepath, tmpdir, utt_id):
 	return out_filename, duration
-def transcribe(audio_filepath):
     if audio_filepath is None:
         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
     utt_id = uuid.uuid4()
     with tempfile.TemporaryDirectory() as tmpdir:
-        converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
         # Make manifest file and save
         manifest_data = {
-            "audio_filepath": converted_audio_filepath,
-            "source_lang": "en",
-            "target_lang": "en",
-            "taskname": "asr",
-            "pnc": "no",
-            "answer": "predict",
-            "duration": 10,
         }
         manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
         with open(manifest_filepath, 'w') as fout:
-            json.dump(manifest_data, fout)  # Fix: using json.dump to write manifest data
-        # Call transcribe, passing in manifest filepath
-        if duration < 40:
-            output_text = model.transcribe(manifest_filepath)[0]
-        else:  # Do buffered inference
-            with torch.cuda.amp.autocast(dtype=amp_dtype):  # TODO: make it work if no cuda
-                with torch.no_grad():
-                    hyps = get_buffered_pred_feat_multitaskAED(
-                        frame_asr,
-                        model.cfg.preprocessor,
-                        model_stride_in_secs,
-                        model.device,
-                        manifest=manifest_filepath,
-                        filepaths=None,
-                    )
-                    output_text = hyps[0].text
-    return output_text

 import torch
 SAMPLE_RATE = 16000 # Hz
 MAX_AUDIO_SECS = 30 # wont try to transcribe if longer than this
+src_lang = "en"
+tgt_lang = "en"
+pnc="no"
 def convert_audio(audio_filepath, tmpdir, utt_id):
 	"""
 	return out_filename, duration
+# Load the ASR pipeline
+asr_pipeline = pipeline("automatic-speech-recognition", model="nvidia/canary-1b")
+def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
     if audio_filepath is None:
         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
     utt_id = uuid.uuid4()
     with tempfile.TemporaryDirectory() as tmpdir:
         # Make manifest file and save
         manifest_data = {
+            "audio_filepath": audio_filepath,
+            "source_lang": src_lang,
+            "target_lang": tgt_lang,
+            "taskname": "asr",  # Setting taskname to "asr"
+            "pnc": pnc,
+            "answer": "predict"
         }
         manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
         with open(manifest_filepath, 'w') as fout:
+            json.dump(manifest_data, fout)
+        # Transcribe audio using ASR pipeline
+        transcribed_text = asr_pipeline(audio_filepath)
+        output_text = transcribed_text[0]['transcription']
+    return output_text