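"""Gradio demo: Georgian speech-to-text with NVIDIA NeMo.

Uploaded audio is converted to mono WAV with pydub, written to a temporary
file, and transcribed with nvidia/stt_ka_fastconformer_hybrid_large_pc.
"""
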
import nemo.collections.asr as nemo_asr
from pydub import AudioSegment
from io import BytesIO
import tempfile
import os
import gradio as gr

def convert_to_mono(input_file):
    # Load the audio file (supports various formats)
    sound = AudioSegment.from_file(input_file)

    # Convert to mono
    sound = sound.set_channels(1)

    # Export the mono audio file into a BytesIO object
    converted = BytesIO()
    sound.export(converted, format="wav")
    converted.seek(0)  # Move the pointer to the start of the BytesIO object
    return converted
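
# Note: this helper only fixes the channel count; NeMo's audio loader should
# resample the file to the model's expected rate (16 kHz) on its own. To
# resample up front instead, sound.set_frame_rate(16000) could be chained in.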


# Load the pre-trained ASR model
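# (downloaded on first run and cached locally, so the initial startup may take
# a while and needs network access)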
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
    model_name="nvidia/stt_ka_fastconformer_hybrid_large_pc"
)


def transcribe_audio(audio_file):
    if audio_file:
        # Convert the uploaded audio to mono
        mono_audio = convert_to_mono(audio_file)

        # Write the mono audio to a temporary file and close it before transcribing
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            temp_file.write(mono_audio.read())
            temp_file_path = temp_file.name

        # Transcribe the audio using the temporary file path
        res = asr_model.transcribe([temp_file_path])

        # Clean up the temporary file
        os.remove(temp_file_path)

        # Return the best transcription hypothesis for the single input file
        return res[0][0]

    # No audio was provided; return an empty string so the textbox stays blank
    return ""
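
# Note: depending on the NeMo release, transcribe() may return Hypothesis
# objects rather than plain strings; in that case res[0].text would hold the
# transcription instead of res[0][0].

# Earlier in-memory variant, kept for reference. NeMo's transcribe() expects
# file paths (newer releases also accept numpy arrays) rather than file-like
# objects such as BytesIO, which is presumably why the temporary-file approach
# above is used instead.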
# def transcribe_audio(audio_file):
#     if audio_file:
#         # Convert the uploaded audio to mono
#         mono_audio = convert_to_mono(audio_file)
        
#         # Transcribe the audio using the BytesIO object directly
#         audio_data = mono_audio.read()
        
#         # Use the audio_data in the format expected by the ASR model
#         res = asr_model.transcribe([BytesIO(audio_data)])
        
#         # Return the transcription result
#         return res[0][0]


# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),  # Allow audio file uploads and get the file path
    outputs="text",  # Display the transcription result as text
    title="ASR Transcription",
    description="Upload an audio file (mp3, wav, or m4a) and get the transcription."
)

# Launch the Gradio interface
interface.launch()
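
# launch() uses Gradio's defaults (local server, no public URL). When running
# on a remote machine, passing share=True to launch() is one way to get a
# shareable link.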