import transformers
import gradio as gr
import librosa
import torch
import spaces

@spaces.GPU(duration=120)
def transcribe_and_respond(audio_file):
    try:
        # Load the model pipeline
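        # Note: the pipeline is re-created on every call, which adds latency but keeps
        # GPU initialization inside the @spaces.GPU context (assumption: on ZeroGPU the
        # CUDA device is only available within decorated functions). Caching the pipeline
        # at module level is an option on a Space with a persistent GPU.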
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16
        )
        
        # Load the audio file
        audio, sr = librosa.load(audio_file, sr=16000)
        
        # Print the path of the audio file
        print(f"Audio file path: {audio_file}")
        
        # Prepare turns with a placeholder for the audio
        turns = [
            {'role': 'system', 'content': 'Respond naturally and informatively.'},
            {'role': 'user', 'content': '<|audio|>'}
        ]
        
        # Print the constructed prompt
        print(f"Constructed prompt: {turns}")
        
        # Run the pipeline with the audio and constructed prompt
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
        
        # Print the output from the model
        print(f"Model output: {output}")
        
        # Return the output for the Gradio interface
        return output

    except Exception as e:
        return f"Error: {str(e)}"

# Set up the Gradio interface
iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),  # Accept audio input from the microphone
    outputs="text",  # Output as text
    title="Live Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    live=True  # Enable live processing
)

# Launch the interface
if __name__ == "__main__":
    iface.launch()