archit11 committed
Commit ee53056
1 Parent(s): d649fba

Update app.py

Files changed (1):
  1. app.py +17 -21
app.py CHANGED
@@ -3,55 +3,51 @@ import gradio as gr
 import librosa
 import torch
 import spaces
+import numpy as np
 
 @spaces.GPU(duration=120)
 def transcribe_and_respond(audio_file):
     try:
-        # Load the model pipeline
         pipe = transformers.pipeline(
             model='sarvamai/shuka_v1',
             trust_remote_code=True,
             device=0,
             torch_dtype=torch.bfloat16
         )
-
+
         # Load the audio file
         audio, sr = librosa.load(audio_file, sr=16000)
-
-        # Print the path of the audio file
-        print(f"Audio file path: {audio_file}")
-
-        # Prepare turns with a placeholder for the audio
+
+        # Print audio properties for debugging
+        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
+
         turns = [
             {'role': 'system', 'content': 'Respond naturally and informatively.'},
             {'role': 'user', 'content': '<|audio|>'}
         ]
-
-        # Print the constructed prompt
-        print(f"Constructed prompt: {turns}")
-
-        # Run the pipeline with the audio and constructed prompt
+
+        # Debug: Print the initial turns
+        print(f"Initial turns: {turns}")
+
+        # Call the model with the audio and prompt
         output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
-
-        # Print the output from the model
+
+        # Debug: Print the final output from the model
        print(f"Model output: {output}")
-
-        # Return the output for the Gradio interface
+
        return output
 
     except Exception as e:
         return f"Error: {str(e)}"
 
-# Set up the Gradio interface
 iface = gr.Interface(
     fn=transcribe_and_respond,
-    inputs=gr.Audio(sources="microphone", type="filepath"),  # Accept audio input from microphone
-    outputs="text",  # Output as text
+    inputs=gr.Audio(sources="microphone", type="filepath"),
+    outputs="text",
     title="Live Transcription and Response",
     description="Speak into your microphone, and the model will respond naturally and informatively.",
-    live=True  # Enable live processing
+    live=True
 )
 
-# Launch the interface
 if __name__ == "__main__":
     iface.launch()
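
For context, Gradio's `type="filepath"` input hands `transcribe_and_respond` a path to the recorded clip, so the same flow can be exercised outside the Space. Below is a minimal sketch that mirrors the updated app.py, assuming `transformers` is importable (it is imported above the shown hunk), a GPU is available for `device=0`, and `sample.wav` is a hypothetical placeholder for a local audio file:

# Sketch of the same pipeline call app.py makes, outside Gradio.
# 'sample.wav' is a placeholder; shuka_v1 expects 16 kHz mono audio.
import librosa
import torch
import transformers

pipe = transformers.pipeline(
    model='sarvamai/shuka_v1',
    trust_remote_code=True,
    device=0,
    torch_dtype=torch.bfloat16
)

# Resample to 16 kHz, exactly as transcribe_and_respond does.
audio, sr = librosa.load('sample.wav', sr=16000)

turns = [
    {'role': 'system', 'content': 'Respond naturally and informatively.'},
    {'role': 'user', 'content': '<|audio|>'}  # placeholder the pipeline fills with the audio input
]

output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
print(output)

This is the same preprocessing and prompt structure as the committed function; only the Gradio wrapper and the debug prints are omitted.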