sagar007 committed
Commit 7e2d83a · verified · 1 Parent(s): b7bcf0a

Update app.py

Files changed (1): app.py (+87, -70)
app.py CHANGED
@@ -1,122 +1,140 @@
-import spaces
-import gradio as gr
 import torch
-import os
+import librosa
 from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, AutoModelForCausalLM, AutoProcessor
 from gtts import gTTS
-from langdetect import detect
+import gradio as gr
+import spaces
+from PIL import Image
 import subprocess
-from io import BytesIO
+
+print("Using GPU for operations when available")
 
 # Install flash-attn
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
-# Disable CUDA initialization at import
-os.environ['CUDA_VISIBLE_DEVICES'] = ''
-torch.set_grad_enabled(False)
-
-print("CUDA initialization disabled at import")
+# Function to safely load pipeline within a GPU-decorated function
+@spaces.GPU
+def load_pipeline(model_name, **kwargs):
+    try:
+        device = 0 if torch.cuda.is_available() else "cpu"
+        return pipeline(model=model_name, device=device, **kwargs)
+    except Exception as e:
+        print(f"Error loading {model_name} pipeline: {e}")
+        return None
 
+# Load Whisper model for speech recognition within a GPU-decorated function
 @spaces.GPU
 def load_whisper():
     try:
+        device = 0 if torch.cuda.is_available() else "cpu"
         processor = WhisperProcessor.from_pretrained("openai/whisper-small")
-        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
         return processor, model
     except Exception as e:
         print(f"Error loading Whisper model: {e}")
         return None, None
 
+# Load sarvam-2b for text generation within a GPU-decorated function
 @spaces.GPU
-def load_vision_model():
-    try:
-        model_id = "microsoft/Phi-3.5-vision-instruct"
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, trust_remote_code=True, torch_dtype=torch.float16
-        )
-        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
-        return model, processor
-    except Exception as e:
-        print(f"Error loading vision model: {e}")
-        return None, None
+def load_sarvam():
+    return load_pipeline('sarvamai/sarvam-2b-v0.5')
 
+# Load vision model
 @spaces.GPU
-def load_sarvam():
-    try:
-        return pipeline('sarvamai/sarvam-2b-v0.5')
-    except Exception as e:
-        print(f"Error loading Sarvam model: {e}")
-        return None
+def load_vision_model():
+    model_id = "microsoft/Phi-3.5-vision-instruct"
+    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype="auto", attn_implementation="flash_attention_2").cuda().eval()
+    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+    return model, processor
 
+# Process audio input within a GPU-decorated function
 @spaces.GPU
-def process_audio(audio_path, whisper_processor, whisper_model):
-    import librosa
+def process_audio_input(audio, whisper_processor, whisper_model):
+    if whisper_processor is None or whisper_model is None:
+        return "Error: Speech recognition model is not available. Please type your message instead."
+
     try:
-        audio, sr = librosa.load(audio_path, sr=16000)
-        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features
+        audio, sr = librosa.load(audio, sr=16000)
+        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to(whisper_model.device)
         predicted_ids = whisper_model.generate(input_features)
         transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
         return transcription
     except Exception as e:
-        return f"Error processing audio: {str(e)}"
+        return f"Error processing audio: {str(e)}. Please type your message instead."
 
+# Generate response within a GPU-decorated function
 @spaces.GPU
-def process_image(image, text_prompt, vision_model, vision_processor):
+def text_to_speech(text, lang='hi'):
     try:
-        messages = [{"role": "user", "content": f"{text_prompt}\n<|image_1|>"}]
-        prompt = vision_processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = vision_processor(prompt, image, return_tensors="pt")
-        generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, temperature=0.2, do_sample=True)
-        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
-        response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        return response
+        # Use a better TTS engine for Indic languages
+        if lang in ['hi', 'bn', 'gu', 'kn', 'ml', 'mr', 'or', 'pa', 'ta', 'te']:
+            tts = gTTS(text=text, lang=lang, tld='co.in')  # Use Indian TLD
+        else:
+            tts = gTTS(text=text, lang=lang)
+
+        tts.save("response.mp3")
+        return "response.mp3"
     except Exception as e:
-        return f"Error processing image: {str(e)}"
+        print(f"Error in text-to-speech: {str(e)}")
+        return None
+
+# Detect language (placeholder function, replace with actual implementation)
+def detect_language(text):
+    # Implement language detection logic here
+    return 'en'  # Default to English for now
 
 @spaces.GPU
 def generate_response(transcription, sarvam_pipe):
+    if sarvam_pipe is None:
+        return "Error: Text generation model is not available."
+
     try:
+        # Generate response using the sarvam-2b model
        response = sarvam_pipe(transcription, max_length=100, num_return_sequences=1)[0]['generated_text']
        return response
     except Exception as e:
         return f"Error generating response: {str(e)}"
 
-def text_to_speech(text, lang='hi'):
+@spaces.GPU
+def process_image(image, text_input, vision_model, vision_processor):
     try:
-        tts = gTTS(text=text, lang=lang, tld='co.in')
-        tts.save("response.mp3")
-        return "response.mp3"
+        prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
+        image = Image.fromarray(image).convert("RGB")
+        inputs = vision_processor(prompt, image, return_tensors="pt").to("cuda:0")
+        generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, eos_token_id=vision_processor.tokenizer.eos_token_id)
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        return response
     except Exception as e:
-        print(f"Error in text-to-speech: {str(e)}")
-        return None
+        return f"Error processing image: {str(e)}"
 
 @spaces.GPU
-def indic_vision_assistant(input_type, audio_input, text_input, image_input):
+def multimodal_assistant(input_type, audio_input, text_input, image_input):
     try:
+        # Load models within the GPU-decorated function
         whisper_processor, whisper_model = load_whisper()
-        vision_model, vision_processor = load_vision_model()
         sarvam_pipe = load_sarvam()
+        vision_model, vision_processor = load_vision_model()
 
         if input_type == "audio" and audio_input is not None:
-            transcription = process_audio(audio_input, whisper_processor, whisper_model)
+            transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
         elif input_type == "text" and text_input:
             transcription = text_input
         elif input_type == "image" and image_input is not None:
-            text_prompt = text_input if text_input else "Describe this image in detail."
-            transcription = process_image(image_input, text_prompt, vision_model, vision_processor)
+            return process_image(image_input, text_input, vision_model, vision_processor), None
         else:
-            return "Please provide either audio, text, or image input.", "No input provided.", None
+            return "Please provide either audio, text, or image input.", None
 
         response = generate_response(transcription, sarvam_pipe)
-        lang = detect(response)
+        lang = detect_language(response)
         audio_response = text_to_speech(response, lang)
 
-        return transcription, response, audio_response
+        return response, audio_response
     except Exception as e:
         error_message = f"An error occurred: {str(e)}"
-        return error_message, error_message, None
+        return error_message, None
 
-# Custom CSS
+# Custom CSS (you can keep your existing custom CSS here)
 custom_css = """
 body {
     background-color: #0b0f19;
@@ -183,12 +201,12 @@ footer {
 }
 """
 
-# Custom HTML for the header
+# Custom HTML for the header (you can keep your existing custom header here)
 custom_header = """
 <div id="custom-header">
     <h1>
-        <span class="blue">Hello,</span>
-        <span class="pink">User</span>
+        <span class="blue">Multimodal</span>
+        <span class="pink">Indic Assistant</span>
     </h1>
     <h2>How can I help you today?</h2>
 </div>
@@ -206,7 +224,7 @@ custom_suggestions = """
         <p>Type in any Indic language</p>
     </div>
     <div class="suggestion">
-        <span class="suggestion-icon">🖼️</span>
+        <span class="suggestion-icon">📷</span>
         <p>Upload an image for analysis</p>
     </div>
     <div class="suggestion">
@@ -220,7 +238,7 @@ custom_suggestions = """
     </div>
 """
 
-# Gradio interface
+# Create Gradio interface
 with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     body_background_fill="#0b0f19",
     body_text_color="#e2e8f0",
@@ -235,25 +253,24 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
 
     with gr.Row():
         with gr.Column(scale=1):
-            gr.Markdown("### Indic Vision Assistant")
+            gr.Markdown("### Multimodal Indic Assistant")
 
             input_type = gr.Radio(["audio", "text", "image"], label="Input Type", value="audio")
             audio_input = gr.Audio(type="filepath", label="Speak (if audio input selected)")
-            text_input = gr.Textbox(label="Type your message or image prompt")
-            image_input = gr.Image(type="pil", label="Upload an image (if image input selected)")
+            text_input = gr.Textbox(label="Type your message or image question")
+            image_input = gr.Image(label="Upload an image (if image input selected)")
 
             submit_btn = gr.Button("Submit")
 
-            output_transcription = gr.Textbox(label="Transcription/Input")
             output_response = gr.Textbox(label="Generated Response")
             output_audio = gr.Audio(label="Audio Response")
 
             submit_btn.click(
-                fn=indic_vision_assistant,
+                fn=multimodal_assistant,
                 inputs=[input_type, audio_input, text_input, image_input],
-                outputs=[output_transcription, output_response, output_audio]
+                outputs=[output_response, output_audio]
             )
-    gr.HTML("<footer>Powered by Indic Language AI with Vision Capabilities</footer>")
+    gr.HTML("<footer>Powered by Multimodal Indic Language AI</footer>")
 
 # Launch the app
 iface.launch()