alecinvan committed · Commit b56dea8 · 1 Parent(s): 8d0b5f2

Create app.py


This uses the BLIP model to generate a caption for an image uploaded through the Gradio web UI, and plays the caption back as speech via TTS at the same time; a short usage sketch follows the diff below.

Files changed (1)
  1. app.py +41 -0
app.py ADDED
@@ -0,0 +1,41 @@
+ import os
+ import tempfile
+ import gradio as gr
+ import torch
+ from gtts import gTTS
+ from transformers import BlipForConditionalGeneration, BlipProcessor
+
+ # Load the BLIP image-captioning model and its processor
+ device = "cpu"
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+ model_image_captioning = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
+
+ def generate_caption_tts(image):
+     # Preprocess the image and generate a caption with beam search
+     inputs = processor(images=image, return_tensors="pt").to(device)
+     outputs = model_image_captioning.generate(**inputs, max_length=20, num_beams=5)
+     caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+
+     # Synthesize the caption as speech and save it to a temporary MP3 file
+     speech = gTTS(caption, lang="en")
+     fd, tmp_path = tempfile.mkstemp(suffix=".mp3")
+     os.close(fd)  # close the descriptor so the file is written only via its path
+     speech.save(tmp_path)
+
+     return caption, tmp_path
+
+
+ title = "Alec Image Interpreter"
+
+ description = "Demo of the Bootstrapping Language-Image Pre-training (BLIP) model: bootstrapping language-image pre-training for unified vision-language understanding and generation. Please upload your image."
+
+ iface = gr.Interface(
+     fn=generate_caption_tts,
+     title=title,
+     description=description,
+     inputs=gr.Image(type="pil"),
+     outputs=["text", "audio"]
+ )
+
+
+ iface.launch(share=True, debug=True)
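
For a quick smoke test of the handler without launching the UI, it can be called directly. A minimal sketch, assuming Pillow is installed and a local test image named example.jpg exists (both the filename and the snippet are illustrative, not part of the commit; gTTS also needs network access to synthesize speech):

from PIL import Image  # Pillow, assumed to be available alongside the app

# Hypothetical test image; substitute any local picture.
caption, audio_path = generate_caption_tts(Image.open("example.jpg"))
print(caption)     # the generated English caption
print(audio_path)  # path to the temporary MP3 written by gTTS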