File size: 1,195 Bytes
b56dea8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235005a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import gradio as gr
from transformers import BlipForConditionalGeneration, BlipProcessor
import torch
import tempfile
from gtts import gTTS

# Load the BLIP captioning processor and model once at import time so every
# request reuses the same instances.
device = "cpu"

processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-large"
)

model_image_captioning = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
)
# Place the model weights on the selected device.
model_image_captioning = model_image_captioning.to(device)

def generate_caption_tts(image):
    """Caption an image with BLIP and synthesize the caption as speech.

    Args:
        image: The image supplied by the Gradio input component. It is
            passed unchanged to the BLIP processor.

    Returns:
        A ``(caption, audio_path)`` tuple: the decoded caption string and
        the path of a temporary ``.mp3`` file holding the spoken caption.
        The file is not deleted here; the caller owns it.
    """
    # Keep the input tensors on the same device as the model.
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Generation settings go in as explicit keyword arguments instead of
    # being injected into the processor's output mapping.
    outputs = model_image_captioning.generate(
        **inputs,
        max_length=20,
        num_beams=5,
    )

    caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]

    # Reserve a uniquely named file and close the handle right away.
    # tempfile.mkstemp() returns an open OS-level descriptor that was
    # previously discarded, leaking one descriptor per request.
    # delete=False keeps the file on disk so it can be written and served.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        audio_path = tmp.name

    # gTTS reopens the path itself and writes MP3 data into it.
    speech = gTTS(caption, lang="en")
    speech.save(audio_path)

    return (caption, audio_path)


# User-facing texts shown at the top of the demo page.
title = "Alec图像理解器"
description = "Bootstrapping Language-Image Pre-training model演示:引导语言图像预训练以实现统一视觉语言理解和生成。 请上传您的图像"

# Single-function demo: one image input, with the caption text and the
# spoken caption audio as outputs.
iface = gr.Interface(
    generate_caption_tts,
    inputs=gr.inputs.Image(shape=(224, 224)),
    outputs=["text", "audio"],
    description=description,
    title=title,
)

# Start the web UI with default settings. For a public share link and debug
# output, pass share=True and debug=True to launch() instead.
iface.launch()