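"""Gradio demo that chains several models into a small story-telling pipeline:

1. Caption an uploaded photo with BLIP.
2. Expand the caption into a short, friendly story with GPT-2.
3. Read the story aloud with gTTS.
4. Transcribe the generated audio back to text with Whisper.
5. Run sentiment analysis on the transcription.
"""
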
from PIL import Image
from gtts import gTTS
import gradio as gr
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer

def describe_photo(image):
    # Gradio hands the image over as a numpy array; convert it to a PIL image.
    image = Image.fromarray(image.astype('uint8'), 'RGB')
    captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
    results = captioner(image)
    text = results[0]['generated_text']
    print(f"Image caption is: {text}")
    return text

def generate_story(description):
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    prompt = description + " [SEP] A funny and friendly story:"
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    # do_sample=True is required for temperature to take effect;
    # pad_token_id silences the "no pad token" warning for GPT-2.
    outputs = model.generate(input_ids=inputs,
                             max_length=200,
                             num_return_sequences=1,
                             do_sample=True,
                             temperature=0.7,
                             no_repeat_ngram_size=2,
                             pad_token_id=tokenizer.eos_token_id)
    story = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return story

def convert_to_audio(text):
    # Synthesize the story with Google Text-to-Speech and save it to disk.
    tts = gTTS(text)
    audio_file_path = "audio.mp3"
    tts.save(audio_file_path)
    return audio_file_path

def audio_to_text(audio_file_path):
    # Transcribe the generated audio back to text with Whisper.
    pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2")
    result = pipe(audio_file_path)
    print(result)
    return result['text']

def sentiment_analysis(text):
    sentiment_analyzer = pipeline("sentiment-analysis")
    result = sentiment_analyzer(text)
    print(result)
    # The pipeline returns a list of dicts; format the top result for the text output.
    return f"{result[0]['label']} ({result[0]['score']:.2f})"

def app(image):
    description = describe_photo(image)
    story = generate_story(description)
    audio_file = convert_to_audio(story)
    transcribed_text = audio_to_text(audio_file)
    sentiment = sentiment_analysis(transcribed_text)
    return description, audio_file, transcribed_text, sentiment

ui = gr.Interface(
    fn=app,
    inputs="image",
    outputs=["text", "audio", "text", "text"],
    title="Diego's Story Telling Multimodal LLM Gen AI"
)

ui.launch()
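
# Assumed dependency set (not pinned by this file): gradio, transformers, torch,
# pillow, and gtts; ffmpeg is needed for Whisper audio decoding. Run locally with
# `python app.py`.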