"""Gradio demo: turn a short text prompt into a comic image, a story, audio and sentiment.

The request flow is: prompt -> comic image (diffusion model) and prompt -> story
(GPT-2) -> speech (gTTS) -> transcription (Whisper) -> sentiment analysis.
"""
import os

import gradio as gr
import matplotlib.pyplot as plt
import torch
import transformers
from diffusers import DiffusionPipeline
from gtts import gTTS
from PIL import Image
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from torchvision.transforms.functional import to_pil_image
from torchvision.transforms.functional import to_tensor
from torchvision.utils import save_image
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer

# Invoke transformers' cache-move helper once at import time.
transformers.utils.move_cache()

example_1 = "ninja turtles fighting against a mosquito, in the sea"
example_2 = "warrior fighting zombies with a sword, in the forest"
example_3 = "western cowboy fighting against a dragon, in the desert"

# Pre-rendered comics for the example prompts, so clicking an example skips
# the slow diffusion step. Paths are relative to the working directory.
# NOTE(review): presumably these files were produced by running
# text_to_comics() on each example once -- confirm they ship with the app.
_EXAMPLE_IMAGES = {
    example_1: "e1_comics_0.png",
    example_2: "e2_comics_0.png",
    example_3: "e3_comics_0.png",
}


def _scale_to_unit_max(image_tensor: torch.Tensor) -> torch.Tensor:
    """Divide *image_tensor* by its maximum so the brightest value becomes 1.

    A tensor whose maximum is not positive (e.g. an all-black image) is
    returned unchanged, because dividing by zero would fill it with NaNs.
    """
    peak = image_tensor.max()
    if peak > 0:
        return image_tensor / peak
    return image_tensor


def load_image(image_path):
    """Load the image at *image_path* and return it as a max-normalized PIL image."""
    # The context manager closes the underlying file handle; to_tensor reads
    # the pixel data while the file is still open.
    with Image.open(image_path) as image:
        image_tensor = to_tensor(image)
    return to_pil_image(_scale_to_unit_max(image_tensor))


def text_to_comics(text):
    """Return a comic-style PIL image for the prompt *text*.

    The three example prompts are served from pre-rendered PNG files. Any
    other prompt is rendered with the "ogkalu/Comic-Diffusion" pipeline; the
    result is also written to ``comics_0.png`` in the working directory.
    """
    prerendered_path = _EXAMPLE_IMAGES.get(text)
    if prerendered_path is not None:
        return load_image(prerendered_path)

    # Named `comic_pipe` so it does not shadow `transformers.pipeline`.
    comic_pipe = DiffusionPipeline.from_pretrained("ogkalu/Comic-Diffusion")
    # `num_images_per_prompt` is the documented way to request one image from
    # a Stable Diffusion pipeline call.
    output = comic_pipe(text, num_images_per_prompt=1)

    image_tensor = _scale_to_unit_max(to_tensor(output.images[0]))
    save_image(image_tensor, "comics_0.png")
    return to_pil_image(image_tensor)


def generate_story(description):
    """Generate a short story with GPT-2, seeded by *description*.

    The prompt is *description* followed by " a thriller/action story."; the
    decoded output (prompt included) is returned as a string.
    """
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    inputs = tokenizer.encode(
        description + " a thriller/action story.", return_tensors="pt"
    )
    # NOTE(review): `temperature` only affects sampling; without
    # `do_sample=True` this call presumably decodes greedily -- confirm
    # whether sampling was intended before changing it.
    outputs = model.generate(
        input_ids=inputs,
        max_length=200,
        num_return_sequences=1,
        temperature=0.7,
        no_repeat_ngram_size=2,
    )
    story = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return story


def convert_to_audio(text):
    """Synthesize *text* to speech with gTTS and return the saved MP3 path."""
    tts = gTTS(text)
    audio_file_path = "audio.mp3"
    tts.save(audio_file_path)
    return audio_file_path


def audio_to_text(audio_file_path):
    """Transcribe the audio file at *audio_file_path* with Whisper large-v2."""
    pipe = pipeline("automatic-speech-recognition", "openai/whisper-large-v2")
    # Transcribe the file the caller passed in, not a hard-coded name.
    result = pipe(audio_file_path)
    print(result)
    return result["text"]


def sentiment_analysis(text):
    """Run the default sentiment-analysis pipeline on *text* and return its raw result."""
    sentiment_analyzer = pipeline("sentiment-analysis")
    result = sentiment_analyzer(text)
    print(result)
    return result


def app(text):
    """Gradio handler: produce the comic, story audio, transcript and sentiment for *text*.

    Returns a 4-tuple ``(comic image, audio file path, transcribed text,
    sentiment result)`` matching the interface's four outputs.
    """
    comics = text_to_comics(text)
    story = generate_story(text)
    audio_file = convert_to_audio(story)
    transcribed_text = audio_to_text(audio_file)
    sentiment = sentiment_analysis(transcribed_text)
    return comics, audio_file, transcribed_text, sentiment


ui = gr.Interface(
    fn=app,
    inputs="text",
    outputs=["image", "audio", "text", "text"],
    title="GenAI Multi-model LLM comics: Type some text get comics!",
    description=(
        "This model generates comics based on the text(max 70 chars) you provide."
        "\nIt does not work on mobile(timeout issue) click on examples if dont want to wait. "
        "\nIt may take ~10-20min to generate the comics."
    ),
    examples=[example_1, example_2, example_3],
)

ui.launch()