import gradio as gr
from transformers import pipeline, AutoProcessor, LlavaForConditionalGeneration
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
from PIL import Image
import torch
import cv2
import soundfile as sf
import time

# --- Set up Models ---

# Pick the device once so every model below lands in the same place; the
# script still runs (slowly) on CPU if no GPU is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# Stable Diffusion for image generation
scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="scheduler")
pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1",
    scheduler=scheduler,
    torch_dtype=dtype,
).to(device)

# LLaVA for vision-based language understanding. The checkpoint is in the
# transformers-native LLaVA format, so it loads via AutoProcessor and
# LlavaForConditionalGeneration rather than a bare tokenizer and causal LM
# (a plain text tokenizer cannot feed it images).
processor = AutoProcessor.from_pretrained("xtuner/llava-llama-3-8b-v1_1-transformers")
model = LlavaForConditionalGeneration.from_pretrained(
    "xtuner/llava-llama-3-8b-v1_1-transformers", torch_dtype=dtype
).to(device)

# Open-source language model for text generation (e.g., GPT-Neo)
gpt_neo_pipe = pipeline(
    "text-generation",
    model="EleutherAI/gpt-neo-1.3B",
    device=0 if device == "cuda" else -1,
)

# Text-to-Speech. The espnet FastSpeech2 checkpoint is not loadable through
# the transformers pipeline; a transformers-native VITS checkpoint (assumed
# here) is substituted so the pipeline call actually works.
text_to_speech = pipeline("text-to-speech", model="facebook/mms-tts-eng")
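# The text-to-speech pipeline returns a dict of the form
# {"audio": np.ndarray, "sampling_rate": int}; the handlers below write that
# array out with soundfile.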

# --- Functions ---

def process_image(image, chat_history):
    """Describes an uploaded image with LLaVA and speaks the response aloud."""
    if image is None:  # the change event also fires when the image is cleared
        return chat_history, None

    # gr.Image hands the handler a numpy array by default; LLaVA's processor
    # takes the image directly, so no base64 round-trip is needed.
    pil_image = Image.fromarray(image).convert("RGB")

    # The <image> token marks where the vision features are spliced into the
    # prompt. This minimal prompt format is an assumption; the checkpoint's
    # own chat template may give better results.
    prompt = "<image>\nWhat do you see in this image?"
    inputs = processor(text=prompt, images=pil_image, return_tensors="pt").to(device, dtype)

    # Generate response using LLaVA
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=128)
    response = processor.decode(output_ids[0], skip_special_tokens=True)

    # Generate speech from the response
    speech = text_to_speech(response)
    audio_path = "generated_audio.wav"
    sf.write(audio_path, speech["audio"].squeeze(), samplerate=speech["sampling_rate"])

    # Update chat history
    chat_history += "You: [Image]\n"
    chat_history += "Model: " + response + "\n"

    return chat_history, audio_path

def generate_image(prompt, chat_history):
    """Generates an image using Stable Diffusion based on a prompt."""
    image = pipe(
        prompt=prompt,
        guidance_scale=7.5,
        num_inference_steps=50,
    ).images[0]

    # Update chat history
    chat_history += "You: " + prompt + "\n"
    chat_history += "Model: [Generated image]\n"

    return chat_history, image

def process_text(text, chat_history):
    """Processes text, generates a response using GPT-Neo, and generates speech."""
    # Generate response using GPT-Neo
    response = gpt_neo_pipe(
        text,
        max_length=100,
        num_return_sequences=1,
    )[0]["generated_text"]
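    # Note: max_length counts the prompt tokens too, so long inputs leave less
    # room for the reply; max_new_tokens would bound only the generated text.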

    # Generate speech from the response
    speech = text_to_speech(response)
    audio_path = "generated_audio.wav"
    sf.write(audio_path, speech["audio"].squeeze(), samplerate=speech["sampling_rate"])

    # Update chat history
    chat_history += "You: " + text + "\n"
    chat_history += "Model: " + response + "\n"

    return chat_history, audio_path

# --- Webcam Capture ---

def capture_image():
    """Captures a single frame from the webcam as an RGB array."""
    cap = cv2.VideoCapture(0)  # device 0; adjust the index for other cameras
    ret, frame = cap.read()
    cap.release()
    if not ret:
        raise gr.Error("Could not read a frame from the webcam.")
    # OpenCV returns BGR; Gradio and PIL expect RGB.
    return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
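# Returning the raw array (rather than a base64 string) lets gr.Image display
# the capture directly; the same array is what image_input.change later hands
# to process_image.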

# --- Gradio Interface ---

with gr.Blocks() as demo:
    gr.Markdown("## Llama-LLaVA Vision Speech Assistant")
    chat_history = gr.Textbox(label="Chat History", lines=10, interactive=False)
    image_input = gr.Image(label="Uploaded Image")
    text_input = gr.Textbox(label="Enter Text")
    audio_output = gr.Audio(label="Audio Response")

    # Screenshot button
    screenshot_button = gr.Button("Capture Screenshot")
    screenshot_button.click(fn=capture_image, outputs=image_input)

    # Image processing (LLaVA)
    image_input.change(fn=process_image, inputs=[image_input, chat_history], outputs=[chat_history, audio_output])

    # Text processing (GPT-Neo)
    text_input.submit(fn=process_text, inputs=[text_input, chat_history], outputs=[chat_history, audio_output])

    # Image generation (Stable Diffusion)
    with gr.Tab("Image Generation"):
        image_prompt = gr.Textbox(label="Enter image prompt:")
        image_generation_output = gr.Image(label="Generated Image")
        generate_image_button = gr.Button("Generate Image")
        generate_image_button.click(
            fn=generate_image, inputs=[image_prompt, chat_history], outputs=[chat_history, image_generation_output]
        )

    # Webcam stream
    with gr.Tab("Webcam"):
        webcam_output = gr.Image(label="Webcam Feed", interactive=False)

        # A generator event handler streams each yielded frame into the
        # component, refreshing the feed roughly once per second.
        def update_webcam():
            cap = cv2.VideoCapture(0)
            try:
                while True:
                    ret, frame = cap.read()
                    if not ret:
                        break
                    yield cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    time.sleep(1)  # Update every second
            finally:
                cap.release()

        # Start streaming when the page loads; assigning a generator to
        # webcam_output.source (as before) has no effect.
        demo.load(fn=update_webcam, outputs=webcam_output)

demo.launch(share=True)