Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
1littlecoder
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -1,15 +1,7 @@
|
|
1 |
import os
|
2 |
-
import tempfile
|
3 |
-
import shutil
|
4 |
-
import numpy as np
|
5 |
-
import requests
|
6 |
import google.generativeai as genai
|
7 |
import gradio as gr
|
8 |
-
import
|
9 |
-
import matplotlib.pyplot as plt
|
10 |
-
from matplotlib.animation import FuncAnimation
|
11 |
-
import PIL.Image
|
12 |
-
from gradio import processing_utils, utils
|
13 |
|
14 |
# Configure Google Gemini API
|
15 |
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
|
@@ -18,12 +10,20 @@ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
|
|
18 |
API_KEY = os.getenv('PLAY_API_KEY')
|
19 |
USER_ID = os.getenv('PLAY_USER_ID')
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
# Function to upload image to Gemini and get roasted text
|
22 |
def upload_to_gemini(path, mime_type="image/jpeg"):
|
23 |
file = genai.upload_file(path, mime_type=mime_type)
|
24 |
return file
|
25 |
|
26 |
def generate_roast(image_path):
|
|
|
27 |
uploaded_file = upload_to_gemini(image_path)
|
28 |
generation_config = {
|
29 |
"temperature": 1,
|
@@ -35,12 +35,16 @@ def generate_roast(image_path):
|
|
35 |
model = genai.GenerativeModel(
|
36 |
model_name="gemini-1.5-flash-002",
|
37 |
generation_config=generation_config,
|
38 |
-
system_instruction="You are a professional satirist and fashion expert.
|
|
|
|
|
|
|
|
|
39 |
)
|
40 |
-
chat_session = model.start_chat(history=[{"role": "user", "parts": [uploaded_file]}])
|
41 |
response = chat_session.send_message("Roast this image!")
|
42 |
return response.text
|
43 |
|
|
|
44 |
def text_to_speech(text):
|
45 |
url = "https://api.play.ht/api/v2/tts/stream"
|
46 |
payload = {
|
@@ -54,6 +58,7 @@ def text_to_speech(text):
|
|
54 |
"Authorization": API_KEY,
|
55 |
"X-User-ID": USER_ID
|
56 |
}
|
|
|
57 |
response = requests.post(url, json=payload, headers=headers)
|
58 |
if response.status_code == 200:
|
59 |
audio_path = "output_audio.mp3"
|
@@ -61,112 +66,27 @@ def text_to_speech(text):
|
|
61 |
audio_file.write(response.content)
|
62 |
return audio_path
|
63 |
else:
|
64 |
-
|
65 |
-
|
66 |
-
# Generate waveform
|
67 |
-
def make_waveform(
|
68 |
-
audio,
|
69 |
-
bg_color="#f3f4f6",
|
70 |
-
bg_image=None,
|
71 |
-
fg_alpha=0.75,
|
72 |
-
bars_color=("#fbbf24", "#ea580c"),
|
73 |
-
bar_count=50,
|
74 |
-
bar_width=0.6,
|
75 |
-
animate=False
|
76 |
-
):
|
77 |
-
import numpy as np
|
78 |
-
import matplotlib.pyplot as plt
|
79 |
-
from matplotlib.animation import FuncAnimation
|
80 |
-
import tempfile
|
81 |
-
import shutil
|
82 |
-
import PIL.Image
|
83 |
-
|
84 |
-
if isinstance(audio, str):
|
85 |
-
audio = processing_utils.audio_from_file(audio)
|
86 |
-
|
87 |
-
duration = round(len(audio[1]) / audio[0], 4)
|
88 |
-
samples = audio[1]
|
89 |
-
if len(samples.shape) > 1:
|
90 |
-
samples = np.mean(samples, 1)
|
91 |
-
bins_to_pad = bar_count - (len(samples) % bar_count)
|
92 |
-
samples = np.pad(samples, [(0, bins_to_pad)])
|
93 |
-
samples = np.reshape(samples, (bar_count, -1))
|
94 |
-
samples = np.abs(samples)
|
95 |
-
samples = np.max(samples, 1)
|
96 |
-
|
97 |
-
# Color gradient for bars
|
98 |
-
def hex_to_rgb(hex_str):
|
99 |
-
return [int(hex_str[i : i + 2], 16) for i in range(1, 6, 2)]
|
100 |
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
rgb_colors = [((1 - mix) * c1_rgb + (mix * c2_rgb)) for mix in mix_pcts]
|
106 |
-
return [
|
107 |
-
"#" + "".join(f"{int(round(val * 255)):02x}" for val in item)
|
108 |
-
for item in rgb_colors
|
109 |
-
]
|
110 |
-
|
111 |
-
color = (
|
112 |
-
bars_color
|
113 |
-
if isinstance(bars_color, str)
|
114 |
-
else get_color_gradient(bars_color[0], bars_color[1], bar_count)
|
115 |
-
)
|
116 |
-
|
117 |
-
fig, ax = plt.subplots(figsize=(5, 1), dpi=200, frameon=False)
|
118 |
-
fig.subplots_adjust(left=0, bottom=0, right=1, top=1)
|
119 |
-
plt.axis("off")
|
120 |
-
plt.margins(x=0)
|
121 |
-
|
122 |
-
barcollection = ax.bar(
|
123 |
-
np.arange(0, bar_count),
|
124 |
-
samples * 2,
|
125 |
-
bottom=(-1 * samples),
|
126 |
-
width=bar_width,
|
127 |
-
color=color,
|
128 |
-
alpha=fg_alpha,
|
129 |
-
)
|
130 |
-
|
131 |
-
# Temporary output file
|
132 |
-
tmp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
|
133 |
-
savefig_kwargs = {"facecolor": bg_color} if bg_image is None else {"transparent": True}
|
134 |
-
plt.savefig(tmp_img.name, **savefig_kwargs)
|
135 |
-
|
136 |
-
# Use ffmpeg to create video
|
137 |
-
output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
|
138 |
-
ffmpeg_cmd = [
|
139 |
-
shutil.which("ffmpeg"),
|
140 |
-
"-loop", "1",
|
141 |
-
"-i", tmp_img.name,
|
142 |
-
"-i", audio,
|
143 |
-
"-c:v", "libx264",
|
144 |
-
"-c:a", "aac",
|
145 |
-
"-shortest",
|
146 |
-
"-y",
|
147 |
-
output_video_path,
|
148 |
-
]
|
149 |
-
subprocess.run(ffmpeg_cmd, check=True)
|
150 |
-
return output_video_path
|
151 |
-
|
152 |
-
# Full Gradio Interface Function
|
153 |
-
def process_image(image):
|
154 |
-
roast_text = generate_roast(image)
|
155 |
-
audio_path = text_to_speech(roast_text)
|
156 |
-
final_video_path = make_waveform(audio_path, bg_image=image, animate=True)
|
157 |
-
return roast_text, final_video_path
|
158 |
-
|
159 |
-
# Gradio Blocks UI
|
160 |
-
with gr.Blocks() as demo:
|
161 |
-
gr.Markdown("# Image Roast and Waveform Video Generator")
|
162 |
|
163 |
with gr.Row():
|
164 |
-
|
165 |
-
|
166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
|
168 |
-
submit_button = gr.Button("Generate Roast
|
169 |
-
submit_button.click(process_image, inputs=image_input, outputs=[output_text,
|
170 |
|
171 |
# Launch the app
|
172 |
-
demo.launch(debug=True)
|
|
|
1 |
import os
|
|
|
|
|
|
|
|
|
2 |
import google.generativeai as genai
|
3 |
import gradio as gr
|
4 |
+
import requests
|
|
|
|
|
|
|
|
|
5 |
|
6 |
# Configure Google Gemini API
|
7 |
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
|
|
|
10 |
API_KEY = os.getenv('PLAY_API_KEY')
|
11 |
USER_ID = os.getenv('PLAY_USER_ID')
|
12 |
|
13 |
+
# Theme selection: go with this built-in hue for now, before switching to the branded color.
|
14 |
+
#theme={"primary_hue": "#b4fd83"}
|
15 |
+
theme = gr.themes.Base(
|
16 |
+
primary_hue="emerald",
|
17 |
+
)
|
18 |
+
|
19 |
+
|
20 |
# Function to upload image to Gemini and get roasted text
|
21 |
def upload_to_gemini(path, mime_type="image/jpeg"):
    """Upload the file at ``path`` through ``genai.upload_file``.

    Args:
        path: Location of the file to upload.
        mime_type: MIME type forwarded to the upload call; defaults to
            ``"image/jpeg"``.

    Returns:
        Whatever ``genai.upload_file`` returns for the uploaded file.
    """
    return genai.upload_file(path, mime_type=mime_type)
|
24 |
|
25 |
def generate_roast(image_path):
|
26 |
+
# Upload the image to Gemini and get the text
|
27 |
uploaded_file = upload_to_gemini(image_path)
|
28 |
generation_config = {
|
29 |
"temperature": 1,
|
|
|
35 |
model = genai.GenerativeModel(
|
36 |
model_name="gemini-1.5-flash-002",
|
37 |
generation_config=generation_config,
|
38 |
+
system_instruction="You are a professional satirist and fashion expert. You will be given a profile picture. Your duty is to roast whatever is given to you in the funniest way possible!",
|
39 |
+
)
|
40 |
+
|
41 |
+
chat_session = model.start_chat(
|
42 |
+
history=[{"role": "user", "parts": [uploaded_file]}]
|
43 |
)
|
|
|
44 |
response = chat_session.send_message("Roast this image!")
|
45 |
return response.text
|
46 |
|
47 |
+
# Function to convert text to speech with Play.ht
|
48 |
def text_to_speech(text):
|
49 |
url = "https://api.play.ht/api/v2/tts/stream"
|
50 |
payload = {
|
|
|
58 |
"Authorization": API_KEY,
|
59 |
"X-User-ID": USER_ID
|
60 |
}
|
61 |
+
|
62 |
response = requests.post(url, json=payload, headers=headers)
|
63 |
if response.status_code == 200:
|
64 |
audio_path = "output_audio.mp3"
|
|
|
66 |
audio_file.write(response.content)
|
67 |
return audio_path
|
68 |
else:
|
69 |
+
return f"Error: {response.status_code} - {response.text}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
+
# Gradio Interface
|
72 |
+
with gr.Blocks(theme = theme) as demo:
|
73 |
+
gr.Markdown("# Image to Text-to-Speech Roasting App")
|
74 |
+
gr.Markdown("Upload an image, and the AI will roast it and convert the roast to audio.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
with gr.Row():
|
77 |
+
with gr.Column():
|
78 |
+
image_input = gr.Image(type="filepath", label="Upload Image")
|
79 |
+
with gr.Column():
|
80 |
+
output_text = gr.Textbox(label="Roast Text")
|
81 |
+
audio_output = gr.Audio(label="Roast Audio")
|
82 |
+
|
83 |
+
def process_image(image):
    """Roast an uploaded image and convert the roast to speech.

    Args:
        image: The image value handed over by the UI; it is passed
            unchanged to ``generate_roast``.

    Returns:
        A ``(roast_text, audio_path)`` tuple. ``audio_path`` is ``None``
        when text-to-speech did not produce an audio file, so the audio
        output stays empty while the roast text is still shown.
    """
    roast_text = generate_roast(image)
    audio_path = text_to_speech(roast_text)

    # On failure text_to_speech returns an "Error: ..." message instead of
    # a file path. Forwarding that message to the audio output would make
    # it be treated as a path, so report it and return no audio instead.
    if not (audio_path and os.path.isfile(audio_path)):
        print(f"Text-to-speech failed: {audio_path}")
        return roast_text, None

    return roast_text, audio_path
|
87 |
|
88 |
+
submit_button = gr.Button("Generate Roast")
|
89 |
+
submit_button.click(process_image, inputs=image_input, outputs=[output_text, audio_output])
|
90 |
|
91 |
# Launch the app
|
92 |
+
demo.launch(debug=True)
|