Spaces:
PlayHT
/
Running on CPU Upgrade

1littlecoder committed on
Commit
0ed5bd6
·
verified ·
1 Parent(s): 0b47c5d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -114
app.py CHANGED
@@ -1,15 +1,7 @@
1
  import os
2
- import tempfile
3
- import shutil
4
- import numpy as np
5
- import requests
6
  import google.generativeai as genai
7
  import gradio as gr
8
- import subprocess
9
- import matplotlib.pyplot as plt
10
- from matplotlib.animation import FuncAnimation
11
- import PIL.Image
12
- from gradio import processing_utils, utils
13
 
14
  # Configure Google Gemini API
15
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
@@ -18,12 +10,20 @@ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
18
  API_KEY = os.getenv('PLAY_API_KEY')
19
  USER_ID = os.getenv('PLAY_USER_ID')
20
 
 
 
 
 
 
 
 
21
  # Function to upload image to Gemini and get roasted text
22
  def upload_to_gemini(path, mime_type="image/jpeg"):
23
  file = genai.upload_file(path, mime_type=mime_type)
24
  return file
25
 
26
  def generate_roast(image_path):
 
27
  uploaded_file = upload_to_gemini(image_path)
28
  generation_config = {
29
  "temperature": 1,
@@ -35,12 +35,16 @@ def generate_roast(image_path):
35
  model = genai.GenerativeModel(
36
  model_name="gemini-1.5-flash-002",
37
  generation_config=generation_config,
38
- system_instruction="You are a professional satirist and fashion expert. Roast the profile picture.",
 
 
 
 
39
  )
40
- chat_session = model.start_chat(history=[{"role": "user", "parts": [uploaded_file]}])
41
  response = chat_session.send_message("Roast this image!")
42
  return response.text
43
 
 
44
  def text_to_speech(text):
45
  url = "https://api.play.ht/api/v2/tts/stream"
46
  payload = {
@@ -54,6 +58,7 @@ def text_to_speech(text):
54
  "Authorization": API_KEY,
55
  "X-User-ID": USER_ID
56
  }
 
57
  response = requests.post(url, json=payload, headers=headers)
58
  if response.status_code == 200:
59
  audio_path = "output_audio.mp3"
@@ -61,112 +66,27 @@ def text_to_speech(text):
61
  audio_file.write(response.content)
62
  return audio_path
63
  else:
64
- raise ValueError(f"Error: {response.status_code} - {response.text}")
65
-
66
- # Generate waveform
67
- def make_waveform(
68
- audio,
69
- bg_color="#f3f4f6",
70
- bg_image=None,
71
- fg_alpha=0.75,
72
- bars_color=("#fbbf24", "#ea580c"),
73
- bar_count=50,
74
- bar_width=0.6,
75
- animate=False
76
- ):
77
- import numpy as np
78
- import matplotlib.pyplot as plt
79
- from matplotlib.animation import FuncAnimation
80
- import tempfile
81
- import shutil
82
- import PIL.Image
83
-
84
- if isinstance(audio, str):
85
- audio = processing_utils.audio_from_file(audio)
86
-
87
- duration = round(len(audio[1]) / audio[0], 4)
88
- samples = audio[1]
89
- if len(samples.shape) > 1:
90
- samples = np.mean(samples, 1)
91
- bins_to_pad = bar_count - (len(samples) % bar_count)
92
- samples = np.pad(samples, [(0, bins_to_pad)])
93
- samples = np.reshape(samples, (bar_count, -1))
94
- samples = np.abs(samples)
95
- samples = np.max(samples, 1)
96
-
97
- # Color gradient for bars
98
- def hex_to_rgb(hex_str):
99
- return [int(hex_str[i : i + 2], 16) for i in range(1, 6, 2)]
100
 
101
- def get_color_gradient(c1, c2, n):
102
- c1_rgb = np.array(hex_to_rgb(c1)) / 255
103
- c2_rgb = np.array(hex_to_rgb(c2)) / 255
104
- mix_pcts = [x / (n - 1) for x in range(n)]
105
- rgb_colors = [((1 - mix) * c1_rgb + (mix * c2_rgb)) for mix in mix_pcts]
106
- return [
107
- "#" + "".join(f"{int(round(val * 255)):02x}" for val in item)
108
- for item in rgb_colors
109
- ]
110
-
111
- color = (
112
- bars_color
113
- if isinstance(bars_color, str)
114
- else get_color_gradient(bars_color[0], bars_color[1], bar_count)
115
- )
116
-
117
- fig, ax = plt.subplots(figsize=(5, 1), dpi=200, frameon=False)
118
- fig.subplots_adjust(left=0, bottom=0, right=1, top=1)
119
- plt.axis("off")
120
- plt.margins(x=0)
121
-
122
- barcollection = ax.bar(
123
- np.arange(0, bar_count),
124
- samples * 2,
125
- bottom=(-1 * samples),
126
- width=bar_width,
127
- color=color,
128
- alpha=fg_alpha,
129
- )
130
-
131
- # Temporary output file
132
- tmp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
133
- savefig_kwargs = {"facecolor": bg_color} if bg_image is None else {"transparent": True}
134
- plt.savefig(tmp_img.name, **savefig_kwargs)
135
-
136
- # Use ffmpeg to create video
137
- output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
138
- ffmpeg_cmd = [
139
- shutil.which("ffmpeg"),
140
- "-loop", "1",
141
- "-i", tmp_img.name,
142
- "-i", audio,
143
- "-c:v", "libx264",
144
- "-c:a", "aac",
145
- "-shortest",
146
- "-y",
147
- output_video_path,
148
- ]
149
- subprocess.run(ffmpeg_cmd, check=True)
150
- return output_video_path
151
-
152
# Full Gradio Interface Function
def process_image(image):
    """Roast the uploaded image, voice the roast, and build a waveform video."""
    roast = generate_roast(image)
    speech_path = text_to_speech(roast)
    video_path = make_waveform(speech_path, bg_image=image, animate=True)
    return roast, video_path

# Gradio Blocks UI
with gr.Blocks() as demo:
    gr.Markdown("# Image Roast and Waveform Video Generator")

    with gr.Row():
        image_input = gr.Image(type="filepath", label="Upload Image")
        output_text = gr.Textbox(label="Roast Text")
        output_video = gr.Video(label="Roast Waveform Video")

    submit_button = gr.Button("Generate Roast Video")
    submit_button.click(process_image, inputs=image_input, outputs=[output_text, output_video])

# Launch the app
demo.launch(debug=True)
 
1
  import os
 
 
 
 
2
  import google.generativeai as genai
3
  import gradio as gr
4
+ import requests
 
 
 
 
5
 
6
  # Configure Google Gemini API
7
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
 
10
  API_KEY = os.getenv('PLAY_API_KEY')
11
  USER_ID = os.getenv('PLAY_USER_ID')
12
 
13
# Theme selection — use this stock Gradio theme until the branded color is ready.
# Branded candidate kept for reference: {"primary_hue": "#b4fd83"}
theme = gr.themes.Base(primary_hue="emerald")
18
+
19
+
20
# Function to upload image to Gemini and get roasted text
def upload_to_gemini(path, mime_type="image/jpeg"):
    """Upload the file at *path* to the Gemini Files API and return its handle.

    Args:
        path: Filesystem path of the file to upload.
        mime_type: MIME type reported to the API (defaults to JPEG).

    Returns:
        The file object returned by ``genai.upload_file``.
    """
    return genai.upload_file(path, mime_type=mime_type)
24
 
25
  def generate_roast(image_path):
26
+ # Upload the image to Gemini and get the text
27
  uploaded_file = upload_to_gemini(image_path)
28
  generation_config = {
29
  "temperature": 1,
 
35
  model = genai.GenerativeModel(
36
  model_name="gemini-1.5-flash-002",
37
  generation_config=generation_config,
38
+ system_instruction="You are a professional satirist and fashion expert. You will be given a profile picture. Your duty is to roast whatever is given to you in the funniest way possible!",
39
+ )
40
+
41
+ chat_session = model.start_chat(
42
+ history=[{"role": "user", "parts": [uploaded_file]}]
43
  )
 
44
  response = chat_session.send_message("Roast this image!")
45
  return response.text
46
 
47
+ # Function to convert text to speech with Play.ht
48
  def text_to_speech(text):
49
  url = "https://api.play.ht/api/v2/tts/stream"
50
  payload = {
 
58
  "Authorization": API_KEY,
59
  "X-User-ID": USER_ID
60
  }
61
+
62
  response = requests.post(url, json=payload, headers=headers)
63
  if response.status_code == 200:
64
  audio_path = "output_audio.mp3"
 
66
  audio_file.write(response.content)
67
  return audio_path
68
  else:
69
+ return f"Error: {response.status_code} - {response.text}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
# Gradio Interface
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("# Image to Text-to-Speech Roasting App")
    gr.Markdown("Upload an image, and the AI will roast it and convert the roast to audio.")

    # Two-column layout: input image on the left, roast text + audio on the right.
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Upload Image")
        with gr.Column():
            output_text = gr.Textbox(label="Roast Text")
            audio_output = gr.Audio(label="Roast Audio")

    def process_image(image):
        """Pipeline callback: roast the uploaded image, then voice the roast."""
        roast = generate_roast(image)
        speech_path = text_to_speech(roast)
        return roast, speech_path

    submit_button = gr.Button("Generate Roast")
    submit_button.click(process_image, inputs=image_input, outputs=[output_text, audio_output])

# Launch the app
demo.launch(debug=True)