Spaces:
PlayHT
/
Running on CPU Upgrade

1littlecoder committed on
Commit
0ed5bd6
·
verified ·
1 Parent(s): 0b47c5d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -114
app.py CHANGED
@@ -1,15 +1,7 @@
1
  import os
2
- import tempfile
3
- import shutil
4
- import numpy as np
5
- import requests
6
  import google.generativeai as genai
7
  import gradio as gr
8
- import subprocess
9
- import matplotlib.pyplot as plt
10
- from matplotlib.animation import FuncAnimation
11
- import PIL.Image
12
- from gradio import processing_utils, utils
13
 
14
  # Configure Google Gemini API
15
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
@@ -18,12 +10,20 @@ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
18
  API_KEY = os.getenv('PLAY_API_KEY')
19
  USER_ID = os.getenv('PLAY_USER_ID')
20
 
 
 
 
 
 
 
 
21
  # Function to upload image to Gemini and get roasted text
22
  def upload_to_gemini(path, mime_type="image/jpeg"):
23
  file = genai.upload_file(path, mime_type=mime_type)
24
  return file
25
 
26
  def generate_roast(image_path):
 
27
  uploaded_file = upload_to_gemini(image_path)
28
  generation_config = {
29
  "temperature": 1,
@@ -35,12 +35,16 @@ def generate_roast(image_path):
35
  model = genai.GenerativeModel(
36
  model_name="gemini-1.5-flash-002",
37
  generation_config=generation_config,
38
- system_instruction="You are a professional satirist and fashion expert. Roast the profile picture.",
 
 
 
 
39
  )
40
- chat_session = model.start_chat(history=[{"role": "user", "parts": [uploaded_file]}])
41
  response = chat_session.send_message("Roast this image!")
42
  return response.text
43
 
 
44
  def text_to_speech(text):
45
  url = "https://api.play.ht/api/v2/tts/stream"
46
  payload = {
@@ -54,6 +58,7 @@ def text_to_speech(text):
54
  "Authorization": API_KEY,
55
  "X-User-ID": USER_ID
56
  }
 
57
  response = requests.post(url, json=payload, headers=headers)
58
  if response.status_code == 200:
59
  audio_path = "output_audio.mp3"
@@ -61,112 +66,27 @@ def text_to_speech(text):
61
  audio_file.write(response.content)
62
  return audio_path
63
  else:
64
- raise ValueError(f"Error: {response.status_code} - {response.text}")
65
-
66
- # Generate waveform
67
- def make_waveform(
68
- audio,
69
- bg_color="#f3f4f6",
70
- bg_image=None,
71
- fg_alpha=0.75,
72
- bars_color=("#fbbf24", "#ea580c"),
73
- bar_count=50,
74
- bar_width=0.6,
75
- animate=False
76
- ):
77
- import numpy as np
78
- import matplotlib.pyplot as plt
79
- from matplotlib.animation import FuncAnimation
80
- import tempfile
81
- import shutil
82
- import PIL.Image
83
-
84
- if isinstance(audio, str):
85
- audio = processing_utils.audio_from_file(audio)
86
-
87
- duration = round(len(audio[1]) / audio[0], 4)
88
- samples = audio[1]
89
- if len(samples.shape) > 1:
90
- samples = np.mean(samples, 1)
91
- bins_to_pad = bar_count - (len(samples) % bar_count)
92
- samples = np.pad(samples, [(0, bins_to_pad)])
93
- samples = np.reshape(samples, (bar_count, -1))
94
- samples = np.abs(samples)
95
- samples = np.max(samples, 1)
96
-
97
- # Color gradient for bars
98
- def hex_to_rgb(hex_str):
99
- return [int(hex_str[i : i + 2], 16) for i in range(1, 6, 2)]
100
 
101
- def get_color_gradient(c1, c2, n):
102
- c1_rgb = np.array(hex_to_rgb(c1)) / 255
103
- c2_rgb = np.array(hex_to_rgb(c2)) / 255
104
- mix_pcts = [x / (n - 1) for x in range(n)]
105
- rgb_colors = [((1 - mix) * c1_rgb + (mix * c2_rgb)) for mix in mix_pcts]
106
- return [
107
- "#" + "".join(f"{int(round(val * 255)):02x}" for val in item)
108
- for item in rgb_colors
109
- ]
110
-
111
- color = (
112
- bars_color
113
- if isinstance(bars_color, str)
114
- else get_color_gradient(bars_color[0], bars_color[1], bar_count)
115
- )
116
-
117
- fig, ax = plt.subplots(figsize=(5, 1), dpi=200, frameon=False)
118
- fig.subplots_adjust(left=0, bottom=0, right=1, top=1)
119
- plt.axis("off")
120
- plt.margins(x=0)
121
-
122
- barcollection = ax.bar(
123
- np.arange(0, bar_count),
124
- samples * 2,
125
- bottom=(-1 * samples),
126
- width=bar_width,
127
- color=color,
128
- alpha=fg_alpha,
129
- )
130
-
131
- # Temporary output file
132
- tmp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
133
- savefig_kwargs = {"facecolor": bg_color} if bg_image is None else {"transparent": True}
134
- plt.savefig(tmp_img.name, **savefig_kwargs)
135
-
136
- # Use ffmpeg to create video
137
- output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
138
- ffmpeg_cmd = [
139
- shutil.which("ffmpeg"),
140
- "-loop", "1",
141
- "-i", tmp_img.name,
142
- "-i", audio,
143
- "-c:v", "libx264",
144
- "-c:a", "aac",
145
- "-shortest",
146
- "-y",
147
- output_video_path,
148
- ]
149
- subprocess.run(ffmpeg_cmd, check=True)
150
- return output_video_path
151
-
152
# Full Gradio Interface Function
def process_image(image):
    """Roast the uploaded image, voice the roast, and build a waveform video."""
    roast = generate_roast(image)
    speech_path = text_to_speech(roast)
    video_path = make_waveform(speech_path, bg_image=image, animate=True)
    return roast, video_path

# Gradio Blocks UI
with gr.Blocks() as demo:
    gr.Markdown("# Image Roast and Waveform Video Generator")

    with gr.Row():
        image_input = gr.Image(type="filepath", label="Upload Image")
        output_text = gr.Textbox(label="Roast Text")
        output_video = gr.Video(label="Roast Waveform Video")

    submit_button = gr.Button("Generate Roast Video")
    submit_button.click(process_image, inputs=image_input, outputs=[output_text, output_video])

# Launch the app
demo.launch(debug=True)
 
1
  import os
 
 
 
 
2
  import google.generativeai as genai
3
  import gradio as gr
4
+ import requests
 
 
 
 
5
 
6
  # Configure Google Gemini API
7
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
 
10
  API_KEY = os.getenv('PLAY_API_KEY')
11
  USER_ID = os.getenv('PLAY_USER_ID')
12
 
13
# Theme selection — use this stock Gradio theme until the branded color is ready.
# Branded candidate kept for reference: {"primary_hue": "#b4fd83"}
theme = gr.themes.Base(primary_hue="emerald")
18
+
19
+
20
# Function to upload image to Gemini and get roasted text
def upload_to_gemini(path, mime_type="image/jpeg"):
    """Upload the file at *path* to the Gemini Files API and return its handle.

    Args:
        path: Filesystem path of the file to upload.
        mime_type: MIME type reported to the API (defaults to JPEG).

    Returns:
        The file object returned by ``genai.upload_file``.
    """
    return genai.upload_file(path, mime_type=mime_type)
24
 
25
  def generate_roast(image_path):
26
+ # Upload the image to Gemini and get the text
27
  uploaded_file = upload_to_gemini(image_path)
28
  generation_config = {
29
  "temperature": 1,
 
35
  model = genai.GenerativeModel(
36
  model_name="gemini-1.5-flash-002",
37
  generation_config=generation_config,
38
+ system_instruction="You are a professional satirist and fashion expert. You will be given a profile picture. Your duty is to roast whatever is given to you in the funniest way possible!",
39
+ )
40
+
41
+ chat_session = model.start_chat(
42
+ history=[{"role": "user", "parts": [uploaded_file]}]
43
  )
 
44
  response = chat_session.send_message("Roast this image!")
45
  return response.text
46
 
47
+ # Function to convert text to speech with Play.ht
48
  def text_to_speech(text):
49
  url = "https://api.play.ht/api/v2/tts/stream"
50
  payload = {
 
58
  "Authorization": API_KEY,
59
  "X-User-ID": USER_ID
60
  }
61
+
62
  response = requests.post(url, json=payload, headers=headers)
63
  if response.status_code == 200:
64
  audio_path = "output_audio.mp3"
 
66
  audio_file.write(response.content)
67
  return audio_path
68
  else:
69
+ return f"Error: {response.status_code} - {response.text}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
# Gradio Interface
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("# Image to Text-to-Speech Roasting App")
    gr.Markdown("Upload an image, and the AI will roast it and convert the roast to audio.")

    # Two-column layout: input image on the left, roast text + audio on the right.
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Upload Image")
        with gr.Column():
            output_text = gr.Textbox(label="Roast Text")
            audio_output = gr.Audio(label="Roast Audio")

    def process_image(image):
        """Pipeline callback: roast the uploaded image, then voice the roast."""
        roast = generate_roast(image)
        speech_path = text_to_speech(roast)
        return roast, speech_path

    submit_button = gr.Button("Generate Roast")
    submit_button.click(process_image, inputs=image_input, outputs=[output_text, audio_output])

# Launch the app
demo.launch(debug=True)