benibraz committed on
Commit
94c4974
1 Parent(s): 0ad6bcf

Add OpenAI integration and enhance prompt functionality for video generation

Browse files
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import gradio as gr
 
2
  import torch
3
  from huggingface_hub import snapshot_download
4
 
@@ -18,9 +19,19 @@ from PIL import Image
18
  import tempfile
19
  import os
20
  import gc
 
21
 
22
  # Load Hugging Face token if needed
23
  hf_token = os.getenv("HF_TOKEN")
 
 
 
 
 
 
 
 
 
24
 
25
  # Set model download directory within Hugging Face Spaces
26
  model_path = "asset"
@@ -90,6 +101,30 @@ def load_image_to_tensor_with_resize(image_path, target_height=512, target_width
90
  return frame_tensor.unsqueeze(0).unsqueeze(2)
91
 
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  # Preset options for resolution and frame configuration
94
  preset_options = [
95
  {"label": "1216x704, 41 frames", "width": 1216, "height": 704, "num_frames": 41},
@@ -169,6 +204,7 @@ pipeline = XoraVideoPipeline(
169
 
170
  def generate_video_from_text(
171
  prompt="",
 
172
  negative_prompt="",
173
  frame_rate=25,
174
  seed=171198,
@@ -185,6 +221,8 @@ def generate_video_from_text(
185
  duration=5,
186
  )
187
 
 
 
188
  sample = {
189
  "prompt": prompt,
190
  "prompt_attention_mask": None,
@@ -247,6 +285,7 @@ def generate_video_from_text(
247
  def generate_video_from_image(
248
  image_path,
249
  prompt="",
 
250
  negative_prompt="",
251
  frame_rate=25,
252
  seed=171198,
@@ -275,6 +314,8 @@ def generate_video_from_image(
275
  load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
276
  )
277
 
 
 
278
  sample = {
279
  "prompt": prompt,
280
  "prompt_attention_mask": None,
@@ -446,6 +487,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
446
  value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
447
  lines=5,
448
  )
 
 
 
 
 
 
449
  txt2vid_negative_prompt = gr.Textbox(
450
  label="Step 2: Enter Negative Prompt",
451
  placeholder="Describe what you don't want in the video...",
@@ -515,6 +562,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
515
  value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
516
  lines=5,
517
  )
 
 
 
 
 
518
  img2vid_negative_prompt = gr.Textbox(
519
  label="Step 3: Enter Negative Prompt",
520
  placeholder="Describe what you don't want in the video...",
@@ -584,6 +636,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
584
  fn=generate_video_from_text,
585
  inputs=[
586
  txt2vid_prompt,
 
587
  txt2vid_negative_prompt,
588
  txt2vid_frame_rate,
589
  *txt2vid_advanced,
@@ -603,6 +656,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
603
  inputs=[
604
  img2vid_image,
605
  img2vid_prompt,
 
606
  img2vid_negative_prompt,
607
  img2vid_frame_rate,
608
  *img2vid_advanced,
 
1
  import gradio as gr
2
+ from gradio_toggle import Toggle
3
  import torch
4
  from huggingface_hub import snapshot_download
5
 
 
19
  import tempfile
20
  import os
21
  import gc
22
+ from openai import OpenAI
23
 
24
  # Load Hugging Face token if needed
25
  hf_token = os.getenv("HF_TOKEN")
26
# OpenAI client used for optional prompt enhancement; the key is read from the
# environment so it never lives in the repository.
openai_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

# System prompts for text-to-video (t2v) and image-to-video (i2v) enhancement.
system_prompt_t2v_path = "assets/system_prompt_t2v.txt"
system_prompt_i2v_path = "assets/system_prompt_i2v.txt"

# The prompt files contain non-ASCII characters (e.g. "café", "—"), so decode
# them explicitly as UTF-8 instead of relying on the platform default encoding.
with open(system_prompt_t2v_path, "r", encoding="utf-8") as f:
    system_prompt_t2v = f.read()

with open(system_prompt_i2v_path, "r", encoding="utf-8") as f:
    system_prompt_i2v = f.read()
35
 
36
  # Set model download directory within Hugging Face Spaces
37
  model_path = "asset"
 
101
  return frame_tensor.unsqueeze(0).unsqueeze(2)
102
 
103
 
104
def enhance_prompt_if_enabled(prompt, enhance_toggle, type="t2v"):
    """Return an LLM-enhanced version of ``prompt`` when enhancement is enabled.

    Args:
        prompt: The user-supplied prompt text.
        enhance_toggle: Truthy to request enhancement; falsy returns ``prompt``
            unchanged without calling the API.
        type: ``"t2v"`` selects the text-to-video system prompt; any other
            value selects the image-to-video system prompt. (The name shadows
            the builtin but is kept because callers pass it by keyword.)

    Returns:
        The stripped completion text, or the original ``prompt`` when the
        toggle is off, the request fails, or the completion is empty.
    """
    if not enhance_toggle:
        print("Enhance toggle is off, Prompt: ", prompt)
        return prompt

    system_prompt = system_prompt_t2v if type == "t2v" else system_prompt_i2v
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    try:
        # NOTE(review): the system prompts allow up to 200 English words, which
        # usually needs more than 200 tokens -- the reply may be cut short.
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            max_tokens=200,
        )
        # ``content`` may be None; normalise so the emptiness check below
        # covers both None and whitespace-only replies.
        enhanced = (response.choices[0].message.content or "").strip()
    except Exception as e:
        # Best effort: enhancement must never block video generation.
        print(f"Error: {e}")
        return prompt

    if not enhanced:
        # An empty completion would hand the pipeline a blank prompt.
        print("Enhanced prompt was empty, using original prompt: ", prompt)
        return prompt

    print("Enhanced Prompt: ", enhanced)
    return enhanced
126
+
127
+
128
  # Preset options for resolution and frame configuration
129
  preset_options = [
130
  {"label": "1216x704, 41 frames", "width": 1216, "height": 704, "num_frames": 41},
 
204
 
205
  def generate_video_from_text(
206
  prompt="",
207
+ enhance_prompt_toggle=False,
208
  negative_prompt="",
209
  frame_rate=25,
210
  seed=171198,
 
221
  duration=5,
222
  )
223
 
224
+ prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="t2v")
225
+
226
  sample = {
227
  "prompt": prompt,
228
  "prompt_attention_mask": None,
 
285
  def generate_video_from_image(
286
  image_path,
287
  prompt="",
288
+ enhance_prompt_toggle=False,
289
  negative_prompt="",
290
  frame_rate=25,
291
  seed=171198,
 
314
  load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
315
  )
316
 
317
+ prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="i2v")
318
+
319
  sample = {
320
  "prompt": prompt,
321
  "prompt_attention_mask": None,
 
487
  value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
488
  lines=5,
489
  )
490
+ txt2vid_enhance_toggle = Toggle(
491
+ label="Enhance Prompt",
492
+ value=True,
493
+ interactive=True,
494
+ )
495
+
496
  txt2vid_negative_prompt = gr.Textbox(
497
  label="Step 2: Enter Negative Prompt",
498
  placeholder="Describe what you don't want in the video...",
 
562
  value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
563
  lines=5,
564
  )
565
+ img2vid_enhance_toggle = Toggle(
566
+ label="Enhance Prompt",
567
+ value=True,
568
+ interactive=True,
569
+ )
570
  img2vid_negative_prompt = gr.Textbox(
571
  label="Step 3: Enter Negative Prompt",
572
  placeholder="Describe what you don't want in the video...",
 
636
  fn=generate_video_from_text,
637
  inputs=[
638
  txt2vid_prompt,
639
+ txt2vid_enhance_toggle,
640
  txt2vid_negative_prompt,
641
  txt2vid_frame_rate,
642
  *txt2vid_advanced,
 
656
  inputs=[
657
  img2vid_image,
658
  img2vid_prompt,
659
+ img2vid_enhance_toggle,
660
  img2vid_negative_prompt,
661
  img2vid_frame_rate,
662
  *img2vid_advanced,
assets/system_prompt_i2v.txt ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You will receive prompts and a first frame image used for generating AI Videos. Your goal is to enhance the prompt such that it will be similar to the video captions used during training.
2
+ The captions were created with the following guidelines:
3
+ Please describe the content of the video to be generated from the given text prompt, focusing on detailed movements and appearances of objects and characters.
4
+ 1. Start with a single sentence capturing the main actions in sequence.
5
+ Example: "A woman enters a café, orders a coffee, and sits by the window."
6
+ 2. Describe specific movements of characters and objects, including detailed gestures, interactions, and precise changes in position or speed.
7
+ Example: "She walks from left to right toward the counter, taps her right-hand fingers rhythmically on the surface, then carries the cup in her left hand to a table."
8
+ 3. Include detailed visual descriptions of characters' appearances and expressions, focusing on observable features without subjective interpretations.
9
+ Example: "She is wearing a royal blue coat and matching scarf, has long brown hair, light skin, and noticeable dark circles under her eyes."
10
+ 4. Describe background elements that add context, including movements and actions of other people or objects.
11
+ Example: "Other patrons sit at scattered tables—some typing on laptops, others engaged in quiet conversation; a barista moves behind the counter, steaming milk."
12
+ 5. Specify camera angle and movement, detailing how the camera is positioned and moves during the scene.
13
+ Example: "The camera starts with a wide shot of the entrance, then pans smoothly to follow her from behind as she approaches the counter."
14
+ 6. Explain lighting and colors, describing the style and quality of lighting and predominant colors without redundancy.
15
+ Example: "Warm amber lights illuminate the café's wooden interior, contrasting with her bright blue coat."
16
+ 7. Detail significant movements or actions, note changes over time, or describe sudden events, specifying direction, speed, and style.
17
+ Example: "She stirs her coffee slowly with her right hand, gazing out the window; suddenly, a man in a black jacket enters quickly from the right, shaking raindrops from his umbrella."
18
+ 8. Indicate the source type of the video, such as real-life footage, animation, or computer-generated imagery.
19
+ Example: "The scene is captured in real-life footage."
20
+
21
+ General Guidelines:
22
+ - Include many details but only those you are certain about, possibly omitting uncertain elements.
23
+ - Avoid conflicts with the first frame image.
24
+ - Use descriptive and literal language; avoid poetic or metaphorical expressions.
25
+ - Do not use introductory phrases such as: 'The video presents', 'The video depicts', 'This video showcases', 'The video captures' and so on.
26
+ - Start the description directly with the content. Avoid phrases like "as time passes" or "as the video progresses."
27
+ - Descriptions shouldn't contain adjectives that are true by default (e.g. 'a wet rain', 'a hot sun').
28
+ - Descriptions should avoid repeating the same information in different ways.
29
+ - Description should be one single paragraph, with no line breaks. Keep the description within 200 English words.
30
+
31
+ Here are some examples of real captions that represent good prompts:
32
+ - A woman walks away from a white Jeep parked on a city street at night...
33
+ A woman walks away from a white Jeep parked on a city street at night, then ascends a staircase and knocks on a door. The woman, wearing a dark jacket and jeans, walks away from the Jeep parked on the left side of the street, her back to the camera; she walks at a steady pace, her arms swinging slightly by her sides; the street is dimly lit, with streetlights casting pools of light on the wet pavement; a man in a dark jacket and jeans walks past the Jeep in the opposite direction; the camera follows the woman from behind as she walks up a set of stairs towards a building with a green door; she reaches the top of the stairs and turns left, continuing to walk towards the building; she reaches the door and knocks on it with her right hand; the camera remains stationary, focused on the doorway; the scene is captured in real-life footage.
34
+ - A woman with long brown hair and light skin smiles at another woman...
35
+ A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.
36
+ - A man in a suit enters a room and speaks to two women...
37
+ A man in a suit enters a room and speaks to two women sitting on a couch. The man, wearing a dark suit with a gold tie, enters the room from the left and walks towards the center of the frame. He has short gray hair, light skin, and a serious expression. He places his right hand on the back of a chair as he approaches the couch. Two women are seated on a light-colored couch in the background. The woman on the left wears a light blue sweater and has short blonde hair. The woman on the right wears a white sweater and has short blonde hair. The camera remains stationary, focusing on the man as he enters the room. The room is brightly lit, with warm tones reflecting off the walls and furniture. The scene appears to be from a film or television show.
38
+ - The camera pans across a cityscape of tall buildings...
39
+ The camera pans across a cityscape of tall buildings with a circular building in the center. The camera moves from left to right, showing the tops of the buildings and the circular building in the center. The buildings are various shades of gray and white, and the circular building has a green roof. The camera angle is high, looking down at the city. The lighting is bright, with the sun shining from the upper left, casting shadows from the buildings. The scene is computer-generated imagery.
assets/system_prompt_t2v.txt ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You will receive prompts used for generating AI Videos. Your goal is to enhance the prompt such that it will be similar to the video captions used during training.
2
+ The captions were created with the following guidelines:
3
+ Please describe the content of the video to be generated from the given text prompt, focusing on detailed movements and appearances of objects and characters.
4
+ 1. Start with a single sentence capturing the main actions in sequence.
5
+ Example: "A woman enters a café, orders a coffee, and sits by the window."
6
+ 2. Describe specific movements of characters and objects, including detailed gestures, interactions, and precise changes in position or speed.
7
+ Example: "She walks from left to right toward the counter, taps her right-hand fingers rhythmically on the surface, then carries the cup in her left hand to a table."
8
+ 3. Include detailed visual descriptions of characters' appearances and expressions, focusing on observable features without subjective interpretations.
9
+ Example: "She is wearing a royal blue coat and matching scarf, has long brown hair, light skin, and noticeable dark circles under her eyes."
10
+ 4. Describe background elements that add context, including movements and actions of other people or objects.
11
+ Example: "Other patrons sit at scattered tables—some typing on laptops, others engaged in quiet conversation; a barista moves behind the counter, steaming milk."
12
+ 5. Specify camera angle and movement, detailing how the camera is positioned and moves during the scene.
13
+ Example: "The camera starts with a wide shot of the entrance, then pans smoothly to follow her from behind as she approaches the counter."
14
+ 6. Explain lighting and colors, describing the style and quality of lighting and predominant colors without redundancy.
15
+ Example: "Warm amber lights illuminate the café's wooden interior, contrasting with her bright blue coat."
16
+ 7. Detail significant movements or actions, note changes over time, or describe sudden events, specifying direction, speed, and style.
17
+ Example: "She stirs her coffee slowly with her right hand, gazing out the window; suddenly, a man in a black jacket enters quickly from the right, shaking raindrops from his umbrella."
18
+ 8. Indicate the source type of the video, such as real-life footage, animation, or computer-generated imagery.
19
+ Example: "The scene is captured in real-life footage."
20
+
21
+ General Guidelines:
22
+ - Include many details but only those you are certain about, possibly omitting uncertain elements.
23
+ - Use descriptive and literal language; avoid poetic or metaphorical expressions.
24
+ - Do not use introductory phrases such as: 'The video presents', 'The video depicts', 'This video showcases', 'The video captures' and so on.
25
+ - Start the description directly with the content. Avoid phrases like "as time passes" or "as the video progresses."
26
+ - Descriptions shouldn't contain adjectives that are true by default (e.g. 'a wet rain', 'a hot sun').
27
+ - Descriptions should avoid repeating the same information in different ways.
28
+ - Description should be one single paragraph, with no line breaks. Keep the description within 200 English words.
29
+
30
+ Here are some examples of real captions that represent good prompts:
31
+ - A woman walks away from a white Jeep parked on a city street at night...
32
+ A woman walks away from a white Jeep parked on a city street at night, then ascends a staircase and knocks on a door. The woman, wearing a dark jacket and jeans, walks away from the Jeep parked on the left side of the street, her back to the camera; she walks at a steady pace, her arms swinging slightly by her sides; the street is dimly lit, with streetlights casting pools of light on the wet pavement; a man in a dark jacket and jeans walks past the Jeep in the opposite direction; the camera follows the woman from behind as she walks up a set of stairs towards a building with a green door; she reaches the top of the stairs and turns left, continuing to walk towards the building; she reaches the door and knocks on it with her right hand; the camera remains stationary, focused on the doorway; the scene is captured in real-life footage.
33
+ - A woman with long brown hair and light skin smiles at another woman...
34
+ A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.
35
+ - A man in a suit enters a room and speaks to two women...
36
+ A man in a suit enters a room and speaks to two women sitting on a couch. The man, wearing a dark suit with a gold tie, enters the room from the left and walks towards the center of the frame. He has short gray hair, light skin, and a serious expression. He places his right hand on the back of a chair as he approaches the couch. Two women are seated on a light-colored couch in the background. The woman on the left wears a light blue sweater and has short blonde hair. The woman on the right wears a white sweater and has short blonde hair. The camera remains stationary, focusing on the man as he enters the room. The room is brightly lit, with warm tones reflecting off the walls and furniture. The scene appears to be from a film or television show.
37
+ - The camera pans across a cityscape of tall buildings...
38
+ The camera pans across a cityscape of tall buildings with a circular building in the center. The camera moves from left to right, showing the tops of the buildings and the circular building in the center. The buildings are various shades of gray and white, and the circular building has a green roof. The camera angle is high, looking down at the city. The lighting is bright, with the sun shining from the upper left, casting shadows from the buildings. The scene is computer-generated imagery.
requirements.txt CHANGED
@@ -10,3 +10,5 @@ opencv-python
10
  beautifulsoup4
11
  ftfy
12
  gradio
 
 
 
10
  beautifulsoup4
11
  ftfy
12
  gradio
13
+ openai
14
+ gradio_toggle