Spaces:
Sleeping
Sleeping
sandesh-bharadwaj
committed on
Commit
·
9ed1e74
1
Parent(s):
30b0ee8
Fixed description and reduced mllm call
Browse files- engine/video_descriptor.py +10 -18
engine/video_descriptor.py
CHANGED
@@ -15,15 +15,19 @@ except:
|
|
15 |
music_prompt_examples = """
|
16 |
'A dynamic blend of hip-hop and orchestral elements, with sweeping strings and brass, evoking the vibrant energy of the city',
|
17 |
'Smooth jazz, with a saxophone solo, piano chords, and snare full drums',
|
18 |
-
'90s rock song with electric guitar and heavy drums'
|
|
|
|
|
|
|
|
|
19 |
"""
|
20 |
|
21 |
json_schema = """
|
22 |
{"Content Description": "string", "Music Prompt": "string"}
|
23 |
"""
|
24 |
|
25 |
-
|
26 |
-
You are a music supervisor who analyzes the content and tone of images and videos to describe music that fits well with the mood, evokes emotions, and enhances the narrative of the visuals. Given an image or video, describe the scene and generate a prompt suitable for music generation models.
|
27 |
|
28 |
{music_prompt_examples}
|
29 |
|
@@ -39,7 +43,7 @@ class DescribeVideo:
|
|
39 |
self.safety_settings = self.get_safety_settings()
|
40 |
|
41 |
genai.configure(api_key=__api_key)
|
42 |
-
self.mllm_model = genai.GenerativeModel(self.model)
|
43 |
|
44 |
logging.info(f"Initialized DescribeVideo with model: {self.model}")
|
45 |
|
@@ -61,21 +65,9 @@ class DescribeVideo:
|
|
61 |
safety_settings=self.safety_settings,
|
62 |
)
|
63 |
|
64 |
-
logging.info(
|
65 |
-
f"Generated content for video: {video_path} with response: {response.text}"
|
66 |
-
)
|
67 |
-
|
68 |
-
cleaned_response = self.mllm_model.generate_content(
|
69 |
-
[
|
70 |
-
response.text,
|
71 |
-
gemni_instructions,
|
72 |
-
],
|
73 |
-
safety_settings=self.safety_settings,
|
74 |
-
)
|
75 |
-
|
76 |
-
logging.info(f"Generated : {video_path} with response: {cleaned_response.text}")
|
77 |
|
78 |
-
return json.loads(
|
79 |
|
80 |
def __call__(self, video_path):
|
81 |
return self.describe_video(video_path)
|
|
|
15 |
music_prompt_examples = """
|
16 |
'A dynamic blend of hip-hop and orchestral elements, with sweeping strings and brass, evoking the vibrant energy of the city',
|
17 |
'Smooth jazz, with a saxophone solo, piano chords, and snare full drums',
|
18 |
+
'90s rock song with electric guitar and heavy drums, nightcore, 140bpm',
|
19 |
+
'lofi melody loop, A minor, 110 bpm, jazzy chords evoking a feeling of curiosity, relaxing, vinyl recording',
|
20 |
+
'J-Pop, 140bpm, 320kbps, 48kHz',
|
21 |
+
'funk, disco, R&B, AOR, soft rock, and boogie',
|
22 |
+
'a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130'.
|
23 |
"""
|
24 |
|
25 |
json_schema = """
|
26 |
{"Content Description": "string", "Music Prompt": "string"}
|
27 |
"""
|
28 |
|
29 |
+
gemini_instructions = f"""
|
30 |
+
You are a music supervisor who analyzes the content and tone of images and videos to describe music that fits well with the mood, evokes emotions, and enhances the narrative of the visuals. Given an image or video, describe the scene and generate a prompt suitable for music generation models. Generate a music prompt based on the description, and use keywords if provided by the user:
|
31 |
|
32 |
{music_prompt_examples}
|
33 |
|
|
|
43 |
self.safety_settings = self.get_safety_settings()
|
44 |
|
45 |
genai.configure(api_key=__api_key)
|
46 |
+
self.mllm_model = genai.GenerativeModel(self.model, system_instruction=gemini_instructions)
|
47 |
|
48 |
logging.info(f"Initialized DescribeVideo with model: {self.model}")
|
49 |
|
|
|
65 |
safety_settings=self.safety_settings,
|
66 |
)
|
67 |
|
68 |
+
logging.info(f"Generated : {video_path} with response: {response.text}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
70 |
+
return json.loads(response.text.strip("```json\n"))
|
71 |
|
72 |
def __call__(self, video_path):
|
73 |
return self.describe_video(video_path)
|