Prudvireddy committed on
Commit
068543b
·
verified ·
1 Parent(s): 423ebfd

Upload 4 files

Browse files
Files changed (4) hide show
  1. Montserrat-Bold.ttf +0 -0
  2. agents.py +147 -0
  3. app.py +33 -0
  4. tools.py +421 -0
Montserrat-Bold.ttf ADDED
Binary file (29.6 kB). View file
 
agents.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from crewai import Agent, Task
2
+ from tools import llm, create_video_from_images_and_audio, image_generator, speech_generator, wiki_tool
3
+
4
# Writer agent: produces the narration text for the short video.
script_agent = Agent(
    llm=llm,
    role='Senior Content Writer',
    goal='Craft engaging, concise, and informative narrations for YouTube short videos',
    backstory=(
        "As a seasoned content writer, you excel at breaking down complex topics into "
        "captivating narratives that educate and entertain audiences. Your expertise lies in "
        "writing concise, attention-grabbing scripts for YouTube short videos."
    ),
    allow_delegation=False,
    verbose=True,
)
12
+
13
# Agent that turns each narration sentence into an image-generation prompt.
image_descriptive_agent = Agent(
    role='Visual Storyteller',
    goal='Design stunning, contextually relevant visuals for YouTube short videos',
    # The original backstory ended with the unfinished fragment "You will ",
    # which was sent to the LLM as a dangling sentence; it has been dropped.
    backstory=(
        'With a keen eye for visual storytelling, you create compelling imagery '
        'that elevates the narrative and captivates the audience.'
    ),
    verbose=True,
    llm=llm,
    allow_delegation=False,
)
21
+
22
+ # img_speech_generating_agent = Agent(
23
+ # role='Multimedia Content Creator',
24
+ # goal='Generate high-quality images and speeches for YouTube short videos one after another based on provided descriptions.',
25
+ # backstory='As a multimedia expert, you excel at creating engaging multimedia content that brings stories to life.',
26
+ # verbose=True,
27
+ # llm=llm,
28
+ # allow_delegation=False
29
+ # )
30
+
31
# Agent that drives the image and speech generation tools.
img_speech_generating_agent = Agent(
    llm=llm,
    role='Multimedia Content Creator',
    goal='Generate high-quality images and speeches for YouTube short videos based on provided script',
    backstory=(
        'As a multimedia expert, you excel at creating engaging multimedia content '
        'that brings stories to life.'
    ),
    allow_delegation=False,
    verbose=True,
)
39
+
40
# Agent that assembles the final video; it is the only agent given the
# video-assembly tool directly.
editor = Agent(
    llm=llm,
    tools=[create_video_from_images_and_audio],
    role='Video editor',
    goal='To make a video for YouTube shorts.',
    backstory="You are a video editor working for a YouTube creator",
    allow_delegation=False,
    verbose=True,
)
49
+
50
# First task: gather background text on the topic, with the Wikipedia tool
# available to the writer agent.
content_generation_task = Task(
    agent=script_agent,
    tools=[wiki_tool],
    description=(
        'Generate engaging and informative content on the topic: {topic}. '
        'Use the provided tool, only if you have no idea about the given topic.'
    ),
    expected_output="Good corpus of text about: {topic}",
)
56
+
57
+ story_writing_task = Task(
58
+ description='Write an engaging narration for a YouTube short video on the topic: {topic}',
59
+ expected_output="""A short paragraph suitable for narrating in five seconds that provides an immersive experience to the audience. Follow the below example for output length and format.
60
+
61
+ **Example input:**
62
+ Ancient Wonders of the World
63
+
64
+ **Output format:**
65
+ Embark on a journey through time and marvel at the ancient wonders of the world!
66
+ From the majestic Great Pyramid of Giza, symbolizing the ingenuity of ancient Egypt,
67
+ to the Hanging Gardens of Babylon, an oasis of lush beauty amidst ancient Mesopotamia's arid landscape.
68
+ These remarkable structures continue to intrigue and inspire awe, reminding us of humanity's enduring quest for greatness.
69
+ """,
70
+ agent=script_agent
71
+ )
72
+
73
+
74
+ img_text_task = Task(
75
+ description='Given the narration, visually describe each sentence in the narration which will be used as a prompt for image generation.',
76
+ expected_output="""Sentences encoded in <narration> and <image> tags. Follow the example below for the output format.
77
+
78
+ **Example input:**
79
+ Embark on a journey through time and marvel at the ancient wonders of the world! From the majestic Great Pyramid of Giza, symbolizing the ingenuity of ancient Egypt, to the Hanging Gardens of Babylon, an oasis of lush beauty amidst ancient Mesopotamia's arid landscape. These remarkable structures continue to intrigue and inspire awe, reminding us of humanity's enduring quest for greatness.
80
+
81
+ **Output format:**
82
+
83
+ <narration>Embark on a journey through time and marvel at the ancient wonders of the world!</narration>
84
+ <image>A breathtaking view of various ancient wonders, showcasing their grandeur and mystery.</image>
85
+
86
+ <narration>From the majestic Great Pyramid of Giza, symbolizing the ingenuity of ancient Egypt,</narration>
87
+ <image>The majestic Great Pyramid of Giza, standing tall against the desert backdrop, a testament to ancient engineering.</image>
88
+
89
+ <narration>to the Hanging Gardens of Babylon, an oasis of lush beauty amidst ancient Mesopotamia's arid landscape,</narration>
90
+ <image>The Hanging Gardens of Babylon, lush greenery cascading from terraced gardens, amidst the arid Mesopotamian landscape.</image>
91
+
92
+ <narration>These remarkable structures continue to intrigue and inspire awe, reminding us of humanity's enduring quest for greatness.</narration>
93
+ <image>Visitors captivated by the beauty and historical significance of these ancient marvels, exploring and marveling.</image>
94
+ """,
95
+ agent=image_descriptive_agent,
96
+ context=[story_writing_task]
97
+ )
98
+
99
+
100
+
101
+ # process_script_task = Task(
102
+ # description="Extract text for image and speech generation from a provided script.",
103
+ # expected_output="A dictionary containing lists of texts for image generation and speech generation.",
104
+ # agent=ScriptSynthesizer
105
+ # )
106
+
107
+ # img_generation_task = Task(
108
+ # description='Given the input generate images for sequence of sentence enclosed in <image> tag.',
109
+ # expected_output="""Acknowledgement of image generation""",
110
+ # tools = [image_generator],
111
+ # context = [img_text_task],
112
+ # # async_execution=True,
113
+ # agent=img_speech_generating_agent
114
+ # )
115
+
116
+ # speech_generation_task = Task(
117
+ # description='Given the input generate speech for each sentence enclosed in <narration> tag.',
118
+ # expected_output="""Acknowledgement of speech generation""",
119
+ # tools = [speech_generator],
120
+ # context = [img_text_task],
121
+ # # async_execution=True,
122
+ # agent=img_speech_generating_agent
123
+ # )
124
# Renders one image per <image> entry of the tagged script produced by
# img_text_task. Kept synchronous (async_execution is intentionally not set).
img_generation_task = Task(
    agent=img_speech_generating_agent,
    tools=[image_generator],
    context=[img_text_task],
    description='Given the script, use the given tool to generate images',
    expected_output="Acknowledgement of image generation",
)
132
+
133
# Synthesizes one audio clip per <narration> entry of the tagged script
# produced by img_text_task. Kept synchronous (async_execution is not set).
speech_generation_task = Task(
    agent=img_speech_generating_agent,
    tools=[speech_generator],
    context=[img_text_task],
    description='Given the script, use the given tool to generate speech',
    expected_output="Acknowledgement of speech generation",
)
141
+
142
# Final task: stitch the generated images and speech clips into one video.
# The description is read by the LLM to choose the tool arguments, so the
# folder names must match the directories the generator tools write to
# ("outputs/images" and "outputs/speeches"); the previous text said
# "outpus/images", which points at a directory that does not exist.
make_video_task = Task(
    description='Create video using images and speeches from the folders "outputs/images" and "outputs/speeches"',
    expected_output="output video path",
    agent=editor,
    context=[img_generation_task, speech_generation_task],
)
app.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+
4
+ from crewai import Crew, Process
5
+ from agents import *
6
+
7
+
8
def generate_video(topic, grow_api_key, stability_ai_api_key):
    """Run the full crew for ``topic`` and return the path of the rendered video.

    Args:
        topic: Subject of the short video, substituted into the task prompts.
        grow_api_key: Groq API key; exported as ``GROQ_API_KEY``.
        stability_ai_api_key: Stability AI key; exported as ``STABILITY_AI_API_KEY``.

    Returns:
        Absolute path to ``outputs/final_video/final_video.mp4`` next to this file.

    Raises:
        gr.Error: If the topic is blank, a key is neither supplied nor already
            present in the environment, or the crew finished without producing
            the video file.
    """
    topic = (topic or '').strip()
    if not topic:
        raise gr.Error('Please enter a topic.')

    # Only override the environment when a value was actually typed in, so a
    # key that is already configured is not clobbered by an empty text box.
    # NOTE(review): `from agents import *` runs before these variables are
    # set; any client that reads its key at import time will not see the
    # values assigned here - confirm against tools.py.
    supplied_keys = (
        ('GROQ_API_KEY', grow_api_key),
        ('STABILITY_AI_API_KEY', stability_ai_api_key),
    )
    for env_name, value in supplied_keys:
        value = (value or '').strip()
        if value:
            os.environ[env_name] = value
        elif not os.environ.get(env_name):
            raise gr.Error(f'{env_name} is required.')

    crew = Crew(
        agents=[script_agent, image_descriptive_agent, img_speech_generating_agent, editor],
        tasks=[
            content_generation_task,
            story_writing_task,
            img_text_task,
            img_generation_task,
            speech_generation_task,
            make_video_task,
        ],
        process=Process.sequential,
        memory=True,
        verbose=2,
    )
    crew.kickoff(inputs={'topic': topic})

    video_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'outputs/final_video/final_video.mp4'
    )
    # The crew may finish without the editor having produced a file; report
    # that explicitly instead of handing a missing path to the video widget.
    if not os.path.isfile(video_path):
        raise gr.Error('Video generation finished but no video file was produced.')
    return video_path
22
+
23
# Gradio front end. The three inputs map positionally onto
# generate_video(topic, grow_api_key, stability_ai_api_key). The API keys use
# password boxes so they are masked on screen, and every field is labeled so
# the user can tell which value goes where.
app = gr.Interface(
    fn=generate_video,
    inputs=[
        gr.Textbox(label='Topic'),
        gr.Textbox(label='Groq API key', type='password'),
        gr.Textbox(label='Stability AI API key', type='password'),
    ],
    # Half-size 9:16 preview of the 720x1280 output.
    outputs=gr.Video(format='mp4', label="Generated Video", width=720 // 2, height=1280 // 2),
    title="ShortsIn",
    description="Shorts generator",
)

app.launch(share=True)
33
+ #os.path.dirname(os.path.abspath(__file__))
tools.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.tools import tool, Tool
2
+ import re
3
+ import os
4
+ from langchain_groq import ChatGroq
5
+ import requests
6
+ import cv2
7
+ from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips
8
+ from langchain.pydantic_v1 import BaseModel, Field
9
+ from langchain_community.tools import WikipediaQueryRun
10
+ from langchain_community.utilities import WikipediaAPIWrapper
11
+
12
+ # from diffusers import StableDiffusionXLPipeline, DPMSolverSinglestepScheduler
13
+ # import bitsandbytes as bnb
14
+ # import torch.nn as nn
15
+ # import torch
16
+ import pyttsx3
17
+ import os
18
+ # from langchain_google_genai import ChatGoogleGenerativeAI
19
+
20
+ # from langchain.chat_models import ChatOpenAI
21
+ # # llm2 = ChatOpenAI(model='gpt-3.5-turbo')
22
+ # # llm3 = ChatOpenAI(model='gpt-3.5-turbo')
23
+ # llm1 = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=2048)
24
+ # # llm2 = ChatGroq(model='mixtral-8x7b-32768', temperature=0.6, max_tokens=2048, api_key=os.environ.get('GROQ_API_KEY'))
25
+ # llm2 = ChatGoogleGenerativeAI(model='gemini-pro', temperature=0.0)
26
+ # # llm2 = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=2048, api_key=os.environ.get('GROQ_API_KEY'))
27
+ # # llm3 = ChatGoogleGenerativeAI(model='gemini-pro')
28
+ # llm4 = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=2048, api_key=os.environ.get('GROQ_API_KEY'))
29
+ llm = ChatGroq(model='llama3-70b-8192', temperature=0.6, max_tokens=1024, api_key=os.environ.get('GROQ_API_KEY'))
30
+
31
+ # pipe = StableDiffusionXLPipeline.from_pretrained("sd-community/sdxl-flash", torch_dtype=torch.float16).to('cuda')
32
+ # pipe.scheduler = DPMSolverSinglestepScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
33
+
34
+ # def quantize_model_to_4bit(model):
35
+ # replacements = []
36
+
37
+ # # Collect layers to be replaced
38
+ # for name, module in model.named_modules():
39
+ # if isinstance(module, nn.Linear):
40
+ # replacements.append((name, module))
41
+
42
+ # # Replace layers
43
+ # for name, module in replacements:
44
+ # # Split the name to navigate to the parent module
45
+ # *path, last = name.split('.')
46
+ # parent = model
47
+ # for part in path:
48
+ # parent = getattr(parent, part)
49
+
50
+ # # Create and assign the quantized layer
51
+ # quantized_layer = bnb.nn.Linear4bit(module.in_features, module.out_features, bias=module.bias is not None)
52
+ # quantized_layer.weight.data = module.weight.data
53
+ # if module.bias is not None:
54
+ # quantized_layer.bias.data = module.bias.data
55
+ # setattr(parent, last, quantized_layer)
56
+
57
+ # return model
58
+
59
+ # pipe.unet = quantize_model_to_4bit(pipe.unet)
60
+ # pipe.enable_model_cpu_offload()
61
+
62
def generate_speech(text, speech_dir='./outputs/audio', lang='en', speed=170, voice='default', num=0):
    """Synthesize ``text`` with pyttsx3 and save it as ``speech_<num>.mp3``.

    Args:
        text: Text to speak.
        speech_dir: Output directory, resolved relative to this file's
            directory (an absolute path is used as-is).
        lang: Currently unused; kept for interface compatibility.
        speed: Speaking rate passed to the engine's ``rate`` property.
        voice: ``'default'`` or a substring of an installed voice's name.
        num: Index used in the output file name.

    Raises:
        ValueError: If no voice is installed, or no installed voice name
            contains ``voice``.
    """
    engine = pyttsx3.init()

    voices = engine.getProperty('voices')
    if voice == 'default':
        if not voices:
            raise ValueError('No text-to-speech voices are installed.')
        # Prefer the second installed voice as before, but fall back to the
        # first one on systems that only ship a single voice.
        voice_id = voices[1].id if len(voices) > 1 else voices[0].id
    else:
        voice_id = next((v.id for v in voices if voice in v.name), None)
        if not voice_id:
            raise ValueError(f"Voice '{voice}' not found.")

    engine.setProperty('voice', voice_id)
    engine.setProperty('rate', speed)

    # Build the target path once so the existence check, the delete and the
    # save all refer to the same file regardless of the working directory.
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), speech_dir)
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'speech_{num}.mp3')
    if os.path.exists(output_path):
        os.remove(output_path)

    engine.save_to_file(text, output_path)
    engine.runAndWait()
87
+
88
+ # class VideoGeneration(BaseModel):
89
+ # images_dir : str = Field(description='Path to images directory, such as "outputs/images"')
90
+ # speeches_dir : str = Field(description='Path to speeches directory, such as "outputs/speeches"')
91
+
92
+ # @tool(args_schema=VideoGeneration)
93
+ # def create_video_from_images_and_audio(images_dir, speeches_dir, zoom_factor=1.2):
94
+ # """Creates video using images and audios with zoom-in effect"""
95
+ # images_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), images_dir)
96
+ # speeches_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), speeches_dir)
97
+
98
+ # images_paths = os.listdir(images_dir)
99
+ # audio_paths = os.listdir(speeches_dir)
100
+ # # print(images_paths, audio_paths)
101
+ # clips = []
102
+
103
+ # for i in range(min(len(images_paths), len(audio_paths))):
104
+ # # Load the image
105
+ # img_clip = ImageClip(os.path.join(images_dir, images_paths[i]))
106
+
107
+ # # Load the audio file
108
+ # audioclip = AudioFileClip(os.path.join(speeches_dir, audio_paths[i]))
109
+
110
+ # # Set the duration of the video clip to the duration of the audio file
111
+ # videoclip = img_clip.set_duration(audioclip.duration)
112
+
113
+ # # Apply zoom-in effect to the video clip
114
+ # zoomed_clip = apply_zoom_in_effect(videoclip, zoom_factor)
115
+
116
+ # # Add audio to the zoomed video clip
117
+ # zoomed_clip = zoomed_clip.set_audio(audioclip)
118
+
119
+ # clips.append(zoomed_clip)
120
+
121
+ # # Concatenate all video clips
122
+ # final_clip = concatenate_videoclips(clips)
123
+
124
+ # # Write the result to a file
125
+ # final_clip.write_videofile(os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs/final_video/final_video.mp4"), codec='libx264', fps=24)
126
+
127
+ # return os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs/final_video/final_video.mp4")
128
+
129
+ # def apply_zoom_in_effect(clip, zoom_factor=1.2):
130
+ # width, height = clip.size
131
+ # duration = clip.duration
132
+
133
+ # def zoom_in_effect(get_frame, t):
134
+ # frame = get_frame(t)
135
+ # zoom = 1 + (zoom_factor - 1) * (t / duration)
136
+ # new_width, new_height = int(width * zoom), int(height * zoom)
137
+ # resized_frame = cv2.resize(frame, (new_width, new_height))
138
+
139
+ # # Calculate the position to crop the frame to the original size
140
+ # x_start = (new_width - width) // 2
141
+ # y_start = (new_height - height) // 2
142
+ # cropped_frame = resized_frame[y_start:y_start + height, x_start:x_start + width]
143
+
144
+ # return cropped_frame
145
+
146
+ # return clip.fl(zoom_in_effect, apply_to=['mask'])
147
+
148
+ # Example usage
149
+ # image_paths = "outputs/images"
150
+ # audio_paths = "outputs/audio"
151
+
152
+ # video_path = create_video_from_images_and_audio(image_paths, audio_paths)
153
+ # print(f"Video created at: {video_path}")
154
+
155
+
156
+ # class ImageGeneration(BaseModel):
157
+ # text : str = Field(description='description of sentence used for image generation')
158
+ # num : int = Field(description='sequence of description passed this tool. Used in image saving path. Example 1,2,3,4,5 and so on')
159
+
160
+ # class SpeechGeneration(BaseModel):
161
+ # text : str = Field(description='description of sentence used for image generation')
162
+ # num : int = Field(description='sequence of description passed this tool. Used in image saving path. Example 1,2,3,4,5 and so on')
163
+
164
+ import os
165
+ import cv2
166
+ from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips, VideoFileClip
167
+ from PIL import Image, ImageDraw, ImageFont
168
+ import numpy as np
169
+ from groq import Groq
170
+
171
+ client = Groq()
172
+
173
class VideoGeneration(BaseModel):
    # Argument schema for the video-assembly tool; the field descriptions
    # are what the calling agent sees when it fills in the arguments.

    # Folder containing the generated images.
    images_dir: str = Field(
        description='Path to images directory, such as "outputs/images"',
    )
    # Folder containing the generated speech clips.
    speeches_dir: str = Field(
        description='Path to speeches directory, such as "outputs/speeches"',
    )
176
+
177
def split_text_into_chunks(text, chunk_size):
    """Split ``text`` on whitespace and regroup the words into space-joined
    pieces of at most ``chunk_size`` words each, preserving word order."""
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        group = tokens[start:start + chunk_size]
        pieces.append(' '.join(group))
    return pieces
180
+
181
def add_text_to_video(input_video, output_video, text, duration=1, fontsize=40, fontcolor=(255, 255, 255),
                      outline_thickness=2, outline_color=(0, 0, 0), delay_between_chunks=0.1,
                      font_path=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Montserrat-Bold.ttf')):
    """Burn ``text`` into a video as a sequence of short, outlined captions.

    The text is split into three-word chunks. Each chunk is shown for
    ``duration`` seconds, followed by ``delay_between_chunks`` seconds without
    a caption. A chunk wider than the frame is broken into two centered lines.

    Args:
        input_video: Path of the video to read.
        output_video: Path of the captioned video to write (``mp4v`` codec).
        text: Caption text.
        duration: Seconds each chunk stays on screen.
        fontsize: Font size passed to ``ImageFont.truetype``.
        fontcolor: RGB fill color of the caption.
        outline_thickness: Outline radius in pixels.
        outline_color: RGB color of the outline.
        delay_between_chunks: Seconds of caption-free gap between chunks.
        font_path: TrueType font file used for the captions.

    Raises:
        OSError: If ``input_video`` cannot be opened.
    """
    chunks = split_text_into_chunks(text, 3)  # three words per caption

    # Load the font first so a bad font path cannot leak an open capture.
    font = ImageFont.truetype(font_path, fontsize)

    def measure(draw, content):
        # ImageDraw.textsize() was removed in Pillow 10; textbbox() is its
        # replacement. Right/bottom measured from the (0, 0) origin are used
        # as the text extent.
        _, _, right, bottom = draw.textbbox((0, 0), content, font=font)
        return right, bottom

    def draw_outlined(draw, placements):
        # Paint every outline before any fill so one line's outline can never
        # cover another line's fill.
        for x, y, content in placements:
            for dx in range(-outline_thickness, outline_thickness + 1):
                for dy in range(-outline_thickness, outline_thickness + 1):
                    if dx != 0 or dy != 0:
                        draw.text((x + dx, y + dy), content, font=font, fill=outline_color)
        for x, y, content in placements:
            draw.text((x, y), content, font=font, fill=fontcolor)

    cap = cv2.VideoCapture(input_video)
    out = None
    try:
        if not cap.isOpened():
            raise OSError(f"Could not open input video: {input_video}")

        fps = int(cap.get(cv2.CAP_PROP_FPS))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        out = cv2.VideoWriter(output_video, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

        # Whole frame counts: a float ``duration`` must not produce a float
        # chunk index, and a zero-length period must not divide by zero.
        chunk_frames = int(duration * fps)
        delay_frames = int(delay_between_chunks * fps)
        period = max(1, chunk_frames + delay_frames)

        frame_index = 0
        while True:
            ok, frame = cap.read()
            if not ok:
                break

            chunk_index, offset = divmod(frame_index, period)
            if offset < chunk_frames and chunk_index < len(chunks):
                chunk = chunks[chunk_index]
                frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                draw = ImageDraw.Draw(frame_pil)

                text_width, _ = measure(draw, chunk)
                words = chunk.split()
                if text_width > width and len(words) > 1:
                    # Too wide for one line: split the words into two halves.
                    half = len(words) // 2
                    line1 = ' '.join(words[:half])
                    line2 = ' '.join(words[half:])
                    width1, height1 = measure(draw, line1)
                    width2, _ = measure(draw, line2)
                    text_y = height - 250 - height1
                    placements = [
                        ((width - width1) // 2, text_y, line1),
                        ((width - width2) // 2, text_y + height1, line2),
                    ]
                else:
                    # Single centered line near the bottom of the frame.
                    placements = [((width - text_width) // 2, height - 400, chunk)]

                draw_outlined(draw, placements)
                frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)

            out.write(frame)
            frame_index += 1
    finally:
        # Always release the handles, even if a frame fails to process.
        # No GUI windows are opened here, so destroyAllWindows() is not needed.
        cap.release()
        if out is not None:
            out.release()
255
+
256
def apply_zoom_in_effect(clip, zoom_factor=1.2):
    """Return ``clip`` with a linear zoom from 1x up to ``zoom_factor``.

    Each frame is enlarged by a scale that grows with ``t / clip.duration``
    and then center-cropped back to the clip's original size.
    """
    base_w, base_h = clip.size
    total = clip.duration

    def _zoomed_frame(get_frame, t):
        source = get_frame(t)
        scale = 1 + (zoom_factor - 1) * (t / total)
        scaled_w = int(base_w * scale)
        scaled_h = int(base_h * scale)
        enlarged = cv2.resize(source, (scaled_w, scaled_h))

        # Offsets that keep the crop window centered in the enlarged frame.
        left = (scaled_w - base_w) // 2
        top = (scaled_h - base_h) // 2
        return enlarged[top:top + base_h, left:left + base_w]

    return clip.fl(_zoomed_frame, apply_to=['mask'])
273
+
274
@tool(args_schema=VideoGeneration)
def create_video_from_images_and_audio(images_dir, speeches_dir, zoom_factor=1.2):
    """Creates video using images and audios.
    Args:
    images_dir: path to images folder, example 'outputs/images'
    speeches_dir: path to speeches folder, example 'outputs/speeches'"""
    # The docstring above doubles as the tool description, so implementation
    # notes live in comments instead.
    #
    # For each image/audio pair (matched by sorted position) this builds a
    # zoomed still clip as long as the audio, transcribes the audio to get the
    # caption text, burns the caption in, and re-attaches the audio. The
    # segments are then concatenated into outputs/final_video/final_video.mp4,
    # whose absolute path is returned. Raises ValueError when either folder
    # is empty.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    images_root = os.path.join(base_dir, images_dir)
    speeches_root = os.path.join(base_dir, speeches_dir)
    output_dir = os.path.join(base_dir, "outputs/final_video")
    os.makedirs(output_dir, exist_ok=True)
    final_path = os.path.join(output_dir, "final_video.mp4")

    def natural_key(name):
        # Sort "image_2" before "image_10": plain string sorting would put
        # segment 10 ahead of segment 2 once there are more than ten.
        return [int(part) if part.isdigit() else part.lower() for part in re.split(r'(\d+)', name)]

    images_paths = sorted(os.listdir(images_root), key=natural_key)
    audio_paths = sorted(os.listdir(speeches_root), key=natural_key)
    if not images_paths or not audio_paths:
        raise ValueError(
            f"No images in '{images_dir}' or no speeches in '{speeches_dir}'; nothing to assemble."
        )

    clips = []
    audio_clips = []
    temp_files = []
    try:
        # zip() stops at the shorter list, like the original min(len, len).
        for i, (image_name, audio_name) in enumerate(zip(images_paths, audio_paths)):
            audio_path = os.path.join(speeches_root, audio_name)
            img_clip = ImageClip(os.path.join(images_root, image_name))
            audioclip = AudioFileClip(audio_path)
            audio_clips.append(audioclip)
            # The still image lasts exactly as long as its narration.
            zoomed_clip = apply_zoom_in_effect(img_clip.set_duration(audioclip.duration), zoom_factor)

            # Caption text comes from transcribing the synthesized audio.
            with open(audio_path, "rb") as file:
                transcription = client.audio.transcriptions.create(
                    file=(audio_name, file.read()),
                    model="whisper-large-v3",
                    response_format="verbose_json",
                )
            caption = transcription.text

            # Register temp files before writing so they are cleaned up even
            # if the write itself fails.
            temp_video_path = os.path.join(output_dir, f"temp_zoomed_{i}.mp4")
            temp_files.append(temp_video_path)
            zoomed_clip.write_videofile(temp_video_path, codec='libx264', fps=24)

            captioned_path = os.path.join(output_dir, f"temp_captioned_{i}.mp4")
            temp_files.append(captioned_path)
            add_text_to_video(temp_video_path, captioned_path, caption, duration=1, fontsize=60)

            # The caption pass writes video only, so re-attach the audio.
            clips.append(VideoFileClip(captioned_path).set_audio(audioclip))

        final_clip = concatenate_videoclips(clips)
        final_clip.write_videofile(final_path, codec='libx264', fps=24)
    finally:
        # Close readers before deleting the files they hold open.
        for clip in clips + audio_clips:
            clip.close()

        for temp_file in temp_files:
            if not os.path.exists(temp_file):
                continue
            try:
                os.remove(temp_file)
            except OSError as e:
                print(f"Error removing file {temp_file}: {e}")

    return final_path
327
+
328
+ # Example usage
329
+ # image_paths = "outputs/images"
330
+ # audio_paths = "outputs/speeches"
331
+
332
+ # video_path = create_video_from_images_and_audio(image_paths, audio_paths)
333
+ # print(f"Video created at: {video_path}")
334
+
335
class WikiInputs(BaseModel):
    """Inputs to the wikipedia tool."""

    # Short search phrase; the description below is shown to the agent.
    query: str = Field(
        description="query to look up in Wikipedia, should be 3 or less words",
    )
338
+
339
# Wikipedia backend configured with top_k_results=3.
api_wrapper = WikipediaAPIWrapper(top_k_results=3)

# Structured Wikipedia tool that takes its arguments through WikiInputs.
wiki_tool = WikipediaQueryRun(
    api_wrapper=api_wrapper,
    args_schema=WikiInputs,
    name="wiki-tool",
    description="{query:'input here'}",
    return_direct=True,
)

# Plain single-input wrapper around the same lookup.
wiki = Tool(
    func=wiki_tool.run,
    name='wikipedia',
    description="{query:'input here'}",
)
354
+
355
+ # wiki_tool.run("latest news in India")
356
+
357
+ # @tool
358
def process_script(script):
    """Extract image prompts and narration sentences from a tagged script.

    Args:
        script: Text containing ``<image>...</image>`` and
            ``<narration>...</narration>`` spans. A closing tag written
            without its slash is tolerated.

    Returns:
        A dict with two lists of strings, in order of appearance:
        ``'text_for_image_generation'`` and ``'text_for_speech_generation'``.
        Only the text between the tags is returned, with surrounding
        whitespace stripped.
    """
    script = script or ''

    def extract(tag):
        # The capture group returns the inner text only; without it findall()
        # yields the whole match, tags included, which then got spoken aloud.
        # DOTALL lets a span continue across line breaks.
        pattern = rf'<{tag}>(.*?)</?{tag}>'
        return [match.strip() for match in re.findall(pattern, script, flags=re.DOTALL)]

    return {
        'text_for_image_generation': extract('image'),
        'text_for_speech_generation': extract('narration'),
    }
364
+
365
@tool
def image_generator(script):
    """Generates images for the given script.
    Saves it to images_dir and return path
    Args:
    script: a complete script containing narrations and image descriptions"""
    # The docstring above doubles as the tool description, so implementation
    # notes live in comments instead.
    #
    # Clears outputs/images, then requests one 9:16 PNG per <image> prompt
    # from the Stability AI "core" endpoint and saves it as image_<i>.png.
    # Raises RuntimeError when the API key is missing and Exception with the
    # server's message when a request is rejected.
    images_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), './outputs/images')
    os.makedirs(images_dir, exist_ok=True)

    api_key = os.environ.get('STABILITY_AI_API_KEY')
    if not api_key:
        # Fail before deleting earlier results or spending a network call.
        raise RuntimeError('STABILITY_AI_API_KEY is not set.')

    prompts = process_script(script)['text_for_image_generation']

    # Start from an empty folder so stale images cannot be paired with the
    # new speech clips.
    for filename in os.listdir(images_dir):
        file_path = os.path.join(images_dir, filename)
        if os.path.isfile(file_path):
            os.remove(file_path)

    for i, text in enumerate(prompts):
        response = requests.post(
            "https://api.stability.ai/v2beta/stable-image/generate/core",
            headers={
                "authorization": api_key,
                "accept": "image/*"
            },
            # The dummy part makes requests encode the body as multipart form data.
            files={"none": ''},
            data={
                "prompt": text,
                "output_format": "png",
                'aspect_ratio': "9:16",
            },
            # Without a timeout a stalled connection would block the crew forever.
            timeout=120,
        )

        if response.status_code != 200:
            # Error bodies are not guaranteed to be JSON; fall back to the
            # raw text so the real reason is not lost.
            try:
                detail = response.json()
            except ValueError:
                detail = response.text
            raise Exception(str(detail))

        with open(os.path.join(images_dir, f'image_{i}.png'), 'wb') as file:
            file.write(response.content)

    return 'images generated.'
402
+
403
@tool
def speech_generator(script):
    """Generates speech for given text
    Saves it to speech_dir and return path
    Args:
    script: a complete script containing narrations and image descriptions"""
    # The docstring above doubles as the tool description, so implementation
    # notes live in comments instead.
    #
    # Clears outputs/speeches, then synthesizes one clip per <narration>
    # entry, saved by generate_speech as speech_<i>.mp3.
    speech_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), './outputs/speeches')
    os.makedirs(speech_dir, exist_ok=True)

    # Start from an empty folder so stale clips cannot be paired with the
    # new images.
    for filename in os.listdir(speech_dir):
        file_path = os.path.join(speech_dir, filename)
        if os.path.isfile(file_path):
            os.remove(file_path)

    narrations = process_script(script)['text_for_speech_generation']
    for i, text in enumerate(narrations):
        # Drop any <narration> markup still attached to the sentence so the
        # TTS engine only receives the words to be spoken.
        spoken_text = re.sub(r'</?narration>', '', text).strip()
        generate_speech(spoken_text, speech_dir, num=i)

    return 'speechs generated.'