Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -154,23 +154,23 @@ class ConversationBot:
|
|
154 |
return gr.Button.update(visible=False)
|
155 |
def init_agent(self, openai_api_key):
|
156 |
self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
|
157 |
-
self.t2i = T2I(device="cuda:0")
|
158 |
-
self.i2t = ImageCaptioning(device="cuda:0")
|
159 |
self.t2a = T2A(device="cuda:0")
|
160 |
self.tts = TTS(device="cuda:0")
|
161 |
-
self.t2s = T2S(device="cuda:0")
|
162 |
-
self.i2a = I2A(device="cuda:0")
|
163 |
self.a2t = A2T(device="cuda:0")
|
164 |
-
self.asr = ASR(device="cuda:0")
|
165 |
-
self.inpaint = Inpaint(device="cuda:0")
|
166 |
#self.tts_ood = TTS_OOD(device="cuda:0")
|
167 |
self.tools = [
|
168 |
-
Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
|
169 |
-
|
170 |
-
|
171 |
-
Tool(name="Get Photo Description", func=self.i2t.inference,
|
172 |
-
|
173 |
-
|
174 |
Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
|
175 |
description="useful for when you want to generate an audio from a user input text and it saved it to a file."
|
176 |
"The input to this tool should be a string, representing the text used to generate audio."),
|
@@ -179,27 +179,27 @@ class ConversationBot:
|
|
179 |
# description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
|
180 |
# "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
|
181 |
# "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
|
182 |
-
Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
|
189 |
description="useful for when you want to convert a user input text into speech audio it saved it to a file."
|
190 |
"The input to this tool should be a string, representing the text used to be converted to speech."),
|
191 |
-
Tool(name="Generate Audio From The Image", func=self.i2a.inference,
|
192 |
-
|
193 |
-
|
194 |
Tool(name="Generate Text From The Audio", func=self.a2t.inference,
|
195 |
description="useful for when you want to describe an audio in text, receives audio_path as input."
|
196 |
"The input to this tool should be a string, representing the audio_path."),
|
197 |
-
Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
|
198 |
-
|
199 |
-
|
200 |
-
Tool(name="Transcribe speech", func=self.asr.inference,
|
201 |
-
|
202 |
-
|
203 |
self.agent = initialize_agent(
|
204 |
self.tools,
|
205 |
self.llm,
|
|
|
154 |
return gr.Button.update(visible=False)
|
155 |
def init_agent(self, openai_api_key):
|
156 |
self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
|
157 |
+
# self.t2i = T2I(device="cuda:0")
|
158 |
+
# self.i2t = ImageCaptioning(device="cuda:0")
|
159 |
self.t2a = T2A(device="cuda:0")
|
160 |
self.tts = TTS(device="cuda:0")
|
161 |
+
# self.t2s = T2S(device="cuda:0")
|
162 |
+
# self.i2a = I2A(device="cuda:0")
|
163 |
self.a2t = A2T(device="cuda:0")
|
164 |
+
# self.asr = ASR(device="cuda:0")
|
165 |
+
# self.inpaint = Inpaint(device="cuda:0")
|
166 |
#self.tts_ood = TTS_OOD(device="cuda:0")
|
167 |
self.tools = [
|
168 |
+
# Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
|
169 |
+
# description="useful for when you want to generate an image from a user input text and it saved it to a file. like: generate an image of an object or something, or generate an image that includes some objects. "
|
170 |
+
# "The input to this tool should be a string, representing the text used to generate image. "),
|
171 |
+
# Tool(name="Get Photo Description", func=self.i2t.inference,
|
172 |
+
# description="useful for when you want to know what is inside the photo. receives image_path as input. "
|
173 |
+
# "The input to this tool should be a string, representing the image_path. "),
|
174 |
Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
|
175 |
description="useful for when you want to generate an audio from a user input text and it saved it to a file."
|
176 |
"The input to this tool should be a string, representing the text used to generate audio."),
|
|
|
179 |
# description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
|
180 |
# "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
|
181 |
# "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
|
182 |
+
# Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
|
183 |
+
# description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) and save it to a file."
|
184 |
+
# "If Like: Generate a piece of singing voice, the input to this tool should be \"\" since there is no User Input Text, Note and Duration Sequence ."
|
185 |
+
# "If Like: Generate a piece of singing voice. Text: xxx, Note: xxx, Duration: xxx. "
|
186 |
+
# "Or Like: Generate a piece of singing voice. Text is xxx, note is xxx, duration is xxx."
|
187 |
+
# "The input to this tool should be a comma seperated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
|
188 |
Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
|
189 |
description="useful for when you want to convert a user input text into speech audio it saved it to a file."
|
190 |
"The input to this tool should be a string, representing the text used to be converted to speech."),
|
191 |
+
# Tool(name="Generate Audio From The Image", func=self.i2a.inference,
|
192 |
+
# description="useful for when you want to generate an audio based on an image."
|
193 |
+
# "The input to this tool should be a string, representing the image_path. "),
|
194 |
Tool(name="Generate Text From The Audio", func=self.a2t.inference,
|
195 |
description="useful for when you want to describe an audio in text, receives audio_path as input."
|
196 |
"The input to this tool should be a string, representing the audio_path."),
|
197 |
+
# Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
|
198 |
+
# description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
|
199 |
+
# "The input to this tool should be a string, representing the audio_path."),
|
200 |
+
# Tool(name="Transcribe speech", func=self.asr.inference,
|
201 |
+
# description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
|
202 |
+
# "The input to this tool should be a string, representing the audio_path.")]
|
203 |
self.agent = initialize_agent(
|
204 |
self.tools,
|
205 |
self.llm,
|