lmzjms committed on
Commit
5443283
·
1 Parent(s): 82b4d1d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -27
app.py CHANGED
@@ -154,23 +154,23 @@ class ConversationBot:
154
  return gr.Button.update(visible=False)
155
  def init_agent(self, openai_api_key):
156
  self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
157
- self.t2i = T2I(device="cuda:0")
158
- self.i2t = ImageCaptioning(device="cuda:0")
159
  self.t2a = T2A(device="cuda:0")
160
  self.tts = TTS(device="cuda:0")
161
- self.t2s = T2S(device="cuda:0")
162
- self.i2a = I2A(device="cuda:0")
163
  self.a2t = A2T(device="cuda:0")
164
- self.asr = ASR(device="cuda:0")
165
- self.inpaint = Inpaint(device="cuda:0")
166
  #self.tts_ood = TTS_OOD(device="cuda:0")
167
  self.tools = [
168
- Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
169
- description="useful for when you want to generate an image from a user input text and it saved it to a file. like: generate an image of an object or something, or generate an image that includes some objects. "
170
- "The input to this tool should be a string, representing the text used to generate image. "),
171
- Tool(name="Get Photo Description", func=self.i2t.inference,
172
- description="useful for when you want to know what is inside the photo. receives image_path as input. "
173
- "The input to this tool should be a string, representing the image_path. "),
174
  Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
175
  description="useful for when you want to generate an audio from a user input text and it saved it to a file."
176
  "The input to this tool should be a string, representing the text used to generate audio."),
@@ -179,27 +179,27 @@ class ConversationBot:
179
  # description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
180
  # "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
181
  # "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
182
- Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
183
- description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) and save it to a file."
184
- "If Like: Generate a piece of singing voice, the input to this tool should be \"\" since there is no User Input Text, Note and Duration Sequence ."
185
- "If Like: Generate a piece of singing voice. Text: xxx, Note: xxx, Duration: xxx. "
186
- "Or Like: Generate a piece of singing voice. Text is xxx, note is xxx, duration is xxx."
187
- "The input to this tool should be a comma seperated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
188
  Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
189
  description="useful for when you want to convert a user input text into speech audio it saved it to a file."
190
  "The input to this tool should be a string, representing the text used to be converted to speech."),
191
- Tool(name="Generate Audio From The Image", func=self.i2a.inference,
192
- description="useful for when you want to generate an audio based on an image."
193
- "The input to this tool should be a string, representing the image_path. "),
194
  Tool(name="Generate Text From The Audio", func=self.a2t.inference,
195
  description="useful for when you want to describe an audio in text, receives audio_path as input."
196
  "The input to this tool should be a string, representing the audio_path."),
197
- Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
198
- description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
199
- "The input to this tool should be a string, representing the audio_path."),
200
- Tool(name="Transcribe speech", func=self.asr.inference,
201
- description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
202
- "The input to this tool should be a string, representing the audio_path.")]
203
  self.agent = initialize_agent(
204
  self.tools,
205
  self.llm,
 
154
  return gr.Button.update(visible=False)
155
  def init_agent(self, openai_api_key):
156
  self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
157
+ # self.t2i = T2I(device="cuda:0")
158
+ # self.i2t = ImageCaptioning(device="cuda:0")
159
  self.t2a = T2A(device="cuda:0")
160
  self.tts = TTS(device="cuda:0")
161
+ # self.t2s = T2S(device="cuda:0")
162
+ # self.i2a = I2A(device="cuda:0")
163
  self.a2t = A2T(device="cuda:0")
164
+ # self.asr = ASR(device="cuda:0")
165
+ # self.inpaint = Inpaint(device="cuda:0")
166
  #self.tts_ood = TTS_OOD(device="cuda:0")
167
  self.tools = [
168
+ # Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
169
+ # description="useful for when you want to generate an image from a user input text and it saved it to a file. like: generate an image of an object or something, or generate an image that includes some objects. "
170
+ # "The input to this tool should be a string, representing the text used to generate image. "),
171
+ # Tool(name="Get Photo Description", func=self.i2t.inference,
172
+ # description="useful for when you want to know what is inside the photo. receives image_path as input. "
173
+ # "The input to this tool should be a string, representing the image_path. "),
174
  Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
175
  description="useful for when you want to generate an audio from a user input text and it saved it to a file."
176
  "The input to this tool should be a string, representing the text used to generate audio."),
 
179
  # description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
180
  # "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
181
  # "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
182
+ # Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
183
+ # description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) and save it to a file."
184
+ # "If Like: Generate a piece of singing voice, the input to this tool should be \"\" since there is no User Input Text, Note and Duration Sequence ."
185
+ # "If Like: Generate a piece of singing voice. Text: xxx, Note: xxx, Duration: xxx. "
186
+ # "Or Like: Generate a piece of singing voice. Text is xxx, note is xxx, duration is xxx."
187
+ # "The input to this tool should be a comma seperated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
188
  Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
189
  description="useful for when you want to convert a user input text into speech audio it saved it to a file."
190
  "The input to this tool should be a string, representing the text used to be converted to speech."),
191
+ # Tool(name="Generate Audio From The Image", func=self.i2a.inference,
192
+ # description="useful for when you want to generate an audio based on an image."
193
+ # "The input to this tool should be a string, representing the image_path. "),
194
  Tool(name="Generate Text From The Audio", func=self.a2t.inference,
195
  description="useful for when you want to describe an audio in text, receives audio_path as input."
196
  "The input to this tool should be a string, representing the audio_path."),
197
+ # Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
198
+ # description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
199
+ # "The input to this tool should be a string, representing the audio_path."),
200
+ # Tool(name="Transcribe speech", func=self.asr.inference,
201
+ # description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
202
+ # "The input to this tool should be a string, representing the audio_path.")]
203
  self.agent = initialize_agent(
204
  self.tools,
205
  self.llm,