import gradio as gr
from transformers import pipeline
from espnet2.bin.tts_inference import Text2Speech

# Sampling rate used only when the TTS object does not report one itself.
FALLBACK_SAMPLE_RATE = 24000


def generateTextAndAudio(inputText, numGen):
    """Continue ``inputText`` with the text generator, then synthesize it.

    Args:
        inputText: Prompt text supplied by the user.
        numGen: Value passed to the text generator as ``max_length``.

    Returns:
        A tuple ``(genText, (sampleRate, wav))`` where ``genText`` is the
        generated text and ``wav`` is the synthesized audio as a numpy array.
    """
    # --- Generating the Text ---
    # The slider value may arrive as a float; cast so `max_length` is an int.
    textOutput = textGenerator(inputText, max_length=int(numGen))

    # The text generator returns a list of dictionaries; take the first one
    # and read its text through the `generated_text` key.
    genText = textOutput[0]['generated_text']

    print("Input Text:", inputText)
    print("Generated Text:", genText)

    # --- Generating the Audio ---
    # With the newly generated text, generate some speech.
    audioOutput = audioGenerator(genText)

    # Get the wav data.
    genAudio = audioOutput['wav']

    # Use the sampling rate reported by the TTS object rather than a
    # hard-coded value, so playback speed matches the loaded model.
    sampleRate = getattr(audioGenerator, "fs", None) or FALLBACK_SAMPLE_RATE

    # Move the tensor to the CPU before converting it to numpy; this is a
    # no-op when the tensor already lives on the CPU.
    return genText, (sampleRate, genAudio.cpu().numpy())


# Main

textGenerator = pipeline('text-generation', model='gpt2')
audioGenerator = Text2Speech.from_pretrained(
    "espnet/kan-bayashi_ljspeech_joint_finetune_conformer_fastspeech2_hifigan"
)

input1_textbox = gr.Textbox(label="Input text")
# `value` sets the slider's initial position.
input2_slider = gr.Slider(
    minimum=1,
    maximum=100,
    step=1,
    value=30,
    label="Number of words to generate",
)

output1_textbox = gr.Textbox(label="Generated Text")
output2_Audio = gr.Audio(label="Generated Audio")

title = "Generate Text and it's Audio!"
description = "Provide the text, and how many subwords to generate"

examples = [
    ["I won a", 50],
    ["My name is", 30],
    ["I have", 60],
]

# NOTE(review): the article text is only a newline here; any markup it was
# meant to contain may be missing -- confirm against the intended content.
article = "\n"

# Keep `iface` bound to the Interface itself, then start the app.
iface = gr.Interface(
    fn=generateTextAndAudio,
    inputs=[input1_textbox, input2_slider],
    outputs=[output1_textbox, output2_Audio],
    title=title,
    description=description,
    examples=examples,
    article=article,
)
iface.launch(debug=True)