mikhail-panzo committed on
Commit
8101957
1 Parent(s): 3eae451

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -13
app.py CHANGED
@@ -1,18 +1,49 @@
1
  import gradio as gr
2
- from transformers import pipeline
 
 
3
 
4
- pipeline = pipeline(task="image-classification", model="julien-c/hotdog-not-hotdog")
5
 
6
- def predict(input_img):
7
- predictions = pipeline(input_img)
8
- return input_img, {p["label"]: p["score"] for p in predictions}
 
9
 
10
- gradio_app = gr.Interface(
11
- predict,
12
- inputs=gr.Image(label="Select hot dog candidate", sources=['upload', 'webcam'], type="pil"),
13
- outputs=[gr.Image(label="Processed Image"), gr.Label(label="Result", num_top_classes=2)],
14
- title="Hot Dog? Or Not?",
15
- )
16
 
17
- if __name__ == "__main__":
18
- gradio_app.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import librosa
3
+ import numpy as np
4
+ import torch
5
 
6
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
 
8
+ checkpoint = "mikhail-panzo/zlm-fil-ceb_b64_le5_s8000"
9
+ processor = SpeechT5Processor.from_pretrained(checkpoint)
10
+ model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
11
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
12
 
13
+ def predict(text, speaker):
14
+ if len(text.strip()) == 0:
15
+ return (16000, np.zeros(0).astype(np.int16))
 
 
 
16
 
17
+ inputs = processor(text=text, return_tensors="pt")
18
+
19
+ # limit input length
20
+ input_ids = inputs["input_ids"]
21
+ input_ids = input_ids[..., :model.config.max_text_positions]
22
+
23
+ speaker_embedding = np.load(spe)
24
+
25
+ speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
26
+
27
+ speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
28
+
29
+ speech = (speech.numpy() * 32767).astype(np.int16)
30
+ return (16000, speech)
31
+
32
+
33
+ title = "CEB SEQ TTS"
34
+
35
+ description = """
36
+ This demo system is intended for survey purposes only.
37
+ """
38
+
39
+ gr.Interface(
40
+ fn=predict,
41
+ inputs=[
42
+ gr.Text(label="Input Text")
43
+ ],
44
+ outputs=[
45
+ gr.Audio(label="Generated Speech", type="numpy"),
46
+ ],
47
+ title=title,
48
+ description=description,
49
+ ).launch()