speechbrainteam yingzhi committed on
Commit 3df4496
1 Parent(s): e33be7f

add information for inference (#4)


- add comments (66f6facf1de25d1680c50847a3d0dc5649d3204a)


Co-authored-by: Yingzhi Wang <yingzhi@users.noreply.huggingface.co>

Files changed (1)
README.md +13 -2
README.md CHANGED

@@ -48,7 +48,13 @@ hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", saved
 
 # Run TTS with text input
 input_text = "were the leaders in this luckless change; though our own Baskerville; who was at work some years before them; went much on the same lines;"
-mel_output, durations, pitch, energy = fastspeech2.encode_text([input_text])
+
+mel_output, durations, pitch, energy = fastspeech2.encode_text(
+    [input_text],
+    pace=1.0,         # scale up/down the speed
+    pitch_rate=1.0,   # scale up/down the pitch
+    energy_rate=1.0,  # scale up/down the energy
+)
 
 # Running Vocoder (spectrogram-to-waveform)
 waveforms = hifi_gan.decode_batch(mel_output)
@@ -59,7 +65,12 @@ torchaudio.save('example_TTS_input_text.wav', waveforms.squeeze(1), 22050)
 
 # Run TTS with phoneme input
 input_phonemes = ['W', 'ER', 'DH', 'AH', 'L', 'IY', 'D', 'ER', 'Z', 'IH', 'N', 'DH', 'IH', 'S', 'L', 'AH', 'K', 'L', 'AH', 'S', 'CH', 'EY', 'N', 'JH', 'spn', 'DH', 'OW', 'AW', 'ER', 'OW', 'N', 'B', 'AE', 'S', 'K', 'ER', 'V', 'IH', 'L', 'spn', 'HH', 'UW', 'W', 'AA', 'Z', 'AE', 'T', 'W', 'ER', 'K', 'S', 'AH', 'M', 'Y', 'IH', 'R', 'Z', 'B', 'IH', 'F', 'AO', 'R', 'DH', 'EH', 'M', 'spn', 'W', 'EH', 'N', 'T', 'M', 'AH', 'CH', 'AA', 'N', 'DH', 'AH', 'S', 'EY', 'M', 'L', 'AY', 'N', 'Z', 'spn']
-mel_output, durations, pitch, energy = fastspeech2.encode_phoneme([input_phonemes])
+mel_output, durations, pitch, energy = fastspeech2.encode_phoneme(
+    [input_phonemes],
+    pace=1.0,         # scale up/down the speed
+    pitch_rate=1.0,   # scale up/down the pitch
+    energy_rate=1.0,  # scale up/down the energy
+)
 
 # Running Vocoder (spectrogram-to-waveform)
 waveforms = hifi_gan.decode_batch(mel_output)
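
For reference, a minimal end-to-end sketch of the inference flow this commit documents. It assumes the pretrained interfaces are imported from speechbrain.pretrained (newer SpeechBrain releases expose them under speechbrain.inference instead), that the acoustic model id is speechbrain/tts-fastspeech2-ljspeech, and that the savedir paths, the shortened phoneme sequence, and the second output filename are illustrative rather than taken from the README:

import torchaudio
from speechbrain.pretrained import FastSpeech2, HIFIGAN  # assumed import path

# Load the spectrogram generator and the vocoder (savedir paths are illustrative).
fastspeech2 = FastSpeech2.from_hparams(
    source="speechbrain/tts-fastspeech2-ljspeech",   # assumed FastSpeech2 model id
    savedir="pretrained_models/tts-fastspeech2-ljspeech",
)
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="pretrained_models/tts-hifigan-ljspeech",
)

# Run TTS with text input, using the prosody controls added in this commit.
input_text = "were the leaders in this luckless change; though our own Baskerville; who was at work some years before them; went much on the same lines;"
mel_output, durations, pitch, energy = fastspeech2.encode_text(
    [input_text],
    pace=1.0,         # scale up/down the speed
    pitch_rate=1.0,   # scale up/down the pitch
    energy_rate=1.0,  # scale up/down the energy
)
waveforms = hifi_gan.decode_batch(mel_output)
torchaudio.save("example_TTS_input_text.wav", waveforms.squeeze(1), 22050)

# The phoneme entry point accepts the same controls (sequence shortened here).
input_phonemes = ['W', 'ER', 'DH', 'AH', 'L', 'IY', 'D', 'ER', 'Z']
mel_output, durations, pitch, energy = fastspeech2.encode_phoneme(
    [input_phonemes],
    pace=1.0,
    pitch_rate=1.0,
    energy_rate=1.0,
)
waveforms = hifi_gan.decode_batch(mel_output)
torchaudio.save("example_TTS_input_phonemes.wav", waveforms.squeeze(1), 22050)  # illustrative filename

Leaving pace, pitch_rate, and energy_rate at 1.0 keeps speed, pitch, and energy unscaled, matching the behaviour of the previous single-argument calls.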