Tokenization issue
#4
by
DareDeviLXDD
- opened
The tokens bundled with the model produce strange output.
Sorry to bother you, but can you provide a small implementation for inferencing the model?
So far I've implemented it like this:
import numpy as np
import onnxruntime as ort
from scipy.io.wavfile import write
import json
import subprocess
# Load the exported model and the companion piper-style config that ships
# with it; the config carries the phoneme-to-id mapping used for tokenization.
session = ort.InferenceSession('jarvis-high.onnx')

with open('config.json', 'r') as file:
    data = json.load(file)

# Mapping of phoneme character -> list of token ids (empty if absent).
phoneme_id_map = data.get('phoneme_id_map', {})
def text_to_sequence(text):
    """Turn raw text into a sequence of model token ids.

    Lower-cases the input, phonemizes it via espeak, then maps the
    phoneme string through the config's phoneme id map.
    """
    lowered = text.lower()
    return phonemes_to_tokens(text_to_phonemes(lowered))
def text_to_phonemes(text):
    """Phonemize *text* with espeak and return the phoneme string.

    Fixes vs. the original:
      * ``-q`` keeps espeak quiet — without it, espeak also speaks the
        text aloud while printing the phonemes.
      * ``--ipa`` emits IPA phoneme characters. The original ``-x`` emits
        Kirshenbaum ASCII mnemonics, which do not match a piper-style
        ``phoneme_id_map`` (keyed on IPA characters) and so produce wrong
        token ids — the likely cause of the "strange output".
        NOTE(review): confirm your espeak build supports ``--ipa``
        (espeak-ng does).
      * ``check=True`` raises CalledProcessError instead of silently
        returning garbage when espeak fails.

    Returns:
        The phoneme string with surrounding whitespace stripped.
    """
    result = subprocess.run(
        ['espeak', '-v', 'en-gb-x-rp', '-q', '--ipa', text],
        stdout=subprocess.PIPE,
        check=True,
    )
    return result.stdout.decode('utf-8').strip()
def phonemes_to_tokens(phonemes, id_map=None):
    """Map a phoneme string to a flat list of model token ids.

    Each character of *phonemes* is looked up individually — piper-style
    maps key single phoneme characters to a list of ids.

    Args:
        phonemes: string of phoneme characters (e.g. espeak output).
        id_map: optional mapping of phoneme -> list of ids; defaults to the
            module-level ``phoneme_id_map`` loaded from config.json. The
            parameter makes the function reusable and testable in isolation.

    Returns:
        list of int token ids; unknown phonemes are skipped with a warning.
    """
    if id_map is None:
        id_map = phoneme_id_map
    token_ids = []
    for phoneme in phonemes:
        if phoneme in id_map:
            token_ids.extend(id_map[phoneme])
        else:
            print(f"Warning: Phoneme ID for phoneme '{phoneme}' not found!")
    return token_ids
# --- Inference driver ----------------------------------------------------
text_input = "hello, sir!"

# Tokenize: text -> phonemes -> id sequence, shaped (1, seq_len) int64.
phoneme_sequence = text_to_sequence(text_input)
phoneme_sequence_np = np.array(phoneme_sequence, dtype=np.int64)[np.newaxis, :]
input_length = np.array([phoneme_sequence_np.shape[1]], dtype=np.int64)

# The model takes three scalar controls in this exact order.
scales = np.array([
    data['inference']['noise_scale'],
    data['inference']['length_scale'],
    data['inference']['noise_w'],
], dtype=np.float32)

inputs = {
    'input': phoneme_sequence_np,
    'input_lengths': input_length,
    'scales': scales,
}

outputs = session.run(None, inputs)
# First output is the audio; squeeze away batch/channel dims to 1-D.
waveform = np.squeeze(outputs[0])

# Fix: read the sample rate from config.json rather than hard-coding 22050 —
# piper-style configs store it under audio.sample_rate, and a mismatch makes
# the audio play at the wrong speed/pitch. Falls back to 22050 if absent.
rate = int(data.get('audio', {}).get('sample_rate', 22050))
write("output.wav", rate, waveform)
print("Audio saved as 'output.wav'")
@DareDeviLXDD I've only used this with Piper and haven't tried inferencing it manually so I'm afraid I can't help you there. I'd suggest looking into how Piper uses these models to see if there's anything to change about your approach.
Okay, thank you! Also, the quality you've got there is great — keep it up!