# Persian (Farsi) text-to-speech demo: Piper TTS with hazm text preprocessing,
# served through a Gradio interface on Hugging Face Spaces.
import gradio as gr
import wave
import numpy as np
from io import BytesIO
from huggingface_hub import hf_hub_download
from piper import PiperVoice
from transformers import pipeline
import hazm
import typing
# hazm NLP components used to normalize and tokenize Persian input text.
normalizer = hazm.Normalizer()
sent_tokenizer = hazm.SentenceTokenizer()
word_tokenizer = hazm.WordTokenizer()
# POS-tagger model fetched from the Hugging Face Hub (network side effect at
# import time); used by fix_words to detect the ezafe construction ("...Z" tags).
tagger_path = hf_hub_download(repo_id="gyroing/HAZM_POS_TAGGER", filename="pos_tagger.model")
tagger = hazm.POSTagger(model=tagger_path)
# Piper TTS voice: ONNX model plus its JSON config, also downloaded from the Hub.
model_path = hf_hub_download(repo_id="gyroing/Persian-Piper-Model-gyro", filename="fa_IR-gyro-medium.onnx")
config_path = hf_hub_download(repo_id="gyroing/Persian-Piper-Model-gyro", filename="fa_IR-gyro-medium.onnx.json")
voice = PiperVoice.load(model_path, config_path)
def preprocess_text(text: str) -> str:
    """Normalize *text* with hazm and return it as one ezafe-fixed string.

    The text is normalized, split into sentences, tokenized into words, and
    each sentence's words are passed through ``fix_words`` so the ezafe
    construction is written explicitly for the TTS model.

    Note: the original annotation claimed ``List[List[str]]``, but the
    function joins everything back together and returns a single ``str``.
    """
    text = normalizer.normalize(text)
    processed_sentences = []
    for sentence in sent_tokenizer.tokenize(text):
        words = word_tokenizer.tokenize(sentence)
        processed_sentences.append(" ".join(fix_words(words)))
    return " ".join(processed_sentences)
def fix_words(words: typing.List[str]) -> typing.List[str]:
    """Append the ezafe marker to words whose POS tag ends in "Z".

    hazm marks the ezafe construction with a trailing "Z" on the POS tag.
    Such words get the kasra diacritic "ِ" appended unless it is already
    present; words ending in "ه" (but not "اه") first receive "ی" so the
    ezafe is pronounced correctly.
    """
    fixed_words = []
    for word, pos in tagger.tag(words):
        # Guard empty tokens/tags so the negative indexing below cannot raise.
        if not word or not pos:
            fixed_words.append(word)
            continue
        if pos[-1] == "Z" and word[-1] != "ِ":
            # word[-2] needs at least two characters; the original code
            # raised IndexError on a lone "ه" token.
            if len(word) > 1 and word[-1] == "ه" and word[-2] != "ا":
                word += "ی"
            word += "ِ"
        fixed_words.append(word)
    return fixed_words
def synthesize_speech(text):
    """Synthesize Persian speech for *text*.

    Returns a ``(sample_rate, numpy.int16 array)`` tuple, which is the shape
    ``gr.Audio(type="numpy")`` expects.
    """
    # Render the WAV into an in-memory buffer.
    buffer = BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit PCM
        wav_file.setnchannels(1)  # mono
        eztext = preprocess_text(text)
        voice.synthesize(eztext, wav_file)
    # Re-read only the PCM frames. The original np.frombuffer(buffer.read())
    # also decoded the RIFF/WAV header bytes as int16 samples, producing a
    # click at the start of the audio.
    buffer.seek(0)
    with wave.open(buffer, 'rb') as wav_read:
        frames = wav_read.readframes(wav_read.getnframes())
    audio_data = np.frombuffer(frames, dtype=np.int16)
    return voice.config.sample_rate, audio_data
# Build the Gradio UI: one textbox feeding the synthesizer, audio as output.
# (The original line 64 ended with a stray " |" extraction artifact, which is
# a SyntaxError; it is removed here.)
with gr.Blocks(theme=gr.themes.Base()) as blocks:
    input_text = gr.Textbox(label="Input")
    output_audio = gr.Audio(label="Output", type="numpy")
    submit_button = gr.Button("Synthesize")
    submit_button.click(synthesize_speech, inputs=input_text, outputs=[output_audio])
# Run the app
blocks.launch()