File size: 2,535 Bytes
e4eb5c5
944dedf
 
 
 
4492d6d
19be65d
a82f51b
 
 
 
 
 
 
93b0a99
4fda610
3b61a4e
 
3e38fbb
a82f51b
 
 
 
 
 
 
 
 
 
4fda610
a82f51b
 
 
 
 
 
 
 
 
 
 
 
 
 
4eb15f6
944dedf
023b17c
19be65d
f6a94c1
023b17c
 
f6a94c1
 
 
 
 
ab3b67e
 
f6a94c1
023b17c
 
 
 
 
944dedf
1782e10
 
0bad579
6043e76
1782e10
 
023b17c
944dedf
023b17c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import gradio as gr
import wave
import numpy as np
from io import BytesIO
from huggingface_hub import hf_hub_download
from piper import PiperVoice 
from transformers import pipeline
import hazm
import typing

# hazm text-processing helpers. They are created once at import time and
# used by preprocess_text() for every request.
normalizer = hazm.Normalizer()
sent_tokenizer = hazm.SentenceTokenizer()
word_tokenizer = hazm.WordTokenizer()

# Download the POS tagger model from the Hugging Face Hub and load it into
# hazm; fix_words() uses the resulting tagger.
tagger_path = hf_hub_download(repo_id="gyroing/HAZM_POS_TAGGER",  filename="pos_tagger.model")
tagger = hazm.POSTagger(model=tagger_path)
# Download the Piper ONNX voice model and its JSON config, then load the
# voice that synthesize_speech() uses.
# NOTE(review): these downloads run at import time — presumably cached by
# huggingface_hub after the first run; confirm for the deployment target.
model_path = hf_hub_download(repo_id="gyroing/Persian-Piper-Model-gyro", filename="fa_IR-gyro-medium.onnx")
config_path = hf_hub_download(repo_id="gyroing/Persian-Piper-Model-gyro", filename="fa_IR-gyro-medium.onnx.json")
voice = PiperVoice.load(model_path, config_path)

def preprocess_text(text: str) -> str:
    """Normalize text with hazm and rebuild it from word-level fixes.

    The text is normalized, split into sentences and then into words. Each
    sentence's word list is passed through ``fix_words`` and re-joined with
    single spaces; the processed sentences are then joined with single
    spaces as well.

    Args:
        text: Raw input text.

    Returns:
        One string containing every processed sentence, separated by single
        spaces. An input that yields no sentences produces an empty string.
    """
    normalized = normalizer.normalize(text)
    return " ".join(
        " ".join(fix_words(word_tokenizer.tokenize(sentence)))
        for sentence in sent_tokenizer.tokenize(normalized)
    )
def fix_words(words: typing.List[str]) -> typing.List[str]:
    """Append a kasra to every word whose POS tag ends in "Z".

    The words are tagged with the module-level ``tagger``. For each word
    whose tag ends in "Z" (presumably hazm's ezafe marker — confirm against
    the tagger model) and which does not already end in a kasra:

    * if the word ends in heh that is not preceded by alef, a zero-width
      non-joiner followed by yeh is appended first;
    * a kasra is then appended.

    Words that already end in a kasra are left untouched, so the diacritic
    is never doubled. Empty words are passed through unchanged.

    Args:
        words: Tokens of a single sentence.

    Returns:
        A new list with the same number of words, in the same order.
    """
    kasra = "\u0650"           # ARABIC KASRA (combining diacritic)
    heh = "\u0647"             # ARABIC LETTER HEH
    alef = "\u0627"            # ARABIC LETTER ALEF
    zwnj_yeh = "\u200c\u06cc"  # ZERO WIDTH NON-JOINER + ARABIC LETTER FARSI YEH

    fixed_words = []
    for word, pos in tagger.tag(words):
        # endswith() is used instead of [-1]/[-2] indexing so that empty
        # tags and one-letter words cannot raise IndexError.
        if word and pos.endswith("Z") and not word.endswith(kasra):
            if word.endswith(heh) and not word.endswith(alef + heh):
                word += zwnj_yeh
            word += kasra
        fixed_words.append(word)

    return fixed_words

def synthesize_speech(text):
    """Synthesize ``text`` with the loaded Piper voice and return WAV bytes.

    Args:
        text: Input text. It is run through ``preprocess_text`` before being
            handed to the voice.

    Returns:
        bytes: The complete in-memory WAV file (header plus frames),
        configured as 16-bit mono at ``voice.config.sample_rate``.
    """
    eztext = preprocess_text(text)

    # Write the WAV file into memory instead of onto disk.
    buffer = BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono
        voice.synthesize(eztext, wav_file)

    # Return the file contents as-is. Viewing them as an int16 array and
    # converting straight back to bytes (as was done before) yields the same
    # bytes while wrongly treating the WAV header as audio samples.
    return buffer.getvalue()

# Build the UI with Gradio Blocks: one text input, one audio output and a
# button that triggers synthesis.
with gr.Blocks(theme=gr.themes.Base()) as blocks:
    input_text = gr.Textbox(label="Input")
    # NOTE(review): synthesize_speech returns bytes while this component is
    # declared with type="numpy" — confirm Gradio accepts bytes as an output
    # value for this component in the installed version.
    output_audio = gr.Audio(label="Output", type="numpy")
    submit_button = gr.Button("Synthesize")

    # On click, pass the textbox value to synthesize_speech and send its
    # return value to the audio component.
    submit_button.click(synthesize_speech, inputs=input_text, outputs=[output_audio])
# Start the Gradio server. This runs at import time because there is no
# __main__ guard.
blocks.launch()