M4xjunior committed
Commit f68e5ad · 1 Parent(s): 7341fd9
Files changed (2)
  1. app.py +32 -58
  2. logs/sentence_analyzer_2024-12-02.log +10 -0
app.py CHANGED
@@ -66,60 +66,36 @@ last_ema = None
 tts_api = None
 training_process = None  # Add this line if needed for your context
 
-@gpu_decorator
-def infer(
-    project, file_checkpoint, exp_name, ref_text, ref_audio, gen_text, nfe_step, use_ema, speed, seed, remove_silence
-):
-    global last_checkpoint, last_device, tts_api, last_ema
-    if not os.path.isfile(file_checkpoint):
-        return None, "checkpoint not found!"
-    if training_process is not None:
-        device_test = "cpu"
-    else:
-        device_test = None
-    if last_checkpoint != file_checkpoint or last_device != device_test or last_ema != use_ema or tts_api is None:
-        if last_checkpoint != file_checkpoint:
-            last_checkpoint = file_checkpoint
-        if last_device != device_test:
-            last_device = device_test
-        if last_ema != use_ema:
-            last_ema = use_ema
-        vocab_file = "/home/user/app/data/Emilia_ZH_EN_pinyin/vocab.txt"
-        tts_api = F5TTS(
-            model_type=exp_name, ckpt_file=file_checkpoint, vocab_file=vocab_file, device=device_test, use_ema=use_ema
-        )
-        print("update >> ", device_test, file_checkpoint, use_ema)
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-        tts_api.infer(
-            gen_text=gen_text.lower().strip(),
-            ref_text=ref_text.lower().strip(),
-            ref_file=ref_audio,
-            nfe_step=nfe_step,
-            file_wave=f.name,
-            speed=speed,
-            seed=seed,
-            remove_silence=remove_silence,
-        )
-    return f.name, tts_api.device, str(tts_api.seed)
-
-# CSS styles
+# CSS styles with a dark theme
 custom_css = """
 #sentences-container {
-    border: 1px solid #ddd;
+    border: 1px solid #555;
     border-radius: 4px;
     padding: 10px;
     margin-bottom: 10px;
 }
 .sentence-box {
-    border: 1px solid #eee;
+    border: 1px solid #333;
     padding: 5px;
     margin-bottom: 5px;
     border-radius: 4px;
-    background-color: #f9f9f9;
+    background-color: #222;
+    color: #eee;
+}
+body {
+    background-color: #111;
+    color: #eee;
+}
+.gradio-container {
+    background-color: #111;
+}
+.dark {
+    background-color: #333;
+    color: #eee;
+}
 }
 """
 
-with gr.Blocks(css=custom_css) as app:
+with gr.Blocks(css=custom_css, theme=gr.themes.Default(primary_hue="gray", secondary_hue="gray")) as app:
     with gr.Tabs():
         with gr.Tab("TTS Básico"):
             gr.Markdown("# TTS Básico com F5-TTS")
@@ -189,6 +165,7 @@ with gr.Blocks(css=custom_css) as app:
            nfe_slider,
            chunk_size_slider,
            seed_input,  # Passing the seed to process_chunks
+           f5tts_model,  # Passing the F5TTS_ema_model instance as an argument
        ):
            # Split the text into sentences
            sentences = analyzer.split_into_sentences(gen_text_input)
@@ -202,19 +179,15 @@ with gr.Blocks(css=custom_css) as app:
            # Process each chunk
            audio_segments = []
            for chunk in chunks:
-               # Using the correct infer function here, ignoring device_used
-               audio_file, _, seed_used = infer(
-                   "Emilia_ZH_EN_pinyin",  # Replace with your project name
-                   "/home/user/app/model_1200000.safetensors",  # Replace with the path to your checkpoint
-                   "F5-TTS",  # Or "E2-TTS", depending on your model
-                   ref_text_input,
-                   ref_audio_input,
-                   chunk,
-                   nfe_slider,
-                   True,  # use_ema - adjust if necessary
-                   speed_slider,
-                   seed_input,
-                   remove_silence,
+               # Using the correct infer function here
+               audio_file, _, _ = f5tts_model.infer(  # Using f5tts_model.infer
+                   ref_file=ref_audio_input,
+                   ref_text=ref_text_input,
+                   gen_text=chunk,
+                   nfe_step=nfe_slider,
+                   speed=speed_slider,
+                   seed=seed_input,
+                   remove_silence=remove_silence,
                )
                audio_data, _ = torchaudio.load(audio_file)
                audio_segments.append(audio_data.squeeze().cpu().numpy())
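The new call assumes a model instance that already exists before the handler runs. For reference, a hedged sketch of how such an instance could be preloaded once at startup, reusing the constructor arguments visible in the removed infer() (the import path is an assumption):

```python
# Hedged sketch: create the shared F5-TTS instance once at startup instead of
# rebuilding it per request. The import path is an assumption; the constructor
# arguments mirror the removed infer() above.
from f5_tts.api import F5TTS

F5TTS_ema_model = F5TTS(
    model_type="F5-TTS",
    ckpt_file="/home/user/app/model_1200000.safetensors",
    vocab_file="/home/user/app/data/Emilia_ZH_EN_pinyin/vocab.txt",
    use_ema=True,
)
```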
@@ -226,7 +199,7 @@ with gr.Blocks(css=custom_css) as app:
                    (24000, final_audio_data),  # Final audio - assuming a 24000 Hz sample rate
                    None,  # Spectrogram - we are not generating one here
                    gr.update(value=ref_text_input),  # No change to the reference text
-                   seed_used  # Returning the seed
+                   f5tts_model.seed  # Returning the seed from the F5TTS_ema_model instance
                )
            else:
                gr.Warning("Nenhum áudio gerado.")
@@ -249,12 +222,13 @@ with gr.Blocks(css=custom_css) as app:
                speed_slider,
                nfe_slider,
                chunk_size_slider,
-               seed_input,  # Passing the seed as an input
+               seed_input,
+               F5TTS_ema_model,  # Passing the F5TTS_ema_model instance as an argument
            ],
            outputs=[
                audio_output,
-               ref_text_input,
-               seed_output,
+               ref_text_input,
+               seed_output,
            ],
        )
 
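Gradio inputs lists normally carry UI components, while this commit also passes the F5TTS_ema_model object through them. An alternative sketch that binds the model with functools.partial instead (generate_btn is an assumed name; the component names and their order are taken from the surrounding diff):

```python
import functools

# Sketch: bind the preloaded model into the handler with functools.partial,
# keeping the Gradio inputs list to UI components only. "generate_btn" is an
# assumed name; the other identifiers appear in the diff above.
generate_btn.click(
    functools.partial(process_chunks, f5tts_model=F5TTS_ema_model),
    inputs=[
        gen_text_input, ref_text_input, ref_audio_input, remove_silence,
        speed_slider, nfe_slider, chunk_size_slider, seed_input,
    ],
    outputs=[audio_output, ref_text_input, seed_output],
)
```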
logs/sentence_analyzer_2024-12-02.log CHANGED
@@ -44,3 +44,13 @@
 2024-12-02 19:31:01,029 - SentenceAnalyzer - DEBUG - Normalized whitespace
 2024-12-02 19:31:01,051 - SentenceAnalyzer - DEBUG - Split text into 1 sentences using NLTK
 2024-12-02 19:31:01,051 - SentenceAnalyzer - INFO - Split text into 1 sentences after cleanup
+2024-12-02 20:26:26,906 - SentenceAnalyzer - DEBUG - Logger set up successfully
+2024-12-02 20:26:26,906 - SentenceAnalyzer - INFO - SentenceAnalyzer initialized successfully
+2024-12-02 20:27:51,708 - SentenceAnalyzer - DEBUG - Starting sentence splitting
+2024-12-02 20:27:51,709 - SentenceAnalyzer - DEBUG - Normalized text using NFC
+2024-12-02 20:27:51,709 - SentenceAnalyzer - DEBUG - Removed page numbers and chapter titles
+2024-12-02 20:27:51,709 - SentenceAnalyzer - DEBUG - Replaced hyphenated line breaks
+2024-12-02 20:27:51,710 - SentenceAnalyzer - DEBUG - Replaced multiple newlines with a space
+2024-12-02 20:27:51,710 - SentenceAnalyzer - DEBUG - Normalized whitespace
+2024-12-02 20:27:51,733 - SentenceAnalyzer - DEBUG - Split text into 2 sentences using NLTK
+2024-12-02 20:27:51,734 - SentenceAnalyzer - INFO - Split text into 2 sentences after cleanup