M4xjunior committed
Commit f68e5ad · 1 Parent(s): 7341fd9
Files changed (2)
  1. app.py +32 -58
  2. logs/sentence_analyzer_2024-12-02.log +10 -0
app.py CHANGED
@@ -66,60 +66,36 @@ last_ema = None
 tts_api = None
 training_process = None  # Add this line if needed for your context
 
-@gpu_decorator
-def infer(
-    project, file_checkpoint, exp_name, ref_text, ref_audio, gen_text, nfe_step, use_ema, speed, seed, remove_silence
-):
-    global last_checkpoint, last_device, tts_api, last_ema
-    if not os.path.isfile(file_checkpoint):
-        return None, "checkpoint not found!"
-    if training_process is not None:
-        device_test = "cpu"
-    else:
-        device_test = None
-    if last_checkpoint != file_checkpoint or last_device != device_test or last_ema != use_ema or tts_api is None:
-        if last_checkpoint != file_checkpoint:
-            last_checkpoint = file_checkpoint
-        if last_device != device_test:
-            last_device = device_test
-        if last_ema != use_ema:
-            last_ema = use_ema
-        vocab_file = "/home/user/app/data/Emilia_ZH_EN_pinyin/vocab.txt"
-        tts_api = F5TTS(
-            model_type=exp_name, ckpt_file=file_checkpoint, vocab_file=vocab_file, device=device_test, use_ema=use_ema
-        )
-        print("update >> ", device_test, file_checkpoint, use_ema)
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-        tts_api.infer(
-            gen_text=gen_text.lower().strip(),
-            ref_text=ref_text.lower().strip(),
-            ref_file=ref_audio,
-            nfe_step=nfe_step,
-            file_wave=f.name,
-            speed=speed,
-            seed=seed,
-            remove_silence=remove_silence,
-        )
-    return f.name, tts_api.device, str(tts_api.seed)
-
-# CSS styles
+# CSS styles with a dark theme
 custom_css = """
 #sentences-container {
-    border: 1px solid #ddd;
+    border: 1px solid #555;
     border-radius: 4px;
     padding: 10px;
     margin-bottom: 10px;
 }
 .sentence-box {
-    border: 1px solid #eee;
+    border: 1px solid #333;
     padding: 5px;
     margin-bottom: 5px;
     border-radius: 4px;
-    background-color: #f9f9f9;
+    background-color: #222;
+    color: #eee;
+}
+body {
+    background-color: #111;
+    color: #eee;
+}
+.gradio-container {
+    background-color: #111;
+}
+.dark {
+    background-color: #333;
+    color: #eee;
+}
 }
 """
 
-with gr.Blocks(css=custom_css) as app:
+with gr.Blocks(css=custom_css, theme=gr.themes.Default(primary_hue="gray", secondary_hue="gray")) as app:
     with gr.Tabs():
         with gr.Tab("TTS Básico"):
             gr.Markdown("# TTS Básico com F5-TTS")
@@ -189,6 +165,7 @@ with gr.Blocks(css=custom_css) as app:
            nfe_slider,
            chunk_size_slider,
            seed_input,  # Passing the seed to process_chunks
+           f5tts_model,  # Passing the F5TTS_ema_model instance as an argument
        ):
            # Split the text into sentences
            sentences = analyzer.split_into_sentences(gen_text_input)
@@ -202,19 +179,15 @@ with gr.Blocks(css=custom_css) as app:
            # Process each chunk
            audio_segments = []
            for chunk in chunks:
-               # Using the correct infer function here, ignoring device_used
-               audio_file, _, seed_used = infer(
-                   "Emilia_ZH_EN_pinyin",  # Replace with your project name
-                   "/home/user/app/model_1200000.safetensors",  # Replace with the path to your checkpoint
-                   "F5-TTS",  # Or "E2-TTS", depending on your model
-                   ref_text_input,
-                   ref_audio_input,
-                   chunk,
-                   nfe_slider,
-                   True,  # use_ema - adjust if necessary
-                   speed_slider,
-                   seed_input,
-                   remove_silence,
+               # Using the correct infer function here
+               audio_file, _, _ = f5tts_model.infer(  # Using f5tts_model.infer
+                   ref_file=ref_audio_input,
+                   ref_text=ref_text_input,
+                   gen_text=chunk,
+                   nfe_step=nfe_slider,
+                   speed=speed_slider,
+                   seed=seed_input,
+                   remove_silence=remove_silence,
                )
                audio_data, _ = torchaudio.load(audio_file)
                audio_segments.append(audio_data.squeeze().cpu().numpy())
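The new call assumes a model instance that already exists before the handler runs. For reference, a hedged sketch of how such an instance could be preloaded once at startup, reusing the constructor arguments visible in the removed infer() (the import path is an assumption):

```python
# Hedged sketch: create the shared F5-TTS instance once at startup instead of
# rebuilding it per request. The import path is an assumption; the constructor
# arguments mirror the removed infer() above.
from f5_tts.api import F5TTS

F5TTS_ema_model = F5TTS(
    model_type="F5-TTS",
    ckpt_file="/home/user/app/model_1200000.safetensors",
    vocab_file="/home/user/app/data/Emilia_ZH_EN_pinyin/vocab.txt",
    use_ema=True,
)
```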
@@ -226,7 +199,7 @@ with gr.Blocks(css=custom_css) as app:
                    (24000, final_audio_data),  # Final audio - assuming a 24000 Hz sample rate
                    None,  # Spectrogram - we are not generating one here
                    gr.update(value=ref_text_input),  # No change to the reference text
-                   seed_used  # Returning the seed
+                   f5tts_model.seed  # Returning the seed from the F5TTS_ema_model instance
                )
            else:
                gr.Warning("Nenhum áudio gerado.")
@@ -249,12 +222,13 @@ with gr.Blocks(css=custom_css) as app:
                speed_slider,
                nfe_slider,
                chunk_size_slider,
-               seed_input,  # Passing the seed as an input
+               seed_input,
+               F5TTS_ema_model,  # Passing the F5TTS_ema_model instance as an argument
            ],
            outputs=[
                audio_output,
-               ref_text_input,
-               seed_output,
+               ref_text_input,
+               seed_output,
            ],
        )
 
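Gradio inputs lists normally carry UI components, while this commit also passes the F5TTS_ema_model object through them. An alternative sketch that binds the model with functools.partial instead (generate_btn is an assumed name; the component names and their order are taken from the surrounding diff):

```python
import functools

# Sketch: bind the preloaded model into the handler with functools.partial,
# keeping the Gradio inputs list to UI components only. "generate_btn" is an
# assumed name; the other identifiers appear in the diff above.
generate_btn.click(
    functools.partial(process_chunks, f5tts_model=F5TTS_ema_model),
    inputs=[
        gen_text_input, ref_text_input, ref_audio_input, remove_silence,
        speed_slider, nfe_slider, chunk_size_slider, seed_input,
    ],
    outputs=[audio_output, ref_text_input, seed_output],
)
```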
logs/sentence_analyzer_2024-12-02.log CHANGED
@@ -44,3 +44,13 @@
 2024-12-02 19:31:01,029 - SentenceAnalyzer - DEBUG - Normalized whitespace
 2024-12-02 19:31:01,051 - SentenceAnalyzer - DEBUG - Split text into 1 sentences using NLTK
 2024-12-02 19:31:01,051 - SentenceAnalyzer - INFO - Split text into 1 sentences after cleanup
+2024-12-02 20:26:26,906 - SentenceAnalyzer - DEBUG - Logger set up successfully
+2024-12-02 20:26:26,906 - SentenceAnalyzer - INFO - SentenceAnalyzer initialized successfully
+2024-12-02 20:27:51,708 - SentenceAnalyzer - DEBUG - Starting sentence splitting
+2024-12-02 20:27:51,709 - SentenceAnalyzer - DEBUG - Normalized text using NFC
+2024-12-02 20:27:51,709 - SentenceAnalyzer - DEBUG - Removed page numbers and chapter titles
+2024-12-02 20:27:51,709 - SentenceAnalyzer - DEBUG - Replaced hyphenated line breaks
+2024-12-02 20:27:51,710 - SentenceAnalyzer - DEBUG - Replaced multiple newlines with a space
+2024-12-02 20:27:51,710 - SentenceAnalyzer - DEBUG - Normalized whitespace
+2024-12-02 20:27:51,733 - SentenceAnalyzer - DEBUG - Split text into 2 sentences using NLTK
+2024-12-02 20:27:51,734 - SentenceAnalyzer - INFO - Split text into 2 sentences after cleanup