Chillarmo committed
Commit 7c62735 • 1 Parent(s): 0ef49b2

Update app.py

Files changed (1)
  1. app.py (+25 -14)
app.py CHANGED
@@ -1,20 +1,32 @@
 import gradio as gr
 import torch
-from outetts.v0_1.interface import InterfaceHF
+from outetts.v0_1.interface import InterfaceGGUF
 import soundfile as sf
 import tempfile
 import os
 from faster_whisper import WhisperModel
+import huggingface_hub
+
+def download_model():
+    """Download the GGUF model from HuggingFace"""
+    model_path = huggingface_hub.hf_hub_download(
+        repo_id="OuteAI/OuteTTS-0.1-350M-GGUF",
+        filename="outetts-0.1-350m.gguf"
+    )
+    return model_path
 
 def initialize_models():
     """Initialize the OuteTTS and Faster-Whisper models"""
-    tts_interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
-    # Use tiny model with lowest compute settings for maximum speed
+    # Download and initialize GGUF model
+    model_path = download_model()
+    tts_interface = InterfaceGGUF(model_path)
+
+    # Initialize Whisper
     asr_model = WhisperModel("tiny",
                              device="cpu",
-                             compute_type="int8",  # Use int8 quantization for efficiency
-                             num_workers=1,  # Limit workers for low-resource environment
-                             cpu_threads=1)  # Limit CPU threads
+                             compute_type="int8",
+                             num_workers=1,
+                             cpu_threads=1)
     return tts_interface, asr_model
 
 # Initialize models globally to avoid reloading
@@ -23,17 +35,15 @@ TTS_INTERFACE, ASR_MODEL = initialize_models()
 def transcribe_audio(audio_path):
     """Transcribe audio using Faster-Whisper tiny"""
     try:
-        # Transcribe with minimal settings for speed
         segments, _ = ASR_MODEL.transcribe(audio_path,
-                                           beam_size=1,
-                                           best_of=1,
-                                           temperature=1.0,
+                                           beam_size=1,
+                                           best_of=1,
+                                           temperature=1.0,
                                            condition_on_previous_text=False,
                                            compression_ratio_threshold=2.4,
                                            log_prob_threshold=-1.0,
                                            no_speech_threshold=0.6)
 
-        # Combine all segments
         text = " ".join([segment.text for segment in segments]).strip()
         return text
     except Exception as e:
@@ -77,10 +87,11 @@ Reference text: {reference_text[:500]}...
         return None, f"Error: {str(e)}"
 
 # Create Gradio interface
-with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
-    gr.Markdown("# 🎙️ Voice Cloning with OuteTTS")
+with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
+    gr.Markdown("# 🎙️ Voice Cloning with OuteTTS (GGUF)")
     gr.Markdown("""
-    This app uses OuteTTS to clone voices. Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
+    This app uses the GGUF version of OuteTTS for optimized CPU performance. Upload a reference audio file,
+    provide the text being spoken in that audio (or leave blank for automatic transcription),
     and enter the new text you want to be spoken in the cloned voice.
 
     Note:
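For reference, a minimal sketch of the new initialization path run outside of Gradio, assuming `outetts` (v0.1), `faster-whisper`, and `huggingface_hub` are installed and that `InterfaceGGUF` accepts a local GGUF file path as shown in the diff:

```python
# Standalone sketch of the GGUF initialization path (not part of the commit).
from huggingface_hub import hf_hub_download
from faster_whisper import WhisperModel
from outetts.v0_1.interface import InterfaceGGUF

# Fetch the quantized model from the Hub; the file is cached locally after the first call.
model_path = hf_hub_download(
    repo_id="OuteAI/OuteTTS-0.1-350M-GGUF",
    filename="outetts-0.1-350m.gguf",
)

# GGUF-backed TTS interface, constructed the same way as in the updated initialize_models().
tts_interface = InterfaceGGUF(model_path)

# CPU-only, int8-quantized Whisper "tiny" with the same low-resource settings as the Space.
asr_model = WhisperModel("tiny",
                         device="cpu",
                         compute_type="int8",
                         num_workers=1,
                         cpu_threads=1)

print("Loaded:", type(tts_interface).__name__, type(asr_model).__name__)
```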
 
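The transcription call keeps decoding minimal for CPU speed: `beam_size=1` and `best_of=1` produce a single candidate per segment, a single `temperature` value avoids the default fallback retries, and the three thresholds match faster-whisper's documented defaults. A small usage sketch, where the input file name is a hypothetical example:

```python
# Sketch of the speed-oriented transcription settings used in transcribe_audio().
from faster_whisper import WhisperModel

asr_model = WhisperModel("tiny", device="cpu", compute_type="int8")

segments, info = asr_model.transcribe("reference.wav",               # hypothetical input file
                                      beam_size=1,                   # no beam search
                                      best_of=1,                     # single candidate
                                      temperature=1.0,               # single pass, no fallback ladder
                                      condition_on_previous_text=False,
                                      compression_ratio_threshold=2.4,
                                      log_prob_threshold=-1.0,
                                      no_speech_threshold=0.6)

# segments is a lazy generator; joining it is what actually runs the decode, as in app.py.
text = " ".join(segment.text for segment in segments).strip()
print(text)
```

Dropping `beam_size` and `best_of` to 1 trades some accuracy for the fastest possible decode on the Space's CPU-only hardware.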