cocktailpeanut committed on
Commit
cb6da82
·
1 Parent(s): 579d79b
app.py CHANGED
@@ -11,31 +11,36 @@ import io
11
  import numpy as np
12
  import random
13
  import uuid
14
- import spaces
 
15
 
16
 
17
  DEMO_PATH = os.getenv("DEMO_PATH", "./demo")
18
  TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
19
  MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
20
- device = "cuda" if torch.cuda.is_available() else "cpu"
 
21
  whisper_model, align_model, voicecraft_model = None, None, None
22
 
23
 
24
  def get_random_string():
25
  return "".join(str(uuid.uuid4()).split("-"))
26
 
27
- @spaces.GPU(duration=30)
28
  def seed_everything(seed):
29
  if seed != -1:
30
  os.environ['PYTHONHASHSEED'] = str(seed)
31
  random.seed(seed)
32
  np.random.seed(seed)
33
  torch.manual_seed(seed)
34
- torch.cuda.manual_seed(seed)
 
 
 
35
  torch.backends.cudnn.benchmark = False
36
  torch.backends.cudnn.deterministic = True
37
 
38
- @spaces.GPU(duration=120)
39
  class WhisperxAlignModel:
40
  def __init__(self):
41
  from whisperx import load_align_model
@@ -46,7 +51,7 @@ class WhisperxAlignModel:
46
  audio = load_audio(audio_path)
47
  return align(segments, self.model, self.metadata, audio, device, return_char_alignments=False)["segments"]
48
 
49
- @spaces.GPU(duration=120)
50
  class WhisperModel:
51
  def __init__(self, model_name):
52
  from whisper import load_model
@@ -63,7 +68,7 @@ class WhisperModel:
63
  def transcribe(self, audio_path):
64
  return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]
65
 
66
- @spaces.GPU(duration=120)
67
  class WhisperxModel:
68
  def __init__(self, model_name, align_model: WhisperxAlignModel):
69
  from whisperx import load_model
@@ -74,7 +79,7 @@ class WhisperxModel:
74
  segments = self.model.transcribe(audio_path, batch_size=8)["segments"]
75
  return self.align_model.align(segments, audio_path)
76
 
77
- @spaces.GPU(duration=120)
78
  def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
79
  global transcribe_model, align_model, voicecraft_model
80
 
@@ -123,7 +128,7 @@ def get_transcribe_state(segments):
123
  "word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in words_info]
124
  }
125
 
126
- @spaces.GPU(duration=60)
127
  def transcribe(seed, audio_path):
128
  if transcribe_model is None:
129
  raise gr.Error("Transcription model not loaded")
@@ -162,7 +167,7 @@ def align_segments(transcript, audio_path):
162
  with open(tmp_sync_map_path, "r") as f:
163
  return json.load(f)
164
 
165
- @spaces.GPU(duration=90)
166
  def align(seed, transcript, audio_path):
167
  if align_model is None:
168
  raise gr.Error("Align model not loaded")
@@ -193,7 +198,7 @@ def get_output_audio(audio_tensors, codec_audio_sr):
193
  buffer.seek(0)
194
  return buffer.read()
195
 
196
- @spaces.GPU(duration=90)
197
  def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p, temperature,
198
  stop_repetition, sample_batch_size, kvcache, silence_tokens,
199
  audio_path, transcribe_state, transcript, smart_transcript,
 
11
  import numpy as np
12
  import random
13
  import uuid
14
+ #import spaces
15
+ import devicetorch
16
 
17
 
18
  DEMO_PATH = os.getenv("DEMO_PATH", "./demo")
19
  TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
20
  MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
21
+ #device = "cuda" if torch.cuda.is_available() else "cpu"
22
+ device = devicetorch.get(torch)
23
  whisper_model, align_model, voicecraft_model = None, None, None
24
 
25
 
26
  def get_random_string():
27
  return "".join(str(uuid.uuid4()).split("-"))
28
 
29
+ #@spaces.GPU(duration=30)
30
  def seed_everything(seed):
31
  if seed != -1:
32
  os.environ['PYTHONHASHSEED'] = str(seed)
33
  random.seed(seed)
34
  np.random.seed(seed)
35
  torch.manual_seed(seed)
36
+ if device == "cuda":
37
+ torch.cuda.manual_seed(seed)
38
+ elif device == "mps":
39
+ torch.mps.manual_seed(seed)
40
  torch.backends.cudnn.benchmark = False
41
  torch.backends.cudnn.deterministic = True
42
 
43
+ #@spaces.GPU(duration=120)
44
  class WhisperxAlignModel:
45
  def __init__(self):
46
  from whisperx import load_align_model
 
51
  audio = load_audio(audio_path)
52
  return align(segments, self.model, self.metadata, audio, device, return_char_alignments=False)["segments"]
53
 
54
+ #@spaces.GPU(duration=120)
55
  class WhisperModel:
56
  def __init__(self, model_name):
57
  from whisper import load_model
 
68
  def transcribe(self, audio_path):
69
  return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]
70
 
71
+ #@spaces.GPU(duration=120)
72
  class WhisperxModel:
73
  def __init__(self, model_name, align_model: WhisperxAlignModel):
74
  from whisperx import load_model
 
79
  segments = self.model.transcribe(audio_path, batch_size=8)["segments"]
80
  return self.align_model.align(segments, audio_path)
81
 
82
+ #@spaces.GPU(duration=120)
83
  def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
84
  global transcribe_model, align_model, voicecraft_model
85
 
 
128
  "word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in words_info]
129
  }
130
 
131
+ #@spaces.GPU(duration=60)
132
  def transcribe(seed, audio_path):
133
  if transcribe_model is None:
134
  raise gr.Error("Transcription model not loaded")
 
167
  with open(tmp_sync_map_path, "r") as f:
168
  return json.load(f)
169
 
170
+ #@spaces.GPU(duration=90)
171
  def align(seed, transcript, audio_path):
172
  if align_model is None:
173
  raise gr.Error("Align model not loaded")
 
198
  buffer.seek(0)
199
  return buffer.read()
200
 
201
+ #@spaces.GPU(duration=90)
202
  def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p, temperature,
203
  stop_repetition, sample_batch_size, kvcache, silence_tokens,
204
  audio_path, transcribe_state, transcript, smart_transcript,
inference_speech_editing_scale.py CHANGED
@@ -4,6 +4,7 @@ import os, random
4
  import numpy as np
5
  import torch
6
  import torchaudio
 
7
 
8
  from data.tokenizer import (
9
  AudioTokenizer,
@@ -96,9 +97,10 @@ def get_model(exp_dir, device=None):
96
  del ckpt
97
  logging.info("done loading weights...")
98
  if device == None:
99
- device = torch.device("cpu")
100
- if torch.cuda.is_available():
101
- device = torch.device("cuda:0")
 
102
  model.to(device)
103
  model.eval()
104
  return model, model_args, phn2num
@@ -132,7 +134,10 @@ if __name__ == "__main__":
132
  random.seed(seed)
133
  np.random.seed(seed)
134
  torch.manual_seed(seed)
135
- torch.cuda.manual_seed(seed)
 
 
 
136
  torch.backends.cudnn.benchmark = False
137
  torch.backends.cudnn.deterministic = True
138
  formatter = (
 
4
  import numpy as np
5
  import torch
6
  import torchaudio
7
+ import devicetorch
8
 
9
  from data.tokenizer import (
10
  AudioTokenizer,
 
97
  del ckpt
98
  logging.info("done loading weights...")
99
  if device == None:
100
+ device = devicetorch.get(torch)
101
+ # device = torch.device("cpu")
102
+ # if torch.cuda.is_available():
103
+ # device = torch.device("cuda:0")
104
  model.to(device)
105
  model.eval()
106
  return model, model_args, phn2num
 
134
  random.seed(seed)
135
  np.random.seed(seed)
136
  torch.manual_seed(seed)
137
+ if devicetorch.get(torch) == "cuda":
138
+ torch.cuda.manual_seed(seed)
139
+ elif devicetorch.get(torch) == "mps":
140
+ torch.mps.manual_seed(seed)
141
  torch.backends.cudnn.benchmark = False
142
  torch.backends.cudnn.deterministic = True
143
  formatter = (
inference_tts_scale.py CHANGED
@@ -4,6 +4,7 @@ import os, random
4
  import numpy as np
5
  import torch
6
  import torchaudio
 
7
 
8
  from data.tokenizer import (
9
  AudioTokenizer,
@@ -115,9 +116,10 @@ def get_model(exp_dir, device=None):
115
  del ckpt
116
  logging.info("done loading weights...")
117
  if device == None:
118
- device = torch.device("cpu")
119
- if torch.cuda.is_available():
120
- device = torch.device("cuda:0")
 
121
  model.to(device)
122
  model.eval()
123
  return model, model_args, phn2num
@@ -128,7 +130,11 @@ if __name__ == "__main__":
128
  random.seed(seed)
129
  np.random.seed(seed)
130
  torch.manual_seed(seed)
131
- torch.cuda.manual_seed(seed)
 
 
 
 
132
  torch.backends.cudnn.benchmark = False
133
  torch.backends.cudnn.deterministic = True
134
  formatter = (
@@ -187,4 +193,4 @@ if __name__ == "__main__":
187
  seg_save_fn_concat = f"{args.output_dir}/concat_{new_audio_fn[:-4]}_{i}_seed{args.seed}.wav"
188
 
189
  torchaudio.save(seg_save_fn_gen, gen_audio, args.codec_audio_sr)
190
- torchaudio.save(seg_save_fn_concat, concated_audio, args.codec_audio_sr)
 
4
  import numpy as np
5
  import torch
6
  import torchaudio
7
+ import devicetorch
8
 
9
  from data.tokenizer import (
10
  AudioTokenizer,
 
116
  del ckpt
117
  logging.info("done loading weights...")
118
  if device == None:
119
+ device = devicetorch.get(torch)
120
+ # device = torch.device("cpu")
121
+ # if torch.cuda.is_available():
122
+ # device = torch.device("cuda:0")
123
  model.to(device)
124
  model.eval()
125
  return model, model_args, phn2num
 
130
  random.seed(seed)
131
  np.random.seed(seed)
132
  torch.manual_seed(seed)
133
+ device = devicetorch.get(torch)
134
+ if device == "cuda":
135
+ torch.cuda.manual_seed(seed)
136
+ elif device == "mps":
137
+ torch.mps.manual_seed(seed)
138
  torch.backends.cudnn.benchmark = False
139
  torch.backends.cudnn.deterministic = True
140
  formatter = (
 
193
  seg_save_fn_concat = f"{args.output_dir}/concat_{new_audio_fn[:-4]}_{i}_seed{args.seed}.wav"
194
 
195
  torchaudio.save(seg_save_fn_gen, gen_audio, args.codec_audio_sr)
196
+ torchaudio.save(seg_save_fn_concat, concated_audio, args.codec_audio_sr)
requirements.txt CHANGED
@@ -3,7 +3,8 @@ phonemizer==3.2.1
3
  gradio
4
  nltk>=3.8.1
5
  openai-whisper>=20231117
6
- spaces
7
  aeneas==1.7.3.0
8
  whisperx==3.1.1
9
- huggingface-hub==0.22.2
 
 
3
  gradio
4
  nltk>=3.8.1
5
  openai-whisper>=20231117
6
+ #spaces
7
  aeneas==1.7.3.0
8
  whisperx==3.1.1
9
+ huggingface-hub==0.22.2
10
+ devicetorch