AkitoP committed
Commit
4536f60
1 parent: c3fbe2e

Update GPT_SoVITS/feature_extractor/cnhubert.py

GPT_SoVITS/feature_extractor/cnhubert.py CHANGED
@@ -1,104 +1,95 @@
 import time
 
 import librosa
 import torch
 import torch.nn.functional as F
 import soundfile as sf
 import logging
 
 logging.getLogger("numba").setLevel(logging.WARNING)
 
 from transformers import (
     Wav2Vec2FeatureExtractor,
     HubertModel,
 )
 
-import utils
 import torch.nn as nn
 
 cnhubert_base_path = None
 
 
 class CNHubert(nn.Module):
     def __init__(self):
         super().__init__()
         self.model = HubertModel.from_pretrained(cnhubert_base_path)
         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
             cnhubert_base_path
         )
 
     def forward(self, x):
         input_values = self.feature_extractor(
             x, return_tensors="pt", sampling_rate=16000
         ).input_values.to(x.device)
         feats = self.model(input_values)["last_hidden_state"]
         return feats
 
 
 # class CNHubertLarge(nn.Module):
 #     def __init__(self):
 #         super().__init__()
 #         self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
 #         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
 #     def forward(self, x):
 #         input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
 #         feats = self.model(input_values)["last_hidden_state"]
 #         return feats
 #
 # class CVec(nn.Module):
 #     def __init__(self):
 #         super().__init__()
 #         self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
 #         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
 #     def forward(self, x):
 #         input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
 #         feats = self.model(input_values)["last_hidden_state"]
 #         return feats
 #
 # class cnw2v2base(nn.Module):
 #     def __init__(self):
 #         super().__init__()
 #         self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
 #         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
 #     def forward(self, x):
 #         input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
 #         feats = self.model(input_values)["last_hidden_state"]
 #         return feats
 
 
 def get_model():
     model = CNHubert()
     model.eval()
     return model
 
 
 # def get_large_model():
 #     model = CNHubertLarge()
 #     model.eval()
 #     return model
 #
 # def get_model_cvec():
 #     model = CVec()
 #     model.eval()
 #     return model
 #
 # def get_model_cnw2v2base():
 #     model = cnw2v2base()
 #     model.eval()
 #     return model
 
 
 def get_content(hmodel, wav_16k_tensor):
     with torch.no_grad():
         feats = hmodel(wav_16k_tensor)
     return feats.transpose(1, 2)
 
 
-if __name__ == "__main__":
-    model = get_model()
-    src_path = "/Users/Shared/原音频2.wav"
-    wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000)
-    model = model
-    wav_16k_tensor = wav_16k_tensor
-    feats = get_content(model, wav_16k_tensor)
-    print(feats.shape)
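The net change: the unused `import utils` is dropped, along with the `__main__` smoke test that was its only caller (the test also carried two no-op assignments, `model = model` and `wav_16k_tensor = wav_16k_tensor`). For anyone who relied on that demo, here is a minimal usage sketch using only the module's remaining dependencies. The import path and checkpoint directory below are assumptions for illustration, not part of the commit.

import librosa
import torch

from GPT_SoVITS.feature_extractor import cnhubert  # assumed import path

# Assumed checkpoint location; point this at a local chinese-hubert-base copy
# before constructing the model, since from_pretrained reads it in __init__.
cnhubert.cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"

model = cnhubert.get_model()  # builds CNHubert and switches it to eval mode

# Hypothetical input file; librosa resamples to the 16 kHz the model expects,
# standing in for the removed utils.load_wav_to_torch_and_resample helper.
wav, _ = librosa.load("sample.wav", sr=16000, mono=True)
wav_16k_tensor = torch.from_numpy(wav)  # float32 tensor, shape (num_samples,)

feats = cnhubert.get_content(model, wav_16k_tensor)
print(feats.shape)  # (1, hidden_size, num_frames) after the transpose in get_content

This mirrors what the deleted block did, with `librosa` standing in for the dropped `utils` helper.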