Wataru committed on
Commit
aeef433
1 Parent(s): 34e77e5

added basic files

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. README.md +0 -1
  3. app.py +48 -0
  4. requirements.txt +181 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .venv/
README.md CHANGED
@@ -10,4 +10,3 @@ pinned: false
10
  license: cc-by-nc-2.0
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
10
  license: cc-by-nc-2.0
11
  ---
12
 
 
app.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from miipher.dataset.preprocess_for_infer import PreprocessForInfer
3
+ from miipher.lightning_module import MiipherLightningModule
4
+ from lightning_vocoders.models.hifigan.xvector_lightning_module import HiFiGANXvectorLightningModule
5
+ import torch
6
+ import torchaudio
7
+ import hydra
8
+ import tempfile
9
+
10
+ miipher_path = "miipher.ckpt"
11
+ miipher = MiipherLightningModule.load_from_checkpoint(miipher_path,map_location='cpu')
12
+ vocoder = HiFiGANXvectorLightningModule.load_from_checkpoint("vocoder_finetuned.ckpt",map_location='cpu')
13
+ xvector_model = hydra.utils.instantiate(vocoder.cfg.data.xvector.model)
14
+ xvector_model = xvector_model.to('cpu')
15
+ preprocessor = PreprocessForInfer(miipher.cfg)
16
+
17
+ @torch.inference_mode()
18
+ def main(wav_path,transcript,lang_code):
19
+ wav,sr =torchaudio.load(wav_path)
20
+ wav = wav[0].unsqueeze(0)
21
+ batch = preprocessor.process(
22
+ 'test',
23
+ (torch.tensor(wav),sr),
24
+ word_segmented_text=transcript,
25
+ lang_code=lang_code
26
+ )
27
+
28
+ miipher.feature_extractor(batch)
29
+ (
30
+ phone_feature,
31
+ speaker_feature,
32
+ degraded_ssl_feature,
33
+ _,
34
+ ) = miipher.feature_extractor(batch)
35
+ cleaned_ssl_feature, _ = miipher(phone_feature,speaker_feature,degraded_ssl_feature)
36
+ vocoder_xvector = xvector_model.encode_batch(batch['degraded_wav_16k'].view(1,-1).cpu()).squeeze(1)
37
+ cleaned_wav = vocoder.generator_forward({"input_feature": cleaned_ssl_feature, "xvector": vocoder_xvector})[0].T
38
+ with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as fp:
39
+ torchaudio.save(fp,cleaned_wav.view(1,-1), sample_rate=22050,format='wav')
40
+ return fp.name
41
+
42
+ inputs = [gr.Audio(label="noisy audio",type='filepath'),gr.Textbox(label="Transcript", value="Your transcript here", max_lines=1),
43
+ gr.Radio(label="Language", choices=["eng-us", "jpn"], value="eng-us")]
44
+ outputs = gr.Audio(label="Output")
45
+
46
+ demo = gr.Interface(fn=main, inputs=inputs, outputs=outputs)
47
+
48
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.0.0
2
+ aiofiles==23.2.1
3
+ aiohttp==3.8.5
4
+ aiosignal==1.3.1
5
+ altair==5.1.1
6
+ annotated-types==0.5.0
7
+ antlr4-python3-runtime==4.9.3
8
+ anyio==3.7.1
9
+ appdirs==1.4.4
10
+ arrow==1.2.3
11
+ async-timeout==4.0.3
12
+ attrs==23.1.0
13
+ Babel==2.12.1
14
+ backoff==2.2.1
15
+ beautifulsoup4==4.12.2
16
+ blessed==1.20.0
17
+ braceexpand==0.1.7
18
+ cachetools==5.3.1
19
+ certifi==2023.7.22
20
+ cffi==1.16.0
21
+ charset-normalizer==3.3.0
22
+ click==8.1.7
23
+ clldutils==3.20.0
24
+ cmake==3.27.5
25
+ colorama==0.4.6
26
+ colorlog==6.7.0
27
+ contourpy==1.1.1
28
+ croniter==1.4.1
29
+ csvw==3.1.3
30
+ cycler==0.12.0
31
+ Cython==3.0.2
32
+ dateutils==0.6.12
33
+ deepdiff==6.5.0
34
+ dill==0.3.7
35
+ docker-pycreds==0.4.0
36
+ exceptiongroup==1.1.3
37
+ fastapi==0.103.2
38
+ ffmpy==0.3.1
39
+ filelock==3.12.4
40
+ fonttools==4.43.0
41
+ frozenlist==1.4.0
42
+ fsspec==2023.9.2
43
+ gitdb==4.0.10
44
+ GitPython==3.1.37
45
+ google-auth==2.23.2
46
+ google-auth-oauthlib==1.0.0
47
+ gradio==3.45.2
48
+ gradio_client==0.5.3
49
+ grpcio==1.59.0
50
+ h11==0.14.0
51
+ httpcore==0.18.0
52
+ httpx==0.25.0
53
+ huggingface-hub==0.17.3
54
+ hydra-core==1.3.2
55
+ HyperPyYAML==1.2.2
56
+ idna==3.4
57
+ importlib-resources==6.1.0
58
+ inquirer==3.1.3
59
+ isodate==0.6.1
60
+ itsdangerous==2.1.2
61
+ Jinja2==3.1.2
62
+ joblib==1.3.2
63
+ jsonschema==4.19.1
64
+ jsonschema-specifications==2023.7.1
65
+ kiwisolver==1.4.5
66
+ language-tags==1.2.0
67
+ lightning==2.0.9.post0
68
+ lightning-cloud==0.5.39
69
+ lightning-utilities==0.9.0
70
+ lightning-vocoders @ git+https://github.com/Wataru-Nakata/ssl-vocoders@8a628630a45fa2c034d464db7db98901eb1091e4
71
+ lit==17.0.1
72
+ llvmlite==0.40.1
73
+ lxml==4.9.3
74
+ Markdown==3.4.4
75
+ markdown-it-py==3.0.0
76
+ MarkupSafe==2.1.3
77
+ matplotlib==3.7.3
78
+ mdurl==0.1.2
79
+ mecab-python3==1.0.8
80
+ miipher @ git+https://github.com/Wataru-Nakata/miipher/@5a326adb732e0c5ba11b5232f0644f0f19b696be
81
+ mpmath==1.3.0
82
+ multidict==6.0.4
83
+ networkx==3.1
84
+ numpy==1.26.0
85
+ nvidia-cublas-cu11==11.10.3.66
86
+ nvidia-cuda-cupti-cu11==11.7.101
87
+ nvidia-cuda-nvrtc-cu11==11.7.99
88
+ nvidia-cuda-runtime-cu11==11.7.99
89
+ nvidia-cudnn-cu11==8.5.0.96
90
+ nvidia-cufft-cu11==10.9.0.58
91
+ nvidia-curand-cu11==10.2.10.91
92
+ nvidia-cusolver-cu11==11.4.0.1
93
+ nvidia-cusparse-cu11==11.7.4.91
94
+ nvidia-nccl-cu11==2.14.3
95
+ nvidia-nvtx-cu11==11.7.91
96
+ oauthlib==3.2.2
97
+ omegaconf==2.3.0
98
+ ordered-set==4.1.0
99
+ orjson==3.9.7
100
+ packaging==23.1
101
+ pandarallel==1.6.5
102
+ pandas==2.1.1
103
+ pathtools==0.1.2
104
+ Pillow==10.0.1
105
+ plac==1.4.0
106
+ protobuf==4.24.3
107
+ psutil==5.9.5
108
+ pyasn1==0.5.0
109
+ pyasn1-modules==0.3.0
110
+ pybind11==2.11.1
111
+ pycparser==2.21
112
+ pydantic==2.1.1
113
+ pydantic_core==2.4.0
114
+ pydub==0.25.1
115
+ Pygments==2.16.1
116
+ PyJWT==2.8.0
117
+ pylatexenc==2.10
118
+ pyparsing==3.1.1
119
+ pyroomacoustics==0.7.3
120
+ pyrootutils==1.0.4
121
+ python-dateutil==2.8.2
122
+ python-dotenv==1.0.0
123
+ python-editor==1.0.4
124
+ python-multipart==0.0.6
125
+ pytorch-lightning==2.0.9.post0
126
+ pytz==2023.3.post1
127
+ PyYAML==6.0.1
128
+ rdflib==7.0.0
129
+ readchar==4.0.5
130
+ referencing==0.30.2
131
+ regex==2023.8.8
132
+ requests==2.31.0
133
+ requests-oauthlib==1.3.1
134
+ rfc3986==1.5.0
135
+ rich==13.6.0
136
+ rpds-py==0.10.3
137
+ rsa==4.9
138
+ ruamel.yaml==0.17.33
139
+ ruamel.yaml.clib==0.2.7
140
+ scipy==1.11.3
141
+ segments==2.2.1
142
+ semantic-version==2.10.0
143
+ sentencepiece==0.1.99
144
+ sentry-sdk==1.31.0
145
+ setproctitle==1.3.2
146
+ six==1.16.0
147
+ smmap==5.0.1
148
+ sniffio==1.3.0
149
+ soundfile==0.12.1
150
+ soupsieve==2.5
151
+ speechbrain==0.5.15
152
+ starlette==0.27.0
153
+ starsessions==1.3.0
154
+ sympy==1.12
155
+ tabulate==0.9.0
156
+ tensorboard==2.13.0
157
+ tensorboard-data-server==0.7.1
158
+ text2phonemesequence==0.1.4
159
+ tokenizers==0.13.3
160
+ toolz==0.12.0
161
+ torch==2.0.1
162
+ torchaudio==2.0.2
163
+ torchmetrics==1.2.0
164
+ tqdm==4.66.1
165
+ traitlets==5.10.1
166
+ transformers==4.29.2
167
+ triton==2.0.0
168
+ typing_extensions==4.8.0
169
+ tzdata==2023.3
170
+ unidic==1.1.0
171
+ uritemplate==4.1.1
172
+ urllib3==2.0.5
173
+ uvicorn==0.23.2
174
+ wandb==0.15.11
175
+ wasabi==0.10.1
176
+ wcwidth==0.2.8
177
+ webdataset==0.2.57
178
+ websocket-client==1.6.3
179
+ websockets==11.0.3
180
+ Werkzeug==3.0.0
181
+ yarl==1.9.2