kAIto47802 committed on
Commit
b55d767
1 Parent(s): 5cc3b5d

Resolved conflict in README.md

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +50 -0
  2. LICENSE +21 -0
  3. README.md +4 -5
  4. app.py +73 -0
  5. models/fusion_stage3/fold0_s42_best_model.pth +3 -0
  6. models/fusion_stage3/fold1_s42_best_model.pth +3 -0
  7. models/fusion_stage3/fold2_s42_best_model.pth +3 -0
  8. models/fusion_stage3/fold3_s42_best_model.pth +3 -0
  9. models/fusion_stage3/fold4_s42_best_model.pth +3 -0
  10. requirements.txt +10 -0
  11. utmosv2/config/c_fusion_stage2.py +149 -0
  12. utmosv2/config/c_fusion_stage3.py +149 -0
  13. utmosv2/config/c_spec_only_stage1.py +134 -0
  14. utmosv2/config/c_spec_only_stage2.py +134 -0
  15. utmosv2/config/c_ssl_only_stage1.py +68 -0
  16. utmosv2/config/c_ssl_only_stage2.py +68 -0
  17. utmosv2/config/fusion_stage2.py +151 -0
  18. utmosv2/config/fusion_stage2_wo_bc.py +160 -0
  19. utmosv2/config/fusion_stage2_wo_bvcc.py +160 -0
  20. utmosv2/config/fusion_stage2_wo_sarulab.py +160 -0
  21. utmosv2/config/fusion_stage2_wo_somos.py +160 -0
  22. utmosv2/config/fusion_stage3.py +150 -0
  23. utmosv2/config/fusion_stage3_wo_bc.py +160 -0
  24. utmosv2/config/fusion_stage3_wo_bvcc.py +160 -0
  25. utmosv2/config/fusion_stage3_wo_sarulab.py +160 -0
  26. utmosv2/config/fusion_stage3_wo_somos.py +160 -0
  27. utmosv2/config/fusion_wo_stage1and2.py +150 -0
  28. utmosv2/config/fusion_wo_stage2.py +150 -0
  29. utmosv2/config/spec_only.py +135 -0
  30. utmosv2/config/spec_only_wo_bc.py +145 -0
  31. utmosv2/config/spec_only_wo_bvcc.py +145 -0
  32. utmosv2/config/spec_only_wo_sarulab.py +145 -0
  33. utmosv2/config/spec_only_wo_somos.py +145 -0
  34. utmosv2/config/ssl_only_stage1.py +69 -0
  35. utmosv2/config/ssl_only_stage1_wo_bc.py +79 -0
  36. utmosv2/config/ssl_only_stage1_wo_bvcc.py +79 -0
  37. utmosv2/config/ssl_only_stage1_wo_sarulab.py +79 -0
  38. utmosv2/config/ssl_only_stage1_wo_somos.py +79 -0
  39. utmosv2/config/ssl_only_stage2.py +69 -0
  40. utmosv2/config/ssl_only_stage2_wo_bc.py +79 -0
  41. utmosv2/config/ssl_only_stage2_wo_bvcc.py +79 -0
  42. utmosv2/config/ssl_only_stage2_wo_sarulab.py +79 -0
  43. utmosv2/config/ssl_only_stage2_wo_somos.py +79 -0
  44. utmosv2/dataset/__init__.py +11 -0
  45. utmosv2/dataset/_utils.py +53 -0
  46. utmosv2/dataset/multi_spec.py +99 -0
  47. utmosv2/dataset/ssl.py +48 -0
  48. utmosv2/dataset/ssl_multispec.py +20 -0
  49. utmosv2/loss/__init__.py +3 -0
  50. utmosv2/loss/losses.py +37 -0
.gitignore ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data/
2
+ data2/
3
+ preds/
4
+ preprocessed_data/
5
+ wandb/
6
+
7
+ .ruff_cache/
8
+
9
+ *.sif
10
+ *.log
11
+ *.out
12
+
13
+ .DS_Store
14
+
15
+ __pycache__/
16
+ *.py[cod]
17
+ *$py.class
18
+
19
+ *.so
20
+
21
+ .Python
22
+ build/
23
+ develop-eggs/
24
+ dist/
25
+ downloads/
26
+ eggs/
27
+ .eggs/
28
+ lib/
29
+ lib64/
30
+ parts/
31
+ sdist/
32
+ var/
33
+ wheels/
34
+ share/python-wheels/
35
+ *.egg-info/
36
+ .installed.cfg
37
+ *.egg
38
+ MANIFEST
39
+
40
+ .ipynb_checkpoints
41
+
42
+ pip-log.txt
43
+ pip-delete-this-directory.txt
44
+
45
+ .python-version
46
+
47
+ .env
48
+
49
+ .mypy_cache
50
+
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Saruwatari&Saito laboratory, The University of Tokyo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,12 @@
1
  ---
2
  title: UTMOSv2
3
- emoji: 👁
4
- colorFrom: red
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 4.37.2
8
  app_file: app.py
9
  pinned: false
10
- license: mit
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: UTMOSv2
3
+ emoji: 🌖
4
+ colorFrom: yellow
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 4.38.1
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+ from types import SimpleNamespace
3
+
4
+ import gradio as gr
5
+ import pandas as pd
6
+
7
+ # import spaces
8
+ import torch
9
+
10
+ from utmosv2.utils import get_dataset, get_model
11
+
12
+ description = (
13
+ "# 🚀 UTMOSv2 demo\n\n"
14
+ "This is a demonstration of MOS prediction using UTMOSv2. "
15
+ "This demonstration only accepts `.wav` format. Best at 16 kHz sampling rate."
16
+ )
17
+
18
# Use the GPU when torch can see one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Import the config module and copy every attribute whose name does not start
# with a double underscore into a mutable namespace.
config = importlib.import_module("utmosv2.config.fusion_stage3")
cfg = SimpleNamespace(
    **{name: value for name, value in vars(config).items() if not name.startswith("__")}
)

# Options set on top of the values copied from the config module.
vars(cfg).update(
    reproduce=False,
    config="fusion_stage3",
    print_config=False,
    data_config=None,
    phase="inference",
    weight=None,
    num_workers=1,
)
28
+
29
+ # @spaces.GPU
30
def predict_mos(audio_path: str, domain: str) -> float:
    """Predict the MOS of one audio file, averaged over folds and repeated passes.

    For each of the five folds a model is obtained with ``get_model`` and run
    five times on a freshly built test dataset; the 25 predictions are averaged.

    Args:
        audio_path: Path of the audio file to score. Gradio passes ``None``
            when no file was supplied.
        domain: Data-domain ID written to the ``dataset`` column of the
            one-row frame handed to ``get_dataset``.

    Returns:
        The mean of all predictions as a plain Python float.

    Raises:
        gr.Error: If no audio file or no domain was provided.
    """
    # Fail with a user-visible message instead of an opaque downstream error.
    if not audio_path:
        raise gr.Error("Please provide an audio file.")
    if not domain:
        raise gr.Error("Please select a data-domain ID.")

    data = pd.DataFrame({"file_path": [audio_path]})
    data["dataset"] = domain
    # Constant placeholder; presumably the target column the dataset expects
    # (the last item of each sample is dropped below) -- TODO confirm.
    data["mos"] = 0

    num_folds = 5
    num_repeats = 5
    total = 0.0
    # Inference only: no autograd graph is needed for the forward passes.
    with torch.no_grad():
        for fold in range(num_folds):
            cfg.now_fold = fold
            model = get_model(cfg, device)
            for _ in range(num_repeats):
                test_dataset = get_dataset(cfg, data, "test")
                # Add a batch dimension to every item except the last one and
                # place it on the device the model was requested for.
                # NOTE(review): assumes get_model puts the model on `device`.
                inputs = [
                    torch.as_tensor(t).unsqueeze(0).to(device)
                    for t in test_dataset[0][:-1]
                ]
                # .item() assumes a single-element prediction per sample
                # (the configs set num_classes=1) -- TODO confirm.
                total += model(*inputs)[0].item()
    return total / (num_folds * num_repeats)
45
+
46
+
47
+ with gr.Blocks() as demo:
48
+ gr.Markdown(description)
49
+ with gr.Row():
50
+ with gr.Column():
51
+ audio = gr.Audio(type="filepath", label="Audio")
52
+ domain = gr.Dropdown(
53
+ [
54
+ "sarulab",
55
+ "bvcc",
56
+ "somos",
57
+ "blizzard2008",
58
+ "blizzard2009",
59
+ "blizzard2010-EH1",
60
+ "blizzard2010-EH2",
61
+ "blizzard2010-ES1",
62
+ "blizzard2010-ES3",
63
+ "blizzard2011",
64
+ ],
65
+ label="Data-domain ID for the MOS prediction",
66
+ )
67
+ submit = gr.Button(value="Submit")
68
+
69
+ with gr.Column():
70
+ output = gr.Textbox(label="Predicted MOS", type="text")
71
+ submit.click(fn=predict_mos, inputs=[audio, domain], outputs=[output])
72
+
73
+ demo.queue().launch()
models/fusion_stage3/fold0_s42_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92fb17df6d4ef4708ea56cf0dca072b9b63d2b522190ef21ccee4f9ea80864fd
3
+ size 818531314
models/fusion_stage3/fold1_s42_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06756833fa743cb6683f109252ce178236464c33fbbee69d4e45cdf1ae7ad0cc
3
+ size 818531314
models/fusion_stage3/fold2_s42_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d784c0cdf9f4e4cbf697a8f755c8b0f5a0b842d18ad3d2bb42bbae3802d17a78
3
+ size 818531314
models/fusion_stage3/fold3_s42_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed133fe9b3e5cb78d0037aa784c7e23650a3e7b4f8ba00a73644c02aeb627758
3
+ size 818531314
models/fusion_stage3/fold4_s42_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12659b9db09d753654a7744b6b93ff6f90759a7429da793137b8cce107355967
3
+ size 818531314
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy>=1.24.4
2
+ pandas>=2.2.2
3
+ torch>=2.3.1
4
+ timm>=1.0.7
5
+ librosa>=0.10.2
6
+ tqdm>=4.66.4
7
+ scikit-learn>=1.3.2
8
+ transformers>=4.42.4
9
+ wandb>=0.17.0
10
+ python-dotenv>=1.0.1
utmosv2/config/c_fusion_stage2.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
+ batch_size = 16
9
+ num_folds = 5
10
+
11
+ sr = 16000
12
+
13
+ preprocess = SimpleNamespace(
14
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
15
+ )
16
+
17
+ split = SimpleNamespace(
18
+ type="stratified_group",
19
+ target="mos",
20
+ group="sys_id",
21
+ )
22
+
23
+ external_data = "all"
24
+ use_bvcc = True
25
+
26
+
27
+ validation_dataset = "sarulab"
28
+
29
+ dataset = SimpleNamespace(
30
+ name="ssl_multispec_ext",
31
+ specs=[
32
+ SimpleNamespace(
33
+ mode="melspec",
34
+ n_fft=4096,
35
+ hop_length=32,
36
+ win_length=4096,
37
+ n_mels=512,
38
+ shape=(512, 512),
39
+ norm=80,
40
+ ),
41
+ SimpleNamespace(
42
+ mode="melspec",
43
+ n_fft=4096,
44
+ hop_length=32,
45
+ win_length=2048,
46
+ n_mels=512,
47
+ shape=(512, 512),
48
+ norm=80,
49
+ ),
50
+ SimpleNamespace(
51
+ mode="melspec",
52
+ n_fft=4096,
53
+ hop_length=32,
54
+ win_length=1024,
55
+ n_mels=512,
56
+ shape=(512, 512),
57
+ norm=80,
58
+ ),
59
+ SimpleNamespace(
60
+ mode="melspec",
61
+ n_fft=4096,
62
+ hop_length=32,
63
+ win_length=512,
64
+ n_mels=512,
65
+ shape=(512, 512),
66
+ norm=80,
67
+ ),
68
+ ],
69
+ spec_frames=SimpleNamespace(
70
+ num_frames=2, frame_sec=1.4, mixup_inner=True, mixup_alpha=0.4, extend="tile"
71
+ ),
72
+ ssl=SimpleNamespace(
73
+ duration=3,
74
+ ),
75
+ )
76
+ transform = dict(
77
+ train=transforms.Compose(
78
+ [
79
+ transforms.Resize((512, 512)),
80
+ XYMasking(
81
+ num_masks_x=(0, 2),
82
+ num_masks_y=(0, 2),
83
+ mask_x_length=(10, 40),
84
+ mask_y_length=(10, 30),
85
+ fill_value=0,
86
+ p=0.5,
87
+ ),
88
+ # transforms.ToTensor(),
89
+ ]
90
+ ),
91
+ valid=transforms.Compose(
92
+ [
93
+ transforms.Resize((512, 512)),
94
+ # transforms.ToTensor()
95
+ ]
96
+ ),
97
+ )
98
+
99
+ loss = [
100
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
101
+ (SimpleNamespace(name="mse"), 0.2),
102
+ ]
103
+
104
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
105
+
106
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-5)
107
+
108
+ model = SimpleNamespace(
109
+ name="ssl_multispec_ext",
110
+ multi_spec=SimpleNamespace(
111
+ backbone="tf_efficientnetv2_s.in21k_ft_in1k",
112
+ pretrained=True,
113
+ num_classes=1,
114
+ pool_type="catavgmax",
115
+ # feature_height=16,
116
+ atten=True,
117
+ # classifier=None,
118
+ ),
119
+ ssl=SimpleNamespace(
120
+ name="facebook/wav2vec2-base",
121
+ attn=1,
122
+ freeze=True,
123
+ num_classes=1,
124
+ ),
125
+ ssl_spec=SimpleNamespace(
126
+ ssl_weight="c_ssl_only_stage2",
127
+ spec_weight="c_spec_only_stage2",
128
+ num_classes=1,
129
+ freeze=True,
130
+ ),
131
+ )
132
+
133
+ run = SimpleNamespace(
134
+ mixup=True,
135
+ mixup_alpha=0.4,
136
+ num_epochs=8,
137
+ )
138
+
139
+ main_metric = "sys_srcc"
140
+ id_name = None
141
+
142
+
143
+ inference = SimpleNamespace(
144
+ save_path=Path("preds"),
145
+ submit_save_path=Path("submissions"),
146
+ num_tta=5,
147
+ batch_size=8,
148
+ extend="tile",
149
+ )
utmosv2/config/c_fusion_stage3.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
+ batch_size = 8
9
+ num_folds = 5
10
+
11
+ sr = 16000
12
+
13
+ preprocess = SimpleNamespace(
14
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
15
+ )
16
+
17
+ split = SimpleNamespace(
18
+ type="stratified_group",
19
+ target="mos",
20
+ group="sys_id",
21
+ )
22
+
23
+ external_data = "all"
24
+ use_bvcc = True
25
+
26
+
27
+ validation_dataset = "sarulab"
28
+
29
+ dataset = SimpleNamespace(
30
+ name="ssl_multispec_ext",
31
+ specs=[
32
+ SimpleNamespace(
33
+ mode="melspec",
34
+ n_fft=4096,
35
+ hop_length=32,
36
+ win_length=4096,
37
+ n_mels=512,
38
+ shape=(512, 512),
39
+ norm=80,
40
+ ),
41
+ SimpleNamespace(
42
+ mode="melspec",
43
+ n_fft=4096,
44
+ hop_length=32,
45
+ win_length=2048,
46
+ n_mels=512,
47
+ shape=(512, 512),
48
+ norm=80,
49
+ ),
50
+ SimpleNamespace(
51
+ mode="melspec",
52
+ n_fft=4096,
53
+ hop_length=32,
54
+ win_length=1024,
55
+ n_mels=512,
56
+ shape=(512, 512),
57
+ norm=80,
58
+ ),
59
+ SimpleNamespace(
60
+ mode="melspec",
61
+ n_fft=4096,
62
+ hop_length=32,
63
+ win_length=512,
64
+ n_mels=512,
65
+ shape=(512, 512),
66
+ norm=80,
67
+ ),
68
+ ],
69
+ spec_frames=SimpleNamespace(
70
+ num_frames=2, frame_sec=1.4, mixup_inner=True, mixup_alpha=0.4, extend="tile"
71
+ ),
72
+ ssl=SimpleNamespace(
73
+ duration=3,
74
+ ),
75
+ )
76
+ transform = dict(
77
+ train=transforms.Compose(
78
+ [
79
+ transforms.Resize((512, 512)),
80
+ XYMasking(
81
+ num_masks_x=(0, 2),
82
+ num_masks_y=(0, 2),
83
+ mask_x_length=(10, 40),
84
+ mask_y_length=(10, 30),
85
+ fill_value=0,
86
+ p=0.5,
87
+ ),
88
+ # transforms.ToTensor(),
89
+ ]
90
+ ),
91
+ valid=transforms.Compose(
92
+ [
93
+ transforms.Resize((512, 512)),
94
+ # transforms.ToTensor()
95
+ ]
96
+ ),
97
+ )
98
+
99
+ loss = [
100
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
101
+ (SimpleNamespace(name="mse"), 0.2),
102
+ ]
103
+
104
+ optimizer = SimpleNamespace(name="adamw", lr=5e-5, weight_decay=1e-4)
105
+
106
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-8)
107
+
108
+ model = SimpleNamespace(
109
+ name="ssl_multispec_ext",
110
+ multi_spec=SimpleNamespace(
111
+ backbone="tf_efficientnetv2_s.in21k_ft_in1k",
112
+ pretrained=True,
113
+ num_classes=1,
114
+ pool_type="catavgmax",
115
+ # feature_height=16,
116
+ atten=True,
117
+ # classifier=None,
118
+ ),
119
+ ssl=SimpleNamespace(
120
+ name="facebook/wav2vec2-base",
121
+ attn=1,
122
+ freeze=False,
123
+ num_classes=1,
124
+ ),
125
+ ssl_spec=SimpleNamespace(
126
+ ssl_weight="c_ssl_only_stage2",
127
+ spec_weight="c_spec_only_stage2",
128
+ num_classes=1,
129
+ freeze=False,
130
+ ),
131
+ )
132
+
133
+ run = SimpleNamespace(
134
+ mixup=True,
135
+ mixup_alpha=0.4,
136
+ num_epochs=2,
137
+ )
138
+
139
+ main_metric = "sys_srcc"
140
+ id_name = None
141
+
142
+
143
+ inference = SimpleNamespace(
144
+ save_path=Path("preds"),
145
+ submit_save_path=Path("submissions"),
146
+ num_tta=5,
147
+ batch_size=8,
148
+ extend="tile",
149
+ )
utmosv2/config/c_spec_only_stage1.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
+ batch_size = 10
9
+ num_folds = 5
10
+
11
+ sr = 16000
12
+
13
+ preprocess = SimpleNamespace(
14
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
15
+ )
16
+
17
+ split = SimpleNamespace(
18
+ type="stratified_group",
19
+ target="mos",
20
+ group="sys_id",
21
+ )
22
+
23
+ external_data = []
24
+ use_bvcc = True
25
+
26
+
27
+ validation_dataset = "bvcc"
28
+
29
+ dataset = SimpleNamespace(
30
+ name="multi_spec",
31
+ specs=[
32
+ SimpleNamespace(
33
+ mode="melspec",
34
+ n_fft=4096,
35
+ hop_length=32,
36
+ win_length=4096,
37
+ n_mels=512,
38
+ shape=(512, 512),
39
+ norm=80,
40
+ ),
41
+ SimpleNamespace(
42
+ mode="melspec",
43
+ n_fft=4096,
44
+ hop_length=32,
45
+ win_length=2048,
46
+ n_mels=512,
47
+ shape=(512, 512),
48
+ norm=80,
49
+ ),
50
+ SimpleNamespace(
51
+ mode="melspec",
52
+ n_fft=4096,
53
+ hop_length=32,
54
+ win_length=1024,
55
+ n_mels=512,
56
+ shape=(512, 512),
57
+ norm=80,
58
+ ),
59
+ SimpleNamespace(
60
+ mode="melspec",
61
+ n_fft=4096,
62
+ hop_length=32,
63
+ win_length=512,
64
+ n_mels=512,
65
+ shape=(512, 512),
66
+ norm=80,
67
+ ),
68
+ ],
69
+ spec_frames=SimpleNamespace(
70
+ num_frames=2, frame_sec=1.4, mixup_inner=True, mixup_alpha=0.4, extend="tile"
71
+ ),
72
+ )
73
+ transform = dict(
74
+ train=transforms.Compose(
75
+ [
76
+ transforms.Resize((512, 512)),
77
+ XYMasking(
78
+ num_masks_x=(0, 2),
79
+ num_masks_y=(0, 2),
80
+ mask_x_length=(10, 40),
81
+ mask_y_length=(10, 30),
82
+ fill_value=0,
83
+ p=0.5,
84
+ ),
85
+ # transforms.ToTensor(),
86
+ ]
87
+ ),
88
+ valid=transforms.Compose(
89
+ [
90
+ transforms.Resize((512, 512)),
91
+ # transforms.ToTensor()
92
+ ]
93
+ ),
94
+ )
95
+
96
+ loss = [
97
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
98
+ (SimpleNamespace(name="mse"), 0.2),
99
+ ]
100
+
101
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
102
+
103
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-7)
104
+
105
+ model = SimpleNamespace(
106
+ name="multi_specv2",
107
+ multi_spec=SimpleNamespace(
108
+ backbone="tf_efficientnetv2_s.in21k_ft_in1k",
109
+ pretrained=True,
110
+ num_classes=1,
111
+ pool_type="catavgmax",
112
+ # feature_height=16,
113
+ atten=True,
114
+ # classifier=None,
115
+ ),
116
+ )
117
+
118
+ run = SimpleNamespace(
119
+ mixup=True,
120
+ mixup_alpha=0.4,
121
+ num_epochs=20,
122
+ )
123
+
124
+ main_metric = "sys_srcc"
125
+ id_name = None
126
+
127
+
128
+ inference = SimpleNamespace(
129
+ save_path=Path("preds"),
130
+ submit_save_path=Path("submissions"),
131
+ num_tta=5,
132
+ batch_size=8,
133
+ extend="tile",
134
+ )
utmosv2/config/c_spec_only_stage2.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
+ batch_size = 10
9
+ num_folds = 5
10
+
11
+ sr = 16000
12
+
13
+ preprocess = SimpleNamespace(
14
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
15
+ )
16
+
17
+ split = SimpleNamespace(
18
+ type="stratified_group",
19
+ target="mos",
20
+ group="sys_id",
21
+ )
22
+
23
+ external_data = ["sarulab"]
24
+ use_bvcc = False
25
+
26
+
27
+ validation_dataset = "sarulab"
28
+
29
+ dataset = SimpleNamespace(
30
+ name="multi_spec",
31
+ specs=[
32
+ SimpleNamespace(
33
+ mode="melspec",
34
+ n_fft=4096,
35
+ hop_length=32,
36
+ win_length=4096,
37
+ n_mels=512,
38
+ shape=(512, 512),
39
+ norm=80,
40
+ ),
41
+ SimpleNamespace(
42
+ mode="melspec",
43
+ n_fft=4096,
44
+ hop_length=32,
45
+ win_length=2048,
46
+ n_mels=512,
47
+ shape=(512, 512),
48
+ norm=80,
49
+ ),
50
+ SimpleNamespace(
51
+ mode="melspec",
52
+ n_fft=4096,
53
+ hop_length=32,
54
+ win_length=1024,
55
+ n_mels=512,
56
+ shape=(512, 512),
57
+ norm=80,
58
+ ),
59
+ SimpleNamespace(
60
+ mode="melspec",
61
+ n_fft=4096,
62
+ hop_length=32,
63
+ win_length=512,
64
+ n_mels=512,
65
+ shape=(512, 512),
66
+ norm=80,
67
+ ),
68
+ ],
69
+ spec_frames=SimpleNamespace(
70
+ num_frames=2, frame_sec=1.4, mixup_inner=True, mixup_alpha=0.4, extend="tile"
71
+ ),
72
+ )
73
+ transform = dict(
74
+ train=transforms.Compose(
75
+ [
76
+ transforms.Resize((512, 512)),
77
+ XYMasking(
78
+ num_masks_x=(0, 2),
79
+ num_masks_y=(0, 2),
80
+ mask_x_length=(10, 40),
81
+ mask_y_length=(10, 30),
82
+ fill_value=0,
83
+ p=0.5,
84
+ ),
85
+ # transforms.ToTensor(),
86
+ ]
87
+ ),
88
+ valid=transforms.Compose(
89
+ [
90
+ transforms.Resize((512, 512)),
91
+ # transforms.ToTensor()
92
+ ]
93
+ ),
94
+ )
95
+
96
+ loss = [
97
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
98
+ (SimpleNamespace(name="mse"), 0.2),
99
+ ]
100
+
101
+ optimizer = SimpleNamespace(name="adamw", lr=5e-5, weight_decay=1e-4)
102
+
103
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-9)
104
+
105
+ model = SimpleNamespace(
106
+ name="multi_specv2",
107
+ multi_spec=SimpleNamespace(
108
+ backbone="tf_efficientnetv2_s.in21k_ft_in1k",
109
+ pretrained=True,
110
+ num_classes=1,
111
+ pool_type="catavgmax",
112
+ # feature_height=16,
113
+ atten=True,
114
+ # classifier=None,
115
+ ),
116
+ )
117
+
118
+ run = SimpleNamespace(
119
+ mixup=True,
120
+ mixup_alpha=0.4,
121
+ num_epochs=5,
122
+ )
123
+
124
+ main_metric = "sys_srcc"
125
+ id_name = None
126
+
127
+
128
+ inference = SimpleNamespace(
129
+ save_path=Path("preds"),
130
+ submit_save_path=Path("submissions"),
131
+ num_tta=5,
132
+ batch_size=8,
133
+ extend="tile",
134
+ )
utmosv2/config/c_ssl_only_stage1.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ batch_size = 32
5
+ num_folds = 5
6
+
7
+ sr = 16000
8
+
9
+ preprocess = SimpleNamespace(
10
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
11
+ )
12
+
13
+ split = SimpleNamespace(
14
+ type="stratified_group",
15
+ target="mos",
16
+ group="sys_id",
17
+ )
18
+
19
+ dataset = SimpleNamespace(
20
+ name="sslext",
21
+ ssl=SimpleNamespace(
22
+ duration=3,
23
+ ),
24
+ )
25
+
26
+ external_data = "all"
27
+ use_bvcc = True
28
+
29
+
30
+ validation_dataset = "sarulab"
31
+
32
+ loss = [
33
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
34
+ (SimpleNamespace(name="mse"), 0.2),
35
+ ]
36
+
37
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
38
+
39
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-7)
40
+
41
+ model_path = "model"
42
+ model = SimpleNamespace(
43
+ name="sslext",
44
+ ssl=SimpleNamespace(
45
+ name="facebook/wav2vec2-base",
46
+ attn=1,
47
+ freeze=True,
48
+ num_classes=1,
49
+ ),
50
+ )
51
+
52
+ run = SimpleNamespace(
53
+ mixup=True,
54
+ mixup_alpha=0.4,
55
+ num_epochs=20,
56
+ )
57
+
58
+ main_metric = "sys_srcc"
59
+ id_name = None
60
+
61
+
62
+ inference = SimpleNamespace(
63
+ save_path=Path("preds"),
64
+ submit_save_path=Path("submissions"),
65
+ num_tta=5,
66
+ batch_size=8,
67
+ # extend="tile",
68
+ )
utmosv2/config/c_ssl_only_stage2.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ batch_size = 32
5
+ num_folds = 5
6
+
7
+ sr = 16000
8
+
9
+ preprocess = SimpleNamespace(
10
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
11
+ )
12
+
13
+ split = SimpleNamespace(
14
+ type="stratified_group",
15
+ target="mos",
16
+ group="sys_id",
17
+ )
18
+
19
+ dataset = SimpleNamespace(
20
+ name="sslext",
21
+ ssl=SimpleNamespace(
22
+ duration=3,
23
+ ),
24
+ )
25
+
26
+ external_data = "all"
27
+ use_bvcc = True
28
+
29
+
30
+ validation_dataset = "sarulab"
31
+
32
+ loss = [
33
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
34
+ (SimpleNamespace(name="mse"), 0.2),
35
+ ]
36
+
37
+ optimizer = SimpleNamespace(name="adamw", lr=3e-5, weight_decay=1e-4)
38
+
39
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-9)
40
+
41
+ model_path = "model"
42
+ model = SimpleNamespace(
43
+ name="sslext",
44
+ ssl=SimpleNamespace(
45
+ name="facebook/wav2vec2-base",
46
+ attn=1,
47
+ freeze=False,
48
+ num_classes=1,
49
+ ),
50
+ )
51
+
52
+ run = SimpleNamespace(
53
+ mixup=True,
54
+ mixup_alpha=0.4,
55
+ num_epochs=5,
56
+ )
57
+
58
+ main_metric = "sys_srcc"
59
+ id_name = None
60
+
61
+
62
+ inference = SimpleNamespace(
63
+ save_path=Path("preds"),
64
+ submit_save_path=Path("submissions"),
65
+ num_tta=5,
66
+ batch_size=8,
67
+ # extend="tile",
68
+ )
utmosv2/config/fusion_stage2.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from types import SimpleNamespace
2
+
3
+ from torchvision import transforms
4
+
5
+ from utmosv2.transform.xymasking import XYMasking
6
+
7
+ batch_size = 16
8
+ num_folds = 5
9
+
10
+ sr = 16000
11
+
12
+ preprocess = SimpleNamespace(
13
+ top_db=30, min_seconds=None, save_path="preprocessed_data/clip_audio"
14
+ )
15
+
16
+ split = SimpleNamespace(
17
+ type="sgkf_kind",
18
+ target="mos",
19
+ group="sys_id",
20
+ kind="dataset",
21
+ )
22
+
23
+ external_data = "all"
24
+ use_bvcc = True
25
+
26
+ predict_dataset = "ysaito"
27
+ # predict_dataset = "bvcc"
28
+
29
+ validation_dataset = "each"
30
+
31
+ dataset = SimpleNamespace(
32
+ name="ssl_multispec_ext",
33
+ specs=[
34
+ SimpleNamespace(
35
+ mode="melspec",
36
+ n_fft=4096,
37
+ hop_length=32,
38
+ win_length=4096,
39
+ n_mels=512,
40
+ shape=(512, 512),
41
+ norm=80,
42
+ ),
43
+ SimpleNamespace(
44
+ mode="melspec",
45
+ n_fft=4096,
46
+ hop_length=32,
47
+ win_length=2048,
48
+ n_mels=512,
49
+ shape=(512, 512),
50
+ norm=80,
51
+ ),
52
+ SimpleNamespace(
53
+ mode="melspec",
54
+ n_fft=4096,
55
+ hop_length=32,
56
+ win_length=1024,
57
+ n_mels=512,
58
+ shape=(512, 512),
59
+ norm=80,
60
+ ),
61
+ SimpleNamespace(
62
+ mode="melspec",
63
+ n_fft=4096,
64
+ hop_length=32,
65
+ win_length=512,
66
+ n_mels=512,
67
+ shape=(512, 512),
68
+ norm=80,
69
+ ),
70
+ ],
71
+ spec_frames=SimpleNamespace(
72
+ num_frames=2, frame_sec=1.4, mixup_inner=True, mixup_alpha=0.4, extend="tile"
73
+ ),
74
+ ssl=SimpleNamespace(
75
+ duration=3,
76
+ ),
77
+ )
78
+ transform = dict(
79
+ train=transforms.Compose(
80
+ [
81
+ transforms.Resize((512, 512)),
82
+ XYMasking(
83
+ num_masks_x=(0, 2),
84
+ num_masks_y=(0, 2),
85
+ mask_x_length=(10, 40),
86
+ mask_y_length=(10, 30),
87
+ fill_value=0,
88
+ p=0.5,
89
+ ),
90
+ # transforms.ToTensor(),
91
+ ]
92
+ ),
93
+ valid=transforms.Compose(
94
+ [
95
+ transforms.Resize((512, 512)),
96
+ # transforms.ToTensor()
97
+ ]
98
+ ),
99
+ )
100
+
101
+ loss = [
102
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
103
+ (SimpleNamespace(name="mse"), 0.2),
104
+ ]
105
+
106
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
107
+
108
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-5)
109
+
110
+ model = SimpleNamespace(
111
+ name="ssl_multispec_ext_v2",
112
+ multi_spec=SimpleNamespace(
113
+ backbone="tf_efficientnetv2_s.in21k_ft_in1k",
114
+ pretrained=True,
115
+ num_classes=1,
116
+ pool_type="catavgmax",
117
+ # feature_height=16,
118
+ atten=True,
119
+ # classifier=None,
120
+ ),
121
+ ssl=SimpleNamespace(
122
+ name="facebook/wav2vec2-base",
123
+ attn=1,
124
+ freeze=True,
125
+ num_classes=1,
126
+ ),
127
+ ssl_spec=SimpleNamespace(
128
+ ssl_weight="ssl_only_stage2",
129
+ spec_weight="spec_only",
130
+ num_classes=1,
131
+ freeze=True,
132
+ ),
133
+ )
134
+
135
+ run = SimpleNamespace(
136
+ mixup=True,
137
+ mixup_alpha=0.4,
138
+ num_epochs=8,
139
+ )
140
+
141
+ main_metric = "sys_srcc"
142
+ id_name = None
143
+
144
+
145
+ inference = SimpleNamespace(
146
+ save_path="preds",
147
+ submit_save_path="submissions",
148
+ num_tta=5,
149
+ batch_size=8,
150
+ extend="tile",
151
+ )
utmosv2/config/fusion_stage2_wo_bc.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
+ batch_size = 16
9
+ num_folds = 5
10
+
11
+ sr = 16000
12
+
13
+ preprocess = SimpleNamespace(
14
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
15
+ )
16
+
17
+ split = SimpleNamespace(
18
+ type="sgkf_kind",
19
+ target="mos",
20
+ group="sys_id",
21
+ kind="dataset",
22
+ )
23
+
24
+ external_data = [
25
+ "sarulab",
26
+ # "blizzard2008",
27
+ # "blizzard2009",
28
+ # "blizzard2011",
29
+ # "blizzard2010-EH1",
30
+ # "blizzard2010-EH2",
31
+ # "blizzard2010-ES1",
32
+ # "blizzard2010-ES3",
33
+ "somos",
34
+ ]
35
+ use_bvcc = True
36
+
37
+
38
+ validation_dataset = "each"
39
+
40
+ dataset = SimpleNamespace(
41
+ name="ssl_multispec_ext",
42
+ specs=[
43
+ SimpleNamespace(
44
+ mode="melspec",
45
+ n_fft=4096,
46
+ hop_length=32,
47
+ win_length=4096,
48
+ n_mels=512,
49
+ shape=(512, 512),
50
+ norm=80,
51
+ ),
52
+ SimpleNamespace(
53
+ mode="melspec",
54
+ n_fft=4096,
55
+ hop_length=32,
56
+ win_length=2048,
57
+ n_mels=512,
58
+ shape=(512, 512),
59
+ norm=80,
60
+ ),
61
+ SimpleNamespace(
62
+ mode="melspec",
63
+ n_fft=4096,
64
+ hop_length=32,
65
+ win_length=1024,
66
+ n_mels=512,
67
+ shape=(512, 512),
68
+ norm=80,
69
+ ),
70
+ SimpleNamespace(
71
+ mode="melspec",
72
+ n_fft=4096,
73
+ hop_length=32,
74
+ win_length=512,
75
+ n_mels=512,
76
+ shape=(512, 512),
77
+ norm=80,
78
+ ),
79
+ ],
80
+ spec_frames=SimpleNamespace(
81
+ num_frames=2, frame_sec=1.4, mixup_inner=True, mixup_alpha=0.4, extend="tile"
82
+ ),
83
+ ssl=SimpleNamespace(
84
+ duration=3,
85
+ ),
86
+ )
87
+ transform = dict(
88
+ train=transforms.Compose(
89
+ [
90
+ transforms.Resize((512, 512)),
91
+ XYMasking(
92
+ num_masks_x=(0, 2),
93
+ num_masks_y=(0, 2),
94
+ mask_x_length=(10, 40),
95
+ mask_y_length=(10, 30),
96
+ fill_value=0,
97
+ p=0.5,
98
+ ),
99
+ # transforms.ToTensor(),
100
+ ]
101
+ ),
102
+ valid=transforms.Compose(
103
+ [
104
+ transforms.Resize((512, 512)),
105
+ # transforms.ToTensor()
106
+ ]
107
+ ),
108
+ )
109
+
110
+ loss = [
111
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
112
+ (SimpleNamespace(name="mse"), 0.2),
113
+ ]
114
+
115
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
116
+
117
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-5)
118
+
119
+ model = SimpleNamespace(
120
+ name="ssl_multispec_ext_v2",
121
+ multi_spec=SimpleNamespace(
122
+ backbone="tf_efficientnetv2_s.in21k_ft_in1k",
123
+ pretrained=True,
124
+ num_classes=1,
125
+ pool_type="catavgmax",
126
+ # feature_height=16,
127
+ atten=True,
128
+ # classifier=None,
129
+ ),
130
+ ssl=SimpleNamespace(
131
+ name="facebook/wav2vec2-base",
132
+ attn=1,
133
+ freeze=True,
134
+ num_classes=1,
135
+ ),
136
+ ssl_spec=SimpleNamespace(
137
+ ssl_weight="ssl_only_stage2_wo_bc",
138
+ spec_weight="spec_only_wo_bc",
139
+ num_classes=1,
140
+ freeze=True,
141
+ ),
142
+ )
143
+
144
+ run = SimpleNamespace(
145
+ mixup=True,
146
+ mixup_alpha=0.4,
147
+ num_epochs=8,
148
+ )
149
+
150
+ main_metric = "sys_srcc"
151
+ id_name = None
152
+
153
+
154
+ inference = SimpleNamespace(
155
+ save_path=Path("preds"),
156
+ submit_save_path=Path("submissions"),
157
+ num_tta=5,
158
+ batch_size=8,
159
+ extend="tile",
160
+ )
utmosv2/config/fusion_stage2_wo_bvcc.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
# Fusion config, stage 2, variant with BVCC switched off (use_bvcc = False).
batch_size = 16
num_folds = 5

sr = 16000

preprocess = SimpleNamespace(
    top_db=30,
    min_seconds=None,
    save_path=Path("preprocessed_data"),
)

split = SimpleNamespace(type="sgkf_kind", target="mos", group="sys_id", kind="dataset")

# Every external corpus is listed; only the BVCC flag below is turned off.
external_data = [
    "sarulab",
    "blizzard2008",
    "blizzard2009",
    "blizzard2011",
    "blizzard2010-EH1",
    "blizzard2010-EH2",
    "blizzard2010-ES1",
    "blizzard2010-ES3",
    "somos",
]
use_bvcc = False

validation_dataset = "each"

dataset = SimpleNamespace(
    name="ssl_multispec_ext",
    # Four mel-spectrogram settings that differ only in win_length.
    specs=[
        SimpleNamespace(
            mode="melspec",
            n_fft=4096,
            hop_length=32,
            win_length=win_length,
            n_mels=512,
            shape=(512, 512),
            norm=80,
        )
        for win_length in (4096, 2048, 1024, 512)
    ],
    spec_frames=SimpleNamespace(
        num_frames=2,
        frame_sec=1.4,
        mixup_inner=True,
        mixup_alpha=0.4,
        extend="tile",
    ),
    ssl=SimpleNamespace(duration=3),
)

# Train resizes and then applies XYMasking; valid only resizes.
# transforms.ToTensor() stays disabled in both pipelines.
transform = {
    "train": transforms.Compose(
        [
            transforms.Resize((512, 512)),
            XYMasking(
                num_masks_x=(0, 2),
                num_masks_y=(0, 2),
                mask_x_length=(10, 40),
                mask_y_length=(10, 30),
                fill_value=0,
                p=0.5,
            ),
        ]
    ),
    "valid": transforms.Compose([transforms.Resize((512, 512))]),
}

# (loss spec, weight) pairs; the weights are 0.7 and 0.2 (they sum to 0.9).
# NOTE: "pairwize_diff" is the literal name used here; keep the spelling as is.
loss = [
    (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
    (SimpleNamespace(name="mse"), 0.2),
]

optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)

# T_max is None here -- presumably resolved by the training code; TODO confirm.
scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-5)

model = SimpleNamespace(
    name="ssl_multispec_ext_v2",
    # Options left disabled for multi_spec: feature_height=16, classifier=None.
    # Note the two spellings: multi_spec uses `atten`, ssl uses `attn`.
    multi_spec=SimpleNamespace(
        backbone="tf_efficientnetv2_s.in21k_ft_in1k",
        pretrained=True,
        num_classes=1,
        pool_type="catavgmax",
        atten=True,
    ),
    ssl=SimpleNamespace(
        name="facebook/wav2vec2-base",
        attn=1,
        freeze=True,
        num_classes=1,
    ),
    # Weight names carry the same "_wo_bvcc" suffix as this variant.
    ssl_spec=SimpleNamespace(
        ssl_weight="ssl_only_stage2_wo_bvcc",
        spec_weight="spec_only_wo_bvcc",
        num_classes=1,
        freeze=True,
    ),
)

run = SimpleNamespace(mixup=True, mixup_alpha=0.4, num_epochs=8)

main_metric = "sys_srcc"
id_name = None

inference = SimpleNamespace(
    save_path=Path("preds"),
    submit_save_path=Path("submissions"),
    num_tta=5,
    batch_size=8,
    extend="tile",
)
utmosv2/config/fusion_stage2_wo_sarulab.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
# Fusion config, stage 2, variant that leaves "sarulab" out of external_data.
batch_size = 16
num_folds = 5

sr = 16000

preprocess = SimpleNamespace(
    top_db=30,
    min_seconds=None,
    save_path=Path("preprocessed_data"),
)

split = SimpleNamespace(type="sgkf_kind", target="mos", group="sys_id", kind="dataset")

# "sarulab" is the one corpus omitted from this list.
external_data = [
    "blizzard2008",
    "blizzard2009",
    "blizzard2011",
    "blizzard2010-EH1",
    "blizzard2010-EH2",
    "blizzard2010-ES1",
    "blizzard2010-ES3",
    "somos",
]
use_bvcc = True

validation_dataset = "each"

dataset = SimpleNamespace(
    name="ssl_multispec_ext",
    # Four mel-spectrogram settings that differ only in win_length.
    specs=[
        SimpleNamespace(
            mode="melspec",
            n_fft=4096,
            hop_length=32,
            win_length=win_length,
            n_mels=512,
            shape=(512, 512),
            norm=80,
        )
        for win_length in (4096, 2048, 1024, 512)
    ],
    spec_frames=SimpleNamespace(
        num_frames=2,
        frame_sec=1.4,
        mixup_inner=True,
        mixup_alpha=0.4,
        extend="tile",
    ),
    ssl=SimpleNamespace(duration=3),
)

# Train resizes and then applies XYMasking; valid only resizes.
# transforms.ToTensor() stays disabled in both pipelines.
transform = {
    "train": transforms.Compose(
        [
            transforms.Resize((512, 512)),
            XYMasking(
                num_masks_x=(0, 2),
                num_masks_y=(0, 2),
                mask_x_length=(10, 40),
                mask_y_length=(10, 30),
                fill_value=0,
                p=0.5,
            ),
        ]
    ),
    "valid": transforms.Compose([transforms.Resize((512, 512))]),
}

# (loss spec, weight) pairs; the weights are 0.7 and 0.2 (they sum to 0.9).
# NOTE: "pairwize_diff" is the literal name used here; keep the spelling as is.
loss = [
    (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
    (SimpleNamespace(name="mse"), 0.2),
]

optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)

# T_max is None here -- presumably resolved by the training code; TODO confirm.
scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-5)

model = SimpleNamespace(
    name="ssl_multispec_ext_v2",
    # Options left disabled for multi_spec: feature_height=16, classifier=None.
    # Note the two spellings: multi_spec uses `atten`, ssl uses `attn`.
    multi_spec=SimpleNamespace(
        backbone="tf_efficientnetv2_s.in21k_ft_in1k",
        pretrained=True,
        num_classes=1,
        pool_type="catavgmax",
        atten=True,
    ),
    ssl=SimpleNamespace(
        name="facebook/wav2vec2-base",
        attn=1,
        freeze=True,
        num_classes=1,
    ),
    # Weight names carry the same "_wo_sarulab" suffix as this variant.
    ssl_spec=SimpleNamespace(
        ssl_weight="ssl_only_stage2_wo_sarulab",
        spec_weight="spec_only_wo_sarulab",
        num_classes=1,
        freeze=True,
    ),
)

run = SimpleNamespace(mixup=True, mixup_alpha=0.4, num_epochs=8)

main_metric = "sys_srcc"
id_name = None

inference = SimpleNamespace(
    save_path=Path("preds"),
    submit_save_path=Path("submissions"),
    num_tta=5,
    batch_size=8,
    extend="tile",
)
utmosv2/config/fusion_stage2_wo_somos.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
# Fusion config, stage 2, variant that leaves "somos" out of external_data.
batch_size = 16
num_folds = 5

sr = 16000

preprocess = SimpleNamespace(
    top_db=30,
    min_seconds=None,
    save_path=Path("preprocessed_data"),
)

split = SimpleNamespace(type="sgkf_kind", target="mos", group="sys_id", kind="dataset")

# "somos" is the one corpus omitted from this list.
external_data = [
    "sarulab",
    "blizzard2008",
    "blizzard2009",
    "blizzard2011",
    "blizzard2010-EH1",
    "blizzard2010-EH2",
    "blizzard2010-ES1",
    "blizzard2010-ES3",
]
use_bvcc = True

validation_dataset = "each"

dataset = SimpleNamespace(
    name="ssl_multispec_ext",
    # Four mel-spectrogram settings that differ only in win_length.
    specs=[
        SimpleNamespace(
            mode="melspec",
            n_fft=4096,
            hop_length=32,
            win_length=win_length,
            n_mels=512,
            shape=(512, 512),
            norm=80,
        )
        for win_length in (4096, 2048, 1024, 512)
    ],
    spec_frames=SimpleNamespace(
        num_frames=2,
        frame_sec=1.4,
        mixup_inner=True,
        mixup_alpha=0.4,
        extend="tile",
    ),
    ssl=SimpleNamespace(duration=3),
)

# Train resizes and then applies XYMasking; valid only resizes.
# transforms.ToTensor() stays disabled in both pipelines.
transform = {
    "train": transforms.Compose(
        [
            transforms.Resize((512, 512)),
            XYMasking(
                num_masks_x=(0, 2),
                num_masks_y=(0, 2),
                mask_x_length=(10, 40),
                mask_y_length=(10, 30),
                fill_value=0,
                p=0.5,
            ),
        ]
    ),
    "valid": transforms.Compose([transforms.Resize((512, 512))]),
}

# (loss spec, weight) pairs; the weights are 0.7 and 0.2 (they sum to 0.9).
# NOTE: "pairwize_diff" is the literal name used here; keep the spelling as is.
loss = [
    (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
    (SimpleNamespace(name="mse"), 0.2),
]

optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)

# T_max is None here -- presumably resolved by the training code; TODO confirm.
scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-5)

model = SimpleNamespace(
    name="ssl_multispec_ext_v2",
    # Options left disabled for multi_spec: feature_height=16, classifier=None.
    # Note the two spellings: multi_spec uses `atten`, ssl uses `attn`.
    multi_spec=SimpleNamespace(
        backbone="tf_efficientnetv2_s.in21k_ft_in1k",
        pretrained=True,
        num_classes=1,
        pool_type="catavgmax",
        atten=True,
    ),
    ssl=SimpleNamespace(
        name="facebook/wav2vec2-base",
        attn=1,
        freeze=True,
        num_classes=1,
    ),
    # Weight names carry the same "_wo_somos" suffix as this variant.
    ssl_spec=SimpleNamespace(
        ssl_weight="ssl_only_stage2_wo_somos",
        spec_weight="spec_only_wo_somos",
        num_classes=1,
        freeze=True,
    ),
)

run = SimpleNamespace(mixup=True, mixup_alpha=0.4, num_epochs=8)

main_metric = "sys_srcc"
id_name = None

inference = SimpleNamespace(
    save_path=Path("preds"),
    submit_save_path=Path("submissions"),
    num_tta=5,
    batch_size=8,
    extend="tile",
)
utmosv2/config/fusion_stage3.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
# Fusion config, stage 3: all external data, BVCC on, freeze flags set to False.
batch_size = 8
num_folds = 5

sr = 16000

preprocess = SimpleNamespace(
    top_db=30,
    min_seconds=None,
    save_path=Path("preprocessed_data"),
)

split = SimpleNamespace(type="sgkf_kind", target="mos", group="sys_id", kind="dataset")

# The string "all" is used here instead of an explicit list of corpora.
external_data = "all"
use_bvcc = True

validation_dataset = "each"

dataset = SimpleNamespace(
    name="ssl_multispec_ext",
    # Four mel-spectrogram settings that differ only in win_length.
    specs=[
        SimpleNamespace(
            mode="melspec",
            n_fft=4096,
            hop_length=32,
            win_length=win_length,
            n_mels=512,
            shape=(512, 512),
            norm=80,
        )
        for win_length in (4096, 2048, 1024, 512)
    ],
    spec_frames=SimpleNamespace(
        num_frames=2,
        frame_sec=1.4,
        mixup_inner=True,
        mixup_alpha=0.4,
        extend="tile",
    ),
    ssl=SimpleNamespace(duration=3),
)

# Train resizes and then applies XYMasking; valid only resizes.
# transforms.ToTensor() stays disabled in both pipelines.
transform = {
    "train": transforms.Compose(
        [
            transforms.Resize((512, 512)),
            XYMasking(
                num_masks_x=(0, 2),
                num_masks_y=(0, 2),
                mask_x_length=(10, 40),
                mask_y_length=(10, 30),
                fill_value=0,
                p=0.5,
            ),
        ]
    ),
    "valid": transforms.Compose([transforms.Resize((512, 512))]),
}

# (loss spec, weight) pairs; the weights are 0.7 and 0.2 (they sum to 0.9).
# NOTE: "pairwize_diff" is the literal name used here; keep the spelling as is.
loss = [
    (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
    (SimpleNamespace(name="mse"), 0.2),
]

optimizer = SimpleNamespace(name="adamw", lr=5e-5, weight_decay=1e-4)

# T_max is None here -- presumably resolved by the training code; TODO confirm.
scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-8)

model = SimpleNamespace(
    name="ssl_multispec_ext_v2",
    # Options left disabled for multi_spec: feature_height=16, classifier=None.
    # Note the two spellings: multi_spec uses `atten`, ssl uses `attn`.
    multi_spec=SimpleNamespace(
        backbone="tf_efficientnetv2_s.in21k_ft_in1k",
        pretrained=True,
        num_classes=1,
        pool_type="catavgmax",
        atten=True,
    ),
    ssl=SimpleNamespace(
        name="facebook/wav2vec2-base",
        attn=1,
        freeze=False,
        num_classes=1,
    ),
    ssl_spec=SimpleNamespace(
        ssl_weight="ssl_only_stage2",
        spec_weight="spec_only",
        num_classes=1,
        freeze=False,
    ),
)

run = SimpleNamespace(mixup=True, mixup_alpha=0.4, num_epochs=2)

main_metric = "sys_srcc"
id_name = None

inference = SimpleNamespace(
    save_path=Path("preds"),
    submit_save_path=Path("submissions"),
    num_tta=5,
    batch_size=8,
    extend="tile",
)
utmosv2/config/fusion_stage3_wo_bc.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
# Fusion config, stage 3, variant that leaves every "blizzard*" corpus out.
batch_size = 8
num_folds = 5

sr = 16000

preprocess = SimpleNamespace(
    top_db=30,
    min_seconds=None,
    save_path=Path("preprocessed_data"),
)

split = SimpleNamespace(type="sgkf_kind", target="mos", group="sys_id", kind="dataset")

# Omitted from this list: "blizzard2008", "blizzard2009", "blizzard2011",
# "blizzard2010-EH1", "blizzard2010-EH2", "blizzard2010-ES1", "blizzard2010-ES3".
external_data = [
    "sarulab",
    "somos",
]
use_bvcc = True

validation_dataset = "each"

dataset = SimpleNamespace(
    name="ssl_multispec_ext",
    # Four mel-spectrogram settings that differ only in win_length.
    specs=[
        SimpleNamespace(
            mode="melspec",
            n_fft=4096,
            hop_length=32,
            win_length=win_length,
            n_mels=512,
            shape=(512, 512),
            norm=80,
        )
        for win_length in (4096, 2048, 1024, 512)
    ],
    spec_frames=SimpleNamespace(
        num_frames=2,
        frame_sec=1.4,
        mixup_inner=True,
        mixup_alpha=0.4,
        extend="tile",
    ),
    ssl=SimpleNamespace(duration=3),
)

# Train resizes and then applies XYMasking; valid only resizes.
# transforms.ToTensor() stays disabled in both pipelines.
transform = {
    "train": transforms.Compose(
        [
            transforms.Resize((512, 512)),
            XYMasking(
                num_masks_x=(0, 2),
                num_masks_y=(0, 2),
                mask_x_length=(10, 40),
                mask_y_length=(10, 30),
                fill_value=0,
                p=0.5,
            ),
        ]
    ),
    "valid": transforms.Compose([transforms.Resize((512, 512))]),
}

# (loss spec, weight) pairs; the weights are 0.7 and 0.2 (they sum to 0.9).
# NOTE: "pairwize_diff" is the literal name used here; keep the spelling as is.
loss = [
    (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
    (SimpleNamespace(name="mse"), 0.2),
]

optimizer = SimpleNamespace(name="adamw", lr=5e-5, weight_decay=1e-4)

# T_max is None here -- presumably resolved by the training code; TODO confirm.
scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-8)

model = SimpleNamespace(
    name="ssl_multispec_ext_v2",
    # Options left disabled for multi_spec: feature_height=16, classifier=None.
    # Note the two spellings: multi_spec uses `atten`, ssl uses `attn`.
    multi_spec=SimpleNamespace(
        backbone="tf_efficientnetv2_s.in21k_ft_in1k",
        pretrained=True,
        num_classes=1,
        pool_type="catavgmax",
        atten=True,
    ),
    ssl=SimpleNamespace(
        name="facebook/wav2vec2-base",
        attn=1,
        freeze=False,
        num_classes=1,
    ),
    # Weight names carry the same "_wo_bc" suffix as this variant.
    ssl_spec=SimpleNamespace(
        ssl_weight="ssl_only_stage2_wo_bc",
        spec_weight="spec_only_wo_bc",
        num_classes=1,
        freeze=False,
    ),
)

run = SimpleNamespace(mixup=True, mixup_alpha=0.4, num_epochs=2)

main_metric = "sys_srcc"
id_name = None

inference = SimpleNamespace(
    save_path=Path("preds"),
    submit_save_path=Path("submissions"),
    num_tta=5,
    batch_size=8,
    extend="tile",
)
utmosv2/config/fusion_stage3_wo_bvcc.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
# Fusion config, stage 3, variant with BVCC switched off (use_bvcc = False).
batch_size = 8
num_folds = 5

sr = 16000

preprocess = SimpleNamespace(
    top_db=30,
    min_seconds=None,
    save_path=Path("preprocessed_data"),
)

split = SimpleNamespace(type="sgkf_kind", target="mos", group="sys_id", kind="dataset")

# Every external corpus is listed; only the BVCC flag below is turned off.
external_data = [
    "sarulab",
    "blizzard2008",
    "blizzard2009",
    "blizzard2011",
    "blizzard2010-EH1",
    "blizzard2010-EH2",
    "blizzard2010-ES1",
    "blizzard2010-ES3",
    "somos",
]
use_bvcc = False

validation_dataset = "each"

dataset = SimpleNamespace(
    name="ssl_multispec_ext",
    # Four mel-spectrogram settings that differ only in win_length.
    specs=[
        SimpleNamespace(
            mode="melspec",
            n_fft=4096,
            hop_length=32,
            win_length=win_length,
            n_mels=512,
            shape=(512, 512),
            norm=80,
        )
        for win_length in (4096, 2048, 1024, 512)
    ],
    spec_frames=SimpleNamespace(
        num_frames=2,
        frame_sec=1.4,
        mixup_inner=True,
        mixup_alpha=0.4,
        extend="tile",
    ),
    ssl=SimpleNamespace(duration=3),
)

# Train resizes and then applies XYMasking; valid only resizes.
# transforms.ToTensor() stays disabled in both pipelines.
transform = {
    "train": transforms.Compose(
        [
            transforms.Resize((512, 512)),
            XYMasking(
                num_masks_x=(0, 2),
                num_masks_y=(0, 2),
                mask_x_length=(10, 40),
                mask_y_length=(10, 30),
                fill_value=0,
                p=0.5,
            ),
        ]
    ),
    "valid": transforms.Compose([transforms.Resize((512, 512))]),
}

# (loss spec, weight) pairs; the weights are 0.7 and 0.2 (they sum to 0.9).
# NOTE: "pairwize_diff" is the literal name used here; keep the spelling as is.
loss = [
    (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
    (SimpleNamespace(name="mse"), 0.2),
]

optimizer = SimpleNamespace(name="adamw", lr=5e-5, weight_decay=1e-4)

# T_max is None here -- presumably resolved by the training code; TODO confirm.
scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-8)

model = SimpleNamespace(
    name="ssl_multispec_ext_v2",
    # Options left disabled for multi_spec: feature_height=16, classifier=None.
    # Note the two spellings: multi_spec uses `atten`, ssl uses `attn`.
    multi_spec=SimpleNamespace(
        backbone="tf_efficientnetv2_s.in21k_ft_in1k",
        pretrained=True,
        num_classes=1,
        pool_type="catavgmax",
        atten=True,
    ),
    ssl=SimpleNamespace(
        name="facebook/wav2vec2-base",
        attn=1,
        freeze=False,
        num_classes=1,
    ),
    # Weight names carry the same "_wo_bvcc" suffix as this variant.
    ssl_spec=SimpleNamespace(
        ssl_weight="ssl_only_stage2_wo_bvcc",
        spec_weight="spec_only_wo_bvcc",
        num_classes=1,
        freeze=False,
    ),
)

run = SimpleNamespace(mixup=True, mixup_alpha=0.4, num_epochs=2)

main_metric = "sys_srcc"
id_name = None

inference = SimpleNamespace(
    save_path=Path("preds"),
    submit_save_path=Path("submissions"),
    num_tta=5,
    batch_size=8,
    extend="tile",
)
utmosv2/config/fusion_stage3_wo_sarulab.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
# Fusion config, stage 3, variant that leaves "sarulab" out of external_data.
batch_size = 8
num_folds = 5

sr = 16000

preprocess = SimpleNamespace(
    top_db=30,
    min_seconds=None,
    save_path=Path("preprocessed_data"),
)

split = SimpleNamespace(type="sgkf_kind", target="mos", group="sys_id", kind="dataset")

# "sarulab" is the one corpus omitted from this list.
external_data = [
    "blizzard2008",
    "blizzard2009",
    "blizzard2011",
    "blizzard2010-EH1",
    "blizzard2010-EH2",
    "blizzard2010-ES1",
    "blizzard2010-ES3",
    "somos",
]
use_bvcc = True

validation_dataset = "each"

dataset = SimpleNamespace(
    name="ssl_multispec_ext",
    # Four mel-spectrogram settings that differ only in win_length.
    specs=[
        SimpleNamespace(
            mode="melspec",
            n_fft=4096,
            hop_length=32,
            win_length=win_length,
            n_mels=512,
            shape=(512, 512),
            norm=80,
        )
        for win_length in (4096, 2048, 1024, 512)
    ],
    spec_frames=SimpleNamespace(
        num_frames=2,
        frame_sec=1.4,
        mixup_inner=True,
        mixup_alpha=0.4,
        extend="tile",
    ),
    ssl=SimpleNamespace(duration=3),
)

# Train resizes and then applies XYMasking; valid only resizes.
# transforms.ToTensor() stays disabled in both pipelines.
transform = {
    "train": transforms.Compose(
        [
            transforms.Resize((512, 512)),
            XYMasking(
                num_masks_x=(0, 2),
                num_masks_y=(0, 2),
                mask_x_length=(10, 40),
                mask_y_length=(10, 30),
                fill_value=0,
                p=0.5,
            ),
        ]
    ),
    "valid": transforms.Compose([transforms.Resize((512, 512))]),
}

# (loss spec, weight) pairs; the weights are 0.7 and 0.2 (they sum to 0.9).
# NOTE: "pairwize_diff" is the literal name used here; keep the spelling as is.
loss = [
    (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
    (SimpleNamespace(name="mse"), 0.2),
]

optimizer = SimpleNamespace(name="adamw", lr=5e-5, weight_decay=1e-4)

# T_max is None here -- presumably resolved by the training code; TODO confirm.
scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-8)

model = SimpleNamespace(
    name="ssl_multispec_ext_v2",
    # Options left disabled for multi_spec: feature_height=16, classifier=None.
    # Note the two spellings: multi_spec uses `atten`, ssl uses `attn`.
    multi_spec=SimpleNamespace(
        backbone="tf_efficientnetv2_s.in21k_ft_in1k",
        pretrained=True,
        num_classes=1,
        pool_type="catavgmax",
        atten=True,
    ),
    ssl=SimpleNamespace(
        name="facebook/wav2vec2-base",
        attn=1,
        freeze=False,
        num_classes=1,
    ),
    # Weight names carry the same "_wo_sarulab" suffix as this variant.
    ssl_spec=SimpleNamespace(
        ssl_weight="ssl_only_stage2_wo_sarulab",
        spec_weight="spec_only_wo_sarulab",
        num_classes=1,
        freeze=False,
    ),
)

run = SimpleNamespace(mixup=True, mixup_alpha=0.4, num_epochs=2)

main_metric = "sys_srcc"
id_name = None

inference = SimpleNamespace(
    save_path=Path("preds"),
    submit_save_path=Path("submissions"),
    num_tta=5,
    batch_size=8,
    extend="tile",
)
utmosv2/config/fusion_stage3_wo_somos.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
# Fusion config, stage 3, variant that leaves "somos" out of external_data.
batch_size = 8
num_folds = 5

sr = 16000

preprocess = SimpleNamespace(
    top_db=30,
    min_seconds=None,
    save_path=Path("preprocessed_data"),
)

split = SimpleNamespace(type="sgkf_kind", target="mos", group="sys_id", kind="dataset")

# "somos" is the one corpus omitted from this list.
external_data = [
    "sarulab",
    "blizzard2008",
    "blizzard2009",
    "blizzard2011",
    "blizzard2010-EH1",
    "blizzard2010-EH2",
    "blizzard2010-ES1",
    "blizzard2010-ES3",
]
use_bvcc = True

validation_dataset = "each"

dataset = SimpleNamespace(
    name="ssl_multispec_ext",
    # Four mel-spectrogram settings that differ only in win_length.
    specs=[
        SimpleNamespace(
            mode="melspec",
            n_fft=4096,
            hop_length=32,
            win_length=win_length,
            n_mels=512,
            shape=(512, 512),
            norm=80,
        )
        for win_length in (4096, 2048, 1024, 512)
    ],
    spec_frames=SimpleNamespace(
        num_frames=2,
        frame_sec=1.4,
        mixup_inner=True,
        mixup_alpha=0.4,
        extend="tile",
    ),
    ssl=SimpleNamespace(duration=3),
)

# Train resizes and then applies XYMasking; valid only resizes.
# transforms.ToTensor() stays disabled in both pipelines.
transform = {
    "train": transforms.Compose(
        [
            transforms.Resize((512, 512)),
            XYMasking(
                num_masks_x=(0, 2),
                num_masks_y=(0, 2),
                mask_x_length=(10, 40),
                mask_y_length=(10, 30),
                fill_value=0,
                p=0.5,
            ),
        ]
    ),
    "valid": transforms.Compose([transforms.Resize((512, 512))]),
}

# (loss spec, weight) pairs; the weights are 0.7 and 0.2 (they sum to 0.9).
# NOTE: "pairwize_diff" is the literal name used here; keep the spelling as is.
loss = [
    (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
    (SimpleNamespace(name="mse"), 0.2),
]

optimizer = SimpleNamespace(name="adamw", lr=5e-5, weight_decay=1e-4)

# T_max is None here -- presumably resolved by the training code; TODO confirm.
scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-8)

model = SimpleNamespace(
    name="ssl_multispec_ext_v2",
    # Options left disabled for multi_spec: feature_height=16, classifier=None.
    # Note the two spellings: multi_spec uses `atten`, ssl uses `attn`.
    multi_spec=SimpleNamespace(
        backbone="tf_efficientnetv2_s.in21k_ft_in1k",
        pretrained=True,
        num_classes=1,
        pool_type="catavgmax",
        atten=True,
    ),
    ssl=SimpleNamespace(
        name="facebook/wav2vec2-base",
        attn=1,
        freeze=False,
        num_classes=1,
    ),
    # Weight names carry the same "_wo_somos" suffix as this variant.
    ssl_spec=SimpleNamespace(
        ssl_weight="ssl_only_stage2_wo_somos",
        spec_weight="spec_only_wo_somos",
        num_classes=1,
        freeze=False,
    ),
)

run = SimpleNamespace(mixup=True, mixup_alpha=0.4, num_epochs=2)

main_metric = "sys_srcc"
id_name = None

inference = SimpleNamespace(
    save_path=Path("preds"),
    submit_save_path=Path("submissions"),
    num_tta=5,
    batch_size=8,
    extend="tile",
)
utmosv2/config/fusion_wo_stage1and2.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
# Fusion config without stage 1 and 2: ssl_weight and spec_weight are None.
batch_size = 8
num_folds = 5

sr = 16000

preprocess = SimpleNamespace(
    top_db=30,
    min_seconds=None,
    save_path=Path("preprocessed_data"),
)

split = SimpleNamespace(type="sgkf_kind", target="mos", group="sys_id", kind="dataset")

# The string "all" is used here instead of an explicit list of corpora.
external_data = "all"
use_bvcc = True

validation_dataset = "each"

dataset = SimpleNamespace(
    name="ssl_multispec_ext",
    # Four mel-spectrogram settings that differ only in win_length.
    specs=[
        SimpleNamespace(
            mode="melspec",
            n_fft=4096,
            hop_length=32,
            win_length=win_length,
            n_mels=512,
            shape=(512, 512),
            norm=80,
        )
        for win_length in (4096, 2048, 1024, 512)
    ],
    spec_frames=SimpleNamespace(
        num_frames=2,
        frame_sec=1.4,
        mixup_inner=True,
        mixup_alpha=0.4,
        extend="tile",
    ),
    ssl=SimpleNamespace(duration=3),
)

# Train resizes and then applies XYMasking; valid only resizes.
# transforms.ToTensor() stays disabled in both pipelines.
transform = {
    "train": transforms.Compose(
        [
            transforms.Resize((512, 512)),
            XYMasking(
                num_masks_x=(0, 2),
                num_masks_y=(0, 2),
                mask_x_length=(10, 40),
                mask_y_length=(10, 30),
                fill_value=0,
                p=0.5,
            ),
        ]
    ),
    "valid": transforms.Compose([transforms.Resize((512, 512))]),
}

# (loss spec, weight) pairs; the weights are 0.7 and 0.2 (they sum to 0.9).
# NOTE: "pairwize_diff" is the literal name used here; keep the spelling as is.
loss = [
    (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
    (SimpleNamespace(name="mse"), 0.2),
]

optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)

# T_max is None here -- presumably resolved by the training code; TODO confirm.
scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-7)

model = SimpleNamespace(
    name="ssl_multispec_ext_v2",
    # Options left disabled for multi_spec: feature_height=16, classifier=None.
    # Note the two spellings: multi_spec uses `atten`, ssl uses `attn`.
    multi_spec=SimpleNamespace(
        backbone="tf_efficientnetv2_s.in21k_ft_in1k",
        pretrained=True,
        num_classes=1,
        pool_type="catavgmax",
        atten=True,
    ),
    ssl=SimpleNamespace(
        name="facebook/wav2vec2-base",
        attn=1,
        freeze=False,
        num_classes=1,
    ),
    # No earlier-stage weight names are given for this variant.
    ssl_spec=SimpleNamespace(
        ssl_weight=None,
        spec_weight=None,
        num_classes=1,
        freeze=False,
    ),
)

run = SimpleNamespace(mixup=True, mixup_alpha=0.4, num_epochs=20)

main_metric = "sys_srcc"
id_name = None

inference = SimpleNamespace(
    save_path=Path("preds"),
    submit_save_path=Path("submissions"),
    num_tta=5,
    batch_size=8,
    extend="tile",
)
utmosv2/config/fusion_wo_stage2.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
+ batch_size = 8
9
+ num_folds = 5
10
+
11
+ sr = 16000
12
+
13
+ preprocess = SimpleNamespace(
14
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
15
+ )
16
+
17
+ split = SimpleNamespace(
18
+ type="sgkf_kind",
19
+ target="mos",
20
+ group="sys_id",
21
+ kind="dataset",
22
+ )
23
+
24
+ external_data = "all"
25
+ use_bvcc = True
26
+
27
+
28
+ validation_dataset = "each"
29
+
30
+ dataset = SimpleNamespace(
31
+ name="ssl_multispec_ext",
32
+ specs=[
33
+ SimpleNamespace(
34
+ mode="melspec",
35
+ n_fft=4096,
36
+ hop_length=32,
37
+ win_length=4096,
38
+ n_mels=512,
39
+ shape=(512, 512),
40
+ norm=80,
41
+ ),
42
+ SimpleNamespace(
43
+ mode="melspec",
44
+ n_fft=4096,
45
+ hop_length=32,
46
+ win_length=2048,
47
+ n_mels=512,
48
+ shape=(512, 512),
49
+ norm=80,
50
+ ),
51
+ SimpleNamespace(
52
+ mode="melspec",
53
+ n_fft=4096,
54
+ hop_length=32,
55
+ win_length=1024,
56
+ n_mels=512,
57
+ shape=(512, 512),
58
+ norm=80,
59
+ ),
60
+ SimpleNamespace(
61
+ mode="melspec",
62
+ n_fft=4096,
63
+ hop_length=32,
64
+ win_length=512,
65
+ n_mels=512,
66
+ shape=(512, 512),
67
+ norm=80,
68
+ ),
69
+ ],
70
+ spec_frames=SimpleNamespace(
71
+ num_frames=2, frame_sec=1.4, mixup_inner=True, mixup_alpha=0.4, extend="tile"
72
+ ),
73
+ ssl=SimpleNamespace(
74
+ duration=3,
75
+ ),
76
+ )
77
+ transform = dict(
78
+ train=transforms.Compose(
79
+ [
80
+ transforms.Resize((512, 512)),
81
+ XYMasking(
82
+ num_masks_x=(0, 2),
83
+ num_masks_y=(0, 2),
84
+ mask_x_length=(10, 40),
85
+ mask_y_length=(10, 30),
86
+ fill_value=0,
87
+ p=0.5,
88
+ ),
89
+ # transforms.ToTensor(),
90
+ ]
91
+ ),
92
+ valid=transforms.Compose(
93
+ [
94
+ transforms.Resize((512, 512)),
95
+ # transforms.ToTensor()
96
+ ]
97
+ ),
98
+ )
99
+
100
+ loss = [
101
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
102
+ (SimpleNamespace(name="mse"), 0.2),
103
+ ]
104
+
105
+ optimizer = SimpleNamespace(name="adamw", lr=1e-4, weight_decay=1e-4)
106
+
107
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-7)
108
+
109
+ model = SimpleNamespace(
110
+ name="ssl_multispec_ext_v2",
111
+ multi_spec=SimpleNamespace(
112
+ backbone="tf_efficientnetv2_s.in21k_ft_in1k",
113
+ pretrained=True,
114
+ num_classes=1,
115
+ pool_type="catavgmax",
116
+ # feature_height=16,
117
+ atten=True,
118
+ # classifier=None,
119
+ ),
120
+ ssl=SimpleNamespace(
121
+ name="facebook/wav2vec2-base",
122
+ attn=1,
123
+ freeze=False,
124
+ num_classes=1,
125
+ ),
126
+ ssl_spec=SimpleNamespace(
127
+ ssl_weight="ssl_only_stage2",
128
+ spec_weight="spec_only",
129
+ num_classes=1,
130
+ freeze=False,
131
+ ),
132
+ )
133
+
134
+ run = SimpleNamespace(
135
+ mixup=True,
136
+ mixup_alpha=0.4,
137
+ num_epochs=20,
138
+ )
139
+
140
+ main_metric = "sys_srcc"
141
+ id_name = None
142
+
143
+
144
+ inference = SimpleNamespace(
145
+ save_path=Path("preds"),
146
+ submit_save_path=Path("submissions"),
147
+ num_tta=5,
148
+ batch_size=8,
149
+ extend="tile",
150
+ )
utmosv2/config/spec_only.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
+ batch_size = 10
9
+ num_folds = 5
10
+
11
+ sr = 16000
12
+
13
+ preprocess = SimpleNamespace(
14
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
15
+ )
16
+
17
+ split = SimpleNamespace(
18
+ type="sgkf_kind",
19
+ target="mos",
20
+ group="sys_id",
21
+ kind="dataset",
22
+ )
23
+
24
+ external_data = "all"
25
+ use_bvcc = True
26
+
27
+
28
+ validation_dataset = "each"
29
+
30
+ dataset = SimpleNamespace(
31
+ name="multi_spec_ext",
32
+ specs=[
33
+ SimpleNamespace(
34
+ mode="melspec",
35
+ n_fft=4096,
36
+ hop_length=32,
37
+ win_length=4096,
38
+ n_mels=512,
39
+ shape=(512, 512),
40
+ norm=80,
41
+ ),
42
+ SimpleNamespace(
43
+ mode="melspec",
44
+ n_fft=4096,
45
+ hop_length=32,
46
+ win_length=2048,
47
+ n_mels=512,
48
+ shape=(512, 512),
49
+ norm=80,
50
+ ),
51
+ SimpleNamespace(
52
+ mode="melspec",
53
+ n_fft=4096,
54
+ hop_length=32,
55
+ win_length=1024,
56
+ n_mels=512,
57
+ shape=(512, 512),
58
+ norm=80,
59
+ ),
60
+ SimpleNamespace(
61
+ mode="melspec",
62
+ n_fft=4096,
63
+ hop_length=32,
64
+ win_length=512,
65
+ n_mels=512,
66
+ shape=(512, 512),
67
+ norm=80,
68
+ ),
69
+ ],
70
+ spec_frames=SimpleNamespace(
71
+ num_frames=2, frame_sec=1.4, mixup_inner=True, mixup_alpha=0.4, extend="tile"
72
+ ),
73
+ )
74
+ transform = dict(
75
+ train=transforms.Compose(
76
+ [
77
+ transforms.Resize((512, 512)),
78
+ XYMasking(
79
+ num_masks_x=(0, 2),
80
+ num_masks_y=(0, 2),
81
+ mask_x_length=(10, 40),
82
+ mask_y_length=(10, 30),
83
+ fill_value=0,
84
+ p=0.5,
85
+ ),
86
+ # transforms.ToTensor(),
87
+ ]
88
+ ),
89
+ valid=transforms.Compose(
90
+ [
91
+ transforms.Resize((512, 512)),
92
+ # transforms.ToTensor()
93
+ ]
94
+ ),
95
+ )
96
+
97
+ loss = [
98
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
99
+ (SimpleNamespace(name="mse"), 0.2),
100
+ ]
101
+
102
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
103
+
104
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-7)
105
+
106
+ model = SimpleNamespace(
107
+ name="multi_spec_ext",
108
+ multi_spec=SimpleNamespace(
109
+ backbone="tf_efficientnetv2_s.in21k_ft_in1k",
110
+ pretrained=True,
111
+ num_classes=1,
112
+ pool_type="catavgmax",
113
+ # feature_height=16,
114
+ atten=True,
115
+ # classifier=None,
116
+ ),
117
+ )
118
+
119
+ run = SimpleNamespace(
120
+ mixup=True,
121
+ mixup_alpha=0.4,
122
+ num_epochs=20,
123
+ )
124
+
125
+ main_metric = "sys_srcc"
126
+ id_name = None
127
+
128
+
129
+ inference = SimpleNamespace(
130
+ save_path=Path("preds"),
131
+ submit_save_path=Path("submissions"),
132
+ num_tta=5,
133
+ batch_size=8,
134
+ extend="tile",
135
+ )
utmosv2/config/spec_only_wo_bc.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
+ batch_size = 10
9
+ num_folds = 5
10
+
11
+ sr = 16000
12
+
13
+ preprocess = SimpleNamespace(
14
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
15
+ )
16
+
17
+ split = SimpleNamespace(
18
+ type="sgkf_kind",
19
+ target="mos",
20
+ group="sys_id",
21
+ kind="dataset",
22
+ )
23
+
24
+ external_data = [
25
+ "sarulab",
26
+ # "blizzard2008",
27
+ # "blizzard2009",
28
+ # "blizzard2011",
29
+ # "blizzard2010-EH1",
30
+ # "blizzard2010-EH2",
31
+ # "blizzard2010-ES1",
32
+ # "blizzard2010-ES3",
33
+ "somos",
34
+ ]
35
+ use_bvcc = True
36
+
37
+
38
+ validation_dataset = "each"
39
+
40
+ dataset = SimpleNamespace(
41
+ name="multi_spec_ext",
42
+ specs=[
43
+ SimpleNamespace(
44
+ mode="melspec",
45
+ n_fft=4096,
46
+ hop_length=32,
47
+ win_length=4096,
48
+ n_mels=512,
49
+ shape=(512, 512),
50
+ norm=80,
51
+ ),
52
+ SimpleNamespace(
53
+ mode="melspec",
54
+ n_fft=4096,
55
+ hop_length=32,
56
+ win_length=2048,
57
+ n_mels=512,
58
+ shape=(512, 512),
59
+ norm=80,
60
+ ),
61
+ SimpleNamespace(
62
+ mode="melspec",
63
+ n_fft=4096,
64
+ hop_length=32,
65
+ win_length=1024,
66
+ n_mels=512,
67
+ shape=(512, 512),
68
+ norm=80,
69
+ ),
70
+ SimpleNamespace(
71
+ mode="melspec",
72
+ n_fft=4096,
73
+ hop_length=32,
74
+ win_length=512,
75
+ n_mels=512,
76
+ shape=(512, 512),
77
+ norm=80,
78
+ ),
79
+ ],
80
+ spec_frames=SimpleNamespace(
81
+ num_frames=2, frame_sec=1.4, mixup_inner=True, mixup_alpha=0.4, extend="tile"
82
+ ),
83
+ )
84
+ transform = dict(
85
+ train=transforms.Compose(
86
+ [
87
+ transforms.Resize((512, 512)),
88
+ XYMasking(
89
+ num_masks_x=(0, 2),
90
+ num_masks_y=(0, 2),
91
+ mask_x_length=(10, 40),
92
+ mask_y_length=(10, 30),
93
+ fill_value=0,
94
+ p=0.5,
95
+ ),
96
+ # transforms.ToTensor(),
97
+ ]
98
+ ),
99
+ valid=transforms.Compose(
100
+ [
101
+ transforms.Resize((512, 512)),
102
+ # transforms.ToTensor()
103
+ ]
104
+ ),
105
+ )
106
+
107
+ loss = [
108
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
109
+ (SimpleNamespace(name="mse"), 0.2),
110
+ ]
111
+
112
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
113
+
114
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-7)
115
+
116
+ model = SimpleNamespace(
117
+ name="multi_spec_ext",
118
+ multi_spec=SimpleNamespace(
119
+ backbone="tf_efficientnetv2_s.in21k_ft_in1k",
120
+ pretrained=True,
121
+ num_classes=1,
122
+ pool_type="catavgmax",
123
+ # feature_height=16,
124
+ atten=True,
125
+ # classifier=None,
126
+ ),
127
+ )
128
+
129
+ run = SimpleNamespace(
130
+ mixup=True,
131
+ mixup_alpha=0.4,
132
+ num_epochs=20,
133
+ )
134
+
135
+ main_metric = "sys_srcc"
136
+ id_name = None
137
+
138
+
139
+ inference = SimpleNamespace(
140
+ save_path=Path("preds"),
141
+ submit_save_path=Path("submissions"),
142
+ num_tta=5,
143
+ batch_size=8,
144
+ extend="tile",
145
+ )
utmosv2/config/spec_only_wo_bvcc.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
+ batch_size = 10
9
+ num_folds = 5
10
+
11
+ sr = 16000
12
+
13
+ preprocess = SimpleNamespace(
14
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
15
+ )
16
+
17
+ split = SimpleNamespace(
18
+ type="sgkf_kind",
19
+ target="mos",
20
+ group="sys_id",
21
+ kind="dataset",
22
+ )
23
+
24
+ external_data = [
25
+ "sarulab",
26
+ "blizzard2008",
27
+ "blizzard2009",
28
+ "blizzard2011",
29
+ "blizzard2010-EH1",
30
+ "blizzard2010-EH2",
31
+ "blizzard2010-ES1",
32
+ "blizzard2010-ES3",
33
+ "somos",
34
+ ]
35
+ use_bvcc = False
36
+
37
+
38
+ validation_dataset = "each"
39
+
40
+ dataset = SimpleNamespace(
41
+ name="multi_spec_ext",
42
+ specs=[
43
+ SimpleNamespace(
44
+ mode="melspec",
45
+ n_fft=4096,
46
+ hop_length=32,
47
+ win_length=4096,
48
+ n_mels=512,
49
+ shape=(512, 512),
50
+ norm=80,
51
+ ),
52
+ SimpleNamespace(
53
+ mode="melspec",
54
+ n_fft=4096,
55
+ hop_length=32,
56
+ win_length=2048,
57
+ n_mels=512,
58
+ shape=(512, 512),
59
+ norm=80,
60
+ ),
61
+ SimpleNamespace(
62
+ mode="melspec",
63
+ n_fft=4096,
64
+ hop_length=32,
65
+ win_length=1024,
66
+ n_mels=512,
67
+ shape=(512, 512),
68
+ norm=80,
69
+ ),
70
+ SimpleNamespace(
71
+ mode="melspec",
72
+ n_fft=4096,
73
+ hop_length=32,
74
+ win_length=512,
75
+ n_mels=512,
76
+ shape=(512, 512),
77
+ norm=80,
78
+ ),
79
+ ],
80
+ spec_frames=SimpleNamespace(
81
+ num_frames=2, frame_sec=1.4, mixup_inner=True, mixup_alpha=0.4, extend="tile"
82
+ ),
83
+ )
84
+ transform = dict(
85
+ train=transforms.Compose(
86
+ [
87
+ transforms.Resize((512, 512)),
88
+ XYMasking(
89
+ num_masks_x=(0, 2),
90
+ num_masks_y=(0, 2),
91
+ mask_x_length=(10, 40),
92
+ mask_y_length=(10, 30),
93
+ fill_value=0,
94
+ p=0.5,
95
+ ),
96
+ # transforms.ToTensor(),
97
+ ]
98
+ ),
99
+ valid=transforms.Compose(
100
+ [
101
+ transforms.Resize((512, 512)),
102
+ # transforms.ToTensor()
103
+ ]
104
+ ),
105
+ )
106
+
107
+ loss = [
108
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
109
+ (SimpleNamespace(name="mse"), 0.2),
110
+ ]
111
+
112
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
113
+
114
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-7)
115
+
116
+ model = SimpleNamespace(
117
+ name="multi_spec_ext",
118
+ multi_spec=SimpleNamespace(
119
+ backbone="tf_efficientnetv2_s.in21k_ft_in1k",
120
+ pretrained=True,
121
+ num_classes=1,
122
+ pool_type="catavgmax",
123
+ # feature_height=16,
124
+ atten=True,
125
+ # classifier=None,
126
+ ),
127
+ )
128
+
129
+ run = SimpleNamespace(
130
+ mixup=True,
131
+ mixup_alpha=0.4,
132
+ num_epochs=20,
133
+ )
134
+
135
+ main_metric = "sys_srcc"
136
+ id_name = None
137
+
138
+
139
+ inference = SimpleNamespace(
140
+ save_path=Path("preds"),
141
+ submit_save_path=Path("submissions"),
142
+ num_tta=5,
143
+ batch_size=8,
144
+ extend="tile",
145
+ )
utmosv2/config/spec_only_wo_sarulab.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
+ batch_size = 10
9
+ num_folds = 5
10
+
11
+ sr = 16000
12
+
13
+ preprocess = SimpleNamespace(
14
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
15
+ )
16
+
17
+ split = SimpleNamespace(
18
+ type="sgkf_kind",
19
+ target="mos",
20
+ group="sys_id",
21
+ kind="dataset",
22
+ )
23
+
24
+ external_data = [
25
+ # "sarulab",
26
+ "blizzard2008",
27
+ "blizzard2009",
28
+ "blizzard2011",
29
+ "blizzard2010-EH1",
30
+ "blizzard2010-EH2",
31
+ "blizzard2010-ES1",
32
+ "blizzard2010-ES3",
33
+ "somos",
34
+ ]
35
+ use_bvcc = True
36
+
37
+
38
+ validation_dataset = "each"
39
+
40
+ dataset = SimpleNamespace(
41
+ name="multi_spec_ext",
42
+ specs=[
43
+ SimpleNamespace(
44
+ mode="melspec",
45
+ n_fft=4096,
46
+ hop_length=32,
47
+ win_length=4096,
48
+ n_mels=512,
49
+ shape=(512, 512),
50
+ norm=80,
51
+ ),
52
+ SimpleNamespace(
53
+ mode="melspec",
54
+ n_fft=4096,
55
+ hop_length=32,
56
+ win_length=2048,
57
+ n_mels=512,
58
+ shape=(512, 512),
59
+ norm=80,
60
+ ),
61
+ SimpleNamespace(
62
+ mode="melspec",
63
+ n_fft=4096,
64
+ hop_length=32,
65
+ win_length=1024,
66
+ n_mels=512,
67
+ shape=(512, 512),
68
+ norm=80,
69
+ ),
70
+ SimpleNamespace(
71
+ mode="melspec",
72
+ n_fft=4096,
73
+ hop_length=32,
74
+ win_length=512,
75
+ n_mels=512,
76
+ shape=(512, 512),
77
+ norm=80,
78
+ ),
79
+ ],
80
+ spec_frames=SimpleNamespace(
81
+ num_frames=2, frame_sec=1.4, mixup_inner=True, mixup_alpha=0.4, extend="tile"
82
+ ),
83
+ )
84
+ transform = dict(
85
+ train=transforms.Compose(
86
+ [
87
+ transforms.Resize((512, 512)),
88
+ XYMasking(
89
+ num_masks_x=(0, 2),
90
+ num_masks_y=(0, 2),
91
+ mask_x_length=(10, 40),
92
+ mask_y_length=(10, 30),
93
+ fill_value=0,
94
+ p=0.5,
95
+ ),
96
+ # transforms.ToTensor(),
97
+ ]
98
+ ),
99
+ valid=transforms.Compose(
100
+ [
101
+ transforms.Resize((512, 512)),
102
+ # transforms.ToTensor()
103
+ ]
104
+ ),
105
+ )
106
+
107
+ loss = [
108
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
109
+ (SimpleNamespace(name="mse"), 0.2),
110
+ ]
111
+
112
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
113
+
114
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-7)
115
+
116
+ model = SimpleNamespace(
117
+ name="multi_spec_ext",
118
+ multi_spec=SimpleNamespace(
119
+ backbone="tf_efficientnetv2_s.in21k_ft_in1k",
120
+ pretrained=True,
121
+ num_classes=1,
122
+ pool_type="catavgmax",
123
+ # feature_height=16,
124
+ atten=True,
125
+ # classifier=None,
126
+ ),
127
+ )
128
+
129
+ run = SimpleNamespace(
130
+ mixup=True,
131
+ mixup_alpha=0.4,
132
+ num_epochs=20,
133
+ )
134
+
135
+ main_metric = "sys_srcc"
136
+ id_name = None
137
+
138
+
139
+ inference = SimpleNamespace(
140
+ save_path=Path("preds"),
141
+ submit_save_path=Path("submissions"),
142
+ num_tta=5,
143
+ batch_size=8,
144
+ extend="tile",
145
+ )
utmosv2/config/spec_only_wo_somos.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ from torchvision import transforms
5
+
6
+ from utmosv2.transform.xymasking import XYMasking
7
+
8
+ batch_size = 10
9
+ num_folds = 5
10
+
11
+ sr = 16000
12
+
13
+ preprocess = SimpleNamespace(
14
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
15
+ )
16
+
17
+ split = SimpleNamespace(
18
+ type="sgkf_kind",
19
+ target="mos",
20
+ group="sys_id",
21
+ kind="dataset",
22
+ )
23
+
24
+ external_data = [
25
+ "sarulab",
26
+ "blizzard2008",
27
+ "blizzard2009",
28
+ "blizzard2011",
29
+ "blizzard2010-EH1",
30
+ "blizzard2010-EH2",
31
+ "blizzard2010-ES1",
32
+ "blizzard2010-ES3",
33
+ # "somos",
34
+ ]
35
+ use_bvcc = True
36
+
37
+
38
+ validation_dataset = "each"
39
+
40
+ dataset = SimpleNamespace(
41
+ name="multi_spec_ext",
42
+ specs=[
43
+ SimpleNamespace(
44
+ mode="melspec",
45
+ n_fft=4096,
46
+ hop_length=32,
47
+ win_length=4096,
48
+ n_mels=512,
49
+ shape=(512, 512),
50
+ norm=80,
51
+ ),
52
+ SimpleNamespace(
53
+ mode="melspec",
54
+ n_fft=4096,
55
+ hop_length=32,
56
+ win_length=2048,
57
+ n_mels=512,
58
+ shape=(512, 512),
59
+ norm=80,
60
+ ),
61
+ SimpleNamespace(
62
+ mode="melspec",
63
+ n_fft=4096,
64
+ hop_length=32,
65
+ win_length=1024,
66
+ n_mels=512,
67
+ shape=(512, 512),
68
+ norm=80,
69
+ ),
70
+ SimpleNamespace(
71
+ mode="melspec",
72
+ n_fft=4096,
73
+ hop_length=32,
74
+ win_length=512,
75
+ n_mels=512,
76
+ shape=(512, 512),
77
+ norm=80,
78
+ ),
79
+ ],
80
+ spec_frames=SimpleNamespace(
81
+ num_frames=2, frame_sec=1.4, mixup_inner=True, mixup_alpha=0.4, extend="tile"
82
+ ),
83
+ )
84
+ transform = dict(
85
+ train=transforms.Compose(
86
+ [
87
+ transforms.Resize((512, 512)),
88
+ XYMasking(
89
+ num_masks_x=(0, 2),
90
+ num_masks_y=(0, 2),
91
+ mask_x_length=(10, 40),
92
+ mask_y_length=(10, 30),
93
+ fill_value=0,
94
+ p=0.5,
95
+ ),
96
+ # transforms.ToTensor(),
97
+ ]
98
+ ),
99
+ valid=transforms.Compose(
100
+ [
101
+ transforms.Resize((512, 512)),
102
+ # transforms.ToTensor()
103
+ ]
104
+ ),
105
+ )
106
+
107
+ loss = [
108
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
109
+ (SimpleNamespace(name="mse"), 0.2),
110
+ ]
111
+
112
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
113
+
114
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-7)
115
+
116
+ model = SimpleNamespace(
117
+ name="multi_spec_ext",
118
+ multi_spec=SimpleNamespace(
119
+ backbone="tf_efficientnetv2_s.in21k_ft_in1k",
120
+ pretrained=True,
121
+ num_classes=1,
122
+ pool_type="catavgmax",
123
+ # feature_height=16,
124
+ atten=True,
125
+ # classifier=None,
126
+ ),
127
+ )
128
+
129
+ run = SimpleNamespace(
130
+ mixup=True,
131
+ mixup_alpha=0.4,
132
+ num_epochs=20,
133
+ )
134
+
135
+ main_metric = "sys_srcc"
136
+ id_name = None
137
+
138
+
139
+ inference = SimpleNamespace(
140
+ save_path=Path("preds"),
141
+ submit_save_path=Path("submissions"),
142
+ num_tta=5,
143
+ batch_size=8,
144
+ extend="tile",
145
+ )
utmosv2/config/ssl_only_stage1.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ batch_size = 32
5
+ num_folds = 5
6
+
7
+ sr = 16000
8
+
9
+ preprocess = SimpleNamespace(
10
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
11
+ )
12
+
13
+ split = SimpleNamespace(
14
+ type="sgkf_kind",
15
+ target="mos",
16
+ group="sys_id",
17
+ kind="dataset",
18
+ )
19
+
20
+ dataset = SimpleNamespace(
21
+ name="sslext",
22
+ ssl=SimpleNamespace(
23
+ duration=3,
24
+ ),
25
+ )
26
+
27
+ external_data = "all"
28
+ use_bvcc = True
29
+
30
+
31
+ validation_dataset = "each"
32
+
33
+ loss = [
34
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
35
+ (SimpleNamespace(name="mse"), 0.2),
36
+ ]
37
+
38
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
39
+
40
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-7)
41
+
42
+ model_path = "model"
43
+ model = SimpleNamespace(
44
+ name="sslext",
45
+ ssl=SimpleNamespace(
46
+ name="facebook/wav2vec2-base",
47
+ attn=1,
48
+ freeze=True,
49
+ num_classes=1,
50
+ ),
51
+ )
52
+
53
+ run = SimpleNamespace(
54
+ mixup=True,
55
+ mixup_alpha=0.4,
56
+ num_epochs=20,
57
+ )
58
+
59
+ main_metric = "sys_srcc"
60
+ id_name = None
61
+
62
+
63
+ inference = SimpleNamespace(
64
+ save_path=Path("preds"),
65
+ submit_save_path=Path("submissions"),
66
+ num_tta=5,
67
+ batch_size=8,
68
+ # extend="tile",
69
+ )
utmosv2/config/ssl_only_stage1_wo_bc.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ batch_size = 32
5
+ num_folds = 5
6
+
7
+ sr = 16000
8
+
9
+ preprocess = SimpleNamespace(
10
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
11
+ )
12
+
13
+ split = SimpleNamespace(
14
+ type="sgkf_kind",
15
+ target="mos",
16
+ group="sys_id",
17
+ kind="dataset",
18
+ )
19
+
20
+ dataset = SimpleNamespace(
21
+ name="sslext",
22
+ ssl=SimpleNamespace(
23
+ duration=3,
24
+ ),
25
+ )
26
+
27
+ external_data = [
28
+ "sarulab",
29
+ # "blizzard2008",
30
+ # "blizzard2009",
31
+ # "blizzard2011",
32
+ # "blizzard2010-EH1",
33
+ # "blizzard2010-EH2",
34
+ # "blizzard2010-ES1",
35
+ # "blizzard2010-ES3",
36
+ "somos",
37
+ ]
38
+ use_bvcc = True
39
+
40
+
41
+ validation_dataset = "each"
42
+
43
+ loss = [
44
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
45
+ (SimpleNamespace(name="mse"), 0.2),
46
+ ]
47
+
48
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
49
+
50
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-7)
51
+
52
+ model_path = "model"
53
+ model = SimpleNamespace(
54
+ name="sslext",
55
+ ssl=SimpleNamespace(
56
+ name="facebook/wav2vec2-base",
57
+ attn=1,
58
+ freeze=True,
59
+ num_classes=1,
60
+ ),
61
+ )
62
+
63
+ run = SimpleNamespace(
64
+ mixup=True,
65
+ mixup_alpha=0.4,
66
+ num_epochs=20,
67
+ )
68
+
69
+ main_metric = "sys_srcc"
70
+ id_name = None
71
+
72
+
73
+ inference = SimpleNamespace(
74
+ save_path=Path("preds"),
75
+ submit_save_path=Path("submissions"),
76
+ num_tta=5,
77
+ batch_size=8,
78
+ # extend="tile",
79
+ )
utmosv2/config/ssl_only_stage1_wo_bvcc.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ batch_size = 32
5
+ num_folds = 5
6
+
7
+ sr = 16000
8
+
9
+ preprocess = SimpleNamespace(
10
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
11
+ )
12
+
13
+ split = SimpleNamespace(
14
+ type="sgkf_kind",
15
+ target="mos",
16
+ group="sys_id",
17
+ kind="dataset",
18
+ )
19
+
20
+ dataset = SimpleNamespace(
21
+ name="sslext",
22
+ ssl=SimpleNamespace(
23
+ duration=3,
24
+ ),
25
+ )
26
+
27
+ external_data = [
28
+ "sarulab",
29
+ "blizzard2008",
30
+ "blizzard2009",
31
+ "blizzard2011",
32
+ "blizzard2010-EH1",
33
+ "blizzard2010-EH2",
34
+ "blizzard2010-ES1",
35
+ "blizzard2010-ES3",
36
+ "somos",
37
+ ]
38
+ use_bvcc = False
39
+
40
+
41
+ validation_dataset = "each"
42
+
43
+ loss = [
44
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
45
+ (SimpleNamespace(name="mse"), 0.2),
46
+ ]
47
+
48
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
49
+
50
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-7)
51
+
52
+ model_path = "model"
53
+ model = SimpleNamespace(
54
+ name="sslext",
55
+ ssl=SimpleNamespace(
56
+ name="facebook/wav2vec2-base",
57
+ attn=1,
58
+ freeze=True,
59
+ num_classes=1,
60
+ ),
61
+ )
62
+
63
+ run = SimpleNamespace(
64
+ mixup=True,
65
+ mixup_alpha=0.4,
66
+ num_epochs=20,
67
+ )
68
+
69
+ main_metric = "sys_srcc"
70
+ id_name = None
71
+
72
+
73
+ inference = SimpleNamespace(
74
+ save_path=Path("preds"),
75
+ submit_save_path=Path("submissions"),
76
+ num_tta=5,
77
+ batch_size=8,
78
+ # extend="tile",
79
+ )
utmosv2/config/ssl_only_stage1_wo_sarulab.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ batch_size = 32
5
+ num_folds = 5
6
+
7
+ sr = 16000
8
+
9
+ preprocess = SimpleNamespace(
10
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
11
+ )
12
+
13
+ split = SimpleNamespace(
14
+ type="sgkf_kind",
15
+ target="mos",
16
+ group="sys_id",
17
+ kind="dataset",
18
+ )
19
+
20
+ dataset = SimpleNamespace(
21
+ name="sslext",
22
+ ssl=SimpleNamespace(
23
+ duration=3,
24
+ ),
25
+ )
26
+
27
+ external_data = [
28
+ # "sarulab",
29
+ "blizzard2008",
30
+ "blizzard2009",
31
+ "blizzard2011",
32
+ "blizzard2010-EH1",
33
+ "blizzard2010-EH2",
34
+ "blizzard2010-ES1",
35
+ "blizzard2010-ES3",
36
+ "somos",
37
+ ]
38
+ use_bvcc = True
39
+
40
+
41
+ validation_dataset = "each"
42
+
43
+ loss = [
44
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
45
+ (SimpleNamespace(name="mse"), 0.2),
46
+ ]
47
+
48
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
49
+
50
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-7)
51
+
52
+ model_path = "model"
53
+ model = SimpleNamespace(
54
+ name="sslext",
55
+ ssl=SimpleNamespace(
56
+ name="facebook/wav2vec2-base",
57
+ attn=1,
58
+ freeze=True,
59
+ num_classes=1,
60
+ ),
61
+ )
62
+
63
+ run = SimpleNamespace(
64
+ mixup=True,
65
+ mixup_alpha=0.4,
66
+ num_epochs=20,
67
+ )
68
+
69
+ main_metric = "sys_srcc"
70
+ id_name = None
71
+
72
+
73
+ inference = SimpleNamespace(
74
+ save_path=Path("preds"),
75
+ submit_save_path=Path("submissions"),
76
+ num_tta=5,
77
+ batch_size=8,
78
+ # extend="tile",
79
+ )
utmosv2/config/ssl_only_stage1_wo_somos.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ batch_size = 32
5
+ num_folds = 5
6
+
7
+ sr = 16000
8
+
9
+ preprocess = SimpleNamespace(
10
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
11
+ )
12
+
13
+ split = SimpleNamespace(
14
+ type="sgkf_kind",
15
+ target="mos",
16
+ group="sys_id",
17
+ kind="dataset",
18
+ )
19
+
20
+ dataset = SimpleNamespace(
21
+ name="sslext",
22
+ ssl=SimpleNamespace(
23
+ duration=3,
24
+ ),
25
+ )
26
+
27
+ external_data = [
28
+ "sarulab",
29
+ "blizzard2008",
30
+ "blizzard2009",
31
+ "blizzard2011",
32
+ "blizzard2010-EH1",
33
+ "blizzard2010-EH2",
34
+ "blizzard2010-ES1",
35
+ "blizzard2010-ES3",
36
+ # "somos",
37
+ ]
38
+ use_bvcc = True
39
+
40
+
41
+ validation_dataset = "each"
42
+
43
+ loss = [
44
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
45
+ (SimpleNamespace(name="mse"), 0.2),
46
+ ]
47
+
48
+ optimizer = SimpleNamespace(name="adamw", lr=1e-3, weight_decay=1e-4)
49
+
50
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-7)
51
+
52
+ model_path = "model"
53
+ model = SimpleNamespace(
54
+ name="sslext",
55
+ ssl=SimpleNamespace(
56
+ name="facebook/wav2vec2-base",
57
+ attn=1,
58
+ freeze=True,
59
+ num_classes=1,
60
+ ),
61
+ )
62
+
63
+ run = SimpleNamespace(
64
+ mixup=True,
65
+ mixup_alpha=0.4,
66
+ num_epochs=20,
67
+ )
68
+
69
+ main_metric = "sys_srcc"
70
+ id_name = None
71
+
72
+
73
+ inference = SimpleNamespace(
74
+ save_path=Path("preds"),
75
+ submit_save_path=Path("submissions"),
76
+ num_tta=5,
77
+ batch_size=8,
78
+ # extend="tile",
79
+ )
utmosv2/config/ssl_only_stage2.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ batch_size = 32
5
+ num_folds = 5
6
+
7
+ sr = 16000
8
+
9
+ preprocess = SimpleNamespace(
10
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
11
+ )
12
+
13
+ split = SimpleNamespace(
14
+ type="sgkf_kind",
15
+ target="mos",
16
+ group="sys_id",
17
+ kind="dataset",
18
+ )
19
+
20
+ dataset = SimpleNamespace(
21
+ name="sslext",
22
+ ssl=SimpleNamespace(
23
+ duration=3,
24
+ ),
25
+ )
26
+
27
+ external_data = "all"
28
+ use_bvcc = True
29
+
30
+
31
+ validation_dataset = "each"
32
+
33
+ loss = [
34
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
35
+ (SimpleNamespace(name="mse"), 0.2),
36
+ ]
37
+
38
+ optimizer = SimpleNamespace(name="adamw", lr=3e-5, weight_decay=1e-4)
39
+
40
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-9)
41
+
42
+ model_path = "model"
43
+ model = SimpleNamespace(
44
+ name="sslext",
45
+ ssl=SimpleNamespace(
46
+ name="facebook/wav2vec2-base",
47
+ attn=1,
48
+ freeze=False,
49
+ num_classes=1,
50
+ ),
51
+ )
52
+
53
+ run = SimpleNamespace(
54
+ mixup=True,
55
+ mixup_alpha=0.4,
56
+ num_epochs=5,
57
+ )
58
+
59
+ main_metric = "sys_srcc"
60
+ id_name = None
61
+
62
+
63
+ inference = SimpleNamespace(
64
+ save_path=Path("preds"),
65
+ submit_save_path=Path("submissions"),
66
+ num_tta=5,
67
+ batch_size=8,
68
+ # extend="tile",
69
+ )
utmosv2/config/ssl_only_stage2_wo_bc.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ batch_size = 32
5
+ num_folds = 5
6
+
7
+ sr = 16000
8
+
9
+ preprocess = SimpleNamespace(
10
+ top_db=30, min_seconds=None, save_path=Path("preprocessed_data")
11
+ )
12
+
13
+ split = SimpleNamespace(
14
+ type="sgkf_kind",
15
+ target="mos",
16
+ group="sys_id",
17
+ kind="dataset",
18
+ )
19
+
20
+ dataset = SimpleNamespace(
21
+ name="sslext",
22
+ ssl=SimpleNamespace(
23
+ duration=3,
24
+ ),
25
+ )
26
+
27
+ external_data = [
28
+ "sarulab",
29
+ # "blizzard2008",
30
+ # "blizzard2009",
31
+ # "blizzard2011",
32
+ # "blizzard2010-EH1",
33
+ # "blizzard2010-EH2",
34
+ # "blizzard2010-ES1",
35
+ # "blizzard2010-ES3",
36
+ "somos",
37
+ ]
38
+ use_bvcc = True
39
+
40
+
41
+ validation_dataset = "each"
42
+
43
+ loss = [
44
+ (SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"), 0.7),
45
+ (SimpleNamespace(name="mse"), 0.2),
46
+ ]
47
+
48
+ optimizer = SimpleNamespace(name="adamw", lr=3e-5, weight_decay=1e-4)
49
+
50
+ scheduler = SimpleNamespace(name="cosine", T_max=None, eta_min=1e-9)
51
+
52
+ model_path = "model"
53
+ model = SimpleNamespace(
54
+ name="sslext",
55
+ ssl=SimpleNamespace(
56
+ name="facebook/wav2vec2-base",
57
+ attn=1,
58
+ freeze=False,
59
+ num_classes=1,
60
+ ),
61
+ )
62
+
63
+ run = SimpleNamespace(
64
+ mixup=True,
65
+ mixup_alpha=0.4,
66
+ num_epochs=5,
67
+ )
68
+
69
+ main_metric = "sys_srcc"
70
+ id_name = None
71
+
72
+
73
+ inference = SimpleNamespace(
74
+ save_path=Path("preds"),
75
+ submit_save_path=Path("submissions"),
76
+ num_tta=5,
77
+ batch_size=8,
78
+ # extend="tile",
79
+ )
utmosv2/config/ssl_only_stage2_wo_bvcc.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
# SSL-only training configuration. Going by the file name this is the stage-2
# ablation that trains without BVCC (`use_bvcc` is False below).

batch_size = 32
num_folds = 5

# Target sample rate in Hz.
sr = 16000

preprocess = SimpleNamespace(
    top_db=30,
    min_seconds=None,
    save_path=Path("preprocessed_data"),
)

split = SimpleNamespace(type="sgkf_kind", target="mos", group="sys_id", kind="dataset")

# `duration` is multiplied by `sr` to obtain the number of samples per crop.
dataset = SimpleNamespace(name="sslext", ssl=SimpleNamespace(duration=3))

# All external corpora stay enabled in this variant.
external_data = [
    "sarulab",
    "blizzard2008",
    "blizzard2009",
    "blizzard2011",
    "blizzard2010-EH1",
    "blizzard2010-EH2",
    "blizzard2010-ES1",
    "blizzard2010-ES3",
    "somos",
]
use_bvcc = False

validation_dataset = "each"

# (loss specification, weight) pairs.
loss = [
    (
        SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"),
        0.7,
    ),
    (
        SimpleNamespace(name="mse"),
        0.2,
    ),
]

optimizer = SimpleNamespace(
    name="adamw",
    lr=3e-5,
    weight_decay=1e-4,
)

scheduler = SimpleNamespace(
    name="cosine",
    T_max=None,
    eta_min=1e-9,
)

model_path = "model"
model = SimpleNamespace(
    name="sslext",
    ssl=SimpleNamespace(
        name="facebook/wav2vec2-base",
        attn=1,
        freeze=False,
        num_classes=1,
    ),
)

run = SimpleNamespace(mixup=True, mixup_alpha=0.4, num_epochs=5)

main_metric = "sys_srcc"
id_name = None

inference = SimpleNamespace(
    save_path=Path("preds"),
    submit_save_path=Path("submissions"),
    num_tta=5,
    batch_size=8,
    # extend="tile",
)
utmosv2/config/ssl_only_stage2_wo_sarulab.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
# SSL-only training configuration. Going by the file name this is the stage-2
# ablation that trains without the "sarulab" corpus.

batch_size = 32
num_folds = 5

# Target sample rate in Hz.
sr = 16000

preprocess = SimpleNamespace(
    top_db=30,
    min_seconds=None,
    save_path=Path("preprocessed_data"),
)

split = SimpleNamespace(type="sgkf_kind", target="mos", group="sys_id", kind="dataset")

# `duration` is multiplied by `sr` to obtain the number of samples per crop.
dataset = SimpleNamespace(name="sslext", ssl=SimpleNamespace(duration=3))

# "sarulab" is left out on purpose for this ablation.
external_data = [
    "blizzard2008",
    "blizzard2009",
    "blizzard2011",
    "blizzard2010-EH1",
    "blizzard2010-EH2",
    "blizzard2010-ES1",
    "blizzard2010-ES3",
    "somos",
]
use_bvcc = True

validation_dataset = "each"

# (loss specification, weight) pairs.
loss = [
    (
        SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"),
        0.7,
    ),
    (
        SimpleNamespace(name="mse"),
        0.2,
    ),
]

optimizer = SimpleNamespace(
    name="adamw",
    lr=3e-5,
    weight_decay=1e-4,
)

scheduler = SimpleNamespace(
    name="cosine",
    T_max=None,
    eta_min=1e-9,
)

model_path = "model"
model = SimpleNamespace(
    name="sslext",
    ssl=SimpleNamespace(
        name="facebook/wav2vec2-base",
        attn=1,
        freeze=False,
        num_classes=1,
    ),
)

run = SimpleNamespace(mixup=True, mixup_alpha=0.4, num_epochs=5)

main_metric = "sys_srcc"
id_name = None

inference = SimpleNamespace(
    save_path=Path("preds"),
    submit_save_path=Path("submissions"),
    num_tta=5,
    batch_size=8,
    # extend="tile",
)
utmosv2/config/ssl_only_stage2_wo_somos.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
# SSL-only training configuration. Going by the file name this is the stage-2
# ablation that trains without the "somos" corpus.

batch_size = 32
num_folds = 5

# Target sample rate in Hz.
sr = 16000

preprocess = SimpleNamespace(
    top_db=30,
    min_seconds=None,
    save_path=Path("preprocessed_data"),
)

split = SimpleNamespace(type="sgkf_kind", target="mos", group="sys_id", kind="dataset")

# `duration` is multiplied by `sr` to obtain the number of samples per crop.
dataset = SimpleNamespace(name="sslext", ssl=SimpleNamespace(duration=3))

# "somos" is left out on purpose for this ablation.
external_data = [
    "sarulab",
    "blizzard2008",
    "blizzard2009",
    "blizzard2011",
    "blizzard2010-EH1",
    "blizzard2010-EH2",
    "blizzard2010-ES1",
    "blizzard2010-ES3",
]
use_bvcc = True

validation_dataset = "each"

# (loss specification, weight) pairs.
loss = [
    (
        SimpleNamespace(name="pairwize_diff", margin=0.2, norm="l1"),
        0.7,
    ),
    (
        SimpleNamespace(name="mse"),
        0.2,
    ),
]

optimizer = SimpleNamespace(
    name="adamw",
    lr=3e-5,
    weight_decay=1e-4,
)

scheduler = SimpleNamespace(
    name="cosine",
    T_max=None,
    eta_min=1e-9,
)

model_path = "model"
model = SimpleNamespace(
    name="sslext",
    ssl=SimpleNamespace(
        name="facebook/wav2vec2-base",
        attn=1,
        freeze=False,
        num_classes=1,
    ),
)

run = SimpleNamespace(mixup=True, mixup_alpha=0.4, num_epochs=5)

main_metric = "sys_srcc"
id_name = None

inference = SimpleNamespace(
    save_path=Path("preds"),
    submit_save_path=Path("submissions"),
    num_tta=5,
    batch_size=8,
    # extend="tile",
)
utmosv2/dataset/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utmosv2.dataset.multi_spec import MultiSpecDataset, MultiSpecExtDataset
2
+ from utmosv2.dataset.ssl import SSLDataset, SSLExtDataset
3
+ from utmosv2.dataset.ssl_multispec import SSLLMultiSpecExtDataset
4
+
5
+ __all__ = [
6
+ "MultiSpecDataset",
7
+ "MultiSpecExtDataset",
8
+ "SSLLMultiSpecExtDataset",
9
+ "SSLDataset",
10
+ "SSLExtDataset",
11
+ ]
utmosv2/dataset/_utils.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import librosa
4
+ import numpy as np
5
+
6
+
7
def load_audio(cfg, file: str) -> np.ndarray:
    """Load a waveform from ``file``.

    Paths ending in ``.wav`` are decoded with librosa at their native sample
    rate and then resampled to ``cfg.sr``. Any other path is handed to
    ``np.load`` and returned untouched (no resampling is applied).

    Args:
        cfg: Configuration object; only ``cfg.sr`` is read here.
        file: Path of the audio file or saved array.

    Returns:
        The loaded samples.
    """
    if not file.endswith(".wav"):
        return np.load(file)

    waveform, native_sr = librosa.load(file, sr=None)
    return librosa.resample(waveform, orig_sr=native_sr, target_sr=cfg.sr)
14
+
15
+
16
def extend_audio(cfg, y: np.ndarray, length: int, type: str) -> np.ndarray:
    """Make sure ``y`` holds strictly more than ``length`` samples.

    Args:
        cfg: Unused here; kept for a uniform helper signature.
        y: Input samples; only the first axis length is inspected.
        length: Number of samples the result must exceed.
        type: Extension strategy. Only ``"tile"`` (repeat the signal) exists.

    Returns:
        ``y`` itself when it is already longer than ``length``; otherwise
        ``y`` repeated ``length // len(y) + 1`` times.

    Raises:
        NotImplementedError: If extension is needed and ``type`` is not
            ``"tile"``.
    """
    if y.shape[0] > length:
        # Already long enough: the strategy is not even looked at.
        return y
    if type != "tile":
        raise NotImplementedError

    repeats = length // y.shape[0] + 1
    return np.tile(y, repeats)
25
+
26
+
27
def select_random_start(y: np.ndarray, length: int) -> np.ndarray:
    """Return a random contiguous window of ``length`` samples from ``y``.

    The start offset is drawn uniformly from every valid position, i.e.
    ``0 .. len(y) - length`` inclusive, using NumPy's global RNG.

    Args:
        y: Input samples, sliced along the first axis.
        length: Size of the window to extract.

    Returns:
        The slice ``y[start : start + length]``.

    Raises:
        ValueError: If ``y`` has fewer than ``length`` samples.
    """
    last_start = y.shape[0] - length
    if last_start < 0:
        raise ValueError(
            f"Cannot take a window of {length} samples from {y.shape[0]} samples"
        )
    # randint's upper bound is exclusive: the +1 makes the final valid offset
    # reachable and lets an input of exactly `length` samples through.
    start = np.random.randint(0, last_start + 1)
    return y[start : start + length]
30
+
31
+
32
def get_dataset_map(cfg):
    """Build the mapping from dataset name to integer index.

    When ``cfg.data_config`` is truthy it is opened as a JSON file and the
    entries of its ``"data"`` list are numbered in order by their ``"name"``
    field. Otherwise a fixed built-in table of ten datasets is returned.

    Args:
        cfg: Configuration object exposing ``data_config``.

    Returns:
        A dict mapping dataset name to index.
    """
    if not cfg.data_config:
        builtin_names = (
            "bvcc",
            "sarulab",
            "blizzard2008",
            "blizzard2009",
            "blizzard2010-EH1",
            "blizzard2010-EH2",
            "blizzard2010-ES1",
            "blizzard2010-ES3",
            "blizzard2011",
            "somos",
        )
        return {name: index for index, name in enumerate(builtin_names)}

    with open(cfg.data_config, "r") as f:
        spec = json.load(f)
    return {entry["name"]: index for index, entry in enumerate(spec["data"])}
50
+
51
+
52
def get_dataset_num(cfg):
    """Return how many datasets ``get_dataset_map(cfg)`` knows about."""
    name_to_index = get_dataset_map(cfg)
    return len(name_to_index)
utmosv2/dataset/multi_spec.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ import pandas as pd
4
+ import torch
5
+
6
+ from utmosv2.dataset._utils import (
7
+ extend_audio,
8
+ get_dataset_map,
9
+ load_audio,
10
+ select_random_start,
11
+ )
12
+
13
+
14
class MultiSpecDataset(torch.utils.data.Dataset):
    """Dataset yielding a stack of spectrogram views plus the MOS target.

    For every item the audio is loaded, extended so it is longer than one
    frame, and ``num_frames`` random crops are taken. Each crop is turned into
    one spectrogram per entry of ``cfg.dataset.specs``.
    """

    def __init__(self, cfg, data: pd.DataFrame, phase: str, transform=None):
        """Store the configuration, the metadata table and the transforms.

        Args:
            cfg: Configuration object (``sr``, ``dataset.spec_frames`` and
                ``dataset.specs`` are read in ``__getitem__``).
            data: Table with at least ``file_path`` and ``mos`` columns.
            phase: ``"train"`` selects the training transform; anything else
                selects the ``"valid"`` one.
            transform: Mapping with ``"train"`` and ``"valid"`` callables.
                It is indexed unconditionally in ``__getitem__``.
        """
        self.cfg = cfg
        self.data = data
        self.phase = phase
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(spec, target)`` for row ``idx``.

        ``spec`` stacks ``num_frames * len(cfg.dataset.specs)`` transformed
        views; ``target`` is the row's ``mos`` as a float32 tensor.
        """
        record = self.data.iloc[idx]
        wave = load_audio(self.cfg, record["file_path"])

        frames_cfg = self.cfg.dataset.spec_frames
        frame_len = int(frames_cfg.frame_sec * self.cfg.sr)
        wave = extend_audio(self.cfg, wave, frame_len, type=frames_cfg.extend)

        # Every phase other than "train" shares the validation transform.
        stage = "train" if self.phase == "train" else "valid"

        views = []
        for _ in range(frames_cfg.num_frames):
            crop = select_random_start(wave, frame_len)
            for spec_cfg in self.cfg.dataset.specs:
                image = _make_spctrogram(self.cfg, spec_cfg, crop)
                if frames_cfg.mixup_inner:
                    # Blend with a second random crop of the same utterance.
                    other_crop = select_random_start(wave, frame_len)
                    other = _make_spctrogram(self.cfg, spec_cfg, other_crop)
                    alpha = frames_cfg.mixup_alpha
                    lam = np.random.beta(alpha, alpha)
                    image = lam * image + (1 - lam) * other
                # Replicate the single channel three times along axis 0.
                image = np.stack([image, image, image], axis=0)
                image = torch.tensor(image, dtype=torch.float32)
                views.append(self.transform[stage](image))
        spec = torch.stack(views).float()

        target = torch.tensor(record["mos"], dtype=torch.float32)
        return spec, target
55
+
56
+
57
class MultiSpecExtDataset(MultiSpecDataset):
    """``MultiSpecDataset`` that also returns a one-hot dataset indicator."""

    def __init__(self, cfg, data: pd.DataFrame, phase: str, transform=None):
        """Initialise the base dataset and cache the dataset-name mapping."""
        super().__init__(cfg, data, phase, transform)
        self.dataset_map = get_dataset_map(cfg)

    def __getitem__(self, idx):
        """Return ``(spec, d, target)`` where ``d`` one-hot encodes the row's
        ``dataset`` column according to ``self.dataset_map``."""
        spec, target = super().__getitem__(idx)

        source_name = self.data.iloc[idx]["dataset"]
        d = torch.zeros(len(self.dataset_map), dtype=torch.float32)
        d[self.dataset_map[source_name]] = 1

        return spec, d, target
70
+
71
+
72
def _make_spctrogram(cfg, spec_cfg, y: np.ndarray) -> np.ndarray:
    """Dispatch to the spectrogram builder selected by ``spec_cfg.mode``.

    Supported modes are ``"melspec"`` and ``"stft"``.

    Raises:
        NotImplementedError: For any other mode.
    """
    mode = spec_cfg.mode
    if mode == "melspec":
        return _make_melspec(cfg, spec_cfg, y)
    if mode == "stft":
        return _make_stft(cfg, spec_cfg, y)
    raise NotImplementedError
79
+
80
+
81
def _make_melspec(cfg, spec_cfg, y: np.ndarray) -> np.ndarray:
    """Compute a mel spectrogram in dB relative to its own maximum.

    Uses ``cfg.sr`` together with ``n_fft``, ``hop_length`` and ``n_mels``
    from ``spec_cfg``. When ``spec_cfg.norm`` is not ``None`` the dB values
    are rescaled as ``(spec + norm) / norm``.
    """
    mel_power = librosa.feature.melspectrogram(
        y=y,
        sr=cfg.sr,
        n_fft=spec_cfg.n_fft,
        hop_length=spec_cfg.hop_length,
        n_mels=spec_cfg.n_mels,
    )
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    if spec_cfg.norm is None:
        return mel_db
    return (mel_db + spec_cfg.norm) / spec_cfg.norm
93
+
94
+
95
def _make_stft(cfg, spec_cfg, y: np.ndarray) -> np.ndarray:
    """Compute the STFT magnitude of ``y`` in dB.

    ``cfg`` is unused; ``n_fft`` and ``hop_length`` come from ``spec_cfg``.
    """
    magnitude = np.abs(
        librosa.stft(y=y, n_fft=spec_cfg.n_fft, hop_length=spec_cfg.hop_length)
    )
    return librosa.amplitude_to_db(magnitude)
utmosv2/dataset/ssl.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import torch
4
+
5
+ from utmosv2.dataset._utils import (
6
+ extend_audio,
7
+ get_dataset_map,
8
+ load_audio,
9
+ select_random_start,
10
+ )
11
+
12
+
13
class SSLDataset(torch.utils.data.Dataset):
    """Dataset yielding a fixed-length raw-audio crop and its MOS target."""

    def __init__(self, cfg, data: pd.DataFrame, phase: str):
        """Store the configuration, the metadata table and the phase.

        Args:
            cfg: Configuration object (``sr`` and ``dataset.ssl.duration``
                are read in ``__getitem__``).
            data: Table with at least ``file_path`` and ``mos`` columns.
            phase: Stored as-is; not consulted inside this class.
        """
        self.cfg = cfg
        self.data = data
        self.phase = phase

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(y, target)`` for row ``idx``.

        ``y`` is a random crop of ``duration * sr`` samples (short audio is
        tiled first); ``target`` is the row's ``mos`` as a float32 tensor.
        """
        record = self.data.iloc[idx]
        wave = load_audio(self.cfg, record["file_path"])

        crop_len = int(self.cfg.dataset.ssl.duration * self.cfg.sr)
        wave = extend_audio(self.cfg, wave, crop_len, type="tile")
        y = select_random_start(wave, crop_len)

        target = torch.tensor(record["mos"], dtype=torch.float32)
        return y, target
34
+
35
+
36
class SSLExtDataset(SSLDataset):
    """``SSLDataset`` that also returns a one-hot dataset indicator."""

    def __init__(self, cfg, data: pd.DataFrame, phase: str):
        """Initialise the base dataset and cache the dataset-name mapping."""
        super().__init__(cfg, data, phase)
        self.dataset_map = get_dataset_map(cfg)

    def __getitem__(self, idx):
        """Return ``(y, d, target)`` where ``d`` one-hot encodes the row's
        ``dataset`` column according to ``self.dataset_map``."""
        y, target = super().__getitem__(idx)

        source_name = self.data.iloc[idx]["dataset"]
        d = torch.zeros(len(self.dataset_map), dtype=torch.float32)
        d[self.dataset_map[source_name]] = 1

        return y, d, target
utmosv2/dataset/ssl_multispec.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import torch
3
+
4
+ from utmosv2.dataset import MultiSpecDataset, SSLExtDataset
5
+
6
+
7
class SSLLMultiSpecExtDataset(torch.utils.data.Dataset):
    """Pairs the raw-audio view and the spectrogram view of each row.

    It wraps an ``SSLExtDataset`` and a ``MultiSpecDataset`` built over the
    same table; each of them is queried separately for a given index.
    """

    def __init__(self, cfg, data: pd.DataFrame, phase: str, transform=None):
        """Build both underlying datasets from the same arguments.

        ``transform`` is forwarded to the spectrogram dataset only.
        """
        self.data = data
        self.ssl = SSLExtDataset(cfg, data, phase)
        self.multi_spec = MultiSpecDataset(cfg, data, phase, transform)

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(x1, x2, d, target)``: audio crop, spectrogram stack,
        dataset one-hot and target. The target comes from the SSL dataset;
        the one produced by the spectrogram dataset is discarded."""
        audio, d, target = self.ssl[idx]
        spec_stack, _unused_target = self.multi_spec[idx]
        return audio, spec_stack, d, target
utmosv2/loss/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from utmosv2.loss.losses import CombinedLoss, PairwizeDiffLoss
2
+
3
+ __all__ = ["PairwizeDiffLoss", "CombinedLoss"]
utmosv2/loss/losses.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
class PairwizeDiffLoss(nn.Module):
    """Margin loss on pairwise differences between predictions and targets.

    For every ordered pair ``(i, j)`` the predicted difference
    ``input[i] - input[j]`` is compared with the target difference
    ``target[i] - target[j]``; deviations up to ``margin`` are not penalised.
    """

    def __init__(self, margin: float = 0.2, norm: str = "l1"):
        """Store the margin and the name of the deviation measure.

        Args:
            margin: Deviation tolerated before a pair contributes to the loss.
            norm: ``"l1"`` (absolute value) or ``"l2_squared"`` (square).
                The value is validated in ``forward``, not here.
        """
        super().__init__()
        self.margin = margin
        self.norm = norm

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """Return half of the mean hinged pairwise deviation.

        Raises:
            ValueError: If ``self.norm`` is not a supported name.
        """
        pred_gap = input.unsqueeze(1) - input.unsqueeze(0)
        true_gap = target.unsqueeze(1) - target.unsqueeze(0)

        if self.norm == "l1":
            deviation = torch.abs(pred_gap - true_gap)
        elif self.norm == "l2_squared":
            deviation = (pred_gap - true_gap) ** 2
        else:
            raise ValueError(
                f'Unknown norm: {self.norm}. Must be one of ["l1", "l2_squared"]'
            )

        hinged = F.relu(deviation - self.margin)
        return hinged.mean().div(2)
27
+
28
+
29
class CombinedLoss(nn.Module):
    """Evaluates several losses on the same input and reports each one.

    The losses are not summed here: ``forward`` returns every value together
    with its weight.
    """

    def __init__(self, weighted_losses: list[tuple[nn.Module, float]]):
        """Keep the list of ``(loss_module, weight)`` pairs as given."""
        super().__init__()
        self.weighted_losses = weighted_losses

    def forward(
        self, input: torch.Tensor, target: torch.Tensor
    ) -> list[tuple[float, torch.Tensor]]:
        """Return ``(weight, loss_value)`` pairs in the configured order.

        Note the order flips relative to the constructor argument, which
        holds ``(loss_module, weight)``.
        """
        results = []
        for criterion, weight in self.weighted_losses:
            results.append((weight, criterion(input, target)))
        return results