Spaces:
Sleeping
Sleeping
Marcos12886
committed on
model.py: probar cargando datasets hf
Browse files
model.py
CHANGED
@@ -1,104 +1,171 @@
|
|
1 |
-
import torch
|
2 |
-
import numpy as np
|
3 |
import os
|
|
|
|
|
|
|
|
|
|
|
4 |
from huggingface_hub import login, upload_folder
|
5 |
-
from datasets import load_dataset, Audio
|
6 |
from transformers.integrations import TensorBoardCallback
|
|
|
7 |
from transformers import (
|
8 |
-
Wav2Vec2FeatureExtractor,
|
9 |
Trainer, TrainingArguments,
|
10 |
EarlyStoppingCallback
|
11 |
)
|
12 |
-
|
13 |
-
# SE USA FLOAT32 EN EL MODELO ORIGINAL
|
14 |
MODEL = "ntu-spml/distilhubert" # modelo base utilizado, para usar otro basta con cambiar esto
|
15 |
FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(MODEL)
|
16 |
seed = 123
|
17 |
MAX_DURATION = 1.00
|
18 |
-
SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate # 16000
|
19 |
-
token = os.getenv(
|
20 |
config_file = "models_config.json"
|
21 |
clasificador = "class"
|
22 |
monitor = "mon"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
def seed_everything():
    """Seed NumPy and PyTorch with the module-level ``seed`` and request deterministic kernels."""
    # Same seed for every generator this script touches.
    for seeder in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    # Workspace setting read by cuBLAS; value kept exactly as the author chose it.
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16384:8'
|
31 |
|
32 |
-
def
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
truncation=True, # Muchísimo más rápido.
|
43 |
-
padding=True, # Vectores igual longitud
|
44 |
-
do_normalize=True, # No afecta 1ª época, no sé si necesario
|
45 |
-
# return_attention_mask=True, # Empeora 1ª época. No sé si necesario
|
46 |
-
padding_value=0.0, # No afecta 1ª época, no sé si necesario
|
47 |
-
float=32 # No afecta 1ª época, no sé si necesario
|
48 |
-
)
|
49 |
-
return inputs
|
50 |
|
51 |
-
def
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
def load_model(num_labels, label2id, id2label):
|
62 |
-
|
63 |
MODEL,
|
64 |
num_labels=num_labels,
|
65 |
label2id=label2id,
|
66 |
-
id2label=id2label
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
)
|
|
|
68 |
return model
|
69 |
|
70 |
def model_params(dataset_path):
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
model = load_model(len(id2label), label2id, id2label)
|
75 |
-
return model, encoded_dataset, id2label
|
76 |
|
77 |
def compute_metrics(eval_pred):
|
78 |
-
predictions =
|
79 |
-
references = eval_pred.label_ids
|
|
|
|
|
80 |
return {
|
81 |
-
"accuracy":
|
|
|
|
|
|
|
82 |
}
|
83 |
|
84 |
-
def
|
85 |
-
|
86 |
-
|
87 |
-
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)
|
88 |
trainer = Trainer(
|
89 |
model=model,
|
90 |
args=training_args,
|
91 |
compute_metrics=compute_metrics,
|
92 |
-
train_dataset=
|
93 |
-
eval_dataset=
|
94 |
-
callbacks=[
|
95 |
)
|
96 |
torch.cuda.empty_cache() # liberar memoria de la GPU
|
97 |
trainer.train() # se pueden modificar los parámetros para continuar el train
|
|
|
98 |
trainer.push_to_hub(token=token) # Subir modelo a mi cuenta. Necesario para hacer la predicción, no sé por qué.
|
99 |
trainer.save_model(output_dir) # para subir el modelo a Hugging Face. Necesario para hacer la predicción, no sé por qué.
|
100 |
os.makedirs(output_dir, exist_ok=True) # Crear carpeta con el modelo si no existe
|
101 |
-
upload_folder(repo_id=f"A-POR-LOS-8000/{output_dir}",folder_path=output_dir, token=token) # subir modelo a organización
|
102 |
|
103 |
def load_config(model_name):
|
104 |
with open(config_file, 'r') as f:
|
@@ -109,9 +176,9 @@ def load_config(model_name):
|
|
109 |
return model_config
|
110 |
|
111 |
if __name__ == "__main__":
|
112 |
-
config = load_config(clasificador) # PARA CAMBIAR MODELOS
|
113 |
-
|
114 |
training_args = config["training_args"]
|
115 |
output_dir = config["output_dir"]
|
116 |
dataset_path = config["dataset_path"]
|
117 |
-
|
|
|
|
|
|
|
1 |
import os
|
2 |
+
import json
|
3 |
+
import random
|
4 |
+
import torch
|
5 |
+
import torchaudio
|
6 |
+
from torch.utils.data import Dataset, DataLoader
|
7 |
from huggingface_hub import login, upload_folder
|
|
|
8 |
from transformers.integrations import TensorBoardCallback
|
9 |
+
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
|
10 |
from transformers import (
|
11 |
+
Wav2Vec2FeatureExtractor, HubertConfig, HubertForSequenceClassification,
|
12 |
Trainer, TrainingArguments,
|
13 |
EarlyStoppingCallback
|
14 |
)
|
15 |
+
|
|
|
16 |
# Base checkpoint; change this identifier to fine-tune a different model.
MODEL = "ntu-spml/distilhubert"
# Feature extractor matching the base checkpoint.
FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(MODEL)
# Seed used by seed_everything().
seed = 123
# Clip length in seconds; multiplied by SAMPLING_RATE to get the number of frames kept.
MAX_DURATION = 1.0
# Sampling rate the feature extractor expects (16000 according to the original author).
SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate
# Hugging Face access token read from the environment (None when unset).
token = os.environ.get("HF_TOKEN")
# JSON file read by load_config().
config_file = "models_config.json"
# Names passed to load_config() to select which model to train.
clasificador = "class"
monitor = "mon"
# Batch size used by the DataLoaders built in create_dataloader().
batch_size = 16
|
26 |
+
|
27 |
+
class AudioDataset(Dataset):
    """Map-style dataset over a directory tree with one sub-directory per label.

    ``dataset_path/<label_dir>/<file>`` is scanned for every ``label_dir`` key
    of ``label2id``; each file found becomes one sample whose label is the
    mapped id. Keys whose directory does not exist are skipped.
    """

    def __init__(self, dataset_path, label2id):
        """Index every file below ``dataset_path`` for the labels in ``label2id``.

        Args:
            dataset_path: Root directory containing one folder per label.
            label2id: Mapping from label directory name to integer class id.
        """
        self.dataset_path = dataset_path
        self.label2id = label2id
        self.file_paths = []
        self.labels = []
        for label_dir, label_id in self.label2id.items():
            label_path = os.path.join(self.dataset_path, label_dir)
            if not os.path.isdir(label_path):
                continue
            # os.listdir order is arbitrary; sorting keeps sample indices
            # (and therefore any seeded split) reproducible.
            for file_name in sorted(os.listdir(label_path)):
                self.file_paths.append(os.path.join(label_path, file_name))
                self.labels.append(label_id)

    def __len__(self):
        """Return the number of indexed audio files."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return the ``input_values`` tensor and ``labels`` tensor for sample ``idx``."""
        audio_path = self.file_paths[idx]
        label = self.labels[idx]
        input_values = self.preprocess_audio(audio_path)
        return {
            "input_values": input_values,
            "labels": torch.tensor(label)
        }

    def preprocess_audio(self, audio_path):
        """Load one file and return a 1-D tensor of exactly ``MAX_DURATION`` seconds.

        Steps: load, resample to ``SAMPLING_RATE`` if needed, down-mix to mono,
        peak-normalise, truncate or zero-pad to the fixed length, then run the
        feature extractor.
        """
        waveform, sample_rate = torchaudio.load(
            audio_path,
            normalize=True,  # converts to float32 (per the original author's note)
            # TODO: try ``num_frames`` so clips do not need to be trimmed afterwards
        )
        if sample_rate != SAMPLING_RATE:  # resample when the file is not at the target rate
            resampler = torchaudio.transforms.Resample(sample_rate, SAMPLING_RATE)
            waveform = resampler(waveform)
        # torchaudio.load returns (channels, frames). Always averaging over the
        # channel axis also collapses mono files to 1-D; previously mono kept
        # shape (1, frames), so the length check below compared the channel
        # count and never truncated.
        waveform = waveform.mean(dim=0)
        if waveform.numel() > 0:
            # The epsilon avoids dividing by zero on all-silent clips; the
            # author reports accuracy is very poor without it.
            waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-6)
        max_length = int(SAMPLING_RATE * MAX_DURATION)
        num_frames = waveform.shape[0]
        if num_frames > max_length:
            waveform = waveform[:max_length]
        else:
            waveform = torch.nn.functional.pad(waveform, (0, max_length - num_frames))
        inputs = FEATURE_EXTRACTOR(
            waveform.numpy(),  # 1-D numpy array is the extractor's documented single-example input
            sampling_rate=SAMPLING_RATE,
            return_tensors="pt",
            padding=True,
        )
        # Drop only the batch axis added by the extractor.
        return inputs.input_values.squeeze(0)
|
79 |
|
80 |
def seed_everything():
    """Seed every random generator this script uses with the module-level ``seed``.

    Python's ``random`` is seeded as well because the train/test split in
    ``create_dataloader`` is drawn with ``random.shuffle``; without it the
    split changes on every run.
    """
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # also covers additional GPUs, no-op without CUDA
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Workspace setting read by cuBLAS; value kept exactly as the author chose it.
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16384:8'
|
86 |
|
87 |
+
def build_label_mappings(dataset_path):
    """Build the label <-> id mappings from the sub-directories of ``dataset_path``.

    Directory names are sorted before ids are assigned, so the same dataset
    always yields the same ids (``os.listdir`` order is arbitrary). Plain
    files directly under ``dataset_path`` are ignored.

    Args:
        dataset_path: Root directory containing one folder per label.

    Returns:
        A ``(label2id, id2label)`` tuple of dicts with ids starting at 0.
    """
    label_dirs = sorted(
        entry
        for entry in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, entry))
    )
    label2id = {label_dir: label_id for label_id, label_dir in enumerate(label_dirs)}
    id2label = dict(enumerate(label_dirs))
    return label2id, id2label
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
|
98 |
+
def create_dataloader(dataset_path, test_size=0.2, num_workers=12, shuffle=True, pin_memory=True):
    """Split the audio dataset at random and wrap both parts in DataLoaders.

    Args:
        dataset_path: Root directory containing one folder per label.
        test_size: Fraction of the samples placed in the test split.
        num_workers: Worker count forwarded to both DataLoaders.
        shuffle: Shuffle flag forwarded to both DataLoaders.
        pin_memory: Pin-memory flag forwarded to both DataLoaders.

    Returns:
        ``(train_dataloader, test_dataloader, label2id, id2label)``.
    """
    label2id, id2label = build_label_mappings(dataset_path)
    full_dataset = AudioDataset(dataset_path, label2id)
    total = len(full_dataset)

    # Random permutation of sample indices; the first part is the train split.
    order = list(range(total))
    random.shuffle(order)
    cut = int(total * (1 - test_size))

    # Both loaders share the same options, including the module-level batch size.
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )
    train_dataloader = DataLoader(
        torch.utils.data.Subset(full_dataset, order[:cut]), **loader_options
    )
    test_dataloader = DataLoader(
        torch.utils.data.Subset(full_dataset, order[cut:]), **loader_options
    )
    return train_dataloader, test_dataloader, label2id, id2label
|
116 |
|
117 |
def load_model(num_labels, label2id, id2label):
    """Load the base checkpoint with a classification head sized for the given labels.

    Args:
        num_labels: Number of target classes.
        label2id: Mapping from label name to class id, stored in the config.
        id2label: Mapping from class id to label name, stored in the config.

    Returns:
        The ``HubertForSequenceClassification`` model, moved to CUDA when available.
    """
    hubert_config = HubertConfig.from_pretrained(
        MODEL,
        num_labels=num_labels,
        label2id=label2id,
        id2label=id2label,
        finetuning_task="audio-classification",
    )
    # TODO: review the remaining from_pretrained parameters for possible optimisations.
    model = HubertForSequenceClassification.from_pretrained(
        MODEL,
        config=hubert_config,
        torch_dtype=torch.float32,  # author's note: no effect on the first epoch, but kept explicit
    )
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(target_device)
    return model
|
133 |
|
134 |
def model_params(dataset_path):
    """Build the data loaders for ``dataset_path`` and a model sized to its labels.

    Returns:
        ``(model, train_dataloader, test_dataloader, id2label)``.
    """
    loaders_and_labels = create_dataloader(dataset_path)
    train_dataloader, test_dataloader, label2id, id2label = loaders_and_labels
    # One output class per discovered label.
    classifier = load_model(len(id2label), label2id, id2label)
    return classifier, train_dataloader, test_dataloader, id2label
|
|
|
|
|
138 |
|
139 |
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer evaluation.

    Args:
        eval_pred: Object exposing ``predictions`` (logits, or a tuple whose
            first element is the logits) and ``label_ids``.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        # Some models return extra outputs next to the logits; use the logits only.
        logits = logits[0]
    predictions = torch.argmax(torch.as_tensor(logits), dim=-1).numpy()
    references = torch.as_tensor(eval_pred.label_ids).numpy()
    accuracy = accuracy_score(references, predictions)
    # zero_division=0 keeps the same score (0) for classes that are never
    # predicted, without emitting an undefined-metric warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        references, predictions, average='weighted', zero_division=0
    )
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
|
150 |
|
151 |
+
def main(training_args, output_dir, dataset_path):
    """Train the classifier on ``dataset_path``, save it locally and push it to the Hub.

    Args:
        training_args: Training arguments handed to ``Trainer`` unchanged.
        output_dir: Directory where the trained model is saved.
        dataset_path: Root directory containing one folder per label.
    """
    seed_everything()
    model, train_dataloader, test_dataloader, _ = model_params(dataset_path)
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        # Trainer builds its own loaders, so only the underlying datasets are passed.
        train_dataset=train_dataloader.dataset,
        eval_dataset=test_dataloader.dataset,
        callbacks=[TensorBoardCallback(), EarlyStoppingCallback(early_stopping_patience=3)]
    )
    torch.cuda.empty_cache()  # free cached GPU memory before training
    trainer.train()  # arguments can be passed here to resume a previous run

    # Save locally before any network call, so a failed upload cannot lose the
    # trained weights. The directory is created before it is written to.
    os.makedirs(output_dir, exist_ok=True)
    trainer.save_model(output_dir)

    login(token, add_to_git_credential=True)
    # The author notes that pushing to the personal account is needed for
    # prediction to work later (reason unknown).
    trainer.push_to_hub(token=token)
    # Upload to the organisation repository is currently disabled:
    # upload_folder(repo_id=f"A-POR-LOS-8000/{output_dir}",folder_path=output_dir, token=token)
|
169 |
|
170 |
def load_config(model_name):
|
171 |
with open(config_file, 'r') as f:
|
|
|
176 |
return model_config
|
177 |
|
178 |
if __name__ == "__main__":
    # Pass `clasificador` instead of `monitor` to train the other configured model.
    config = load_config(monitor)
    training_args, output_dir, dataset_path = [
        config[key] for key in ("training_args", "output_dir", "dataset_path")
    ]
    main(training_args, output_dir, dataset_path)
|