# NOTE: extraction artifact (file-size header and line-number index) removed —
# it was not valid Python and would break the module at import time.
import os
from typing import Dict, Optional

import gdown
import torch
from diacritization_evaluation import der, util, wer
from torch import nn, optim
from torch.cuda.amp import autocast
from torch.utils.tensorboard.writer import SummaryWriter
from tqdm import trange
from tqdm.notebook import tqdm

from .config_manager import ConfigManager
from .dataset import load_iterators
from .diacritizer import CBHGDiacritizer, Seq2SeqDiacritizer
from .options import OptimizerType
class Trainer:
    """Abstract base class for training drivers.

    Concrete subclasses are expected to override :meth:`run` with their
    actual training loop.
    """

    def run(self):
        """Execute the training loop; must be implemented by subclasses."""
        raise NotImplementedError
class GeneralTrainer(Trainer):
    """Shared trainer scaffolding.

    Builds the model, loss, optimizer, summary writer, and diacritizer from a
    ConfigManager-driven config, and handles checkpoint resume.
    """

    def __init__(self, config_path: str, model_kind: str) -> None:
        """Construct all training state.

        Args:
            config_path: path to the config file consumed by ConfigManager.
            model_kind: model identifier (e.g. "cbhg", "seq2seq",
                "tacotron_based", "transformer").
        """
        self.config_path = config_path
        self.model_kind = model_kind
        self.config_manager = ConfigManager(
            config_path=config_path, model_kind=model_kind
        )
        self.config = self.config_manager.config
        self.losses = []
        self.lr = 0
        # index 0 is the padding symbol; it is excluded from the loss
        self.pad_idx = 0
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.pad_idx)
        self.set_device()
        self.config_manager.create_remove_dirs()
        self.text_encoder = self.config_manager.text_encoder
        self.start_symbol_id = self.text_encoder.start_symbol_id
        self.summary_manager = SummaryWriter(log_dir=self.config_manager.log_dir)
        self.model = self.config_manager.get_model()
        self.optimizer = self.get_optimizer()
        self.model = self.model.to(self.device)
        # resume from a checkpoint when the config points at one
        self.load_model(model_path=self.config.get("train_resume_model_path"))
        self.load_diacritizer()
        self.initialize_model()

    def set_device(self):
        """Choose the torch device: the config's "device" entry if present,
        otherwise CUDA when available, else CPU."""
        if self.config.get("device"):
            self.device = self.config["device"]
        else:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_diacritizer(self):
        """Instantiate the diacritizer matching ``self.model_kind``.

        Unknown kinds leave ``self.diacritizer`` unset (unchanged behavior).
        """
        if self.model_kind in ["cbhg", "baseline"]:
            self.diacritizer = CBHGDiacritizer(self.config_path, self.model_kind)
        elif self.model_kind in ["seq2seq", "tacotron_based"]:
            self.diacritizer = Seq2SeqDiacritizer(self.config_path, self.model_kind)

    def initialize_model(self):
        """Apply weight initialization, but only on fresh runs (step <= 1)."""
        if self.global_step > 1:
            # resuming from a checkpoint: keep the loaded weights
            return
        if self.model_kind == "transformer":
            print("Initializing using xavier_uniform_")
            # NOTE(review): `initialize_weights` is not defined in this
            # module's visible scope — presumably imported/defined elsewhere
            # in the file; confirm before relying on the transformer path.
            self.model.apply(initialize_weights)

    def load_model(self, model_path: Optional[str] = None, load_optimizer: bool = True):
        """Restore model (and optionally optimizer) state from a checkpoint.

        Falls back to the most recent checkpoint when ``model_path`` is None;
        if none exists, training starts at global step 1. Also dumps the
        model repr to ``<model_kind>_network.txt`` for inspection.

        Args:
            model_path: explicit checkpoint path, or None for the latest one.
            load_optimizer: also restore the optimizer state dict.
        """
        with open(
            self.config_manager.base_dir / f"{self.model_kind}_network.txt", "w"
        ) as file:
            file.write(str(self.model))
        if model_path is None:
            last_model_path = self.config_manager.get_last_model_path()
            if last_model_path is None:
                self.global_step = 1
                return
        else:
            last_model_path = model_path
        print(f"loading from {last_model_path}")
        # Map tensors onto the device chosen by set_device(). The previous
        # code passed torch.device(self.config.get("device")), which is
        # torch.device(None) — an error — whenever the config omits "device".
        saved_model = torch.load(
            last_model_path, map_location=torch.device(self.device)
        )
        self.model.load_state_dict(saved_model["model_state_dict"])
        if load_optimizer:
            self.optimizer.load_state_dict(saved_model["optimizer_state_dict"])
        self.global_step = saved_model["global_step"] + 1
class DiacritizationTester(GeneralTrainer):
    """Inference-only harness: loads a trained model and diacritizes text."""

    def __init__(self, config_path: str, model_kind: str, model_path: str) -> None:
        """Load config, model weights (no optimizer), and the diacritizer.

        Args:
            config_path: path to the config file consumed by ConfigManager.
            model_kind: model identifier (e.g. "cbhg", "seq2seq").
            model_path: checkpoint to restore the model weights from.
        """
        self.config_path = config_path
        self.model_kind = model_kind
        self.config_manager = ConfigManager(
            config_path=config_path, model_kind=model_kind
        )
        self.config = self.config_manager.config
        # index 0 is the padding symbol; it is excluded from the loss
        self.pad_idx = 0
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.pad_idx)
        self.set_device()
        self.text_encoder = self.config_manager.text_encoder
        self.start_symbol_id = self.text_encoder.start_symbol_id
        self.model = self.config_manager.get_model()
        self.model = self.model.to(self.device)
        # test time: restore the weights only, never the optimizer state
        self.load_model(model_path=model_path, load_optimizer=False)
        self.load_diacritizer()
        self.diacritizer.set_model(self.model)
        self.initialize_model()

    def collate_fn(self, data):
        """Pad a list of (input, target, original_text) examples into a batch.

        Returns a dict with keys "original", "src", "target", and "lengths"
        (source lengths; source and target lengths are identical here).
        """

        def merge(sequences):
            # right-pad every sequence to the length of the longest one
            lengths = [len(seq) for seq in sequences]
            padded_seqs = torch.zeros(len(sequences), max(lengths)).long()
            for i, seq in enumerate(sequences):
                end = lengths[i]
                padded_seqs[i, :end] = seq[:end]
            return padded_seqs, lengths

        # sort by decreasing source length — presumably for packed RNN
        # sequences downstream; confirm against the model's forward pass
        data.sort(key=lambda x: len(x[0]), reverse=True)
        # separate source and target sequences
        src_seqs, trg_seqs, original = zip(*data)
        # merge sequences (from tuple of 1D tensor to 2D tensor)
        src_seqs, src_lengths = merge(src_seqs)
        trg_seqs, _ = merge(trg_seqs)  # target lengths equal source lengths
        batch = {
            "original": original,
            "src": src_seqs,
            "target": trg_seqs,
            "lengths": torch.LongTensor(src_lengths),  # src_lengths = trg_lengths
        }
        return batch

    def get_batch(self, sentence):
        """Encode a raw sentence into a single-example padded batch."""
        data = self.text_encoder.clean(sentence)
        text, inputs, diacritics = util.extract_haraqat(data)
        # float tensors here are fine: merge() copies them into long tensors
        inputs = torch.Tensor(self.text_encoder.input_to_sequence("".join(inputs)))
        diacritics = torch.Tensor(self.text_encoder.target_to_sequence(diacritics))
        batch = self.collate_fn([(inputs, diacritics, text)])
        return batch

    def infer(self, sentence):
        """Diacritize a single sentence and return the predicted string."""
        self.model.eval()
        batch = self.get_batch(sentence)
        predicted = self.diacritizer.diacritize_batch(batch)
        return predicted[0]
# (extraction artifact removed: stray trailing "|")