# Ashaar / poetry_diacritizer / config_manager.py
from enum import Enum
import os
from pathlib import Path
import shutil
import subprocess
from typing import Any, Dict
import ruamel.yaml
import torch
from poetry_diacritizer.models.baseline import BaseLineModel
from poetry_diacritizer.models.cbhg import CBHGModel
from poetry_diacritizer.models.gpt import GPTModel
from poetry_diacritizer.models.seq2seq import Decoder as Seq2SeqDecoder, Encoder as Seq2SeqEncoder, Seq2Seq
from poetry_diacritizer.models.tacotron_based import (
Decoder as TacotronDecoder,
Encoder as TacotronEncoder,
Tacotron,
)
from poetry_diacritizer.options import AttentionType, LossType, OptimizerType
from poetry_diacritizer.util.text_encoders import (
ArabicEncoderWithStartSymbol,
BasicArabicEncoder,
TextEncoder,
)
class ConfigManager:
"""Co/home/almodhfer/Projects/daicritization/temp_results/CA_MSA/cbhg-new/model-10.ptnfig Manager"""
def __init__(self, config_path: str, model_kind: str):
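        """Load the YAML configuration at `config_path` and set up session paths,
        the text encoder, and model-specific options for the given `model_kind`."""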
available_models = ["baseline", "cbhg", "seq2seq", "tacotron_based", "gpt"]
if model_kind not in available_models:
            raise ValueError(f"model_kind must be in {available_models}")
self.config_path = Path(config_path)
self.model_kind = model_kind
self.yaml = ruamel.yaml.YAML()
self.config: Dict[str, Any] = self._load_config()
self.git_hash = self._get_git_hash()
self.session_name = ".".join(
[
self.config["data_type"],
self.config["session_name"],
f"{model_kind}",
]
)
self.data_dir = Path(
os.path.join(self.config["data_directory"], self.config["data_type"])
)
self.base_dir = Path(
os.path.join(self.config["log_directory"], self.session_name)
)
self.log_dir = Path(os.path.join(self.base_dir, "logs"))
self.prediction_dir = Path(os.path.join(self.base_dir, "predictions"))
self.plot_dir = Path(os.path.join(self.base_dir, "plots"))
self.models_dir = Path(os.path.join(self.base_dir, "models"))
if "sp_model_path" in self.config:
self.sp_model_path = self.config["sp_model_path"]
else:
self.sp_model_path = None
self.text_encoder: TextEncoder = self.get_text_encoder()
self.config["len_input_symbols"] = len(self.text_encoder.input_symbols)
self.config["len_target_symbols"] = len(self.text_encoder.target_symbols)
if self.model_kind in ["seq2seq", "tacotron_based"]:
self.config["attention_type"] = AttentionType[self.config["attention_type"]]
self.config["optimizer"] = OptimizerType[self.config["optimizer_type"]]
def _load_config(self):
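        """Read the YAML configuration file into a dict."""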
with open(self.config_path, "rb") as model_yaml:
_config = self.yaml.load(model_yaml)
return _config
@staticmethod
def _get_git_hash():
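        """Return the current git revision via `git describe --always`,
        or None (with a warning) if it cannot be retrieved."""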
try:
return (
subprocess.check_output(["git", "describe", "--always"])
.strip()
.decode()
)
except Exception as e:
print(f"WARNING: could not retrieve git hash. {e}")
def _check_hash(self):
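        """Warn when the git hash stored in the config differs from the current revision."""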
try:
git_hash = (
subprocess.check_output(["git", "describe", "--always"])
.strip()
.decode()
)
if self.config["git_hash"] != git_hash:
print(
f"""WARNING: git hash mismatch. Current: {git_hash}.
Config hash: {self.config['git_hash']}"""
)
except Exception as e:
print(f"WARNING: could not check git hash. {e}")
@staticmethod
def _print_dict_values(values, key_name, level=0, tab_size=2):
tab = level * tab_size * " "
print(tab + "-", key_name, ":", values)
    def _print_dictionary(self, dictionary, recursion_level=0):
        for key in dictionary.keys():
            # recurse into nested dicts (check the value, not the key), one level deeper
            if isinstance(dictionary[key], dict):
                self._print_dictionary(dictionary[key], recursion_level + 1)
            else:
                self._print_dict_values(
                    dictionary[key], key_name=key, level=recursion_level
                )
def print_config(self):
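        """Pretty-print the session name and the full configuration."""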
print("\nCONFIGURATION", self.session_name)
self._print_dictionary(self.config)
def update_config(self):
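        """Refresh the git hash stored in the config."""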
self.config["git_hash"] = self._get_git_hash()
def dump_config(self):
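        """Write the configuration to `base_dir/config.yml`, serializing enum values by name."""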
self.update_config()
_config = {}
for key, val in self.config.items():
if isinstance(val, Enum):
_config[key] = val.name
else:
_config[key] = val
with open(self.base_dir / "config.yml", "w") as model_yaml:
self.yaml.dump(_config, model_yaml)
def create_remove_dirs(
self,
clear_dir: bool = False,
clear_logs: bool = False,
clear_weights: bool = False,
clear_all: bool = False,
):
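        """Create the session directories, optionally prompting first to clear
        existing logs and/or saved models."""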
self.base_dir.mkdir(exist_ok=True, parents=True)
self.plot_dir.mkdir(exist_ok=True)
self.prediction_dir.mkdir(exist_ok=True)
if clear_dir:
delete = input(f"Delete {self.log_dir} AND {self.models_dir}? (y/[n])")
if delete == "y":
shutil.rmtree(self.log_dir, ignore_errors=True)
shutil.rmtree(self.models_dir, ignore_errors=True)
if clear_logs:
delete = input(f"Delete {self.log_dir}? (y/[n])")
if delete == "y":
shutil.rmtree(self.log_dir, ignore_errors=True)
if clear_weights:
delete = input(f"Delete {self.models_dir}? (y/[n])")
if delete == "y":
shutil.rmtree(self.models_dir, ignore_errors=True)
self.log_dir.mkdir(exist_ok=True)
self.models_dir.mkdir(exist_ok=True)
    def get_last_model_path(self):
        """
        Return the path of the latest checkpoint in `models_dir`, or None if
        no checkpoint exists. Checkpoints are named `<global_step>-snapshot.pt`.
        """
        models = os.listdir(self.models_dir)
        models = [model for model in models if model.endswith(".pt")]
if len(models) == 0:
return None
_max = max(int(m.split(".")[0].split("-")[0]) for m in models)
model_name = f"{_max}-snapshot.pt"
last_model_path = os.path.join(self.models_dir, model_name)
return last_model_path
    def load_model(self, model_path: str = None):
        """
        Build the model and restore its weights from a checkpoint.

        Args:
            model_path (str, optional): path to the checkpoint to load. If None,
                the latest checkpoint in `models_dir` is used; if none exists,
                a freshly initialized model is returned with global_step = 1.

        Returns:
            A tuple (model, global_step).
        """
model = self.get_model()
with open(self.base_dir / f"{self.model_kind}_network.txt", "w") as file:
file.write(str(model))
if model_path is None:
last_model_path = self.get_last_model_path()
if last_model_path is None:
return model, 1
else:
last_model_path = model_path
saved_model = torch.load(last_model_path)
out = model.load_state_dict(saved_model["model_state_dict"])
print(out)
global_step = saved_model["global_step"] + 1
return model, global_step
def get_model(self, ignore_hash=False):
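        """Build the model selected by `model_kind`, checking the git hash first
        unless `ignore_hash` is True."""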
if not ignore_hash:
self._check_hash()
if self.model_kind == "cbhg":
return self.get_cbhg()
elif self.model_kind == "seq2seq":
return self.get_seq2seq()
elif self.model_kind == "tacotron_based":
return self.get_tacotron_based()
elif self.model_kind == "baseline":
return self.get_baseline()
elif self.model_kind == "gpt":
return self.get_gpt()
def get_gpt(self):
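        """Build the GPT-based model from the pretrained checkpoint at `base_model_path`."""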
model = GPTModel(
self.config["base_model_path"],
freeze=self.config["freeze"],
n_layer=self.config["n_layer"],
use_lstm=self.config["use_lstm"],
)
return model
def get_baseline(self):
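        """Build the baseline model from the config."""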
model = BaseLineModel(
embedding_dim=self.config["embedding_dim"],
inp_vocab_size=self.config["len_input_symbols"],
targ_vocab_size=self.config["len_target_symbols"],
layers_units=self.config["layers_units"],
use_batch_norm=self.config["use_batch_norm"],
)
return model
def get_cbhg(self):
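        """Build the CBHG model from the config."""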
model = CBHGModel(
embedding_dim=self.config["embedding_dim"],
inp_vocab_size=self.config["len_input_symbols"],
targ_vocab_size=self.config["len_target_symbols"],
use_prenet=self.config["use_prenet"],
prenet_sizes=self.config["prenet_sizes"],
cbhg_gru_units=self.config["cbhg_gru_units"],
cbhg_filters=self.config["cbhg_filters"],
cbhg_projections=self.config["cbhg_projections"],
post_cbhg_layers_units=self.config["post_cbhg_layers_units"],
post_cbhg_use_batch_norm=self.config["post_cbhg_use_batch_norm"],
)
return model
def get_seq2seq(self):
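        """Build the seq2seq model: an encoder paired with a Tacotron-style attention decoder."""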
encoder = Seq2SeqEncoder(
embedding_dim=self.config["encoder_embedding_dim"],
inp_vocab_size=self.config["len_input_symbols"],
layers_units=self.config["encoder_units"],
use_batch_norm=self.config["use_batch_norm"],
)
decoder = TacotronDecoder(
self.config["len_target_symbols"],
start_symbol_id=self.text_encoder.start_symbol_id,
embedding_dim=self.config["decoder_embedding_dim"],
encoder_dim=self.config["encoder_dim"],
decoder_units=self.config["decoder_units"],
decoder_layers=self.config["decoder_layers"],
attention_type=self.config["attention_type"],
attention_units=self.config["attention_units"],
is_attention_accumulative=self.config["is_attention_accumulative"],
use_prenet=self.config["use_decoder_prenet"],
prenet_depth=self.config["decoder_prenet_depth"],
teacher_forcing_probability=self.config["teacher_forcing_probability"],
)
model = Tacotron(encoder=encoder, decoder=decoder)
return model
def get_tacotron_based(self):
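        """Build the Tacotron-based model: a CBHG encoder paired with an attention decoder."""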
encoder = TacotronEncoder(
embedding_dim=self.config["encoder_embedding_dim"],
inp_vocab_size=self.config["len_input_symbols"],
prenet_sizes=self.config["prenet_sizes"],
use_prenet=self.config["use_encoder_prenet"],
cbhg_gru_units=self.config["cbhg_gru_units"],
cbhg_filters=self.config["cbhg_filters"],
cbhg_projections=self.config["cbhg_projections"],
)
decoder = TacotronDecoder(
self.config["len_target_symbols"],
start_symbol_id=self.text_encoder.start_symbol_id,
embedding_dim=self.config["decoder_embedding_dim"],
encoder_dim=self.config["encoder_dim"],
decoder_units=self.config["decoder_units"],
decoder_layers=self.config["decoder_layers"],
attention_type=self.config["attention_type"],
attention_units=self.config["attention_units"],
is_attention_accumulative=self.config["is_attention_accumulative"],
use_prenet=self.config["use_decoder_prenet"],
prenet_depth=self.config["decoder_prenet_depth"],
teacher_forcing_probability=self.config["teacher_forcing_probability"],
)
model = Tacotron(encoder=encoder, decoder=decoder)
return model
def get_text_encoder(self):
"""Getting the class of TextEncoder from config"""
if self.config["text_cleaner"] not in [
"basic_cleaners",
"valid_arabic_cleaners",
None,
]:
raise Exception(f"cleaner is not known {self.config['text_cleaner']}")
if self.config["text_encoder"] == "BasicArabicEncoder":
text_encoder = BasicArabicEncoder(
cleaner_fn=self.config["text_cleaner"], sp_model_path=self.sp_model_path
)
elif self.config["text_encoder"] == "ArabicEncoderWithStartSymbol":
text_encoder = ArabicEncoderWithStartSymbol(
cleaner_fn=self.config["text_cleaner"], sp_model_path=self.sp_model_path
)
else:
raise Exception(
f"the text encoder is not found {self.config['text_encoder']}"
)
return text_encoder
    def get_loss_type(self):
        """Map the configured `loss_type` string to a LossType enum member."""
        try:
            loss_type = LossType[self.config["loss_type"]]
        except KeyError:
            raise Exception(f"Unknown loss type: {self.config['loss_type']}")
return loss_type
if __name__ == "__main__":
config_path = "config/tacotron-base-config.yml"
    model_kind = "tacotron_based"  # must be one of the kinds accepted by ConfigManager
config = ConfigManager(config_path=config_path, model_kind=model_kind)
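    # Minimal usage sketch (assumes the YAML file above exists and defines the
    # keys read in __init__, e.g. data_type, session_name, and log_directory):
    config.create_remove_dirs()
    config.print_config()
    model, global_step = config.load_model()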