""" |
|
Script to convert NeMo Megatron T5/UL2 model to Huggingface T5 model. |
|
Based on NVIDIA's conversion script at: https://github.com/NVIDIA/NeMo/blob/main/scripts/nlp_language_modeling/hf_t5-v1_1_to_nemo.py
|
We reverse their conversion process. |
|
|
|
NOTE: You may want to double-check the conversion if you are using a custom config with share_decoder_tokens_head_embeddings=False.
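
Example usage (script name and paths are illustrative):

    python convert_nemo_t5_to_hf.py \
        --nemo_model_path /path/to/model.ckpt \
        --hf_config_path /path/to/config.json \
        --output_path /path/to/output_dir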
|
""" |
|
|
|
import argparse
import collections
import os

import torch
from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model
from pytorch_lightning.trainer.trainer import Trainer
from transformers import T5Config, T5ForConditionalGeneration


def load_nemo_megatron_model(checkpoint_path, devices=1, num_nodes=1, accelerator="gpu"): |
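    """
    Load a NeMo Megatron T5/UL2 model from a Lightning .ckpt file. NeMo's
    Megatron models expect a Trainer to be attached at load time, so a
    minimal one is constructed here.
    """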
|
trainer = Trainer(devices=devices, num_nodes=num_nodes, accelerator=accelerator) |
|
model = MegatronT5Model.load_from_checkpoint(checkpoint_path, trainer=trainer) |
|
|
|
return model |
|
|
|
|
|
def load_huggingface_t5_model(model_config_path): |
|
""" |
|
# You need to configure config yourself based on your hparams during training |
|
# See examples of UL2 hugginface configs: |
|
# https://huggingface.co/google/flan-ul2/blob/main/config.json |
|
# https://huggingface.co/Finnish-NLP/ul2-base-nl36-finnish/blob/main/config.json |
|
""" |
|
t5_config = T5Config.from_pretrained(model_config_path) |
|
t5_model = T5ForConditionalGeneration(t5_config) |
|
|
|
return t5_model |
|
|
|
|
|
def _get_model_type_block_layer_hf(k): |
|
""" |
|
Get info from Huggingface model block and layer names |
|
|
|
Returns model_type, block number, layer number. |
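
    Example (HF T5 key format):
        "encoder.block.3.layer.1.DenseReluDense.wi_0.weight" -> ("encoder", 3, 1)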
|
""" |
|
if k.startswith("encoder"): |
|
model_type = "encoder" |
|
elif k.startswith("decoder"): |
|
model_type = "decoder" |
|
else: |
|
raise ValueError(f"Unknown model type for {k}") |
|
return model_type, int(k.split(".")[2]), int(k.split(".")[4]) |
|
|
|
|
|
def _get_model_type_layer_nemo(k): |
|
""" |
|
Get info from NeMo layer names. |
|
|
|
Returns model_type, layer number. |
|
5th element in the split is the layer number. |
|
""" |
|
print(k) |
|
if "encoder" in k: |
|
model_type = "encoder" |
|
elif "decoder" in k: |
|
model_type = "decoder" |
|
else: |
|
raise ValueError(f"Unknown model type for {k}") |
|
return model_type, int(k.split(".")[5]) |
|
|
|
|
|
def fix_query_key_value_ordering(param, checkpoint_version, num_splits, num_heads, hidden_size):
    """
    Permute a fused attention parameter from Megatron's checkpoint layout to
    [num_splits * num_heads * hidden_size, :], the layout expected when
    splitting into separate Q/K/V (or K/V) matrices.

    Mirrors the fix_query_key_value_ordering helper used in NVIDIA's and
    Huggingface's Megatron conversion scripts: num_splits is 3 for fused QKV
    and 2 for fused KV, and hidden_size here is the per-head dimension
    (kv_dim), not the model width.
    """
    input_shape = param.size()
|
    if checkpoint_version == 1.0:
        # version 1.0 stores [num_heads * hidden_size * num_splits, :]
        saved_shape = (num_heads, hidden_size, num_splits) + input_shape[1:]
        param = param.view(*saved_shape)
        param = param.transpose(0, 2)
        param = param.transpose(1, 2).contiguous()
    elif checkpoint_version >= 2.0:
        # other versions store [num_heads * num_splits * hidden_size, :]
        saved_shape = (num_heads, num_splits, hidden_size) + input_shape[1:]
        param = param.view(*saved_shape)
        param = param.transpose(0, 1).contiguous()
|
param = param.view(*input_shape) |
|
return param |
|
|
|
|
|
def convert_nemo_to_hf( |
|
nemo_weights, fix_qkv_ordering=False, hidden_size=768, num_heads=12, kv_dim=64, checkpoint_version=2.0 |
|
): |
|
""" |
|
Convert NeMo Megatron T5/UL2 model to Huggingface T5 model. |
|
|
|
Args: |
|
nemo_weights (dict): NeMo model weights (state dict). |
|
fix_qkv_ordering (bool): Whether to fix the query, key, value ordering in the self-attention blocks. |
|
hidden_size (int): Hidden size of the model. |
|
num_heads (int): Number of attention heads. |
|
kv_dim (int): Projection weights dimension in multi-head attention. Generally: hidden_size // num_heads. |
|
        checkpoint_version (float): Megatron checkpoint version (this does not appear
            to be recoverable from the checkpoint itself, so it must be passed in).
|
|
|
Returns: |
|
hf_weights (dict): Huggingface model weights (state dict). |
|
""" |
|
print(f"Found {len(nemo_weights.keys())} keys in the NeMo checkpoint") |
|
|
|
hf_weights = collections.OrderedDict() |
|
|
|
for k, v in nemo_weights.items(): |
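        # The decoder embedding key is handled twice on purpose: the standalone
        # `if` below decides whether encoder/decoder embeddings are shared (and
        # fills "shared.weight"); the elif chain that follows also maps it to
        # "decoder.embed_tokens.weight".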
if k == "enc_dec_model.decoder_embedding.word_embeddings.weight": |
is_shared_encdec = torch.allclose( |
|
v, nemo_weights["enc_dec_model.encoder_embedding.word_embeddings.weight"] |
|
) |
|
if is_shared_encdec: |
|
print("Found shared encoder and decoder embeddings") |
|
hf_weights["shared.weight"] = v |
|
else: |
|
                raise ValueError(
|
( |
|
f"Found separate encoder and decoder embeddings in NeMo checkpoint. \n" |
|
f"Not supported in T5 HF implementation. \n" |
|
f"You should probably set 'share_token_embeddings' to True in your NeMo config. \n" |
|
) |
|
) |
|
|
|
if k == "enc_dec_model.tokens_head.weight": |
hf_weights["lm_head.weight"] = v |
|
print(f"Mapped {k} to lm_head.weight") |
|
|
|
elif k == "enc_dec_model.tokens_head.bias": |
|
|
|
            raise ValueError(
|
( |
|
f"Found bias for lm_head.weight in NeMo checkpoint. This is not supported in HF T5 implementation. \n" |
|
f"You should probably set 'tokens_head_bias' to False in your NeMo config. \n" |
|
f"If your checkpoint is from older version of Megatron, you may also need to set 'share_decoder_tokens_head_embeddings' to False in NeMo config. \n" |
|
f"See: https://github.com/NVIDIA/NeMo/blob/557c4b7ae766faf050374e6b9a862e2e67385b10/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py#L231-L236" |
|
) |
|
) |
elif k == "enc_dec_model.decoder_embedding.word_embeddings.weight": |
|
hf_weights["decoder.embed_tokens.weight"] = v |
|
|
|
elif k == "enc_dec_model.encoder_embedding.word_embeddings.weight": |
|
hf_weights["encoder.embed_tokens.weight"] = v |
|
print(f"Mapped {k} to encoder.embed_tokens.weight") |
elif k == "enc_dec_model.encoder_relative_position_embedding.relative_position_embedding.weight": |
|
hf_weights["encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"] = v |
|
print(f"Mapped {k} to encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight") |
|
elif k == "enc_dec_model.decoder_relative_position_embedding.relative_position_embedding.weight": |
|
hf_weights["decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"] = v |
|
print(f"Mapped {k} to decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight") |
elif "layernorm" in k: |
|
if "final" in k: |
|
model_type = "encoder" if "encoder" in k else "decoder" |
|
|
|
|
|
hf_weights[f"{model_type}.final_layer_norm.weight"] = v |
|
print(f"Mapped {k} to {model_type}.final_layer_norm.weight") |
|
|
|
|
|
|
|
|
|
|
|
else: |
|
model_type, layer_number = _get_model_type_layer_nemo(k) |
|
|
|
if "input_layernorm" in k and model_type == "encoder": |
|
|
|
hf_weights[f"encoder.block.{layer_number}.layer.0.layer_norm.weight"] = v |
|
print(f"Mapped {k} to encoder.block.{layer_number}.layer.0.layer_norm.weight") |
|
|
|
|
|
|
|
|
|
|
|
elif "post_attention_layernorm" in k and model_type == "encoder": |
|
|
|
hf_weights[f"{model_type}.block.{layer_number}.layer.1.layer_norm.weight"] = v |
|
print(f"Mapped {k} to {model_type}.block.{layer_number}.layer.1.layer_norm.weight") |
|
|
|
|
|
|
|
|
|
|
|
elif "input_layernorm" in k and model_type == "decoder": |
|
|
|
hf_weights[f"decoder.block.{layer_number}.layer.0.layer_norm.weight"] = v |
|
print(f"Mapped {k} to decoder.block.{layer_number}.layer.0.layer_norm.weight") |
|
|
|
|
|
|
|
|
|
|
|
elif "post_attention_layernorm" in k and model_type == "decoder": |
|
|
|
hf_weights[f"{model_type}.block.{layer_number}.layer.1.layer_norm.weight"] = v |
|
print(f"Mapped {k} to {model_type}.block.{layer_number}.layer.1.layer_norm.weight") |
|
|
|
|
|
|
|
|
|
|
|
elif "post_inter_attention_layernorm" in k and model_type == "decoder": |
|
|
|
hf_weights[f"{model_type}.block.{layer_number}.layer.2.layer_norm.weight"] = v |
|
print(f"Mapped {k} to {model_type}.block.{layer_number}.layer.2.layer_norm.weight") |
|
|
|
|
|
|
|
|
|
else: |
|
raise ValueError("Unknown layer_norm key: {}".format(k)) |
elif "self_attention.query_key_value.weight" in k: |
model_type, layer_number = _get_model_type_layer_nemo(k) |
|
|
|
if fix_qkv_ordering: |
|
out_val = fix_query_key_value_ordering( |
|
v, checkpoint_version=checkpoint_version, num_splits=3, num_heads=num_heads, hidden_size=kv_dim |
|
) |
|
else: |
|
out_val = v |
|
|
|
q_weights = out_val[0 * hidden_size : 1 * hidden_size, :] |
|
k_weights = out_val[1 * hidden_size : 2 * hidden_size, :] |
|
v_weights = out_val[2 * hidden_size : 3 * hidden_size, :] |
|
|
|
|
|
hf_weights[f"{model_type}.block.{layer_number}.layer.0.SelfAttention.q.weight"] = q_weights |
|
hf_weights[f"{model_type}.block.{layer_number}.layer.0.SelfAttention.k.weight"] = k_weights |
|
hf_weights[f"{model_type}.block.{layer_number}.layer.0.SelfAttention.v.weight"] = v_weights |
|
|
|
            print(
                f"Mapped {k} to: \n"
                f"{model_type}.block.{layer_number}.layer.0.SelfAttention.q.weight \n"
                f"{model_type}.block.{layer_number}.layer.0.SelfAttention.k.weight \n"
                f"{model_type}.block.{layer_number}.layer.0.SelfAttention.v.weight \n"
            )
elif "self_attention.query_key_value.bias" in k: |
|
            raise ValueError(
|
"Bias terms for most weights are not supported in Huggingface T5. Train with bias=False in NeMo config." |
|
) |
|
|
|
|
|
elif "self_attention.dense.weight" in k: |
|
model_type, layer_number = _get_model_type_layer_nemo(k) |
|
|
|
hf_weights[f"{model_type}.block.{layer_number}.layer.0.SelfAttention.o.weight"] = v |
|
print(f"Mapped {k} to {model_type}.block.{layer_number}.layer.0.SelfAttention.o.weight") |
elif "inter_attention.key_value.weight" in k: |
|
model_type, layer_number = _get_model_type_layer_nemo(k) |
|
|
|
if fix_qkv_ordering: |
|
out_val = fix_query_key_value_ordering( |
|
v, checkpoint_version=checkpoint_version, num_splits=2, num_heads=num_heads, hidden_size=kv_dim |
|
) |
|
else: |
|
out_val = v |
|
|
|
|
|
k_weights = out_val[0 * hidden_size : 1 * hidden_size, :] |
|
v_weights = out_val[1 * hidden_size : 2 * hidden_size, :] |
|
hf_weights[f"decoder.block.{layer_number}.layer.1.EncDecAttention.k.weight"] = k_weights |
|
hf_weights[f"decoder.block.{layer_number}.layer.1.EncDecAttention.v.weight"] = v_weights |
|
            print(
                f"Mapped {k} to: \n"
                f"decoder.block.{layer_number}.layer.1.EncDecAttention.k.weight \n"
                f"decoder.block.{layer_number}.layer.1.EncDecAttention.v.weight \n"
            )
|
|
|
|
|
elif "inter_attention.query.weight" in k: |
|
model_type, layer_number = _get_model_type_layer_nemo(k) |
|
|
|
hf_weights[f"decoder.block.{layer_number}.layer.1.EncDecAttention.q.weight"] = v |
|
print(f"Mapped {k} to decoder.block.{layer_number}.layer.1.EncDecAttention.q.weight") |
|
|
|
|
|
elif "inter_attention.dense.weight" in k: |
|
model_type, layer_number = _get_model_type_layer_nemo(k) |
|
|
|
hf_weights[f"decoder.block.{layer_number}.layer.1.EncDecAttention.o.weight"] = v |
|
print(f"Mapped {k} to decoder.block.{layer_number}.layer.1.EncDecAttention.o.weight") |
elif "mlp.dense_h_to_4h.weight" in k: |
|
model_type, layer_number = _get_model_type_layer_nemo(k) |
|
|
|
if model_type == "encoder": |
|
|
|
hf_weights[f"{model_type}.block.{layer_number}.layer.1.DenseReluDense.wi_0.weight"] = v |
|
print(f"Mapped {k} to {model_type}.block.{layer_number}.layer.1.DenseReluDense.wi_0.weight") |
|
elif model_type == "decoder": |
|
|
|
hf_weights[f"{model_type}.block.{layer_number}.layer.2.DenseReluDense.wi_0.weight"] = v |
|
print(f"Mapped {k} to {model_type}.block.{layer_number}.layer.2.DenseReluDense.wi_0.weight") |
|
|
|
elif "mlp.dense_h_to_4h_2.weight" in k: |
|
model_type, layer_number = _get_model_type_layer_nemo(k) |
|
|
|
if model_type == "encoder": |
|
|
|
hf_weights[f"{model_type}.block.{layer_number}.layer.1.DenseReluDense.wi_1.weight"] = v |
|
print(f"Mapped {k} to {model_type}.block.{layer_number}.layer.1.DenseReluDense.wi_1.weight") |
|
elif model_type == "decoder": |
|
|
|
hf_weights[f"{model_type}.block.{layer_number}.layer.2.DenseReluDense.wi_1.weight"] = v |
|
print(f"Mapped {k} to {model_type}.block.{layer_number}.layer.2.DenseReluDense.wi_1.weight") |
|
|
|
elif "mlp.dense_4h_to_h.weight" in k: |
|
model_type, layer_number = _get_model_type_layer_nemo(k) |
|
|
|
if model_type == "encoder": |
|
|
|
hf_weights[f"{model_type}.block.{layer_number}.layer.1.DenseReluDense.wo.weight"] = v |
|
print(f"Mapped {k} to {model_type}.block.{layer_number}.layer.1.DenseReluDense.wo.weight") |
|
elif model_type == "decoder": |
|
|
|
hf_weights[f"{model_type}.block.{layer_number}.layer.2.DenseReluDense.wo.weight"] = v |
|
print(f"Mapped {k} to {model_type}.block.{layer_number}.layer.2.DenseReluDense.wo.weight") |
|
|
|
else: |
|
raise ValueError(f"Unknown key: {k}") |
|
|
|
print("Done mapping weights. \n") |
|
print(f"Total keys in converted Huggingface weight mapping: {len(hf_weights.keys())} \n") |
|
    return hf_weights


def compare_weights_hf_nemo(model, hf_weights, hf_config_path, hf_model_path=None):
|
""" |
|
    Compares the converted NeMo->HF weights against a Huggingface model initialized
    from a config (or loaded from the Hub/disk), printing any keys that are missing
    on either side. Also compares parameter counts against the original NeMo model.
|
|
|
Args: |
|
model: NeMo model |
|
hf_weights: Dictionary of Huggingface weights |
|
hf_config_path: Path to Huggingface config file to initialize model from. |
|
hf_model_path: Path to Huggingface Hub or local HF model folder, if you alternatively want to |
|
load/initialize from an existing model on HF Hub or disk (optional) |
|
""" |
|
|
|
    if hf_model_path:
|
|
|
hf_model = T5ForConditionalGeneration.from_pretrained(hf_model_path) |
|
else: |
|
|
|
hf_model = load_huggingface_t5_model(hf_config_path) |
|
|
|
print(f"Total keys in converted Huggingface weight mapping: {len(hf_weights.keys())} \n") |
|
print(f"Total keys in Huggingface model initialized from config or HF Hub: {len(hf_model.state_dict().keys())} \n") |
|
|
|
|
|
print( |
|
f"Number of parameters in HF model initialized from config or HF hub: {sum(p.numel() for p in hf_model.parameters() if p.requires_grad)}" |
|
) |
|
|
|
print(f"Number of parameters in Nemo model: {sum(p.numel() for p in model.parameters() if p.requires_grad)} \n") |
|
|
|
|
|
print( |
|
( |
|
f"Keys in converted HF weight mapping but missing in HF model initialized from config.json: \n" |
|
f"{set(hf_weights.keys()) - set(hf_model.state_dict().keys())} \n" |
|
) |
|
) |
|
print( |
|
( |
|
f"Keys in HF model initialized from config.json but missing in converted HF weight mapping: \n" |
|
f"{set(hf_model.state_dict().keys()) - set(hf_weights.keys())} \n" |
|
) |
|
) |
|
|
|
print( |
|
( |
|
f"It is expected that lm_head.weight is missing from converted HF weight mapping \n" |
|
f"if you have set share_decoder_tokens_head_embeddings=True in your Nemo config. \n" |
|
f"This weight doesn't exist in Nemo, as it is shared with the decoder token embeddings. \n \n" |
|
f"In Huggingface, weights for lm_head.weight and decoder token embeddings are generally duplicated \n" |
|
f"in the state_dict. When missing, the lm_head.weight is automatically initialized from shared decoder \n" |
|
f"token embeddings weights if your HF config.json has tie_word_embeddings=True." |
|
) |
|
) |
|
|
|
|
|
if __name__ == "__main__": |
|
parser = argparse.ArgumentParser(description="Convert Nemo T5/UL2 model to Huggingface T5/UL2 model") |
|
parser.add_argument( |
|
"--nemo_model_path", |
|
type=str, |
|
required=True, |
|
help="Path to Nemo T5/UL2 model .ckpt file", |
|
) |
|
parser.add_argument( |
|
"--hf_config_path", |
|
type=str, |
|
required=True, |
|
help="Path to Huggingface T5 config.json", |
|
) |
|
parser.add_argument( |
|
"--hf_model_path", |
|
type=str, |
|
required=False, |
|
help="Path to Huggingface T5 model, local folder or HF hub model", |
|
) |
|
parser.add_argument( |
|
"--output_path", |
|
type=str, |
|
required=True, |
|
help="Folder to save converted Huggingface T5/UL2 model in", |
|
) |
|
|
|
parser.add_argument("--hidden_size", type=int, default=768, help="Hidden size of Nemo model") |
|
parser.add_argument("--num_heads", type=int, default=12, help="Number of attention heads in Nemo model") |
|
|
|
parser.add_argument("--fix_qkv", action="store_true", help="Fix QKV weights in converted HF model") |
|
parser.add_argument("--checkpoint_version", type=float, default=2.0, help="Checkpoint version of Nemo model") |
|
parser.add_argument( |
|
"--kv_dim", type=int, default=64, help="Key/Value dimension of Nemo model. Typically hidden_size // num_heads" |
|
) |
|
|
|
args = parser.parse_args() |
|
|
|
|
|
model = load_nemo_megatron_model(checkpoint_path=args.nemo_model_path) |
|
nemo_weights = model.state_dict() |
|
|
|
hf_weights = convert_nemo_to_hf( |
|
nemo_weights=nemo_weights, |
|
fix_qkv_ordering=args.fix_qkv, |
|
hidden_size=args.hidden_size, |
|
num_heads=args.num_heads, |
|
kv_dim=args.kv_dim, |
|
checkpoint_version=args.checkpoint_version, |
|
) |
|
|
|
|
|
    # NeMo wraps the underlying Huggingface tokenizer object; unwrap it so it
    # can be saved with save_pretrained below.
    tokenizer = model.tokenizer.__dict__["tokenizer"]
|
config = T5Config.from_json_file(args.hf_config_path) |
|
|
|
|
|
config.save_pretrained(args.output_path) |
|
print(f"Saved config to {os.path.join(args.output_path, 'config.json')}") |
|
|
|
|
|
tokenizer.save_pretrained(args.output_path) |
|
print(f"Saved tokenizer to {os.path.join(args.output_path, 'tokenizer.json')}") |
|
|
|
|
|
torch.save(hf_weights, os.path.join(args.output_path, "pytorch_model.bin")) |
|
print(f"Saved converted weights to {os.path.join(args.output_path, 'pytorch_model.bin')}") |
|
|
|
|
|
    compare_weights_hf_nemo(
        model, hf_weights, hf_config_path=args.hf_config_path, hf_model_path=args.hf_model_path
    )
|
|