File size: 6,394 Bytes
45ce8e0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
"""Initialize a student Whisper model from a pre-trained teacher model for teacher-student distillation."""
import argparse
import copy
import logging
import os
import numpy as np
import torch
from transformers import GenerationConfig, WhisperForConditionalGeneration, WhisperProcessor
# Process-wide environment tweaks, applied at import time:
# - CURL_CA_BUNDLE is blanked as a workaround for SSL errors when fetching pretrained models, see
#   https://stackoverflow.com/questions/71692354/facing-ssl-error-with-huggingface-pretrained-models
#   NOTE(review): presumably this weakens certificate verification for HTTP requests - confirm it is still required
# - TOKENIZERS_PARALLELISM is set to 'false' to disable the tokenizers warning message
os.environ.update({"CURL_CA_BUNDLE": "", "TOKENIZERS_PARALLELISM": "false"})

logger = logging.getLogger(__name__)
def parse_args():
    """Parse the command-line options of the student initialisation script.

    Returns:
        The `argparse.Namespace` with the attributes `teacher_checkpoint`, `encoder_layers`,
        `decoder_layers` and `save_dir`.
    """
    # (flag, keyword arguments for `add_argument`), registered in the order listed
    cli_options = (
        (
            "--teacher_checkpoint",
            {"type": str, "required": True, "help": "The HF Hub ID of the teacher checkpoint."},
        ),
        (
            "--encoder_layers",
            {
                "type": int,
                "default": None,
                "help": "Number of encoder layers to use in the student model. Defaults to all layers from the teacher.",
            },
        ),
        (
            "--decoder_layers",
            {
                "type": int,
                "default": 2,
                "help": "Number of decoder layers to use in the student model. Defaults to 2 layers.",
            },
        ),
        (
            "--save_dir",
            {"type": str, "required": True, "help": "Where to save the student weights and processor."},
        ),
    )

    cli = argparse.ArgumentParser(
        description="Initialise a student Whisper model from a teacher model, copying the relevant layer weights and adjusting the processor as necessary."
    )
    for flag, options in cli_options:
        cli.add_argument(flag, **options)

    return cli.parse_args()
def _spaced_layer_map(num_teacher_layers, num_student_layers):
    """Pair evenly spaced teacher layer indices with consecutive student layer indices.

    Args:
        num_teacher_layers: Number of layers in the teacher stack.
        num_student_layers: Number of layers in the student stack (must be at least 1).

    Returns:
        A dict `{teacher_layer_index: student_layer_index}` with plain `int` keys.
    """
    teacher_indices = np.linspace(0, num_teacher_layers - 1, num_student_layers, dtype=int)
    # always pair the last student layer with the last teacher layer - this matters for a single-layer
    # student, where linspace alone would select teacher layer 0
    teacher_indices[-1] = num_teacher_layers - 1
    return {int(teacher_idx): student_idx for student_idx, teacher_idx in enumerate(teacher_indices)}


def init_student_model_from_teacher(
    teacher_checkpoint,
    save_dir,
    encoder_layers=None,
    decoder_layers=2,
):
    """Initialise a student Whisper model from a teacher checkpoint and save it to `save_dir`.

    The student shares the teacher config except for the number of encoder/decoder layers. Its layers are
    initialised from evenly spaced teacher layers, always including the last teacher layer. The student
    weights, the processor and the generation config are saved to `save_dir`; the saved model is then
    re-loaded and run through one forward pass as a sanity check.

    Args:
        teacher_checkpoint: Identifier passed to `from_pretrained` to load the teacher model, processor
            and generation config.
        save_dir: Directory passed to `save_pretrained` for the student weights, processor and
            generation config.
        encoder_layers: Number of encoder layers in the student. `None` keeps the teacher's count.
        decoder_layers: Number of decoder layers in the student.

    Raises:
        ValueError: If `decoder_layers` or a given `encoder_layers` is smaller than 1.
        RuntimeError: If the student has parameters that are missing from the teacher state dict, or if
            the teacher state dict has unexpected encoder/decoder layer keys although the respective
            layer counts match.
    """
    # fail fast on unusable layer counts, before any checkpoint is loaded
    if decoder_layers < 1:
        raise ValueError(f"`decoder_layers` must be at least 1, got {decoder_layers}.")
    if encoder_layers is not None and encoder_layers < 1:
        raise ValueError(f"`encoder_layers` must be at least 1 when specified, got {encoder_layers}.")

    teacher_model = WhisperForConditionalGeneration.from_pretrained(
        teacher_checkpoint,
        low_cpu_mem_usage=True,
    )
    processor = WhisperProcessor.from_pretrained(teacher_checkpoint)
    generation_config = GenerationConfig.from_pretrained(teacher_checkpoint)

    teacher_config = teacher_model.config
    teacher_encoder_layers = teacher_config.encoder_layers
    teacher_decoder_layers = teacher_config.decoder_layers

    student_config = copy.deepcopy(teacher_config)
    student_config.update(
        {
            "encoder_layers": encoder_layers if encoder_layers is not None else teacher_encoder_layers,
            "decoder_layers": decoder_layers,
        }
    )

    encoder_map = _spaced_layer_map(teacher_encoder_layers, student_config.encoder_layers)
    decoder_map = _spaced_layer_map(teacher_decoder_layers, student_config.decoder_layers)

    # init the student params from the teacher model
    student_model = WhisperForConditionalGeneration(student_config)
    missing_keys, unexpected_keys = student_model.load_state_dict(teacher_model.state_dict(), strict=False)
    if missing_keys:
        raise RuntimeError(
            "Error(s) in loading state_dict for WhisperForConditionalGeneration. \n"
            f"Missing key(s) in state_dict: {missing_keys}"
        )
    if student_config.decoder_layers == teacher_decoder_layers:
        decoder_keys = [key for key in unexpected_keys if "model.decoder.layers" in key]
        if decoder_keys:
            raise RuntimeError(
                "Error(s) in loading state_dict for WhisperForConditionalGeneration. \n"
                f"Unexpected key(s) in state_dict: {decoder_keys}"
            )
    # compare the resolved student layer count so the check also runs when `encoder_layers` is None
    if student_config.encoder_layers == teacher_encoder_layers:
        encoder_keys = [key for key in unexpected_keys if "model.encoder.layers" in key]
        if encoder_keys:
            raise RuntimeError(
                "Error(s) in loading state_dict for WhisperForConditionalGeneration. \n"
                f"Unexpected key(s) in state_dict: {encoder_keys}"
            )

    # re-introduce pre-defined layers from the teacher
    for teacher_layer, student_layer in decoder_map.items():
        student_model.model.decoder.layers[student_layer].load_state_dict(
            teacher_model.model.decoder.layers[teacher_layer].state_dict()
        )

    # with `encoder_layers=None` the student keeps every teacher encoder layer, all of which were
    # already loaded by the non-strict `load_state_dict` call above
    if encoder_layers is not None:
        for teacher_layer, student_layer in encoder_map.items():
            student_model.model.encoder.layers[student_layer].load_state_dict(
                teacher_model.model.encoder.layers[teacher_layer].state_dict()
            )

    # remove the teacher params and model
    del teacher_model

    # save the converted weights and model
    student_model.save_pretrained(save_dir)
    # we also need to correctly save the processor and generation config
    processor.save_pretrained(save_dir)
    generation_config.save_pretrained(save_dir)

    # check we can do a forward pass with the saved model - first load the weights and processor
    logger.info("Checking we can load the saved model...")
    student_model = WhisperForConditionalGeneration.from_pretrained(save_dir, low_cpu_mem_usage=True)
    processor = WhisperProcessor.from_pretrained(save_dir)

    # define some dummy inputs
    input_features = processor(np.ones(16000), sampling_rate=16000, return_tensors="pt").input_features
    decoder_start_token_id = student_model.config.decoder_start_token_id
    decoder_input_ids = torch.ones((input_features.shape[0], 1), dtype=torch.long) * decoder_start_token_id

    # do a forward pass - outputs will be gibberish for the initialised model so we can't check them
    # but we can make sure the model runs as expected
    logger.info("Checking we can run the converted model forward...")
    # gradients are not needed for this sanity check
    with torch.no_grad():
        _ = student_model(input_features, decoder_input_ids=decoder_input_ids).logits
    logger.info("Conversion successful!")
if __name__ == "__main__":
    # Configure logging for the script run: without a handler, the INFO-level progress messages logged
    # during conversion fall below the default WARNING threshold and are never shown
    logging.basicConfig(level=logging.INFO)

    args = parse_args()
    init_student_model_from_teacher(
        teacher_checkpoint=args.teacher_checkpoint,
        encoder_layers=args.encoder_layers,
        decoder_layers=args.decoder_layers,
        save_dir=args.save_dir,
    )
|