diff --git "a/src/laboratory.ipynb" "b/src/laboratory.ipynb"
new file mode 100644--- /dev/null
+++ "b/src/laboratory.ipynb"
@@ -0,0 +1,2576 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "63594f228ab14d9796bbf24112269a52",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "VBox(children=(HTML(value='
Dict[str, torch.Tensor]:\n",
+ " # split inputs and labels since they have to be of different lengths and need different padding methods\n",
+ " # first treat the audio inputs by simply returning torch tensors\n",
+ " input_features = [{\"input_features\": feature[\"input_features\"]} for feature in features]\n",
+ " batch = self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n",
+ "\n",
+ " # get the tokenized label sequences\n",
+ " label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
+ "\n",
+ " # pad the labels to max length\n",
+ " labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n",
+ "\n",
+ " # replace padding with -100 to ignore loss correctly\n",
+ " labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
+ "\n",
+ " # if bos token is appended in previous tokenization step,\n",
+ " # cut bos token here as it's append later anyways\n",
+ " if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n",
+ " labels = labels[:, 1:]\n",
+ "\n",
+ " batch[\"labels\"] = labels\n",
+ "\n",
+ " return batch\n",
+ "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import WhisperForConditionalGeneration\n",
+ "\n",
+ "\n",
+ "model = WhisperForConditionalGeneration.from_pretrained(\n",
+ " model_name_or_path, device_map=\"auto\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "DatasetDict({\n",
+ " test: Dataset({\n",
+ " features: ['input_features', 'labels'],\n",
+ " num_rows: 857\n",
+ " })\n",
+ "})"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "evaluation_dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 108/108 [09:19<00:00, 5.18s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "wer=24.938214396045723\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import gc\n",
+ "\n",
+ "import evaluate\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "from torch.utils.data import DataLoader\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "# Word error rate of `model` on the test split, generating 8 examples per step.\n",
+ "# Decoding goes through `processor.tokenizer` (as the sibling evaluation cells do)\n",
+ "# so the cell does not depend on a separate global `tokenizer` from another cell.\n",
+ "metric = evaluate.load(\"wer\")\n",
+ "eval_dataloader = DataLoader(evaluation_dataset['test'], batch_size=8, collate_fn=data_collator)\n",
+ "\n",
+ "model.eval()\n",
+ "for batch in tqdm(eval_dataloader):\n",
+ "    with torch.cuda.amp.autocast(), torch.no_grad():\n",
+ "        generated_tokens = (\n",
+ "            model.generate(\n",
+ "                input_features=batch[\"input_features\"].to(\"cuda\"),\n",
+ "                # the first four label ids of each row are fed in as the decoder prompt\n",
+ "                decoder_input_ids=batch[\"labels\"][:, :4].to(\"cuda\"),\n",
+ "                max_new_tokens=255,\n",
+ "            )\n",
+ "            .cpu()\n",
+ "            .numpy()\n",
+ "        )\n",
+ "        labels = batch[\"labels\"].cpu().numpy()\n",
+ "        # the collator marks padded label positions with -100; swap the pad id back in\n",
+ "        # so the rows can be decoded\n",
+ "        labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)\n",
+ "        decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n",
+ "        decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
+ "        metric.add_batch(\n",
+ "            predictions=decoded_preds,\n",
+ "            references=decoded_labels,\n",
+ "        )\n",
+ "    # drop per-batch arrays before the next step to keep memory flat\n",
+ "    del generated_tokens, labels, batch\n",
+ "    gc.collect()\n",
+ "wer = 100 * metric.compute()\n",
+ "print(f\"{wer=}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 54/54 [07:20<00:00, 8.15s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "wer=24.934352795798578 and normalized_wer=13.639508070714834\n",
+ "{'eval/wer': 24.934352795798578, 'eval/normalized_wer': 13.639508070714834}\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import gc\n",
+ "\n",
+ "import evaluate\n",
+ "import numpy as np\n",
+ "from torch.utils.data import DataLoader\n",
+ "from tqdm import tqdm\n",
+ "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n",
+ "\n",
+ "# Score `model` on the test split twice: on the decoded text as-is and on the\n",
+ "# same text passed through BasicTextNormalizer.\n",
+ "metric = evaluate.load(\"wer\")\n",
+ "normalizer = BasicTextNormalizer()\n",
+ "forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task='transcribe')\n",
+ "eval_dataloader = DataLoader(evaluation_dataset['test'], batch_size=16, collate_fn=data_collator)\n",
+ "\n",
+ "predictions, references = [], []\n",
+ "normalized_predictions, normalized_references = [], []\n",
+ "\n",
+ "model.eval()\n",
+ "for step, batch in enumerate(tqdm(eval_dataloader)):\n",
+ "    with torch.cuda.amp.autocast(), torch.no_grad():\n",
+ "        generated_tokens = (\n",
+ "            model.generate(\n",
+ "                input_features=batch[\"input_features\"].to(\"cuda\"),\n",
+ "                forced_decoder_ids=forced_decoder_ids,\n",
+ "                max_new_tokens=255,\n",
+ "            )\n",
+ "            .cpu()\n",
+ "            .numpy()\n",
+ "        )\n",
+ "        labels = batch[\"labels\"].cpu().numpy()\n",
+ "        # -100 is the ignore marker on padded label positions; put the pad id back before decoding\n",
+ "        labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)\n",
+ "        decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n",
+ "        decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
+ "        predictions.extend(decoded_preds)\n",
+ "        references.extend(decoded_labels)\n",
+ "        normalized_predictions.extend(normalizer(pred).strip() for pred in decoded_preds)\n",
+ "        normalized_references.extend(normalizer(label).strip() for label in decoded_labels)\n",
+ "    del generated_tokens, labels, batch\n",
+ "    gc.collect()\n",
+ "\n",
+ "wer = 100 * metric.compute(predictions=predictions, references=references)\n",
+ "normalized_wer = 100 * metric.compute(predictions=normalized_predictions, references=normalized_references)\n",
+ "eval_metrics = {\"eval/wer\": wer, \"eval/normalized_wer\": normalized_wer}\n",
+ "\n",
+ "print(f\"{wer=} and {normalized_wer=}\")\n",
+ "print(eval_metrics)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## LoRA\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from dataclasses import dataclass\n",
+ "from typing import Any, Dict, List, Union\n",
+ "\n",
+ "import torch\n",
+ "\n",
+ "\n",
+ "@dataclass\n",
+ "class DataCollatorSpeechSeq2SeqWithPadding:\n",
+ "    \"\"\"Collate speech examples into one batch of padded inputs and labels.\n",
+ "\n",
+ "    Audio features and label ids are padded separately (feature extractor vs.\n",
+ "    tokenizer) because they have different lengths and padding methods.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    processor: Any\n",
+ "\n",
+ "    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
+ "        feature_extractor = self.processor.feature_extractor\n",
+ "        tokenizer = self.processor.tokenizer\n",
+ "\n",
+ "        # audio side: pad the input features and return torch tensors\n",
+ "        audio_inputs = [{\"input_features\": item[\"input_features\"]} for item in features]\n",
+ "        batch = feature_extractor.pad(audio_inputs, return_tensors=\"pt\")\n",
+ "\n",
+ "        # label side: pad the tokenized label sequences to the longest one\n",
+ "        label_inputs = [{\"input_ids\": item[\"labels\"]} for item in features]\n",
+ "        padded_labels = tokenizer.pad(label_inputs, return_tensors=\"pt\")\n",
+ "\n",
+ "        # padded positions become -100 so the loss ignores them\n",
+ "        is_padding = padded_labels.attention_mask.ne(1)\n",
+ "        labels = padded_labels[\"input_ids\"].masked_fill(is_padding, -100)\n",
+ "\n",
+ "        # if every row starts with the BOS token (added during tokenization),\n",
+ "        # cut it here because it is appended again later anyway\n",
+ "        starts_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()\n",
+ "        if starts_with_bos:\n",
+ "            labels = labels[:, 1:]\n",
+ "\n",
+ "        batch[\"labels\"] = labels\n",
+ "        return batch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from transformers import (\n",
+ " AutomaticSpeechRecognitionPipeline,\n",
+ " WhisperForConditionalGeneration,\n",
+ " WhisperTokenizer,\n",
+ " WhisperProcessor,\n",
+ ")\n",
+ "from peft import PeftModel, PeftConfig\n",
+ "\n",
+ "peft_model_id = \"DuyTa/vi-whisper-medium-Lora\"\n",
+ "\n",
+ "language = \"Vietnamese\"\n",
+ "task = \"transcribe\"\n",
+ "\n",
+ "peft_config = PeftConfig.from_pretrained(peft_model_id)\n",
+ "model = WhisperForConditionalGeneration.from_pretrained(\n",
+ " peft_config.base_model_name_or_path,\n",
+ ")\n",
+ "model = PeftModel.from_pretrained(model, peft_model_id)\n",
+ "model.to(\"cuda\").half()\n",
+ "\n",
+ "processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 108/108 [12:31<00:00, 6.96s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "wer_lora=24.934352795798578\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import gc\n",
+ "\n",
+ "import evaluate\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "from torch.utils.data import DataLoader\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "# Word error rate of the LoRA `model` on the test split, generating 8 examples per step.\n",
+ "# Decoding goes through `processor.tokenizer`: this section rebuilds `processor`\n",
+ "# but never defines a matching global `tokenizer`.\n",
+ "metric = evaluate.load(\"wer\")\n",
+ "eval_dataloader = DataLoader(evaluation_dataset['test'], batch_size=8, collate_fn=data_collator)\n",
+ "\n",
+ "model.eval()\n",
+ "for batch in tqdm(eval_dataloader):\n",
+ "    with torch.cuda.amp.autocast(), torch.no_grad():\n",
+ "        generated_tokens = (\n",
+ "            model.generate(\n",
+ "                input_features=batch[\"input_features\"].to(\"cuda\"),\n",
+ "                # the first four label ids of each row are fed in as the decoder prompt\n",
+ "                decoder_input_ids=batch[\"labels\"][:, :4].to(\"cuda\"),\n",
+ "                max_new_tokens=255,\n",
+ "            )\n",
+ "            .cpu()\n",
+ "            .numpy()\n",
+ "        )\n",
+ "        labels = batch[\"labels\"].cpu().numpy()\n",
+ "        # the collator marks padded label positions with -100; swap the pad id back in\n",
+ "        # so the rows can be decoded\n",
+ "        labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)\n",
+ "        decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n",
+ "        decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
+ "        metric.add_batch(\n",
+ "            predictions=decoded_preds,\n",
+ "            references=decoded_labels,\n",
+ "        )\n",
+ "    # drop per-batch arrays before the next step to keep memory flat\n",
+ "    del generated_tokens, labels, batch\n",
+ "    gc.collect()\n",
+ "wer_lora = 100 * metric.compute()\n",
+ "print(f\"{wer_lora=}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 54/54 [09:20<00:00, 10.39s/it]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "wer=24.934352795798578 and normalized_wer=13.624135280553421\n",
+ "{'eval/wer': 24.934352795798578, 'eval/normalized_wer': 13.624135280553421}\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import gc\n",
+ "\n",
+ "import evaluate\n",
+ "import numpy as np\n",
+ "from torch.utils.data import DataLoader\n",
+ "from tqdm import tqdm\n",
+ "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n",
+ "\n",
+ "# Score the LoRA `model` on the test split twice: on the decoded text as-is and\n",
+ "# on the same text passed through BasicTextNormalizer.\n",
+ "metric = evaluate.load(\"wer\")\n",
+ "normalizer = BasicTextNormalizer()\n",
+ "forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task='transcribe')\n",
+ "eval_dataloader = DataLoader(evaluation_dataset['test'], batch_size=16, collate_fn=data_collator)\n",
+ "\n",
+ "predictions, references = [], []\n",
+ "normalized_predictions, normalized_references = [], []\n",
+ "\n",
+ "model.eval()\n",
+ "for step, batch in enumerate(tqdm(eval_dataloader)):\n",
+ "    with torch.cuda.amp.autocast(), torch.no_grad():\n",
+ "        generated_tokens = (\n",
+ "            model.generate(\n",
+ "                input_features=batch[\"input_features\"].to(\"cuda\"),\n",
+ "                forced_decoder_ids=forced_decoder_ids,\n",
+ "                max_new_tokens=255,\n",
+ "            )\n",
+ "            .cpu()\n",
+ "            .numpy()\n",
+ "        )\n",
+ "        labels = batch[\"labels\"].cpu().numpy()\n",
+ "        # -100 is the ignore marker on padded label positions; put the pad id back before decoding\n",
+ "        labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)\n",
+ "        decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n",
+ "        decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
+ "        predictions.extend(decoded_preds)\n",
+ "        references.extend(decoded_labels)\n",
+ "        normalized_predictions.extend(normalizer(pred).strip() for pred in decoded_preds)\n",
+ "        normalized_references.extend(normalizer(label).strip() for label in decoded_labels)\n",
+ "    del generated_tokens, labels, batch\n",
+ "    gc.collect()\n",
+ "\n",
+ "wer = 100 * metric.compute(predictions=predictions, references=references)\n",
+ "normalized_wer = 100 * metric.compute(predictions=normalized_predictions, references=normalized_references)\n",
+ "eval_metrics = {\"eval/wer\": wer, \"eval/normalized_wer\": normalized_wer}\n",
+ "\n",
+ "print(f\"{wer=} and {normalized_wer=}\")\n",
+ "print(eval_metrics)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Quantized Whisper LoRA"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from whisper_quant import WhisperModel\n",
+ "\n",
+ "model_size = \"medium\"\n",
+ "\n",
+ "# Run on GPU with FP16\n",
+ "model = WhisperModel(model_size, device=\"cuda\", compute_type=\"float16\")\n",
+ "\n",
+ "# or run on GPU with INT8\n",
+ "# model = WhisperModel(model_size, device=\"cuda\", compute_type=\"int8_float16\")\n",
+ "# or run on CPU with INT8\n",
+ "# model = WhisperModel(model_size, device=\"cpu\", compute_type=\"int8\")\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "segments, info = model.transcribe(\"audio.wav\", beam_size=1, language ='vi', temperature= 0)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[0.00s -> 7.10s] Hai, đây tức là một kẻ ăn mày vậy, anh ta chưa kịp quay đi thì đã thấy mấy con chó vàng chạy sồng sộc ra cứ nhảy sổ vào chân anh.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print every transcribed segment with its start/end time in seconds.\n",
+ "for segment in segments:\n",
+ "    print(f\"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "normalizer = BasicTextNormalizer()\n",
+ "norm = normalizer(segment.text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "' hai đây tức là một kẻ ăn mầy vậy anh ta chưa kịp quay đi thì đã thấy mấy con chó vàng chạy sồng sộc ra cứ nhảy sổ vào chân anh '"
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "norm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 119,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from dataclasses import dataclass\n",
+ "from typing import Any\n",
+ "\n",
+ "\n",
+ "@dataclass\n",
+ "class DataCollatorSpeechSeq2SeqWithPadding:\n",
+ "    \"\"\"Pass-through collator for raw-audio evaluation.\n",
+ "\n",
+ "    Gathers each example's audio array and reference transcription into two\n",
+ "    parallel lists; nothing is padded, tokenized or converted to tensors.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    # kept only so the constructor matches the other collator; never read here\n",
+ "    processor: Any\n",
+ "\n",
+ "    def __call__(self, features):\n",
+ "        \"\"\"Return ``{\"audio\": [...], \"transcription\": [...]}`` for `features`.\"\"\"\n",
+ "        return {\n",
+ "            \"audio\": [feature[\"audio\"]['array'] for feature in features],\n",
+ "            \"transcription\": [feature[\"transcription\"] for feature in features],\n",
+ "        }\n",
+ "\n",
+ "\n",
+ "# 'No' is a placeholder: this collator does not use a processor\n",
+ "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor='No')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 120,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from torch.utils.data import DataLoader\n",
+ "from tqdm import tqdm\n",
+ "import numpy as np\n",
+ "import gc\n",
+ "import evaluate\n",
+ "metric = evaluate.load(\"wer\")\n",
+ "eval_dataloader = DataLoader(fleurs['test'], batch_size=16, collate_fn=data_collator)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "\n",
+ "# Smoke test: transcribe only the first batch and print the cleaned predictions\n",
+ "# next to the reference transcriptions.\n",
+ "for data in eval_dataloader:\n",
+ "    transcriptions = data['transcription']\n",
+ "    final = []\n",
+ "    for audio in data['audio']:\n",
+ "        print(\"-\" * 20)\n",
+ "        segments, info = model.transcribe(audio, beam_size=1, language='vi')\n",
+ "        # join the text of all returned segments into one prediction string\n",
+ "        pred = ''.join(segment.text for segment in segments)\n",
+ "        final.append(normalizer(pred))\n",
+ "    # trim each prediction and collapse whitespace runs to single spaces;\n",
+ "    # this must stay inside the batch loop, level with the prints below\n",
+ "    cleaned_text_list = [re.sub(r'\\s+', ' ', text.strip()) for text in final]\n",
+ "\n",
+ "    print(cleaned_text_list)\n",
+ "    print(transcriptions)\n",
+ "    break\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for step, batch in enumerate(tqdm(eval_dataloader)):\n",
+ " with torch.cuda.amp.autocast():\n",
+ " with torch.no_grad():\n",
+ "\n",
+ " labels = batch[\"transcription\"]\n",
+ " print(labels)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "\n",
+ "# Full evaluation pass: transcribe every audio clip one by one, normalize the\n",
+ "# prediction, and accumulate word error rate against `transcription`.\n",
+ "# NOTE(review): only the predictions go through `normalizer`; the references are\n",
+ "# scored exactly as stored - confirm that field is already normalized.\n",
+ "for batch in tqdm(eval_dataloader):\n",
+ "    with torch.cuda.amp.autocast(), torch.no_grad():\n",
+ "        final = []\n",
+ "        labels = batch[\"transcription\"]\n",
+ "        for audio in batch[\"audio\"]:\n",
+ "            segments, _ = model.transcribe(audio, beam_size=1, language='vi')\n",
+ "            # join the text of all returned segments into one prediction string\n",
+ "            pred = ''.join(segment.text for segment in segments)\n",
+ "            final.append(normalizer(pred))\n",
+ "        # trim each prediction and collapse whitespace runs to single spaces\n",
+ "        cleaned_text_list = [re.sub(r'\\s+', ' ', text.strip()) for text in final]\n",
+ "        print(cleaned_text_list)\n",
+ "        print(labels)\n",
+ "        metric.add_batch(\n",
+ "            predictions=cleaned_text_list,\n",
+ "            references=labels,\n",
+ "        )\n",
+ "    del labels, batch, final\n",
+ "    gc.collect()\n",
+ "wer_lora = 100 * metric.compute()\n",
+ "print(f\"{wer_lora=}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "