{ "cells": [ { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "import os\n", "import glob\n", "\n", "def count_files_by_extension(path, extension):\n", " \"\"\"\n", " path : root path to check,\n", " extension : .wav, ...\n", " \"\"\"\n", " total_count = 0\n", " \n", " for foldername, subfolders, filenames in os.walk(path):\n", " files = glob.glob(os.path.join(foldername, f\"*.{extension}\"))\n", " total_count += len(files)\n", " \n", " return total_count\n", "\n", "\n", "root_path = \"./Cleaned_MITI/dataset_2\"" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "num_wav_files = count_files_by_extension(root_path, \"wav\")\n", "num_txt_files = count_files_by_extension(root_path, \"txt\")" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Số lượng file WAV: 2099\n", "Số lượng file text: 2099\n" ] } ], "source": [ "print(f\"Số lượng file WAV: {num_wav_files}\")\n", "print(f\"Số lượng file text: {num_txt_files}\")" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Tần số mẫu (sample rate): 44100 Hz\n", "Số kênh (channels): 1\n" ] } ], "source": [ "import os\n", "import random\n", "import wave\n", "\n", "\n", "def get_random_wav_file_info(folder_path):\n", " for foldername, subfolders, filenames in os.walk(folder_path): \n", " wav_files = glob.glob(f\"{foldername}/*.wav\")\n", " \n", " if not wav_files:\n", " return None, None\n", " \n", " random_wav_file = random.choice(wav_files)\n", " \n", " with wave.open(random_wav_file, 'rb') as wav_file:\n", " sample_rate = wav_file.getframerate()\n", " channels = wav_file.getnchannels()\n", " \n", " return sample_rate, channels\n", "\n", "path_to_wav_folder = \"./Cleaned_MITI/dataset_2/\"\n", "\n", "sample_rate, channels = get_random_wav_file_info(path_to_wav_folder)\n", "\n", "if sample_rate is not None and channels is not None:\n", " print(f\"Tần số mẫu (sample rate): {sample_rate} Hz\")\n", " print(f\"Số kênh (channels): {channels}\")\n", "else:\n", " print(\"Nothing.\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def remove_special_characters(input_string):\n", " special_characters = ['.', ',', '-', '_', \" \"]\n", " \n", " # Duyệt qua từng ký tự trong chuỗi\n", " filtered_string = ''.join([char for char in input_string if char not in special_characters])\n", " \n", " return filtered_string\n", "\n", "# Sử dụng hàm\n", "input_string = \"Hello, this_is_a-test.string!\"\n", "output_string = remove_special_characters(input_string)\n", "print(output_string) # Kết quả: \"Hello thisisa teststring\"\n" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 84%|████████▎ | 164/196 [00:00<00:00, 1629.92it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 196/196 [00:00<00:00, 1580.86it/s]\n", "100%|██████████| 218/218 [00:00<00:00, 1440.12it/s]\n", "100%|██████████| 216/216 [00:00<00:00, 1364.20it/s]\n", "100%|██████████| 205/205 [00:00<00:00, 1412.14it/s]\n", "100%|██████████| 204/204 [00:00<00:00, 1426.29it/s]\n", "100%|██████████| 220/220 [00:00<00:00, 1511.87it/s]\n", "100%|██████████| 225/225 [00:00<00:00, 1499.30it/s]\n", "100%|██████████| 175/175 [00:00<00:00, 1492.85it/s]\n", "100%|██████████| 220/220 [00:00<00:00, 1496.34it/s]\n", "100%|██████████| 220/220 [00:00<00:00, 1480.81it/s]\n" ] } ], "source": [ "import os\n", "import csv\n", "from tqdm import tqdm\n", "import glob\n", "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n", "normalizer = BasicTextNormalizer()\n", "def create_csv_from_wav_folder(folder_path, output_csv_file):\n", " with open(output_csv_file, mode='w', newline='') as csv_file:\n", " csv_writer = csv.writer(csv_file)\n", " csv_writer.writerow(['path', 'name', 'sentence'])\n", "\n", " for person_foldername, _, _ in os.walk(folder_path):\n", " if \"person_\" in person_foldername:\n", " wav_files = glob.glob(os.path.join(person_foldername, \"*.wav\"))\n", "\n", " for wav_file_path in tqdm(wav_files):\n", " wav_filename = os.path.basename(wav_file_path)\n", " text_filename = os.path.splitext(wav_filename)[0] + \".txt\"\n", " text_file_path = os.path.join(person_foldername, text_filename)\n", "\n", " if os.path.exists(text_file_path):\n", " with open(text_file_path, 'r') as txt_file:\n", " text_content = normalizer(txt_file.read())\n", " else:\n", " text_content = \"Not found.\"\n", "\n", " csv_writer.writerow([wav_file_path, wav_filename, text_content])\n", "\n", "root_path = \"./Cleaned_MITI/dataset_2\" \n", "output_csv_file = \"MITI.csv\" \n", "\n", "create_csv_from_wav_folder(root_path, output_csv_file)\n" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2099" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd \n", "data = pd.read_csv('MITI.csv')\n", "len(data)" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "import csv\n", "import random\n", "\n", "def split_csv_file(input_file, output_file1, output_file2, ratio):\n", " with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:\n", " csvreader = csv.reader(csvfile)\n", " header = next(csvreader) \n", " \n", " data = list(csvreader)\n", " random.shuffle(data)\n", "\n", " total_rows = len(data)\n", " rows_output_file1 = int(total_rows * ratio)\n", " rows_output_file2 = total_rows - rows_output_file1\n", " \n", " # Split the data into two parts\n", " data1 = data[:rows_output_file1]\n", " data2 = data[rows_output_file1:]\n", "\n", " with open(output_file1, 'w', newline='', encoding='utf-8') as csvfile1:\n", " csvwriter1 = csv.writer(csvfile1, quotechar='|', quoting=csv.QUOTE_MINIMAL)\n", " csvwriter1.writerow(header)\n", " csvwriter1.writerows(data1)\n", "\n", " with open(output_file2, 'w', newline='', encoding='utf-8') as csvfile2:\n", " csvwriter2 = csv.writer(csvfile2, quotechar='|', quoting=csv.QUOTE_MINIMAL)\n", " csvwriter2.writerow(header)\n", " csvwriter2.writerows(data2)\n", "\n", "input_file = 'MITI.csv'\n", "output_file1 = 'MITI_train.csv'\n", "output_file2 = 'MITI_test.csv'\n", "ratio = 0.8 \n", "\n", "split_csv_file(input_file, output_file1, output_file2, ratio)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset, DatasetDict\n", "\n", "vivos = DatasetDict()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "import os\n", "import numpy as np\n", "\n", "import torch\n", "import torchaudio\n", "\n", "import pandas as pd\n", "import whisper\n", "import torchaudio.transforms as at\n", "from pathlib import Path\n", "\n", "def load_wave(wave_path, sample_rate:int=16000) -> torch.Tensor:\n", " waveform, sr = torchaudio.load(wave_path, normalize=True)\n", " if sample_rate != sr:\n", " waveform = at.Resample(sr, sample_rate)(waveform)\n", " return waveform\n", "\n", "\n", "\n", "def get_list_files_vin100h(phase, dataset_path='./vin_data/vlsp2020_train_set_02/', text_max_length=10000, audio_max_sample_length=1000000, sample_rate=16000):\n", " audio_transcript_pair_list = []\n", " if phase == 'train':\n", " csv_file = 'vin_train.csv'\n", " else:\n", " csv_file = 'vin_test.csv'\n", " df = pd.read_csv(csv_file)\n", " for index, row in df.iterrows():\n", " new_path = Path(row['path'])\n", " audio_id = index\n", " text = row['sentence']\n", " if new_path.exists():\n", " audio = load_wave(new_path, sample_rate=sample_rate)[0]\n", " # if len(text) > text_max_length or len(audio) > audio_max_sample_length:\n", " # print('skip file:', new_path, 'with len text:', len(text), 'and len audio', len(audio))\n", " # continue\n", " audio_transcript_pair_list.append((audio_id, str(new_path), text))\n", " print(audio_transcript_pair_list)\n", " return audio, audio_transcript_pair_list\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "get_list_files_vin100h(phase='train')" ] } ], "metadata": { "kernelspec": { "display_name": "DUY", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.17" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }