diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..c79714ffe2d802994366264f1d864e50e910be41 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,35 @@ +# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker +# You will also find guides on how best to write your Dockerfile + +FROM python:3.10 + +# Create and switch to a non-root user +RUN useradd -m -u 1000 user +USER user +ENV PATH="/home/user/.local/bin:$PATH" + +# Set a working directory for temporary operations +WORKDIR /app + +# Install system packages (git is included here for the clone step below) +USER root +RUN apt-get update && \ +    apt-get install -y wget git calibre ffmpeg libmecab-dev mecab mecab-ipadic && \ +    apt-get clean && \ +    rm -rf /var/lib/apt/lists/* + +# Clone the GitHub repository as the non-root user +USER user +RUN git clone https://github.com/DrewThomasson/ebook2audiobook.git /home/user/ebook2audiobook + +# Set the cloned repository as the base working directory +WORKDIR /home/user/ebook2audiobook + +# Install Python dependencies from the ebook2audiobook repo +RUN pip install --no-cache-dir --upgrade -r requirements.txt + +# Expose the required port +EXPOSE 7860 + +# Start the Gradio app from the repository +CMD ["python", "app.py"] diff --git a/Notebooks/Kaggel Archive Code/4.wav b/Notebooks/Kaggel Archive Code/4.wav deleted file mode 100644 index 7d9b5213bf9d351cf5a19d67b244010a39357127..0000000000000000000000000000000000000000 Binary files a/Notebooks/Kaggel Archive Code/4.wav and /dev/null differ diff --git a/Notebooks/Kaggel Archive Code/LICENSE b/Notebooks/Kaggel Archive Code/LICENSE deleted file mode 100644 index 7fe2ff326e4f6b887789fde344b515c10c3c48dc..0000000000000000000000000000000000000000 --- a/Notebooks/Kaggel Archive Code/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2024 Drew Thomasson - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
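For context, here is how the new image might be built and exercised locally; a minimal sketch, assuming a working Docker install (the `ebook2audiobook` tag and host port mapping are illustrative, not part of the Space config):

```bash
# Build the image from the repository root (the tag name is illustrative)
docker build -t ebook2audiobook .

# Run it, publishing the Gradio port declared by EXPOSE 7860
docker run --rm -p 7860:7860 ebook2audiobook

# The app should then be reachable at http://localhost:7860
```

On Hugging Face Spaces itself none of this is needed; the platform builds and runs the Dockerfile automatically.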
diff --git a/Notebooks/Kaggel Archive Code/README.md b/Notebooks/Kaggel Archive Code/README.md deleted file mode 100644 index d61f07e55b3e857a769a31e3f36a76feaef23a5c..0000000000000000000000000000000000000000 --- a/Notebooks/Kaggel Archive Code/README.md +++ /dev/null @@ -1,118 +0,0 @@ -# This is a sample for running on Kaggle and may not be updated frequently - -# ebook2audiobook Kaggle edition -Generates an audiobook with chapters and ebook metadata using Calibre and XTTS from Coqui TTS, with optional voice cloning and support for multiple languages - -# Import this notebook to Kaggle -https://github.com/Rihcus/ebook2audiobookXTTS/blob/main/kaggle-ebook2audiobook-demo.ipynb - -## Features - -- Converts eBooks to text format using Calibre's `ebook-convert` tool. -- Splits the eBook into chapters for structured audio conversion. -- Uses XTTS from Coqui TTS for high-quality text-to-speech conversion. -- Optional voice cloning feature using a provided voice file. -- Supports different languages for text-to-speech conversion, with English as the default. -- Confirmed to run on only 4 GB of RAM - -## Requirements - -- Python 3.x -- `coqui-tts` Python package -- Calibre (for eBook conversion) -- FFmpeg (for audiobook file creation) -- Optional: Custom voice file for voice cloning - -### Installation Instructions for Dependencies - -Install Python 3.x from [Python.org](https://www.python.org/downloads/). - -Install Calibre: -- Ubuntu: `sudo apt-get install -y calibre` -- macOS: `brew install calibre` -- Windows (PowerShell in Administrator mode): `choco install calibre` - -Install FFmpeg: -- Ubuntu: `sudo apt-get install -y ffmpeg` -- macOS: `brew install ffmpeg` -- Windows (PowerShell in Administrator mode): `choco install ffmpeg` - -Install MeCab (optional, for non-Latin-script language TTS support): -- Ubuntu: `sudo apt-get install -y mecab libmecab-dev mecab-ipadic-utf8` -- macOS: `brew install mecab`, `brew install mecab-ipadic` -- Windows (PowerShell in Administrator mode; mecab-ipadic has no easy install, so no Japanese support on Windows): `choco install mecab` - -Install Python packages: -```bash -pip install tts pydub nltk beautifulsoup4 ebooklib tqdm -``` -For non-Latin-script language TTS support (optional): -`python -m unidic download` -```bash -pip install mecab mecab-python3 unidic -``` - -### Supported Languages - -The script supports the following languages for text-to-speech conversion: - -English (en), -Spanish (es), -French (fr), -German (de), -Italian (it), -Portuguese (pt), -Polish (pl), -Turkish (tr), -Russian (ru), -Dutch (nl), -Czech (cs), -Arabic (ar), -Chinese (zh-cn), -Japanese (ja), -Hungarian (hu), -Korean (ko) - -Specify the language code when running the script to use these languages. - -### Usage - -Navigate to the script's directory in the terminal and execute one of the following commands. -If you have any trouble getting it to run on Windows, it should run fine in WSL2. - -Basic usage (all parameters are mandatory when calling the script): - -```bash -python ebook2audiobook.py <path_to_ebook_file> [path_to_voice_file] [language_code] -``` -Replace `<path_to_ebook_file>` with the path to your eBook file. -Include `[path_to_voice_file]` for voice cloning. 
-Include `[language_code]` to specify the language. - - -## Demo - - - -https://github.com/DrewThomasson/ebook2audiobookXTTS/assets/126999465/bccd7240-f967-4d27-a87d-445034db7d21 - - - -### Supported eBook File Types: -.epub, .pdf, .mobi, .txt, .html, .rtf, .chm, .lit, .pdb, .fb2, .odt, .cbr, .cbz, .prc, .lrf, .pml, .snb, .cbc, .rb, and .tcr -(Best results come from using .epub or .mobi for automatic chapter detection) - -### Outputs an .m4b with all book metadata and chapters; example of an output file in an audiobook player app -![Example_of_output_in_audiobook_program](https://github.com/DrewThomasson/VoxNovel/blob/dc5197dff97252fa44c391dc0596902d71278a88/readme_files/example_in_app.jpeg) - -A special thanks to the creators of: - - --Coqui TTS - --https://github.com/coqui-ai/TTS - - --Calibre - --https://calibre-ebook.com diff --git a/Notebooks/Kaggel Archive Code/Worker_2T4.sh b/Notebooks/Kaggel Archive Code/Worker_2T4.sh deleted file mode 100644 index f366fdc1c8fbe3f906f80f0a9d79a2e68fe49af7..0000000000000000000000000000000000000000 --- a/Notebooks/Kaggel Archive Code/Worker_2T4.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -workers=$1 - -# Clean up operator directory -rm -rf "./Operator" -rm -rf "./Chapter_wav_files" -mkdir "./Operator" -mkdir "./Chapter_wav_files" - - -# Make appropriate temp directories -for i in $(seq 1 $workers); do - mkdir "./Operator/$i" - mkdir "./Operator/$i/temp" - mkdir "./Operator/$i/temp_ebook" -done - -echo "Created $workers directories" - -# Divide the chapters round-robin between the workers -share=1 -for FILE in ./Working_files/temp_ebook/*; do - cp "$FILE" "./Operator/$share/temp_ebook/" - if [ $share -lt $workers ]; - then - share=$((share+1)) - else - share=1 - fi -done - -echo "Split chapters into operator" - -# Run audio generation -#for i in $(seq 1 $workers); do -# echo "Starting Worker $i" -# python p2a_worker.py $i & -#done - -gpu=1 -for i in $(seq 1 $workers); do - if [ $gpu -lt 2 ]; - then - echo "Starting Worker $i on GPU 1" - python p2a_worker_gpu1.py $i & # Run audio generation on GPU 1 (T4) - gpu=2 # switch to gpu 2 on next loop - else - echo "Starting Worker $i on GPU 2" - python p2a_worker_gpu2.py $i & # Run audio generation on GPU 2 (T4) - gpu=1 # switch to gpu 1 on next loop - fi -done - - - -echo "All workers started, waiting for completion..." -wait -echo "Done!" 
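For reference, the deleted Kaggle notebook further down in this diff drives this worker script roughly as follows; a sketch of the three-stage pipeline, assuming the repository root as the working directory:

```bash
# Stage 1: split the ebook into chapter text files (p1.py runs only the chapter-extraction part)
python p1.py "$(ls ./*.epub)" "4.wav" "en"

# Stage 2: fan the chapters out to 4 workers, alternating between the two T4 GPUs
bash Worker_2T4.sh 4

# Stage 3: stitch the generated chapter WAVs into the final m4b (p3.py runs the ffmpeg step)
python p3.py "$(ls ./*.epub)" "4.wav" "en"
```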
diff --git a/Notebooks/Kaggel Archive Code/default_voice.wav b/Notebooks/Kaggel Archive Code/default_voice.wav deleted file mode 100644 index d98ca272441703d70a195f2c098a78a4ff6f100e..0000000000000000000000000000000000000000 Binary files a/Notebooks/Kaggel Archive Code/default_voice.wav and /dev/null differ diff --git a/Notebooks/Kaggel Archive Code/demo_mini_story_chapters_Drew.epub b/Notebooks/Kaggel Archive Code/demo_mini_story_chapters_Drew.epub deleted file mode 100644 index 6f39e5a56e05f80fdca14bdb89431776461178dc..0000000000000000000000000000000000000000 Binary files a/Notebooks/Kaggel Archive Code/demo_mini_story_chapters_Drew.epub and /dev/null differ diff --git a/Notebooks/Kaggel Archive Code/ebook2audiobook.py b/Notebooks/Kaggel Archive Code/ebook2audiobook.py deleted file mode 100644 index d224a7d8f2d02aa56539fe8efbbe7b952ab9066e..0000000000000000000000000000000000000000 --- a/Notebooks/Kaggel Archive Code/ebook2audiobook.py +++ /dev/null @@ -1,462 +0,0 @@ -print("starting...") - -import os -import shutil -import subprocess -import re -from pydub import AudioSegment -import tempfile -from pydub import AudioSegment -import os -import nltk -from nltk.tokenize import sent_tokenize -nltk.download('punkt') # Make sure to download the necessary models -def is_folder_empty(folder_path): - if os.path.exists(folder_path) and os.path.isdir(folder_path): - # List directory contents - if not os.listdir(folder_path): - return True # The folder is empty - else: - return False # The folder is not empty - else: - print(f"The path {folder_path} is not a valid folder.") - return None # The path is not a valid folder - -def remove_folder_with_contents(folder_path): - try: - shutil.rmtree(folder_path) - print(f"Successfully removed {folder_path} and all of its contents.") - except Exception as e: - print(f"Error removing {folder_path}: {e}") - - - - -def wipe_folder(folder_path): - # Check if the folder exists - if not os.path.exists(folder_path): - print(f"The folder {folder_path} does not exist.") - return - - # Iterate over all the items in the given folder - for item in os.listdir(folder_path): - item_path = os.path.join(folder_path, item) - # If it's a file, remove it and print a message - if os.path.isfile(item_path): - os.remove(item_path) - print(f"Removed file: {item_path}") - # If it's a directory, remove it recursively and print a message - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - print(f"Removed directory and its contents: {item_path}") - - print(f"All contents wiped from {folder_path}.") - - -# Example usage -# folder_to_wipe = 'path_to_your_folder' -# wipe_folder(folder_to_wipe) - - -def create_m4b_from_chapters(input_dir, ebook_file, output_dir): - # Function to sort chapters based on their numeric order - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - # Extract metadata and cover image from the eBook file - def extract_metadata_and_cover(ebook_path): - try: - cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg' - subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True) - if os.path.exists(cover_path): - return cover_path - except Exception as e: - print(f"Error extracting eBook metadata or cover: {e}") - return None - # Combine WAV files into a single file - def combine_wav_files(chapter_files, output_path): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Sequentially append each file to the combined_audio - for chapter_file in 
chapter_files: - audio_segment = AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - # Ensure the output directory exists - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - - - # Main logic - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - # Cleanup - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Example usage -# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir') - - - - - - -#this code right here isnt the book grabbing thing but its before to refrence in ordero to create the sepecial chapter labeled book thing with calibre idk some systems cant seem to get it so just in case but the next bit of code after this is the book grabbing code with booknlp -import os -import subprocess -import ebooklib -from ebooklib import epub -from bs4 import BeautifulSoup -import re -import csv -import nltk - -# Only run the main script if Value is True -def create_chapter_labeled_book(ebook_file_path): - # Function to ensure the existence of a directory - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - # Convert the ebook to EPUB format using Calibre's ebook-convert - try: - subprocess.run(['ebook-convert', input_path, output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return False - return True - - def save_chapters_as_text(epub_path): - # Create the 
directory if it doesn't exist - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - # Open the EPUB file - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - # Iterate through the items in the EPUB file - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - # Use BeautifulSoup to parse HTML content - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - # Check if the text is not empty - if text.strip(): - if len(text) < 2300 and previous_filename: - # Append text to the previous chapter if it's short - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - # Create a new chapter file and increment the counter - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - folder_path = os.path.join(".", "Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and sort them by chapter number - files = [f for f in os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as outfile: # Specify UTF-8 encoding here - for i, filename in 
enumerate(sorted_files): - with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -import os -import subprocess -import sys -import torchaudio - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment -# Assuming split_long_sentence and wipe_folder are defined elsewhere in your code - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -def split_long_sentence(sentence, max_length=249, max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param max_length: Maximum allowed length of a sentence. - :param max_pauses: Maximum allowed number of pauses in a sentence. - :return: A list of sentence parts that meet the criteria. - """ - parts = [] - while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' 
and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS -def convert_chapters_to_audio(chapters_dir, output_audio_dir, target_voice_path=None, language=None): - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) # Set progress_bar to False to avoid nested progress bars - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - # Extract chapter number from the filename - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) - temp_audio_directory = os.path.join(".", "Working_files", "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - # Use the specified language model for sentence tokenization - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = [] - if language == "en": - fragments = split_long_sentence(sentence, max_length=249, max_pauses=10) - if language == "it": - fragments = split_long_sentence(sentence, max_length=213, max_pauses=10) - for fragment in fragments: - if fragment != "": #a hot fix to avoid blank fragments - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -# Main execution flow -if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python script.py [target_voice_file_path]") - sys.exit(1) - - ebook_file_path = sys.argv[1] - target_voice = sys.argv[2] if len(sys.argv) > 2 else None - language = sys.argv[3] if len(sys.argv) > 3 else None - - if not calibre_installed(): - sys.exit(1) - - working_files = os.path.join(".","Working_files", "temp_ebook") - full_folder_working_files =os.path.join(".","Working_files") - chapters_directory = os.path.join(".","Working_files", "temp_ebook") - output_audio_directory = os.path.join(".", 'Chapter_wav_files') - - 
print("Wiping and removeing Working_files folder...") - remove_folder_with_contents(full_folder_working_files) - - print("Wiping and and removeing chapter_wav_files folder...") - remove_folder_with_contents(output_audio_directory) - - create_chapter_labeled_book(ebook_file_path) - audiobook_output_path = os.path.join(".", "Audiobooks") - print(f"{chapters_directory}||||{output_audio_directory}|||||{target_voice}") - convert_chapters_to_audio(chapters_directory, output_audio_directory, target_voice, language) - create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path) diff --git a/Notebooks/Kaggel Archive Code/kaggle-ebook2audiobook-demo.ipynb b/Notebooks/Kaggel Archive Code/kaggle-ebook2audiobook-demo.ipynb deleted file mode 100644 index a0eca0c5c2ede50184d480f0f7813bdccc78b12e..0000000000000000000000000000000000000000 --- a/Notebooks/Kaggel Archive Code/kaggle-ebook2audiobook-demo.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":30733,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"Install depdenencies","metadata":{}},{"cell_type":"code","source":"#!DEBIAN_FRONTEND=noninteractive\n!sudo apt-get update # && sudo apt-get -y upgrade\n!sudo apt-get -y install libegl1 \n!sudo apt-get -y install libopengl0\n!sudo apt-get -y install libxcb-cursor0\n!sudo -v && wget -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sudo sh /dev/stdin\n!sudo apt-get install -y ffmpeg\n!pip install tts pydub nltk beautifulsoup4 ebooklib tqdm\n!pip install numpy==1.26.4","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2024-06-17T21:17:43.474429Z","iopub.execute_input":"2024-06-17T21:17:43.474679Z","iopub.status.idle":"2024-06-17T21:20:20.992799Z","shell.execute_reply.started":"2024-06-17T21:17:43.474655Z","shell.execute_reply":"2024-06-17T21:20:20.991791Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Download modified ebook2audiobookXTTS\nhttps://github.com/Rihcus/ebook2audiobookXTTS\n\nOrigional unmodified version\nhttps://github.com/DrewThomasson/ebook2audiobookXTTS","metadata":{}},{"cell_type":"code","source":"!git clone https://github.com/Rihcus/ebook2audiobookXTTS","metadata":{"execution":{"iopub.status.busy":"2024-03-25T23:22:24.156772Z","iopub.execute_input":"2024-03-25T23:22:24.157618Z","iopub.status.idle":"2024-03-25T23:22:26.202486Z","shell.execute_reply.started":"2024-03-25T23:22:24.157577Z","shell.execute_reply":"2024-03-25T23:22:26.201179Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"(optional) Uploading your own epub book.\n\nBy default this notebook will use a sample epub book for testing/demo. 
\n\nIf you want to use your own book, you will need to create a private Kaggle dataset, upload your epub to it, attach it to this notebook, uncomment the two lines of code below, and update the dataset path","metadata":{}},{"cell_type":"code","source":"# !cp -r /kaggle/input//*.epub /kaggle/working/ebook2audiobookXTTS #copy your custom book\n# !rm /kaggle/working/ebook2audiobookXTTS/demo_mini_story_chapters_Drew.epub #remove default sample book","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"This installs the xtts_v2 model","metadata":{}},{"cell_type":"code","source":"import os\nos.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n\n!cd /kaggle/working/ebook2audiobookXTTS && tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 --text \"test\" --speaker_wav ./4.wav --language_idx en --use_cuda true","metadata":{"execution":{"iopub.status.busy":"2024-03-25T23:23:15.626677Z","iopub.execute_input":"2024-03-25T23:23:15.627585Z","iopub.status.idle":"2024-03-25T23:27:40.712856Z","shell.execute_reply.started":"2024-03-25T23:23:15.627548Z","shell.execute_reply":"2024-03-25T23:27:40.711852Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"This is a modified version of ebook2audiobookXTTS. \n\n- p1.py only runs the first part of ebook2audiobookXTTS and generates chapter txts (I commented out the other parts)\n - https://github.com/Rihcus/ebook2audiobookXTTS/blob/main/p1.py\n- Worker_2T4.sh is a basic attempt at multi-GPU support. The requested number of worker processes (4 here) will be run in parallel\n - Worker_2T4 will try to divide the chapters into even groups based on the number of workers (e.g., 4 groups for 4 workers)\n - It will try to divvy up the work between Kaggle's two T4 GPUs\n - I'm not sure how much of a difference it makes given Kaggle's CPU limitations\n \nhttps://github.com/Rihcus/ebook2audiobookXTTS/blob/main/Worker_2T4.sh\n\nhttps://github.com/Rihcus/ebook2audiobookXTTS/blob/main/p2a_worker_gpu1.py\n\nhttps://github.com/Rihcus/ebook2audiobookXTTS/blob/main/p2a_worker_gpu2.py","metadata":{}},{"cell_type":"code","source":"!cd /kaggle/working/ebook2audiobookXTTS && python p1.py \"$(ls ./*.epub)\" \"4.wav\" \"en\"\n!cd /kaggle/working/ebook2audiobookXTTS && bash Worker_2T4.sh 4","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"p3.py runs the final ffmpeg command. 
ffmpeg has been a bit buggy\nhttps://github.com/Rihcus/ebook2audiobookXTTS/blob/main/p3.py","metadata":{}},{"cell_type":"code","source":"!cd /kaggle/working/ebook2audiobookXTTS && python p3.py \"$(ls ./*.epub)\" \"4.wav\" \"en\"","metadata":{},"execution_count":null,"outputs":[]}]} \ No newline at end of file diff --git a/Notebooks/Kaggel Archive Code/p1.py b/Notebooks/Kaggel Archive Code/p1.py deleted file mode 100644 index dec3eba3b93ce57a3649214414bcef521e3bd2a5..0000000000000000000000000000000000000000 --- a/Notebooks/Kaggel Archive Code/p1.py +++ /dev/null @@ -1,462 +0,0 @@ -print("starting...") - -import os -import shutil -import subprocess -import re -from pydub import AudioSegment -import tempfile -from pydub import AudioSegment -import os -import nltk -from nltk.tokenize import sent_tokenize -nltk.download('punkt') # Make sure to download the necessary models -def is_folder_empty(folder_path): - if os.path.exists(folder_path) and os.path.isdir(folder_path): - # List directory contents - if not os.listdir(folder_path): - return True # The folder is empty - else: - return False # The folder is not empty - else: - print(f"The path {folder_path} is not a valid folder.") - return None # The path is not a valid folder - -def remove_folder_with_contents(folder_path): - try: - shutil.rmtree(folder_path) - print(f"Successfully removed {folder_path} and all of its contents.") - except Exception as e: - print(f"Error removing {folder_path}: {e}") - - - - -def wipe_folder(folder_path): - # Check if the folder exists - if not os.path.exists(folder_path): - print(f"The folder {folder_path} does not exist.") - return - - # Iterate over all the items in the given folder - for item in os.listdir(folder_path): - item_path = os.path.join(folder_path, item) - # If it's a file, remove it and print a message - if os.path.isfile(item_path): - os.remove(item_path) - print(f"Removed file: {item_path}") - # If it's a directory, remove it recursively and print a message - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - print(f"Removed directory and its contents: {item_path}") - - print(f"All contents wiped from {folder_path}.") - - -# Example usage -# folder_to_wipe = 'path_to_your_folder' -# wipe_folder(folder_to_wipe) - - -def create_m4b_from_chapters(input_dir, ebook_file, output_dir): - # Function to sort chapters based on their numeric order - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - # Extract metadata and cover image from the eBook file - def extract_metadata_and_cover(ebook_path): - try: - cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg' - subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True) - if os.path.exists(cover_path): - return cover_path - except Exception as e: - print(f"Error extracting eBook metadata or cover: {e}") - return None - # Combine WAV files into a single file - def combine_wav_files(chapter_files, output_path): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Sequentially append each file to the combined_audio - for chapter_file in chapter_files: - audio_segment = AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 
'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - # Ensure the output directory exists - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - - - # Main logic - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - # Cleanup - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Example usage -# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir') - - - - - - -#this code right here isnt the book grabbing thing but its before to refrence in ordero to create the sepecial chapter labeled book thing with calibre idk some systems cant seem to get it so just in case but the next bit of code after this is the book grabbing code with booknlp -import os -import subprocess -import ebooklib -from ebooklib import epub -from bs4 import BeautifulSoup -import re -import csv -import nltk - -# Only run the main script if Value is True -def create_chapter_labeled_book(ebook_file_path): - # Function to ensure the existence of a directory - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - # Convert the ebook to EPUB format using Calibre's ebook-convert - try: - subprocess.run(['ebook-convert', input_path, output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return False - return True - - def save_chapters_as_text(epub_path): - # Create the directory if it doesn't exist - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - # Open the EPUB file - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - # Iterate through the items in the EPUB file - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - # Use 
BeautifulSoup to parse HTML content - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - # Check if the text is not empty - if text.strip(): - if len(text) < 2300 and previous_filename: - # Append text to the previous chapter if it's short - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - # Create a new chapter file and increment the counter - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - folder_path = os.path.join(".", "Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and sort them by chapter number - files = [f for f in os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as outfile: # Specify UTF-8 encoding here - for i, filename in enumerate(sorted_files): - with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = os.path.join(".", 'Working_files', 'Book', 
'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -import os -import subprocess -import sys -import torchaudio - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment -# Assuming split_long_sentence and wipe_folder are defined elsewhere in your code - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -def split_long_sentence(sentence, max_length=249, max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param max_length: Maximum allowed length of a sentence. - :param max_pauses: Maximum allowed number of pauses in a sentence. - :return: A list of sentence parts that meet the criteria. - """ - parts = [] - while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' 
and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS -def convert_chapters_to_audio(chapters_dir, output_audio_dir, target_voice_path=None, language=None): - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) # Set progress_bar to False to avoid nested progress bars - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - # Extract chapter number from the filename - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) - temp_audio_directory = os.path.join(".", "Working_files", "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - # Use the specified language model for sentence tokenization - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = [] - if language == "en": - fragments = split_long_sentence(sentence, max_length=249, max_pauses=10) - if language == "it": - fragments = split_long_sentence(sentence, max_length=213, max_pauses=10) - for fragment in fragments: - if fragment != "": #a hot fix to avoid blank fragments - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -# Main execution flow -if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python script.py [target_voice_file_path]") - sys.exit(1) - - ebook_file_path = sys.argv[1] - target_voice = sys.argv[2] if len(sys.argv) > 2 else None - language = sys.argv[3] if len(sys.argv) > 3 else None - - if not calibre_installed(): - sys.exit(1) - - working_files = os.path.join(".","Working_files", "temp_ebook") - full_folder_working_files =os.path.join(".","Working_files") - chapters_directory = os.path.join(".","Working_files", "temp_ebook") - output_audio_directory = os.path.join(".", 'Chapter_wav_files') - - 
print("Wiping and removeing Working_files folder...") - remove_folder_with_contents(full_folder_working_files) - - print("Wiping and and removeing chapter_wav_files folder...") - remove_folder_with_contents(output_audio_directory) - - create_chapter_labeled_book(ebook_file_path) -# audiobook_output_path = os.path.join(".", "Audiobooks") -# print(f"{chapters_directory}||||{output_audio_directory}|||||{target_voice}") -# convert_chapters_to_audio(chapters_directory, output_audio_directory, target_voice, language) -# create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path) diff --git a/Notebooks/Kaggel Archive Code/p2a_worker_gpu1.py b/Notebooks/Kaggel Archive Code/p2a_worker_gpu1.py deleted file mode 100644 index d39a187ed33b187fff0a219b48b8562c758338f5..0000000000000000000000000000000000000000 --- a/Notebooks/Kaggel Archive Code/p2a_worker_gpu1.py +++ /dev/null @@ -1,465 +0,0 @@ -print("starting...") - -#import os -#import shutil -#import subprocess -import re -#from pydub import AudioSegment -#import tempfile -#from pydub import AudioSegment -#import os -import nltk -#from nltk.tokenize import sent_tokenize -nltk.download('punkt') # Make sure to download the necessary models -def is_folder_empty(folder_path): - if os.path.exists(folder_path) and os.path.isdir(folder_path): - # List directory contents - if not os.listdir(folder_path): - return True # The folder is empty - else: - return False # The folder is not empty - else: - print(f"The path {folder_path} is not a valid folder.") - return None # The path is not a valid folder - -def remove_folder_with_contents(folder_path): - try: - shutil.rmtree(folder_path) - print(f"Successfully removed {folder_path} and all of its contents.") - except Exception as e: - print(f"Error removing {folder_path}: {e}") - - - - -def wipe_folder(folder_path): - # Check if the folder exists - if not os.path.exists(folder_path): - print(f"The folder {folder_path} does not exist.") - return - - # Iterate over all the items in the given folder - for item in os.listdir(folder_path): - item_path = os.path.join(folder_path, item) - # If it's a file, remove it and print a message - if os.path.isfile(item_path): - os.remove(item_path) - print(f"Removed file: {item_path}") - # If it's a directory, remove it recursively and print a message - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - print(f"Removed directory and its contents: {item_path}") - - print(f"All contents wiped from {folder_path}.") - - -# Example usage -# folder_to_wipe = 'path_to_your_folder' -# wipe_folder(folder_to_wipe) - - -def create_m4b_from_chapters(input_dir, ebook_file, output_dir): - # Function to sort chapters based on their numeric order - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - # Extract metadata and cover image from the eBook file - def extract_metadata_and_cover(ebook_path): - try: - cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg' - subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True) - if os.path.exists(cover_path): - return cover_path - except Exception as e: - print(f"Error extracting eBook metadata or cover: {e}") - return None - # Combine WAV files into a single file - def combine_wav_files(chapter_files, output_path): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Sequentially append each file to the combined_audio - for chapter_file in chapter_files: - audio_segment = 
AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - # Ensure the output directory exists - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - - - # Main logic - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - # Cleanup - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Example usage -# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir') - - - - - - -#this code right here isnt the book grabbing thing but its before to refrence in ordero to create the sepecial chapter labeled book thing with calibre idk some systems cant seem to get it so just in case but the next bit of code after this is the book grabbing code with booknlp -#import os -#import subprocess -#import ebooklib -#from ebooklib import epub -#from bs4 import BeautifulSoup -#import re -#import csv -#import nltk - -# Only run the main script if Value is True -def create_chapter_labeled_book(ebook_file_path): - # Function to ensure the existence of a directory - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - # Convert the ebook to EPUB format using Calibre's ebook-convert - try: - subprocess.run(['ebook-convert', input_path, output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return False - return True - - def save_chapters_as_text(epub_path): - # Create the directory if it doesn't 
exist - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - # Open the EPUB file - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - # Iterate through the items in the EPUB file - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - # Use BeautifulSoup to parse HTML content - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - # Check if the text is not empty - if text.strip(): - if len(text) < 2300 and previous_filename: - # Append text to the previous chapter if it's short - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - # Create a new chapter file and increment the counter - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - folder_path = os.path.join(".", "Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and sort them by chapter number - files = [f for f in os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as outfile: # Specify UTF-8 encoding here - for i, filename in enumerate(sorted_files): - with 
open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -#import os -import subprocess -import sys -import torchaudio # not sure if this is needed - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment -# Assuming split_long_sentence and wipe_folder are defined elsewhere in your code - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" -device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -def split_long_sentence(sentence, max_length=249, max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param max_length: Maximum allowed length of a sentence. - :param max_pauses: Maximum allowed number of pauses in a sentence. - :return: A list of sentence parts that meet the criteria. - """ - parts = [] - while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' 
and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS -def convert_chapters_to_audio(chapters_dir, output_audio_dir, target_voice_path=None, language=None): - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) # Set progress_bar to False to avoid nested progress bars - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - # Extract chapter number from the filename - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) -# temp_audio_directory = os.path.join(".", "Working_files", "temp") - temp_audio_directory = os.path.join(".", "Operator",worker_num, "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - # Use the specified language model for sentence tokenization - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = [] - if language == "en": - fragments = split_long_sentence(sentence, max_length=249, max_pauses=10) - if language == "it": - fragments = split_long_sentence(sentence, max_length=213, max_pauses=10) - for fragment in fragments: - if fragment != "": #a hot fix to avoid blank fragments - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -# Main execution flow -if __name__ == "__main__": -# if len(sys.argv) < 2: -# print("Usage: python script.py [target_voice_file_path]") -# sys.exit(1) - - worker_num = sys.argv[1] #to let the script know which temp dir its using in operator -# ebook_file_path = sys.argv[1] - target_voice = "./4.wav" # sys.argv[2] if len(sys.argv) > 2 else None - language = "en" # sys.argv[3] if len(sys.argv) > 3 else None - -# if not calibre_installed(): -# sys.exit(1) - - working_files = os.path.join(".","Working_files", "temp_ebook") - 
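# ---------------------------------------------------------------------------
# [Editor's aside - illustrative sketch, not part of the original worker
# script.] The split_long_sentence() helper above is what keeps XTTS inputs
# short: it walks the sentence left to right and cuts at the last
# comma/semicolon/period that still fits inside max_length, falling back to
# a hard cut at max_length when no punctuation is available. A quick
# self-contained check of that behavior (the sample text is made up):
#
#     sample = ("This is a long sentence, with several pauses; it keeps "
#               "going, and going, until it finally ends.")
#     for part in split_long_sentence(sample, max_length=40, max_pauses=2):
#         print(len(part), part)
#
# Every printed fragment is at most 40 characters, and each cut lands just
# after a punctuation mark whenever one occurred inside the window.
# ---------------------------------------------------------------------------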
full_folder_working_files = os.path.join(".","Working_files")
-# chapters_directory = os.path.join(".","Working_files", "temp_ebook")
-    chapters_directory = os.path.join(".","Operator",worker_num, "temp_ebook")
-    output_audio_directory = os.path.join(".", 'Chapter_wav_files')
-
-# print("Wiping and removing Working_files folder...")
-# remove_folder_with_contents(full_folder_working_files)
-#
-# print("Wiping and removing chapter_wav_files folder...")
-# remove_folder_with_contents(output_audio_directory)
-
-# create_chapter_labeled_book(ebook_file_path)
-    audiobook_output_path = os.path.join(".", "Audiobooks")
-    print(f"{chapters_directory}||||{output_audio_directory}|||||{target_voice}")
-    convert_chapters_to_audio(chapters_directory, output_audio_directory, target_voice, language)
-# create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path)
diff --git a/Notebooks/Kaggel Archive Code/p2a_worker_gpu2.py b/Notebooks/Kaggel Archive Code/p2a_worker_gpu2.py
deleted file mode 100644
index 857aa89df181282de5b96bbd09b9eeb6dcd998a1..0000000000000000000000000000000000000000
--- a/Notebooks/Kaggel Archive Code/p2a_worker_gpu2.py
+++ /dev/null
@@ -1,465 +0,0 @@
-print("starting...")
-
-#import os
-#import shutil
-#import subprocess
-import re
-#from pydub import AudioSegment
-#import tempfile
-#from pydub import AudioSegment
-#import os
-import nltk
-#from nltk.tokenize import sent_tokenize
-nltk.download('punkt') # Make sure to download the necessary models
-def is_folder_empty(folder_path):
-    if os.path.exists(folder_path) and os.path.isdir(folder_path):
-        # List directory contents
-        if not os.listdir(folder_path):
-            return True # The folder is empty
-        else:
-            return False # The folder is not empty
-    else:
-        print(f"The path {folder_path} is not a valid folder.")
-        return None # The path is not a valid folder
-
-def remove_folder_with_contents(folder_path):
-    try:
-        shutil.rmtree(folder_path)
-        print(f"Successfully removed {folder_path} and all of its contents.")
-    except Exception as e:
-        print(f"Error removing {folder_path}: {e}")
-
-
-
-
-def wipe_folder(folder_path):
-    # Check if the folder exists
-    if not os.path.exists(folder_path):
-        print(f"The folder {folder_path} does not exist.")
-        return
-
-    # Iterate over all the items in the given folder
-    for item in os.listdir(folder_path):
-        item_path = os.path.join(folder_path, item)
-        # If it's a file, remove it and print a message
-        if os.path.isfile(item_path):
-            os.remove(item_path)
-            print(f"Removed file: {item_path}")
-        # If it's a directory, remove it recursively and print a message
-        elif os.path.isdir(item_path):
-            shutil.rmtree(item_path)
-            print(f"Removed directory and its contents: {item_path}")
-
-    print(f"All contents wiped from {folder_path}.")
-
-
-# Example usage
-# folder_to_wipe = 'path_to_your_folder'
-# wipe_folder(folder_to_wipe)
-
-
-def create_m4b_from_chapters(input_dir, ebook_file, output_dir):
-    # Function to sort chapters based on their numeric order
-    def sort_key(chapter_file):
-        numbers = re.findall(r'\d+', chapter_file)
-        return int(numbers[0]) if numbers else 0
-
-    # Extract metadata and cover image from the eBook file
-    def extract_metadata_and_cover(ebook_path):
-        try:
-            cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg'
-            subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True)
-            if os.path.exists(cover_path):
-                return cover_path
-        except Exception as e:
-            print(f"Error extracting eBook metadata or cover: {e}")
-        return None
-    # Combine WAV files into 
a single file - def combine_wav_files(chapter_files, output_path): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Sequentially append each file to the combined_audio - for chapter_file in chapter_files: - audio_segment = AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - # Ensure the output directory exists - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - - - # Main logic - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - # Cleanup - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Example usage -# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir') - - - - - - -#this code right here isnt the book grabbing thing but its before to refrence in ordero to create the sepecial chapter labeled book thing with calibre idk some systems cant seem to get it so just in case but the next bit of code after this is the book grabbing code with booknlp -#import os -#import subprocess -#import ebooklib -#from ebooklib import epub -#from bs4 import BeautifulSoup -#import re -#import csv -#import nltk - -# Only run the main script if Value is True -def create_chapter_labeled_book(ebook_file_path): - # Function to ensure the existence of a directory - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - # Convert the ebook to EPUB format using Calibre's ebook-convert - try: - subprocess.run(['ebook-convert', 
input_path, output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return False - return True - - def save_chapters_as_text(epub_path): - # Create the directory if it doesn't exist - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - # Open the EPUB file - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - # Iterate through the items in the EPUB file - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - # Use BeautifulSoup to parse HTML content - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - # Check if the text is not empty - if text.strip(): - if len(text) < 2300 and previous_filename: - # Append text to the previous chapter if it's short - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - # Create a new chapter file and increment the counter - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - folder_path = os.path.join(".", "Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and sort them by chapter number - files = [f 
for f in os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as outfile: # Specify UTF-8 encoding here - for i, filename in enumerate(sorted_files): - with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -#import os -import subprocess -import sys -import torchaudio # not sure if this is needed - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment -# Assuming split_long_sentence and wipe_folder are defined elsewhere in your code - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" -device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") - -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -def split_long_sentence(sentence, max_length=249, max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param max_length: Maximum allowed length of a sentence. - :param max_pauses: Maximum allowed number of pauses in a sentence. - :return: A list of sentence parts that meet the criteria. - """ - parts = [] - while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' 
and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS -def convert_chapters_to_audio(chapters_dir, output_audio_dir, target_voice_path=None, language=None): - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) # Set progress_bar to False to avoid nested progress bars - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - # Extract chapter number from the filename - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) -# temp_audio_directory = os.path.join(".", "Working_files", "temp") - temp_audio_directory = os.path.join(".", "Operator",worker_num, "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - # Use the specified language model for sentence tokenization - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = [] - if language == "en": - fragments = split_long_sentence(sentence, max_length=249, max_pauses=10) - if language == "it": - fragments = split_long_sentence(sentence, max_length=213, max_pauses=10) - for fragment in fragments: - if fragment != "": #a hot fix to avoid blank fragments - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -# Main execution flow -if __name__ == "__main__": -# if len(sys.argv) < 2: -# print("Usage: python script.py [target_voice_file_path]") -# sys.exit(1) - - worker_num = sys.argv[1] #to let the script know which temp dir its using in operator -# ebook_file_path = sys.argv[1] - target_voice = "./4.wav" # sys.argv[2] if len(sys.argv) > 2 else None - language = "en" # sys.argv[3] if len(sys.argv) > 3 else None - -# if not calibre_installed(): -# sys.exit(1) - - working_files = os.path.join(".","Working_files", "temp_ebook") - 
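# ---------------------------------------------------------------------------
# [Editor's aside - illustrative sketch, not part of the original worker
# script.] For reference, the chapter metadata that create_m4b_from_chapters()
# builds uses ffmpeg's FFMETADATA1 format: one [CHAPTER] stanza per chapter
# WAV, timed in milliseconds (TIMEBASE=1/1000). For two chapters of 1.5 s
# and 2.0 s, generate_ffmpeg_metadata() would write:
#
#     ;FFMETADATA1
#     [CHAPTER]
#     TIMEBASE=1/1000
#     START=0
#     END=1500
#     title=Chapter 1
#     [CHAPTER]
#     TIMEBASE=1/1000
#     START=1500
#     END=3500
#     title=Chapter 2
#
# and create_m4b() then attaches it via -map_metadata 1, roughly:
#
#     ffmpeg -i combined.wav -i metadata.txt -i cover.jpg \
#            -map 0:a -map 2:v -map_metadata 1 \
#            -c:a aac -b:a 192k -c:v png -disposition:v attached_pic out.m4b
# ---------------------------------------------------------------------------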
full_folder_working_files = os.path.join(".","Working_files")
-# chapters_directory = os.path.join(".","Working_files", "temp_ebook")
-    chapters_directory = os.path.join(".","Operator",worker_num, "temp_ebook")
-    output_audio_directory = os.path.join(".", 'Chapter_wav_files')
-
-# print("Wiping and removing Working_files folder...")
-# remove_folder_with_contents(full_folder_working_files)
-#
-# print("Wiping and removing chapter_wav_files folder...")
-# remove_folder_with_contents(output_audio_directory)
-
-# create_chapter_labeled_book(ebook_file_path)
-    audiobook_output_path = os.path.join(".", "Audiobooks")
-    print(f"{chapters_directory}||||{output_audio_directory}|||||{target_voice}")
-    convert_chapters_to_audio(chapters_directory, output_audio_directory, target_voice, language)
-# create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path)
diff --git a/Notebooks/Kaggel Archive Code/p3.py b/Notebooks/Kaggel Archive Code/p3.py
deleted file mode 100644
index a22a76c081ded8525e375cfc523358c717c314d8..0000000000000000000000000000000000000000
--- a/Notebooks/Kaggel Archive Code/p3.py
+++ /dev/null
@@ -1,462 +0,0 @@
-print("starting...")
-
-import os
-import shutil
-import subprocess
-import re
-from pydub import AudioSegment
-import tempfile
-from pydub import AudioSegment
-import os
-import nltk
-from nltk.tokenize import sent_tokenize
-nltk.download('punkt') # Make sure to download the necessary models
-def is_folder_empty(folder_path):
-    if os.path.exists(folder_path) and os.path.isdir(folder_path):
-        # List directory contents
-        if not os.listdir(folder_path):
-            return True # The folder is empty
-        else:
-            return False # The folder is not empty
-    else:
-        print(f"The path {folder_path} is not a valid folder.")
-        return None # The path is not a valid folder
-
-def remove_folder_with_contents(folder_path):
-    try:
-        shutil.rmtree(folder_path)
-        print(f"Successfully removed {folder_path} and all of its contents.")
-    except Exception as e:
-        print(f"Error removing {folder_path}: {e}")
-
-
-
-
-def wipe_folder(folder_path):
-    # Check if the folder exists
-    if not os.path.exists(folder_path):
-        print(f"The folder {folder_path} does not exist.")
-        return
-
-    # Iterate over all the items in the given folder
-    for item in os.listdir(folder_path):
-        item_path = os.path.join(folder_path, item)
-        # If it's a file, remove it and print a message
-        if os.path.isfile(item_path):
-            os.remove(item_path)
-            print(f"Removed file: {item_path}")
-        # If it's a directory, remove it recursively and print a message
-        elif os.path.isdir(item_path):
-            shutil.rmtree(item_path)
-            print(f"Removed directory and its contents: {item_path}")
-
-    print(f"All contents wiped from {folder_path}.")
-
-
-# Example usage
-# folder_to_wipe = 'path_to_your_folder'
-# wipe_folder(folder_to_wipe)
-
-
-def create_m4b_from_chapters(input_dir, ebook_file, output_dir):
-    # Function to sort chapters based on their numeric order
-    def sort_key(chapter_file):
-        numbers = re.findall(r'\d+', chapter_file)
-        return int(numbers[0]) if numbers else 0
-
-    # Extract metadata and cover image from the eBook file
-    def extract_metadata_and_cover(ebook_path):
-        try:
-            cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg'
-            subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True)
-            if os.path.exists(cover_path):
-                return cover_path
-        except Exception as e:
-            print(f"Error extracting eBook metadata or cover: {e}")
-        return None
-    # Combine WAV files into a single file
-    def 
combine_wav_files(chapter_files, output_path): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Sequentially append each file to the combined_audio - for chapter_file in chapter_files: - audio_segment = AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - # Ensure the output directory exists - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - - - # Main logic - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - # Cleanup - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Example usage -# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir') - - - - - - -#this code right here isnt the book grabbing thing but its before to refrence in ordero to create the sepecial chapter labeled book thing with calibre idk some systems cant seem to get it so just in case but the next bit of code after this is the book grabbing code with booknlp -import os -import subprocess -import ebooklib -from ebooklib import epub -from bs4 import BeautifulSoup -import re -import csv -import nltk - -# Only run the main script if Value is True -def create_chapter_labeled_book(ebook_file_path): - # Function to ensure the existence of a directory - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - # Convert the ebook to EPUB format using Calibre's ebook-convert - try: - subprocess.run(['ebook-convert', input_path, output_path], 
check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return False - return True - - def save_chapters_as_text(epub_path): - # Create the directory if it doesn't exist - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - # Open the EPUB file - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - # Iterate through the items in the EPUB file - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - # Use BeautifulSoup to parse HTML content - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - # Check if the text is not empty - if text.strip(): - if len(text) < 2300 and previous_filename: - # Append text to the previous chapter if it's short - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - # Create a new chapter file and increment the counter - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - folder_path = os.path.join(".", "Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and sort them by chapter number - files = [f for f in 
os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as outfile: # Specify UTF-8 encoding here - for i, filename in enumerate(sorted_files): - with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -import os -import subprocess -import sys -import torchaudio - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment -# Assuming split_long_sentence and wipe_folder are defined elsewhere in your code - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -def split_long_sentence(sentence, max_length=249, max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param max_length: Maximum allowed length of a sentence. - :param max_pauses: Maximum allowed number of pauses in a sentence. - :return: A list of sentence parts that meet the criteria. - """ - parts = [] - while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' 
and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS -def convert_chapters_to_audio(chapters_dir, output_audio_dir, target_voice_path=None, language=None): - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) # Set progress_bar to False to avoid nested progress bars - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - # Extract chapter number from the filename - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) - temp_audio_directory = os.path.join(".", "Working_files", "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - # Use the specified language model for sentence tokenization - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = [] - if language == "en": - fragments = split_long_sentence(sentence, max_length=249, max_pauses=10) - if language == "it": - fragments = split_long_sentence(sentence, max_length=213, max_pauses=10) - for fragment in fragments: - if fragment != "": #a hot fix to avoid blank fragments - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -# Main execution flow -if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python script.py [target_voice_file_path]") - sys.exit(1) - - ebook_file_path = sys.argv[1] - target_voice = sys.argv[2] if len(sys.argv) > 2 else None - language = sys.argv[3] if len(sys.argv) > 3 else None - - if not calibre_installed(): - sys.exit(1) - - working_files = os.path.join(".","Working_files", "temp_ebook") - full_folder_working_files =os.path.join(".","Working_files") - chapters_directory = os.path.join(".","Working_files", "temp_ebook") - output_audio_directory = os.path.join(".", 'Chapter_wav_files') - -# 
print("Wiping and removeing Working_files folder...") -# remove_folder_with_contents(full_folder_working_files) -# -# print("Wiping and and removeing chapter_wav_files folder...") -# remove_folder_with_contents(output_audio_directory) - -# create_chapter_labeled_book(ebook_file_path) - audiobook_output_path = os.path.join(".", "Audiobooks") -# print(f"{chapters_directory}||||{output_audio_directory}|||||{target_voice}") -# convert_chapters_to_audio(chapters_directory, output_audio_directory, target_voice, language) - create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path) diff --git a/Notebooks/colab_ebook2audiobookxtts.ipynb b/Notebooks/colab_ebook2audiobookxtts.ipynb deleted file mode 100644 index b990dfe2339c2f27d7585fa8c9541e97a9e4423d..0000000000000000000000000000000000000000 --- a/Notebooks/colab_ebook2audiobookxtts.ipynb +++ /dev/null @@ -1,105 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "gpuType": "T4", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Welcome to the ebook2audiobookxtts free google colab!\n", - "## 🌟 Features\n", - "\n", - "- 📖 Converts eBooks to text format with Calibre.\n", - "- 📚 Splits eBook into chapters for organized audio.\n", - "- 🎙️ High-quality text-to-speech with Coqui XTTS.\n", - "- 🗣️ Optional voice cloning with your own voice file.\n", - "- 🌍 Supports multiple languages! (English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu), Korean (ko)).\n", - "## Want to run locally for free? ⬇\n", - "## [Check out the ebook2audiobookxtts github!](https://github.com/DrewThomasson/ebook2audiobookXTTS)" - ], - "metadata": { - "id": "DKNNnwD-HJwQ" - } - }, - { - "cell_type": "code", - "source": [ - "# @title 🛠️ Install requirments\n", - "#!DEBIAN_FRONTEND=noninteractive\n", - "!sudo apt-get update # && sudo apt-get -y upgrade\n", - "!sudo apt-get -y install libegl1\n", - "!sudo apt-get -y install libopengl0\n", - "!sudo apt-get -y install libxcb-cursor0\n", - "!sudo -v && wget -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sudo sh /dev/stdin\n", - "!sudo apt-get install -y ffmpeg\n", - "#!sudo apt-get install -y calibre\n", - "!pip install ebook2audiobook-install-counter\n", - "!pip install ebooklib\n", - "!pip install pydub\n", - "!pip install nltk\n", - "!pip install beautifulsoup4\n", - "!pip install tqdm\n", - "!pip install gradio\n", - "!pip install coqui-tts" - ], - "metadata": { - "id": "Edxj355K0rUz", - "collapsed": true, - "cellView": "form" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# @title 🚀 Run ebook2audiobookxtts! (Make sure to set the runtime to have gpu to have faster generation speeds! 
:)\n", - "#ntlk error fix\n", - "#https://github.com/delip/PyTorchNLPBook/issues/14\n", - "import nltk\n", - "nltk.download('punkt')\n", - "\n", - "#Auto agree to xtts\n", - "import os\n", - "os.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n", - "\n", - "# To download the app.py and the Default_voice wav if not seen locally\n", - "!wget -O /content/app.py https://raw.githubusercontent.com/DrewThomasson/ebook2audiobookXTTS/main/app.py\n", - "!wget -O /content/default_voice.wav https://raw.githubusercontent.com/DrewThomasson/ebook2audiobookXTTS/main/default_voice.wav\n", - "\n", - "# Start the app with Share=True for the gradio interface\n", - "!python /content/app.py --share True" - ], - "metadata": { - "id": "658BTHueyLMo", - "cellView": "form" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/Notebooks/kaggle-beta-of-ebook2audiobookxtts-ipynb.ipynb b/Notebooks/kaggle-beta-of-ebook2audiobookxtts-ipynb.ipynb deleted file mode 100644 index 8b218357d0d3046e760c800f3aa0607c624a48b9..0000000000000000000000000000000000000000 --- a/Notebooks/kaggle-beta-of-ebook2audiobookxtts-ipynb.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"colab":{"provenance":[],"gpuType":"T4"},"accelerator":"GPU","kaggle":{"accelerator":"gpu","dataSources":[],"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# IGNORE THESE ITS OLD LOL\n\n# install needed packages\n\n##!apt-get update\n\n##!apt-get install wget unzip git ffmpeg calibre\n\n\n\n# pip install requirments\n\n##!pip install tts==0.21.3 pydub nltk beautifulsoup4 ebooklib tqdm gradio\n\n\n\n##!pip install numpy==1.23\n\n##!pip install --no-binary lxml lxml\n\n##import os\n\n##os.kill(os.getpid(), 9)\n","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"gh3HEhmzuqVA","outputId":"81217d71-7576-43db-d56c-07ce11ea6517","jupyter":{"source_hidden":true},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#!DEBIAN_FRONTEND=noninteractive\n\n!sudo apt-get update # && sudo apt-get -y upgrade\n\n!sudo apt-get -y install libegl1\n\n!sudo apt-get -y install libopengl0\n\n!sudo apt-get -y install libxcb-cursor0\n\n!sudo -v && wget -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sudo sh /dev/stdin\n\n!sudo apt-get install -y ffmpeg\n\n!pip install tts pydub nltk beautifulsoup4 ebooklib tqdm\n\n!pip install numpy==1.26.4\n\n!pip install gradio","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":1000},"id":"Edxj355K0rUz","outputId":"9fc5f4e1-1ba2-4814-a477-496f626c2772","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Start the app with Share=True for the gradio interface\n\n\n\n#ntlk error fix\n\n#https://github.com/delip/PyTorchNLPBook/issues/14\n\nimport nltk\n\nnltk.download('punkt')\n\n\n\n#Auto agree to xtts\n\nimport os\n\nos.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n\n\n\n!python /kaggle/working/app.py --share True","metadata":{"id":"EZIZva9Tvdbb","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#ntlk error fix\n\n#https://github.com/delip/PyTorchNLPBook/issues/14\n\nimport 
nltk\n\nnltk.download('punkt')\n\n\n\n#Auto agree to xtts\n\nimport os\n\nos.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n\n\n\n# To download the app.py and the Default_voice wav if not seen locally\n\n!wget -O /kaggle/working/app.py https://raw.githubusercontent.com/DrewThomasson/ebook2audiobookXTTS/main/app.py\n\n!wget -O /kaggle/working/default_voice.wav https://raw.githubusercontent.com/DrewThomasson/ebook2audiobookXTTS/main/default_voice.wav\n\n\n\n# Start the app with Share=True for the gradio interface\n\n!python /kaggle/working/app.py --share True","metadata":{"id":"658BTHueyLMo","colab":{"base_uri":"https://localhost:8080/"},"outputId":"e293e70d-b25a-41bc-dbac-7ca1ddf1d3d2","trusted":true},"execution_count":null,"outputs":[]}]} \ No newline at end of file diff --git a/README.md b/README.md index 4bdb4640d8fe96a0c4788a6e7a72efb2aab958b6..660aa1a10b23d28d1dc7e525e307764bf8e13bb9 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,12 @@ --- -title: Ebook2audiobook V2.0 Beta -emoji: 🚀 +title: ebook2audiobook_docker_test +emoji: ⚡ colorFrom: indigo -colorTo: red -sdk: gradio -sdk_version: 5.9.0 -app_file: app.py -pinned: true +colorTo: indigo +sdk: docker +pinned: false license: apache-2.0 -short_description: Added improvements, 1107+ languages supported +short_description: First ebook2audiobook Dockerfile test --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference \ No newline at end of file +Check out the [Ebook2Audiobook]("https://github.com/DrewThomasson/ebook2audiobook") github repo! \ No newline at end of file diff --git a/app.py b/app.py deleted file mode 100644 index 1dbd37b7aef5eae89cbb7f75374451cc57d3e8c3..0000000000000000000000000000000000000000 --- a/app.py +++ /dev/null @@ -1,232 +0,0 @@ -import argparse -import os -import regex as re -import socket -import subprocess -import sys -import unidic - -from lib.conf import * -from lib.lang import language_mapping, default_language_code - -def check_python_version(): - current_version = sys.version_info[:2] # (major, minor) - if current_version < min_python_version or current_version > max_python_version: - error = f'''********** Error: Your OS Python version is not compatible! (current: {current_version[0]}.{current_version[1]}) - Please create a virtual python environment verrsion {min_python_version[0]}.{min_python_version[1]} or {max_python_version[0]}.{max_python_version[1]} - with conda or python -v venv **********''' - print(error) - return False - else: - return True - -def check_and_install_requirements(file_path): - if not os.path.exists(file_path): - print(f'Warning: File {file_path} not found. 
Skipping package check.') - try: - from importlib.metadata import version, PackageNotFoundError - with open(file_path, 'r') as f: - contents = f.read().replace('\r', '\n') - packages = [pkg.strip() for pkg in contents.splitlines() if pkg.strip()] - - missing_packages = [] - for package in packages: - # Extract package name without version specifier - pkg_name = re.split(r'[<>=]', package)[0].strip() - try: - installed_version = version(pkg_name) - except PackageNotFoundError: - print(f'{package} is missing.') - missing_packages.append(package) - pass - - if missing_packages: - print('\nInstalling missing packages...') - try: - subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'] + missing_packages) - except subprocess.CalledProcessError as e: - print(f'Failed to install packages: {e}') - return False - - return True - except Exception as e: - raise(f'An error occurred: {e}') - -def check_dictionary(): - unidic_path = unidic.DICDIR - dicrc = os.path.join(unidic_path, 'dicrc') - if not os.path.exists(dicrc) or os.path.getsize(dicrc) == 0: - try: - print('UniDic dictionary not found or incomplete. Downloading now...') - subprocess.run(['python', '-m', 'unidic', 'download'], check=True) - except subprocess.CalledProcessError as e: - print(f'Failed to download UniDic dictionary. Error: {e}') - raise SystemExit('Unable to continue without UniDic. Exiting...') - return True - -def is_port_in_use(port): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - return s.connect_ex(('0.0.0.0', port)) == 0 - -def main(): - global is_gui_process - - # Convert the list of languages to a string to display in the help text - lang_list_str = ', '.join(list(language_mapping.keys())) - - # Argument parser to handle optional parameters with descriptions - parser = argparse.ArgumentParser( - description='Convert eBooks to Audiobooks using a Text-to-Speech model. You can either launch the Gradio interface or run the script in headless mode for direct conversion.', - epilog=''' -Example usage: -Windows: - headless: - ebook2audiobook.cmd --headless --ebook 'path_to_ebook' - Graphic Interface: - ebook2audiobook.cmd -Linux/Mac: - headless: - ./ebook2audiobook.sh --headless --ebook 'path_to_ebook' - Graphic Interface: - ./ebook2audiobook.sh - ''', - formatter_class=argparse.RawTextHelpFormatter - ) - options = [ - '--script_mode', '--share', '--headless', - '--session', '--ebook', '--ebooks_dir', - '--voice', '--language', '--device', '--custom_model', - '--temperature', '--length_penalty', '--repetition_penalty', - '--top_k', '--top_p', '--speed', - '--enable_text_splitting', '--fine_tuned', - '--version', '--help' - ] - parser.add_argument(options[0], type=str, - help='Force the script to run in NATIVE or DOCKER_UTILS') - parser.add_argument(options[1], action='store_true', - help='Enable a public shareable Gradio link. Default to False.') - parser.add_argument(options[2], nargs='?', const=True, default=False, - help='Run in headless mode. Default to True if the flag is present without a value, False otherwise.') - parser.add_argument(options[3], type=str, - help='Session to reconnect in case of interruption (headless mode only)') - parser.add_argument(options[4], type=str, - help='Path to the ebook file for conversion. Required in headless mode.') - parser.add_argument(options[5], nargs='?', const='default', type=str, - help=f'Path to the directory containing ebooks for batch conversion. 
Default to "{os.path.basename(ebooks_dir)}" if "default" is provided.') - parser.add_argument(options[6], type=str, default=None, - help='Path to the target voice file for TTS. Optional, must be 24khz for XTTS and 16khz for fairseq models, uses a default voice if not provided.') - parser.add_argument(options[7], type=str, default=default_language_code, - help=f'Language for the audiobook conversion. Options: {lang_list_str}. Default to English (eng).') - parser.add_argument(options[8], type=str, default='cpu', choices=['cpu', 'gpu'], - help=f'Type of processor unit for the audiobook conversion. If not specified: check first if gpu available, if not cpu is selected.') - parser.add_argument(options[9], type=str, - help=f'Path to the custom model (.zip file containing {default_model_files}). Required if using a custom model.') - parser.add_argument(options[10], type=float, default=0.65, - help='Temperature for the model. Default to 0.65. Higher temperatures lead to more creative outputs.') - parser.add_argument(options[11], type=float, default=1.0, - help='A length penalty applied to the autoregressive decoder. Default to 1.0. Not applied to custom models.') - parser.add_argument(options[12], type=float, default=2.5, - help='A penalty that prevents the autoregressive decoder from repeating itself. Default to 2.5') - parser.add_argument(options[13], type=int, default=50, - help='Top-k sampling. Lower values mean more likely outputs and increased audio generation speed. Default to 50') - parser.add_argument(options[14], type=float, default=0.8, - help='Top-p sampling. Lower values mean more likely outputs and increased audio generation speed. Default to 0.8') - parser.add_argument(options[15], type=float, default=1.0, - help='Speed factor for the speech generation. Default to 1.0') - parser.add_argument(options[16], action='store_true', - help='Enable splitting text into sentences. Default to False.') - parser.add_argument(options[17], type=str, default=default_fine_tuned, - help='Name of the fine tuned model. Optional, uses the standard model according to the TTS engine and language.') - parser.add_argument(options[18], action='version',version=f'ebook2audiobook version {version}', - help='Show the version of the script and exit') - - for arg in sys.argv: - if arg.startswith('--') and arg not in options: - print(f'Error: Unrecognized option "{arg}"') - sys.exit(1) - - args = vars(parser.parse_args()) - - # Check if the port is already in use to prevent multiple launches - if not args['headless'] and is_port_in_use(interface_port): - print(f'Error: Port {interface_port} is already in use. 
The web interface may already be running.') - sys.exit(1) - - args['script_mode'] = args['script_mode'] if args['script_mode'] else NATIVE - args['share'] = args['share'] if args['share'] else False - - if args['script_mode'] == NATIVE: - check_pkg = check_and_install_requirements(requirements_file) - if check_pkg: - if not check_dictionary(): - sys.exit(1) - else: - print('Some packages could not be installed') - sys.exit(1) - - from lib.functions import web_interface, convert_ebook - - # Conditions based on the --headless flag - if args['headless']: - args['is_gui_process'] = False - args['audiobooks_dir'] = audiobooks_cli_dir - - # Condition to stop if both --ebook and --ebooks_dir are provided - if args['ebook'] and args['ebooks_dir']: - print('Error: You cannot specify both --ebook and --ebooks_dir in headless mode.') - sys.exit(1) - - # Condition 1: If --ebooks_dir exists, check value and set 'ebooks_dir' - if args['ebooks_dir']: - new_ebooks_dir = None - if args['ebooks_dir'] == 'default': - print(f'Using the default ebooks_dir: {ebooks_dir}') - new_ebooks_dir = os.path.abspath(ebooks_dir) - else: - # Check if the directory exists - if os.path.exists(args['ebooks_dir']): - new_ebooks_dir = os.path.abspath(args['ebooks_dir']) - else: - print(f'Error: The provided --ebooks_dir "{args["ebooks_dir"]}" does not exist.') - sys.exit(1) - - if os.path.exists(new_ebooks_dir): - for file in os.listdir(new_ebooks_dir): - # Process files with supported ebook formats - if any(file.endswith(ext) for ext in ebook_formats): - full_path = os.path.join(new_ebooks_dir, file) - print(f'Processing eBook file: {full_path}') - args['ebook'] = full_path - progress_status, audiobook_file = convert_ebook(args) - if audiobook_file is None: - print(f'Conversion failed: {progress_status}') - sys.exit(1) - else: - print(f'Error: The directory {new_ebooks_dir} does not exist.') - sys.exit(1) - - elif args['ebook']: - progress_status, audiobook_file = convert_ebook(args) - if audiobook_file is None: - print(f'Conversion failed: {progress_status}') - sys.exit(1) - - else: - print('Error: In headless mode, you must specify either an ebook file using --ebook or an ebook directory using --ebooks_dir.') - sys.exit(1) - else: - args['is_gui_process'] = True - passed_arguments = sys.argv[1:] - allowed_arguments = {'--share', '--script_mode'} - passed_args_set = {arg for arg in passed_arguments if arg.startswith('--')} - if passed_args_set.issubset(allowed_arguments): - web_interface(args) - else: - print('Error: In non-headless mode, no option or only --share can be passed') - sys.exit(1) - -if __name__ == '__main__': - if not check_python_version(): - sys.exit(1) - else: - main() diff --git a/ebook2audiobook.cmd b/ebook2audiobook.cmd deleted file mode 100644 index 4f24bb5b7441c27cfa1ac987a9a27b42aa7b348b..0000000000000000000000000000000000000000 --- a/ebook2audiobook.cmd +++ /dev/null @@ -1,285 +0,0 @@ -@echo off -setlocal enabledelayedexpansion - -:: Capture all arguments into ARGS -set "ARGS=%*" - -set "NATIVE=native" -set "DOCKER_UTILS=docker_utils" -set "FULL_DOCKER=full_docker" - -set "SCRIPT_MODE=%NATIVE%" -set "SCRIPT_DIR=%~dp0" - -set "PYTHON_VERSION=3.12" -set "DOCKER_UTILS_IMG=utils" -set "PYTHON_ENV=python_env" -set "CURRENT_ENV=" -set "PROGRAMS_LIST=calibre ffmpeg" - -set "CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe" -set "CONDA_INSTALLER=%TEMP%\Miniconda3-latest-Windows-x86_64.exe" -set "CONDA_INSTALL_DIR=%USERPROFILE%\miniconda3" -set 
"CONDA_PATH=%USERPROFILE%\miniconda3\bin" -set "PATH=%CONDA_PATH%;%PATH%" - -set "PROGRAMS_CHECK=0" -set "CONDA_CHECK_STATUS=0" -set "CONDA_RUN_INIT=0" -set "DOCKER_CHECK_STATUS=0" -set "DOCKER_BUILD_STATUS=0" - -set "CALIBRE_TEMP_DIR=C:\Windows\Temp\Calibre" - -if not exist "%CALIBRE_TEMP_DIR%" ( - mkdir "%CALIBRE_TEMP_DIR%" -) - -icacls "%CALIBRE_TEMP_DIR%" /grant Users:(OI)(CI)F /T - -for %%A in (%ARGS%) do ( - if "%%A"=="%DOCKER_UTILS%" ( - set "SCRIPT_MODE=%DOCKER_UTILS%" - break - ) -) - -cd /d "%SCRIPT_DIR%" - -:: Check if running inside Docker -if defined CONTAINER ( - echo Running in %FULL_DOCKER% mode - set "SCRIPT_MODE=%FULL_DOCKER%" - goto main -) - -echo Running in %SCRIPT_MODE% mode - -:: Check if running in a Conda environment -if defined CONDA_DEFAULT_ENV ( - set "CURRENT_ENV=%CONDA_PREFIX%" -) - -:: Check if running in a Python virtual environment -if defined VIRTUAL_ENV ( - set "CURRENT_ENV=%VIRTUAL_ENV%" -) - -for /f "delims=" %%i in ('where python') do ( - if defined CONDA_PREFIX ( - if /i "%%i"=="%CONDA_PREFIX%\Scripts\python.exe" ( - set "CURRENT_ENV=%CONDA_PREFIX%" - break - ) - ) else if defined VIRTUAL_ENV ( - if /i "%%i"=="%VIRTUAL_ENV%\Scripts\python.exe" ( - set "CURRENT_ENV=%VIRTUAL_ENV%" - break - ) - ) -) - -if not "%CURRENT_ENV%"=="" ( - echo Current python virtual environment detected: %CURRENT_ENV%. - echo This script runs with its own virtual env and must be out of any other virtual environment when it's launched. - goto failed -) - -goto conda_check - -:conda_check -where conda >nul 2>&1 -if %errorlevel% neq 0 ( - set "CONDA_CHECK_STATUS=1" -) else ( - if "%SCRIPT_MODE%"=="%DOCKER_UTILS%" ( - goto docker_check - exit /b - ) else ( - call :programs_check - ) -) -goto dispatch -exit /b - -:programs_check -set "missing_prog_array=" -for %%p in (%PROGRAMS_LIST%) do ( - set "FOUND=" - for /f "delims=" %%i in ('where %%p 2^>nul') do ( - set "FOUND=%%i" - ) - if not defined FOUND ( - echo %%p is not installed. - set "missing_prog_array=!missing_prog_array! %%p" - ) -) -if not "%missing_prog_array%"=="" ( - set "PROGRAMS_CHECK=1" -) -exit /b - -:docker_check -docker --version >nul 2>&1 -if %errorlevel% neq 0 ( - set "DOCKER_CHECK_STATUS=1" -) else ( - :: Verify Docker is running - call docker info >nul 2>&1 - if %errorlevel% neq 0 ( - set "DOCKER_CHECK_STATUS=1" - ) else ( - :: Check if the Docker socket is running - set "docker_socket=" - if exist \\.\pipe\docker_engine ( - set "docker_socket=Windows" - ) - if not defined docker_socket ( - echo Cannot connect to docker socket. Check if the docker socket is running. - goto failed - exit /b - ) else ( - :: Check if the Docker image is available - call docker images -q %DOCKER_UTILS_IMG% >nul 2>&1 - if %errorlevel% neq 0 ( - echo Docker image '%DOCKER_UTILS_IMG%' not found. Installing it now... - set "DOCKER_BUILD_STATUS=1" - ) else ( - goto dispatch - exit /b - ) - ) - ) -) -goto install_components -exit /b - -:install_components -:: Check if running as administrator -net session >nul 2>&1 -if %errorlevel% neq 0 ( - echo This script needs to be run as administrator. - echo Attempting to restart with administrator privileges... 
- if defined ARGS ( - call powershell -ExecutionPolicy Bypass -Command "Start-Process '%~f0' -ArgumentList '%ARGS%' -WorkingDirectory '%SCRIPT_DIR%' -Verb RunAs" - ) else ( - call powershell -ExecutionPolicy Bypass -Command "Start-Process '%~f0' -WorkingDirectory '%SCRIPT_DIR%' -Verb RunAs" - ) - exit /b -) -:: Install Chocolatey if not already installed -choco -v >nul 2>&1 -if %errorlevel% neq 0 ( - echo Chocolatey is not installed. Installing Chocolatey... - call powershell -ExecutionPolicy Bypass -Command "Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; Invoke-Expression ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))" -) -:: Install Python if not already installed -python --version >nul 2>&1 -if %errorlevel% neq 0 ( - echo Python is not installed. Installing Python... - call choco install python -y -) -:: Install missing packages if any -if not "%PROGRAMS_CHECK%"=="0" ( - call choco install %missing_prog_array% -y --force - setx CALIBRE_TEMP_DIR "%CALIBRE_TEMP_DIR%" /M - set "PROGRAMS_CHECK=0" - set "missing_prog_array=" -) -:: Install Conda if not already installed -if not "%CONDA_CHECK_STATUS%"=="0" ( - echo Installing Conda... - call powershell -Command "[System.Environment]::SetEnvironmentVariable('Path', [System.Environment]::GetEnvironmentVariable('Path','Machine') + ';' + [System.Environment]::GetEnvironmentVariable('Path','User'),'Process')" - echo Downloading Conda installer... - call bitsadmin /transfer "MinicondaDownload" %CONDA_URL% "%CONDA_INSTALLER%" - "%CONDA_INSTALLER%" /InstallationType=JustMe /RegisterPython=0 /AddToPath=1 /S /D=%CONDA_INSTALL_DIR% - if exist "%CONDA_INSTALL_DIR%\condabin\conda.bat" ( - echo Conda installed successfully. - set "CONDA_RUN_INIT=1" - set "CONDA_CHECK_STATUS=0" - set "PATH=%CONDA_INSTALL_DIR%\condabin;%PATH%" - ) -) -:: Install Docker if not already installed -if not "%DOCKER_CHECK_STATUS%"=="0" ( - echo Docker is not installed. Installing it now... - call choco install docker-cli docker-engine -y - call docker --version >nul 2>&1 - if %errorlevel% equ 0 ( - echo Starting Docker Engine... - net start com.docker.service >nul 2>&1 - if %errorlevel% equ 0 ( - echo Docker installed and started successfully. - set "DOCKER_CHECK_STATUS=0" - ) - ) -) -:: Build Docker image if required -if not "%DOCKER_BUILD_STATUS%"=="0" ( - call conda activate "%SCRIPT_DIR%\%PYTHON_ENV%" - call python -m pip install -e . - call docker build -f DockerfileUtils -t utils . - call conda deactivate - call docker images -q %DOCKER_UTILS_IMG% >nul 2>&1 - if %errorlevel% equ 0 ( - set "DOCKER_BUILD_STATUS=0" - ) -) -net session >nul 2>&1 -if %errorlevel% equ 0 ( - echo Restarting in user mode... 
- start "" /b cmd /c "%~f0" %ARGS% - exit /b -) -goto dispatch -exit /b - -:dispatch -if "%PROGRAMS_CHECK%"=="0" ( - if "%CONDA_CHECK_STATUS%"=="0" ( - if "%DOCKER_CHECK_STATUS%"=="0" ( - if "%DOCKER_BUILD_STATUS%"=="0" ( - goto main - exit /b - ) - ) else ( - goto failed - exit /b - ) - ) -) -echo PROGRAMS_CHECK: %PROGRAMS_CHECK% -echo CONDA_CHECK_STATUS: %CONDA_CHECK_STATUS% -echo DOCKER_CHECK_STATUS: %DOCKER_CHECK_STATUS% -echo DOCKER_BUILD_STATUS: %DOCKER_BUILD_STATUS% -timeout /t 5 /nobreak >nul -goto install_components -exit /b - -:main -if "%SCRIPT_MODE%"=="%FULL_DOCKER%" ( - python %SCRIPT_DIR%\app.py --script_mode %FULL_DOCKER% %ARGS% -) else ( - if "%CONDA_RUN_INIT%"=="1" ( - call conda init - set "CONDA_RUN_INIT=0" - ) - if not exist "%SCRIPT_DIR%\%PYTHON_ENV%" ( - call conda create --prefix %SCRIPT_DIR%\%PYTHON_ENV% python=%PYTHON_VERSION% -y - call conda activate %SCRIPT_DIR%\%PYTHON_ENV% - call python -m pip install --upgrade pip - call python -m pip install --upgrade -r requirements.txt --progress-bar=on - ) else ( - call conda activate %SCRIPT_DIR%\%PYTHON_ENV% - ) - python %SCRIPT_DIR%\app.py --script_mode %SCRIPT_MODE% %ARGS% - call conda deactivate -) -exit /b - -:failed -echo ebook2audiobook is not correctly installed or run. -exit /b - -endlocal -pause \ No newline at end of file diff --git a/ebook2audiobook.sh b/ebook2audiobook.sh deleted file mode 100644 index 6691d726ca12ca3e0343dc3f55edde12da5f0b7e..0000000000000000000000000000000000000000 --- a/ebook2audiobook.sh +++ /dev/null @@ -1,300 +0,0 @@ -#!/usr/bin/env bash - -PYTHON_VERSION="3.12" -export TTS_CACHE="./models" - -ARGS="$@" - -# Declare an associative array -declare -A arguments - -# Parse arguments -while [[ "$#" -gt 0 ]]; do - case "$1" in - --*) - key="${1/--/}" # Remove leading '--' - if [[ -n "$2" && ! 
"$2" =~ ^-- ]]; then - # If the next argument is a value (not another option) - arguments[$key]="$2" - shift # Move past the value - else - # Set to true for flags without values - arguments[$key]=true - fi - ;; - *) - echo "Unknown option: $1" - exit 1 - ;; - esac - shift # Move to the next argument -done - -NATIVE="native" -DOCKER_UTILS="docker_utils" -FULL_DOCKER="full_docker" - -SCRIPT_MODE="$NATIVE" -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -WGET=$(which wget 2>/dev/null) -REQUIRED_PROGRAMS=("calibre" "ffmpeg") -DOCKER_UTILS_IMG="utils" -PYTHON_ENV="python_env" -CURRENT_ENV="" - -if [[ "$OSTYPE" = "darwin"* ]]; then - CONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh" -else - CONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" -fi -CONDA_INSTALLER=/tmp/Miniconda3-latest.sh -CONDA_INSTALL_DIR=$HOME/miniconda3 -CONDA_PATH=$HOME/miniconda3/bin -CONDA_ENV=~/miniconda3/etc/profile.d/conda.sh -CONFIG_FILE="$HOME/.bashrc" -PATH="$CONDA_PATH:$PATH" - -declare -a programs_missing - -# Check if the current script is run inside a docker container -if [[ -n "$container" || -f /.dockerenv ]]; then - SCRIPT_MODE="$FULL_DOCKER" -else - if [[ -n "${arguments['script_mode']+exists}" ]]; then - if [ "${arguments['script_mode']}" = "$NATIVE" ] || [ "${arguments['script_mode']}" = "$DOCKER_UTILS" ]; then - SCRIPT_MODE="${arguments['script_mode']}" - fi - fi -fi - -# Check if running in a Conda or Python virtual environment -if [[ -n "$CONDA_DEFAULT_ENV" ]]; then - CURRENT_ENV="$CONDA_PREFIX" -elif [[ -n "$VIRTUAL_ENV" ]]; then - CURRENT_ENV="$VIRTUAL_ENV" -fi - -# If neither environment variable is set, check Python path -if [[ -z "$CURRENT_ENV" ]]; then - PYTHON_PATH=$(which python 2>/dev/null) - if [[ ( -n "$CONDA_PREFIX" && "$PYTHON_PATH" == "$CONDA_PREFIX/bin/python" ) || ( -n "$VIRTUAL_ENV" && "$PYTHON_PATH" == "$VIRTUAL_ENV/bin/python" ) ]]; then - CURRENT_ENV="${CONDA_PREFIX:-$VIRTUAL_ENV}" - fi -fi - -# Output result if a virtual environment is detected -if [[ -n "$CURRENT_ENV" ]]; then - echo -e "Current python virtual environment detected: $CURRENT_ENV." - echo -e "This script runs with its own virtual env and must be out of any other virtual environment when it's launched." - echo -e "If you are using miniconda then you would type in:" - echo -e "conda deactivate" - exit 1 -fi - -function required_programs_check { - local programs=("$@") - for program in "${programs[@]}"; do - if ! command -v "$program" >/dev/null 2>&1; then - echo -e "\e[33m$program is not installed.\e[0m" - programs_missing+=($program) - fi - done - local count=${#programs_missing[@]} - if [[ $count -eq 0 ]]; then - return 0 - else - return 1 - fi -} - -function install_programs { - echo -e "\e[33mInstalling required programs. NOTE: you must have 'sudo' priviliges or it will fail.\e[0m" - if [[ "$OSTYPE" = "darwin"* ]]; then - PACK_MGR="brew install" - if ! command -v brew &> /dev/null; then - echo -e "\e[33mHomebrew is not installed. 
Installing Homebrew...\e[0m" - /usr/bin/env bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" - echo 'eval "$(/opt/homebrew/bin/brew shellenv)"' >> ~/.zprofile - eval "$(/opt/homebrew/bin/brew shellenv)" - fi - else - if command -v emerge &> /dev/null; then - PACK_MGR="sudo emerge" - elif command -v dnf &> /dev/null; then - PACK_MGR="sudo dnf install" - PACK_MGR_OPTIONS="-y" - elif command -v yum &> /dev/null; then - PACK_MGR="sudo yum install" - PACK_MGR_OPTIONS="-y" - elif command -v zypper &> /dev/null; then - PACK_MGR="sudo zypper install" - PACK_MGR_OPTIONS="-y" - elif command -v pacman &> /dev/null; then - PACK_MGR="sudo pacman -Sy" - elif command -v apt-get &> /dev/null; then - sudo apt-get update - PACK_MGR="sudo apt-get install" - PACK_MGR_OPTIONS="-y" - elif command -v apk &> /dev/null; then - PACK_MGR="sudo apk add" - else - echo "Cannot recognize your applications package manager. Please install the required applications manually." - return 1 - fi - - fi - if [ -z "$WGET" ]; then - echo -e "\e[33m wget is missing! trying to install it... \e[0m" - result=$(eval "$PACK_MGR wget $PACK_MGR_OPTIONS" 2>&1) - result_code=$? - if [ $result_code -eq 0 ]; then - WGET=$(which wget 2>/dev/null) - else - echo "Cannot 'wget'. Please install 'wget' manually." - return 1 - fi - fi - for program in "${programs_missing[@]}"; do - if [ "$program" = "calibre" ];then - # avoid conflict with calibre builtin lxml - pip uninstall lxml -y 2>/dev/null - echo -e "\e[33mInstalling Calibre...\e[0m" - if [[ "$OSTYPE" = "darwin"* ]]; then - eval "$PACK_MGR --cask calibre" - else - $WGET -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sh /dev/stdin - fi - if command -v calibre >/dev/null 2>&1; then - echo -e "\e[32m===============>>> Calibre is installed! <<===============\e[0m" - else - echo "Calibre installation failed." - fi - else - eval "$PACK_MGR $program $PKG_MGR_OPTIONS" - if command -v $program >/dev/null 2>&1; then - echo -e "\e[32m===============>>> $program is installed! <<===============\e[0m" - else - echo "$program installation failed." - fi - fi - done - if required_programs_check "${REQUIRED_PROGRAMS[@]}"; then - return 0 - else - echo -e "\e[33mYou can run 'ebook2audiobook.sh --script_mode docker_utils' to avoid to install $REQUIRED_PROGRAMS natively.\e[0m" - return 1 - fi -} - -function conda_check { - if ! command -v conda &> /dev/null; then - echo -e "\e[33mconda is not installed!\e[0m" - echo -e "\e[33mDownloading conda installer...\e[0m" - wget -O "$CONDA_INSTALLER" "$CONDA_URL" - if [[ -f "$CONDA_INSTALLER" ]]; then - echo -e "\e[33mInstalling Miniconda...\e[0m" - bash "$CONDA_INSTALLER" -u -b -p "$CONDA_INSTALL_DIR" - rm -f "$CONDA_INSTALLER" - if [[ -f "$CONDA_INSTALL_DIR/bin/conda" ]]; then - conda init - echo -e "\e[32m===============>>> conda is installed! <<===============\e[0m" - else - echo -e "\e[31mconda installation failed.\e[0m" - return 1 - fi - else - echo -e "\e[31mFailed to download Miniconda installer.\e[0m" - echo -e "\e[33mI'ts better to use the install.sh to install everything needed.\e[0m" - return 1 - fi - fi - if [[ ! 
-d $SCRIPT_DIR/$PYTHON_ENV ]]; then - # Use this condition to chmod writable folders once - chmod -R 777 ./audiobooks ./tmp ./models - conda create --prefix $SCRIPT_DIR/$PYTHON_ENV python=$PYTHON_VERSION -y - source $CONDA_ENV - conda activate $SCRIPT_DIR/$PYTHON_ENV - python -m pip install --upgrade pip - python -m pip install --upgrade -r requirements.txt --progress-bar=on - conda deactivate - fi - return 0 -} - -function docker_check { - if ! command -v docker &> /dev/null; then - echo -e "\e[33m docker is missing! trying to install it... \e[0m" - if [[ "$OSTYPE" == "darwin"* ]]; then - echo "Installing Docker using Homebrew..." - $PACK_MGR --cask docker $PACK_MGR_OPTIONS - else - $WGET -qO get-docker.sh https://get.docker.com && \ - sudo sh get-docker.sh - sudo systemctl start docker - sudo systemctl enable docker - docker run hello-world - rm -f get-docker.sh - fi - echo -e "\e[32m===============>>> docker is installed! <<===============\e[0m" - docker_build - else - # Check if Docker service is running - if docker info >/dev/null 2>&1; then - if [[ "$(docker images -q $DOCKER_UTILS_IMG 2> /dev/null)" = "" ]]; then - docker_build - fi - else - echo -e "\e[33mDocker is not running\e[0m" - return 1 - fi - fi - return 0 -} - -function docker_build { -# Check if the Docker socket is accessible - if [[ -e /var/run/docker.sock || -e /run/docker.sock ]]; then - echo -e "\e[33mDocker image '$DOCKER_UTILS_IMG' not found. Trying to build it...\e[0m" - docker build -f DockerfileUtils -t utils . - else - echo -e "\e[33mcannot connect to docker socket. Check if the docker socket is running.\e[0m" - fi -} - -if [ "$SCRIPT_MODE" = "$FULL_DOCKER" ]; then - echo -e "\e[33mRunning in $FULL_DOCKER mode\e[0m" - python app.py --script_mode $SCRIPT_MODE $ARGS -elif [[ "$SCRIPT_MODE" == "$NATIVE" || "$SCRIPT_MODE" = "$DOCKER_UTILS" ]]; then - pass=true - if [ "$SCRIPT_MODE" == "$NATIVE" ]; then - echo -e "\e[33mRunning in $NATIVE mode\e[0m" - if ! required_programs_check "${REQUIRED_PROGRAMS[@]}"; then - if ! 
install_programs; then - pass=false - fi - fi - else - echo -e "\e[33mRunning in $DOCKER_UTILS mode\e[0m" - if conda_check; then - if docker_check; then - source $CONDA_ENV - conda activate $SCRIPT_DIR/$PYTHON_ENV - python app.py --script_mode $DOCKER_UTILS $ARGS - conda deactivate - fi - fi - fi - if [ $pass = true ]; then - if conda_check; then - source $CONDA_ENV - conda activate $SCRIPT_DIR/$PYTHON_ENV - python app.py --script_mode $SCRIPT_MODE $ARGS - conda deactivate - fi - fi -else - echo -e "\e[33mebook2audiobook is not correctly installed or run.\e[0m" -fi - -exit 0 diff --git a/legacy/v1.0/Dockerfile b/legacy/v1.0/Dockerfile deleted file mode 100644 index fb196187bcbe52a502d63afd66911a741f4e568b..0000000000000000000000000000000000000000 --- a/legacy/v1.0/Dockerfile +++ /dev/null @@ -1,93 +0,0 @@ -# Use an official NVIDIA CUDA image with cudnn8 and Ubuntu 20.04 as the base -FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04 - -# Set non-interactive installation to avoid timezone and other prompts -ENV DEBIAN_FRONTEND=noninteractive - -# Install necessary packages including Miniconda -RUN apt-get update && apt-get install -y --no-install-recommends \ - wget \ - git \ - espeak \ - espeak-ng \ - ffmpeg \ - tk \ - mecab \ - libmecab-dev \ - mecab-ipadic-utf8 \ - build-essential \ - calibre \ - && rm -rf /var/lib/apt/lists/* - -RUN ebook-convert --version - -# Install Miniconda -RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ - bash ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh - - - -# Set PATH to include conda -ENV PATH=/opt/conda/bin:$PATH - -# Create a conda environment with Python 3.10 -RUN conda create -n ebookenv python=3.10 -y - -# Activate the conda environment -SHELL ["conda", "run", "-n", "ebookenv", "/bin/bash", "-c"] - -# Install Python dependencies using conda and pip -RUN conda install -n ebookenv -c conda-forge \ - pydub \ - nltk \ - mecab-python3 \ - && pip install --no-cache-dir \ - bs4 \ - beautifulsoup4 \ - ebooklib \ - tqdm \ - tts==0.21.3 \ - unidic \ - gradio - -# Download unidic -RUN python -m unidic download - -# Set the working directory in the container -WORKDIR /ebook2audiobookXTTS - -# Clone the ebook2audiobookXTTS repository -RUN git clone https://github.com/DrewThomasson/ebook2audiobookXTTS.git . 
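Note: the `git clone` above tracks the repository's default branch at image-build time, so rebuilding this legacy image later could silently pick up different code. Pinning a known-good commit right after the clone (e.g. `RUN git clone ... . && git checkout <commit>`; the hash is left to the maintainer, and the pin itself is a suggestion, not part of the original Dockerfile) would make the build reproducible.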
- -# Copy test audio file -COPY default_voice.wav /ebook2audiobookXTTS/ - -# Run a test to set up XTTS -RUN echo "import torch" > /tmp/script1.py && \ - echo "from TTS.api import TTS" >> /tmp/script1.py && \ - echo "device = 'cuda' if torch.cuda.is_available() else 'cpu'" >> /tmp/script1.py && \ - echo "print(TTS().list_models())" >> /tmp/script1.py && \ - echo "tts = TTS('tts_models/multilingual/multi-dataset/xtts_v2').to(device)" >> /tmp/script1.py && \ - echo "wav = tts.tts(text='Hello world!', speaker_wav='default_voice.wav', language='en')" >> /tmp/script1.py && \ - echo "tts.tts_to_file(text='Hello world!', speaker_wav='default_voice.wav', language='en', file_path='output.wav')" >> /tmp/script1.py && \ - yes | python /tmp/script1.py - -# Remove the test audio file -RUN rm -f /ebook2audiobookXTTS/output.wav - -# Verify that the script exists and has the correct permissions -RUN ls -la /ebook2audiobookXTTS/ - -# Check if the script exists and log its presence -RUN if [ -f /ebook2audiobookXTTS/custom_model_ebook2audiobookXTTS_with_link_gradio.py ]; then echo "Script found."; else echo "Script not found."; exit 1; fi - -# Modify the Python script to set share=True -RUN sed -i 's/demo.launch(share=False)/demo.launch(share=True)/' /ebook2audiobookXTTS/custom_model_ebook2audiobookXTTS_with_link_gradio.py - -# Download the punkt package for nltk -RUN python -m nltk.downloader punkt - -# Set the command to run your GUI application using the conda environment -CMD ["conda", "run", "--no-capture-output", "-n", "ebookenv", "python", "/ebook2audiobookXTTS/custom_model_ebook2audiobookXTTS_with_link_gradio.py"] - diff --git a/legacy/v1.0/LICENSE b/legacy/v1.0/LICENSE deleted file mode 100644 index 7fe2ff326e4f6b887789fde344b515c10c3c48dc..0000000000000000000000000000000000000000 --- a/legacy/v1.0/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2024 Drew Thomasson - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
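The `echo` chain in the legacy Dockerfile above assembles a throwaway warm-up script so that the XTTS v2 model weights are downloaded at image-build time rather than on first use. Unrolled into plain Python for readability, it is equivalent to the `/tmp/script1.py` the Dockerfile writes (running it standalone assumes the `TTS` package is installed and `default_voice.wav` is present):

```python
# Warm-up script equivalent to the one the legacy Dockerfile builds line by
# line with echo: it forces Coqui TTS to fetch the XTTS v2 model during build.
import torch
from TTS.api import TTS

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(TTS().list_models())
tts = TTS('tts_models/multilingual/multi-dataset/xtts_v2').to(device)
wav = tts.tts(text='Hello world!', speaker_wav='default_voice.wav', language='en')
tts.tts_to_file(text='Hello world!', speaker_wav='default_voice.wav',
                language='en', file_path='output.wav')
```

The `yes |` pipe in the Dockerfile auto-accepts Coqui's license prompt during this run, and the generated `output.wav` is removed in the following layer.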
diff --git a/legacy/v1.0/Notebooks/Kaggel Archive Code/4.wav b/legacy/v1.0/Notebooks/Kaggel Archive Code/4.wav deleted file mode 100644 index 7d9b5213bf9d351cf5a19d67b244010a39357127..0000000000000000000000000000000000000000 Binary files a/legacy/v1.0/Notebooks/Kaggel Archive Code/4.wav and /dev/null differ diff --git a/legacy/v1.0/Notebooks/Kaggel Archive Code/LICENSE b/legacy/v1.0/Notebooks/Kaggel Archive Code/LICENSE deleted file mode 100644 index 7fe2ff326e4f6b887789fde344b515c10c3c48dc..0000000000000000000000000000000000000000 --- a/legacy/v1.0/Notebooks/Kaggel Archive Code/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2024 Drew Thomasson - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/legacy/v1.0/Notebooks/Kaggel Archive Code/README.md b/legacy/v1.0/Notebooks/Kaggel Archive Code/README.md deleted file mode 100644 index d61f07e55b3e857a769a31e3f36a76feaef23a5c..0000000000000000000000000000000000000000 --- a/legacy/v1.0/Notebooks/Kaggel Archive Code/README.md +++ /dev/null @@ -1,118 +0,0 @@ -# this is a sample for running on kaggle and it may not be updated frequently - -# ebook2audiobook kaggle eddition -Generates an audiobook with chapters and ebook metadata using Calibre and Xtts from Coqui tts, and with optional voice cloning, and supports multiple languages - -# import this notebook to kaggle -https://github.com/Rihcus/ebook2audiobookXTTS/blob/main/kaggle-ebook2audiobook-demo.ipynb - -## Features - -- Converts eBooks to text format using Calibre's `ebook-convert` tool. -- Splits the eBook into chapters for structured audio conversion. -- Uses XTTS from Coqui TTS for high-quality text-to-speech conversion. -- Optional voice cloning feature using a provided voice file. -- Supports different languages for text-to-speech conversion, with English as the default. -- Confirmed to run on only 4 gb ram - -## Requirements - -- Python 3.x -- `coqui-tts` Python package -- Calibre (for eBook conversion) -- FFmpeg (for audiobook file creation) -- Optional: Custom voice file for voice cloning - -### Installation Instructions for Dependencies - -Install Python 3.x from [Python.org](https://www.python.org/downloads/). 
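Once the tools listed below are installed, a quick way to confirm they are actually discoverable on PATH is a check like the following (a hedged sketch; it mirrors the `command -v`/`where` probes the launcher scripts perform, and the tool names come from the Requirements section above):

```python
# Hedged sketch: verify the external programs the pipeline shells out to.
import shutil

for tool in ("ebook-convert", "ffmpeg"):
    if shutil.which(tool) is None:
        raise SystemExit(f"{tool} not found on PATH - install it first.")
print("Calibre and FFmpeg are available.")
```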
- -Install Calibre: -- Ubuntu: `sudo apt-get install -y calibre` -- macOS: `brew install calibre` -- Windows(Powershell in Administrator mode): `choco install calibre` - -Install FFmpeg: -- Ubuntu: `sudo apt-get install -y ffmpeg` -- macOS: `brew install ffmpeg` -- Windows(Powershell in Administrator mode): `choco install ffmpeg` - -Install Mecab for (Non Latin-based Languages tts support)(Optional): -- Ubuntu: `sudo apt-get install -y mecab libmecab-dev mecab-ipadic-utf8` -- macOS: `brew install mecab`, `brew install mecab-ipadic` -- Windows(Powershell in Administrator mode no support for mecab-ipadic easy install so no Japanese for windows :/): `choco install mecab ` - -Install Python packages: -```bash -pip install tts pydub nltk beautifulsoup4 ebooklib tqdm -``` -(For non Latin-based Languages tts support)(Optional) -`python -m unidic download` -```bash -pip install mecab mecab-python3 unidic -``` - -### Supported Languages - -The script supports the following languages for text-to-speech conversion: - -English (en), -Spanish (es), -French (fr), -German (de), -Italian (it), -Portuguese (pt), -Polish (pl), -Turkish (tr), -Russian (ru), -Dutch (nl), -Czech (cs), -Arabic (ar), -Chinese (zh-cn), -Japanese (ja), -Hungarian (hu), -Korean (ko) - -Specify the language code when running the script to use these languages. - -### Usage - -Navigate to the script's directory in the terminal and execute one of the following commands: -If you have any trouble getting it to run in Windows then it should run fine in WSL2 - -Basic Usage: ALL PARAMETERS ARE MANDATORY WHEN CALLED THE SCRIPT - -```bash -python ebook2audiobook.py [path_to_voice_file] [language_code] -``` -Replace with the path to your eBook file. -include for voice cloning. -include to specify the language - - -## Demo - - - -https://github.com/DrewThomasson/ebook2audiobookXTTS/assets/126999465/bccd7240-f967-4d27-a87d-445034db7d21 - - - -### Supported ebook File Types: -.epub, .pdf, .mobi, .txt, .html, .rtf, .chm, .lit, .pdb, .fb2, .odt, .cbr, .cbz, .prc, .lrf, .pml, .snb, .cbc, .rb, and .tcr, -(Best results are from using epub or mobi for auto chapter detection) - -### outputs as a m4b with all book metadata and chapters, example output file in an audiobook player app -![Example_of_output_in_audiobook_program](https://github.com/DrewThomasson/VoxNovel/blob/dc5197dff97252fa44c391dc0596902d71278a88/readme_files/example_in_app.jpeg) - -A special thanks to the creaters of: - - --Coqui TTS - --https://github.com/coqui-ai/TTS - - --Calibre - --https://calibre-ebook.com diff --git a/legacy/v1.0/Notebooks/Kaggel Archive Code/Worker_2T4.sh b/legacy/v1.0/Notebooks/Kaggel Archive Code/Worker_2T4.sh deleted file mode 100644 index f366fdc1c8fbe3f906f80f0a9d79a2e68fe49af7..0000000000000000000000000000000000000000 --- a/legacy/v1.0/Notebooks/Kaggel Archive Code/Worker_2T4.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -workers=$1 - -# Clean up operator directory -rm -rf "./Operator" -rm -rf "./Chapter_wav_files" -mkdir "./Operator" -mkdir "./Chapter_wav_files" - - -# Make appropriate temp directories -for i in $(seq 1 $workers); do - mkdir "./Operator/$i" - mkdir "./Operator/$i/temp" - mkdir "./Operator/$i/temp_ebook" -done - -echo "Created $workers directories" - -#Divide the chapters -share=1 -for FILE in ./Working_files/temp_ebook/*; do - cp $FILE "./Operator/$share/temp_ebook/" - if [ $share -lt $workers ]; - then - share=$((share+1)) - else - share=1 - fi -done - -echo "Split chapters into operator" - -#Run audio generation -#for i in $(seq 1 
$workers); do -# echo "Starting Worker $i" -# python p2a_worker.py $i & -#done - -gpu=1 -for i in $(seq 1 $workers); do - if [ $gpu -lt 2 ]; - then - echo "Starting Worker $i on GPU 1" - python p2a_worker_gpu1.py $i & #Run audio generation GPU 1 T4 - gpu=2 # switch to gpu 2 on next loop - else - echo "Starting Worker $i on GPU 2" - python p2a_worker_gpu2.py $i & #Run audio generation GPU 2 T4 - gpu=1 # switch to gpu 1 on next loop - fi -done - - - -echo "All workers started waiting for completion...." -wait -echo "Done!" diff --git a/legacy/v1.0/Notebooks/Kaggel Archive Code/default_voice.wav b/legacy/v1.0/Notebooks/Kaggel Archive Code/default_voice.wav deleted file mode 100644 index d98ca272441703d70a195f2c098a78a4ff6f100e..0000000000000000000000000000000000000000 Binary files a/legacy/v1.0/Notebooks/Kaggel Archive Code/default_voice.wav and /dev/null differ diff --git a/legacy/v1.0/Notebooks/Kaggel Archive Code/demo_mini_story_chapters_Drew.epub b/legacy/v1.0/Notebooks/Kaggel Archive Code/demo_mini_story_chapters_Drew.epub deleted file mode 100644 index 6f39e5a56e05f80fdca14bdb89431776461178dc..0000000000000000000000000000000000000000 Binary files a/legacy/v1.0/Notebooks/Kaggel Archive Code/demo_mini_story_chapters_Drew.epub and /dev/null differ diff --git a/legacy/v1.0/Notebooks/Kaggel Archive Code/ebook2audiobook.py b/legacy/v1.0/Notebooks/Kaggel Archive Code/ebook2audiobook.py deleted file mode 100644 index d224a7d8f2d02aa56539fe8efbbe7b952ab9066e..0000000000000000000000000000000000000000 --- a/legacy/v1.0/Notebooks/Kaggel Archive Code/ebook2audiobook.py +++ /dev/null @@ -1,462 +0,0 @@ -print("starting...") - -import os -import shutil -import subprocess -import re -from pydub import AudioSegment -import tempfile -from pydub import AudioSegment -import os -import nltk -from nltk.tokenize import sent_tokenize -nltk.download('punkt') # Make sure to download the necessary models -def is_folder_empty(folder_path): - if os.path.exists(folder_path) and os.path.isdir(folder_path): - # List directory contents - if not os.listdir(folder_path): - return True # The folder is empty - else: - return False # The folder is not empty - else: - print(f"The path {folder_path} is not a valid folder.") - return None # The path is not a valid folder - -def remove_folder_with_contents(folder_path): - try: - shutil.rmtree(folder_path) - print(f"Successfully removed {folder_path} and all of its contents.") - except Exception as e: - print(f"Error removing {folder_path}: {e}") - - - - -def wipe_folder(folder_path): - # Check if the folder exists - if not os.path.exists(folder_path): - print(f"The folder {folder_path} does not exist.") - return - - # Iterate over all the items in the given folder - for item in os.listdir(folder_path): - item_path = os.path.join(folder_path, item) - # If it's a file, remove it and print a message - if os.path.isfile(item_path): - os.remove(item_path) - print(f"Removed file: {item_path}") - # If it's a directory, remove it recursively and print a message - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - print(f"Removed directory and its contents: {item_path}") - - print(f"All contents wiped from {folder_path}.") - - -# Example usage -# folder_to_wipe = 'path_to_your_folder' -# wipe_folder(folder_to_wipe) - - -def create_m4b_from_chapters(input_dir, ebook_file, output_dir): - # Function to sort chapters based on their numeric order - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - # Extract 
metadata and cover image from the eBook file - def extract_metadata_and_cover(ebook_path): - try: - cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg' - subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True) - if os.path.exists(cover_path): - return cover_path - except Exception as e: - print(f"Error extracting eBook metadata or cover: {e}") - return None - # Combine WAV files into a single file - def combine_wav_files(chapter_files, output_path): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Sequentially append each file to the combined_audio - for chapter_file in chapter_files: - audio_segment = AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - # Ensure the output directory exists - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - - - # Main logic - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - # Cleanup - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Example usage -# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir') - - - - - - -#this code right here isnt the book grabbing thing but its before to refrence in ordero to create the sepecial chapter labeled book thing with calibre idk some systems cant seem to get it so just in case but the next bit of code after this is the book grabbing code with booknlp -import os -import subprocess -import ebooklib -from ebooklib import epub -from bs4 import BeautifulSoup -import re -import csv -import nltk - -# Only run the main script if Value is True -def create_chapter_labeled_book(ebook_file_path): - # Function to ensure the 
existence of a directory - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - # Convert the ebook to EPUB format using Calibre's ebook-convert - try: - subprocess.run(['ebook-convert', input_path, output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return False - return True - - def save_chapters_as_text(epub_path): - # Create the directory if it doesn't exist - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - # Open the EPUB file - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - # Iterate through the items in the EPUB file - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - # Use BeautifulSoup to parse HTML content - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - # Check if the text is not empty - if text.strip(): - if len(text) < 2300 and previous_filename: - # Append text to the previous chapter if it's short - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - # Create a new chapter file and increment the counter - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - folder_path = os.path.join(".", "Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, 
output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and sort them by chapter number - files = [f for f in os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as outfile: # Specify UTF-8 encoding here - for i, filename in enumerate(sorted_files): - with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -import os -import subprocess -import sys -import torchaudio - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment -# Assuming split_long_sentence and wipe_folder are defined elsewhere in your code - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -def split_long_sentence(sentence, max_length=249, max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param max_length: Maximum allowed length of a sentence. - :param max_pauses: Maximum allowed number of pauses in a sentence. - :return: A list of sentence parts that meet the criteria. 
- """ - parts = [] - while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS -def convert_chapters_to_audio(chapters_dir, output_audio_dir, target_voice_path=None, language=None): - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) # Set progress_bar to False to avoid nested progress bars - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - # Extract chapter number from the filename - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) - temp_audio_directory = os.path.join(".", "Working_files", "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - # Use the specified language model for sentence tokenization - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = [] - if language == "en": - fragments = split_long_sentence(sentence, max_length=249, max_pauses=10) - if language == "it": - fragments = split_long_sentence(sentence, max_length=213, max_pauses=10) - for fragment in fragments: - if fragment != "": #a hot fix to avoid blank fragments - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -# Main execution flow -if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python script.py [target_voice_file_path]") - sys.exit(1) - - ebook_file_path = sys.argv[1] - target_voice = sys.argv[2] if len(sys.argv) > 2 else None - language = sys.argv[3] if len(sys.argv) > 3 else None - - if not calibre_installed(): - sys.exit(1) - - working_files = os.path.join(".","Working_files", "temp_ebook") - 
full_folder_working_files =os.path.join(".","Working_files") - chapters_directory = os.path.join(".","Working_files", "temp_ebook") - output_audio_directory = os.path.join(".", 'Chapter_wav_files') - - print("Wiping and removeing Working_files folder...") - remove_folder_with_contents(full_folder_working_files) - - print("Wiping and and removeing chapter_wav_files folder...") - remove_folder_with_contents(output_audio_directory) - - create_chapter_labeled_book(ebook_file_path) - audiobook_output_path = os.path.join(".", "Audiobooks") - print(f"{chapters_directory}||||{output_audio_directory}|||||{target_voice}") - convert_chapters_to_audio(chapters_directory, output_audio_directory, target_voice, language) - create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path) diff --git a/legacy/v1.0/Notebooks/Kaggel Archive Code/kaggle-ebook2audiobook-demo.ipynb b/legacy/v1.0/Notebooks/Kaggel Archive Code/kaggle-ebook2audiobook-demo.ipynb deleted file mode 100644 index a0eca0c5c2ede50184d480f0f7813bdccc78b12e..0000000000000000000000000000000000000000 --- a/legacy/v1.0/Notebooks/Kaggel Archive Code/kaggle-ebook2audiobook-demo.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":30733,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"Install depdenencies","metadata":{}},{"cell_type":"code","source":"#!DEBIAN_FRONTEND=noninteractive\n!sudo apt-get update # && sudo apt-get -y upgrade\n!sudo apt-get -y install libegl1 \n!sudo apt-get -y install libopengl0\n!sudo apt-get -y install libxcb-cursor0\n!sudo -v && wget -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sudo sh /dev/stdin\n!sudo apt-get install -y ffmpeg\n!pip install tts pydub nltk beautifulsoup4 ebooklib tqdm\n!pip install numpy==1.26.4","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2024-06-17T21:17:43.474429Z","iopub.execute_input":"2024-06-17T21:17:43.474679Z","iopub.status.idle":"2024-06-17T21:20:20.992799Z","shell.execute_reply.started":"2024-06-17T21:17:43.474655Z","shell.execute_reply":"2024-06-17T21:20:20.991791Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Download modified ebook2audiobookXTTS\nhttps://github.com/Rihcus/ebook2audiobookXTTS\n\nOrigional unmodified version\nhttps://github.com/DrewThomasson/ebook2audiobookXTTS","metadata":{}},{"cell_type":"code","source":"!git clone https://github.com/Rihcus/ebook2audiobookXTTS","metadata":{"execution":{"iopub.status.busy":"2024-03-25T23:22:24.156772Z","iopub.execute_input":"2024-03-25T23:22:24.157618Z","iopub.status.idle":"2024-03-25T23:22:26.202486Z","shell.execute_reply.started":"2024-03-25T23:22:24.157577Z","shell.execute_reply":"2024-03-25T23:22:26.201179Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"(optional) Uploading your own epub book.\n\nBy default this notebook will use a sample epub book for testing/demo. 
\n\nIf you want to use your own book you will need to create a private kaggle data set, upload your epub to it, attach it to this notebook, and uncomment the two lines of code bellow, and update the data set path","metadata":{}},{"cell_type":"code","source":"# !cp -r /kaggle/input//*.epub /kaggle/working/ebook2audiobookXTTS #copy your custom book\n# !rm /kaggle/working/ebook2audiobookXTTS/demo_mini_story_chapters_Drew.epub #remove default sample book","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"This to install xtts_v2 models","metadata":{}},{"cell_type":"code","source":"import os\nos.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n\n!cd /kaggle/working/ebook2audiobookXTTS && tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 --text \"test\" --speaker_wav ./4.wav --language_idx en --use_cuda true","metadata":{"execution":{"iopub.status.busy":"2024-03-25T23:23:15.626677Z","iopub.execute_input":"2024-03-25T23:23:15.627585Z","iopub.status.idle":"2024-03-25T23:27:40.712856Z","shell.execute_reply.started":"2024-03-25T23:23:15.627548Z","shell.execute_reply":"2024-03-25T23:27:40.711852Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"This is a modified version of ebook2audiobookXTTS. \n\n- p1.py only runs the first part ebook2audiobookXTTS and generates chapter txts (I commented out other parts)\n - https://github.com/Rihcus/ebook2audiobookXTTS/blob/main/p1.py\n- Worker_2T4.sh as a basic attempt at multigpu support. The 4 argument processes of ebook2audiobook will be run in parallel\n - Worker_2T4 will try to divide the chapter in even groups based on number of workers (ex 4 group 4 workers)\n - It will try to divy up the work between kaggles two T4 GPUS\n - I'm not sure how much of a difference it makes since kaggles cpu limitations\n \nhttps://github.com/Rihcus/ebook2audiobookXTTS/blob/main/Worker_2T4.sh\n\nhttps://github.com/Rihcus/ebook2audiobookXTTS/blob/main/p2a_worker_gpu1.py\n\nhttps://github.com/Rihcus/ebook2audiobookXTTS/blob/main/p2a_worker_gpu2.py","metadata":{}},{"cell_type":"code","source":"!cd /kaggle/working/ebook2audiobookXTTS && python p1.py \"$(ls ./*.epub)\" \"4.wav\" \"en\"\n!cd /kaggle/working/ebook2audiobookXTTS && bash Worker_2T4.sh 4","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"p3.py runs the final ffmpeg command. 
ffmpeg has been a bit buggy\nhttps://github.com/Rihcus/ebook2audiobookXTTS/blob/main/p3.py","metadata":{}},{"cell_type":"code","source":"!cd /kaggle/working/ebook2audiobookXTTS && python p3.py \"$(ls ./*.epub)\" \"4.wav\" \"en\"","metadata":{},"execution_count":null,"outputs":[]}]} \ No newline at end of file diff --git a/legacy/v1.0/Notebooks/Kaggel Archive Code/p1.py b/legacy/v1.0/Notebooks/Kaggel Archive Code/p1.py deleted file mode 100644 index dec3eba3b93ce57a3649214414bcef521e3bd2a5..0000000000000000000000000000000000000000 --- a/legacy/v1.0/Notebooks/Kaggel Archive Code/p1.py +++ /dev/null @@ -1,462 +0,0 @@ -print("starting...") - -import os -import shutil -import subprocess -import re -from pydub import AudioSegment -import tempfile -from pydub import AudioSegment -import os -import nltk -from nltk.tokenize import sent_tokenize -nltk.download('punkt') # Make sure to download the necessary models -def is_folder_empty(folder_path): - if os.path.exists(folder_path) and os.path.isdir(folder_path): - # List directory contents - if not os.listdir(folder_path): - return True # The folder is empty - else: - return False # The folder is not empty - else: - print(f"The path {folder_path} is not a valid folder.") - return None # The path is not a valid folder - -def remove_folder_with_contents(folder_path): - try: - shutil.rmtree(folder_path) - print(f"Successfully removed {folder_path} and all of its contents.") - except Exception as e: - print(f"Error removing {folder_path}: {e}") - - - - -def wipe_folder(folder_path): - # Check if the folder exists - if not os.path.exists(folder_path): - print(f"The folder {folder_path} does not exist.") - return - - # Iterate over all the items in the given folder - for item in os.listdir(folder_path): - item_path = os.path.join(folder_path, item) - # If it's a file, remove it and print a message - if os.path.isfile(item_path): - os.remove(item_path) - print(f"Removed file: {item_path}") - # If it's a directory, remove it recursively and print a message - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - print(f"Removed directory and its contents: {item_path}") - - print(f"All contents wiped from {folder_path}.") - - -# Example usage -# folder_to_wipe = 'path_to_your_folder' -# wipe_folder(folder_to_wipe) - - -def create_m4b_from_chapters(input_dir, ebook_file, output_dir): - # Function to sort chapters based on their numeric order - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - # Extract metadata and cover image from the eBook file - def extract_metadata_and_cover(ebook_path): - try: - cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg' - subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True) - if os.path.exists(cover_path): - return cover_path - except Exception as e: - print(f"Error extracting eBook metadata or cover: {e}") - return None - # Combine WAV files into a single file - def combine_wav_files(chapter_files, output_path): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Sequentially append each file to the combined_audio - for chapter_file in chapter_files: - audio_segment = AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, 
metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - # Ensure the output directory exists - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - - - # Main logic - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - # Cleanup - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Example usage -# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir') - - - - - - -# This code isn't the part that extracts the book text; it is defined up front so it can be referenced when creating the chapter-labeled book with Calibre, since some systems can't seem to get that step right on their own. The actual book-extraction code (originally built around booknlp) comes right after this. -import os -import subprocess -import ebooklib -from ebooklib import epub -from bs4 import BeautifulSoup -import re -import csv -import nltk - -# Build a chapter-labeled copy of the book from the source ebook -def create_chapter_labeled_book(ebook_file_path): - # Function to ensure the existence of a directory - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - # Convert the ebook to EPUB format using Calibre's ebook-convert - try: - subprocess.run(['ebook-convert', input_path, output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return False - return True - - def save_chapters_as_text(epub_path): - # Create the directory if it doesn't exist - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - # Open the EPUB file - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - # Iterate through the items in the EPUB file - for item in book.get_items(): - if 
item.get_type() == ebooklib.ITEM_DOCUMENT: - # Use BeautifulSoup to parse HTML content - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - # Check if the text is not empty - if text.strip(): - if len(text) < 2300 and previous_filename: - # Append text to the previous chapter if it's short - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - # Create a new chapter file and increment the counter - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - folder_path = os.path.join(".", "Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and sort them by chapter number - files = [f for f in os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as outfile: # Specify UTF-8 encoding here - for i, filename in enumerate(sorted_files): - with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = 
os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -import os -import subprocess -import sys -import torchaudio - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment -# Assuming split_long_sentence and wipe_folder are defined elsewhere in your code - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -def split_long_sentence(sentence, max_length=249, max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param max_length: Maximum allowed length of a sentence. - :param max_pauses: Maximum allowed number of pauses in a sentence. - :return: A list of sentence parts that meet the criteria. - """ - parts = [] - while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' 
and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS -def convert_chapters_to_audio(chapters_dir, output_audio_dir, target_voice_path=None, language=None): - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) # Set progress_bar to False to avoid nested progress bars - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - # Extract chapter number from the filename - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) - temp_audio_directory = os.path.join(".", "Working_files", "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - # Use the specified language model for sentence tokenization - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = [] - if language == "en": - fragments = split_long_sentence(sentence, max_length=249, max_pauses=10) - if language == "it": - fragments = split_long_sentence(sentence, max_length=213, max_pauses=10) - for fragment in fragments: - if fragment != "": #a hot fix to avoid blank fragments - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -# Main execution flow -if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python script.py [target_voice_file_path]") - sys.exit(1) - - ebook_file_path = sys.argv[1] - target_voice = sys.argv[2] if len(sys.argv) > 2 else None - language = sys.argv[3] if len(sys.argv) > 3 else None - - if not calibre_installed(): - sys.exit(1) - - working_files = os.path.join(".","Working_files", "temp_ebook") - full_folder_working_files =os.path.join(".","Working_files") - chapters_directory = os.path.join(".","Working_files", "temp_ebook") - output_audio_directory = os.path.join(".", 'Chapter_wav_files') - - 
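    # --- Added explanatory note; not part of the original script -------------
    # Pipeline recap: create_chapter_labeled_book() below uses Calibre to
    # convert the ebook to EPUB and writes one chapter_<n>.txt per chapter into
    # Working_files/temp_ebook. In this Kaggle variant the remaining stages are
    # commented out here: convert_chapters_to_audio() runs in the
    # p2a_worker_gpu*.py scripts and the final M4B mux runs in p3.py. For
    # reference, generate_ffmpeg_metadata() above emits chapter markers in
    # FFMETADATA form, with START/END in milliseconds (TIMEBASE=1/1000), e.g.:
    #
    #   ;FFMETADATA1
    #   [CHAPTER]
    #   TIMEBASE=1/1000
    #   START=0
    #   END=183000
    #   title=Chapter 1
    # --------------------------------------------------------------------------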
print("Wiping and removeing Working_files folder...") - remove_folder_with_contents(full_folder_working_files) - - print("Wiping and and removeing chapter_wav_files folder...") - remove_folder_with_contents(output_audio_directory) - - create_chapter_labeled_book(ebook_file_path) -# audiobook_output_path = os.path.join(".", "Audiobooks") -# print(f"{chapters_directory}||||{output_audio_directory}|||||{target_voice}") -# convert_chapters_to_audio(chapters_directory, output_audio_directory, target_voice, language) -# create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path) diff --git a/legacy/v1.0/Notebooks/Kaggel Archive Code/p2a_worker_gpu1.py b/legacy/v1.0/Notebooks/Kaggel Archive Code/p2a_worker_gpu1.py deleted file mode 100644 index d39a187ed33b187fff0a219b48b8562c758338f5..0000000000000000000000000000000000000000 --- a/legacy/v1.0/Notebooks/Kaggel Archive Code/p2a_worker_gpu1.py +++ /dev/null @@ -1,465 +0,0 @@ -print("starting...") - -#import os -#import shutil -#import subprocess -import re -#from pydub import AudioSegment -#import tempfile -#from pydub import AudioSegment -#import os -import nltk -#from nltk.tokenize import sent_tokenize -nltk.download('punkt') # Make sure to download the necessary models -def is_folder_empty(folder_path): - if os.path.exists(folder_path) and os.path.isdir(folder_path): - # List directory contents - if not os.listdir(folder_path): - return True # The folder is empty - else: - return False # The folder is not empty - else: - print(f"The path {folder_path} is not a valid folder.") - return None # The path is not a valid folder - -def remove_folder_with_contents(folder_path): - try: - shutil.rmtree(folder_path) - print(f"Successfully removed {folder_path} and all of its contents.") - except Exception as e: - print(f"Error removing {folder_path}: {e}") - - - - -def wipe_folder(folder_path): - # Check if the folder exists - if not os.path.exists(folder_path): - print(f"The folder {folder_path} does not exist.") - return - - # Iterate over all the items in the given folder - for item in os.listdir(folder_path): - item_path = os.path.join(folder_path, item) - # If it's a file, remove it and print a message - if os.path.isfile(item_path): - os.remove(item_path) - print(f"Removed file: {item_path}") - # If it's a directory, remove it recursively and print a message - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - print(f"Removed directory and its contents: {item_path}") - - print(f"All contents wiped from {folder_path}.") - - -# Example usage -# folder_to_wipe = 'path_to_your_folder' -# wipe_folder(folder_to_wipe) - - -def create_m4b_from_chapters(input_dir, ebook_file, output_dir): - # Function to sort chapters based on their numeric order - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - # Extract metadata and cover image from the eBook file - def extract_metadata_and_cover(ebook_path): - try: - cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg' - subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True) - if os.path.exists(cover_path): - return cover_path - except Exception as e: - print(f"Error extracting eBook metadata or cover: {e}") - return None - # Combine WAV files into a single file - def combine_wav_files(chapter_files, output_path): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Sequentially append each file to the combined_audio - for chapter_file in chapter_files: - 
audio_segment = AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - # Ensure the output directory exists - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - - - # Main logic - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - # Cleanup - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Example usage -# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir') - - - - - - -#this code right here isnt the book grabbing thing but its before to refrence in ordero to create the sepecial chapter labeled book thing with calibre idk some systems cant seem to get it so just in case but the next bit of code after this is the book grabbing code with booknlp -#import os -#import subprocess -#import ebooklib -#from ebooklib import epub -#from bs4 import BeautifulSoup -#import re -#import csv -#import nltk - -# Only run the main script if Value is True -def create_chapter_labeled_book(ebook_file_path): - # Function to ensure the existence of a directory - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - # Convert the ebook to EPUB format using Calibre's ebook-convert - try: - subprocess.run(['ebook-convert', input_path, output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return False - return True - - def save_chapters_as_text(epub_path): - # Create the directory 
if it doesn't exist - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - # Open the EPUB file - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - # Iterate through the items in the EPUB file - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - # Use BeautifulSoup to parse HTML content - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - # Check if the text is not empty - if text.strip(): - if len(text) < 2300 and previous_filename: - # Append text to the previous chapter if it's short - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - # Create a new chapter file and increment the counter - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - folder_path = os.path.join(".", "Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and sort them by chapter number - files = [f for f in os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as outfile: # Specify UTF-8 encoding here - for i, filename in enumerate(sorted_files): - 
with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -#import os -import subprocess -import sys -import torchaudio # not sure if this is needed - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment -# Assuming split_long_sentence and wipe_folder are defined elsewhere in your code - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" -device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -def split_long_sentence(sentence, max_length=249, max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param max_length: Maximum allowed length of a sentence. - :param max_pauses: Maximum allowed number of pauses in a sentence. - :return: A list of sentence parts that meet the criteria. - """ - parts = [] - while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' 
and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS -def convert_chapters_to_audio(chapters_dir, output_audio_dir, target_voice_path=None, language=None): - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) # Set progress_bar to False to avoid nested progress bars - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - # Extract chapter number from the filename - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) -# temp_audio_directory = os.path.join(".", "Working_files", "temp") - temp_audio_directory = os.path.join(".", "Operator",worker_num, "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - # Use the specified language model for sentence tokenization - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = [] - if language == "en": - fragments = split_long_sentence(sentence, max_length=249, max_pauses=10) - if language == "it": - fragments = split_long_sentence(sentence, max_length=213, max_pauses=10) - for fragment in fragments: - if fragment != "": #a hot fix to avoid blank fragments - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -# Main execution flow -if __name__ == "__main__": -# if len(sys.argv) < 2: -# print("Usage: python script.py [target_voice_file_path]") -# sys.exit(1) - - worker_num = sys.argv[1] #to let the script know which temp dir its using in operator -# ebook_file_path = sys.argv[1] - target_voice = "./4.wav" # sys.argv[2] if len(sys.argv) > 2 else None - language = "en" # sys.argv[3] if len(sys.argv) > 3 else None - -# if not calibre_installed(): -# sys.exit(1) - - working_files = os.path.join(".","Working_files", "temp_ebook") - 
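    # --- Added note; not part of the original worker --------------------------
    # Each worker N reads its share of chapter txt files from
    # ./Operator/<N>/temp_ebook and stages fragment WAVs in ./Operator/<N>/temp,
    # while all workers write finished chapters to the shared
    # ./Chapter_wav_files directory. Worker_2T4.sh is assumed to have pre-split
    # the chapters across those Operator directories before launching the
    # workers; a hypothetical Python equivalent of that split (round-robin
    # here, though the real script may group chapters contiguously) could be:
    def _sketch_distribute_chapters(src_dir, num_workers):
        """Hypothetical helper: deal chapter_*.txt files out to the workers."""
        import shutil
        txts = sorted(f for f in os.listdir(src_dir) if f.endswith('.txt'))
        for i, name in enumerate(txts):
            dest = os.path.join(".", "Operator", str(i % num_workers + 1), "temp_ebook")
            os.makedirs(dest, exist_ok=True)
            shutil.copy(os.path.join(src_dir, name), dest)
    # ---------------------------------------------------------------------------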
full_folder_working_files =os.path.join(".","Working_files") -# chapters_directory = os.path.join(".","Working_files", "temp_ebook") - chapters_directory = os.path.join(".","Operator",worker_num, "temp_ebook") - output_audio_directory = os.path.join(".", 'Chapter_wav_files') - -# print("Wiping and removeing Working_files folder...") -# remove_folder_with_contents(full_folder_working_files) -# -# print("Wiping and and removeing chapter_wav_files folder...") -# remove_folder_with_contents(output_audio_directory) - -# create_chapter_labeled_book(ebook_file_path) - audiobook_output_path = os.path.join(".", "Audiobooks") - print(f"{chapters_directory}||||{output_audio_directory}|||||{target_voice}") - convert_chapters_to_audio(chapters_directory, output_audio_directory, target_voice, language) -# create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path) diff --git a/legacy/v1.0/Notebooks/Kaggel Archive Code/p2a_worker_gpu2.py b/legacy/v1.0/Notebooks/Kaggel Archive Code/p2a_worker_gpu2.py deleted file mode 100644 index 857aa89df181282de5b96bbd09b9eeb6dcd998a1..0000000000000000000000000000000000000000 --- a/legacy/v1.0/Notebooks/Kaggel Archive Code/p2a_worker_gpu2.py +++ /dev/null @@ -1,465 +0,0 @@ -print("starting...") - -#import os -#import shutil -#import subprocess -import re -#from pydub import AudioSegment -#import tempfile -#from pydub import AudioSegment -#import os -import nltk -#from nltk.tokenize import sent_tokenize -nltk.download('punkt') # Make sure to download the necessary models -def is_folder_empty(folder_path): - if os.path.exists(folder_path) and os.path.isdir(folder_path): - # List directory contents - if not os.listdir(folder_path): - return True # The folder is empty - else: - return False # The folder is not empty - else: - print(f"The path {folder_path} is not a valid folder.") - return None # The path is not a valid folder - -def remove_folder_with_contents(folder_path): - try: - shutil.rmtree(folder_path) - print(f"Successfully removed {folder_path} and all of its contents.") - except Exception as e: - print(f"Error removing {folder_path}: {e}") - - - - -def wipe_folder(folder_path): - # Check if the folder exists - if not os.path.exists(folder_path): - print(f"The folder {folder_path} does not exist.") - return - - # Iterate over all the items in the given folder - for item in os.listdir(folder_path): - item_path = os.path.join(folder_path, item) - # If it's a file, remove it and print a message - if os.path.isfile(item_path): - os.remove(item_path) - print(f"Removed file: {item_path}") - # If it's a directory, remove it recursively and print a message - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - print(f"Removed directory and its contents: {item_path}") - - print(f"All contents wiped from {folder_path}.") - - -# Example usage -# folder_to_wipe = 'path_to_your_folder' -# wipe_folder(folder_to_wipe) - - -def create_m4b_from_chapters(input_dir, ebook_file, output_dir): - # Function to sort chapters based on their numeric order - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - # Extract metadata and cover image from the eBook file - def extract_metadata_and_cover(ebook_path): - try: - cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg' - subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True) - if os.path.exists(cover_path): - return cover_path - except Exception as e: - print(f"Error extracting eBook metadata or cover: {e}") - 
return None - # Combine WAV files into a single file - def combine_wav_files(chapter_files, output_path): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Sequentially append each file to the combined_audio - for chapter_file in chapter_files: - audio_segment = AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - # Ensure the output directory exists - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - - - # Main logic - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - # Cleanup - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Example usage -# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir') - - - - - - -#this code right here isnt the book grabbing thing but its before to refrence in ordero to create the sepecial chapter labeled book thing with calibre idk some systems cant seem to get it so just in case but the next bit of code after this is the book grabbing code with booknlp -#import os -#import subprocess -#import ebooklib -#from ebooklib import epub -#from bs4 import BeautifulSoup -#import re -#import csv -#import nltk - -# Only run the main script if Value is True -def create_chapter_labeled_book(ebook_file_path): - # Function to ensure the existence of a directory - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - # Convert the ebook to EPUB format using Calibre's ebook-convert - 
try: - subprocess.run(['ebook-convert', input_path, output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return False - return True - - def save_chapters_as_text(epub_path): - # Create the directory if it doesn't exist - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - # Open the EPUB file - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - # Iterate through the items in the EPUB file - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - # Use BeautifulSoup to parse HTML content - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - # Check if the text is not empty - if text.strip(): - if len(text) < 2300 and previous_filename: - # Append text to the previous chapter if it's short - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - # Create a new chapter file and increment the counter - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - folder_path = os.path.join(".", "Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and 
sort them by chapter number - files = [f for f in os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as outfile: # Specify UTF-8 encoding here - for i, filename in enumerate(sorted_files): - with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -#import os -import subprocess -import sys -import torchaudio # not sure if this is needed - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment -# Assuming split_long_sentence and wipe_folder are defined elsewhere in your code - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" -device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") - -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -def split_long_sentence(sentence, max_length=249, max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param max_length: Maximum allowed length of a sentence. - :param max_pauses: Maximum allowed number of pauses in a sentence. - :return: A list of sentence parts that meet the criteria. - """ - parts = [] - while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' 
and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS -def convert_chapters_to_audio(chapters_dir, output_audio_dir, target_voice_path=None, language=None): - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) # Set progress_bar to False to avoid nested progress bars - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - # Extract chapter number from the filename - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) -# temp_audio_directory = os.path.join(".", "Working_files", "temp") - temp_audio_directory = os.path.join(".", "Operator",worker_num, "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - # Use the specified language model for sentence tokenization - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = [] - if language == "en": - fragments = split_long_sentence(sentence, max_length=249, max_pauses=10) - if language == "it": - fragments = split_long_sentence(sentence, max_length=213, max_pauses=10) - for fragment in fragments: - if fragment != "": #a hot fix to avoid blank fragments - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -# Main execution flow -if __name__ == "__main__": -# if len(sys.argv) < 2: -# print("Usage: python script.py [target_voice_file_path]") -# sys.exit(1) - - worker_num = sys.argv[1] #to let the script know which temp dir its using in operator -# ebook_file_path = sys.argv[1] - target_voice = "./4.wav" # sys.argv[2] if len(sys.argv) > 2 else None - language = "en" # sys.argv[3] if len(sys.argv) > 3 else None - -# if not calibre_installed(): -# sys.exit(1) - - working_files = os.path.join(".","Working_files", "temp_ebook") - 
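    # --- Added note; not part of the original worker --------------------------
    # This file is identical to p2a_worker_gpu1.py except that the module-level
    # device above is hard-coded to "cuda:1" instead of "cuda:0", pinning this
    # worker to Kaggle's second T4. A hypothetical single-script alternative
    # would derive the device from the worker number instead of duplicating the
    # file, e.g.:
    #
    #   gpu_index = (int(worker_num) - 1) % max(torch.cuda.device_count(), 1)
    #   device = torch.device(f"cuda:{gpu_index}" if torch.cuda.is_available() else "cpu")
    # ---------------------------------------------------------------------------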
full_folder_working_files =os.path.join(".","Working_files") -# chapters_directory = os.path.join(".","Working_files", "temp_ebook") - chapters_directory = os.path.join(".","Operator",worker_num, "temp_ebook") - output_audio_directory = os.path.join(".", 'Chapter_wav_files') - -# print("Wiping and removeing Working_files folder...") -# remove_folder_with_contents(full_folder_working_files) -# -# print("Wiping and and removeing chapter_wav_files folder...") -# remove_folder_with_contents(output_audio_directory) - -# create_chapter_labeled_book(ebook_file_path) - audiobook_output_path = os.path.join(".", "Audiobooks") - print(f"{chapters_directory}||||{output_audio_directory}|||||{target_voice}") - convert_chapters_to_audio(chapters_directory, output_audio_directory, target_voice, language) -# create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path) diff --git a/legacy/v1.0/Notebooks/Kaggel Archive Code/p3.py b/legacy/v1.0/Notebooks/Kaggel Archive Code/p3.py deleted file mode 100644 index a22a76c081ded8525e375cfc523358c717c314d8..0000000000000000000000000000000000000000 --- a/legacy/v1.0/Notebooks/Kaggel Archive Code/p3.py +++ /dev/null @@ -1,462 +0,0 @@ -print("starting...") - -import os -import shutil -import subprocess -import re -from pydub import AudioSegment -import tempfile -from pydub import AudioSegment -import os -import nltk -from nltk.tokenize import sent_tokenize -nltk.download('punkt') # Make sure to download the necessary models -def is_folder_empty(folder_path): - if os.path.exists(folder_path) and os.path.isdir(folder_path): - # List directory contents - if not os.listdir(folder_path): - return True # The folder is empty - else: - return False # The folder is not empty - else: - print(f"The path {folder_path} is not a valid folder.") - return None # The path is not a valid folder - -def remove_folder_with_contents(folder_path): - try: - shutil.rmtree(folder_path) - print(f"Successfully removed {folder_path} and all of its contents.") - except Exception as e: - print(f"Error removing {folder_path}: {e}") - - - - -def wipe_folder(folder_path): - # Check if the folder exists - if not os.path.exists(folder_path): - print(f"The folder {folder_path} does not exist.") - return - - # Iterate over all the items in the given folder - for item in os.listdir(folder_path): - item_path = os.path.join(folder_path, item) - # If it's a file, remove it and print a message - if os.path.isfile(item_path): - os.remove(item_path) - print(f"Removed file: {item_path}") - # If it's a directory, remove it recursively and print a message - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - print(f"Removed directory and its contents: {item_path}") - - print(f"All contents wiped from {folder_path}.") - - -# Example usage -# folder_to_wipe = 'path_to_your_folder' -# wipe_folder(folder_to_wipe) - - -def create_m4b_from_chapters(input_dir, ebook_file, output_dir): - # Function to sort chapters based on their numeric order - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - # Extract metadata and cover image from the eBook file - def extract_metadata_and_cover(ebook_path): - try: - cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg' - subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True) - if os.path.exists(cover_path): - return cover_path - except Exception as e: - print(f"Error extracting eBook metadata or cover: {e}") - return None - # Combine WAV files into a single 
file - def combine_wav_files(chapter_files, output_path): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Sequentially append each file to the combined_audio - for chapter_file in chapter_files: - audio_segment = AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - # Ensure the output directory exists - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - - - # Main logic - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - # Cleanup - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Example usage -# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir') - - - - - - -#this code right here isnt the book grabbing thing but its before to refrence in ordero to create the sepecial chapter labeled book thing with calibre idk some systems cant seem to get it so just in case but the next bit of code after this is the book grabbing code with booknlp -import os -import subprocess -import ebooklib -from ebooklib import epub -from bs4 import BeautifulSoup -import re -import csv -import nltk - -# Only run the main script if Value is True -def create_chapter_labeled_book(ebook_file_path): - # Function to ensure the existence of a directory - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - # Convert the ebook to EPUB format using Calibre's ebook-convert - try: - subprocess.run(['ebook-convert', input_path, 
output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return False - return True - - def save_chapters_as_text(epub_path): - # Create the directory if it doesn't exist - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - # Open the EPUB file - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - # Iterate through the items in the EPUB file - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - # Use BeautifulSoup to parse HTML content - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - # Check if the text is not empty - if text.strip(): - if len(text) < 2300 and previous_filename: - # Append text to the previous chapter if it's short - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - # Create a new chapter file and increment the counter - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - folder_path = os.path.join(".", "Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and sort them by chapter number - files = [f for f in 
os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as outfile: # Specify UTF-8 encoding here - for i, filename in enumerate(sorted_files): - with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -import os -import subprocess -import sys -import torchaudio - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment -# Assuming split_long_sentence and wipe_folder are defined elsewhere in your code - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -def split_long_sentence(sentence, max_length=249, max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param max_length: Maximum allowed length of a sentence. - :param max_pauses: Maximum allowed number of pauses in a sentence. - :return: A list of sentence parts that meet the criteria. - """ - parts = [] - while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' 
and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS -def convert_chapters_to_audio(chapters_dir, output_audio_dir, target_voice_path=None, language=None): - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) # Set progress_bar to False to avoid nested progress bars - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - # Extract chapter number from the filename - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) - temp_audio_directory = os.path.join(".", "Working_files", "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - # Use the specified language model for sentence tokenization - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - if language == "it": - fragments = split_long_sentence(sentence, max_length=213, max_pauses=10) - else: - # Default to the English length limit for any other language so sentences are never silently skipped - fragments = split_long_sentence(sentence, max_length=249, max_pauses=10) - for fragment in fragments: - if fragment != "": # a hotfix to avoid blank fragments - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -# Main execution flow -if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python script.py <ebook_file_path> [target_voice_file_path] [language_code]") - sys.exit(1) - - ebook_file_path = sys.argv[1] - target_voice = sys.argv[2] if len(sys.argv) > 2 else None - language = sys.argv[3] if len(sys.argv) > 3 else None - - if not calibre_installed(): - sys.exit(1) - - working_files = os.path.join(".", "Working_files", "temp_ebook") - full_folder_working_files = os.path.join(".", "Working_files") - chapters_directory = os.path.join(".", "Working_files", "temp_ebook") - output_audio_directory = os.path.join(".", 'Chapter_wav_files') - -# 
print("Wiping and removeing Working_files folder...") -# remove_folder_with_contents(full_folder_working_files) -# -# print("Wiping and and removeing chapter_wav_files folder...") -# remove_folder_with_contents(output_audio_directory) - -# create_chapter_labeled_book(ebook_file_path) - audiobook_output_path = os.path.join(".", "Audiobooks") -# print(f"{chapters_directory}||||{output_audio_directory}|||||{target_voice}") -# convert_chapters_to_audio(chapters_directory, output_audio_directory, target_voice, language) - create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path) diff --git a/legacy/v1.0/Notebooks/colab_ebook2audiobookxtts.ipynb b/legacy/v1.0/Notebooks/colab_ebook2audiobookxtts.ipynb deleted file mode 100644 index 0adfde9b721367015449005bacb908beb14c9d8a..0000000000000000000000000000000000000000 --- a/legacy/v1.0/Notebooks/colab_ebook2audiobookxtts.ipynb +++ /dev/null @@ -1,106 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "gpuType": "T4", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Welcome to the ebook2audiobookxtts free google colab!\n", - "## 🌟 Features\n", - "\n", - "- 📖 Converts eBooks to text format with Calibre.\n", - "- 📚 Splits eBook into chapters for organized audio.\n", - "- 🎙️ High-quality text-to-speech with Coqui XTTS.\n", - "- 🗣️ Optional voice cloning with your own voice file.\n", - "- 🌍 Supports multiple languages! (English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu), Korean (ko)).\n", - "## Want to run locally for free? ⬇\n", - "## [Check out the ebook2audiobookxtts github!](https://github.com/DrewThomasson/ebook2audiobookXTTS)" - ], - "metadata": { - "id": "DKNNnwD-HJwQ" - } - }, - { - "cell_type": "code", - "source": [ - "# @title 🛠️ Install requirments\n", - "#!DEBIAN_FRONTEND=noninteractive\n", - "!sudo apt-get update # && sudo apt-get -y upgrade\n", - "!sudo apt-get -y install libegl1\n", - "!sudo apt-get -y install libopengl0\n", - "!sudo apt-get -y install libxcb-cursor0\n", - "!sudo -v && wget -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sudo sh /dev/stdin\n", - "!sudo apt-get install -y ffmpeg\n", - "#!sudo apt-get install -y calibre\n", - "!pip install ebook2audiobook-install-counter\n", - "!pip install ebooklib\n", - "!pip install pydub\n", - "!pip install nltk\n", - "!pip install beautifulsoup4\n", - "!pip install tqdm\n", - "!pip install gradio\n", - "!pip install coqui-tts" - ], - "metadata": { - "id": "Edxj355K0rUz", - "collapsed": true, - "cellView": "form" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# @title 🚀 Run ebook2audiobookxtts! (Make sure to set the runtime to have gpu to have faster generation speeds! 
:)\n", - "#ntlk error fix\n", - "#https://github.com/delip/PyTorchNLPBook/issues/14\n", - "import nltk\n", - "nltk.download('punkt')\n", - "nltk.download('punkt_tab')\n", - "\n", - "#Auto agree to xtts\n", - "import os\n", - "os.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n", - "\n", - "# To download the app.py and the Default_voice wav if not seen locally\n", - "!wget -O /content/app.py https://raw.githubusercontent.com/DrewThomasson/ebook2audiobookXTTS/main/app.py\n", - "!wget -O /content/default_voice.wav https://raw.githubusercontent.com/DrewThomasson/ebook2audiobookXTTS/main/default_voice.wav\n", - "\n", - "# Start the app with Share=True for the gradio interface\n", - "!python /content/app.py --share True" - ], - "metadata": { - "id": "658BTHueyLMo", - "cellView": "form" - }, - "execution_count": null, - "outputs": [] - } - ] -} diff --git a/legacy/v1.0/Notebooks/kaggle-beta-of-ebook2audiobookxtts-ipynb.ipynb b/legacy/v1.0/Notebooks/kaggle-beta-of-ebook2audiobookxtts-ipynb.ipynb deleted file mode 100644 index 8b218357d0d3046e760c800f3aa0607c624a48b9..0000000000000000000000000000000000000000 --- a/legacy/v1.0/Notebooks/kaggle-beta-of-ebook2audiobookxtts-ipynb.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"colab":{"provenance":[],"gpuType":"T4"},"accelerator":"GPU","kaggle":{"accelerator":"gpu","dataSources":[],"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# IGNORE THESE ITS OLD LOL\n\n# install needed packages\n\n##!apt-get update\n\n##!apt-get install wget unzip git ffmpeg calibre\n\n\n\n# pip install requirments\n\n##!pip install tts==0.21.3 pydub nltk beautifulsoup4 ebooklib tqdm gradio\n\n\n\n##!pip install numpy==1.23\n\n##!pip install --no-binary lxml lxml\n\n##import os\n\n##os.kill(os.getpid(), 9)\n","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"gh3HEhmzuqVA","outputId":"81217d71-7576-43db-d56c-07ce11ea6517","jupyter":{"source_hidden":true},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#!DEBIAN_FRONTEND=noninteractive\n\n!sudo apt-get update # && sudo apt-get -y upgrade\n\n!sudo apt-get -y install libegl1\n\n!sudo apt-get -y install libopengl0\n\n!sudo apt-get -y install libxcb-cursor0\n\n!sudo -v && wget -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sudo sh /dev/stdin\n\n!sudo apt-get install -y ffmpeg\n\n!pip install tts pydub nltk beautifulsoup4 ebooklib tqdm\n\n!pip install numpy==1.26.4\n\n!pip install gradio","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":1000},"id":"Edxj355K0rUz","outputId":"9fc5f4e1-1ba2-4814-a477-496f626c2772","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Start the app with Share=True for the gradio interface\n\n\n\n#ntlk error fix\n\n#https://github.com/delip/PyTorchNLPBook/issues/14\n\nimport nltk\n\nnltk.download('punkt')\n\n\n\n#Auto agree to xtts\n\nimport os\n\nos.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n\n\n\n!python /kaggle/working/app.py --share True","metadata":{"id":"EZIZva9Tvdbb","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#ntlk error 
fix\n\n#https://github.com/delip/PyTorchNLPBook/issues/14\n\nimport nltk\n\nnltk.download('punkt')\n\n\n\n#Auto agree to xtts\n\nimport os\n\nos.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n\n\n\n# To download the app.py and the Default_voice wav if not seen locally\n\n!wget -O /kaggle/working/app.py https://raw.githubusercontent.com/DrewThomasson/ebook2audiobookXTTS/main/app.py\n\n!wget -O /kaggle/working/default_voice.wav https://raw.githubusercontent.com/DrewThomasson/ebook2audiobookXTTS/main/default_voice.wav\n\n\n\n# Start the app with Share=True for the gradio interface\n\n!python /kaggle/working/app.py --share True","metadata":{"id":"658BTHueyLMo","colab":{"base_uri":"https://localhost:8080/"},"outputId":"e293e70d-b25a-41bc-dbac-7ca1ddf1d3d2","trusted":true},"execution_count":null,"outputs":[]}]} \ No newline at end of file diff --git a/legacy/v1.0/README.md b/legacy/v1.0/README.md deleted file mode 100644 index 372667d1ce554676142c90bed6f26298a8c27972..0000000000000000000000000000000000000000 --- a/legacy/v1.0/README.md +++ /dev/null @@ -1,478 +0,0 @@ -# 📚 ebook2audiobook - -Convert eBooks to audiobooks with chapters and metadata using Calibre and Coqui XTTS. Supports optional voice cloning and multiple languages! - - -#### 🖥️ Web GUI Interface -![demo_web_gui](https://github.com/user-attachments/assets/85af88a7-05dd-4a29-91de-76a14cf5ef06) - -
- Click to see images of Web GUI -
- -## README.md -- en [English](README.md) -- zh_CN [简体中文](readme/README_CN.md) -- ru [Русский](readme/README_RU.md) - - -## 🌟 Features - -- 📖 Converts eBooks to text format with Calibre. -- 📚 Splits the eBook into chapters for organized audio. -- 🎙️ High-quality text-to-speech with Coqui XTTS. -- 🗣️ Optional voice cloning with your own voice file. -- 🌍 Supports multiple languages (English by default). -- 🖥️ Designed to run on 4GB RAM. - -## 🤗 [Huggingface space demo](https://huggingface.co/spaces/drewThomasson/ebook2audiobookXTTS) -- The Huggingface space runs on the free CPU tier, so expect very slow runs or timeouts; just don't give it giant files. -- Best to duplicate the space or run locally. - -## Free Google Colab [![Free Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DrewThomasson/ebook2audiobookXTTS/blob/main/Notebooks/colab_ebook2audiobookxtts.ipynb) - - -## 🛠️ Requirements - -- Python 3.10 -- `coqui-tts` Python package -- Calibre (for eBook conversion) -- FFmpeg (for audiobook creation) -- Optional: Custom voice file for voice cloning - - -### 🔧 Installation Instructions - -1. **Install Python 3.10** from [Python.org](https://www.python.org/downloads/). - -2. **Install Calibre**: - - **Ubuntu**: `sudo apt-get install -y calibre` - - **macOS**: `brew install calibre` - - **Windows** (Admin Powershell): `choco install calibre` - -3. **Install FFmpeg**: - - **Ubuntu**: `sudo apt-get install -y ffmpeg` - - **macOS**: `brew install ffmpeg` - - **Windows** (Admin Powershell): `choco install ffmpeg` - -4. **Optional: Install Mecab** (for non-Latin languages): - - **Ubuntu**: `sudo apt-get install -y mecab libmecab-dev mecab-ipadic-utf8` - - **macOS**: `brew install mecab`, `brew install mecab-ipadic` - - **Windows**: [mecab-website-to-install-manually](https://taku910.github.io/mecab/#download) (Note: Japanese support is limited) - -5. **Install Python packages**: - ```bash - pip install coqui-tts==0.24.2 pydub nltk beautifulsoup4 ebooklib tqdm gradio==4.44.0 - - python -m nltk.downloader punkt - python -m nltk.downloader punkt_tab - ``` - - **For non-Latin languages**: - ```bash - pip install mecab mecab-python3 unidic - - python -m unidic download - ``` - -## 🌐 Supported Languages - -- **English (en)** -- **Spanish (es)** -- **French (fr)** -- **German (de)** -- **Italian (it)** -- **Portuguese (pt)** -- **Polish (pl)** -- **Turkish (tr)** -- **Russian (ru)** -- **Dutch (nl)** -- **Czech (cs)** -- **Arabic (ar)** -- **Chinese (zh-cn)** -- **Japanese (ja)** -- **Hungarian (hu)** -- **Korean (ko)** - -Specify the language code when running the script in headless mode. -## 🚀 Usage - -### 🖥️ Launching the Gradio Web Interface - -1. **Run the Script**: - ```bash - python app.py - ``` - -2. **Open the Web App**: Click the URL provided in the terminal to access the web app and convert eBooks. -3. **For a Public Link**: Add `--share True` to the command, like this: `python app.py --share True` -- **[For More Parameters]**: use the `-h` parameter like this `python app.py -h` - -### 📝 Basic Headless Usage - -```bash -python app.py --headless True --ebook <path_to_ebook_file> --voice [path_to_voice_file] --language [language_code] -``` - -- **<path_to_ebook_file>**: Path to your eBook file. -- **[path_to_voice_file]**: Optional, for voice cloning. -- **[language_code]**: Optional, to specify the language. -- **[For More Parameters]**: use the `-h` parameter like this `python app.py -h`
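-For example, a minimal headless run with the default voice, and one with voice cloning and an explicit language, might look like this (the file names are illustrative): -```bash -python app.py --headless True --ebook mybook.epub -python app.py --headless True --ebook mybook.epub --voice myvoice.wav --language en -```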
- -### 🧩 Headless Custom XTTS Model Usage - -```bash -python app.py --headless True --use_custom_model True --ebook <path_to_ebook_file> --voice <path_to_voice_file> --language <language_code> --custom_model <custom_model_path> --custom_config <custom_config_path> --custom_vocab <custom_vocab_path> -``` - -- **<path_to_ebook_file>**: Path to your eBook file. -- **<path_to_voice_file>**: Optional, for voice cloning. -- **<language_code>**: Optional, to specify the language. -- **<custom_model_path>**: Path to `model.pth`. -- **<custom_config_path>**: Path to `config.json`. -- **<custom_vocab_path>**: Path to `vocab.json`. -- **[For More Parameters]**: use the `-h` parameter like this `python app.py -h` - - -### 🧩 Headless Custom XTTS Model Usage with a Zip Link to an XTTS Fine-Tuned Model 🌐 - -```bash -python app.py --headless True --use_custom_model True --ebook <path_to_ebook_file> --voice <path_to_voice_file> --language <language_code> --custom_model_url <custom_model_URL> -``` - -- **<path_to_ebook_file>**: Path to your eBook file. -- **<path_to_voice_file>**: Optional, for voice cloning. -- **<language_code>**: Optional, to specify the language. -- **<custom_model_URL>**: URL to a zip of the model folder. For example, this one for the [xtts_David_Attenborough_fine_tune](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/tree/main): `https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true` -- For a custom model, a reference audio clip of the voice will also be needed: -[ref audio clip of David Attenborough](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/blob/main/ref.wav) -- **[For More Parameters]**: use the `-h` parameter like this `python app.py -h` - -### 🔍 For a Detailed Guide with a List of All Parameters -```bash -python app.py -h -``` -- This will output the following: -```bash -usage: app.py [-h] [--share SHARE] [--headless HEADLESS] [--ebook EBOOK] [--voice VOICE] - [--language LANGUAGE] [--use_custom_model USE_CUSTOM_MODEL] - [--custom_model CUSTOM_MODEL] [--custom_config CUSTOM_CONFIG] - [--custom_vocab CUSTOM_VOCAB] [--custom_model_url CUSTOM_MODEL_URL] - [--temperature TEMPERATURE] [--length_penalty LENGTH_PENALTY] - [--repetition_penalty REPETITION_PENALTY] [--top_k TOP_K] [--top_p TOP_P] - [--speed SPEED] [--enable_text_splitting ENABLE_TEXT_SPLITTING] - -Convert eBooks to Audiobooks using a Text-to-Speech model. You can either launch the -Gradio interface or run the script in headless mode for direct conversion. - -options: - -h, --help show this help message and exit - --share SHARE Set to True to enable a public shareable Gradio link. Defaults - to False. - --headless HEADLESS Set to True to run in headless mode without the Gradio - interface. Defaults to False. - --ebook EBOOK Path to the ebook file for conversion. Required in headless - mode. - --voice VOICE Path to the target voice file for TTS. Optional, uses a default - voice if not provided. - --language LANGUAGE Language for the audiobook conversion. Options: en, es, fr, de, - it, pt, pl, tr, ru, nl, cs, ar, zh-cn, ja, hu, ko. Defaults to - English (en). - --use_custom_model USE_CUSTOM_MODEL - Set to True to use a custom TTS model. Defaults to False. Must - be True to use custom models, otherwise you'll get an error. - --custom_model CUSTOM_MODEL - Path to the custom model file (.pth). Required if using a custom - model. - --custom_config CUSTOM_CONFIG - Path to the custom config file (config.json). Required if using - a custom model. - --custom_vocab CUSTOM_VOCAB - Path to the custom vocab file (vocab.json). Required if using a - custom model. - --custom_model_url CUSTOM_MODEL_URL - URL to download the custom model as a zip file. Optional, but - will be used if provided. Examples include David Attenborough's - model: 'https://huggingface.co/drewThomasson/xtts_David_Attenbor - ough_fine_tune/resolve/main/Finished_model_files.zip?download=tr - ue'. More XTTS fine-tunes can be found on my Hugging Face at - 'https://huggingface.co/drewThomasson'. - --temperature TEMPERATURE - Temperature for the model. Defaults to 0.65. Higher temperatures - will lead to more creative outputs, i.e. more hallucinations. - Lower temperatures will give more monotone outputs, i.e. fewer - hallucinations. - --length_penalty LENGTH_PENALTY - A length penalty applied to the autoregressive decoder. Defaults - to 1.0. Not applied to custom models. - --repetition_penalty REPETITION_PENALTY - A penalty that prevents the autoregressive decoder from - repeating itself. Defaults to 2.0. - --top_k TOP_K Top-k sampling. Lower values mean more likely outputs and - increased audio generation speed. Defaults to 50. - --top_p TOP_P Top-p sampling. Lower values mean more likely outputs and - increased audio generation speed. Defaults to 0.8. - --speed SPEED Speed factor for the speech generation, i.e. how fast the - narrator will speak. Defaults to 1.0. - --enable_text_splitting ENABLE_TEXT_SPLITTING - Enable splitting text into sentences. Defaults to True. - -Example: python script.py --headless --ebook path_to_ebook --voice path_to_voice ---language en --use_custom_model True --custom_model model.pth --custom_config -config.json --custom_vocab vocab.json -``` - - -
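-As a concrete example of these parameters, a headless run that dials the sampling settings toward more stable, less creative narration could combine the flags above like this (the file paths are illustrative): -```bash -python app.py --headless True --ebook mybook.epub --voice narrator.wav --language en --temperature 0.5 --top_k 30 --top_p 0.7 --speed 1.1 -``` -Per the help text above, lower temperature, top_k, and top_p values trade expressiveness for fewer hallucinations and faster generation.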
- ⚠️ Legacy (Deprecated) Old Usage Instructions - -## 🚀 Usage - -## Legacy files have been moved to `ebook2audiobookXTTS/legacy/` - -### 🖥️ Gradio Web Interface - -1. **Run the Script**: - ```bash - python custom_model_ebook2audiobookXTTS_gradio.py - ``` - -2. **Open the Web App**: Click the URL provided in the terminal to access the web app and convert eBooks. - -### 📝 Basic Usage - -```bash -python ebook2audiobook.py <path_to_ebook_file> [path_to_voice_file] [language_code] -``` - -- **<path_to_ebook_file>**: Path to your eBook file. -- **[path_to_voice_file]**: Optional, for voice cloning. -- **[language_code]**: Optional, to specify the language. - -### 🧩 Custom XTTS Model - -```bash -python custom_model_ebook2audiobookXTTS.py <ebook_file_path> <target_voice_file_path> <language> <custom_model_path> <custom_config_path> <custom_vocab_path> -``` - -- **<ebook_file_path>**: Path to your eBook file. -- **<target_voice_file_path>**: Optional, for voice cloning. -- **<language>**: Optional, to specify the language. -- **<custom_model_path>**: Path to `model.pth`. -- **<custom_config_path>**: Path to `config.json`. -- **<custom_vocab_path>**: Path to `vocab.json`. -
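-For instance, a legacy custom-model run with every positional argument spelled out might look like this (the file names are illustrative, and the positional order follows the list above): -```bash -python custom_model_ebook2audiobookXTTS.py mybook.epub myvoice.wav en model.pth config.json vocab.json -```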
- -### 🐳 Using Docker - -You can also use Docker to run the eBook-to-Audiobook converter. This method ensures consistency across different environments and simplifies setup. - -#### 🚀 Running the Docker Container - -To run the Docker container and start the Gradio interface, use one of the following commands: - - -Run with CPU only: -```powershell -docker run -it --rm -p 7860:7860 --platform=linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py -``` - -Run with GPU speedup (NVIDIA graphics cards only): -```powershell -docker run -it --rm --gpus all -p 7860:7860 --platform=linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py -``` - -This command will start the Gradio interface on port 7860 (localhost:7860). -- For more options, like running Docker in headless mode or making the Gradio link public, add the `-h` parameter after `app.py` in the Docker launch command. -
- Example of using Docker in headless mode or modifying anything with the extra parameters + Full guide - -## Example of using Docker in headless mode - -First, do a docker pull of the latest image with -```bash -docker pull athomasson2/ebook2audiobookxtts:huggingface -``` - -- Before you run this, you need to create a dir named "input-folder" in your current dir; it will be mounted into the container, and this is where you can put your input files for the Docker image to see -```bash -mkdir input-folder && mkdir Audiobooks -``` - -- In the command below, swap out **YOUR_INPUT_FILE.TXT** for the name of your input file - -```bash -docker run -it --rm \ - -v $(pwd)/input-folder:/home/user/app/input_folder \ - -v $(pwd)/Audiobooks:/home/user/app/Audiobooks \ - --platform linux/amd64 \ - athomasson2/ebook2audiobookxtts:huggingface \ - python app.py --headless True --ebook /home/user/app/input_folder/YOUR_INPUT_FILE.TXT -``` - -- And that should be it! - -- The output audiobooks will be found in the Audiobooks folder, which will also be located in the local dir you ran this Docker command in - - -## To get the help text for the other parameters this program has, you can run this - -```bash -docker run -it --rm \ - --platform linux/amd64 \ - athomasson2/ebook2audiobookxtts:huggingface \ - python app.py -h - -``` - - -and that will output this - -```bash -starting... -usage: app.py [-h] [--share SHARE] [--headless HEADLESS] [--ebook EBOOK] [--voice VOICE] - [--language LANGUAGE] [--use_custom_model USE_CUSTOM_MODEL] - [--custom_model CUSTOM_MODEL] [--custom_config CUSTOM_CONFIG] - [--custom_vocab CUSTOM_VOCAB] [--custom_model_url CUSTOM_MODEL_URL] - [--temperature TEMPERATURE] [--length_penalty LENGTH_PENALTY] - [--repetition_penalty REPETITION_PENALTY] [--top_k TOP_K] [--top_p TOP_P] - [--speed SPEED] [--enable_text_splitting ENABLE_TEXT_SPLITTING] - -Convert eBooks to Audiobooks using a Text-to-Speech model. You can either launch the -Gradio interface or run the script in headless mode for direct conversion. - -options: - -h, --help show this help message and exit - --share SHARE Set to True to enable a public shareable Gradio link. Defaults - to False. - --headless HEADLESS Set to True to run in headless mode without the Gradio - interface. Defaults to False. - --ebook EBOOK Path to the ebook file for conversion. Required in headless - mode. - --voice VOICE Path to the target voice file for TTS. Optional, uses a default - voice if not provided. - --language LANGUAGE Language for the audiobook conversion. Options: en, es, fr, de, - it, pt, pl, tr, ru, nl, cs, ar, zh-cn, ja, hu, ko. Defaults to - English (en). - --use_custom_model USE_CUSTOM_MODEL - Set to True to use a custom TTS model. Defaults to False. Must - be True to use custom models, otherwise you'll get an error. - --custom_model CUSTOM_MODEL - Path to the custom model file (.pth). Required if using a custom - model. - --custom_config CUSTOM_CONFIG - Path to the custom config file (config.json). Required if using - a custom model. - --custom_vocab CUSTOM_VOCAB - Path to the custom vocab file (vocab.json). Required if using a - custom model. - --custom_model_url CUSTOM_MODEL_URL - URL to download the custom model as a zip file. Optional, but - will be used if provided. Examples include David Attenborough's - model: 'https://huggingface.co/drewThomasson/xtts_David_Attenbor - ough_fine_tune/resolve/main/Finished_model_files.zip?download=tr - ue'. More XTTS fine-tunes can be found on my Hugging Face at - 'https://huggingface.co/drewThomasson'. - --temperature TEMPERATURE - Temperature for the model. Defaults to 0.65. Higher temperatures - will lead to more creative outputs, i.e. more hallucinations. - Lower temperatures will give more monotone outputs, i.e. fewer - hallucinations. - --length_penalty LENGTH_PENALTY - A length penalty applied to the autoregressive decoder. Defaults - to 1.0. Not applied to custom models. - --repetition_penalty REPETITION_PENALTY - A penalty that prevents the autoregressive decoder from - repeating itself. Defaults to 2.0. - --top_k TOP_K Top-k sampling. Lower values mean more likely outputs and - increased audio generation speed. Defaults to 50. - --top_p TOP_P Top-p sampling. Lower values mean more likely outputs and - increased audio generation speed. Defaults to 0.8. - --speed SPEED Speed factor for the speech generation, i.e. how fast the - narrator will speak. Defaults to 1.0. - --enable_text_splitting ENABLE_TEXT_SPLITTING - Enable splitting text into sentences. Defaults to True. - -Example: python script.py --headless --ebook path_to_ebook --voice path_to_voice ---language en --use_custom_model True --custom_model model.pth --custom_config -config.json --custom_vocab vocab.json -``` -
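-Putting the pieces above together, a GPU-accelerated headless run that also mounts the input and output folders could look like this (it assumes the `input-folder` and `Audiobooks` directories from the earlier steps exist): -```bash -docker run -it --rm --gpus all \ - -v $(pwd)/input-folder:/home/user/app/input_folder \ - -v $(pwd)/Audiobooks:/home/user/app/Audiobooks \ - --platform linux/amd64 \ - athomasson2/ebook2audiobookxtts:huggingface \ - python app.py --headless True --ebook /home/user/app/input_folder/YOUR_INPUT_FILE.TXT -```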
- -#### 🖥️ Docker GUI -![demo_web_gui](https://github.com/user-attachments/assets/85af88a7-05dd-4a29-91de-76a14cf5ef06) - -
- Click to see images of Web GUI -
-### 🛠️ For Custom XTTS Models - -These are models built to be better at a specific voice. Check out my Hugging Face page [here](https://huggingface.co/drewThomasson). - -To use a custom model, paste the link of the `Finished_model_files.zip` file, like this one: - -[David Attenborough fine-tuned Finished_model_files.zip](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true) - -For a custom model, a reference audio clip of the voice will also be needed: -[ref audio clip of David Attenborough](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/blob/main/ref.wav) - - - -More details can be found on the [Docker Hub page](https://hub.docker.com/repository/docker/athomasson2/ebook2audiobookxtts/general). - -## 🌐 Fine-Tuned XTTS Models - -To find already fine-tuned XTTS models, visit [this Hugging Face link](https://huggingface.co/drewThomasson) 🌐. Search for models that include "xtts fine tune" in their names. - -## 🎥 Demos - -Rainy day voice - -https://github.com/user-attachments/assets/8486603c-38b1-43ce-9639-73757dfb1031 - -David Attenborough voice - -https://github.com/user-attachments/assets/47c846a7-9e51-4eb9-844a-7460402a20a8 - - -## 🤗 [Huggingface space demo](https://huggingface.co/spaces/drewThomasson/ebook2audiobookXTTS) -- The Huggingface space runs on the free CPU tier, so expect very slow runs or timeouts; just don't give it giant files. -- Best to duplicate the space or run locally. - -## Free Google Colab [![Free Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DrewThomasson/ebook2audiobookXTTS/blob/main/Notebooks/colab_ebook2audiobookxtts.ipynb) - - - -## 📚 Supported eBook Formats - -- `.epub`, `.pdf`, `.mobi`, `.txt`, `.html`, `.rtf`, `.chm`, `.lit`, `.pdb`, `.fb2`, `.odt`, `.cbr`, `.cbz`, `.prc`, `.lrf`, `.pml`, `.snb`, `.cbc`, `.rb`, `.tcr` -- **Best results**: `.epub` or `.mobi` for automatic chapter detection - -## 📂 Output - -- Creates an `.m4b` file with metadata and chapters. -- **Example Output**: ![Example](https://github.com/DrewThomasson/VoxNovel/blob/dc5197dff97252fa44c391dc0596902d71278a88/readme_files/example_in_app.jpeg) - -## 🛠️ Common Issues: -- "It's slow!" - On CPU only this is very slow; you can only get speedups with an NVIDIA GPU. [Discussion about this](https://github.com/DrewThomasson/ebook2audiobookXTTS/discussions/19#discussioncomment-10879846) For faster multilingual generation I would suggest my other [project that uses piper-tts](https://github.com/DrewThomasson/ebook2audiobookpiper-tts) instead (it doesn't have zero-shot voice cloning, and the voices are Siri quality, but it is much faster on CPU). -- "I'm having dependency issues" - Just use the Docker image; it's fully self-contained and has a headless mode. Add the `-h` parameter after `app.py` in the docker run command for more information. -- "I'm getting a truncated audio issue!" - PLEASE MAKE AN ISSUE OF THIS; I don't speak every language, and I need advice from speakers of each one to fine-tune my sentence-splitting function for other languages. 😊 -- "The loading bar is stuck at 30% in the web GUI!" - The web GUI loading bar is extremely basic, as it's just split between the three loading steps; refer to the terminal and the sentence it's on for a more accurate gauge of progress. - -## What I need help with! 🙌 -## [A full list of things can be found here](https://github.com/DrewThomasson/ebook2audiobookXTTS/issues/32) -- Any help from speakers of any of the supported languages, to get the sentence-splitting methods right -- Potentially creating README guides for multiple languages (because the only language I know is English 😔) - -## 🙏 Special Thanks - -- **Coqui TTS**: [Coqui TTS GitHub](https://github.com/coqui-ai/TTS) -- **Calibre**: [Calibre Website](https://calibre-ebook.com) - -- [@shakenbake15 for better chapter saving method](https://github.com/DrewThomasson/ebook2audiobookXTTS/issues/8) - diff --git a/legacy/v1.0/app.py b/legacy/v1.0/app.py deleted file mode 100644 index 2d3331fa243e6162a7fe49db138504925a0c2d2a..0000000000000000000000000000000000000000 --- a/legacy/v1.0/app.py +++ /dev/null @@ -1,1041 +0,0 @@ -print("starting...") - -import argparse - -language_options = [ - "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko" -] -char_limits = { - "en": 250, # English - "es": 239, # Spanish - "fr": 273, # French - "de": 253, # German - "it": 213, # Italian - "pt": 203, # Portuguese - "pl": 224, # Polish - "tr": 226, # Turkish - "ru": 182, # Russian - "nl": 251, # Dutch - "cs": 186, # Czech - "ar": 166, # Arabic - "zh-cn": 82, # Chinese (Simplified) - "ja": 71, # Japanese - "hu": 224, # Hungarian - "ko": 95, # Korean -} - -# Mapping of language codes to NLTK's supported language names -language_mapping = { - "en": "english", - "de": "german", - "fr": "french", - "es": "spanish", - "it": "italian", - "pt": "portuguese", - "nl": "dutch", - "pl": "polish", - "cs": "czech", - "ru": "russian", - "tr": "turkish", - "el": "greek", - "et": "estonian", - "no": "norwegian", - "ml": "malayalam", - "sl": "slovene", - "da": "danish", - "fi": "finnish", - "sv": "swedish" -} - - -# Convert the list of languages to a string to display in the help text -language_options_str = ", ".join(language_options) - -# Argument parser to handle optional parameters with descriptions -parser = argparse.ArgumentParser( - description="Convert eBooks to Audiobooks using a Text-to-Speech model. You can either launch the Gradio interface or run the script in headless mode for direct conversion.", - epilog="Example: python script.py --headless --ebook path_to_ebook --voice path_to_voice --language en --use_custom_model True --custom_model model.pth --custom_config config.json --custom_vocab vocab.json" -) -parser.add_argument("--share", type=bool, default=False, help="Set to True to enable a public shareable Gradio link. Defaults to False.") -parser.add_argument("--headless", type=bool, default=False, help="Set to True to run in headless mode without the Gradio interface. Defaults to False.") -parser.add_argument("--ebook", type=str, help="Path to the ebook file for conversion. Required in headless mode.") -parser.add_argument("--voice", type=str, help="Path to the target voice file for TTS. Optional, uses a default voice if not provided.") -parser.add_argument("--language", type=str, default="en", - help=f"Language for the audiobook conversion. Options: {language_options_str}. Defaults to English (en).") -parser.add_argument("--use_custom_model", type=bool, default=False, - help="Set to True to use a custom TTS model. Defaults to False. Must be True to use custom models, otherwise you'll get an error.") -parser.add_argument("--custom_model", type=str, help="Path to the custom model file (.pth). 
Required if using a custom model.") -parser.add_argument("--custom_config", type=str, help="Path to the custom config file (config.json). Required if using a custom model.") -parser.add_argument("--custom_vocab", type=str, help="Path to the custom vocab file (vocab.json). Required if using a custom model.") -parser.add_argument("--custom_model_url", type=str, - help=("URL to download the custom model as a zip file. Optional, but will be used if provided. " - "Examples include David Attenborough's model: " - "'https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true'. " - "More XTTS fine-tunes can be found on my Hugging Face at 'https://huggingface.co/drewThomasson'.")) -parser.add_argument("--temperature", type=float, default=0.65, help="Temperature for the model. Defaults to 0.65. Higher temperatures will lead to more creative outputs, i.e. more hallucinations. Lower temperatures will give more monotone outputs, i.e. fewer hallucinations.") -parser.add_argument("--length_penalty", type=float, default=1.0, help="A length penalty applied to the autoregressive decoder. Defaults to 1.0. Not applied to custom models.") -parser.add_argument("--repetition_penalty", type=float, default=2.0, help="A penalty that prevents the autoregressive decoder from repeating itself. Defaults to 2.0.") -parser.add_argument("--top_k", type=int, default=50, help="Top-k sampling. Lower values mean more likely outputs and increased audio generation speed. Defaults to 50.") -parser.add_argument("--top_p", type=float, default=0.8, help="Top-p sampling. Lower values mean more likely outputs and increased audio generation speed. Defaults to 0.8.") -parser.add_argument("--speed", type=float, default=1.0, help="Speed factor for the speech generation, i.e. how fast the narrator will speak. Defaults to 1.0.") -parser.add_argument("--enable_text_splitting", type=bool, default=False, help="Enable splitting text into sentences. 
Defaults to True.") - -args = parser.parse_args() - - - -import os -import shutil -import subprocess -import re -from pydub import AudioSegment -import tempfile -from pydub import AudioSegment -import nltk -from nltk.tokenize import sent_tokenize -import sys -import torch -from TTS.api import TTS -from TTS.tts.configs.xtts_config import XttsConfig -from TTS.tts.models.xtts import Xtts -from tqdm import tqdm -import gradio as gr -from gradio import Progress -import urllib.request -import zipfile -import socket -#import MeCab -#import unidic - -#nltk.download('punkt_tab') - -# Import the locally stored Xtts default model -#import import_locally_stored_tts_model_files - -#make the nltk folder point to the nltk folder in the app dir -#nltk.data.path.append('/home/user/app/nltk_data') - -# Download UniDic if it's not already installed -#unidic.download() - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -print(f"Device selected is: {device}") - -#nltk.download('punkt') # Make sure to download the necessary models - - -def download_and_extract_zip(url, extract_to='.'): - try: - # Ensure the directory exists - os.makedirs(extract_to, exist_ok=True) - - zip_path = os.path.join(extract_to, 'model.zip') - - # Download with progress bar - with tqdm(unit='B', unit_scale=True, miniters=1, desc="Downloading Model") as t: - def reporthook(blocknum, blocksize, totalsize): - t.total = totalsize - t.update(blocknum * blocksize - t.n) - - urllib.request.urlretrieve(url, zip_path, reporthook=reporthook) - print(f"Downloaded zip file to {zip_path}") - - # Unzipping with progress bar - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - files = zip_ref.namelist() - with tqdm(total=len(files), unit="file", desc="Extracting Files") as t: - for file in files: - if not file.endswith('/'): # Skip directories - # Extract the file to the temporary directory - extracted_path = zip_ref.extract(file, extract_to) - # Move the file to the base directory - base_file_path = os.path.join(extract_to, os.path.basename(file)) - os.rename(extracted_path, base_file_path) - t.update(1) - - # Cleanup: Remove the ZIP file and any empty folders - os.remove(zip_path) - for root, dirs, files in os.walk(extract_to, topdown=False): - for name in dirs: - os.rmdir(os.path.join(root, name)) - print(f"Extracted files to {extract_to}") - - # Check if all required files are present - required_files = ['model.pth', 'config.json', 'vocab.json_'] - missing_files = [file for file in required_files if not os.path.exists(os.path.join(extract_to, file))] - - if not missing_files: - print("All required files (model.pth, config.json, vocab.json_) found.") - else: - print(f"Missing files: {', '.join(missing_files)}") - - except Exception as e: - print(f"Failed to download or extract zip file: {e}") - - - -def is_folder_empty(folder_path): - if os.path.exists(folder_path) and os.path.isdir(folder_path): - # List directory contents - if not os.listdir(folder_path): - return True # The folder is empty - else: - return False # The folder is not empty - else: - print(f"The path {folder_path} is not a valid folder.") - return None # The path is not a valid folder - -def remove_folder_with_contents(folder_path): - try: - shutil.rmtree(folder_path) - print(f"Successfully removed {folder_path} and all of its contents.") - except Exception as e: - print(f"Error removing {folder_path}: {e}") - - - - -def wipe_folder(folder_path): - # Check if the folder exists - if not os.path.exists(folder_path): - print(f"The folder {folder_path} does not 
exist.") - return - - # Iterate over all the items in the given folder - for item in os.listdir(folder_path): - item_path = os.path.join(folder_path, item) - # If it's a file, remove it and print a message - if os.path.isfile(item_path): - os.remove(item_path) - print(f"Removed file: {item_path}") - # If it's a directory, remove it recursively and print a message - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - print(f"Removed directory and its contents: {item_path}") - - print(f"All contents wiped from {folder_path}.") - - -# Example usage -# folder_to_wipe = 'path_to_your_folder' -# wipe_folder(folder_to_wipe) - - -def create_m4b_from_chapters(input_dir, ebook_file, output_dir): - # Function to sort chapters based on their numeric order - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - # Extract metadata and cover image from the eBook file - def extract_metadata_and_cover(ebook_path): - try: - cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg' - subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True) - if os.path.exists(cover_path): - return cover_path - except Exception as e: - print(f"Error extracting eBook metadata or cover: {e}") - return None - # Combine WAV files into a single file - def combine_wav_files(chapter_files, output_path, batch_size=256): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Process the chapter files in batches - for i in range(0, len(chapter_files), batch_size): - batch_files = chapter_files[i:i + batch_size] - batch_audio = AudioSegment.empty() # Initialize an empty AudioSegment for the batch - - # Sequentially append each file in the current batch to the batch_audio - for chapter_file in batch_files: - audio_segment = AudioSegment.from_wav(chapter_file) - batch_audio += audio_segment - - # Combine the batch audio with the overall combined_audio - combined_audio += batch_audio - - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - # Ensure the output directory exists - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - - - # Main logic - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 
'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - # Cleanup - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Example usage -# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir') - - - - - - -#this code right here isnt the book grabbing thing but its before to refrence in order to create the sepecial chapter labeled book thing with calibre idk some systems cant seem to get it so just in case but the next bit of code after this is the book grabbing code with booknlp -import os -import subprocess -import ebooklib -from ebooklib import epub -from bs4 import BeautifulSoup -import re -import csv -import nltk - -# Only run the main script if Value is True -def create_chapter_labeled_book(ebook_file_path): - # Function to ensure the existence of a directory - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - # Convert the ebook to EPUB format using Calibre's ebook-convert - try: - subprocess.run(['ebook-convert', input_path, output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return False - return True - - def save_chapters_as_text(epub_path): - # Create the directory if it doesn't exist - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - # Open the EPUB file - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - # Iterate through the items in the EPUB file - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - # Use BeautifulSoup to parse HTML content - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - # Check if the text is not empty - if text.strip(): - if len(text) < 2300 and previous_filename: - # Append text to the previous chapter if it's short - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - # Create a new chapter file and increment the counter - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - #nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', 
encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - folder_path = os.path.join(".", "Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and sort them by chapter number - files = [f for f in os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as outfile: # Specify UTF-8 encoding here - for i, filename in enumerate(sorted_files): - with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -import os -import subprocess -import sys -import torchaudio - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. 
Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" - - -# Function to check if vocab.json exists and rename it -def rename_vocab_file_if_exists(directory): - vocab_path = os.path.join(directory, 'vocab.json') - new_vocab_path = os.path.join(directory, 'vocab.json_') - - # Check if vocab.json exists - if os.path.exists(vocab_path): - # Rename the file - os.rename(vocab_path, new_vocab_path) - print(f"Renamed {vocab_path} to {new_vocab_path}") - return True # Return True if the file was found and renamed - - -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -# Modify the function to handle special cases for Chinese, Italian, and default for others -def split_long_sentence(sentence, language='en', max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param language: The language of the sentence (default is English). - :param max_pauses: Maximum allowed number of pauses in a sentence. - :return: A list of sentence parts that meet the criteria. 
- """ - #Get the Max character length for the selected language -2 : with a default of 248 if no language is found - max_length = (char_limits.get(language, 250)-2) - - # Adjust the pause punctuation symbols based on language - if language == 'zh-cn': - punctuation = [',', '。', ';', '?', '!'] # Chinese-specific pause punctuation including sentence-ending marks - elif language == 'ja': - punctuation = ['、', '。', ';', '?', '!'] # Japanese-specific pause punctuation - elif language == 'ko': - punctuation = [',', '。', ';', '?', '!'] # Korean-specific pause punctuation - elif language == 'ar': - punctuation = ['،', '؛', '؟', '!', '·', '؛', '.'] # Arabic-specific punctuation - elif language == 'en': - punctuation = [',', ';', '.'] # English-specific pause punctuation - else: - # Default pause punctuation for other languages (es, fr, de, it, pt, pl, cs, ru, nl, tr, hu) - punctuation = [',', '.', ';', ':', '?', '!'] - - - - parts = [] - while len(sentence) > max_length or sum(sentence.count(p) for p in punctuation) > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in punctuation and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS - -def convert_chapters_to_audio_custom_model(chapters_dir, output_audio_dir, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting, target_voice_path=None, language=None, custom_model=None): - - if target_voice_path==None: - target_voice_path = default_target_voice_path - - if custom_model: - print("Loading custom model...") - config = XttsConfig() - config.load_json(custom_model['config']) - model = Xtts.init_from_config(config) - model.load_checkpoint(config, checkpoint_path=custom_model['model'], vocab_path=custom_model['vocab'], use_deepspeed=False) - model.to(device) - print("Computing speaker latents...") - gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[target_voice_path]) - else: - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) - temp_audio_directory = os.path.join(".", "Working_files", "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - # Check if the language code is supported - nltk_language = 
language_mapping.get(language) - if nltk_language: - # If the language is supported, tokenize using sent_tokenize - sentences = sent_tokenize(chapter_text, language=nltk_language) - else: - # If the language is not supported, handle it (e.g., return the text unchanged) - sentences = [chapter_text] # No tokenization, just wrap the text in a list - #sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = split_long_sentence(sentence, language=language) - for fragment in fragments: - if fragment != "": - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - if custom_model: - # length penalty will not apply for custome models, its just too much of a headache perhaps if someone else can do it for me lol, im just one man :( - out = model.inference(fragment, language, gpt_cond_latent, speaker_embedding, temperature=temperature, repetition_penalty=repetition_penalty, top_k=top_k, top_p=top_p, speed=speed, enable_text_splitting=enable_text_splitting) - #out = model.inference(fragment, language, gpt_cond_latent, speaker_embedding, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting) - torchaudio.save(fragment_file_path, torch.tensor(out["wav"]).unsqueeze(0), 24000) - else: - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code, temperature=temperature, length_penalty=length_penalty, repetition_penalty=repetition_penalty, top_k=top_k, top_p=top_p, speed=speed, enable_text_splitting=enable_text_splitting) - - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -def convert_chapters_to_audio_standard_model(chapters_dir, output_audio_dir, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting, target_voice_path=None, language="en"): - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) - temp_audio_directory = os.path.join(".", "Working_files", "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - # Check if the language code is supported - nltk_language = language_mapping.get(language) - if nltk_language: - # If the language is supported, tokenize using sent_tokenize - sentences = sent_tokenize(chapter_text, language=nltk_language) - else: - # If the language is not supported, handle it (e.g., return the text unchanged) - sentences = 
[chapter_text] # No tokenization, just wrap the text in a list - #sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = split_long_sentence(sentence, language=language) - for fragment in fragments: - if fragment != "": - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - tts.tts_to_file( - text=fragment, - file_path=fragment_file_path, - speaker_wav=speaker_wav_path, - language=language, - temperature=temperature, - length_penalty=length_penalty, - repetition_penalty=repetition_penalty, - top_k=top_k, - top_p=top_p, - speed=speed, - enable_text_splitting=enable_text_splitting - ) - - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -# Define the functions to be used in the Gradio interface -def convert_ebook_to_audio(ebook_file, target_voice_file, language, use_custom_model, custom_model_file, custom_config_file, custom_vocab_file, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting, custom_model_url=None, progress=gr.Progress()): - - ebook_file_path = args.ebook if args.ebook else ebook_file.name - target_voice = args.voice if args.voice else target_voice_file.name if target_voice_file else None - custom_model = None - - - working_files = os.path.join(".", "Working_files", "temp_ebook") - full_folder_working_files = os.path.join(".", "Working_files") - chapters_directory = os.path.join(".", "Working_files", "temp_ebook") - output_audio_directory = os.path.join(".", 'Chapter_wav_files') - remove_folder_with_contents(full_folder_working_files) - remove_folder_with_contents(output_audio_directory) - - # If running in headless mode, use the language from args - if args.headless and args.language: - language = args.language - else: - language = language # Gradio dropdown value - - # If headless is used with the custom model arguments - if args.use_custom_model and args.custom_model and args.custom_config and args.custom_vocab: - custom_model = { - 'model': args.custom_model, - 'config': args.custom_config, - 'vocab': args.custom_vocab - } - - elif use_custom_model and custom_model_file and custom_config_file and custom_vocab_file: - custom_model = { - 'model': custom_model_file.name, - 'config': custom_config_file.name, - 'vocab': custom_vocab_file.name - } - if (use_custom_model and custom_model_url) or (args.use_custom_model and custom_model_url): - print(f"Received custom model URL: {custom_model_url}") - download_dir = os.path.join(".", "Working_files", "custom_model") - download_and_extract_zip(custom_model_url, download_dir) - - # Check if vocab.json exists and rename it - if rename_vocab_file_if_exists(download_dir): - print("vocab.json file was found and renamed.") - - custom_model = { - 'model': os.path.join(download_dir, 'model.pth'), - 'config': os.path.join(download_dir, 'config.json'), - 'vocab': os.path.join(download_dir, 'vocab.json_') - } - - try: - progress(0, desc="Starting conversion") - except Exception as e: - print(f"Error updating progress: {e}") - - if not calibre_installed(): - return "Calibre is not installed." 
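For reference, the custom-model URL branch above assumes the downloaded zip flattens into exactly three files, with `vocab.json` renamed to `vocab.json_` by `rename_vocab_file_if_exists`. A minimal sketch of that directory contract (illustrative only, not part of the original script):

```python
import os

# Layout assumed in Working_files/custom_model after download_and_extract_zip()
# and rename_vocab_file_if_exists() have run, mirroring the dict built above.
download_dir = os.path.join(".", "Working_files", "custom_model")
expected = {
    'model': os.path.join(download_dir, 'model.pth'),
    'config': os.path.join(download_dir, 'config.json'),
    'vocab': os.path.join(download_dir, 'vocab.json_'),  # renamed from vocab.json
}
missing = [name for name, path in expected.items() if not os.path.exists(path)]
if missing:
    print(f"Custom model folder is incomplete; missing: {', '.join(missing)}")
```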
- - - try: - progress(0.1, desc="Creating chapter-labeled book") - except Exception as e: - print(f"Error updating progress: {e}") - - create_chapter_labeled_book(ebook_file_path) - audiobook_output_path = os.path.join(".", "Audiobooks") - - try: - progress(0.3, desc="Converting chapters to audio") - except Exception as e: - print(f"Error updating progress: {e}") - - if use_custom_model: - convert_chapters_to_audio_custom_model(chapters_directory, output_audio_directory, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting, target_voice, language, custom_model) - else: - convert_chapters_to_audio_standard_model(chapters_directory, output_audio_directory, temperature, length_penalty, repetition_penalty, top_k, top_p, speed, enable_text_splitting, target_voice, language) - - try: - progress(0.9, desc="Creating M4B from chapters") - except Exception as e: - print(f"Error updating progress: {e}") - - create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path) - - # Get the name of the created M4B file - m4b_filename = os.path.splitext(os.path.basename(ebook_file_path))[0] + '.m4b' - m4b_filepath = os.path.join(audiobook_output_path, m4b_filename) - - try: - progress(1.0, desc="Conversion complete") - except Exception as e: - print(f"Error updating progress: {e}") - print(f"Audiobook created at {m4b_filepath}") - return f"Audiobook created at {m4b_filepath}", m4b_filepath - - -def list_audiobook_files(audiobook_folder): - # List all files in the audiobook folder - files = [] - for filename in os.listdir(audiobook_folder): - if filename.endswith('.m4b'): # Adjust the file extension as needed - files.append(os.path.join(audiobook_folder, filename)) - return files - -def download_audiobooks(): - audiobook_output_path = os.path.join(".", "Audiobooks") - return list_audiobook_files(audiobook_output_path) - - -# Gradio UI setup -def run_gradio_interface(): - language_options = [ - "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko" - ] - - theme = gr.themes.Soft( - primary_hue="blue", - secondary_hue="blue", - neutral_hue="blue", - text_size=gr.themes.sizes.text_md, - ) - - with gr.Blocks(theme=theme) as demo: - gr.Markdown( - """ - # eBook to Audiobook Converter - - Transform your eBooks into immersive audiobooks with optional custom TTS models. - - This interface is based on [Ebook2AudioBookXTTS](https://github.com/DrewThomasson/ebook2audiobookXTTS). 
- """ - ) - - with gr.Tabs(): # Create tabs for better UI organization - with gr.TabItem("Input Options"): - with gr.Row(): - with gr.Column(scale=3): - ebook_file = gr.File(label="eBook File") - target_voice_file = gr.File(label="Target Voice File (Optional)") - language = gr.Dropdown(label="Language", choices=language_options, value="en") - - with gr.Column(scale=3): - use_custom_model = gr.Checkbox(label="Use Custom Model") - custom_model_file = gr.File(label="Custom Model File (Optional)", visible=False) - custom_config_file = gr.File(label="Custom Config File (Optional)", visible=False) - custom_vocab_file = gr.File(label="Custom Vocab File (Optional)", visible=False) - custom_model_url = gr.Textbox(label="Custom Model Zip URL (Optional)", visible=False) - - with gr.TabItem("Audio Generation Preferences"): # New tab for preferences - gr.Markdown( - """ - ### Customize Audio Generation Parameters - - Adjust the settings below to influence how the audio is generated. You can control the creativity, speed, repetition, and more. - """ - ) - temperature = gr.Slider( - label="Temperature", - minimum=0.1, - maximum=10.0, - step=0.1, - value=0.65, - info="Higher values lead to more creative, unpredictable outputs. Lower values make it more monotone." - ) - length_penalty = gr.Slider( - label="Length Penalty", - minimum=0.5, - maximum=10.0, - step=0.1, - value=1.0, - info="Penalize longer sequences. Higher values produce shorter outputs. Not applied to custom models." - ) - repetition_penalty = gr.Slider( - label="Repetition Penalty", - minimum=1.0, - maximum=10.0, - step=0.1, - value=2.0, - info="Penalizes repeated phrases. Higher values reduce repetition." - ) - top_k = gr.Slider( - label="Top-k Sampling", - minimum=10, - maximum=100, - step=1, - value=50, - info="Lower values restrict outputs to more likely words and increase speed at which audio generates. " - ) - top_p = gr.Slider( - label="Top-p Sampling", - minimum=0.1, - maximum=1.0, - step=.01, - value=0.8, - info="Controls cumulative probability for word selection. Lower values make the output more predictable and increase speed at which audio generates." - ) - speed = gr.Slider( - label="Speed", - minimum=0.5, - maximum=3.0, - step=0.1, - value=1.0, - info="Adjusts How fast the narrator will speak." - ) - enable_text_splitting = gr.Checkbox( - label="Enable Text Splitting", - value=False, - info="Splits long texts into sentences to generate audio in chunks. Useful for very long inputs." 
- ) - - convert_btn = gr.Button("Convert to Audiobook", variant="primary") - output = gr.Textbox(label="Conversion Status") - audio_player = gr.Audio(label="Audiobook Player", type="filepath") - download_btn = gr.Button("Download Audiobook Files") - download_files = gr.File(label="Download Files", interactive=False) - - convert_btn.click( - lambda *args: convert_ebook_to_audio( - *args[:7], - float(args[7]), # Ensure temperature is float - float(args[8]), # Ensure length_penalty is float - float(args[9]), # Ensure repetition_penalty is float - int(args[10]), # Ensure top_k is int - float(args[11]), # Ensure top_p is float - float(args[12]), # Ensure speed is float - *args[13:] - ), - inputs=[ - ebook_file, target_voice_file, language, use_custom_model, custom_model_file, custom_config_file, - custom_vocab_file, temperature, length_penalty, repetition_penalty, - top_k, top_p, speed, enable_text_splitting, custom_model_url - ], - outputs=[output, audio_player] - ) - - - use_custom_model.change( - lambda x: [gr.update(visible=x)] * 4, - inputs=[use_custom_model], - outputs=[custom_model_file, custom_config_file, custom_vocab_file, custom_model_url] - ) - - download_btn.click( - download_audiobooks, - outputs=[download_files] - ) - - # Get the correct local IP or localhost - hostname = socket.gethostname() - local_ip = socket.gethostbyname(hostname) - - # Ensure Gradio runs and prints the correct local IP - print(f"Running on local URL: http://{local_ip}:7860") - print("Running on local URL: http://localhost:7860") - - # Launch Gradio app - demo.launch(server_name="0.0.0.0", server_port=7860, share=args.share) - - - - - -# Check if running in headless mode -if args.headless: - # Use args.custom_model_url as the custom model URL if it was provided - custom_model_url = args.custom_model_url if args.custom_model_url else None - - if not args.ebook: - print("Error: In headless mode, you must specify an ebook file using --ebook.") - exit(1) - - ebook_file_path = args.ebook - target_voice = args.voice if args.voice else None - custom_model = None - - if args.use_custom_model: - # Check if custom_model_url is provided - if args.custom_model_url: - # Download the custom model from the provided URL - custom_model_url = args.custom_model_url - else: - # If no URL is provided, ensure all custom model files are provided - if not args.custom_model or not args.custom_config or not args.custom_vocab: - print("Error: You must provide either a --custom_model_url or all of the following arguments:") - print("--custom_model, --custom_config, and --custom_vocab") - exit(1) - else: - # Assign the custom model files - custom_model = { - 'model': args.custom_model, - 'config': args.custom_config, - 'vocab': args.custom_vocab - } - - - - # Headless execution - convert_ebook_to_audio(ebook_file_path, target_voice, args.language, args.use_custom_model, args.custom_model, args.custom_config, args.custom_vocab, args.temperature, args.length_penalty, args.repetition_penalty, args.top_k, args.top_p, args.speed, args.enable_text_splitting, custom_model_url) - - -else: - # Launch Gradio UI - run_gradio_interface() diff --git a/legacy/v1.0/default_voice.wav b/legacy/v1.0/default_voice.wav deleted file mode 100644 index d98ca272441703d70a195f2c098a78a4ff6f100e..0000000000000000000000000000000000000000 Binary files a/legacy/v1.0/default_voice.wav and /dev/null differ diff --git a/legacy/v1.0/demo_mini_story_chapters_Drew.epub b/legacy/v1.0/demo_mini_story_chapters_Drew.epub deleted file mode 100644 index 
6f39e5a56e05f80fdca14bdb89431776461178dc..0000000000000000000000000000000000000000 Binary files a/legacy/v1.0/demo_mini_story_chapters_Drew.epub and /dev/null differ diff --git a/legacy/v1.0/demo_web_gui.gif b/legacy/v1.0/demo_web_gui.gif deleted file mode 100644 index f49cd5b6cd3c9cf9d3a1d7cdba68348bdba53bca..0000000000000000000000000000000000000000 --- a/legacy/v1.0/demo_web_gui.gif +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:00f590c76e206a1833778bad3c0ec9e698399825e566d6609ccac9821f5d5f55 -size 8545568 diff --git a/legacy/v1.0/legacy/custom_model_ebook2audiobookXTTS.py b/legacy/v1.0/legacy/custom_model_ebook2audiobookXTTS.py deleted file mode 100644 index 9664f1d0c983f708dc481c67c2199c8d5a35bd01..0000000000000000000000000000000000000000 --- a/legacy/v1.0/legacy/custom_model_ebook2audiobookXTTS.py +++ /dev/null @@ -1,484 +0,0 @@ -print("starting...") - -import os -import shutil -import subprocess -import re -from pydub import AudioSegment -import tempfile -from pydub import AudioSegment -import os -import nltk -from nltk.tokenize import sent_tokenize -import sys -import torch -from TTS.api import TTS -from TTS.tts.configs.xtts_config import XttsConfig -from TTS.tts.models.xtts import Xtts -from tqdm import tqdm - -nltk.download('punkt') # Make sure to download the necessary models -def is_folder_empty(folder_path): - if os.path.exists(folder_path) and os.path.isdir(folder_path): - # List directory contents - if not os.listdir(folder_path): - return True # The folder is empty - else: - return False # The folder is not empty - else: - print(f"The path {folder_path} is not a valid folder.") - return None # The path is not a valid folder - -def remove_folder_with_contents(folder_path): - try: - shutil.rmtree(folder_path) - print(f"Successfully removed {folder_path} and all of its contents.") - except Exception as e: - print(f"Error removing {folder_path}: {e}") - - - - -def wipe_folder(folder_path): - # Check if the folder exists - if not os.path.exists(folder_path): - print(f"The folder {folder_path} does not exist.") - return - - # Iterate over all the items in the given folder - for item in os.listdir(folder_path): - item_path = os.path.join(folder_path, item) - # If it's a file, remove it and print a message - if os.path.isfile(item_path): - os.remove(item_path) - print(f"Removed file: {item_path}") - # If it's a directory, remove it recursively and print a message - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - print(f"Removed directory and its contents: {item_path}") - - print(f"All contents wiped from {folder_path}.") - - -# Example usage -# folder_to_wipe = 'path_to_your_folder' -# wipe_folder(folder_to_wipe) - - -def create_m4b_from_chapters(input_dir, ebook_file, output_dir): - # Function to sort chapters based on their numeric order - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - # Extract metadata and cover image from the eBook file - def extract_metadata_and_cover(ebook_path): - try: - cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg' - subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True) - if os.path.exists(cover_path): - return cover_path - except Exception as e: - print(f"Error extracting eBook metadata or cover: {e}") - return None - # Combine WAV files into a single file - def combine_wav_files(chapter_files, output_path): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # 
Sequentially append each file to the combined_audio - for chapter_file in chapter_files: - audio_segment = AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - # Ensure the output directory exists - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - - - # Main logic - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - # Cleanup - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Example usage -# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir') - - - - - - -# This block is not the chapter-extraction code itself; it runs beforehand to build the chapter-labeled book with Calibre, as a fallback for systems that have trouble with that step. The chapter-extraction code that uses BookNLP follows below. -import os -import subprocess -import ebooklib -from ebooklib import epub -from bs4 import BeautifulSoup -import re -import csv -import nltk - -# Create the chapter-labeled book from the input eBook -def create_chapter_labeled_book(ebook_file_path): - # Function to ensure the existence of a directory - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - # Convert the ebook to EPUB format using Calibre's ebook-convert - try: - subprocess.run(['ebook-convert', input_path, output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return 
False - return True - - def save_chapters_as_text(epub_path): - # Create the directory if it doesn't exist - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - # Open the EPUB file - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - # Iterate through the items in the EPUB file - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - # Use BeautifulSoup to parse HTML content - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - # Check if the text is not empty - if text.strip(): - if len(text) < 2300 and previous_filename: - # Append text to the previous chapter if it's short - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - # Create a new chapter file and increment the counter - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - folder_path = os.path.join(".", "Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and sort them by chapter number - files = [f for f in os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as 
outfile: # Specify UTF-8 encoding here - for i, filename in enumerate(sorted_files): - with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -import os -import subprocess -import sys -import torchaudio - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment -# Assuming split_long_sentence and wipe_folder are defined elsewhere in your code - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -def split_long_sentence(sentence, max_length=249, max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param max_length: Maximum allowed length of a sentence. - :param max_pauses: Maximum allowed number of pauses in a sentence. - :return: A list of sentence parts that meet the criteria. - """ - parts = [] - while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' 
and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS -def convert_chapters_to_audio(chapters_dir, output_audio_dir, target_voice_path=None, language=None, custom_model=None): - if custom_model: - print("Loading custom model...") - config = XttsConfig() - config.load_json(custom_model['config']) - model = Xtts.init_from_config(config) - model.load_checkpoint(config, checkpoint_path=custom_model['model'], vocab_path=custom_model['vocab'], use_deepspeed=False) - model.to(device) - print("Computing speaker latents...") - gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[target_voice_path]) - else: - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) - temp_audio_directory = os.path.join(".", "Working_files", "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = split_long_sentence(sentence, max_length=249 if language == "en" else 213, max_pauses=10) - for fragment in fragments: - if fragment != "": - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - if custom_model: - out = model.inference(fragment, language, gpt_cond_latent, speaker_embedding, temperature=0.7) - torchaudio.save(fragment_file_path, torch.tensor(out["wav"]).unsqueeze(0), 24000) - else: - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - -# Main execution flow -if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python script.py [target_voice_file_path] [language] [custom_model_path] [custom_config_path] [custom_vocab_path]") - sys.exit(1) - - ebook_file_path = sys.argv[1] 
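To illustrate how `split_long_sentence` above behaves (an editorial example, not part of the original file): each emitted part stays within `max_length`, split at the last comma, semicolon, or period before the limit, or hard-split at the limit when no pause punctuation is available.

```python
# Illustrative usage of split_long_sentence with its defaults.
sentence = "This clause runs on, and then another clause follows; " * 12
parts = split_long_sentence(sentence, max_length=249, max_pauses=10)
for part in parts:
    # Each emitted part is at most 249 characters; splitting continues
    # until the remaining text fits both the length and pause limits.
    print(len(part), repr(part[:40]))
```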
- target_voice = sys.argv[2] if len(sys.argv) > 2 else None - language = sys.argv[3] if len(sys.argv) > 3 else None - - custom_model = None - if len(sys.argv) > 6: - custom_model = { - 'model': sys.argv[4], - 'config': sys.argv[5], - 'vocab': sys.argv[6] - } - - if not calibre_installed(): - sys.exit(1) - - working_files = os.path.join(".", "Working_files", "temp_ebook") - full_folder_working_files = os.path.join(".", "Working_files") - chapters_directory = os.path.join(".", "Working_files", "temp_ebook") - output_audio_directory = os.path.join(".", 'Chapter_wav_files') - - print("Wiping and removing Working_files folder...") - remove_folder_with_contents(full_folder_working_files) - - print("Wiping and removing chapter_wav_files folder...") - remove_folder_with_contents(output_audio_directory) - - create_chapter_labeled_book(ebook_file_path) - audiobook_output_path = os.path.join(".", "Audiobooks") - print(f"{chapters_directory}||||{output_audio_directory}|||||{target_voice}") - convert_chapters_to_audio(chapters_directory, output_audio_directory, target_voice, language, custom_model) - create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path) diff --git a/legacy/v1.0/legacy/custom_model_ebook2audiobookXTTS_gradio.py b/legacy/v1.0/legacy/custom_model_ebook2audiobookXTTS_gradio.py deleted file mode 100644 index 3d99f35f3a03cefc7096e881ca09b6b48c3f2812..0000000000000000000000000000000000000000 --- a/legacy/v1.0/legacy/custom_model_ebook2audiobookXTTS_gradio.py +++ /dev/null @@ -1,609 +0,0 @@ -print("starting...") - -import os -import shutil -import subprocess -import re -from pydub import AudioSegment -import tempfile -from pydub import AudioSegment -import os -import nltk -from nltk.tokenize import sent_tokenize -import sys -import torch -from TTS.api import TTS -from TTS.tts.configs.xtts_config import XttsConfig -from TTS.tts.models.xtts import Xtts -from tqdm import tqdm - -nltk.download('punkt') # Make sure to download the necessary models - -import gradio as gr -from gradio import Progress - - -def is_folder_empty(folder_path): - if os.path.exists(folder_path) and os.path.isdir(folder_path): - # List directory contents - if not os.listdir(folder_path): - return True # The folder is empty - else: - return False # The folder is not empty - else: - print(f"The path {folder_path} is not a valid folder.") - return None # The path is not a valid folder - -def remove_folder_with_contents(folder_path): - try: - shutil.rmtree(folder_path) - print(f"Successfully removed {folder_path} and all of its contents.") - except Exception as e: - print(f"Error removing {folder_path}: {e}") - - - - -def wipe_folder(folder_path): - # Check if the folder exists - if not os.path.exists(folder_path): - print(f"The folder {folder_path} does not exist.") - return - - # Iterate over all the items in the given folder - for item in os.listdir(folder_path): - item_path = os.path.join(folder_path, item) - # If it's a file, remove it and print a message - if os.path.isfile(item_path): - os.remove(item_path) - print(f"Removed file: {item_path}") - # If it's a directory, remove it recursively and print a message - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - print(f"Removed directory and its contents: {item_path}") - - print(f"All contents wiped from {folder_path}.") - - -# Example usage -# folder_to_wipe = 'path_to_your_folder' -# wipe_folder(folder_to_wipe) - - -def create_m4b_from_chapters(input_dir, ebook_file, output_dir): - # Function to sort chapters based on their 
numeric order - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - # Extract metadata and cover image from the eBook file - def extract_metadata_and_cover(ebook_path): - try: - cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg' - subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True) - if os.path.exists(cover_path): - return cover_path - except Exception as e: - print(f"Error extracting eBook metadata or cover: {e}") - return None - # Combine WAV files into a single file - def combine_wav_files(chapter_files, output_path): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Sequentially append each file to the combined_audio - for chapter_file in chapter_files: - audio_segment = AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - # Ensure the output directory exists - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - - - # Main logic - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - # Cleanup - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Example usage -# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir') - - - - - - -# This block is not the chapter-extraction code itself; it runs beforehand to build the chapter-labeled book with Calibre, as a fallback for systems that have trouble with that step. The chapter-extraction code that uses BookNLP follows below. -import os -import subprocess -import ebooklib -from ebooklib import epub -from bs4 import BeautifulSoup -import re -import 
csv -import nltk - -# Create the chapter-labeled book from the input eBook -def create_chapter_labeled_book(ebook_file_path): - # Function to ensure the existence of a directory - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - # Convert the ebook to EPUB format using Calibre's ebook-convert - try: - subprocess.run(['ebook-convert', input_path, output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return False - return True - - def save_chapters_as_text(epub_path): - # Create the directory if it doesn't exist - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - # Open the EPUB file - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - # Iterate through the items in the EPUB file - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - # Use BeautifulSoup to parse HTML content - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - # Check if the text is not empty - if text.strip(): - if len(text) < 2300 and previous_filename: - # Append text to the previous chapter if it's short - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - # Create a new chapter file and increment the counter - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - folder_path = os.path.join(".", 
"Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and sort them by chapter number - files = [f for f in os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as outfile: # Specify UTF-8 encoding here - for i, filename in enumerate(sorted_files): - with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -import os -import subprocess -import sys -import torchaudio - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment -# Assuming split_long_sentence and wipe_folder are defined elsewhere in your code - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -def split_long_sentence(sentence, max_length=249, max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param max_length: Maximum allowed length of a sentence. - :param max_pauses: Maximum allowed number of pauses in a sentence. 
- :return: A list of sentence parts that meet the criteria. - """ - parts = [] - while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS - -def convert_chapters_to_audio_custom_model(chapters_dir, output_audio_dir, target_voice_path=None, language=None, custom_model=None): - if custom_model: - print("Loading custom model...") - config = XttsConfig() - config.load_json(custom_model['config']) - model = Xtts.init_from_config(config) - model.load_checkpoint(config, checkpoint_path=custom_model['model'], vocab_path=custom_model['vocab'], use_deepspeed=False) - model.to(device) - print("Computing speaker latents...") - gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[target_voice_path]) - else: - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) - temp_audio_directory = os.path.join(".", "Working_files", "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = split_long_sentence(sentence, max_length=249 if language == "en" else 213, max_pauses=10) - for fragment in fragments: - if fragment != "": - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - if custom_model: - out = model.inference(fragment, language, gpt_cond_latent, speaker_embedding, temperature=0.7) - torchaudio.save(fragment_file_path, torch.tensor(out["wav"]).unsqueeze(0), 24000) - else: - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter 
{chapter_num} to audio.") - - - -def convert_chapters_to_audio_standard_model(chapters_dir, output_audio_dir, target_voice_path=None, language=None): - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) - temp_audio_directory = os.path.join(".", "Working_files", "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = split_long_sentence(sentence, max_length=249 if language == "en" else 213, max_pauses=10) - for fragment in fragments: - if fragment != "": - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -# Define the functions to be used in the Gradio interface -def convert_ebook_to_audio(ebook_file, target_voice_file, language, use_custom_model, custom_model_file, custom_config_file, custom_vocab_file, progress=gr.Progress()): - ebook_file_path = ebook_file.name - target_voice = target_voice_file.name if target_voice_file else None - custom_model = None - if use_custom_model and custom_model_file and custom_config_file and custom_vocab_file: - custom_model = { - 'model': custom_model_file.name, - 'config': custom_config_file.name, - 'vocab': custom_vocab_file.name - } - - try: - progress(0, desc="Starting conversion") - except Exception as e: - print(f"Error updating progress: {e}") - - if not calibre_installed(): - return "Calibre is not installed." 
- - working_files = os.path.join(".", "Working_files", "temp_ebook") - full_folder_working_files = os.path.join(".", "Working_files") - chapters_directory = os.path.join(".", "Working_files", "temp_ebook") - output_audio_directory = os.path.join(".", 'Chapter_wav_files') - remove_folder_with_contents(full_folder_working_files) - remove_folder_with_contents(output_audio_directory) - - try: - progress(0.1, desc="Creating chapter-labeled book") - except Exception as e: - print(f"Error updating progress: {e}") - - create_chapter_labeled_book(ebook_file_path) - audiobook_output_path = os.path.join(".", "Audiobooks") - - try: - progress(0.3, desc="Converting chapters to audio") - except Exception as e: - print(f"Error updating progress: {e}") - - if use_custom_model: - convert_chapters_to_audio_custom_model(chapters_directory, output_audio_directory, target_voice, language, custom_model) - else: - convert_chapters_to_audio_standard_model(chapters_directory, output_audio_directory, target_voice, language) - - try: - progress(0.9, desc="Creating M4B from chapters") - except Exception as e: - print(f"Error updating progress: {e}") - - create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path) - - # Get the name of the created M4B file - m4b_filename = os.path.splitext(os.path.basename(ebook_file_path))[0] + '.m4b' - m4b_filepath = os.path.join(audiobook_output_path, m4b_filename) - - try: - progress(1.0, desc="Conversion complete") - except Exception as e: - print(f"Error updating progress: {e}") - - return f"Audiobook created at {m4b_filepath}", m4b_filepath - -language_options = [ - "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko" -] - -theme = gr.themes.Soft( - primary_hue="blue", - secondary_hue="blue", - neutral_hue="blue", - text_size=gr.themes.sizes.text_md, -) - -with gr.Blocks(theme=theme) as demo: - gr.Markdown( - """ - # eBook to Audiobook Converter - - Transform your eBooks into immersive audiobooks with optional custom TTS models. 
- """ - ) - - with gr.Row(): - with gr.Column(scale=3): - ebook_file = gr.File(label="eBook File") - target_voice_file = gr.File(label="Target Voice File (Optional)") - language = gr.Dropdown(label="Language", choices=language_options, value="en") - - with gr.Column(scale=3): - use_custom_model = gr.Checkbox(label="Use Custom Model") - custom_model_file = gr.File(label="Custom Model File (Optional)", visible=False) - custom_config_file = gr.File(label="Custom Config File (Optional)", visible=False) - custom_vocab_file = gr.File(label="Custom Vocab File (Optional)", visible=False) - - convert_btn = gr.Button("Convert to Audiobook", variant="primary") - output = gr.Textbox(label="Conversion Status") - audio_player = gr.Audio(label="Audiobook Player", type="filepath") - - convert_btn.click( - convert_ebook_to_audio, - inputs=[ebook_file, target_voice_file, language, use_custom_model, custom_model_file, custom_config_file, custom_vocab_file], - outputs=[output, audio_player] - ) - - use_custom_model.change( - lambda x: [gr.update(visible=x)] * 3, - inputs=[use_custom_model], - outputs=[custom_model_file, custom_config_file, custom_vocab_file] - ) - -demo.launch(share=False) diff --git a/legacy/v1.0/legacy/custom_model_ebook2audiobookXTTS_with_link_gradio.py b/legacy/v1.0/legacy/custom_model_ebook2audiobookXTTS_with_link_gradio.py deleted file mode 100644 index 35de85ea960b6b88b7af5da6ed10b3415ed00c91..0000000000000000000000000000000000000000 --- a/legacy/v1.0/legacy/custom_model_ebook2audiobookXTTS_with_link_gradio.py +++ /dev/null @@ -1,700 +0,0 @@ -print("starting...") - -import os -import shutil -import subprocess -import re -from pydub import AudioSegment -import tempfile -from pydub import AudioSegment -import os -import nltk -from nltk.tokenize import sent_tokenize -import sys -import torch -from TTS.api import TTS -from TTS.tts.configs.xtts_config import XttsConfig -from TTS.tts.models.xtts import Xtts -from tqdm import tqdm -import gradio as gr -from gradio import Progress -import urllib.request -import zipfile - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -print(f"Device selected is: {device}") - -nltk.download('punkt') # Make sure to download the necessary models - - -def download_and_extract_zip(url, extract_to='.'): - try: - # Ensure the directory exists - os.makedirs(extract_to, exist_ok=True) - - zip_path = os.path.join(extract_to, 'model.zip') - - # Download with progress bar - with tqdm(unit='B', unit_scale=True, miniters=1, desc="Downloading Model") as t: - def reporthook(blocknum, blocksize, totalsize): - t.total = totalsize - t.update(blocknum * blocksize - t.n) - - urllib.request.urlretrieve(url, zip_path, reporthook=reporthook) - print(f"Downloaded zip file to {zip_path}") - - # Unzipping with progress bar - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - files = zip_ref.namelist() - with tqdm(total=len(files), unit="file", desc="Extracting Files") as t: - for file in files: - if not file.endswith('/'): # Skip directories - # Extract the file to the temporary directory - extracted_path = zip_ref.extract(file, extract_to) - # Move the file to the base directory - base_file_path = os.path.join(extract_to, os.path.basename(file)) - os.rename(extracted_path, base_file_path) - t.update(1) - - # Cleanup: Remove the ZIP file and any empty folders - os.remove(zip_path) - for root, dirs, files in os.walk(extract_to, topdown=False): - for name in dirs: - os.rmdir(os.path.join(root, name)) - print(f"Extracted files to {extract_to}") - - # Check if all 
required files are present - required_files = ['model.pth', 'config.json', 'vocab.json_'] - missing_files = [file for file in required_files if not os.path.exists(os.path.join(extract_to, file))] - - if not missing_files: - print("All required files (model.pth, config.json, vocab.json_) found.") - else: - print(f"Missing files: {', '.join(missing_files)}") - - except Exception as e: - print(f"Failed to download or extract zip file: {e}") - - - -def is_folder_empty(folder_path): - if os.path.exists(folder_path) and os.path.isdir(folder_path): - # List directory contents - if not os.listdir(folder_path): - return True # The folder is empty - else: - return False # The folder is not empty - else: - print(f"The path {folder_path} is not a valid folder.") - return None # The path is not a valid folder - -def remove_folder_with_contents(folder_path): - try: - shutil.rmtree(folder_path) - print(f"Successfully removed {folder_path} and all of its contents.") - except Exception as e: - print(f"Error removing {folder_path}: {e}") - - - - -def wipe_folder(folder_path): - # Check if the folder exists - if not os.path.exists(folder_path): - print(f"The folder {folder_path} does not exist.") - return - - # Iterate over all the items in the given folder - for item in os.listdir(folder_path): - item_path = os.path.join(folder_path, item) - # If it's a file, remove it and print a message - if os.path.isfile(item_path): - os.remove(item_path) - print(f"Removed file: {item_path}") - # If it's a directory, remove it recursively and print a message - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - print(f"Removed directory and its contents: {item_path}") - - print(f"All contents wiped from {folder_path}.") - - -# Example usage -# folder_to_wipe = 'path_to_your_folder' -# wipe_folder(folder_to_wipe) - - -def create_m4b_from_chapters(input_dir, ebook_file, output_dir): - # Function to sort chapters based on their numeric order - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - # Extract metadata and cover image from the eBook file - def extract_metadata_and_cover(ebook_path): - try: - cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg' - subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True) - if os.path.exists(cover_path): - return cover_path - except Exception as e: - print(f"Error extracting eBook metadata or cover: {e}") - return None - # Combine WAV files into a single file - def combine_wav_files(chapter_files, output_path): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Sequentially append each file to the combined_audio - for chapter_file in chapter_files: - audio_segment = AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, 
metadata_file, cover_image, output_m4b):
-        # Ensure the output directory exists
-        os.makedirs(os.path.dirname(output_m4b), exist_ok=True)
-
-        ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file]
-        if cover_image:
-            ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v']
-        else:
-            ffmpeg_cmd += ['-map', '0:a']
-
-        ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k']
-        if cover_image:
-            ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic']
-        ffmpeg_cmd += [output_m4b]
-
-        subprocess.run(ffmpeg_cmd, check=True)
-
-    # Main logic
-    chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key)
-    temp_dir = tempfile.gettempdir()
-    temp_combined_wav = os.path.join(temp_dir, 'combined.wav')
-    metadata_file = os.path.join(temp_dir, 'metadata.txt')
-    cover_image = extract_metadata_and_cover(ebook_file)
-    output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b')
-
-    combine_wav_files(chapter_files, temp_combined_wav)
-    generate_ffmpeg_metadata(chapter_files, metadata_file)
-    create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b)
-
-    # Cleanup
-    if os.path.exists(temp_combined_wav):
-        os.remove(temp_combined_wav)
-    if os.path.exists(metadata_file):
-        os.remove(metadata_file)
-    if cover_image and os.path.exists(cover_image):
-        os.remove(cover_image)
-
-# Example usage
-# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir')
-
-
-# The code below is not the text-extraction step itself; it runs first, building the
-# chapter-labeled copy of the book with Calibre. It is kept here as a fallback because
-# some systems have trouble with this step. The text-extraction code that uses BookNLP
-# follows after this section.
-import os
-import subprocess
-import ebooklib
-from ebooklib import epub
-from bs4 import BeautifulSoup
-import re
-import csv
-import nltk
-
-# Build the chapter-labeled book from the source eBook
-def create_chapter_labeled_book(ebook_file_path):
-    # Function to ensure the existence of a directory
-    def ensure_directory(directory_path):
-        if not os.path.exists(directory_path):
-            os.makedirs(directory_path)
-            print(f"Created directory: {directory_path}")
-
-    ensure_directory(os.path.join(".", 'Working_files', 'Book'))
-
-    def convert_to_epub(input_path, output_path):
-        # Convert the ebook to EPUB format using Calibre's ebook-convert
-        try:
-            subprocess.run(['ebook-convert', input_path, output_path], check=True)
-        except subprocess.CalledProcessError as e:
-            print(f"An error occurred while converting the eBook: {e}")
-            return False
-        return True
-
-    def save_chapters_as_text(epub_path):
-        # Create the directory if it doesn't exist
-        directory = os.path.join(".", "Working_files", "temp_ebook")
-        ensure_directory(directory)
-
-        # Open the EPUB file
-        book = epub.read_epub(epub_path)
-
-        previous_chapter_text = ''
-        previous_filename = ''
-        chapter_counter = 0
-
-        # Iterate through the items in the EPUB file
-        for item in book.get_items():
-            if item.get_type() == ebooklib.ITEM_DOCUMENT:
-                # Use BeautifulSoup to parse HTML content
-                soup = BeautifulSoup(item.get_content(), 'html.parser')
-                text = soup.get_text()
-
-                # Check if the text is not empty
-                if text.strip():
-                    if len(text) < 2300 and previous_filename:
-                        # Append text to the previous chapter if it's short
-                        with open(previous_filename, 'a', encoding='utf-8') as file:
-                            file.write('\n' + text)
-                    else:
-                        # Create a new chapter file and increment the counter
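For reference, the ffmpeg invocation assembled by `create_m4b` maps audio from the combined WAV, chapter markers from the FFMETADATA text file, and the extracted cover as attached art; a minimal standalone sketch with illustrative paths:

```python
import subprocess

# Inputs: 0 = combined audio, 1 = FFMETADATA chapter file, 2 = cover image.
cmd = [
    'ffmpeg', '-i', 'combined.wav', '-i', 'metadata.txt', '-i', 'cover.jpg',
    '-map', '0:a', '-map', '2:v',      # audio stream from input 0, cover from input 2
    '-map_metadata', '1',              # chapter markers from the metadata file
    '-c:a', 'aac', '-b:a', '192k',     # encode audio as 192 kbps AAC
    '-c:v', 'png', '-disposition:v', 'attached_pic',  # embed the cover art
    'output.m4b',
]
subprocess.run(cmd, check=True)
```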
- previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - folder_path = os.path.join(".", "Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and sort them by chapter number - files = [f for f in os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as outfile: # Specify UTF-8 encoding here - for i, filename in enumerate(sorted_files): - with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -import os -import subprocess -import sys -import torchaudio - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, 
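Because `combine_chapters` separates chapters with the `NEWCHAPTERABC` marker, downstream code can recover the chapter boundaries from `Chapter_Book.txt` by splitting on it; a small sketch:

```python
import os

# Split the combined book text back into chapters on the marker that
# combine_chapters() inserts between files.
book_path = os.path.join(".", "Working_files", "Book", "Chapter_Book.txt")
with open(book_path, encoding='utf-8') as f:
    chapters = [c.strip() for c in f.read().split("NEWCHAPTERABC") if c.strip()]
print(f"Recovered {len(chapters)} chapters")
```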
stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment -# Assuming split_long_sentence and wipe_folder are defined elsewhere in your code - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -def split_long_sentence(sentence, max_length=249, max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param max_length: Maximum allowed length of a sentence. - :param max_pauses: Maximum allowed number of pauses in a sentence. - :return: A list of sentence parts that meet the criteria. - """ - parts = [] - while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' 
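Note that `combine_wav_files` sorts by concatenating every digit found in the full path, which also sweeps up digits that happen to appear in directory names; a more explicit key (a sketch, not the script's own helper) reads only the numeric file stem:

```python
import os
import re

def wav_sort_key(path):
    # Key on the numeric stem of the file name only (e.g. "12.wav" -> 12),
    # ignoring any digits in the directory portion of the path.
    match = re.search(r'(\d+)\.wav$', os.path.basename(path))
    return int(match.group(1)) if match else 0

files = ["Working_files/temp/10.wav", "Working_files/temp/2.wav"]
print(sorted(files, key=wav_sort_key))  # numeric order: 2.wav before 10.wav
```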
and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS - -def convert_chapters_to_audio_custom_model(chapters_dir, output_audio_dir, target_voice_path=None, language=None, custom_model=None): - - if target_voice_path==None: - target_voice_path = default_target_voice_path - - if custom_model: - print("Loading custom model...") - config = XttsConfig() - config.load_json(custom_model['config']) - model = Xtts.init_from_config(config) - model.load_checkpoint(config, checkpoint_path=custom_model['model'], vocab_path=custom_model['vocab'], use_deepspeed=False) - model.to(device) - print("Computing speaker latents...") - gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[target_voice_path]) - else: - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) - temp_audio_directory = os.path.join(".", "Working_files", "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = split_long_sentence(sentence, max_length=249 if language == "en" else 213, max_pauses=10) - for fragment in fragments: - if fragment != "": - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - if custom_model: - out = model.inference(fragment, language, gpt_cond_latent, speaker_embedding, temperature=0.7) - torchaudio.save(fragment_file_path, torch.tensor(out["wav"]).unsqueeze(0), 24000) - else: - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -def convert_chapters_to_audio_standard_model(chapters_dir, output_audio_dir, target_voice_path=None, language=None): - selected_tts_model = 
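A worked example makes `split_long_sentence` concrete (assuming the function defined above is in scope; the sample text is illustrative):

```python
# The function splits at the last comma/semicolon/period that falls
# before max_length, so fragments end at natural pauses where possible.
sentence = ("This is a deliberately long sentence, padded with clauses, "
            "so that it exceeds the limit and must be split, preferably "
            "at a pause rather than mid-word.")
for part in split_long_sentence(sentence, max_length=60, max_pauses=2):
    print(len(part), part)
```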
"tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) - temp_audio_directory = os.path.join(".", "Working_files", "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = split_long_sentence(sentence, max_length=249 if language == "en" else 213, max_pauses=10) - for fragment in fragments: - if fragment != "": - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -# Define the functions to be used in the Gradio interface -def convert_ebook_to_audio(ebook_file, target_voice_file, language, use_custom_model, custom_model_file, custom_config_file, custom_vocab_file, custom_model_url=None, progress=gr.Progress()): - ebook_file_path = ebook_file.name - target_voice = target_voice_file.name if target_voice_file else None - custom_model = None - - - working_files = os.path.join(".", "Working_files", "temp_ebook") - full_folder_working_files = os.path.join(".", "Working_files") - chapters_directory = os.path.join(".", "Working_files", "temp_ebook") - output_audio_directory = os.path.join(".", 'Chapter_wav_files') - remove_folder_with_contents(full_folder_working_files) - remove_folder_with_contents(output_audio_directory) - - if use_custom_model and custom_model_file and custom_config_file and custom_vocab_file: - custom_model = { - 'model': custom_model_file.name, - 'config': custom_config_file.name, - 'vocab': custom_vocab_file.name - } - if use_custom_model and custom_model_url: - print(f"Received custom model URL: {custom_model_url}") - download_dir = os.path.join(".", "Working_files", "custom_model") - download_and_extract_zip(custom_model_url, download_dir) - custom_model = { - 'model': os.path.join(download_dir, 'model.pth'), - 'config': os.path.join(download_dir, 'config.json'), - 'vocab': os.path.join(download_dir, 'vocab.json_') - } - - try: - progress(0, desc="Starting conversion") - except Exception as e: - print(f"Error updating progress: {e}") - - if not calibre_installed(): - return "Calibre is not installed." 
- - - try: - progress(0.1, desc="Creating chapter-labeled book") - except Exception as e: - print(f"Error updating progress: {e}") - - create_chapter_labeled_book(ebook_file_path) - audiobook_output_path = os.path.join(".", "Audiobooks") - - try: - progress(0.3, desc="Converting chapters to audio") - except Exception as e: - print(f"Error updating progress: {e}") - - if use_custom_model: - convert_chapters_to_audio_custom_model(chapters_directory, output_audio_directory, target_voice, language, custom_model) - else: - convert_chapters_to_audio_standard_model(chapters_directory, output_audio_directory, target_voice, language) - - try: - progress(0.9, desc="Creating M4B from chapters") - except Exception as e: - print(f"Error updating progress: {e}") - - create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path) - - # Get the name of the created M4B file - m4b_filename = os.path.splitext(os.path.basename(ebook_file_path))[0] + '.m4b' - m4b_filepath = os.path.join(audiobook_output_path, m4b_filename) - - try: - progress(1.0, desc="Conversion complete") - except Exception as e: - print(f"Error updating progress: {e}") - print(f"Audiobook created at {m4b_filepath}") - return f"Audiobook created at {m4b_filepath}", m4b_filepath - - -def list_audiobook_files(audiobook_folder): - # List all files in the audiobook folder - files = [] - for filename in os.listdir(audiobook_folder): - if filename.endswith('.m4b'): # Adjust the file extension as needed - files.append(os.path.join(audiobook_folder, filename)) - return files - -def download_audiobooks(): - audiobook_output_path = os.path.join(".", "Audiobooks") - return list_audiobook_files(audiobook_output_path) - - -language_options = [ - "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko" -] - -theme = gr.themes.Soft( - primary_hue="blue", - secondary_hue="blue", - neutral_hue="blue", - text_size=gr.themes.sizes.text_md, -) - -# Gradio UI setup -with gr.Blocks(theme=theme) as demo: - gr.Markdown( - """ - # eBook to Audiobook Converter - - Transform your eBooks into immersive audiobooks with optional custom TTS models. 
- """ - ) - - with gr.Row(): - with gr.Column(scale=3): - ebook_file = gr.File(label="eBook File") - target_voice_file = gr.File(label="Target Voice File (Optional)") - language = gr.Dropdown(label="Language", choices=language_options, value="en") - - with gr.Column(scale=3): - use_custom_model = gr.Checkbox(label="Use Custom Model") - custom_model_file = gr.File(label="Custom Model File (Optional)", visible=False) - custom_config_file = gr.File(label="Custom Config File (Optional)", visible=False) - custom_vocab_file = gr.File(label="Custom Vocab File (Optional)", visible=False) - custom_model_url = gr.Textbox(label="Custom Model Zip URL (Optional)", visible=False) - - convert_btn = gr.Button("Convert to Audiobook", variant="primary") - output = gr.Textbox(label="Conversion Status") - audio_player = gr.Audio(label="Audiobook Player", type="filepath") - download_btn = gr.Button("Download Audiobook Files") - download_files = gr.File(label="Download Files", interactive=False) - - convert_btn.click( - convert_ebook_to_audio, - inputs=[ebook_file, target_voice_file, language, use_custom_model, custom_model_file, custom_config_file, custom_vocab_file, custom_model_url], - outputs=[output, audio_player] - ) - - use_custom_model.change( - lambda x: [gr.update(visible=x)] * 4, - inputs=[use_custom_model], - outputs=[custom_model_file, custom_config_file, custom_vocab_file, custom_model_url] - ) - - download_btn.click( - download_audiobooks, - outputs=[download_files] - ) - -demo.launch(share=True) diff --git a/legacy/v1.0/legacy/ebook2audiobook.py b/legacy/v1.0/legacy/ebook2audiobook.py deleted file mode 100644 index d224a7d8f2d02aa56539fe8efbbe7b952ab9066e..0000000000000000000000000000000000000000 --- a/legacy/v1.0/legacy/ebook2audiobook.py +++ /dev/null @@ -1,462 +0,0 @@ -print("starting...") - -import os -import shutil -import subprocess -import re -from pydub import AudioSegment -import tempfile -from pydub import AudioSegment -import os -import nltk -from nltk.tokenize import sent_tokenize -nltk.download('punkt') # Make sure to download the necessary models -def is_folder_empty(folder_path): - if os.path.exists(folder_path) and os.path.isdir(folder_path): - # List directory contents - if not os.listdir(folder_path): - return True # The folder is empty - else: - return False # The folder is not empty - else: - print(f"The path {folder_path} is not a valid folder.") - return None # The path is not a valid folder - -def remove_folder_with_contents(folder_path): - try: - shutil.rmtree(folder_path) - print(f"Successfully removed {folder_path} and all of its contents.") - except Exception as e: - print(f"Error removing {folder_path}: {e}") - - - - -def wipe_folder(folder_path): - # Check if the folder exists - if not os.path.exists(folder_path): - print(f"The folder {folder_path} does not exist.") - return - - # Iterate over all the items in the given folder - for item in os.listdir(folder_path): - item_path = os.path.join(folder_path, item) - # If it's a file, remove it and print a message - if os.path.isfile(item_path): - os.remove(item_path) - print(f"Removed file: {item_path}") - # If it's a directory, remove it recursively and print a message - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - print(f"Removed directory and its contents: {item_path}") - - print(f"All contents wiped from {folder_path}.") - - -# Example usage -# folder_to_wipe = 'path_to_your_folder' -# wipe_folder(folder_to_wipe) - - -def create_m4b_from_chapters(input_dir, ebook_file, output_dir): - # Function to sort 
chapters based on their numeric order - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - # Extract metadata and cover image from the eBook file - def extract_metadata_and_cover(ebook_path): - try: - cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg' - subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True) - if os.path.exists(cover_path): - return cover_path - except Exception as e: - print(f"Error extracting eBook metadata or cover: {e}") - return None - # Combine WAV files into a single file - def combine_wav_files(chapter_files, output_path): - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Sequentially append each file to the combined_audio - for chapter_file in chapter_files: - audio_segment = AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - # Export the combined audio to the output file path - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - # Function to generate metadata for M4B chapters - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - # Generate the final M4B file using ffmpeg - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - # Ensure the output directory exists - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - - - # Main logic - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - # Cleanup - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Example usage -# create_m4b_from_chapters('path_to_chapter_wavs', 'path_to_ebook_file', 'path_to_output_dir') - - - - - - -#this code right here isnt the book grabbing thing but its before to refrence in ordero to create the sepecial chapter labeled book thing with calibre idk some systems cant seem to get it so just in case but the next bit of code after this is the book grabbing code with booknlp -import os -import subprocess -import ebooklib -from ebooklib import epub -from bs4 import 
BeautifulSoup -import re -import csv -import nltk - -# Only run the main script if Value is True -def create_chapter_labeled_book(ebook_file_path): - # Function to ensure the existence of a directory - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - # Convert the ebook to EPUB format using Calibre's ebook-convert - try: - subprocess.run(['ebook-convert', input_path, output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return False - return True - - def save_chapters_as_text(epub_path): - # Create the directory if it doesn't exist - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - # Open the EPUB file - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - # Iterate through the items in the EPUB file - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - # Use BeautifulSoup to parse HTML content - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - # Check if the text is not empty - if text.strip(): - if len(text) < 2300 and previous_filename: - # Append text to the previous chapter if it's short - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - # Create a new chapter file and increment the counter - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - with open(previous_filename, 'w', encoding='utf-8') as file: - file.write(text) - print(f"Saved chapter: {previous_filename}") - - # Example usage - input_ebook = ebook_file_path # Replace with your eBook file path - output_epub = os.path.join(".", "Working_files", "temp.epub") - - - if os.path.exists(output_epub): - os.remove(output_epub) - print(f"File {output_epub} has been removed.") - else: - print(f"The file {output_epub} does not exist.") - - if convert_to_epub(input_ebook, output_epub): - save_chapters_as_text(output_epub) - - # Download the necessary NLTK data (if not already present) - nltk.download('punkt') - - def process_chapter_files(folder_path, output_csv): - with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write the header row - writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter']) - - # Process each chapter file - chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0])) - for filename in chapter_files: - if filename.startswith('chapter_') and filename.endswith('.txt'): - chapter_number = int(filename.split('_')[1].split('.')[0]) - file_path = os.path.join(folder_path, filename) - - try: - with open(file_path, 'r', encoding='utf-8') as file: - text = file.read() - # Insert "NEWCHAPTERABC" at the beginning of each chapter's text - if text: - text = "NEWCHAPTERABC" + text - sentences = nltk.tokenize.sent_tokenize(text) - for sentence in sentences: - start_location = text.find(sentence) - end_location = start_location + len(sentence) - writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number]) - except Exception as e: - print(f"Error processing file {filename}: {e}") - - # Example usage - 
folder_path = os.path.join(".", "Working_files", "temp_ebook") - output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv") - - process_chapter_files(folder_path, output_csv) - - def sort_key(filename): - """Extract chapter number for sorting.""" - match = re.search(r'chapter_(\d+)\.txt', filename) - return int(match.group(1)) if match else 0 - - def combine_chapters(input_folder, output_file): - # Create the output folder if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - - # List all txt files and sort them by chapter number - files = [f for f in os.listdir(input_folder) if f.endswith('.txt')] - sorted_files = sorted(files, key=sort_key) - - with open(output_file, 'w', encoding='utf-8') as outfile: # Specify UTF-8 encoding here - for i, filename in enumerate(sorted_files): - with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile: # And here - outfile.write(infile.read()) - # Add the marker unless it's the last file - if i < len(sorted_files) - 1: - outfile.write("\nNEWCHAPTERABC\n") - - # Paths - input_folder = os.path.join(".", 'Working_files', 'temp_ebook') - output_file = os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt') - - - # Combine the chapters - combine_chapters(input_folder, output_file) - - ensure_directory(os.path.join(".", "Working_files", "Book")) - - -#create_chapter_labeled_book() - - - - -import os -import subprocess -import sys -import torchaudio - -# Check if Calibre's ebook-convert tool is installed -def calibre_installed(): - try: - subprocess.run(['ebook-convert', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except FileNotFoundError: - print("Calibre is not installed. Please install Calibre for this functionality.") - return False - - -import os -import torch -from TTS.api import TTS -from nltk.tokenize import sent_tokenize -from pydub import AudioSegment -# Assuming split_long_sentence and wipe_folder are defined elsewhere in your code - -default_target_voice_path = "default_voice.wav" # Ensure this is a valid path -default_language_code = "en" -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -def combine_wav_files(input_directory, output_directory, file_name): - # Ensure that the output directory exists, create it if necessary - os.makedirs(output_directory, exist_ok=True) - - # Specify the output file path - output_file_path = os.path.join(output_directory, file_name) - - # Initialize an empty audio segment - combined_audio = AudioSegment.empty() - - # Get a list of all .wav files in the specified input directory and sort them - input_file_paths = sorted( - [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")], - key=lambda f: int(''.join(filter(str.isdigit, f))) - ) - - # Sequentially append each file to the combined_audio - for input_file_path in input_file_paths: - audio_segment = AudioSegment.from_wav(input_file_path) - combined_audio += audio_segment - - # Export the combined audio to the output file path - combined_audio.export(output_file_path, format='wav') - - print(f"Combined audio saved to {output_file_path}") - -# Function to split long strings into parts -def split_long_sentence(sentence, max_length=249, max_pauses=10): - """ - Splits a sentence into parts based on length or number of pauses without recursion. - - :param sentence: The sentence to split. - :param max_length: Maximum allowed length of a sentence. 
- :param max_pauses: Maximum allowed number of pauses in a sentence. - :return: A list of sentence parts that meet the criteria. - """ - parts = [] - while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses: - possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' and i < max_length] - if possible_splits: - # Find the best place to split the sentence, preferring the last possible split to keep parts longer - split_at = possible_splits[-1] + 1 - else: - # If no punctuation to split on within max_length, split at max_length - split_at = max_length - - # Split the sentence and add the first part to the list - parts.append(sentence[:split_at].strip()) - sentence = sentence[split_at:].strip() - - # Add the remaining part of the sentence - parts.append(sentence) - return parts - -""" -if 'tts' not in locals(): - tts = TTS(selected_tts_model, progress_bar=True).to(device) -""" -from tqdm import tqdm - -# Convert chapters to audio using XTTS -def convert_chapters_to_audio(chapters_dir, output_audio_dir, target_voice_path=None, language=None): - selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2" - tts = TTS(selected_tts_model, progress_bar=False).to(device) # Set progress_bar to False to avoid nested progress bars - - if not os.path.exists(output_audio_dir): - os.makedirs(output_audio_dir) - - for chapter_file in sorted(os.listdir(chapters_dir)): - if chapter_file.endswith('.txt'): - # Extract chapter number from the filename - match = re.search(r"chapter_(\d+).txt", chapter_file) - if match: - chapter_num = int(match.group(1)) - else: - print(f"Skipping file {chapter_file} as it does not match the expected format.") - continue - - chapter_path = os.path.join(chapters_dir, chapter_file) - output_file_name = f"audio_chapter_{chapter_num}.wav" - output_file_path = os.path.join(output_audio_dir, output_file_name) - temp_audio_directory = os.path.join(".", "Working_files", "temp") - os.makedirs(temp_audio_directory, exist_ok=True) - temp_count = 0 - - with open(chapter_path, 'r', encoding='utf-8') as file: - chapter_text = file.read() - # Use the specified language model for sentence tokenization - sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english') - for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"): - fragments = [] - if language == "en": - fragments = split_long_sentence(sentence, max_length=249, max_pauses=10) - if language == "it": - fragments = split_long_sentence(sentence, max_length=213, max_pauses=10) - for fragment in fragments: - if fragment != "": #a hot fix to avoid blank fragments - print(f"Generating fragment: {fragment}...") - fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav") - speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path - language_code = language if language else default_language_code - tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code) - temp_count += 1 - - combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name) - wipe_folder(temp_audio_directory) - print(f"Converted chapter {chapter_num} to audio.") - - - -# Main execution flow -if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python script.py [target_voice_file_path]") - sys.exit(1) - - ebook_file_path = sys.argv[1] - target_voice = sys.argv[2] if len(sys.argv) > 2 else None - language = sys.argv[3] if len(sys.argv) 
> 3 else None
-
-    if not calibre_installed():
-        sys.exit(1)
-
-    working_files = os.path.join(".", "Working_files", "temp_ebook")
-    full_folder_working_files = os.path.join(".", "Working_files")
-    chapters_directory = os.path.join(".", "Working_files", "temp_ebook")
-    output_audio_directory = os.path.join(".", 'Chapter_wav_files')
-
-    print("Wiping and removing the Working_files folder...")
-    remove_folder_with_contents(full_folder_working_files)
-
-    print("Wiping and removing the Chapter_wav_files folder...")
-    remove_folder_with_contents(output_audio_directory)
-
-    create_chapter_labeled_book(ebook_file_path)
-    audiobook_output_path = os.path.join(".", "Audiobooks")
-    print(f"Chapters dir: {chapters_directory} | Audio out: {output_audio_directory} | Voice: {target_voice}")
-    convert_chapters_to_audio(chapters_directory, output_audio_directory, target_voice, language)
-    create_m4b_from_chapters(output_audio_directory, ebook_file_path, audiobook_output_path)
diff --git a/legacy/v1.0/legacy/gradio_gui_with_email_and_que.py b/legacy/v1.0/legacy/gradio_gui_with_email_and_que.py
deleted file mode 100644
index 90997209b4bfa3addcadf0ef8d96ce41eb6da3b1..0000000000000000000000000000000000000000
--- a/legacy/v1.0/legacy/gradio_gui_with_email_and_que.py
+++ /dev/null
@@ -1,614 +0,0 @@
-print("starting...")
-
-import os
-import re
-import csv
-import sys
-import shutil
-import subprocess
-import tempfile
-import urllib.request
-import zipfile
-import smtplib
-from email.mime.text import MIMEText
-from threading import Lock, Thread
-from queue import Queue
-
-import requests
-import ebooklib
-from ebooklib import epub
-from bs4 import BeautifulSoup
-import nltk
-from nltk.tokenize import sent_tokenize
-from pydub import AudioSegment
-import torch
-import torchaudio
-from TTS.api import TTS
-from TTS.tts.configs.xtts_config import XttsConfig
-from TTS.tts.models.xtts import Xtts
-from tqdm import tqdm
-import gradio as gr
-from gradio import Progress
-
-
-default_target_voice_path = "default_voice.wav"  # Ensure this is a valid path
-default_language_code = "en"
-
-
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print(f"Device selected is: {device}")
-
-nltk.download('punkt')  # Ensure necessary models are downloaded
-
-# Global variables for queue management
-queue = Queue()
-queue_lock = Lock()
-
-# Function to send an email with the download link
-def send_email(to_address, download_link):
-    from_address = "your_email@example.com"  # Replace with your email
-    subject = "Your Audiobook is Ready"
-    body = f"Your audiobook has been processed.
You can download it from the following link: {download_link}" - - msg = MIMEText(body) - msg['Subject'] = subject - msg['From'] = from_address - msg['To'] = to_address - - try: - with smtplib.SMTP('smtp.example.com', 587) as server: # Replace with your SMTP server details - server.starttls() - server.login(from_address, "your_password") # Replace with your email password - server.sendmail(from_address, [to_address], msg.as_string()) - print(f"Email sent to {to_address}") - except Exception as e: - print(f"Failed to send email: {e}") - -# Function to download and extract the custom model -def download_and_extract_zip(url, extract_to='.'): - try: - os.makedirs(extract_to, exist_ok=True) - zip_path = os.path.join(extract_to, 'model.zip') - - with tqdm(unit='B', unit_scale=True, miniters=1, desc="Downloading Model") as t: - def reporthook(blocknum, blocksize, totalsize): - t.total = totalsize - t.update(blocknum * blocksize - t.n) - urllib.request.urlretrieve(url, zip_path, reporthook=reporthook) - print(f"Downloaded zip file to {zip_path}") - - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - files = zip_ref.namelist() - with tqdm(total=len(files), unit="file", desc="Extracting Files") as t: - for file in files: - if not file.endswith('/'): - extracted_path = zip_ref.extract(file, extract_to) - base_file_path = os.path.join(extract_to, os.path.basename(file)) - os.rename(extracted_path, base_file_path) - t.update(1) - - os.remove(zip_path) - for root, dirs, files in os.walk(extract_to, topdown=False): - for name in dirs: - os.rmdir(os.path.join(root, name)) - print(f"Extracted files to {extract_to}") - - required_files = ['model.pth', 'config.json', 'vocab.json_'] - missing_files = [file for file in required_files if not os.path.exists(os.path.join(extract_to, file))] - - if not missing_files: - print("All required files (model.pth, config.json, vocab.json_) found.") - else: - print(f"Missing files: {', '.join(missing_files)}") - - except Exception as e: - print(f"Failed to download or extract zip file: {e}") - -# Function to check if a folder is empty -def is_folder_empty(folder_path): - if os.path.exists(folder_path) and os.path.isdir(folder_path): - return not os.listdir(folder_path) - else: - print(f"The path {folder_path} is not a valid folder.") - return None - -# Function to remove a folder and its contents -def remove_folder_with_contents(folder_path): - try: - shutil.rmtree(folder_path) - print(f"Successfully removed {folder_path} and all of its contents.") - except Exception as e: - print(f"Error removing {folder_path}: {e}") - -# Function to wipe the contents of a folder -def wipe_folder(folder_path): - if not os.path.exists(folder_path): - print(f"The folder {folder_path} does not exist.") - return - - for item in os.listdir(folder_path): - item_path = os.path.join(folder_path, item) - if os.path.isfile(item_path): - os.remove(item_path) - print(f"Removed file: {item_path}") - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - print(f"Removed directory and its contents: {item_path}") - - print(f"All contents wiped from {folder_path}.") - -# Function to create M4B from chapters -def create_m4b_from_chapters(input_dir, ebook_file, output_dir): - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - def extract_metadata_and_cover(ebook_path): - try: - cover_path = ebook_path.rsplit('.', 1)[0] + '.jpg' - subprocess.run(['ebook-meta', ebook_path, '--get-cover', cover_path], check=True) - if 
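Hardcoding the sender address and password in `send_email` is fragile and risky; a common alternative (a sketch, with hypothetical environment variable names) reads the SMTP settings from the environment:

```python
import os
import smtplib
from email.mime.text import MIMEText

# Hypothetical variant of send_email() that pulls SMTP credentials from
# environment variables instead of hardcoding them in source.
def send_email_env(to_address, download_link):
    host = os.environ["SMTP_HOST"]        # e.g. "smtp.example.com"
    user = os.environ["SMTP_USER"]
    password = os.environ["SMTP_PASSWORD"]

    msg = MIMEText(f"Your audiobook is ready: {download_link}")
    msg["Subject"] = "Your Audiobook is Ready"
    msg["From"] = user
    msg["To"] = to_address

    with smtplib.SMTP(host, 587) as server:
        server.starttls()
        server.login(user, password)
        server.sendmail(user, [to_address], msg.as_string())
```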
os.path.exists(cover_path): - return cover_path - except Exception as e: - print(f"Error extracting eBook metadata or cover: {e}") - return None - - def combine_wav_files(chapter_files, output_path): - combined_audio = AudioSegment.empty() - for chapter_file in chapter_files: - audio_segment = AudioSegment.from_wav(chapter_file) - combined_audio += audio_segment - combined_audio.export(output_path, format='wav') - print(f"Combined audio saved to {output_path}") - - def generate_ffmpeg_metadata(chapter_files, metadata_file): - with open(metadata_file, 'w') as file: - file.write(';FFMETADATA1\n') - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - duration_ms = len(AudioSegment.from_wav(chapter_file)) - file.write(f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n') - file.write(f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n') - start_time += duration_ms - - def create_m4b(combined_wav, metadata_file, cover_image, output_m4b): - os.makedirs(os.path.dirname(output_m4b), exist_ok=True) - - ffmpeg_cmd = ['ffmpeg', '-i', combined_wav, '-i', metadata_file] - if cover_image: - ffmpeg_cmd += ['-i', cover_image, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '192k'] - if cover_image: - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] - ffmpeg_cmd += [output_m4b] - - subprocess.run(ffmpeg_cmd, check=True) - - chapter_files = sorted([os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.wav')], key=sort_key) - temp_dir = tempfile.gettempdir() - temp_combined_wav = os.path.join(temp_dir, 'combined.wav') - metadata_file = os.path.join(temp_dir, 'metadata.txt') - cover_image = extract_metadata_and_cover(ebook_file) - output_m4b = os.path.join(output_dir, os.path.splitext(os.path.basename(ebook_file))[0] + '.m4b') - - combine_wav_files(chapter_files, temp_combined_wav) - generate_ffmpeg_metadata(chapter_files, metadata_file) - create_m4b(temp_combined_wav, metadata_file, cover_image, output_m4b) - - if os.path.exists(temp_combined_wav): - os.remove(temp_combined_wav) - if os.path.exists(metadata_file): - os.remove(metadata_file) - if cover_image and os.path.exists(cover_image): - os.remove(cover_image) - -# Function to create chapter-labeled book -def create_chapter_labeled_book(ebook_file_path): - def ensure_directory(directory_path): - if not os.path.exists(directory_path): - os.makedirs(directory_path) - print(f"Created directory: {directory_path}") - - ensure_directory(os.path.join(".", 'Working_files', 'Book')) - - def convert_to_epub(input_path, output_path): - try: - subprocess.run(['ebook-convert', input_path, output_path], check=True) - except subprocess.CalledProcessError as e: - print(f"An error occurred while converting the eBook: {e}") - return False - return True - - def save_chapters_as_text(epub_path): - directory = os.path.join(".", "Working_files", "temp_ebook") - ensure_directory(directory) - - book = epub.read_epub(epub_path) - - previous_chapter_text = '' - previous_filename = '' - chapter_counter = 0 - - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - soup = BeautifulSoup(item.get_content(), 'html.parser') - text = soup.get_text() - - if text.strip(): - if len(text) < 2300 and previous_filename: - with open(previous_filename, 'a', encoding='utf-8') as file: - file.write('\n' + text) - else: - previous_filename = os.path.join(directory, f"chapter_{chapter_counter}.txt") - chapter_counter += 1 - 
-    with open(previous_filename, 'w', encoding='utf-8') as file:
-        file.write(text)
-    print(f"Saved chapter: {previous_filename}")
-
-    input_ebook = ebook_file_path
-    output_epub = os.path.join(".", "Working_files", "temp.epub")
-
-    if os.path.exists(output_epub):
-        os.remove(output_epub)
-        print(f"File {output_epub} has been removed.")
-    else:
-        print(f"The file {output_epub} does not exist.")
-
-    if convert_to_epub(input_ebook, output_epub):
-        save_chapters_as_text(output_epub)
-
-    nltk.download('punkt')
-
-    def process_chapter_files(folder_path, output_csv):
-        with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
-            writer = csv.writer(csvfile)
-            writer.writerow(['Text', 'Start Location', 'End Location', 'Is Quote', 'Speaker', 'Chapter'])
-
-            chapter_files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('_')[1].split('.')[0]))
-            for filename in chapter_files:
-                if filename.startswith('chapter_') and filename.endswith('.txt'):
-                    chapter_number = int(filename.split('_')[1].split('.')[0])
-                    file_path = os.path.join(folder_path, filename)
-
-                    try:
-                        with open(file_path, 'r', encoding='utf-8') as file:
-                            text = file.read()
-                            if text:
-                                text = "NEWCHAPTERABC" + text
-                            sentences = nltk.tokenize.sent_tokenize(text)
-                            for sentence in sentences:
-                                start_location = text.find(sentence)
-                                end_location = start_location + len(sentence)
-                                writer.writerow([sentence, start_location, end_location, 'True', 'Narrator', chapter_number])
-                    except Exception as e:
-                        print(f"Error processing file {filename}: {e}")
-
-    folder_path = os.path.join(".", "Working_files", "temp_ebook")
-    output_csv = os.path.join(".", "Working_files", "Book", "Other_book.csv")
-
-    process_chapter_files(folder_path, output_csv)
-
-    def sort_key(filename):
-        match = re.search(r'chapter_(\d+)\.txt', filename)
-        return int(match.group(1)) if match else 0
-
-    def combine_chapters(input_folder, output_file):
-        os.makedirs(os.path.dirname(output_file), exist_ok=True)
-
-        files = [f for f in os.listdir(input_folder) if f.endswith('.txt')]
-        sorted_files = sorted(files, key=sort_key)
-
-        with open(output_file, 'w', encoding='utf-8') as outfile:
-            for i, filename in enumerate(sorted_files):
-                with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as infile:
-                    outfile.write(infile.read())
-                if i < len(sorted_files) - 1:
-                    outfile.write("\nNEWCHAPTERABC\n")
-
-    input_folder = os.path.join(".", 'Working_files', 'temp_ebook')
-    output_file = os.path.join(".", 'Working_files', 'Book', 'Chapter_Book.txt')
-
-    combine_chapters(input_folder, output_file)
-    ensure_directory(os.path.join(".", "Working_files", "Book"))
-
-# Function to combine WAV files
-def combine_wav_files(input_directory, output_directory, file_name):
-    os.makedirs(output_directory, exist_ok=True)
-    output_file_path = os.path.join(output_directory, file_name)
-    combined_audio = AudioSegment.empty()
-    input_file_paths = sorted(
-        [os.path.join(input_directory, f) for f in os.listdir(input_directory) if f.endswith(".wav")],
-        key=lambda f: int(''.join(filter(str.isdigit, f)))
-    )
-    for input_file_path in input_file_paths:
-        audio_segment = AudioSegment.from_wav(input_file_path)
-        combined_audio += audio_segment
-    combined_audio.export(output_file_path, format='wav')
-    print(f"Combined audio saved to {output_file_path}")
-
-# Function to split long sentences
-def split_long_sentence(sentence, max_length=249, max_pauses=10):
-    parts = []
-    while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses:
-        possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' and i < max_length]
-        if possible_splits:
-            split_at = possible_splits[-1] + 1
-        else:
-            split_at = max_length
-        parts.append(sentence[:split_at].strip())
-        sentence = sentence[split_at:].strip()
-    parts.append(sentence)
-    return parts
-
-# Function to convert chapters to audio using custom model
-def convert_chapters_to_audio_custom_model(chapters_dir, output_audio_dir, target_voice_path=None, language=None, custom_model=None):
-    if target_voice_path is None:
-        target_voice_path = default_target_voice_path
-    if custom_model:
-        print("Loading custom model...")
-        config = XttsConfig()
-        config.load_json(custom_model['config'])
-        model = Xtts.init_from_config(config)
-        model.load_checkpoint(config, checkpoint_path=custom_model['model'], vocab_path=custom_model['vocab'], use_deepspeed=False)
-        model.to(device)  # move the custom model onto the selected device
-        print("Computing speaker latents...")
-        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[target_voice_path])
-    else:
-        selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2"
-        tts = TTS(selected_tts_model, progress_bar=False).to(device)
-
-    if not os.path.exists(output_audio_dir):
-        os.makedirs(output_audio_dir)
-
-    for chapter_file in sorted(os.listdir(chapters_dir)):
-        if chapter_file.endswith('.txt'):
-            match = re.search(r"chapter_(\d+)\.txt", chapter_file)
-            if match:
-                chapter_num = int(match.group(1))
-            else:
-                print(f"Skipping file {chapter_file} as it does not match the expected format.")
-                continue
-
-            chapter_path = os.path.join(chapters_dir, chapter_file)
-            output_file_name = f"audio_chapter_{chapter_num}.wav"
-            output_file_path = os.path.join(output_audio_dir, output_file_name)
-            temp_audio_directory = os.path.join(".", "Working_files", "temp")
-            os.makedirs(temp_audio_directory, exist_ok=True)
-            temp_count = 0
-
-            with open(chapter_path, 'r', encoding='utf-8') as file:
-                chapter_text = file.read()
-                sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english')
-                for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"):
-                    fragments = split_long_sentence(sentence, max_length=249 if language == "en" else 213, max_pauses=10)
-                    for fragment in fragments:
-                        if fragment != "":
-                            print(f"Generating fragment: {fragment}...")
-                            fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav")
-                            if custom_model:
-                                out = model.inference(fragment, language, gpt_cond_latent, speaker_embedding, temperature=0.7)
-                                torchaudio.save(fragment_file_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
-                            else:
-                                speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path
-                                language_code = language if language else default_language_code
-                                tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code)
-                            temp_count += 1
-
-            combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name)
-            wipe_folder(temp_audio_directory)
-            print(f"Converted chapter {chapter_num} to audio.")
-
-# Function to convert chapters to audio using standard model
-def convert_chapters_to_audio_standard_model(chapters_dir, output_audio_dir, target_voice_path=None, language=None):
-    selected_tts_model = "tts_models/multilingual/multi-dataset/xtts_v2"
-    tts = TTS(selected_tts_model, progress_bar=False).to(device)
-
-    if not os.path.exists(output_audio_dir):
-        os.makedirs(output_audio_dir)
-
-    for chapter_file in sorted(os.listdir(chapters_dir)):
-        if chapter_file.endswith('.txt'):
-            match = re.search(r"chapter_(\d+)\.txt", chapter_file)
-            if match:
-                chapter_num = int(match.group(1))
-            else:
-                print(f"Skipping file {chapter_file} as it does not match the expected format.")
-                continue
-
-            chapter_path = os.path.join(chapters_dir, chapter_file)
-            output_file_name = f"audio_chapter_{chapter_num}.wav"
-            output_file_path = os.path.join(output_audio_dir, output_file_name)
-            temp_audio_directory = os.path.join(".", "Working_files", "temp")
-            os.makedirs(temp_audio_directory, exist_ok=True)
-            temp_count = 0
-
-            with open(chapter_path, 'r', encoding='utf-8') as file:
-                chapter_text = file.read()
-                sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english')
-                for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"):
-                    fragments = split_long_sentence(sentence, max_length=249 if language == "en" else 213, max_pauses=10)
-                    for fragment in fragments:
-                        if fragment != "":
-                            print(f"Generating fragment: {fragment}...")
-                            fragment_file_path = os.path.join(temp_audio_directory, f"{temp_count}.wav")
-                            speaker_wav_path = target_voice_path if target_voice_path else default_target_voice_path
-                            language_code = language if language else default_language_code
-                            tts.tts_to_file(text=fragment, file_path=fragment_file_path, speaker_wav=speaker_wav_path, language=language_code)
-                            temp_count += 1
-
-            combine_wav_files(temp_audio_directory, output_audio_dir, output_file_name)
-            wipe_folder(temp_audio_directory)
-            print(f"Converted chapter {chapter_num} to audio.")
-
-# Function to handle the processing of an eBook to an audiobook
-def process_request(ebook_file, target_voice, language, email, use_custom_model, custom_model):
-    working_files = os.path.join(".", "Working_files", "temp_ebook")
-    full_folder_working_files = os.path.join(".", "Working_files")
-    chapters_directory = os.path.join(".", "Working_files", "temp_ebook")
-    output_audio_directory = os.path.join(".", 'Chapter_wav_files')
-    remove_folder_with_contents(full_folder_working_files)
-    remove_folder_with_contents(output_audio_directory)
-
-    create_chapter_labeled_book(ebook_file.name)
-    audiobook_output_path = os.path.join(".", "Audiobooks")
-
-    if use_custom_model:
-        convert_chapters_to_audio_custom_model(chapters_directory, output_audio_directory, target_voice, language, custom_model)
-    else:
-        convert_chapters_to_audio_standard_model(chapters_directory, output_audio_directory, target_voice, language)
-
-    create_m4b_from_chapters(output_audio_directory, ebook_file.name, audiobook_output_path)
-
-    m4b_filepath = os.path.join(audiobook_output_path, os.path.splitext(os.path.basename(ebook_file.name))[0] + '.m4b')
-
-    # Upload the final audiobook to file.io
-    with open(m4b_filepath, 'rb') as f:
-        response = requests.post('https://file.io', files={'file': f})
-        download_link = response.json().get('link', '')
-
-    # Send the download link to the user's email
-    if email and download_link:
-        send_email(email, download_link)
-
-    return download_link
-
-# Function to manage the queue and process each request sequentially
-def handle_queue():
-    while True:
-        ebook_file, target_voice, language, email, use_custom_model, custom_model = queue.get()
-        process_request(ebook_file, target_voice, language, email, use_custom_model, custom_model)
-        queue.task_done()
-
-# Start the queue handler thread
-thread = Thread(target=handle_queue, daemon=True)
-thread.start()
-
-# Gradio function to add a request to the queue
-def enqueue_request(ebook_file, target_voice_file, language, email, use_custom_model, custom_model_file, custom_config_file, custom_vocab_file, custom_model_url=None):
-    target_voice = target_voice_file.name if target_voice_file else None
-    custom_model = None
-
-    if use_custom_model and custom_model_file and custom_config_file and custom_vocab_file:
-        custom_model = {
-            'model': custom_model_file.name,
-            'config': custom_config_file.name,
-            'vocab': custom_vocab_file.name
-        }
-    if use_custom_model and custom_model_url:
-        download_dir = os.path.join(".", "Working_files", "custom_model")
-        download_and_extract_zip(custom_model_url, download_dir)
-        custom_model = {
-            'model': os.path.join(download_dir, 'model.pth'),
-            'config': os.path.join(download_dir, 'config.json'),
-            'vocab': os.path.join(download_dir, 'vocab.json_')
-        }
-
-    # Add request to the queue
-    with queue_lock:
-        queue.put((ebook_file, target_voice, language, email, use_custom_model, custom_model))
-        position = queue.qsize()
-    return f"Your request has been added to the queue. You are number {position} in line."
-
-# Gradio UI setup
-language_options = [
-    "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"
-]
-
-theme = gr.themes.Soft(
-    primary_hue="blue",
-    secondary_hue="blue",
-    neutral_hue="blue",
-    text_size=gr.themes.sizes.text_md,
-)
-
-with gr.Blocks(theme=theme) as demo:
-    gr.Markdown(
-        """
-        # eBook to Audiobook Converter
-
-        Transform your eBooks into immersive audiobooks with optional custom TTS models.
-        """
-    )
-
-    with gr.Row():
-        with gr.Column(scale=3):
-            ebook_file = gr.File(label="eBook File")
-            target_voice_file = gr.File(label="Target Voice File (Optional)")
-            language = gr.Dropdown(label="Language", choices=language_options, value="en")
-            email = gr.Textbox(label="Email Address")
-
-        with gr.Column(scale=3):
-            use_custom_model = gr.Checkbox(label="Use Custom Model")
-            custom_model_file = gr.File(label="Custom Model File (Optional)", visible=False)
-            custom_config_file = gr.File(label="Custom Config File (Optional)", visible=False)
-            custom_vocab_file = gr.File(label="Custom Vocab File (Optional)", visible=False)
-            custom_model_url = gr.Textbox(label="Custom Model Zip URL (Optional)", visible=False)
-
-    convert_btn = gr.Button("Convert to Audiobook", variant="primary")
-    queue_status = gr.Textbox(label="Queue Status")
-
-    convert_btn.click(
-        enqueue_request,
-        inputs=[ebook_file, target_voice_file, language, email, use_custom_model, custom_model_file, custom_config_file, custom_vocab_file, custom_model_url],
-        outputs=[queue_status]
-    )
-
-    use_custom_model.change(
-        lambda x: [gr.update(visible=x)] * 4,
-        inputs=[use_custom_model],
-        outputs=[custom_model_file, custom_config_file, custom_vocab_file, custom_model_url]
-    )
-
-demo.launch(share=True)
diff --git a/legacy/v1.0/legacy/install.bat b/legacy/v1.0/legacy/install.bat
deleted file mode 100644
index 9da6e0142053fdd884e7e5f211a2d6c5774c6083..0000000000000000000000000000000000000000
--- a/legacy/v1.0/legacy/install.bat
+++ /dev/null
@@ -1,18 +0,0 @@
-@echo off
-:: Check for administrative privileges
-net session >nul 2>&1
-if %errorLevel% neq 0 (
-    echo This script requires administrator privileges.
-    echo Switching to administrator...
-
-    powershell -Command "Start-Process cmd -ArgumentList '/c', '%~dpnx0' -Verb runAs"
-    exit /b
-)
-
-:: If already elevated, continue the script
-echo Running with administrator privileges...
-
-:: Run the PowerShell script in the same directory as this batch file
-powershell -NoProfile -ExecutionPolicy Bypass -File "%~dp0install.ps1"
-
-pause
diff --git a/legacy/v1.0/legacy/install.ps1 b/legacy/v1.0/legacy/install.ps1
deleted file mode 100644
index 445ce91c8908e0fd5ccf094360dea89f600a4f7c..0000000000000000000000000000000000000000
--- a/legacy/v1.0/legacy/install.ps1
+++ /dev/null
@@ -1,255 +0,0 @@
-# Function to check if the script is running as Administrator
-function Test-IsAdmin {
-    $currentUser = New-Object Security.Principal.WindowsPrincipal([Security.Principal.WindowsIdentity]::GetCurrent())
-    return $currentUser.IsInRole([Security.Principal.WindowsBuiltInRole]::Administrator)
-}
-
-# If the script is not running as Administrator, restart it with elevated privileges
-if (-not (Test-IsAdmin)) {
-    Write-Host "This script requires administrative privileges. Restarting as Administrator..." -ForegroundColor Yellow
-    Start-Process powershell.exe -ArgumentList "-NoProfile", "-ExecutionPolicy RemoteSigned", "-File", "`"$PSCommandPath`" $Params" -Verb RunAs
-    exit
-}
-
-################# Main script starts here with admin privileges #################
-
-# Function to check if Conda is installed
-function Check-CondaInstalled {
-    Write-Host "Checking if Conda is installed..."
-    $condaPath = (Get-Command conda -ErrorAction SilentlyContinue).Source
-    if ($condaPath) {
-        Write-Host "Conda is already installed at: $condaPath"
-        return $true
-    } else {
-        Write-Host "Conda is not installed."
-        return $false
-    }
-}
-
-# Returns $true when the Docker utils image is needed as a fallback, i.e. when a
-# required program is missing and could not be installed natively.
-function Check-ProgramsInstalled {
-    param (
-        [string[]]$Programs
-    )
-
-    $programsMissing = @()
-
-    if (-not (Get-Command choco -ErrorAction SilentlyContinue)) {
-        Write-Host "Chocolatey is not installed. Installing Chocolatey..."
-        Set-ExecutionPolicy Bypass -Scope Process -Force
-        [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072
-        iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
-
-        if (-not (Get-Command choco -ErrorAction SilentlyContinue)) {
-            return $true
-        } else {
-            Write-Host "Chocolatey installed successfully."
-        }
-    }
-
-    foreach ($program in $Programs) {
-        if (Get-Command $program -ErrorAction SilentlyContinue) {
-            Write-Host "$program is installed."
-        } else {
-            $programsMissing += $program
-        }
-    }
-
-    $missingCount = $programsMissing.Count
-
-    if ($missingCount -eq 0) {
-        # Everything is already installed natively; no Docker fallback needed
-        return $false
-    } else {
-        $installedCount = 0
-        foreach ($program in $programsMissing) {
-            if ($program -eq "ffmpeg") {
-                Write-Host "Installing ffmpeg..."
-                choco install ffmpeg -y
-
-                if (Get-Command ffmpeg -ErrorAction SilentlyContinue) {
-                    Write-Host "ffmpeg installed successfully!"
-                    $installedCount += 1
-                }
-            } elseif ($program -eq "calibre") {
-                # Avoid conflict with calibre built-in lxml
-                pip uninstall lxml -y
-
-                # Install Calibre using Chocolatey
-                Write-Host "Installing Calibre..."
-                choco install calibre -y
-
-                # Verify Calibre installation
-                if (Get-Command calibre -ErrorAction SilentlyContinue) {
-                    Write-Host "Calibre installed successfully!"
-                    $installedCount += 1
-                }
-            }
-        }
-    }
-    if ($installedCount -eq $missingCount) {
-        return $false
-    }
-    return $true
-}
-
-# Function to check if Docker is installed and running
-function Check-Docker {
-    Write-Host "Checking if Docker is installed..."
-    $dockerPath = (Get-Command docker -ErrorAction SilentlyContinue).Source
-    if ($dockerPath) {
-        Write-Host "Docker is installed at: $dockerPath"
-        # Check if Docker service is running
-        $dockerStatus = (Get-Service -Name com.docker.service -ErrorAction SilentlyContinue).Status
-        if ($dockerStatus -eq 'Running') {
-            Write-Host "Docker service is running."
-            return $true
-        } else {
-            Write-Host "Docker service is installed but not running. Attempting to start Docker service..."
-            Start-Service -Name "com.docker.service" -ErrorAction SilentlyContinue
-
-            # Wait for Docker service to start
-            while ((Get-Service -Name "com.docker.service").Status -ne 'Running') {
-                Write-Host "Waiting for Docker service to start..."
-                Start-Sleep -Seconds 5
-            }
-            Write-Host "Docker service is now running."
-            return $true
-        }
-    } else {
-        Write-Host "Docker is not installed."
-        return $false
-    }
-}
-
-######### Miniconda installation
-
-$minicondaUrl = "https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe"
-$installerPath = "$env:TEMP\Miniconda3-latest-Windows-x86_64.exe"
-
-if (-not (Check-CondaInstalled)) {
-    # Check if the Miniconda installer already exists
-    if (-not (Test-Path $installerPath)) {
-        Write-Host "Downloading Miniconda installer..."
-        Invoke-WebRequest -Uri $minicondaUrl -OutFile $installerPath
-    } else {
-        Write-Host "Miniconda installer already exists at $installerPath. Skipping download."
-    }
-
-    # Set the installation path for Miniconda
-    $installPath = "C:\Miniconda3"
-
-    Write-Host "Installing Miniconda..."
-    Start-Process -FilePath $installerPath -ArgumentList "/InstallationType=JustMe", "/RegisterPython=0", "/AddToPath=1", "/S", "/D=$installPath" -NoNewWindow -Wait
-
-    Write-Host "Verifying Miniconda installation..."
-    & "$installPath\Scripts\conda.exe" --version
-    Write-Host "Miniconda installation complete."
-} else {
-    Write-Host "Skipping Miniconda installation."
-}
-
-######### Docker installation
-
-$dockerMsiUrl = "https://desktop.docker.com/win/main/amd64/Docker%20Desktop%20Installer.exe"
-$dockerInstallerPath = "$env:TEMP\DockerInstaller.exe"
-
-$dockerUtilsNeeded = Check-ProgramsInstalled -Programs @("ffmpeg", "calibre")
-
-if ($dockerUtilsNeeded) {
-    if (-not (Check-Docker)) {
-        # Verify the installer file or re-download if corrupted or missing
-        if (-not (Test-Path $dockerInstallerPath)) {
-            Write-Host "Downloading Docker installer for Windows..."
-            Invoke-WebRequest -Uri $dockerMsiUrl -OutFile $dockerInstallerPath
-        }
-
-        # Launch the Docker installer
-        Write-Host "Launching Docker installer..."
-        Start-Process -FilePath $dockerInstallerPath
-        Write-Host "Please complete the Docker installation manually."
-        pause
-
-        # Ensure Docker service is running after installation
-        Write-Host "Ensuring Docker service is running..."
-        Start-Service -Name "com.docker.service" -ErrorAction SilentlyContinue
-
-        # Wait for Docker service to start
-        while ((Get-Service -Name "com.docker.service").Status -ne 'Running') {
-            Write-Host "Waiting for Docker service to start..."
-            Start-Sleep -Seconds 5
-        }
-
-        Write-Host "Docker service is now running."
-    }
-}
-
-######### Install ebook2audiobook
-
-if (Check-CondaInstalled) {
-
-    Write-Host "Installing ebook2audiobook..." -ForegroundColor Yellow
-
-    # Set the working directory to the script's directory
-    $scriptDir = $PSScriptRoot
-    Set-Location -Path $scriptDir
-
-    # Create new Conda environment with Python 3.11 in the script directory, showing progress
-    Write-Host "Creating Conda environment with Python 3.11 in $scriptDir..."
-    & conda create --prefix "$scriptDir\python_env" python=3.11 -y -v
-
-    # Ensure the correct Python environment is active
-    Write-Host "Checking Python version in Conda environment..."
-
-    # Get python.exe version from python_env
-    $pythonEnvVersion = & "$scriptDir\python_env\python.exe" --version
-
-    # Get the Conda-managed Python version using conda run
-    $pythonVersion = & conda run --prefix "$scriptDir\python_env" python --version
-
-    if ($pythonVersion.Trim() -eq $pythonEnvVersion.Trim()) {
-        Write-Host "Python versions match, proceeding with installation..."
-
-        if ($dockerUtilsNeeded) {
-            # Build Docker image for utils
-            Write-Host "Building Docker image for utils..."
-            & conda run --prefix "$scriptDir\python_env" docker build -f DockerfileUtils -t utils .
-        }
-
-        # Install required Python packages with pip, showing progress
-        # (the gradio version spec is quoted so PowerShell does not parse '>' as redirection)
-        Write-Host "Installing required Python packages..."
-        & conda run --prefix "$scriptDir\python_env" python.exe -m pip install --upgrade pip --progress-bar on -v
-        & conda run --prefix "$scriptDir\python_env" pip install pydub nltk beautifulsoup4 ebooklib translate coqui-tts tqdm mecab mecab-python3 unidic "gradio>=4.44.0" docker --progress-bar on -v
-
-        # Download unidic language model for MeCab with progress
-        Write-Host "Downloading unidic language model for MeCab..."
-        & conda run --prefix "$scriptDir\python_env" python.exe -m unidic download
-
-        # Download spacy NLP model with progress
-        Write-Host "Downloading spaCy language model..."
-        & conda run --prefix "$scriptDir\python_env" python.exe -m spacy download en_core_web_sm
-
-        # Install ebook2audiobook
-        Write-Host "Installing ebook2audiobook..."
-        & conda run --prefix "$scriptDir\python_env" pip install -e .
-
-        # Delete Docker and Miniconda installers if both are installed and running
-        if ((Check-CondaInstalled) -and (Check-Docker)) {
-            Write-Host "Both Conda and Docker are installed and running. Deleting installer files..."
-            Remove-Item -Path $installerPath -Force -ErrorAction SilentlyContinue
-            Remove-Item -Path $dockerInstallerPath -Force -ErrorAction SilentlyContinue
-            Write-Host "Installer files deleted."
-        }
-
-        Write-Host "******************* ebook2audiobook installation successful! *******************" -ForegroundColor Green
-        Write-Host "To launch ebook2audiobook:" -ForegroundColor Yellow
-        Write-Host "- in command line mode: ./ebook2audiobook.cmd --headless [other options]"
-        Write-Host "- in graphic web mode: ./ebook2audiobook.cmd [--share]"
-    } else {
-        Write-Host "The Python terminal is still using the OS Python version $pythonVersion, but it should be $pythonEnvVersion from the python_env virtual environment"
-    }
-
-    # Deactivate Conda environment
-    Write-Host "Deactivating Conda environment..."
-    & conda deactivate
-} else {
-    Write-Host "Installation cannot proceed. Either Conda is not installed or Docker is not running." -ForegroundColor Red
-}
diff --git a/legacy/v1.0/legacy/install.sh b/legacy/v1.0/legacy/install.sh
deleted file mode 100644
index a647b9fbc36f9fe0ebaa2269078e55e005a944a5..0000000000000000000000000000000000000000
--- a/legacy/v1.0/legacy/install.sh
+++ /dev/null
@@ -1,171 +0,0 @@
-#!/usr/bin/env bash
-
-WGET=$(which wget 2>/dev/null)
-CONDA_VERSION=$(conda --version 2>/dev/null)
-DOCKER_UTILS=$(which docker 2>/dev/null)
-DOCKER_UTILS_NEEDED=false
-PACK_MGR=""
-PACK_MGR_OPTIONS=""
-
-if [[ "$OSTYPE" == "darwin"* ]]; then
-    PACK_MGR="brew install"
-elif command -v emerge &> /dev/null; then
-    PACK_MGR="sudo emerge"
-elif command -v dnf &> /dev/null; then
-    PACK_MGR="sudo dnf install"
-    PACK_MGR_OPTIONS="-y"
-elif command -v yum &> /dev/null; then
-    PACK_MGR="sudo yum install"
-    PACK_MGR_OPTIONS="-y"
-elif command -v zypper &> /dev/null; then
-    PACK_MGR="sudo zypper install"
-    PACK_MGR_OPTIONS="-y"
-elif command -v pacman &> /dev/null; then
-    PACK_MGR="sudo pacman -Sy"
-elif command -v apt-get &> /dev/null; then
-    sudo apt-get update
-    PACK_MGR="sudo apt-get install"
-    PACK_MGR_OPTIONS="-y"
-elif command -v apk &> /dev/null; then
-    PACK_MGR="sudo apk add"
-fi
-
-check_programs_installed() {
-    local programs=("$@")
-    declare -a programs_missing
-
-    for program in "${programs[@]}"; do
-        if command -v "$program" >/dev/null 2>&1; then
-            echo "$program is installed."
-        else
-            echo "$program is not installed."
-            programs_missing+=("$program")
-        fi
-    done
-
-    local count=${#programs_missing[@]}
-
-    # The Docker utils image is only needed when a required tool is missing
-    # and there is no package manager available to install it natively.
-    if [[ $count -gt 0 && "$PACK_MGR" == "" ]]; then
-        DOCKER_UTILS_NEEDED=true
-    elif [[ $count -gt 0 ]]; then
-        for program in "${programs_missing[@]}"; do
-            if [ "$program" = "ffmpeg" ]; then
-                eval "$PACK_MGR ffmpeg $PACK_MGR_OPTIONS"
-                if command -v ffmpeg >/dev/null 2>&1; then
-                    echo "FFmpeg installed successfully!"
-                else
-                    echo "FFmpeg installation failed."
-                    DOCKER_UTILS_NEEDED=true
-                    break
-                fi
-            elif [ "$program" = "calibre" ]; then
-                # avoid conflict with calibre builtin lxml
-                pip uninstall lxml -y 2>/dev/null
-
-                if [[ "$OSTYPE" == "linux"* ]]; then
-                    echo "Installing Calibre for Linux..."
-                    $WGET -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sh /dev/stdin
-                elif [[ "$OSTYPE" == "darwin"* ]]; then
-                    echo "Installing Calibre for macOS using Homebrew..."
-                    eval "$PACK_MGR --cask calibre"
-                fi
-
-                if command -v calibre >/dev/null 2>&1; then
-                    echo "Calibre installed successfully!"
-                else
-                    echo "Calibre installation failed."
-                    DOCKER_UTILS_NEEDED=true
-                fi
-            fi
-        done
-    fi
-}
-
-# Check for Homebrew on macOS
-if [[ "$OSTYPE" == "darwin"* ]]; then
-    echo "Detected macOS."
-    if ! command -v brew &> /dev/null; then
-        echo "Homebrew is not installed. Installing Homebrew..."
-        /usr/bin/env bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
-        echo 'eval "$(/opt/homebrew/bin/brew shellenv)"' >> ~/.zprofile
-        eval "$(/opt/homebrew/bin/brew shellenv)"
-    fi
-fi
-
-if [ -z "$WGET" ]; then
-    echo -e "\e[33m wget is missing! trying to install it... \e[0m"
-    if [ "$PACK_MGR" != "" ]; then
-        eval "$PACK_MGR wget $PACK_MGR_OPTIONS"
-    else
-        echo "Cannot recognize your package manager. Please install wget manually."
-    fi
-    WGET=$(which wget 2>/dev/null)
-fi
-
-if [[ -n "$WGET" && -z "$CONDA_VERSION" ]]; then
-    echo -e "\e[33m conda is missing! trying to install it... \e[0m"
-
-    if [[ "$OSTYPE" == "darwin"* ]]; then
-        $WGET https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O Miniconda3-latest.sh
-    else
-        $WGET https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3-latest.sh
-    fi
-
-    chmod +x Miniconda3-latest.sh
-    ./Miniconda3-latest.sh -b -u && \
-    ~/miniconda3/bin/conda init && \
-    rm -f Miniconda3-latest.sh
-
-    # Source the appropriate shell configuration file
-    SHELL_RC=~/miniconda3/etc/profile.d/conda.sh
-    source $SHELL_RC
-
-    CONDA_VERSION=$(conda --version 2>/dev/null)
-    echo -e "\e[32m===============>>> conda is installed! <<<===============\e[0m"
-fi
-
-check_programs_installed ffmpeg calibre
-
-if [ "$DOCKER_UTILS_NEEDED" = true ]; then
-    if [[ -n "$WGET" && -z "$DOCKER_UTILS" ]]; then
-        echo -e "\e[33m docker is missing! trying to install it... \e[0m"
-        if [[ "$OSTYPE" == "darwin"* ]]; then
-            echo "Installing Docker using Homebrew..."
-            brew install --cask docker
-        else
-            $WGET -qO get-docker.sh https://get.docker.com && \
-            sudo sh get-docker.sh && \
-            sudo systemctl start docker && \
-            sudo systemctl enable docker && \
-            docker run hello-world && \
-            DOCKER_UTILS=$(which docker 2>/dev/null)
-            rm -f get-docker.sh
-        fi
-        echo -e "\e[32m===============>>> docker is installed! <<<===============\e[0m"
-    fi
-fi
-
-if [[ -n "$WGET" && -n "$CONDA_VERSION" ]]; then
-    SHELL_RC=~/miniconda3/etc/profile.d/conda.sh
-    echo -e "\e[33m Installing ebook2audiobook... \e[0m"
-    if [ "$DOCKER_UTILS_NEEDED" = true ]; then
-        conda create --prefix "$(pwd)/python_env" python=3.11 -y
-        source $SHELL_RC
-        conda activate "$(pwd)/python_env"
-        $DOCKER_UTILS build -f DockerfileUtils -t utils .
-    fi
-    # The gradio version spec is quoted so the shell does not treat '>' as a redirection
-    pip install --upgrade pip && \
-    pip install pydub nltk beautifulsoup4 ebooklib translate coqui-tts tqdm mecab mecab-python3 unidic "gradio>=4.44.0" docker && \
-    python -m unidic download && \
-    python -m spacy download en_core_web_sm && \
-    pip install -e .
-    if [ "$DOCKER_UTILS_NEEDED" = true ]; then
-        conda deactivate
-        conda deactivate
-    fi
-    echo -e "\e[32m******************* ebook2audiobook installation successful! *******************\e[0m"
-    echo -e "\e[33mTo launch ebook2audiobook:\e[0m"
-    echo -e "- in command line mode: ./ebook2audiobook.sh --headless [other options]"
-    echo -e "- in graphic web mode: ./ebook2audiobook.sh [--share]"
-fi
-
-exit 0
diff --git a/legacy/v1.0/readme/README_CN.md b/legacy/v1.0/readme/README_CN.md
deleted file mode 100644
index e00910f8430611116c70a6fc43f00942b1cbb707..0000000000000000000000000000000000000000
--- a/legacy/v1.0/readme/README_CN.md
+++ /dev/null
@@ -1,428 +0,0 @@
-# 📚 ebook2audiobook
-
-Convert eBooks to audiobooks with chapters and metadata, using Calibre and Coqui XTTS. Supports optional voice cloning and multiple languages!
-
-#### 🖥️ Web GUI
-![demo_web_gui](https://github.com/user-attachments/assets/85af88a7-05dd-4a29-91de-76a14cf5ef06)
-
-<details>
-  <summary>Click to see images of the Web GUI</summary>
-image
-image
-image
-</details>
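The GUI shown above is only a thin wrapper: the conversion itself is a three-step pipeline. A minimal sketch, assuming the three functions can be imported from the legacy `app.py` shown earlier in this diff (the paths mirror its `Working_files` layout; nothing here is new API, only a condensed view of `process_request`):

```python
# Sketch only: these functions are defined in the legacy app.py above,
# and the import of them as a module is an assumption.
import os
from app import (create_chapter_labeled_book,
                 convert_chapters_to_audio_standard_model,
                 create_m4b_from_chapters)

def convert_ebook_to_audiobook(ebook_path, voice_path=None, language="en"):
    chapters_dir = os.path.join(".", "Working_files", "temp_ebook")
    wav_dir = os.path.join(".", "Chapter_wav_files")
    out_dir = os.path.join(".", "Audiobooks")

    # 1. Calibre converts the ebook; the text is split into chapter_*.txt files
    create_chapter_labeled_book(ebook_path)
    # 2. XTTS narrates each chapter, optionally cloning the voice in voice_path
    convert_chapters_to_audio_standard_model(chapters_dir, wav_dir, voice_path, language)
    # 3. The chapter WAVs are stitched into a single .m4b with chapter markers
    create_m4b_from_chapters(wav_dir, ebook_path, out_dir)
```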
-
-## 🌟 Features
-
-- 📖 Converts eBooks to text format with Calibre.
-- 📚 Splits the eBook into chapters for organized audio.
-- 🎙️ High-quality text-to-speech with Coqui XTTS.
-- 🗣️ Optional voice cloning with your own voice file.
-- 🌍 Supports multiple languages (English by default).
-- 🖥️ Runs on as little as 4 GB of RAM.
-
-## 🛠️ Requirements
-
-- Python 3.10
-- `coqui-tts` Python package
-- Calibre (for eBook conversion)
-- FFmpeg (for audiobook creation)
-- Optional: a custom voice file for voice cloning
-
-### 🔧 Installation Instructions
-
-1. **Install Python 3.x** from [Python.org](https://www.python.org/downloads/).
-
-2. **Install Calibre**:
-   - **Ubuntu**: `sudo apt-get install -y calibre`
-   - **macOS**: `brew install calibre`
-   - **Windows** (Admin Powershell): `choco install calibre`
-
-3. **Install FFmpeg**:
-   - **Ubuntu**: `sudo apt-get install -y ffmpeg`
-   - **macOS**: `brew install ffmpeg`
-   - **Windows** (Admin Powershell): `choco install ffmpeg`
-
-4. **Optional: Install Mecab** (for non-Latin languages):
-   - **Ubuntu**: `sudo apt-get install -y mecab libmecab-dev mecab-ipadic-utf8`
-   - **macOS**: `brew install mecab`, `brew install mecab-ipadic`
-   - **Windows**: [mecab-website-to-install-manually](https://taku910.github.io/mecab/#download) (Note: Japanese support is limited)
-
-5. **Install Python packages**:
-   ```bash
-   pip install coqui-tts==0.24.2 pydub nltk beautifulsoup4 ebooklib tqdm gradio==4.44.0
-
-   python -m nltk.downloader punkt
-   python -m nltk.downloader punkt_tab
-   ```
-
-   **For non-Latin languages**:
-   ```bash
-   pip install mecab mecab-python3 unidic
-
-   python -m unidic download
-   ```
-
-## 🌐 Supported Languages
-
-- **English (en)**
-- **Spanish (es)**
-- **French (fr)**
-- **German (de)**
-- **Italian (it)**
-- **Portuguese (pt)**
-- **Polish (pl)**
-- **Turkish (tr)**
-- **Russian (ru)**
-- **Dutch (nl)**
-- **Czech (cs)**
-- **Arabic (ar)**
-- **Chinese (zh-cn)**
-- **Japanese (ja)**
-- **Hungarian (hu)**
-- **Korean (ko)**
-
-Specify the language code when running the script in headless mode.
-## 🚀 Usage
-
-### 🖥️ Launching the Gradio Web Interface
-
-1. **Run the script**:
-   ```bash
-   python app.py
-   ```
-
-2. **Open the web app**: Click the URL shown in the terminal to access the web app and convert eBooks.
-3. **Public link**: Append `--share True` to the command, like this: `python app.py --share True`
-- **[More parameters]**: use the `-h` flag, like `python app.py -h`
-
-### 📝 Basic Headless Usage
-
-```bash
-python app.py --headless True --ebook <path_to_ebook_file> --voice [path_to_voice_file] --language [language_code]
-```
-
-- **<path_to_ebook_file>**: Path to your eBook file.
-- **[path_to_voice_file]**: Optional voice file for voice cloning.
-- **[language_code]**: Optional language for the conversion.
-- **[More parameters]**: use the `-h` flag, like `python app.py -h`
-
-### 🧩 Headless Custom XTTS Model Usage
-
-```bash
-python app.py --headless True --use_custom_model True --ebook <ebook_file_path> --voice <target_voice_file_path> --language <language> --custom_model <custom_model_path> --custom_config <custom_config_path> --custom_vocab <custom_vocab_path>
-```
-
-- **<ebook_file_path>**: Path to your eBook file.
-- **<target_voice_file_path>**: Optional voice file for voice cloning.
-- **<language>**: Optional language for the conversion.
-- **<custom_model_path>**: Path to `model.pth`.
-- **<custom_config_path>**: Path to `config.json`.
-- **<custom_vocab_path>**: Path to `vocab.json`.
-- **[More parameters]**: use the `-h` flag, like `python app.py -h`
-
-### 🧩 Headless Custom XTTS Fine-Tune Model Usage 🌐
-
-```bash
-python app.py --headless True --use_custom_model True --ebook <ebook_file_path> --voice <target_voice_file_path> --language <language> --custom_model_url <custom_model_URL_ZIP_path>
-```
-
-- **<ebook_file_path>**: Path to your eBook file.
-- **<target_voice_file_path>**: Optional voice file for voice cloning.
-- **<language>**: Optional language for the conversion.
-- **<custom_model_URL_ZIP_path>**: URL of a zip of the model folder. For example,
-  [xtts_David_Attenborough_fine_tune](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/tree/main) `https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true`
-- **[More parameters]**: use the `-h` flag, like `python app.py -h`
-
-### 🔍 Detailed Guide with a List of All Parameters
-```bash
-python app.py -h
-```
-- This will output the following:
-```bash
-usage: app.py [-h] [--share SHARE] [--headless HEADLESS] [--ebook EBOOK] [--voice VOICE]
-              [--language LANGUAGE] [--use_custom_model USE_CUSTOM_MODEL]
-              [--custom_model CUSTOM_MODEL] [--custom_config CUSTOM_CONFIG]
-              [--custom_vocab CUSTOM_VOCAB] [--custom_model_url CUSTOM_MODEL_URL]
-              [--temperature TEMPERATURE] [--length_penalty LENGTH_PENALTY]
-              [--repetition_penalty REPETITION_PENALTY] [--top_k TOP_K] [--top_p TOP_P]
-              [--speed SPEED] [--enable_text_splitting ENABLE_TEXT_SPLITTING]
-
-Convert eBooks to Audiobooks using a Text-to-Speech model. You can either launch the
-Gradio interface or run the script in headless mode for direct conversion.
-
-options:
-  -h, --help            show this help message and exit
-  --share SHARE         Set to True to enable a public shareable Gradio link. Defaults
-                        to False.
-  --headless HEADLESS   Set to True to run in headless mode without the Gradio
-                        interface. Defaults to False.
-  --ebook EBOOK         Path to the ebook file for conversion. Required in headless
-                        mode.
-  --voice VOICE         Path to the target voice file for TTS. Optional, uses a default
-                        voice if not provided.
-  --language LANGUAGE   Language for the audiobook conversion. Options: en, es, fr, de,
-                        it, pt, pl, tr, ru, nl, cs, ar, zh-cn, ja, hu, ko. Defaults to
-                        English (en).
-  --use_custom_model USE_CUSTOM_MODEL
-                        Set to True to use a custom TTS model. Defaults to False. Must
-                        be True to use custom models, otherwise you'll get an error.
-  --custom_model CUSTOM_MODEL
-                        Path to the custom model file (.pth). Required if using a custom
-                        model.
-  --custom_config CUSTOM_CONFIG
-                        Path to the custom config file (config.json). Required if using
-                        a custom model.
-  --custom_vocab CUSTOM_VOCAB
-                        Path to the custom vocab file (vocab.json). Required if using a
-                        custom model.
-  --custom_model_url CUSTOM_MODEL_URL
-                        URL to download the custom model as a zip file. Optional, but
-                        will be used if provided. Examples include David Attenborough's
-                        model: 'https://huggingface.co/drewThomasson/xtts_David_Attenbor
-                        ough_fine_tune/resolve/main/Finished_model_files.zip?download=tr
-                        ue'. More XTTS fine-tunes can be found on my Hugging Face at
-                        'https://huggingface.co/drewThomasson'.
-  --temperature TEMPERATURE
-                        Temperature for the model. Defaults to 0.65. Higher temperatures
-                        lead to more creative output, i.e. more hallucinations; lower
-                        temperatures give more monotone output, i.e. fewer
-                        hallucinations.
-  --length_penalty LENGTH_PENALTY
-                        A length penalty applied to the autoregressive decoder. Defaults
-                        to 1.0. Not applied to custom models.
-  --repetition_penalty REPETITION_PENALTY
-                        A penalty that prevents the autoregressive decoder from
-                        repeating itself. Defaults to 2.0.
-  --top_k TOP_K         Top-k sampling. Lower values mean more likely outputs and
-                        increased audio generation speed. Defaults to 50.
-  --top_p TOP_P         Top-p sampling. Lower values mean more likely outputs and
-                        increased audio generation speed. Defaults to 0.8.
-  --speed SPEED         Speed factor for the speech generation, i.e. how fast the
-                        narrator will speak. Defaults to 1.0.
-  --enable_text_splitting ENABLE_TEXT_SPLITTING
-                        Enable splitting text into sentences. Defaults to True.
-
-Example: python script.py --headless --ebook path_to_ebook --voice path_to_voice
---language en --use_custom_model True --custom_model model.pth --custom_config
-config.json --custom_vocab vocab.json
-```
-
-<details>
-  <summary>⚠️ Legacy usage instructions</summary>
-
-## 🚀 Usage
-
------> `ebook2audiobookXTTS/legacy/`
-
-### 🖥️ Web Interface
-
-1. **Run the script**:
-   ```bash
-   python custom_model_ebook2audiobookXTTS_gradio.py
-   ```
-
-2. **Open the web app**: Click the URL shown in the terminal to access the web app and convert eBooks.
-
-### 📝 Basic Usage
-
-```bash
-python ebook2audiobook.py <path_to_ebook_file> [path_to_voice_file] [language_code]
-```
-
-- **<path_to_ebook_file>**: Path to your eBook file.
-- **[path_to_voice_file]**: Optional voice file for voice cloning.
-- **[language_code]**: Optional language for the conversion.
-
-### 🧩 Custom XTTS Model
-
-```bash
-python custom_model_ebook2audiobookXTTS.py <ebook_file_path> <target_voice_file_path> <language> <custom_model_path> <custom_config_path> <custom_vocab_path>
-```
-
-- **<ebook_file_path>**: Path to your eBook file.
-- **<target_voice_file_path>**: Optional voice file for voice cloning.
-- **<language>**: Optional language for the conversion.
-- **<custom_model_path>**: Path to `model.pth`.
-- **<custom_config_path>**: Path to `config.json`.
-- **<custom_vocab_path>**: Path to `vocab.json`.
-</details>
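For readers who want to see what the custom-model path does with those three files, here is a condensed sketch that mirrors the XTTS calls in the deleted `app.py` earlier in this diff. The file names (`model.pth`, `config.json`, `vocab.json`, `ref.wav`) are the ones this README assumes; only the condensed arrangement is mine:

```python
# Mirrors convert_chapters_to_audio_custom_model() from the legacy app.py above.
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

device = "cuda" if torch.cuda.is_available() else "cpu"

config = XttsConfig()
config.load_json("config.json")                                  # <custom_config_path>
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path="model.pth",       # <custom_model_path>
                      vocab_path="vocab.json", use_deepspeed=False)
model.to(device)

# Clone the target voice once, then reuse the latents for every fragment.
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["ref.wav"])

out = model.inference("Hello there!", "en", gpt_cond_latent, speaker_embedding, temperature=0.7)
torchaudio.save("fragment.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)  # XTTS is 24 kHz
```

Computing the speaker latents once and reusing them is what makes per-sentence synthesis affordable; the reference wav is only analyzed a single time per run.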
-
-### 🐳 Using Docker
-
-You can also run the eBook-to-audiobook converter with Docker. This keeps the environment consistent across machines and simplifies setup.
-
-#### 🚀 Running the Docker Container
-
-To run the Docker container and launch the Gradio interface, use one of the following commands:
-
-Run with CPU only
-```powershell
-docker run -it --rm -p 7860:7860 --platform=linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py
-```
-
-Run with GPU acceleration (NVIDIA GPUs only)
-```powershell
-docker run -it --rm --gpus all -p 7860:7860 --platform=linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py
-```
-
-This command starts the Gradio interface on port 7860 (localhost:7860)
-- For more options, such as running docker in headless mode or exposing the gradio link, add the `-h` flag after `app.py` in the docker launch command
-<details>
-  <summary>Example of using docker in headless mode, plus a full guide to all the extra parameters</summary>
-
-## Example of using docker in headless mode
-
-First, docker pull the latest version
-```bash
-docker pull athomasson2/ebook2audiobookxtts:huggingface
-```
-
-- Before running, create a directory named "input-folder" in your current directory; it will be mounted into the container, and this is where you place input files for the docker image
-```bash
-mkdir input-folder && mkdir Audiobooks
-```
-
-- In the command below, replace **YOUR_INPUT_FILE.TXT** with the name of your input file
-
-```bash
-docker run -it --rm \
-    -v $(pwd)/input-folder:/home/user/app/input_folder \
-    -v $(pwd)/Audiobooks:/home/user/app/Audiobooks \
-    --platform linux/amd64 \
-    athomasson2/ebook2audiobookxtts:huggingface \
-    python app.py --headless True --ebook /home/user/app/input_folder/YOUR_INPUT_FILE.TXT
-```
-
-- And that should be it!
-
-- The output audiobooks will appear in the Audiobooks folder, which is also in the local directory where you ran this docker command
-
-
-## To get help on the other parameters this program has, you can run
-
-```bash
-docker run -it --rm \
-    --platform linux/amd64 \
-    athomasson2/ebook2audiobookxtts:huggingface \
-    python app.py -h
-
-```
-
-
-This will output the following
-
-```bash
-user/app/ebook2audiobookXTTS/input-folder -v $(pwd)/Audiobooks:/home/user/app/ebook2audiobookXTTS/Audiobooks --memory="4g" --network none --platform linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py -h
-starting...
-usage: app.py [-h] [--share SHARE] [--headless HEADLESS] [--ebook EBOOK] [--voice VOICE]
-              [--language LANGUAGE] [--use_custom_model USE_CUSTOM_MODEL]
-              [--custom_model CUSTOM_MODEL] [--custom_config CUSTOM_CONFIG]
-              [--custom_vocab CUSTOM_VOCAB] [--custom_model_url CUSTOM_MODEL_URL]
-              [--temperature TEMPERATURE] [--length_penalty LENGTH_PENALTY]
-              [--repetition_penalty REPETITION_PENALTY] [--top_k TOP_K] [--top_p TOP_P]
-              [--speed SPEED] [--enable_text_splitting ENABLE_TEXT_SPLITTING]
-
-Convert eBooks to Audiobooks using a Text-to-Speech model. You can either launch the
-Gradio interface or run the script in headless mode for direct conversion.
-
-options:
-  -h, --help            show this help message and exit
-  --share SHARE         Set to True to enable a public shareable Gradio link. Defaults
-                        to False.
-  --headless HEADLESS   Set to True to run in headless mode without the Gradio
-                        interface. Defaults to False.
-  --ebook EBOOK         Path to the ebook file for conversion. Required in headless
-                        mode.
-  --voice VOICE         Path to the target voice file for TTS. Optional, uses a default
-                        voice if not provided.
-  --language LANGUAGE   Language for the audiobook conversion. Options: en, es, fr, de,
-                        it, pt, pl, tr, ru, nl, cs, ar, zh-cn, ja, hu, ko. Defaults to
-                        English (en).
-  --use_custom_model USE_CUSTOM_MODEL
-                        Set to True to use a custom TTS model. Defaults to False. Must
-                        be True to use custom models, otherwise you'll get an error.
-  --custom_model CUSTOM_MODEL
-                        Path to the custom model file (.pth). Required if using a custom
-                        model.
-  --custom_config CUSTOM_CONFIG
-                        Path to the custom config file (config.json). Required if using
-                        a custom model.
-  --custom_vocab CUSTOM_VOCAB
-                        Path to the custom vocab file (vocab.json). Required if using a
-                        custom model.
-  --custom_model_url CUSTOM_MODEL_URL
-                        URL to download the custom model as a zip file. Optional, but
-                        will be used if provided. Examples include David Attenborough's
-                        model: 'https://huggingface.co/drewThomasson/xtts_David_Attenbor
-                        ough_fine_tune/resolve/main/Finished_model_files.zip?download=tr
-                        ue'. More XTTS fine-tunes can be found on my Hugging Face at
-                        'https://huggingface.co/drewThomasson'.
-  --temperature TEMPERATURE
-                        Temperature for the model. Defaults to 0.65. Higher temperatures
-                        lead to more creative output, i.e. more hallucinations; lower
-                        temperatures give more monotone output, i.e. fewer
-                        hallucinations.
-  --length_penalty LENGTH_PENALTY
-                        A length penalty applied to the autoregressive decoder. Defaults
-                        to 1.0. Not applied to custom models.
-  --repetition_penalty REPETITION_PENALTY
-                        A penalty that prevents the autoregressive decoder from
-                        repeating itself. Defaults to 2.0.
-  --top_k TOP_K         Top-k sampling. Lower values mean more likely outputs and
-                        increased audio generation speed. Defaults to 50.
-  --top_p TOP_P         Top-p sampling. Lower values mean more likely outputs and
-                        increased audio generation speed. Defaults to 0.8.
-  --speed SPEED         Speed factor for the speech generation, i.e. how fast the
-                        narrator will speak. Defaults to 1.0.
-  --enable_text_splitting ENABLE_TEXT_SPLITTING
-                        Enable splitting text into sentences. Defaults to True.
-
-Example: python script.py --headless --ebook path_to_ebook --voice path_to_voice
---language en --use_custom_model True --custom_model model.pth --custom_config
-config.json --custom_vocab vocab.json
-```
-</details>
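Since the install scripts in this diff pip-install the `docker` Python package, the same headless run can also be driven from Python instead of the shell. A sketch using the Docker SDK, with the image tag and container paths taken from the commands above (error handling and platform pinning omitted; this is not part of the project's own tooling):

```python
# Equivalent of the headless `docker run` command above, via the docker SDK.
import os
import docker

client = docker.from_env()
logs = client.containers.run(
    "athomasson2/ebook2audiobookxtts:huggingface",
    command=["python", "app.py", "--headless", "True",
             "--ebook", "/home/user/app/input_folder/YOUR_INPUT_FILE.TXT"],
    volumes={
        os.path.abspath("input-folder"): {"bind": "/home/user/app/input_folder", "mode": "rw"},
        os.path.abspath("Audiobooks"): {"bind": "/home/user/app/Audiobooks", "mode": "rw"},
    },
    remove=True,   # same as --rm
)
print(logs.decode())  # run() returns the container logs when not detached
```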
-
-#### 🖥️ Docker GUI
-![demo_web_gui](https://github.com/user-attachments/assets/85af88a7-05dd-4a29-91de-76a14cf5ef06)
-
-<details>
-  <summary>Click to see images of the Web interface</summary>
-image
-image
-image
-</details>
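The deleted `app.py` fetches such a model zip through a `download_and_extract_zip()` helper whose body is not part of this diff. A plausible implementation, assuming plain `requests` plus `zipfile`; only the expected output layout (`model.pth`, `config.json`, `vocab.json`) comes from the code above:

```python
# Hypothetical reconstruction of the helper called by enqueue_request() above.
import os
import zipfile

import requests

def download_and_extract_zip(url, download_dir):
    os.makedirs(download_dir, exist_ok=True)
    zip_path = os.path.join(download_dir, "model.zip")
    # Stream the download so large checkpoints don't have to fit in memory
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(zip_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(download_dir)  # should yield model.pth, config.json, vocab.json
    os.remove(zip_path)
```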
-
-### 🛠️ About Custom XTTS Models
-
-These are models built to handle a specific voice better. Check out my Hugging Face page [here](https://huggingface.co/drewThomasson).
-
-To use a custom model, paste the link to a "Finished_model_files.zip" file, like this one:
-
-[David Attenborough fine tuned Finished_model_files.zip](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true)
-
-
-
-
-More details are available on the [Dockerfile Hub Page](https://hub.docker.com/repository/docker/athomasson2/ebook2audiobookxtts/general).
-
-## 🌐 Fine-tuned XTTS models
-
-To find already fine-tuned XTTS models, visit [Hugging Face](https://huggingface.co/drewThomasson) 🌐 and search for models whose names contain "xtts fine tune".
-
-## 🎥 Demos
-
-https://github.com/user-attachments/assets/8486603c-38b1-43ce-9639-73757dfb1031
-
-## 🤗 [Huggingface space demo](https://huggingface.co/spaces/drewThomasson/ebook2audiobookXTTS)
-- The Huggingface space runs on the free CPU tier, so expect it to be very slow or to time out, lol. Just don't feed it large files
-- Best to duplicate the space or run it locally.
-## 📚 Supported eBook Formats
-
-- `.epub`, `.pdf`, `.mobi`, `.txt`, `.html`, `.rtf`, `.chm`, `.lit`, `.pdb`, `.fb2`, `.odt`, `.cbr`, `.cbz`, `.prc`, `.lrf`, `.pml`, `.snb`, `.cbc`, `.rb`, `.tcr`
-- **Best results**: `.epub` or `.mobi`, which allow automatic chapter detection.
-
-## 📂 Output
-
-- Creates an `.m4b` file with metadata and chapters.
-- **Example**: ![Example](https://github.com/DrewThomasson/VoxNovel/blob/dc5197dff97252fa44c391dc0596902d71278a88/readme_files/example_in_app.jpeg)
diff --git a/legacy/v1.0/readme/README_RU.md b/legacy/v1.0/readme/README_RU.md
deleted file mode 100644
index 21f008730ced7beb2fb9141d8f22fe9b6211569f..0000000000000000000000000000000000000000
--- a/legacy/v1.0/readme/README_RU.md
+++ /dev/null
@@ -1,387 +0,0 @@
-# 📚 ebook2audiobook
-
-Convert eBooks into audiobooks while preserving chapters and metadata, using the Calibre and XTTS engines. Supports optional voice cloning and multiple languages!
-
-
-#### 🖥️ Web interface
-![demo_web_gui](https://github.com/user-attachments/assets/85af88a7-05dd-4a29-91de-76a14cf5ef06)
-
-<details>
-  <summary>More images of the Web interface</summary>
-image
-image
-image
-</details>
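One practical detail worth knowing before feeding the app long paragraphs: XTTS caps the input length per generation (249 characters for English, 213 otherwise in this code), so the legacy `app.py` splits every sentence at punctuation before synthesis. The splitter, reproduced standalone from the deleted code earlier in this diff, with an illustrative call:

```python
# Verbatim logic from split_long_sentence() in the legacy app.py above.
def split_long_sentence(sentence, max_length=249, max_pauses=10):
    parts = []
    while len(sentence) > max_length or sentence.count(',') + sentence.count(';') + sentence.count('.') > max_pauses:
        # Prefer the last pause character that still fits under the cap
        possible_splits = [i for i, char in enumerate(sentence) if char in ',;.' and i < max_length]
        split_at = possible_splits[-1] + 1 if possible_splits else max_length
        parts.append(sentence[:split_at].strip())
        sentence = sentence[split_at:].strip()
    parts.append(sentence)
    return parts

# Example: a run-on sentence gets chopped at commas/periods into TTS-sized fragments
print(split_long_sentence("one, two, three. " * 30, max_length=100, max_pauses=5))
```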
-
-## README.md
-- en [English](README.md)
-- zh_CN [简体中文](readme/README_CN.md)
-- ru [Русский](readme/README_RU.md)
-
-
-## 🌟 Features
-
-- 📖 Converts eBooks to text format with Calibre.
-- 📚 Splits the eBook into chapters for the audio format.
-- 🎙️ High-quality text-to-speech with Coqui XTTS.
-- 🗣️ Optional voice cloning based on your own voice file.
-- 🌍 Multi-language support (English by default).
-- 🖥️ Needs only 4 GB of RAM to run.
-
-## 🤗 [Demo on HuggingFace](https://huggingface.co/spaces/drewThomasson/ebook2audiobookXTTS)
-- The HuggingFace space runs on the free CPU tier, so don't expect fast processing or an absence of timeout errors. Don't even try it with large files.
-- Best to duplicate the space or run the app locally.
-
-## Free Google Colab [![Free Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DrewThomasson/ebook2audiobookXTTS/blob/main/Notebooks/colab_ebook2audiobookxtts.ipynb)
-
-
-## 🛠️ Requirements
-
-- Python 3.10
-- `coqui-tts` Python package
-- Calibre (for eBook conversion)
-- FFmpeg (for audiobook creation)
-- Optional: your own voice file for narration
-
-
-### 🔧 Installation
-
-1. **Install Python 3.x** from [Python.org](https://www.python.org/downloads/).
-
-2. **Install Calibre**:
-   - **Ubuntu**: `sudo apt-get install -y calibre`
-   - **macOS**: `brew install calibre`
-   - **Windows** (Admin Powershell): `choco install calibre`
-
-3. **Install FFmpeg**:
-   - **Ubuntu**: `sudo apt-get install -y ffmpeg`
-   - **macOS**: `brew install ffmpeg`
-   - **Windows** (Admin Powershell): `choco install ffmpeg`
-
-4. **Optional: Install Mecab** (for non-Latin languages):
-   - **Ubuntu**: `sudo apt-get install -y mecab libmecab-dev mecab-ipadic-utf8`
-   - **macOS**: `brew install mecab`, `brew install mecab-ipadic`
-   - **Windows**: [mecab-website-to-install-manually](https://taku910.github.io/mecab/#download) (Note: Japanese support is limited)
-
-5. **Install Python packages**:
-   ```bash
-   pip install coqui-tts==0.24.2 pydub nltk beautifulsoup4 ebooklib tqdm gradio==4.44.0
-
-   python -m nltk.downloader punkt
-   python -m nltk.downloader punkt_tab
-   ```
-
-   **For non-Latin languages**:
-   ```bash
-   pip install mecab mecab-python3 unidic
-
-   python -m unidic download
-   ```
-
-## 🌐 Supported Languages
-
-- **English (en)**
-- **Spanish (es)**
-- **French (fr)**
-- **German (de)**
-- **Italian (it)**
-- **Portuguese (pt)**
-- **Polish (pl)**
-- **Turkish (tr)**
-- **Russian (ru)**
-- **Dutch (nl)**
-- **Czech (cs)**
-- **Arabic (ar)**
-- **Chinese (zh-cn)**
-- **Japanese (ja)**
-- **Hungarian (hu)**
-- **Korean (ko)**
-
-Specify the code of the desired language when running in headless mode (from the command line).
-## 🚀 Usage
-
-### 🖥️ Launching the Gradio Web Interface
-
-1. **Run the script**:
-   ```bash
-   python app.py
-   ```
-
-2. **Open the web app**: click the link that appears in the terminal to access the web app and convert eBooks.
-3. **For network access**: append `--share True` to the command, like: `python app.py --share True`
-- **[For more parameters]**: use the `-h` flag, like: `python app.py -h`
-
-### 📝 Typical headless usage
-
-```bash
-python app.py --headless True --ebook <path_to_ebook_file> --voice [path_to_voice_file] --language [language_code]
-```
-
-- **<path_to_ebook_file>**: path to the eBook file.
-- **[path_to_voice_file]**: path to a voice sample, for optional voice cloning of the narration.
-- **[language_code]**: optionally, choose a language.
-- **[For more parameters]**: use the `-h` flag, like `python app.py -h`
-
-### 🧩 Headless usage with a custom XTTS model
-
-```bash
-python app.py --headless True --use_custom_model True --ebook <ebook_file_path> --voice <target_voice_file_path> --language <language> --custom_model <custom_model_path> --custom_config <custom_config_path> --custom_vocab <custom_vocab_path>
-```
-
-- **<ebook_file_path>**: path to the eBook file.
-- **<target_voice_file_path>**: path to a voice sample, for optional cloning.
-- **<language>**: optionally, choose a language.
-- **<custom_model_path>**: path to `model.pth`.
-- **<custom_config_path>**: path to `config.json`.
-- **<custom_vocab_path>**: path to `vocab.json`.
-- **[For more parameters]**: use the `-h` flag, like `python app.py -h`
-
-
-### 🧩 Headless usage with a custom XTTS model and a link to a Zip archive containing the XTTS fine-tuned model 🌐
-
-```bash
-python app.py --headless True --use_custom_model True --ebook <ebook_file_path> --voice <target_voice_file_path> --language <language> --custom_model_url <custom_model_URL_ZIP_path>
-```
-
-- **<ebook_file_path>**: path to the eBook file.
-- **<target_voice_file_path>**: path to a voice sample, for optional cloning.
-- **<language>**: optionally, choose a language.
-- **<custom_model_URL_ZIP_path>**: URL of a zip archive containing the model folder. For example, [xtts_David_Attenborough_fine_tune](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/tree/main) `https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true`
-- A custom model still requires a reference audio file with the voice:
-[reference audio file with David Attenborough's voice](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/blob/main/ref.wav)
-- **[For more parameters]**: use the `-h` flag, like `python app.py -h`
-
-### 🔍 For a detailed list of all parameters, use
-```bash
-python app.py -h
-```
-- It will print roughly the following list of options:
-```bash
-usage: app.py [-h] [--share SHARE] [--headless HEADLESS] [--ebook EBOOK] [--voice VOICE]
-              [--language LANGUAGE] [--use_custom_model USE_CUSTOM_MODEL]
-              [--custom_model CUSTOM_MODEL] [--custom_config CUSTOM_CONFIG]
-              [--custom_vocab CUSTOM_VOCAB] [--custom_model_url CUSTOM_MODEL_URL]
-              [--temperature TEMPERATURE] [--length_penalty LENGTH_PENALTY]
-              [--repetition_penalty REPETITION_PENALTY] [--top_k TOP_K] [--top_p TOP_P]
-              [--speed SPEED] [--enable_text_splitting ENABLE_TEXT_SPLITTING]
-
-Convert eBooks to Audiobooks using a Text-to-Speech (TTS) model. You can either launch
-the Gradio interface or run the script in headless mode (command line) for direct
-conversion.
-
-options:
-  -h, --help            show this help message and exit
-  --share SHARE         Set to True to enable a public shareable Gradio link. Defaults
-                        to False.
-  --headless HEADLESS   Set to True to run in headless mode without the Gradio
-                        interface. Defaults to False.
-  --ebook EBOOK         Path to the ebook file for conversion. Required in headless
-                        mode.
-  --voice VOICE         Path to the target voice file for TTS. Optional, uses a default
-                        voice if not provided.
-  --language LANGUAGE   Language for the audiobook conversion. Options: en, es, fr, de,
-                        it, pt, pl, tr, ru, nl, cs, ar, zh-cn, ja, hu, ko. Defaults to
-                        English (en).
-  --use_custom_model USE_CUSTOM_MODEL
-                        Set to True to use a custom TTS model. Defaults to False. Must
-                        be True to use custom models, otherwise you'll get an error.
-  --custom_model CUSTOM_MODEL
-                        Path to the custom model file (.pth). Required if using a custom
-                        model.
-  --custom_config CUSTOM_CONFIG
-                        Path to the custom config file (config.json). Required if using
-                        a custom model.
-  --custom_vocab CUSTOM_VOCAB
-                        Path to the custom vocab file (vocab.json). Required if using a
-                        custom model.
-  --custom_model_url CUSTOM_MODEL_URL
-                        URL to download the custom model as a zip file. Optional, but
-                        will be used if provided. Examples include David Attenborough's
-                        model: 'https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true'.
-                        More fine-tuned XTTS models can be found on Hugging Face at
-                        'https://huggingface.co/drewThomasson'.
-  --temperature TEMPERATURE
-                        Temperature for the model. Defaults to 0.65. Higher temperatures
-                        make the synthesized voice more creative, with more
-                        hallucinations; lower values make it more monotone and calm.
-  --length_penalty LENGTH_PENALTY
-                        A length penalty applied to the autoregressive decoder. Defaults
-                        to 1.0. Not applied to custom models.
-  --repetition_penalty REPETITION_PENALTY
-                        A penalty that prevents the autoregressive decoder from
-                        repeating itself. Defaults to 2.0.
-  --top_k TOP_K         Top-k sampling. Lower values mean more likely outputs and
-                        increased audio generation speed. Defaults to 50.
-  --top_p TOP_P         Top-p sampling. Lower values mean more likely outputs and
-                        increased audio generation speed. Defaults to 0.8.
-  --speed SPEED         Speed factor for the speech generation; the higher the value,
-                        the faster the narrator reads. Defaults to 1.0.
-  --enable_text_splitting ENABLE_TEXT_SPLITTING
-                        Enable splitting text into sentences. Defaults to True.
-
-Example: python script.py --headless --ebook path_to_ebook --voice path_to_voice --language en --use_custom_model True --custom_model model.pth --custom_config config.json --custom_vocab vocab.json
-```
-
-
-
-### 🐳 Using Docker
-
-You can also use Docker to run the eBook-to-audiobook converter. This method keeps environments consistent and simplifies setup.
-
-#### 🚀 Running the Docker Container
-
-To run the Docker container and launch the Gradio interface, use one of the following commands:
-
-Run with CPU only
-```powershell
-docker run -it --rm -p 7860:7860 --platform=linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py
-```
-
-Run with GPU acceleration (NVIDIA GPUs only)
-```powershell
-docker run -it --rm --gpus all -p 7860:7860 --platform=linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py
-```
-
-This command starts the Gradio interface on port 7860 (localhost:7860).
-- For more information about the commands available in headless mode, or about exposing the Gradio link on the network, add the `-h` flag after `app.py` in the Docker command
-<details>
-  <summary>Example of using Docker in headless mode or changing parameters, plus a full guide</summary>
-
-## Example of using Docker in headless mode
-
-First, pull the latest version of the container
-```bash
-docker pull athomasson2/ebook2audiobookxtts:huggingface
-```
-
-- Before running this command, you need to create a directory named "input-folder" in the current directory; it will be mounted into the container, and this is where you place input files so the Docker image can see them
-```bash
-mkdir input-folder && mkdir Audiobooks
-```
-
-- In the command below, replace **YOUR_INPUT_FILE.TXT** with the name of the input file you created
-
-```bash
-docker run -it --rm \
-    -v $(pwd)/input-folder:/home/user/app/input_folder \
-    -v $(pwd)/Audiobooks:/home/user/app/Audiobooks \
-    --platform linux/amd64 \
-    athomasson2/ebook2audiobookxtts:huggingface \
-    python app.py --headless True --ebook /home/user/app/input_folder/YOUR_INPUT_FILE.TXT
-```
-
-- And that's all there is to it!
-
-- The narrated audiobook will be produced in the Audiobooks folder, which is created in the local directory from which you launched Docker
-
-
-## To get help on the parameters, run the following command
-
-```bash
-docker run -it --rm \
-    --platform linux/amd64 \
-    athomasson2/ebook2audiobookxtts:huggingface \
-    python app.py -h
-
-```
-
-
-and the output will be the following
-
-```bash
-user/app/ebook2audiobookXTTS/input-folder -v $(pwd)/Audiobooks:/home/user/app/ebook2audiobookXTTS/Audiobooks --memory="4g" --network none --platform linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py -h
-starting...
-Convert eBooks to Audiobooks using a Text-to-Speech (TTS) model. You can either launch
-the Gradio interface or run the script in headless mode (command line) for direct
-conversion.
-
-options:
-  -h, --help            show this help message and exit
-  --share SHARE         Set to True to enable a public shareable Gradio link. Defaults
-                        to False.
-  --headless HEADLESS   Set to True to run in headless mode without the Gradio
-                        interface. Defaults to False.
-  --ebook EBOOK         Path to the ebook file for conversion. Required in headless
-                        mode.
-  --voice VOICE         Path to the target voice file for TTS. Optional, uses a default
-                        voice if not provided.
-  --language LANGUAGE   Language for the audiobook conversion. Options: en, es, fr, de,
-                        it, pt, pl, tr, ru, nl, cs, ar, zh-cn, ja, hu, ko. Defaults to
-                        English (en).
-  --use_custom_model USE_CUSTOM_MODEL
-                        Set to True to use a custom TTS model. Defaults to False. Must
-                        be True to use custom models, otherwise you'll get an error.
-  --custom_model CUSTOM_MODEL
-                        Path to the custom model file (.pth). Required if using a custom
-                        model.
-  --custom_config CUSTOM_CONFIG
-                        Path to the custom config file (config.json). Required if using
-                        a custom model.
-  --custom_vocab CUSTOM_VOCAB
-                        Path to the custom vocab file (vocab.json). Required if using a
-                        custom model.
-  --custom_model_url CUSTOM_MODEL_URL
-                        URL to download the custom model as a zip file. Optional, but
-                        will be used if provided. Examples include David Attenborough's
-                        model: 'https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true'.
-                        More fine-tuned XTTS models can be found on Hugging Face at
-                        'https://huggingface.co/drewThomasson'.
-  --temperature TEMPERATURE
-                        Temperature for the model. Defaults to 0.65. Higher temperatures
-                        make the synthesized voice more creative, with more
-                        hallucinations; lower values make it more monotone and calm.
-  --length_penalty LENGTH_PENALTY
-                        A length penalty applied to the autoregressive decoder. Defaults
-                        to 1.0. Not applied to custom models.
-  --repetition_penalty REPETITION_PENALTY
-                        A penalty that prevents the autoregressive decoder from
-                        repeating itself. Defaults to 2.0.
-  --top_k TOP_K         Top-k sampling. Lower values mean more likely outputs and
-                        increased audio generation speed. Defaults to 50.
-  --top_p TOP_P         Top-p sampling. Lower values mean more likely outputs and
-                        increased audio generation speed. Defaults to 0.8.
-  --speed SPEED         Speed factor for the speech generation; the higher the value,
-                        the faster the narrator reads. Defaults to 1.0.
-  --enable_text_splitting ENABLE_TEXT_SPLITTING
-                        Enable splitting text into sentences. Defaults to True.
-
-Example: python script.py --headless --ebook path_to_ebook --voice path_to_voice --language en --use_custom_model True --custom_model model.pth --custom_config config.json --custom_vocab vocab.json
-```
-</details>
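For reference, delivery in the Space build of `app.py` (deleted earlier in this diff) does not happen through the browser: the finished `.m4b` is POSTed to file.io and the resulting link is emailed to the user. The relevant fragment, trimmed into a standalone helper; the `send_email` implementation is not part of this diff, so the print is a stand-in:

```python
# Trimmed from process_request() in the legacy app.py above.
import requests

def upload_audiobook(m4b_filepath):
    with open(m4b_filepath, "rb") as f:
        response = requests.post("https://file.io", files={"file": f})
    return response.json().get("link", "")

link = upload_audiobook("./Audiobooks/my_book.m4b")
print(f"Download link: {link}")  # app.py passes this to send_email(email, link)
```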
-
-#### 🖥️ Docker Web interface
-![demo_web_gui](https://github.com/user-attachments/assets/85af88a7-05dd-4a29-91de-76a14cf5ef06)
-
-<details>
-  <summary>Click to view images of the Web interface</summary>
-image
-image
-image
-</details>
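Behind this interface, the Space build serializes work: requests go into a queue that a single daemon thread drains, which is why the status box reports your position in line. The skeleton of that mechanism, reduced from the deleted `app.py` (`process_request` is stubbed here so the snippet runs on its own):

```python
# Skeleton of the handle_queue() worker from the legacy app.py above.
from queue import Queue
from threading import Thread

queue = Queue()

def process_request(ebook_path, voice_path, language):
    print(f"Converting {ebook_path} ({language})...")  # stub for the real pipeline

def handle_queue():
    while True:
        job = queue.get()       # blocks until a request is enqueued
        process_request(*job)
        queue.task_done()

Thread(target=handle_queue, daemon=True).start()
queue.put(("book.epub", None, "en"))
queue.join()                    # wait for the demo job to finish
```

A single worker keeps only one XTTS model resident at a time, which matters on the 4 GB RAM / free CPU tier the Space runs on.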
-
-### 🛠️ For custom XTTS models
-
-Models are built for better results with a specific voice. Check out the various models on Hugging Face [here](https://huggingface.co/drewThomasson).
-
-To use a custom model, use the link to the model archive `Finished_model_files.zip`, for example:
-[David Attenborough fine-tuned voice Finished_model_files.zip](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true)
-
-A custom model also needs a voice reference file:
-[voice reference file for David Attenborough](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/blob/main/ref.wav)
-
-
-
-More information can be found on the [Dockerfile Hub page](https://hub.docker.com/repository/docker/athomasson2/ebook2audiobookxtts/general).
-
-## 🌐 Fine-tuned XTTS models
-
-To find already prepared fine-tuned XTTS models, visit [this Hugging Face page](https://huggingface.co/drewThomasson) 🌐. Look for models whose names contain "xtts fine tune".
-
-## 🎥 Demos
-
-Rainy day voice
-
-https://github.com/user-attachments/assets/8486603c-38b1-43ce-9639-73757dfb1031
-
-David Attenborough's voice
-
-https://github.com/user-attachments/assets/47c846a7-9e51-4eb9-844a-7460402a20a8
-
-
-## 🤗 [Demo in a Huggingface space](https://huggingface.co/spaces/drewThomasson/ebook2audiobookXTTS)
-- Huggingface spaces run on the free CPU tier, so expect very slow processing and frequent timeout errors. Don't try to convert large files.
-- Best to duplicate the space or run it locally.
-
-## Free Google Colab [![Free Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DrewThomasson/ebook2audiobookXTTS/blob/main/Notebooks/colab_ebook2audiobookxtts.ipynb)
-
-
-
-## 📚 Supported eBook Formats
-
-- **Supported**: `.epub`, `.pdf`, `.mobi`, `.txt`, `.html`, `.rtf`, `.chm`, `.lit`, `.pdb`, `.fb2`, `.odt`, `.cbr`, `.cbz`, `.prc`, `.lrf`, `.pml`, `.snb`, `.cbc`, `.rb`, `.tcr`
-- **Best**: `.epub` or `.mobi`, for automatic chapter detection.
-
-## 📂 Output
-
-- Creates an `.m4b` file with metadata and chapters.
-- **Example output**: ![Example](https://github.com/DrewThomasson/VoxNovel/blob/dc5197dff97252fa44c391dc0596902d71278a88/readme_files/example_in_app.jpeg)
-
-## 🛠️ Common issues:
-- "It's so slow!" - Conversion on CPU alone is slow; the only way to speed it up is an NVIDIA GPU: [discussion](https://github.com/DrewThomasson/ebook2audiobookXTTS/discussions/19#discussioncomment-10879846). For fast multilingual audio generation, consider the other project [that uses piper-tts](https://github.com/DrewThomasson/ebook2audiobookpiper-tts) instead. (It has no zero-shot voice cloning, though, and the voices are Siri-quality, but it is much faster on CPU.)
-- "I have dependency problems" - Just use Docker. The Docker images are self-contained; they also have a headless command-line mode and a flag for printing help.
-- "My audio is getting cut off!" - Please open an issue about it; the author does not speak every supported language and needs help tuning the automatic sentence-splitting logic for each of them. 😊
-- "The process is stuck at 30% in the web interface!" - Progress display in the web interface is basic, with only 3 steps; to monitor the process, watch the terminal output, which shows the sentence currently being processed.
-
-## What help is needed! 🙌
-## [The full list is here](https://github.com/DrewThomasson/ebook2audiobookXTTS/issues/32)
-- Any help from speakers of the supported languages to get sentence splitting right.
-- Potential help creating readme guides for different languages (the author only knows English 😔).
-
-## 🙏 Special thanks
-
-- **Coqui TTS**: [Coqui TTS GitHub](https://github.com/coqui-ai/TTS)
-- **Calibre**: [Calibre Website](https://calibre-ebook.com)
-
-- [@shakenbake15 for a better chapter-saving method](https://github.com/DrewThomasson/ebook2audiobookXTTS/issues/8)
-
diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/ar.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/ar.m4b
deleted file mode 100644
index ddc6c4426b0112ee6d715fc7de8412c2b2bb9c45..0000000000000000000000000000000000000000
Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/ar.m4b and /dev/null differ
diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/cs.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/cs.m4b
deleted file mode 100644
index 4e3c05732c31bf8b792665e6c84ebdc89ca47713..0000000000000000000000000000000000000000
Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/cs.m4b and /dev/null differ
diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/de.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/de.m4b
deleted file mode 100644
index 3f64f7bf89674f913dd7a9b0c7f5f5c5509fd3d2..0000000000000000000000000000000000000000
Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/de.m4b and /dev/null differ
diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/en.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/en.m4b
deleted file mode 100644
index 9d0cc11bcb0b3b59cf4777f00fded4852e6c4444..0000000000000000000000000000000000000000
Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/en.m4b and /dev/null differ
diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/es.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/es.m4b
deleted file mode 100644
index 289845821835a39116cc77f7d3fa6bd6d1524f96..0000000000000000000000000000000000000000
Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/es.m4b and /dev/null differ
diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/fr.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/fr.m4b
deleted file mode 100644
index eb9605f98d4c6c5f484ab3dc4356e468c19b4584..0000000000000000000000000000000000000000
Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/fr.m4b and /dev/null differ
diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/hu.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/hu.m4b
deleted file mode 100644
index
ae120e9e82c05f6063feb87a37071c3ce2f02b42..0000000000000000000000000000000000000000 Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/hu.m4b and /dev/null differ diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/it.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/it.m4b deleted file mode 100644 index 37686283c1b7e3dcd34d5af75696b1550b531998..0000000000000000000000000000000000000000 Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/it.m4b and /dev/null differ diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/ko.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/ko.m4b deleted file mode 100644 index d84bde3e200ee5699fbd6a8cbd05ee06d3c69dc8..0000000000000000000000000000000000000000 Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/ko.m4b and /dev/null differ diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/nl.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/nl.m4b deleted file mode 100644 index 2b3d04cbf83a1967c9ddffd72d26b9b9840c597f..0000000000000000000000000000000000000000 Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/nl.m4b and /dev/null differ diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/pl.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/pl.m4b deleted file mode 100644 index 90b44063b1276e3bd9ddffce917b1ec501bb183a..0000000000000000000000000000000000000000 Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/pl.m4b and /dev/null differ diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/pt.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/pt.m4b deleted file mode 100644 index 39d82c8766f15a8194c60eac76c3e312d5a02af8..0000000000000000000000000000000000000000 Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/pt.m4b and /dev/null differ diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/ru.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/ru.m4b deleted file mode 100644 index 0aa99a24787841d2a3e2d77616b88fea117b3399..0000000000000000000000000000000000000000 Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/ru.m4b and /dev/null differ diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/tr.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/tr.m4b deleted file mode 100644 index 39f18e33611e6317a2c2cd9d1878a47dfa64d673..0000000000000000000000000000000000000000 Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/tr.m4b and /dev/null differ diff --git a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/zh-cn.m4b b/legacy/v1.0/samples/Supported_language_sample__generated_outputs/zh-cn.m4b deleted file mode 100644 index 12d3acebb40aefe45fe890268feb4dbdef5e38a8..0000000000000000000000000000000000000000 Binary files a/legacy/v1.0/samples/Supported_language_sample__generated_outputs/zh-cn.m4b and /dev/null differ diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/ar.txt b/legacy/v1.0/samples/Supported_language_sample_texts/ar.txt deleted file mode 100644 index 57d42f9b4f3bec479f9b5ad00a6a5d85fe6bbc57..0000000000000000000000000000000000000000 --- 
a/legacy/v1.0/samples/Supported_language_sample_texts/ar.txt +++ /dev/null @@ -1 +0,0 @@ -في البداية كان هناك نور، وظهر العالم إلى الوجود. ارتفعت الجبال، جرت الأنهار، وازدهرت الغابات بالحياة. ومع شروق الشمس كل يوم، اكتشف الناس عجائب الأرض. بنوا المنازل، شكلوا المجتمعات، وبدأوا في تأسيس الحضارات. مع مرور الوقت، انتقلت المعرفة من جيل إلى جيل، حاملةً معها القدرة على تشكيل المستقبل. ومن خلال الانتصارات والتحديات، واصلت البشرية النمو، واستكشفت أسرار الكون الواسعة. \ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/cs.txt b/legacy/v1.0/samples/Supported_language_sample_texts/cs.txt deleted file mode 100644 index ec22bb3d3abb44ad8059d94cfa5aa5f014593cf9..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/cs.txt +++ /dev/null @@ -1 +0,0 @@ -Na počátku bylo světlo, a svět vznikl. Hory se zvedly, řeky tekly a lesy se rozkvétaly životem. Každý den s východem slunce lidé objevovali zázraky Země. Stavěli domy, tvořili komunity a zakládali civilizace. Časem se znalosti předávaly dál a přinášely s sebou moc formovat budoucnost. Prostřednictvím triumfů a výzev lidstvo stále rostlo a zkoumalo nekonečná tajemství vesmíru. \ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/de.txt b/legacy/v1.0/samples/Supported_language_sample_texts/de.txt deleted file mode 100644 index 02a619bc8f55586f9747c5dd1762580f933802fe..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/de.txt +++ /dev/null @@ -1 +0,0 @@ -Am Anfang war Licht, und die Welt entstand. Berge erhoben sich, Flüsse flossen, und Wälder erblühten mit Leben. Als die Sonne jeden Tag aufging, entdeckten die Menschen die Wunder der Erde. Sie bauten Häuser, gründeten Gemeinschaften und begannen Zivilisationen. Mit der Zeit wurde Wissen weitergegeben, und mit ihm die Fähigkeit, die Zukunft zu gestalten. Durch Erfolge und Herausforderungen wuchs die Menschheit weiter und erforschte die weiten Geheimnisse des Universums. \ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/en.txt b/legacy/v1.0/samples/Supported_language_sample_texts/en.txt deleted file mode 100644 index c1c34c66ca39b394344d89cc16da22a743492c07..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/en.txt +++ /dev/null @@ -1 +0,0 @@ -In the beginning, there was light, and the world came into existence. Mountains rose, rivers flowed, and forests flourished with life. As the sun rose each day, people discovered the wonders of the Earth. They built homes, formed communities, and started civilizations. Over time, knowledge was passed down, and with it, the power to shape the future. Through triumphs and struggles, humanity continued to grow, exploring the vast mysteries of the universe. \ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/es.txt b/legacy/v1.0/samples/Supported_language_sample_texts/es.txt deleted file mode 100644 index 10e95eb0aa4bda142b17d7fe1e9fa03988fdd0fb..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/es.txt +++ /dev/null @@ -1 +0,0 @@ -Al principio, había luz, y el mundo comenzó a existir. Las montañas se alzaron, los ríos fluyeron, y los bosques se llenaron de vida. A medida que el sol se elevaba cada día, las personas descubrieron las maravillas de la Tierra. Construyeron hogares, formaron comunidades y comenzaron civilizaciones. 
Con el tiempo, el conocimiento se transmitió de generación en generación, trayendo consigo el poder de moldear el futuro. A través de triunfos y dificultades, la humanidad continuó creciendo, expl... \ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/fr.txt b/legacy/v1.0/samples/Supported_language_sample_texts/fr.txt deleted file mode 100644 index b8fc2db64aea9c6dc26ec6064b75ef231fc836d2..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/fr.txt +++ /dev/null @@ -1 +0,0 @@ -Au commencement, il y avait de la lumière, et le monde prit forme. Les montagnes s'élevèrent, les rivières coulèrent, et les forêts s'épanouirent avec la vie. Chaque jour, sous le soleil levant, les hommes découvraient les merveilles de la Terre. Ils construisirent des habitations, formèrent des communautés et fondèrent des civilisations. Avec le temps, le savoir se transmit, apportant le pouvoir de façonner l'avenir. À travers les victoires et les épreuves, l'humanité continua à croître, expl... \ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/hu.txt b/legacy/v1.0/samples/Supported_language_sample_texts/hu.txt deleted file mode 100644 index 1da1b6f6e151b837c92f2e1e9cd70898802b0116..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/hu.txt +++ /dev/null @@ -1 +0,0 @@ -Kezdetben fény volt, és a világ létrejött. A hegyek felmagasodtak, a folyók folytak, és az erdők élettel teltek meg. Ahogy a nap minden nap felkelt, az emberek felfedezték a Föld csodáit. Házakat építettek, közösségeket hoztak létre, és civilizációkat alapítottak. Idővel a tudás generációról generációra szállt, magával hozva a jövő alakításának erejét. \ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/it.txt b/legacy/v1.0/samples/Supported_language_sample_texts/it.txt deleted file mode 100644 index 6e63d31a9d9ea3127e5d99ff4b557aaca9ce216f..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/it.txt +++ /dev/null @@ -1 +0,0 @@ -All'inizio c'era la luce, e il mondo venne all'esistenza. Le montagne si innalzarono, i fiumi scorrevano, e le foreste si riempirono di vita. Ogni giorno, con il sorgere del sole, le persone scoprirono le meraviglie della Terra. Costruirono case, formarono comunità e fondarono civiltà. Col passare del tempo, la conoscenza si tramandò, portando con sé il potere di plasmare il futuro. Attraverso trionfi e difficoltà, l'umanità continuò a crescere, esplorando i vasti misteri dell'universo. 
\ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/ja.txt b/legacy/v1.0/samples/Supported_language_sample_texts/ja.txt deleted file mode 100644 index 03190c801efb19cfdf1421edb271eb11be08041f..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/ja.txt +++ /dev/null @@ -1 +0,0 @@ -はじめに光があり、世界が存在し始めました。山々がそびえ立ち、川が流れ、森が生命であふれました。太陽が毎日昇ると、人々は地球の不思議を発見しました。彼らは家を建て、コミュニティを形成し、文明を始めました。時が経つにつれ、知識は代々受け継がれ、未来を形作る力を持つようになりました。勝利と試練を通じて、人類は成長し続け、宇宙の広大な謎を探求していきました。 \ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/ko.txt b/legacy/v1.0/samples/Supported_language_sample_texts/ko.txt deleted file mode 100644 index 51145d37668e8df5776a9074d3ee4af3f416a136..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/ko.txt +++ /dev/null @@ -1 +0,0 @@ -처음에 빛이 있었고 세상이 존재하게 되었습니다. 산이 솟아오르고 강이 흘러가며 숲은 생명으로 가득 찼습니다. 매일 태양이 떠오를 때 사람들은 지구의 경이로움을 발견했습니다. 그들은 집을 짓고 공동체를 형성하며 문명을 시작했습니다. 시간이 흐르면서 지식은 세대를 거쳐 전해졌고, 미래를 형성할 수 있는 힘이 되었습니다. \ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/nl.txt b/legacy/v1.0/samples/Supported_language_sample_texts/nl.txt deleted file mode 100644 index c934bd6cd4ab1d440ddaacf0ba5ef1fb43f03c8c..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/nl.txt +++ /dev/null @@ -1 +0,0 @@ -In het begin was er licht, en de wereld kwam tot bestaan. Bergen rezen op, rivieren stroomden, en bossen bloeiden vol leven. Toen de zon elke dag opkwam, ontdekten mensen de wonderen van de aarde. Ze bouwden huizen, vormden gemeenschappen en stichtten beschavingen. Na verloop van tijd werd kennis doorgegeven, samen met de kracht om de toekomst vorm te geven. Door triomfen en uitdagingen bleef de mensheid groeien en de uitgestrekte mysteries van het universum verkennen. \ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/pl.txt b/legacy/v1.0/samples/Supported_language_sample_texts/pl.txt deleted file mode 100644 index d84fdc8ab31bc7cdc803b44a6dd9c3044ff5f873..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/pl.txt +++ /dev/null @@ -1 +0,0 @@ -Na początku było światło, a świat powstał. Góry wyrosły, rzeki płynęły, a lasy zakwitły życiem. Gdy każdego dnia wschodziło słońce, ludzie odkrywali cuda Ziemi. Budowali domy, tworzyli wspólnoty i zakładali cywilizacje. Z czasem wiedza była przekazywana, niosąc ze sobą moc kształtowania przyszłości. Przez triumfy i trudności ludzkość wciąż rosła, odkrywając ogromne tajemnice wszechświata. \ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/pt.txt b/legacy/v1.0/samples/Supported_language_sample_texts/pt.txt deleted file mode 100644 index 250b095ecbd2f4bf08d8711416184138bbe4e3a8..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/pt.txt +++ /dev/null @@ -1 +0,0 @@ -No início, havia luz, e o mundo passou a existir. As montanhas se ergueram, os rios fluíram e as florestas floresceram com vida. Com o nascer do sol a cada dia, as pessoas descobriram as maravilhas da Terra. Construíram casas, formaram comunidades e fundaram civilizações. Com o tempo, o conhecimento foi transmitido, trazendo consigo o poder de moldar o futuro. Através de triunfos e desafios, a humanidade continuou a crescer, explorando os vastos mistérios do universo. 
\ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/ru.txt b/legacy/v1.0/samples/Supported_language_sample_texts/ru.txt deleted file mode 100644 index d730687815b6aaee2be6dd3a971060000fc44d98..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/ru.txt +++ /dev/null @@ -1 +0,0 @@ -В начале был свет, и мир появился. Горы поднялись, реки текли, а леса процветали жизнью. Каждый день с восходом солнца люди открывали чудеса Земли. Они строили дома, создавали сообщества и основывали цивилизации. Со временем знания передавались, принося с собой силу формировать будущее. Через триумфы и трудности человечество продолжало расти, исследуя необъятные тайны вселенной. \ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/tr.txt b/legacy/v1.0/samples/Supported_language_sample_texts/tr.txt deleted file mode 100644 index da0d7286b498cbedb12a2d9a285dcaf1a220ca92..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/tr.txt +++ /dev/null @@ -1 +0,0 @@ -Başlangıçta ışık vardı ve dünya var oldu. Dağlar yükseldi, nehirler aktı ve ormanlar hayatla doldu. Güneş her gün doğarken, insanlar Dünya'nın harikalarını keşfettiler. Evler inşa ettiler, topluluklar kurdular ve medeniyetler başlattılar. Zamanla bilgi kuşaktan kuşağa aktarıldı ve geleceği şekillendirme gücü beraberinde geldi. Zaferler ve zorluklar boyunca insanlık büyümeye devam etti, evrenin geniş gizemlerini keşfetti. \ No newline at end of file diff --git a/legacy/v1.0/samples/Supported_language_sample_texts/zh-cn.txt b/legacy/v1.0/samples/Supported_language_sample_texts/zh-cn.txt deleted file mode 100644 index c02b4af678ed516c7b5b66375775b31810040101..0000000000000000000000000000000000000000 --- a/legacy/v1.0/samples/Supported_language_sample_texts/zh-cn.txt +++ /dev/null @@ -1 +0,0 @@ -一开始,世界充满了光明,万物开始存在。山峦拔地而起,河流奔涌,森林里充满了生命。随着太阳每天升起,人们发现了地球的奇观。他们建造房屋,形成社区,开始了文明的历程。随着时间的推移,知识代代相传,并赋予了塑造未来的力量。在胜利与挑战中,人类不断成长,探索着宇宙的广袤奥秘。 \ No newline at end of file diff --git a/lib/__pycache__/conf.cpython-312.pyc b/lib/__pycache__/conf.cpython-312.pyc deleted file mode 100644 index dd752ad12e897beaf9756d9ba9ddeb924fed955b..0000000000000000000000000000000000000000 Binary files a/lib/__pycache__/conf.cpython-312.pyc and /dev/null differ diff --git a/lib/__pycache__/functions.cpython-312.pyc b/lib/__pycache__/functions.cpython-312.pyc deleted file mode 100644 index ad5eb92a41a6521c8c1300a39bde6156a6a4de25..0000000000000000000000000000000000000000 Binary files a/lib/__pycache__/functions.cpython-312.pyc and /dev/null differ diff --git a/lib/__pycache__/lang.cpython-312.pyc b/lib/__pycache__/lang.cpython-312.pyc deleted file mode 100644 index 1266e851876d8d4d8d7f647c2e9d899395946170..0000000000000000000000000000000000000000 Binary files a/lib/__pycache__/lang.cpython-312.pyc and /dev/null differ diff --git a/lib/__pycache__/tokenizer.cpython-312.pyc b/lib/__pycache__/tokenizer.cpython-312.pyc deleted file mode 100644 index 39801533eb25c6843bf72c1346318cb91860d7ff..0000000000000000000000000000000000000000 Binary files a/lib/__pycache__/tokenizer.cpython-312.pyc and /dev/null differ diff --git a/lib/conf.py b/lib/conf.py deleted file mode 100644 index bf867afe2f568c76c785eb389e95d5e49cf4bcdb..0000000000000000000000000000000000000000 --- a/lib/conf.py +++ /dev/null @@ -1,149 +0,0 @@ -import os -from lib.lang import default_voice_file - -NATIVE = 'native' -DOCKER_UTILS = 'docker_utils' -FULL_DOCKER = 
'full_docker' - -version = '2.0.0' -min_python_version = (3,10) -max_python_version = (3,12) - -requirements_file = os.path.abspath(os.path.join('.','requirements.txt')) - -docker_utils_image = 'utils' - -interface_host = '0.0.0.0' -interface_port = 7860 -interface_shared_expire = 72 # hours -interface_concurrency_limit = 8 # or None for unlimited -interface_component_options = { - "gr_tab_preferences": True, - "gr_voice_file": True, - "gr_group_custom_model": True -} - -python_env_dir = os.path.abspath(os.path.join('.','python_env')) - -models_dir = os.path.abspath(os.path.join('.','models')) -ebooks_dir = os.path.abspath(os.path.join('.','ebooks')) -processes_dir = os.path.abspath(os.path.join('.','tmp')) - -audiobooks_gradio_dir = os.path.abspath(os.path.join('.','audiobooks','gui','gradio')) -audiobooks_host_dir = os.path.abspath(os.path.join('.','audiobooks','gui','host')) -audiobooks_cli_dir = os.path.abspath(os.path.join('.','audiobooks','cli')) - -# <<<<<<< HEAD -# Automatically accept the non-commercial license -os.environ['COQUI_TOS_AGREED'] = '1' -os.environ['CALIBRE_TEMP_DIR'] = processes_dir -os.environ['CALIBRE_CACHE_DIRECTORY'] = processes_dir -os.environ['CALIBRE_NO_NATIVE_FILEDIALOGS'] = '1' -os.environ['DO_NOT_TRACK'] = 'true' -os.environ['HUGGINGFACE_HUB_CACHE'] = models_dir -os.environ['TTS_HOME'] = models_dir -os.environ['HF_HOME'] = models_dir -os.environ['HF_DATASETS_CACHE'] = models_dir -os.environ['HF_TOKEN_PATH'] = os.path.join(os.path.expanduser('~'), '.huggingface_token') -os.environ['TTS_CACHE'] = models_dir -os.environ['TORCH_HOME'] = models_dir -os.environ['XDG_CACHE_HOME'] = models_dir - -ebook_formats = ['.epub', '.mobi', '.azw3', 'fb2', 'lrf', 'rb', 'snb', 'tcr', '.pdf', '.txt', '.rtf', 'doc', '.docx', '.html', '.odt', '.azw'] -audiobook_format = 'm4b' # or 'mp3' -audioproc_format = 'wav' # only 'wav' is valid for now - -default_tts_engine = 'xtts' -default_fine_tuned = 'std' -default_model_files = ['config.json', 'vocab.json', 'model.pth', 'ref.wav'] - -models = { - "xtts": { - "std": { - "lang": "multi", - "repo": "tts_models/multilingual/multi-dataset/xtts_v2", - "sub": "", - "voice": default_voice_file - }, - "AiExplained": { - "lang": "eng", - "repo": "drewThomasson/fineTunedTTSModels", - "sub": "xtts-v2/eng/AiExplained", - "voice": os.path.abspath(os.path.join("voices", "eng", "adult", "male", "AiExplained_24khz.wav")) - }, - "BobOdenkirk": { - "lang": "eng", - "repo": "drewThomasson/fineTunedTTSModels", - "sub": "xtts-v2/eng/BobOdenkirk", - "voice": os.path.abspath(os.path.join("voices", "eng", "adult", "male", "BobOdenkirk_24khz.wav")) - }, - "BobRoss": { - "lang": "eng", - "repo": "drewThomasson/fineTunedTTSModels", - "sub": "xtts-v2/eng/BobRoss", - "voice": os.path.abspath(os.path.join("voices", "eng", "adult", "male", "BobRoss_24khz.wav")) - }, - "BryanCranston": { - "lang": "eng", - "repo": "drewThomasson/fineTunedTTSModels", - "sub": "xtts-v2/eng/BryanCranston", - "voice": os.path.abspath(os.path.join("voices", "eng", "adult", "male", "BryanCranston_24khz.wav")) - }, - "DavidAttenborough": { - "lang": "eng", - "repo": "drewThomasson/fineTunedTTSModels", - "sub": "xtts-v2/eng/DavidAttenborough", - "voice": os.path.abspath(os.path.join("voices", "eng", "elder", "male", "DavidAttenborough_24khz.wav")) - }, - "DeathPuss&Boots": { - "lang": "eng", - "repo": "drewThomasson/fineTunedTTSModels", - "sub": "xtts-v2/eng/DeathPuss&Boots", - "voice": os.path.abspath(os.path.join("voices", "eng", "adult", "male", "DeathPuss&Boots_24khz.wav")) - }, - 
"GhostMW2": { - "lang": "eng", - "repo": "drewThomasson/fineTunedTTSModels", - "sub": "xtts-v2/eng/GhostMW2", - "voice": os.path.abspath(os.path.join("voices", "eng", "adult", "male", "GhostMW2_24khz.wav")) - }, - "JhonButlerASMR": { - "lang": "eng", - "repo": "drewThomasson/fineTunedTTSModels", - "sub": "xtts-v2/eng/JhonButlerASMR", - "voice": os.path.abspath(os.path.join("voices", "eng", "elder", "male", "JhonButlerASMR_24khz.wav")) - }, - "JhonMulaney": { - "lang": "eng", - "repo": "drewThomasson/fineTunedTTSModels", - "sub": "xtts-v2/eng/JhonMulaney", - "voice": os.path.abspath(os.path.join("voices", "eng", "adult", "male", "JhonMulaney_24khz.wav")) - }, - "MorganFreeman": { - "lang": "eng", - "repo": "drewThomasson/fineTunedTTSModels", - "sub": "xtts-v2/eng/MorganFreeman", - "voice": os.path.abspath(os.path.join("voices", "eng", "adult", "male", "MorganFreeman_24khz.wav")) - }, - "RainyDayHeadSpace": { - "lang": "eng", - "repo": "drewThomasson/fineTunedTTSModels", - "sub": "xtts-v2/eng/RainyDayHeadSpace", - "voice": os.path.abspath(os.path.join("voices", "eng", "elder", "male", "RainyDayHeadSpace_24khz.wav")) - }, - "WhisperSalemASMR": { - "lang": "eng", - "repo": "drewThomasson/fineTunedTTSModels", - "sub": "xtts-v2/eng/WhisperSalemASMR", - "voice": os.path.abspath(os.path.join("voices", "eng", "adult", "male", "WhisperSalemASMR_24khz.wav")) - } - }, - "fairseq": { - "std": { - "lang": "multi", - "repo": "tts_models/[lang]/fairseq/vits", - "sub": "", - "voice": default_voice_file - } - } -} \ No newline at end of file diff --git a/lib/functions.py b/lib/functions.py deleted file mode 100644 index b9f1f4498182caf3df7b72f3ac9047b96bcb47d9..0000000000000000000000000000000000000000 --- a/lib/functions.py +++ /dev/null @@ -1,1594 +0,0 @@ -import argparse -import csv -import docker -import ebooklib -import fnmatch -import gradio as gr -import hashlib -import json -import numpy as np -import os -import regex as re -import requests -import shutil -import socket -import subprocess -import sys -import threading -import time -import torch -import torchaudio -import urllib.request -import uuid -import zipfile -import traceback - -from bs4 import BeautifulSoup -from collections import Counter -from collections.abc import MutableMapping -from datetime import datetime -from ebooklib import epub -from glob import glob -from huggingface_hub import hf_hub_download -from iso639 import languages -from multiprocessing import Manager, Event -from pydub import AudioSegment -from tqdm import tqdm -from translate import Translator -from TTS.api import TTS as XTTS -from TTS.tts.configs.xtts_config import XttsConfig -from TTS.tts.models.xtts import Xtts -from urllib.parse import urlparse - -import lib.conf as conf -import lib.lang as lang - -def inject_configs(target_namespace): - # Extract variables from both modules and inject them into the target namespace - for module in (conf, lang): - target_namespace.update({k: v for k, v in vars(module).items() if not k.startswith('__')}) - -# Inject configurations into the global namespace of this module -inject_configs(globals()) - -def recursive_proxy(data, manager=None): - """Recursively convert a nested dictionary into Manager.dict proxies.""" - if manager is None: - manager = Manager() - if isinstance(data, dict): - proxy_dict = manager.dict() - for key, value in data.items(): - proxy_dict[key] = recursive_proxy(value, manager) - return proxy_dict - elif isinstance(data, list): - proxy_list = manager.list() - for item in data: - 
proxy_list.append(recursive_proxy(item, manager))
-        return proxy_list
-    elif isinstance(data, (str, int, float, bool, type(None))):  # Scalars
-        return data
-    else:
-        raise TypeError(f"Unsupported data type: {type(data)}")
-
-class ConversionContext:
-    def __init__(self):
-        self.manager = Manager()
-        self.sessions = self.manager.dict()  # Store all session-specific contexts
-        self.cancellation_events = {}  # Store multiprocessing.Event for each session
-
-    def get_session(self, session_id):
-        """Retrieve or initialize session-specific context"""
-        if session_id not in self.sessions:
-            self.sessions[session_id] = recursive_proxy({
-                "script_mode": NATIVE,
-                "client": None,
-                "language": default_language_code,
-                "audiobooks_dir": None,
-                "tmp_dir": None,
-                "src": None,
-                "id": session_id,
-                "chapters_dir": None,
-                "chapters_dir_sentences": None,
-                "epub": None,
-                "epub_path": None,
-                "filename_noext": None,
-                "fine_tuned": None,
-                "voice_file": None,
-                "custom_model": None,
-                "custom_model_dir": None,
-                "chapters": None,
-                "cover": None,
-                "metadata": {
-                    "title": None,
-                    "creator": None,
-                    "contributor": None,
-                    "language": None,
-                    "language_iso1": None,
-                    "identifier": None,
-                    "publisher": None,
-                    "date": None,
-                    "description": None,
-                    "subject": None,
-                    "rights": None,
-                    "format": None,
-                    "type": None,
-                    "coverage": None,
-                    "relation": None,
-                    "Source": None,
-                    "Modified": None,
-                },
-                "status": "Idle",
-                "progress": 0,
-                "cancellation_requested": False
-            }, manager=self.manager)
-        return self.sessions[session_id]
-
-context = ConversionContext()
-is_gui_process = False
-
-class DependencyError(Exception):
-    def __init__(self, message=None):
-        super().__init__(message)
-        # Automatically handle the exception when it's raised
-        self.handle_exception()
-
-    def handle_exception(self):
-        # Print the full traceback of the exception
-        traceback.print_exc()
-
-        # Print the exception message
-        print(f'Caught DependencyError: {self}')
-
-        # Exit the script if it's not a web process
-        if not is_gui_process:
-            sys.exit(1)
-
-def prepare_dirs(src, session):
-    try:
-        resume = False
-        os.makedirs(os.path.join(models_dir,'tts'), exist_ok=True)
-        os.makedirs(session['tmp_dir'], exist_ok=True)
-        os.makedirs(session['custom_model_dir'], exist_ok=True)
-        os.makedirs(session['audiobooks_dir'], exist_ok=True)
-        session['src'] = os.path.join(session['tmp_dir'], os.path.basename(src))
-        if os.path.exists(session['src']):
-            if compare_files_by_hash(session['src'], src):
-                resume = True
-        if not resume:
-            shutil.rmtree(session['chapters_dir'], ignore_errors=True)
-        os.makedirs(session['chapters_dir'], exist_ok=True)
-        os.makedirs(session['chapters_dir_sentences'], exist_ok=True)
-        shutil.copy(src, session['src'])
-        return True
-    except Exception as e:
-        raise DependencyError(e)
-
-def check_programs(prog_name, command, options):
-    try:
-        subprocess.run([command, options], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        return True, None
-    except FileNotFoundError:
-        e = f'''********** Error: {prog_name} is not installed! If your OS's calibre package version
-        is not compatible, you can still run ebook2audiobook.sh (Linux/macOS) or ebook2audiobook.cmd (Windows) **********'''
-        raise DependencyError(e)
-    except subprocess.CalledProcessError:
-        e = f'Error: There was an issue running {prog_name}.'
-        raise DependencyError(e)
-
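A quick, hypothetical illustration of how the session store above behaves (assuming the repository's dependencies are installed): sessions are created on first access and returned as `Manager` proxies, so updates made in one process are visible to the others. The session id below is a placeholder.

```python
# Hypothetical usage of the ConversionContext session store defined above.
from lib.functions import context

session = context.get_session('demo-session-id')   # created on first access
session['status'] = 'Converting'                   # writes go through the Manager proxy
session['metadata']['title'] = 'Demo'              # nested dicts are proxies as well
print(session['progress'], session['cancellation_requested'])  # 0 False
```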
-def check_fine_tuned(fine_tuned, language):
-    try:
-        for parent, children in models.items():
-            if fine_tuned in children:
-                if language_xtts.get(language):
-                    tts = 'xtts'
-                else:
-                    tts = 'fairseq'
-                if parent == tts:
-                    return parent
-        return False
-    except Exception as e:
-        raise RuntimeError(e)
-
-def analyze_uploaded_file(zip_path, required_files=None):
-    if required_files is None:
-        required_files = default_model_files
-    executable_extensions = {'.exe', '.bat', '.cmd', '.bash', '.bin', '.sh', '.msi', '.dll', '.com'}
-    try:
-        with zipfile.ZipFile(zip_path, 'r') as zf:
-            files_in_zip = set()
-            executables_found = False
-            for file_info in zf.infolist():
-                file_name = file_info.filename
-                if file_info.is_dir():
-                    continue  # Skip directories
-                base_name = os.path.basename(file_name)
-                files_in_zip.add(base_name)
-                _, ext = os.path.splitext(base_name.lower())
-                if ext in executable_extensions:
-                    executables_found = True
-                    break
-            missing_files = [f for f in required_files if f not in files_in_zip]
-            is_valid = not executables_found and not missing_files
-            return is_valid  # True only when no executables and no required files are missing
-    except zipfile.BadZipFile:
-        raise ValueError("Error: The file is not a valid ZIP archive.")
-    except Exception as e:
-        raise RuntimeError(f'analyze_uploaded_file(): {e}')
-
-async def extract_custom_model(file_src, dest=None, session=None, required_files=None):
-    try:
-        progress_bar = None
-        if is_gui_process:
-            progress_bar = gr.Progress(track_tqdm=True)
-        if dest is None:
-            dest = session['custom_model_dir'] = os.path.join(models_dir, '__sessions', f"model-{session['id']}")
-            os.makedirs(dest, exist_ok=True)
-        if required_files is None:
-            required_files = default_model_files
-
-        dir_src = os.path.dirname(file_src)
-        dir_name = os.path.basename(file_src).replace('.zip', '')
-
-        with zipfile.ZipFile(file_src, 'r') as zip_ref:
-            files = zip_ref.namelist()
-            files_length = len(files)
-            dir_tts = 'fairseq'
-            xtts_config = 'config.json'
-
-            # Check the model type
-            config_data = {}
-            if xtts_config in zip_ref.namelist():
-                with zip_ref.open(xtts_config) as file:
-                    config_data = json.load(file)
-            if config_data.get('model') == 'xtts':
-                dir_tts = 'xtts'
-
-            dir_dest = os.path.join(dest, dir_tts, dir_name)
-            os.makedirs(dir_dest, exist_ok=True)
-
-            # Initialize progress bar
-            with tqdm(total=100, unit='%') as t:  # Track progress as a percentage
-                for i, file in enumerate(files):
-                    if file in required_files:
-                        zip_ref.extract(file, dir_dest)
-                    progress_percentage = ((i + 1) / files_length) * 100
-                    t.n = int(progress_percentage)
-                    t.refresh()
-                    if progress_bar is not None:
-                        progress_bar((i + 1) / files_length)  # report the fraction of archive entries processed
-                        yield dir_name, progress_bar
-
-        os.remove(file_src)
-        print(f'Extracted files to {dir_dest}')
-        yield dir_name, progress_bar
-        return
-    except Exception as e:
-        raise DependencyError(e)
-
-def calculate_hash(filepath, hash_algorithm='sha256'):
-    hash_func = hashlib.new(hash_algorithm)
-    with open(filepath, 'rb') as file:
-        while chunk := file.read(8192):  # Read in chunks to handle large files
-            hash_func.update(chunk)
-    return hash_func.hexdigest()
-
-def compare_files_by_hash(file1, file2, hash_algorithm='sha256'):
-    return calculate_hash(file1, hash_algorithm) == calculate_hash(file2, hash_algorithm)
-
-def has_metadata(f):
-    try:
-        b = epub.read_epub(f)
-        metadata = b.get_metadata('DC', '')
-        if metadata:
-            return True
-        else:
-            return False
-    except Exception as e:
-        return False
-
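Taken together, a hedged sketch of how the two model helpers above would be driven (the archive path and destination are assumptions; `extract_custom_model()` is an async generator, so it is consumed with `async for`):

```python
# Hypothetical driver for the helpers above: validate a model archive,
# then extract the required files from it.
import asyncio

from lib.functions import analyze_uploaded_file, extract_custom_model

async def install_model(zip_path, dest):
    # True only if all required model files are present and the archive
    # contains no executable files
    if not analyze_uploaded_file(zip_path):
        raise ValueError(f'{zip_path} is not a valid model archive')
    # Yields the model directory name; the zip is removed once extracted
    async for dir_name, _progress in extract_custom_model(zip_path, dest):
        print(f'Extracted custom model: {dir_name}')

# Assumed local paths:
# asyncio.run(install_model('Finished_model_files.zip', 'models/__sessions'))
```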
-def convert_to_epub(session):
-    if session['cancellation_requested']:
-        #stop_and_detach_tts()
-        print('Cancel requested')
-        return False
-    if session['script_mode'] == DOCKER_UTILS:
-        try:
-            docker_dir = os.path.basename(session['tmp_dir'])
-            docker_file_in = os.path.basename(session['src'])
-            docker_file_out = os.path.basename(session['epub_path'])
-
-            # Check if the input file is already an EPUB
-            if docker_file_in.lower().endswith('.epub'):
-                shutil.copy(session['src'], session['epub_path'])
-                return True
-
-            # Convert the ebook to EPUB format using utils Docker image
-            container = session['client'].containers.run(
-                docker_utils_image,
-                command=f'ebook-convert /files/{docker_dir}/{docker_file_in} /files/{docker_dir}/{docker_file_out}',
-                volumes={session['tmp_dir']: {'bind': f'/files/{docker_dir}', 'mode': 'rw'}},
-                remove=True,
-                detach=False,
-                stdout=True,
-                stderr=True
-            )
-            print(container.decode('utf-8'))
-            return True
-        except docker.errors.ContainerError as e:
-            raise DependencyError(e)
-        except docker.errors.ImageNotFound as e:
-            raise DependencyError(e)
-        except docker.errors.APIError as e:
-            raise DependencyError(e)
-    else:
-        try:
-            util_app = shutil.which('ebook-convert')
-            subprocess.run([util_app, session['src'], session['epub_path']], check=True)
-            return True
-        except subprocess.CalledProcessError as e:
-            raise DependencyError(e)
-
-def get_cover(session):
-    try:
-        if session['cancellation_requested']:
-            #stop_and_detach_tts()
-            print('Cancel requested')
-            return False
-        cover_image = False
-        cover_path = os.path.join(session['tmp_dir'], session['filename_noext'] + '.jpg')
-        for item in session['epub'].get_items_of_type(ebooklib.ITEM_COVER):
-            cover_image = item.get_content()
-            break
-        if not cover_image:
-            for item in session['epub'].get_items_of_type(ebooklib.ITEM_IMAGE):
-                if 'cover' in item.file_name.lower() or 'cover' in item.get_id().lower():
-                    cover_image = item.get_content()
-                    break
-        if cover_image:
-            with open(cover_path, 'wb') as cover_file:
-                cover_file.write(cover_image)
-            return cover_path
-        return True
-    except Exception as e:
-        raise DependencyError(e)
-
-def get_chapters(language, session):
-    try:
-        if session['cancellation_requested']:
-            #stop_and_detach_tts()
-            print('Cancel requested')
-            return False
-        all_docs = list(session['epub'].get_items_of_type(ebooklib.ITEM_DOCUMENT))
-        if all_docs:
-            all_docs = all_docs[1:]
-            doc_patterns = [filter_pattern(str(doc)) for doc in all_docs if filter_pattern(str(doc))]
-            most_common_pattern = filter_doc(doc_patterns)
-            selected_docs = [doc for doc in all_docs if filter_pattern(str(doc)) == most_common_pattern]
-            chapters = [filter_chapter(doc, language) for doc in selected_docs]
-            if session['metadata'].get('creator'):
-                intro = f"{session['metadata']['creator']}, {session['metadata']['title']};\n "
-                chapters[0].insert(0, intro)
-            return chapters
-        return False
-    except Exception as e:
-        raise DependencyError(f'Error extracting main content pages: {e}')
-
-def filter_doc(doc_patterns):
-    pattern_counter = Counter(doc_patterns)
-    # Returns a list with one tuple: [(pattern, count)]
-    most_common = pattern_counter.most_common(1)
-    return most_common[0][0] if most_common else None
-
-def filter_pattern(doc_identifier):
-    parts = doc_identifier.split(':')
-    if len(parts) > 2:
-        segment = parts[1]
-        if re.search(r'[a-zA-Z]', segment) and re.search(r'\d', segment):
-            return ''.join([char for char in segment if char.isalpha()])
-        elif re.match(r'^[a-zA-Z]+$', segment):
-            return segment
-        elif re.match(r'^\d+$', segment):
-            return 'numbers'
-    return None
-
-def filter_chapter(doc, language):
-    soup = 
BeautifulSoup(doc.get_body_content(), 'html.parser') - # Remove scripts and styles - for script in soup(["script", "style"]): - script.decompose() - # Normalize lines and remove unnecessary spaces - text = re.sub(r'(\r\n|\r|\n){3,}', '\r\n', soup.get_text().strip()) - text = replace_roman_numbers(text) - lines = (line.strip() for line in text.splitlines()) - chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) - text = '\n'.join(chunk for chunk in chunks if chunk) - text = text.replace('»', '"').replace('«', '"') - # Pattern 1: Add a space between UTF-8 characters and numbers - text = re.sub(r'(?<=[\p{L}])(?=\d)|(?<=\d)(?=[\p{L}])', ' ', text) - # Pattern 2: Split numbers into groups of 4 - text = re.sub(r'(\d{4})(?=\d)', r'\1 ', text) - chapter_sentences = get_sentences(text, language) - return chapter_sentences - -def get_sentences(sentence, language, max_pauses=9): - max_length = language_mapping[language]['char_limit'] - punctuation = language_mapping[language]['punctuation'] - sentence = sentence.replace(".", ";\n") - parts = [] - while len(sentence) > max_length or sum(sentence.count(p) for p in punctuation) > max_pauses: - # Step 1: Look for the last period (.) within max_length - possible_splits = [i for i, char in enumerate(sentence[:max_length]) if char == '.'] - # Step 2: If no periods, look for the last comma (,) - if not possible_splits: - possible_splits = [i for i, char in enumerate(sentence[:max_length]) if char == ','] - # Step 3: If still no splits, look for any other punctuation - if not possible_splits: - possible_splits = [i for i, char in enumerate(sentence[:max_length]) if char in punctuation] - # Step 4: Determine where to split the sentence - if possible_splits: - split_at = possible_splits[-1] + 1 # Split at the last occurrence of punctuation - else: - # If no punctuation is found, split at the last space - last_space = sentence.rfind(' ', 0, max_length) - if last_space != -1: - split_at = last_space + 1 - else: - # If no space is found, force split at max_length - split_at = max_length - # Add the split sentence to parts - parts.append(sentence[:split_at].strip() + ' ') - sentence = sentence[split_at:].strip() - # Add the remaining sentence if any - if sentence: - parts.append(sentence.strip() + ' ') - return parts - -def convert_chapters_to_audio(session): - try: - if session['cancellation_requested']: - #stop_and_detach_tts() - print('Cancel requested') - return False - progress_bar = None - params = {} - if is_gui_process: - progress_bar = gr.Progress(track_tqdm=True) - params['tts_model'] = None - ''' - # List available TTS base models - print("Available Models:") - print("=================") - for index, model in enumerate(XTTS().list_models(), 1): - print(f"{index}. 
{model}") - ''' - if session['metadata']['language'] in language_xtts: - params['tts_model'] = 'xtts' - if session['custom_model'] is not None: - print(f"Loading TTS {params['tts_model']} model from {session['custom_model']}...") - model_path = os.path.join(session['custom_model'], 'model.pth') - config_path = os.path.join(session['custom_model'],'config.json') - vocab_path = os.path.join(session['custom_model'],'vocab.json') - voice_path = os.path.join(session['custom_model'],'ref.wav') - config = XttsConfig() - config.models_dir = os.path.join(models_dir,'tts') - config.load_json(config_path) - params['tts'] = Xtts.init_from_config(config) - params['tts'].load_checkpoint(config, checkpoint_path=model_path, vocab_path=vocab_path, eval=True) - print('Computing speaker latents...') - params['voice_file'] = session['voice_file'] if session['voice_file'] is not None else voice_path - params['gpt_cond_latent'], params['speaker_embedding'] = params['tts'].get_conditioning_latents(audio_path=[params['voice_file']]) - elif session['fine_tuned'] != 'std': - print(f"Loading TTS {params['tts_model']} model from {session['fine_tuned']}...") - hf_repo = models[params['tts_model']][session['fine_tuned']]['repo'] - hf_sub = models[params['tts_model']][session['fine_tuned']]['sub'] - cache_dir = os.path.join(models_dir,'tts') - model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}/model.pth", cache_dir=cache_dir) - config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}/config.json", cache_dir=cache_dir) - vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}/vocab.json", cache_dir=cache_dir) - config = XttsConfig() - config.models_dir = cache_dir - config.load_json(config_path) - params['tts'] = Xtts.init_from_config(config) - params['tts'].load_checkpoint(config, checkpoint_path=model_path, vocab_path=vocab_path, eval=True) - print('Computing speaker latents...') - params['voice_file'] = session['voice_file'] if session['voice_file'] is not None else models[params['tts_model']][session['fine_tuned']]['voice'] - params['gpt_cond_latent'], params['speaker_embedding'] = params['tts'].get_conditioning_latents(audio_path=[params['voice_file']]) - else: - print(f"Loading TTS {params['tts_model']} model from {models[params['tts_model']][session['fine_tuned']]['repo']}...") - params['tts'] = XTTS(model_name=models[params['tts_model']][session['fine_tuned']]['repo']) - params['voice_file'] = session['voice_file'] if session['voice_file'] is not None else models[params['tts_model']][session['fine_tuned']]['voice'] - params['tts'].to(session['device']) - else: - params['tts_model'] = 'fairseq' - model_repo = models[params['tts_model']][session['fine_tuned']]['repo'].replace("[lang]", session['metadata']['language']) - print(f"Loading TTS {model_repo} model from {model_repo}...") - params['tts'] = XTTS(model_repo) - params['voice_file'] = session['voice_file'] if session['voice_file'] is not None else models[params['tts_model']][session['fine_tuned']]['voice'] - params['tts'].to(session['device']) - - resume_chapter = 0 - resume_sentence = 0 - - # Check existing files to resume the process if it was interrupted - existing_chapters = sorted([f for f in os.listdir(session['chapters_dir']) if f.endswith(f'.{audioproc_format}')]) - existing_sentences = sorted([f for f in os.listdir(session['chapters_dir_sentences']) if f.endswith(f'.{audioproc_format}')]) - - if existing_chapters: - count_chapter_files = len(existing_chapters) - resume_chapter = count_chapter_files - 1 if 
count_chapter_files > 0 else 0 - print(f'Resuming from chapter {count_chapter_files}') - if existing_sentences: - resume_sentence = len(existing_sentences) - print(f'Resuming from sentence {resume_sentence}') - - total_chapters = len(session['chapters']) - total_sentences = sum(len(array) for array in session['chapters']) - current_sentence = 0 - - with tqdm(total=total_sentences, desc='convert_chapters_to_audio 0.00%', bar_format='{desc}: {n_fmt}/{total_fmt} ', unit='step', initial=resume_sentence) as t: - t.n = resume_sentence - t.refresh() - for x in range(resume_chapter, total_chapters): - chapter_num = x + 1 - chapter_audio_file = f'chapter_{chapter_num}.{audioproc_format}' - sentences = session['chapters'][x] - sentences_count = len(sentences) - start = current_sentence # Mark the starting sentence of the chapter - print(f"\nChapter {chapter_num} containing {sentences_count} sentences...") - for i, sentence in enumerate(sentences): - if current_sentence >= resume_sentence: - params['sentence_audio_file'] = os.path.join(session['chapters_dir_sentences'], f'{current_sentence}.{audioproc_format}') - params['sentence'] = sentence - if convert_sentence_to_audio(params, session): - t.update(1) - percentage = (current_sentence / total_sentences) * 100 - t.set_description(f'Processing {percentage:.2f}%') - print(f'Sentence: {sentence}') - t.refresh() - if progress_bar is not None: - progress_bar(current_sentence / total_sentences) - else: - return False - current_sentence += 1 - end = current_sentence - 1 - print(f"\nEnd of Chapter {chapter_num}") - if start >= resume_sentence: - if combine_audio_sentences(chapter_audio_file, start, end, session): - print(f'Combining chapter {chapter_num} to audio, sentence {start} to {end}') - else: - print('combine_audio_sentences() failed!') - return False - return True - except Exception as e: - raise DependencyError(e) - -def convert_sentence_to_audio(params, session): - try: - if session['cancellation_requested']: - #stop_and_detach_tts(params['tts']) - print('Cancel requested') - return False - generation_params = { - "temperature": session['temperature'], - "length_penalty": session["length_penalty"], - "repetition_penalty": session['repetition_penalty'], - "num_beams": int(session['length_penalty']) + 1 if session["length_penalty"] > 1 else 1, - "top_k": session['top_k'], - "top_p": session['top_p'], - "speed": session['speed'], - "enable_text_splitting": session['enable_text_splitting'] - } - if params['tts_model'] == 'xtts': - if session['custom_model'] is not None or session['fine_tuned'] != 'std': - output = params['tts'].inference( - text=params['sentence'], - language=session['metadata']['language_iso1'], - gpt_cond_latent=params['gpt_cond_latent'], - speaker_embedding=params['speaker_embedding'], - **generation_params - ) - torchaudio.save( - params['sentence_audio_file'], - torch.tensor(output[audioproc_format]).unsqueeze(0), - sample_rate=24000 - ) - else: - params['tts'].tts_to_file( - text=params['sentence'], - language=session['metadata']['language_iso1'], - file_path=params['sentence_audio_file'], - speaker_wav=params['voice_file'], - **generation_params - ) - elif params['tts_model'] == 'fairseq': - params['tts'].tts_with_vc_to_file( - text=params['sentence'], - file_path=params['sentence_audio_file'], - speaker_wav=params['voice_file'].replace('_24khz','_16khz'), - split_sentences=session['enable_text_splitting'] - ) - if os.path.exists(params['sentence_audio_file']): - return True - print(f"Cannot create 
{params['sentence_audio_file']}") - return False - except Exception as e: - raise DependencyError(e) - -def combine_audio_sentences(chapter_audio_file, start, end, session): - try: - chapter_audio_file = os.path.join(session['chapters_dir'], chapter_audio_file) - combined_audio = AudioSegment.empty() - # Get all audio sentence files sorted by their numeric indices - sentence_files = [f for f in os.listdir(session['chapters_dir_sentences']) if f.endswith(".wav")] - sentences_dir_ordered = sorted(sentence_files, key=lambda x: int(re.search(r'\d+', x).group())) - # Filter the files in the range [start, end] - selected_files = [ - file for file in sentences_dir_ordered - if start <= int(''.join(filter(str.isdigit, os.path.basename(file)))) <= end - ] - for file in selected_files: - if session['cancellation_requested']: - #stop_and_detach_tts(params['tts']) - print('Cancel requested') - return False - if session['cancellation_requested']: - msg = 'Cancel requested' - raise ValueError(msg) - audio_segment = AudioSegment.from_file(os.path.join(session['chapters_dir_sentences'],file), format=audioproc_format) - combined_audio += audio_segment - combined_audio.export(chapter_audio_file, format=audioproc_format) - print(f'Combined audio saved to {chapter_audio_file}') - return True - except Exception as e: - raise DependencyError(e) - - -def combine_audio_chapters(session): - def sort_key(chapter_file): - numbers = re.findall(r'\d+', chapter_file) - return int(numbers[0]) if numbers else 0 - - def assemble_audio(): - try: - combined_audio = AudioSegment.empty() - batch_size = 256 - # Process the chapter files in batches - for i in range(0, len(chapter_files), batch_size): - batch_files = chapter_files[i:i + batch_size] - batch_audio = AudioSegment.empty() # Initialize an empty AudioSegment for the batch - # Sequentially append each file in the current batch to the batch_audio - for chapter_file in batch_files: - if session['cancellation_requested']: - print('Cancel requested') - return False - audio_segment = AudioSegment.from_wav(os.path.join(session['chapters_dir'],chapter_file)) - batch_audio += audio_segment - combined_audio += batch_audio - combined_audio.export(assembled_audio, format=audioproc_format) - print(f'Combined audio saved to {assembled_audio}') - return True - except Exception as e: - raise DependencyError(e) - - def generate_ffmpeg_metadata(): - try: - if session['cancellation_requested']: - print('Cancel requested') - return False - ffmpeg_metadata = ';FFMETADATA1\n' - if session['metadata'].get('title'): - ffmpeg_metadata += f"title={session['metadata']['title']}\n" - if session['metadata'].get('creator'): - ffmpeg_metadata += f"artist={session['metadata']['creator']}\n" - if session['metadata'].get('language'): - ffmpeg_metadata += f"language={session['metadata']['language']}\n\n" - if session['metadata'].get('publisher'): - ffmpeg_metadata += f"publisher={session['metadata']['publisher']}\n" - if session['metadata'].get('description'): - ffmpeg_metadata += f"description={session['metadata']['description']}\n" - if session['metadata'].get('published'): - # Check if the timestamp contains fractional seconds - if '.' 
in session['metadata']['published']: - # Parse with fractional seconds - year = datetime.strptime(session['metadata']['published'], '%Y-%m-%dT%H:%M:%S.%f%z').year - else: - # Parse without fractional seconds - year = datetime.strptime(session['metadata']['published'], '%Y-%m-%dT%H:%M:%S%z').year - else: - # If published is not provided, use the current year - year = datetime.now().year - ffmpeg_metadata += f'year={year}\n' - if session['metadata'].get('identifiers') and isinstance(session['metadata'].get('identifiers'), dict): - isbn = session['metadata']['identifiers'].get('isbn', None) - if isbn: - ffmpeg_metadata += f'isbn={isbn}\n' # ISBN - mobi_asin = session['metadata']['identifiers'].get('mobi-asin', None) - if mobi_asin: - ffmpeg_metadata += f'asin={mobi_asin}\n' # ASIN - start_time = 0 - for index, chapter_file in enumerate(chapter_files): - if session['cancellation_requested']: - msg = 'Cancel requested' - raise ValueError(msg) - - duration_ms = len(AudioSegment.from_wav(os.path.join(session['chapters_dir'],chapter_file))) - ffmpeg_metadata += f'[CHAPTER]\nTIMEBASE=1/1000\nSTART={start_time}\n' - ffmpeg_metadata += f'END={start_time + duration_ms}\ntitle=Chapter {index + 1}\n' - start_time += duration_ms - # Write the metadata to the file - with open(metadata_file, 'w', encoding='utf-8') as file: - file.write(ffmpeg_metadata) - return True - except Exception as e: - raise DependencyError(e) - - def export_audio(): - try: - if session['cancellation_requested']: - print('Cancel requested') - return False - ffmpeg_cover = None - if session['script_mode'] == DOCKER_UTILS: - docker_dir = os.path.basename(session['tmp_dir']) - ffmpeg_combined_audio = f'/files/{docker_dir}/' + os.path.basename(assembled_audio) - ffmpeg_metadata_file = f'/files/{docker_dir}/' + os.path.basename(metadata_file) - ffmpeg_final_file = f'/files/{docker_dir}/' + os.path.basename(docker_final_file) - if session['cover'] is not None: - ffmpeg_cover = f'/files/{docker_dir}/' + os.path.basename(session['cover']) - ffmpeg_cmd = ['ffmpeg', '-i', ffmpeg_combined_audio, '-i', ffmpeg_metadata_file] - else: - ffmpeg_combined_audio = assembled_audio - ffmpeg_metadata_file = metadata_file - ffmpeg_final_file = final_file - if session['cover'] is not None: - ffmpeg_cover = session['cover'] - ffmpeg_cmd = [shutil.which('ffmpeg'), '-i', ffmpeg_combined_audio, '-i', ffmpeg_metadata_file] - if ffmpeg_cover is not None: - ffmpeg_cmd += ['-i', ffmpeg_cover, '-map', '0:a', '-map', '2:v'] - else: - ffmpeg_cmd += ['-map', '0:a'] - ffmpeg_cmd += ['-map_metadata', '1', '-c:a', 'aac', '-b:a', '128k', '-ar', '44100'] - if ffmpeg_cover is not None: - if ffmpeg_cover.endswith('.png'): - ffmpeg_cmd += ['-c:v', 'png', '-disposition:v', 'attached_pic'] # PNG cover - else: - ffmpeg_cmd += ['-c:v', 'copy', '-disposition:v', 'attached_pic'] # JPEG cover (no re-encoding needed) - if ffmpeg_cover is not None and ffmpeg_cover.endswith('.png'): - ffmpeg_cmd += ['-pix_fmt', 'yuv420p'] - ffmpeg_cmd += [ - '-af', - 'agate=threshold=-35dB:ratio=1.5:attack=10:release=200,acompressor=threshold=-20dB:ratio=2:attack=80:release=200:makeup=1dB,loudnorm=I=-19:TP=-3:LRA=7:linear=true,afftdn=nf=-50,equalizer=f=150:t=q:w=2:g=2,equalizer=f=250:t=q:w=2:g=-2,equalizer=f=12000:t=q:w=2:g=2', - '-movflags', '+faststart', '-y', ffmpeg_final_file - ] - if session['script_mode'] == DOCKER_UTILS: - try: - container = session['client'].containers.run( - docker_utils_image, - command=ffmpeg_cmd, - volumes={session['tmp_dir']: {'bind': f'/files/{docker_dir}', 'mode': 
'rw'}}, - remove=True, - detach=False, - stdout=True, - stderr=True - ) - print(container.decode('utf-8')) - if shutil.copy(docker_final_file, final_file): - return True - return False - except docker.errors.ContainerError as e: - raise DependencyError(e) - except docker.errors.ImageNotFound as e: - raise DependencyError(e) - except docker.errors.APIError as e: - raise DependencyError(e) - else: - try: - subprocess.run(ffmpeg_cmd, env={}, check=True) - return True - except subprocess.CalledProcessError as e: - raise DependencyError(e) - - except Exception as e: - raise DependencyError(e) - - try: - chapter_files = [f for f in os.listdir(session['chapters_dir']) if f.endswith(".wav")] - chapter_files = sorted(chapter_files, key=lambda x: int(re.search(r'\d+', x).group())) - assembled_audio = os.path.join(session['tmp_dir'], session['metadata']['title'] + '.' + audioproc_format) - metadata_file = os.path.join(session['tmp_dir'], 'metadata.txt') - if assemble_audio(): - if generate_ffmpeg_metadata(): - final_name = session['metadata']['title'] + '.' + audiobook_format - docker_final_file = os.path.join(session['tmp_dir'], final_name) - final_file = os.path.join(session['audiobooks_dir'], final_name) - if export_audio(): - return final_file - return None - except Exception as e: - raise DependencyError(e) - -def replace_roman_numbers(text): - def roman_to_int(s): - try: - roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000, - 'IV': 4, 'IX': 9, 'XL': 40, 'XC': 90, 'CD': 400, 'CM': 900} - i = 0 - num = 0 - # Iterate over the string to calculate the integer value - while i < len(s): - # Check for two-character numerals (subtractive combinations) - if i + 1 < len(s) and s[i:i+2] in roman: - num += roman[s[i:i+2]] - i += 2 - else: - # Add the value of the single character - num += roman[s[i]] - i += 1 - return num - except Exception as e: - return s - - roman_chapter_pattern = re.compile( - r'\b(chapter|volume|chapitre|tome|capitolo|capítulo|volumen|Kapitel|глава|том|κεφάλαιο|τόμος|capitul|poglavlje)\s' - r'(M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|[IVXLCDM]+)\b', - re.IGNORECASE - ) - - roman_numerals_with_period = re.compile( - r'^(M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|[IVXLCDM])\.+' - ) - - def replace_chapter_match(match): - chapter_word = match.group(1) - roman_numeral = match.group(2) - integer_value = roman_to_int(roman_numeral.upper()) - return f'{chapter_word.capitalize()} {integer_value}' - - def replace_numeral_with_period(match): - roman_numeral = match.group(1) - integer_value = roman_to_int(roman_numeral) - return f'{integer_value}.' 
- - text = roman_chapter_pattern.sub(replace_chapter_match, text) - text = roman_numerals_with_period.sub(replace_numeral_with_period, text) - return text -''' -def stop_and_detach_tts(tts=None): - if tts is not None: - if next(tts.parameters()).is_cuda: - tts.to('cpu') - del tts - if torch.cuda.is_available(): - torch.cuda.empty_cache() -''' -def delete_old_web_folders(root_dir): - try: - if not os.path.exists(root_dir): - os.makedirs(root_dir) - print(f'Created missing directory: {root_dir}') - current_time = time.time() - age_limit = current_time - interface_shared_expire * 60 * 60 # 24 hours in seconds - for folder_name in os.listdir(root_dir): - dir_path = os.path.join(root_dir, folder_name) - if os.path.isdir(dir_path) and folder_name.startswith('web-'): - folder_creation_time = os.path.getctime(dir_path) - if folder_creation_time < age_limit: - shutil.rmtree(dir_path) - except Exception as e: - raise DependencyError(e) - -def compare_file_metadata(f1, f2): - if os.path.getsize(f1) != os.path.getsize(f2): - return False - if os.path.getmtime(f1) != os.path.getmtime(f2): - return False - return True - -def convert_ebook(args): - try: - global is_gui_process - global context - error = None - try: - if len(args['language']) == 2: - lang_array = languages.get(alpha2=args['language']) - if lang_array and lang_array.part3: - args['language'] = lang_array.part3 - else: - args['language'] = None - else: - lang_array = languages.get(part3=args['language']) - if not lang_array: - args['language'] = None - except Exception as e: - args['language'] = None - pass - - if args['language'] is not None and args['language'] in language_mapping.keys(): - session_id = args['session'] if args['session'] is not None else str(uuid.uuid4()) - session = context.get_session(session_id) - session['id'] = session_id - session['src'] = args['ebook'] - session['script_mode'] = args['script_mode'] if args['script_mode'] is not None else NATIVE - session['audiobooks_dir'] = args['audiobooks_dir'] - is_gui_process = args['is_gui_process'] - device = args['device'].lower() - voice_file = args['voice'] - language = args['language'] - temperature = args['temperature'] - length_penalty = args['length_penalty'] - repetition_penalty = args['repetition_penalty'] - top_k = args['top_k'] - top_p = args['top_p'] - speed = args['speed'] - enable_text_splitting = args['enable_text_splitting'] if args['enable_text_splitting'] is not None else True - custom_model_file = args['custom_model'] if args['custom_model'] != 'none' and args['custom_model'] is not None else None - fine_tuned = args['fine_tuned'] if check_fine_tuned(args['fine_tuned'], args['language']) else None - - if not fine_tuned: - raise ValueError('The fine tuned model does not exist.') - - if not os.path.splitext(args['ebook'])[1]: - raise ValueError('The selected ebook file has no extension. 
Please select a valid file.') - - if session['script_mode'] == NATIVE: - bool, e = check_programs('Calibre', 'calibre', '--version') - if not bool: - raise DependencyError(e) - bool, e = check_programs('FFmpeg', 'ffmpeg', '-version') - if not bool: - raise DependencyError(e) - elif session['script_mode'] == DOCKER_UTILS: - session['client'] = docker.from_env() - - session['tmp_dir'] = os.path.join(processes_dir, f"ebook-{session['id']}") - session['chapters_dir'] = os.path.join(session['tmp_dir'], f"chapters_{hashlib.md5(args['ebook'].encode()).hexdigest()}") - session['chapters_dir_sentences'] = os.path.join(session['chapters_dir'], 'sentences') - - if not is_gui_process: - print(f'*********** Session: {session_id}', '************* Store it in case of interruption or crash you can resume the conversion') - session['custom_model_dir'] = os.path.join(models_dir,'__sessions',f"model-{session['id']}") - if custom_model_file: - session['custom_model'], progression_status = extract_custom_model(custom_model_file, session['custom_model_dir']) - if not session['custom_model']: - raise ValueError(f'{custom_model_file} could not be extracted or mandatory files are missing') - - if prepare_dirs(args['ebook'], session): - session['filename_noext'] = os.path.splitext(os.path.basename(session['src']))[0] - if not torch.cuda.is_available() or device == 'cpu': - if device == 'gpu': - print('GPU is not available on your device!') - device = 'cpu' - else: - device = 'cuda' - torch.device(device) - print(f'Available Processor Unit: {device}') - session['epub_path'] = os.path.join(session['tmp_dir'], '__' + session['filename_noext'] + '.epub') - has_src_metadata = has_metadata(session['src']) - if convert_to_epub(session): - session['epub'] = epub.read_epub(session['epub_path'], {'ignore_ncx': True}) - metadata = dict(session['metadata']) - for key, value in metadata.items(): - data = session['epub'].get_metadata('DC', key) - if data: - for value, attributes in data: - if key == 'language' and not has_src_metadata: - session['metadata'][key] = language - else: - session['metadata'][key] = value - language_array = languages.get(part3=language) - if language_array and language_array.part1: - session['metadata']['language_iso1'] = language_array.part1 - if session['metadata']['language'] == language or session['metadata']['language_iso1'] and session['metadata']['language'] == session['metadata']['language_iso1']: - session['metadata']['title'] = os.path.splitext(os.path.basename(session['src']))[0] if not session['metadata']['title'] else session['metadata']['title'] - session['metadata']['creator'] = False if not session['metadata']['creator'] else session['metadata']['creator'] - session['cover'] = get_cover(session) - if session['cover']: - session['chapters'] = get_chapters(language, session) - if session['chapters']: - session['device'] = device - session['temperature'] = temperature - session['length_penalty'] = length_penalty - session['repetition_penalty'] = repetition_penalty - session['top_k'] = top_k - session['top_p'] = top_p - session['speed'] = speed - session['enable_text_splitting'] = enable_text_splitting - session['fine_tuned'] = fine_tuned - session['voice_file'] = voice_file - session['language'] = language - if convert_chapters_to_audio(session): - final_file = combine_audio_chapters(session) - if final_file is not None: - chapters_dirs = [ - dir_name for dir_name in os.listdir(session['tmp_dir']) - if fnmatch.fnmatch(dir_name, "chapters_*") and 
os.path.isdir(os.path.join(session['tmp_dir'], dir_name)) - ] - if len(chapters_dirs) > 1: - if os.path.exists(session['chapters_dir']): - shutil.rmtree(session['chapters_dir']) - if os.path.exists(session['epub_path']): - os.remove(session['epub_path']) - if os.path.exists(session['cover']): - os.remove(session['cover']) - else: - if os.path.exists(session['tmp_dir']): - shutil.rmtree(session['tmp_dir']) - progress_status = f'Audiobook {os.path.basename(final_file)} created!' - return progress_status, final_file - else: - error = 'combine_audio_chapters() error: final_file not created!' - else: - error = 'convert_chapters_to_audio() failed!' - else: - error = 'get_chapters() failed!' - else: - error = 'get_cover() failed!' - else: - error = f"WARNING: Ebook language: {session['metadata']['language']}, language selected: {language}" - else: - error = 'convert_to_epub() failed!' - else: - error = f"Temporary directory {session['tmp_dir']} not removed due to failure." - else: - error = f"Language {args['language']} is not supported." - if session['cancellation_requested']: - error = 'Cancelled' - print(error) - return error, None - except Exception as e: - print(f'convert_ebook() Exception: {e}') - return e, None - -def web_interface(args): - script_mode = args['script_mode'] - is_gui_process = args['is_gui_process'] - is_gui_shared = args['share'] - is_converting = False - audiobooks_dir = None - ebook_src = None - audiobook_file = None - language_options = [ - ( - f"{details['name']} - {details['native_name']}" if details['name'] != details['native_name'] else details['name'], - lang - ) - for lang, details in language_mapping.items() - ] - custom_model_options = None - fine_tuned_options = list(models['xtts'].keys()) - default_language_name = next((name for name, key in language_options if key == default_language_code), None) - - theme = gr.themes.Origin( - primary_hue='amber', - secondary_hue='green', - neutral_hue='gray', - radius_size='lg', - font_mono=['JetBrains Mono', 'monospace', 'Consolas', 'Menlo', 'Liberation Mono'] - ) - - with gr.Blocks(theme=theme) as interface: - gr.HTML( - ''' - - ''' - ) - gr.Markdown( - f''' - # Ebook2Audiobook v{version}
- https://github.com/DrewThomasson/ebook2audiobook
- Convert eBooks into immersive audiobooks with realistic voice TTS models.
- Multiuser, multiprocessing, multithread on a geo cluster to share the conversion to the Grid. - ''' - ) - with gr.Tabs(): - gr_tab_main = gr.TabItem('Input Options') - with gr_tab_main: - with gr.Row(): - with gr.Column(scale=3): - with gr.Group(): - gr_ebook_file = gr.File(label='EBook File (.epub, .mobi, .azw3, fb2, lrf, rb, snb, tcr, .pdf, .txt, .rtf, doc, .docx, .html, .odt, .azw)', file_types=['.epub', '.mobi', '.azw3', 'fb2', 'lrf', 'rb', 'snb', 'tcr', '.pdf', '.txt', '.rtf', 'doc', '.docx', '.html', '.odt', '.azw']) - with gr.Group(): - gr_voice_file = gr.File(label='*Cloning Voice (a .wav 24khz for XTTS base model and 16khz for FAIRSEQ base model, no more than 6 sec)', file_types=['.wav'], visible=interface_component_options['gr_voice_file']) - gr.Markdown('
  * Optional
') - with gr.Group(): - gr_device = gr.Radio(label='Processor Unit', choices=['CPU', 'GPU'], value='CPU') - with gr.Group(): - gr_language = gr.Dropdown(label='Language', choices=[name for name, _ in language_options], value=default_language_name) - with gr.Column(scale=3): - gr_group_custom_model = gr.Group(visible=interface_component_options['gr_group_custom_model']) - with gr_group_custom_model: - gr_custom_model_file = gr.File(label='*Custom XTTS Model (a .zip containing config.json, vocab.json, model.pth, ref.wav)', file_types=['.zip']) - gr_custom_model_list = gr.Dropdown(label='', choices=['none'], interactive=True) - gr.Markdown('
  * Optional
') - with gr.Group(): - gr_session_status = gr.Textbox(label='Session') - with gr.Group(): - gr_tts_engine = gr.Dropdown(label='TTS Base', choices=[default_tts_engine], value=default_tts_engine, interactive=True) - gr_fine_tuned = gr.Dropdown(label='Fine Tuned Models', choices=fine_tuned_options, value=default_fine_tuned, interactive=True) - gr_tab_preferences = gr.TabItem('Audio Generation Preferences', visible=interface_component_options['gr_tab_preferences']) - with gr_tab_preferences: - gr.Markdown( - ''' - ### Customize Audio Generation Parameters - Adjust the settings below to influence how the audio is generated. You can control the creativity, speed, repetition, and more. - ''' - ) - gr_temperature = gr.Slider( - label='Temperature', - minimum=0.1, - maximum=10.0, - step=0.1, - value=0.65, - info='Higher values lead to more creative, unpredictable outputs. Lower values make it more monotone.' - ) - gr_length_penalty = gr.Slider( - label='Length Penalty', - minimum=0.5, - maximum=10.0, - step=0.1, - value=1.0, - info='Penalize longer sequences. Higher values produce shorter outputs. Not applied to custom models.' - ) - gr_repetition_penalty = gr.Slider( - label='Repetition Penalty', - minimum=1.0, - maximum=10.0, - step=0.1, - value=2.5, - info='Penalizes repeated phrases. Higher values reduce repetition.' - ) - gr_top_k = gr.Slider( - label='Top-k Sampling', - minimum=10, - maximum=100, - step=1, - value=50, - info='Lower values restrict outputs to more likely words and increase speed at which audio generates.' - ) - gr_top_p = gr.Slider( - label='Top-p Sampling', - minimum=0.1, - maximum=1.0, - step=.01, - value=0.8, - info='Controls cumulative probability for word selection. Lower values make the output more predictable and increase speed at which audio generates.' - ) - gr_speed = gr.Slider( - label='Speed', - minimum=0.5, - maximum=3.0, - step=0.1, - value=1.0, - info='Adjusts how fast the narrator will speak.' - ) - gr_enable_text_splitting = gr.Checkbox( - label='Enable Text Splitting', - value=True, - info='Splits long texts into sentences to generate audio in chunks. Useful for very long inputs.' 
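
The slider and checkbox `info` strings above describe standard XTTS sampling controls, and their defaults mirror the values `convert_ebook()` copies into the session. As an illustrative aside (this sketch is not part of the deleted file), here is roughly how such settings could be forwarded to a Coqui XTTS call; the model name, reference wav, and output path are placeholders, and the kwarg names should be verified against the installed coqui-tts release, since `tts_to_file()` only forwards them through to `Xtts.inference()` in recent versions:

```python
# Illustrative sketch only -- not code from the deleted file. The model name,
# reference wav, and output path are placeholders; the sampling kwarg names
# match Xtts.inference() in recent coqui-tts releases, but verify them
# against your installed version before relying on this.
from TTS.api import TTS

xtts_kwargs = {
    "temperature": 0.65,            # higher = more varied prosody, lower = flatter
    "length_penalty": 1.0,          # >1.0 nudges the decoder toward shorter outputs
    "repetition_penalty": 2.5,      # discourages repeated phrases
    "top_k": 50,                    # sample only from the 50 most likely tokens
    "top_p": 0.8,                   # nucleus-sampling cumulative-probability cutoff
    "speed": 1.0,                   # narration rate
    "enable_text_splitting": True,  # chunk long text into sentences before synthesis
}

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cpu")
tts.tts_to_file(
    text="Chapter One. It was a dark and stormy night.",
    speaker_wav="Jennifer_24khz.wav",  # 24 kHz cloning reference, <= 6 s, per the UI note
    language="en",
    file_path="chapter_1_sentence_1.wav",
    **xtts_kwargs,                     # forwarded through to Xtts.inference()
)
```

Collecting the parameters in a single dict mirrors how the UI hands them to `submit_convert_btn` as one argument set, so the sketch stays in step with the session keys set in `convert_ebook()` above.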
- ) - - gr_state = gr.State(value="") # Initialize state for each user session - gr_session = gr.Textbox(label='Session', visible=False) - gr_conversion_progress = gr.Textbox(label='Progress') - gr_convert_btn = gr.Button('Convert', variant='primary', interactive=False) - gr_audio_player = gr.Audio(label='Listen', type='filepath', show_download_button=False, container=True, visible=False) - gr_audiobooks_ddn = gr.Dropdown(choices=[], label='Audiobooks') - gr_audiobook_link = gr.File(label='Download') - gr_write_data = gr.JSON(visible=False) - gr_read_data = gr.JSON(visible=False) - gr_data = gr.State({}) - gr_modal_html = gr.HTML() - - def show_modal(message): - return f''' - - - ''' - - def hide_modal(): - return '' - - def update_interface(): - nonlocal is_converting - is_converting = False - return gr.update('Convert', variant='primary', interactive=False), gr.update(value=None), gr.update(value=None), gr.update(value=audiobook_file), update_audiobooks_ddn(), hide_modal() - - def refresh_audiobook_list(): - files = [] - if audiobooks_dir is not None: - if os.path.exists(audiobooks_dir): - files = [f for f in os.listdir(audiobooks_dir)] - files.sort(key=lambda x: os.path.getmtime(os.path.join(audiobooks_dir, x)), reverse=True) - return files - - def change_gr_audiobooks_ddn(audiobook): - if audiobooks_dir is not None: - if audiobook: - link = os.path.join(audiobooks_dir, audiobook) - return link, link, gr.update(visible=True) - return None, None, gr.update(visible=False) - - def update_convert_btn(upload_file=None, custom_model_file=None, session_id=None): - if session_id is None: - yield gr.update(variant='primary', interactive=False) - return - else: - session = context.get_session(session_id) - if hasattr(upload_file, 'name') and not hasattr(custom_model_file, 'name'): - yield gr.update(variant='primary', interactive=True) - else: - yield gr.update(variant='primary', interactive=False) - return - - def update_audiobooks_ddn(): - files = refresh_audiobook_list() - return gr.update(choices=files, label='Audiobooks', value=files[0] if files else None) - - async def change_gr_ebook_file(f, session_id): - nonlocal is_converting - if context and session_id: - session = context.get_session(session_id) - if f is None: - if is_converting: - session['cancellation_requested'] = True - yield show_modal('Cancellation requested, please wait...') - return - session['cancellation_requested'] = False - yield hide_modal() - return - - def change_gr_language(selected: str, session_id: str): - nonlocal custom_model_options - if selected == 'zzzz': - new_language_name = default_language_name - new_language_key = default_language_code - else: - new_language_name, new_language_key = next(((name, key) for name, key in language_options if key == selected), (None, None)) - tts_engine_options = ['xtts'] if language_xtts.get(new_language_key, False) else ['fairseq'] - fine_tuned_options = [ - model_name - for model_name, model_details in models.get(tts_engine_options[0], {}).items() - if model_details.get('lang') == 'multi' or model_details.get('lang') == new_language_key - ] - custom_model_options = ['none'] - if context and session_id: - session = context.get_session(session_id) - session['language'] = new_language_key - custom_model_tts = check_custom_model_tts(session) - custom_model_tts_dir = os.path.join(session['custom_model_dir'], custom_model_tts) - if os.path.exists(custom_model_tts_dir): - custom_model_options += os.listdir(custom_model_tts_dir) - return ( - gr.update(value=new_language_name), - 
gr.update(choices=tts_engine_options, value=tts_engine_options[0]), - gr.update(choices=fine_tuned_options, value=fine_tuned_options[0] if fine_tuned_options else 'none'), - gr.update(choices=custom_model_options, value=custom_model_options[0]) - ) - - def check_custom_model_tts(session): - custom_model_tts = 'xtts' - if not language_xtts.get(session['language']): - custom_model_tts = 'fairseq' - custom_model_tts_dir = os.path.join(session['custom_model_dir'], custom_model_tts) - if not os.path.isdir(custom_model_tts_dir): - os.makedirs(custom_model_tts_dir, exist_ok=True) - return custom_model_tts - - def change_gr_custom_model_list(custom_model_list): - if custom_model_list == 'none': - return gr.update(visible=True) - return gr.update(visible=False) - - async def change_gr_custom_model_file(custom_model_file, session_id): - try: - nonlocal custom_model_options, gr_custom_model_file, gr_conversion_progress - if context and session_id: - session = context.get_session(session_id) - if custom_model_file is not None: - if analyze_uploaded_file(custom_model_file): - session['custom_model'], progress_status = extract_custom_model(custom_model_file, None, session) - if session['custom_model']: - custom_model_tts_dir = check_custom_model_tts(session) - custom_model_options = ['none'] + os.listdir(os.path.join(session['custom_model_dir'], custom_model_tts_dir)) - yield ( - gr.update(visible=False), - gr.update(choices=custom_model_options, value=session['custom_model']), - gr.update(value=f"{session['custom_model']} added to the custom list") - ) - gr_custom_model_file = gr.File(label='*XTTS Model (a .zip containing config.json, vocab.json, model.pth, ref.wav)', value=None, file_types=['.zip']) - return - yield gr.update(), gr.update(), gr.update(value='Invalid file! 
Please upload a valid ZIP.') - return - except Exception as e: - yield gr.update(), gr.update(), gr.update(value=f'Error: {str(e)}') - return - - def change_gr_tts_engine(engine): - if engine == 'xtts': - return gr.update(visible=True) - else: - return gr.update(visible=False) - - def change_gr_fine_tuned(fine_tuned): - visible = False - if fine_tuned == 'std': - visible = True - return gr.update(visible=visible) - - def change_gr_data(data): - data['event'] = 'change_data' - return data - - def change_gr_read_data(data): - nonlocal audiobooks_dir - nonlocal custom_model_options - warning_text_extra = '' - if not data: - data = {'session_id': str(uuid.uuid4())} - warning_text = f"Session: {data['session_id']}" - else: - if 'session_id' not in data: - data['session_id'] = str(uuid.uuid4()) - warning_text = data['session_id'] - event = data.get('event', '') - if event != 'load': - return [gr.update(), gr.update(), gr.update(), gr.update(), gr.update()] - session = context.get_session(data['session_id']) - session['custom_model_dir'] = os.path.join(models_dir,'__sessions',f"model-{session['id']}") - os.makedirs(session['custom_model_dir'], exist_ok=True) - custom_model_tts_dir = check_custom_model_tts(session) - custom_model_options = ['none'] + os.listdir(os.path.join(session['custom_model_dir'],custom_model_tts_dir)) - if is_gui_shared: - warning_text_extra = f' Note: access limit time: {interface_shared_expire} hours' - audiobooks_dir = os.path.join(audiobooks_gradio_dir, f"web-{data['session_id']}") - delete_old_web_folders(audiobooks_gradio_dir) - else: - audiobooks_dir = os.path.join(audiobooks_host_dir, f"web-{data['session_id']}") - return [data, f'{warning_text}{warning_text_extra}', data['session_id'], update_audiobooks_ddn(), gr.update(choices=custom_model_options, value='none')] - - def submit_convert_btn( - session, device, ebook_file, voice_file, language, - custom_model_file, temperature, length_penalty, - repetition_penalty, top_k, top_p, speed, enable_text_splitting, fine_tuned - ): - nonlocal is_converting - - args = { - "is_gui_process": is_gui_process, - "session": session, - "script_mode": script_mode, - "device": device.lower(), - "ebook": ebook_file.name if ebook_file else None, - "audiobooks_dir": audiobooks_dir, - "voice": voice_file.name if voice_file else None, - "language": next((key for name, key in language_options if name == language), None), - "custom_model": next((key for name, key in language_options if name != 'none'), None), - "temperature": float(temperature), - "length_penalty": float(length_penalty), - "repetition_penalty": float(repetition_penalty), - "top_k": int(top_k), - "top_p": float(top_p), - "speed": float(speed), - "enable_text_splitting": enable_text_splitting, - "fine_tuned": fine_tuned - } - - if args["ebook"] is None: - yield gr.update(value='Error: a file is required.') - return - - try: - is_converting = True - progress_status, audiobook_file = convert_ebook(args) - if audiobook_file is None: - if is_converting: - yield gr.update(value='Conversion cancelled.') - return - else: - yield gr.update(value='Conversion failed.') - return - else: - yield progress_status - return - except Exception as e: - yield DependencyError(e) - return - - gr_ebook_file.change( - fn=update_convert_btn, - inputs=[gr_ebook_file, gr_custom_model_file, gr_session], - outputs=gr_convert_btn - ).then( - fn=change_gr_ebook_file, - inputs=[gr_ebook_file, gr_session], - outputs=[gr_modal_html] - ) - gr_language.change( - fn=lambda selected, session_id: 
change_gr_language(dict(language_options).get(selected, 'Unknown'), session_id), - inputs=[gr_language, gr_session], - outputs=[gr_language, gr_tts_engine, gr_fine_tuned, gr_custom_model_list] - ) - gr_audiobooks_ddn.change( - fn=change_gr_audiobooks_ddn, - inputs=gr_audiobooks_ddn, - outputs=[gr_audiobook_link, gr_audio_player, gr_audio_player] - ) - gr_custom_model_file.change( - fn=change_gr_custom_model_file, - inputs=[gr_custom_model_file, gr_session], - outputs=[gr_fine_tuned, gr_custom_model_list, gr_conversion_progress] - ) - gr_custom_model_list.change( - fn=change_gr_custom_model_list, - inputs=gr_custom_model_list, - outputs=gr_fine_tuned - ) - gr_tts_engine.change( - fn=change_gr_tts_engine, - inputs=gr_tts_engine, - outputs=gr_tab_preferences - ) - gr_fine_tuned.change( - fn=change_gr_fine_tuned, - inputs=gr_fine_tuned, - outputs=gr_group_custom_model - ) - gr_session.change( - fn=change_gr_data, - inputs=gr_data, - outputs=gr_write_data - ) - gr_write_data.change( - fn=None, - inputs=gr_write_data, - js=''' - (data) => { - localStorage.clear(); - console.log(data); - window.localStorage.setItem('data', JSON.stringify(data)); - } - ''' - ) - gr_read_data.change( - fn=change_gr_read_data, - inputs=gr_read_data, - outputs=[gr_data, gr_session_status, gr_session, gr_audiobooks_ddn, gr_custom_model_list] - ) - gr_convert_btn.click( - fn=update_convert_btn, - inputs=None, - outputs=gr_convert_btn - ).then( - fn=submit_convert_btn, - inputs=[ - gr_session, gr_device, gr_ebook_file, gr_voice_file, gr_language, - gr_custom_model_list, gr_temperature, gr_length_penalty, - gr_repetition_penalty, gr_top_k, gr_top_p, gr_speed, gr_enable_text_splitting, gr_fine_tuned - ], - outputs=gr_conversion_progress - ).then( - fn=update_interface, - inputs=None, - outputs=[gr_convert_btn, gr_ebook_file, gr_voice_file, gr_audio_player, gr_audiobooks_ddn, gr_modal_html] - ) - interface.load( - fn=None, - js=''' - () => { - const dataStr = window.localStorage.getItem('data'); - if (dataStr) { - const obj = JSON.parse(dataStr); - obj.event = 'load'; - console.log(obj); - return obj; - } - return null; - } - ''', - outputs=gr_read_data - ) - - try: - interface.queue(default_concurrency_limit=interface_concurrency_limit).launch(server_name=interface_host, server_port=interface_port, share=is_gui_shared) - except OSError as e: - print(f'Connection error: {e}') - except socket.error as e: - print(f'Socket error: {e}') - except KeyboardInterrupt: - print('Server interrupted by user. 
Shutting down...') - except Exception as e: - print(f'An unexpected error occurred: {e}') diff --git a/lib/lang.py b/lib/lang.py deleted file mode 100644 index 1c9e0e89f5ba7afd33981834fdfd286e3b6939f0..0000000000000000000000000000000000000000 --- a/lib/lang.py +++ /dev/null @@ -1,1162 +0,0 @@ -import os - -language_xtts = {"eng": "en", "spa": "es", "fra": "fr", "deu": "de", "ita": "it", "por": "pt", "pol": "pl", "tur": "tr", "rus": "ru", "nld": "nl", "ces": "cs", "ara": "ar", "zho": "zh-cn", "jpn": "ja", "hun": "hu", "kor": "ko", "hin": "hi", "vie": "vi"} - -default_language_code = 'eng' # ISO-639-3 -default_voice_file = os.path.abspath(os.path.join('.','voices',default_language_code,'adult','female','Jennifer_24khz.wav')) # or 'male','Curt_24khz.wav' - -language_mapping = { - "eng": {"name": "English", "native_name": "English", "char_limit": 192, "model": "en_core_web_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zho": {"name": "Chinese", "native_name": "中文", "char_limit": 72, "model": "zh_core_web_md", "punctuation": ["。", ",", ":", ";"]}, - "spa": {"name": "Spanish", "native_name": "Español", "char_limit": 192, "model": "es_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "fra": {"name": "French", "native_name": "Français", "char_limit": 192, "model": "fr_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "por": {"name": "Portuguese", "native_name": "Português", "char_limit": 192, "model": "pt_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rus": {"name": "Russian", "native_name": "Русский", "char_limit": 192, "model": "ru_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "ind": {"name": "Indonesian", "native_name": "Bahasa Indonesia", "char_limit": 192, "model": "ind.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hin": {"name": "Hindi", "native_name": "हिन्दी", "char_limit": 192, "model": "hin.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "ben": {"name": "Bengali", "native_name": "বাংলা", "char_limit": 192, "model": "ben.tar.gz", "punctuation": ["।", ",", "!", "?", "…"]}, - "yor": {"name": "Yoruba", "native_name": "Èdè Yorùbá", "char_limit": 72, "model": "yor.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ara": {"name": "Arabic", "native_name": "العربية", "char_limit": 192, "model": "ara.tar.gz", "punctuation": ["،", ";\n", "؟", "!", "«", "»", "(", ")", "-", "؛", "..."]}, - "jav": {"name": "Javanese", "native_name": "Basa Jawa", "char_limit": 192, "model": "jav.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "jpn": {"name": "Japanese", "native_name": "日本語", "char_limit": 71, "model": "ja_core_news_md", "punctuation": ["、", "。", "・", "!", "?", "ー", "(", ")", "「", "」", "『", "』", "〜", "【", "】", "……", "―", "/"]}, - "kor": {"name": "Korean", "native_name": "한국어", "char_limit": 192, "model": "ko_core_news_md", "punctuation": [";\n", ",", "!", "?", "…"]}, - "deu": {"name": "German, Standard", 
"native_name": "Deutsch", "char_limit": 72, "model": "de_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ita": {"name": "Italian", "native_name": "Italiano", "char_limit": 192, "model": "it_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "fas": {"name": "Persian", "native_name": "فارسی", "char_limit": 192, "model": "fas.tar.gz", "punctuation": ["،", ";\n", "؟", "!", "«", "»", "(", ")", "-", "؛", "..."]}, - "tam": {"name": "Tamil", "native_name": "தமிழ்", "char_limit": 192, "model": "tam.tar.gz", "punctuation": ["।", ",", "!", "?", "…"]}, - "tel": {"name": "Telugu", "native_name": "తెలుగు", "char_limit": 192, "model": "tel.tar.gz", "punctuation": ["।", ",", "!", "?", "…"]}, - "tur": {"name": "Turkish", "native_name": "Türkçe", "char_limit": 192, "model": "tr_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pol": {"name": "Polish", "native_name": "Polski", "char_limit": 192, "model": "pl_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hun": {"name": "Hungarian", "native_name": "Magyar", "char_limit": 72, "model": "hu_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nld": {"name": "Dutch", "native_name": "Nederlands", "char_limit": 192, "model": "nl_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - - "zzzz": {"name": "------------------ More languages (A to Z) ------------------", "native_name": "------------------ More languages (A to Z) ------------------", "char_limit": 0, "model": "", "punctuation": []}, - - "abi": {"name": "Abidji", "native_name": "Abidji", "char_limit": 192, "model": "abi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ace": {"name": "Aceh", "native_name": "Acèh", "char_limit": 192, "model": "ace.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "aca": {"name": "Achagua", "native_name": "Achagua", "char_limit": 192, "model": "aca.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "acn": {"name": "Achang", "native_name": "Achang", "char_limit": 192, "model": "acn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "acr": {"name": "Achi", "native_name": "Achi", "char_limit": 192, "model": "acr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ach": {"name": "Acholi", "native_name": "Acholi", "char_limit": 192, "model": "ach.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "acu": {"name": "Achuar-Shiwiar", "native_name": "Achuar-Shiwiar", "char_limit": 192, "model": "acu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "guq": {"name": "Aché", "native_name": "Aché", "char_limit": 192, "model": "guq.tar.gz", 
"punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ade": {"name": "Adele", "native_name": "Adele", "char_limit": 192, "model": "ade.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "adj": {"name": "Adioukrou", "native_name": "Adioukrou", "char_limit": 72, "model": "adj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "agd": {"name": "Agarabi", "native_name": "Agarabi", "char_limit": 192, "model": "agd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "agx": {"name": "Aghul", "native_name": "Aghul", "char_limit": 192, "model": "agx.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "agn": {"name": "Agutaynen", "native_name": "Agutaynen", "char_limit": 72, "model": "agn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "aha": {"name": "Ahanta", "native_name": "Ahanta", "char_limit": 192, "model": "aha.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "aka": {"name": "Akan", "native_name": "Akan", "char_limit": 192, "model": "aka.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "knj": {"name": "Akateko", "native_name": "Akateko", "char_limit": 72, "model": "knj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ake": {"name": "Akawaio", "native_name": "Akawaio", "char_limit": 72, "model": "ake.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "aeu": {"name": "Akeu", "native_name": "Akeu", "char_limit": 192, "model": "aeu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ahk": {"name": "Akha", "native_name": "Akha", "char_limit": 192, "model": "ahk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bss": {"name": "Akoose", "native_name": "Akoose", "char_limit": 72, "model": "bss.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "alj": {"name": "Alangan", "native_name": "Alangan", "char_limit": 72, "model": "alj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sqi": {"name": "Albanian", "native_name": "Shqip", "char_limit": 192, "model": "sqi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "alt": {"name": "Altai, Southern", "native_name": "Алтай тили", "char_limit": 41, "model": "alt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "alp": {"name": "Alune", "native_name": "Alune", "char_limit": 192, "model": "alp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "alz": {"name": "Alur", 
"native_name": "Alur", "char_limit": 192, "model": "alz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kab": {"name": "Amazigh", "native_name": "Tamaziɣt", "char_limit": 72, "model": "kab.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "amk": {"name": "Ambai", "native_name": "Ambai", "char_limit": 192, "model": "amk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mmg": {"name": "Ambrym, North", "native_name": "Ambrym", "char_limit": 192, "model": "mmg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "amh": {"name": "Amharic", "native_name": "አማርኛ", "char_limit": 41, "model": "amh.tar.gz", "punctuation": ["።", "፣", "፨", "፧", "…"]}, - "ami": {"name": "Amis", "native_name": "Amis", "char_limit": 192, "model": "ami.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "azg": {"name": "Amuzgo, San Pedro Amuzgos", "native_name": "Amuzgo", "char_limit": 192, "model": "azg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "agg": {"name": "Angor", "native_name": "Angor", "char_limit": 192, "model": "agg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "boj": {"name": "Anjam", "native_name": "Anjam", "char_limit": 192, "model": "boj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cko": {"name": "Anufo", "native_name": "Anufo", "char_limit": 192, "model": "cko.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "any": {"name": "Anyin", "native_name": "Anyin", "char_limit": 192, "model": "any.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "arl": {"name": "Arabela", "native_name": "Arabela", "char_limit": 192, "model": "arl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "atq": {"name": "Aralle-Tabulahan", "native_name": "Aralle-Tabulahan", "char_limit": 192, "model": "atq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "luc": {"name": "Aringa", "native_name": "Aringa", "char_limit": 192, "model": "luc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hyw": {"name": "Armenian, Western", "native_name": "Հայերեն", "char_limit": 192, "model": "hyw.tar.gz", "punctuation": ["։", "՝", "՛", "՞", "…"]}, - "apr": {"name": "Arop-Lokep", "native_name": "Arop-Lokep", "char_limit": 192, "model": "apr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "aia": {"name": "Arosi", "native_name": "Arosi", "char_limit": 192, "model": "aia.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "msy": {"name": "Aruamu", "native_name": "Aruamu", 
"char_limit": 192, "model": "msy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cni": {"name": "Asháninka", "native_name": "Asháninka", "char_limit": 72, "model": "cni.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cjo": {"name": "Ashéninka, Pajonal", "native_name": "Ashéninka", "char_limit": 72, "model": "cjo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cpu": {"name": "Ashéninka, Pichis", "native_name": "Ashéninka", "char_limit": 72, "model": "cpu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cpb": {"name": "Ashéninka, Ucayali-Yurúa", "native_name": "Ashéninka", "char_limit": 72, "model": "cpb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "asm": {"name": "Assamese", "native_name": "অসমীয়া", "char_limit": 192, "model": "asm.tar.gz", "punctuation": ["।", ",", "!", "?", "…"]}, - "asa": {"name": "Asu", "native_name": "Asu", "char_limit": 192, "model": "asa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "teo": {"name": "Ateso", "native_name": "Ateso", "char_limit": 192, "model": "teo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ati": {"name": "Attié", "native_name": "Attié", "char_limit": 192, "model": "ati.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "djk": {"name": "Aukan", "native_name": "Aukan", "char_limit": 192, "model": "djk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ava": {"name": "Avar", "native_name": "Авар", "char_limit": 192, "model": "ava.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "avn": {"name": "Avatime", "native_name": "Avatime", "char_limit": 192, "model": "avn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "avu": {"name": "Avokaya", "native_name": "Avokaya", "char_limit": 192, "model": "avu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "awb": {"name": "Awa", "native_name": "Awa", "char_limit": 192, "model": "awb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kwi": {"name": "Awa-Cuaiquer", "native_name": "Awa-Cuaiquer", "char_limit": 192, "model": "kwi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "awa": {"name": "Awadhi", "native_name": "अवधी", "char_limit": 192, "model": "awa.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "agr": {"name": "Awajún", "native_name": "Awajún", "char_limit": 192, "model": "agr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "agu": {"name": "Awakateko", "native_name": "Awakateko", 
"char_limit": 192, "model": "agu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ayr": {"name": "Aymara, Central", "native_name": "Aymara", "char_limit": 192, "model": "ayr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ayo": {"name": "Ayoreo", "native_name": "Ayoreo", "char_limit": 192, "model": "ayo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "abp": {"name": "Ayta, Abellen", "native_name": "Abellen", "char_limit": 192, "model": "abp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "blx": {"name": "Ayta, Mag-Indi", "native_name": "Mag-Indi", "char_limit": 192, "model": "blx.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sgb": {"name": "Ayta, Mag-antsi", "native_name": "Mag-antsi", "char_limit": 192, "model": "sgb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "azj-script_cyrillic": {"name": "Azerbaijani, North - Cyrillic", "native_name": "Азәрбајҹан", "char_limit": 192, "model": "azj-script_cyrillic.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "azj-script_latin": {"name": "Azerbaijani, North - Latin", "native_name": "Azərbaycan", "char_limit": 72, "model": "azj-script_latin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "azb": {"name": "Azerbaijani, South - Arabic", "native_name": "گؤنئی", "char_limit": 192, "model": "azb.tar.gz", "punctuation": ["،", ";\n", "؟", "!", "«", "»", "(", ")", "-", "؛", "..."]}, - "bba": {"name": "Baatonum", "native_name": "Baatonum", "char_limit": 192, "model": "bba.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bhz": {"name": "Bada", "native_name": "Bada", "char_limit": 192, "model": "bhz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bvc": {"name": "Baelelea", "native_name": "Baelelea", "char_limit": 192, "model": "bvc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bfy": {"name": "Bagheli", "native_name": "बघेली", "char_limit": 192, "model": "bfy.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "bgq": {"name": "Bagri", "native_name": "बागड़ी", "char_limit": 192, "model": "bgq.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "bdq": {"name": "Bahnar", "native_name": "Bahnar", "char_limit": 192, "model": "bdq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bdh": {"name": "Baka", "native_name": "Baka", "char_limit": 192, "model": "bdh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bqi": {"name": "Bakhtiâri", "native_name": "بختیاری", "char_limit": 192, "model": "bqi.tar.gz", "punctuation": ["،", ";\n", "؟", "!", "«", "»", "(", ")", "-", "؛", 
"..."]}, - "bjw": {"name": "Bakwé", "native_name": "Bakwé", "char_limit": 192, "model": "bjw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "blz": {"name": "Balantak", "native_name": "Balantak", "char_limit": 192, "model": "blz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ban": {"name": "Bali", "native_name": "Bali", "char_limit": 192, "model": "ban.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bcc-script_latin": {"name": "Balochi, Southern - Latin", "native_name": "Balochi", "char_limit": 72, "model": "bcc-script_latin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bcc-script_arabic": {"name": "Balochi, Southern - Arabic", "native_name": "بلوچی", "char_limit": 192, "model": "bcc-script_arabic.tar.gz", "punctuation": ["،", ";\n", "؟", "!", "«", "»", "(", ")", "-", "؛", "..."]}, - "bam": {"name": "Bamanankan", "native_name": "Bamanankan", "char_limit": 72, "model": "bam.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ptu": {"name": "Bambam", "native_name": "Bambam", "char_limit": 192, "model": "ptu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bcw": {"name": "Bana", "native_name": "Bana", "char_limit": 192, "model": "bcw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bqj": {"name": "Bandial", "native_name": "Bandial", "char_limit": 192, "model": "bqj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bno": {"name": "Bantoanon", "native_name": "Bantoanon", "char_limit": 192, "model": "bno.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bbb": {"name": "Barai", "native_name": "Barai", "char_limit": 192, "model": "bbb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bfa": {"name": "Bari", "native_name": "Bari", "char_limit": 192, "model": "bfa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bjz": {"name": "Baruga", "native_name": "Baruga", "char_limit": 192, "model": "bjz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bak": {"name": "Bashkort", "native_name": "Башҡорт", "char_limit": 192, "model": "bak.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "eus": {"name": "Basque", "native_name": "Euskara", "char_limit": 192, "model": "eus.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bsq": {"name": "Bassa", "native_name": "Bassa", "char_limit": 192, "model": "bsq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "akb": {"name": "Batak Angkola", "native_name": "Batak Angkola", "char_limit": 192, 
"model": "akb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "btd": {"name": "Batak Dairi", "native_name": "Batak Dairi", "char_limit": 192, "model": "btd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "btx": {"name": "Batak Karo", "native_name": "Batak Karo", "char_limit": 192, "model": "btx.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bts": {"name": "Batak Simalungun", "native_name": "Batak Simalungun", "char_limit": 192, "model": "bts.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bbc": {"name": "Batak Toba", "native_name": "Batak Toba", "char_limit": 192, "model": "bbc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bvz": {"name": "Bauzi", "native_name": "Bauzi", "char_limit": 192, "model": "bvz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bjv": {"name": "Bedjond", "native_name": "Bedjond", "char_limit": 192, "model": "bjv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bep": {"name": "Behoa", "native_name": "Behoa", "char_limit": 192, "model": "bep.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bkv": {"name": "Bekwarra", "native_name": "Bekwarra", "char_limit": 192, "model": "bkv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bzj": {"name": "Belize English Creole", "native_name": "Kriol", "char_limit": 192, "model": "bzj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bem": {"name": "Bemba", "native_name": "Ichibemba", "char_limit": 72, "model": "bem.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bng": {"name": "Benga", "native_name": "Benga", "char_limit": 192, "model": "bng.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bom": {"name": "Berom", "native_name": "Berom", "char_limit": 192, "model": "bom.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "btt": {"name": "Bete-Bendi", "native_name": "Bete-Bendi", "char_limit": 192, "model": "btt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bha": {"name": "Bharia", "native_name": "Bharia", "char_limit": 192, "model": "bha.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bgw": {"name": "Bhatri", "native_name": "Bhatri", "char_limit": 192, "model": "bgw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bht": {"name": "Bhattiyali", "native_name": "Bhattiyali", "char_limit": 192, "model": "bht.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", 
"\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "beh": {"name": "Biali", "native_name": "Biali", "char_limit": 192, "model": "beh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sne": {"name": "Bidayuh, Bau", "native_name": "Bidayuh Bau", "char_limit": 192, "model": "sne.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ubl": {"name": "Bikol, Buhi’non", "native_name": "Bikol Buhi’non", "char_limit": 192, "model": "ubl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bcl": {"name": "Bikol, Central", "native_name": "Bikol Central", "char_limit": 192, "model": "bcl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bim": {"name": "Bimoba", "native_name": "Bimoba", "char_limit": 192, "model": "bim.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bkd": {"name": "Binukid", "native_name": "Binukid", "char_limit": 192, "model": "bkd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bjr": {"name": "Binumarien", "native_name": "Binumarien", "char_limit": 192, "model": "bjr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bfo": {"name": "Birifor, Malba", "native_name": "Birifor Malba", "char_limit": 192, "model": "bfo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "biv": {"name": "Birifor, Southern", "native_name": "Birifor Southern", "char_limit": 192, "model": "biv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bib": {"name": "Bisa", "native_name": "Bisa", "char_limit": 192, "model": "bib.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bis": {"name": "Bislama", "native_name": "Bislama", "char_limit": 192, "model": "bis.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bzi": {"name": "Bisu", "native_name": "Bisu", "char_limit": 192, "model": "bzi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bqp": {"name": "Bisã", "native_name": "Bisã", "char_limit": 192, "model": "bqp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bpr": {"name": "Blaan, Koronadal", "native_name": "Blaan Koronadal", "char_limit": 192, "model": "bpr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bps": {"name": "Blaan, Sarangani", "native_name": "Blaan Sarangani", "char_limit": 192, "model": "bps.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bwq": {"name": "Bobo Madaré, Southern", "native_name": "Bobo Madaré Southern", "char_limit": 192, "model": "bwq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", 
"\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bdv": {"name": "Bodo Parja", "native_name": "Bodo Parja", "char_limit": 192, "model": "bdv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bqc": {"name": "Boko", "native_name": "Boko", "char_limit": 192, "model": "bqc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bus": {"name": "Bokobaru", "native_name": "Bokobaru", "char_limit": 192, "model": "bus.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bnp": {"name": "Bola", "native_name": "Bola", "char_limit": 192, "model": "bnp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bmq": {"name": "Bomu", "native_name": "Bomu", "char_limit": 192, "model": "bmq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bdg": {"name": "Bonggi", "native_name": "Bonggi", "char_limit": 192, "model": "bdg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "boa": {"name": "Bora", "native_name": "Bora", "char_limit": 192, "model": "boa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ksr": {"name": "Borong", "native_name": "Borong", "char_limit": 192, "model": "ksr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bor": {"name": "Borôro", "native_name": "Borôro", "char_limit": 192, "model": "bor.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bru": {"name": "Bru, Eastern", "native_name": "Bru", "char_limit": 192, "model": "bru.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "box": {"name": "Buamu", "native_name": "Buamu", "char_limit": 192, "model": "box.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bzh": {"name": "Buang, Mapos", "native_name": "Buang", "char_limit": 192, "model": "bzh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bgt": {"name": "Bughotu", "native_name": "Bughotu", "char_limit": 192, "model": "bgt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sab": {"name": "Buglere", "native_name": "Buglere", "char_limit": 192, "model": "sab.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bul": {"name": "Bulgarian", "native_name": "Български", "char_limit": 192, "model": "bul.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "bwu": {"name": "Buli", "native_name": "Buli", "char_limit": 192, "model": "bwu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bmv": {"name": "Bum", "native_name": "Bum", "char_limit": 192, "model": 
"bmv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mya": {"name": "Burmese", "native_name": "မြန်မာ", "char_limit": 192, "model": "mya.tar.gz", "punctuation": ["။", "၊", "!", "?", "…"]}, - "tte": {"name": "Bwanabwana", "native_name": "Bwanabwana", "char_limit": 192, "model": "tte.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cjp": {"name": "Cabécar", "native_name": "Cabécar", "char_limit": 192, "model": "cjp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cbv": {"name": "Cacua", "native_name": "Cacua", "char_limit": 192, "model": "cbv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kaq": {"name": "Capanahua", "native_name": "Capanahua", "char_limit": 192, "model": "kaq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cot": {"name": "Caquinte", "native_name": "Caquinte", "char_limit": 192, "model": "cot.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cbc": {"name": "Carapana", "native_name": "Carapana", "char_limit": 192, "model": "cbc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "car": {"name": "Carib", "native_name": "Carib", "char_limit": 192, "model": "car.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cat": {"name": "Catalan", "native_name": "Català", "char_limit": 72, "model": "ca_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ceb": {"name": "Cebuano", "native_name": "Cebuano", "char_limit": 192, "model": "ceb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cme": {"name": "Cerma", "native_name": "Cerma", "char_limit": 192, "model": "cme.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cbi": {"name": "Chachi", "native_name": "Cha’palaa", "char_limit": 72, "model": "cbi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ceg": {"name": "Chamacoco", "native_name": "Chamacoco", "char_limit": 192, "model": "ceg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cly": {"name": "Chatino, Eastern Highland", "native_name": "Chatino", "char_limit": 192, "model": "cly.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cya": {"name": "Chatino, Nopala", "native_name": "Chatino", "char_limit": 192, "model": "cya.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "che": {"name": "Chechen", "native_name": "Нохчийн", "char_limit": 192, "model": "che.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "hne": {"name": "Chhattisgarhi", 
"native_name": "छत्तीसगढ़ी", "char_limit": 192, "model": "hne.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "nya": {"name": "Chichewa", "native_name": "Chichewa", "char_limit": 192, "model": "nya.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dig": {"name": "Chidigo", "native_name": "Chidigo", "char_limit": 192, "model": "dig.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dug": {"name": "Chiduruma", "native_name": "Chiduruma", "char_limit": 192, "model": "dug.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bgr": {"name": "Chin, Bawm", "native_name": "Bawm Chin", "char_limit": 192, "model": "bgr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cek": {"name": "Chin, Eastern Khumi", "native_name": "Khumi Chin", "char_limit": 192, "model": "cek.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cfm": {"name": "Chin, Falam", "native_name": "Falam Chin", "char_limit": 192, "model": "cfm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cnh": {"name": "Chin, Hakha", "native_name": "Hakha Chin", "char_limit": 192, "model": "cnh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hlt": {"name": "Chin, Matu", "native_name": "Matu Chin", "char_limit": 192, "model": "hlt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mwq": {"name": "Chin, Müün", "native_name": "Müün Chin", "char_limit": 192, "model": "mwq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ctd": {"name": "Chin, Tedim", "native_name": "Tedim Chin", "char_limit": 192, "model": "ctd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tcz": {"name": "Chin, Thado", "native_name": "Thado Chin", "char_limit": 192, "model": "tcz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zyp": {"name": "Chin, Zyphe", "native_name": "Zyphe Chin", "char_limit": 192, "model": "zyp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cco": {"name": "Chinantec, Comaltepec", "native_name": "Chinantec", "char_limit": 192, "model": "cco.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cnl": {"name": "Chinantec, Lalana", "native_name": "Chinantec", "char_limit": 192, "model": "cnl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cle": {"name": "Chinantec, Lealao", "native_name": "Chinantec", "char_limit": 192, "model": "cle.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "chz": {"name": "Chinantec, Ozumacín", "native_name": "Chinantec", 
"char_limit": 192, "model": "chz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cpa": {"name": "Chinantec, Palantla", "native_name": "Chinantec", "char_limit": 192, "model": "cpa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cso": {"name": "Chinantec, Sochiapam", "native_name": "Chinantec", "char_limit": 192, "model": "cso.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cnt": {"name": "Chinantec, Tepetotutla", "native_name": "Chinantec", "char_limit": 192, "model": "cnt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cuc": {"name": "Chinantec, Usila", "native_name": "Chinantec", "char_limit": 192, "model": "cuc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hak": {"name": "Chinese, Hakka", "native_name": "客家話", "char_limit": 192, "model": "hak.tar.gz", "punctuation": ["。", ",", "!", "?", "…"]}, - "nan": {"name": "Chinese, Min Nan", "native_name": "閩南語", "char_limit": 192, "model": "nan.tar.gz", "punctuation": ["。", ",", "!", "?", "…"]}, - "xnj": {"name": "Chingoni", "native_name": "Chingoni", "char_limit": 192, "model": "xnj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cap": {"name": "Chipaya", "native_name": "Chipaya", "char_limit": 192, "model": "cap.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cax": {"name": "Chiquitano", "native_name": "Chiquitano", "char_limit": 192, "model": "cax.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ctg": {"name": "Chittagonian", "native_name": "চাটগাঁইয়া", "char_limit": 192, "model": "ctg.tar.gz", "punctuation": ["।", ",", "!", "?", "…"]}, - "ctu": {"name": "Chol", "native_name": "Ch’ol", "char_limit": 72, "model": "ctu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "chf": {"name": "Chontal, Tabasco", "native_name": "Chontal", "char_limit": 72, "model": "chf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cce": {"name": "Chopi", "native_name": "Chopi", "char_limit": 192, "model": "cce.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "crt": {"name": "Chorote, Iyojwa’ja", "native_name": "Iyojwa’ja Chorote", "char_limit": 72, "model": "crt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "crq": {"name": "Chorote, Iyo’wujwa", "native_name": "Iyo’wujwa Chorote", "char_limit": 72, "model": "crq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cac-dialect_sansebastiáncoatán": {"name": "Chuj - San Sebastián Coatán", "native_name": "Chuj", "char_limit": 72, "model": "cac-dialect_sansebastiáncoatán.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", 
"”", "/"]}, - "cac-dialect_sanmateoixtatán": {"name": "Chuj - San Mateo Ixtatán", "native_name": "Chuj", "char_limit": 72, "model": "cac-dialect_sanmateoixtatán.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ckt": {"name": "Chukchi", "native_name": "Чукотский", "char_limit": 192, "model": "ckt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "ncu": {"name": "Chumburung", "native_name": "Chumburung", "char_limit": 192, "model": "ncu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cdj": {"name": "Churahi", "native_name": "Churahi", "char_limit": 192, "model": "cdj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "chv": {"name": "Chuvash", "native_name": "Чӑвашла", "char_limit": 192, "model": "chv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "caa": {"name": "Ch’orti’", "native_name": "Ch’orti’", "char_limit": 72, "model": "caa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "asg": {"name": "Cishingini", "native_name": "Cishingini", "char_limit": 192, "model": "asg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "con": {"name": "Cofán", "native_name": "A’ingae", "char_limit": 72, "model": "con.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "crn": {"name": "Cora, El Nayar", "native_name": "Naayeri", "char_limit": 72, "model": "crn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cok": {"name": "Cora, Santa Teresa", "native_name": "Náayari", "char_limit": 72, "model": "cok.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "crk-script_latin": {"name": "Cree, Plains - Latin", "native_name": "Nēhiyawēwin", "char_limit": 192, "model": "crk-script_latin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "crk-script_syllabics": {"name": "Cree, Plains - Syllabsics", "native_name": "ᓀᐦᐃᔭᐍᐏᐣ", "char_limit": 41, "model": "crk-script_syllabics.tar.gz", "punctuation": [",", ";\n", "?", "!", "(", ")", ":", ";", "—", "-", "“", "”", "..."]}, - "crh": {"name": "Crimean Tatar", "native_name": "Къырымтатарджа", "char_limit": 192, "model": "crh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "hrv": {"name": "Croatian", "native_name": "hrvatski", "char_limit": 192, "model": "hr_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cui": {"name": "Cuiba", "native_name": "Cuiba", "char_limit": 192, "model": "cui.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ces": {"name": "Czech", "native_name": "Čeština", "char_limit": 186, "model": "cs_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", 
"}", "—", "-", "...", "“", "”", "/"]}, - "dan": {"name": "Danish", "native_name": "Dansk", "char_limit": 192, "model": "da_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dsh": {"name": "Daasanach", "native_name": "Daasanach", "char_limit": 192, "model": "dsh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dbq": {"name": "Daba", "native_name": "Daba", "char_limit": 192, "model": "dbq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dga": {"name": "Dagaare, Southern", "native_name": "Dagaare", "char_limit": 192, "model": "dga.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dgi": {"name": "Dagara, Northern", "native_name": "Dagara", "char_limit": 192, "model": "dgi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dgk": {"name": "Dagba", "native_name": "Dagba", "char_limit": 192, "model": "dgk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dnj-dialect_gweetaawueast": {"name": "Dan - Gweetaawueast", "native_name": "Gweetaa Wu East", "char_limit": 192, "model": "dnj-dialect_gweetaawueast.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dnj-dialect_blowowest": {"name": "Dan - Blowowest", "native_name": "Blowo West", "char_limit": 192, "model": "dnj-dialect_blowowest.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "daa": {"name": "Dangaléat", "native_name": "Dangaléat", "char_limit": 192, "model": "daa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dnt": {"name": "Dani, Mid Grand Valley", "native_name": "Mid Grand Valley Dani", "char_limit": 192, "model": "dnt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dnw": {"name": "Dani, Western", "native_name": "Western Dani", "char_limit": 192, "model": "dnw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dar": {"name": "Dargwa", "native_name": "Дарган мез", "char_limit": 192, "model": "dar.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "tcc": {"name": "Datooga", "native_name": "Datooga", "char_limit": 192, "model": "tcc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dwr": {"name": "Dawro", "native_name": "Dawro", "char_limit": 192, "model": "dwr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ded": {"name": "Dedua", "native_name": "Dedua", "char_limit": 192, "model": "ded.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mzw": {"name": "Deg", "native_name": "Deg", "char_limit": 192, "model": "mzw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", 
"\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ntr": {"name": "Delo", "native_name": "Delo", "char_limit": 192, "model": "ntr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ddn": {"name": "Dendi", "native_name": "Dendi", "char_limit": 192, "model": "ddn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "des": {"name": "Desano", "native_name": "Desano", "char_limit": 192, "model": "des.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dso": {"name": "Desiya", "native_name": "Desiya", "char_limit": 192, "model": "dso.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nfa": {"name": "Dhao", "native_name": "Dhao", "char_limit": 192, "model": "nfa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dhi": {"name": "Dhimal", "native_name": "Dhimal", "char_limit": 192, "model": "dhi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gud": {"name": "Dida, Yocoboué", "native_name": "Dida", "char_limit": 192, "model": "gud.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "did": {"name": "Didinga", "native_name": "Didinga", "char_limit": 192, "model": "did.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mhu": {"name": "Digaro-Mishmi", "native_name": "Digaro-Mishmi", "char_limit": 192, "model": "mhu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dip": {"name": "Dinka, Northeastern", "native_name": "Dinka", "char_limit": 192, "model": "dip.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dik": {"name": "Dinka, Southwestern", "native_name": "Dinka", "char_limit": 192, "model": "dik.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tbz": {"name": "Ditammari", "native_name": "Ditammari", "char_limit": 192, "model": "tbz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dts": {"name": "Dogon, Toro So", "native_name": "Dogon", "char_limit": 192, "model": "dts.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dos": {"name": "Dogosé", "native_name": "Dogosé", "char_limit": 192, "model": "dos.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dgo": {"name": "Dogri", "native_name": "डोगरी", "char_limit": 72, "model": "dgo.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "mvp": {"name": "Duri", "native_name": "Duri", "char_limit": 192, "model": "mvp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "jen": {"name": "Dza", "native_name": "Dza", "char_limit": 192, "model": 
"jen.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dzo": {"name": "Dzongkha", "native_name": "རྫོང་ཁ", "char_limit": 41, "model": "dzo.tar.gz", "punctuation": ["།", "༄", "༅", "༆", "…"]}, - "idd": {"name": "Ede Idaca", "native_name": "Ede Idaca", "char_limit": 192, "model": "idd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "eka": {"name": "Ekajuk", "native_name": "Ekajuk", "char_limit": 192, "model": "eka.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cto": {"name": "Embera Catío", "native_name": "Embera Catío", "char_limit": 72, "model": "cto.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "emp": {"name": "Emberá, Northern", "native_name": "Emberá", "char_limit": 72, "model": "emp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "enx": {"name": "Enxet", "native_name": "Enxet", "char_limit": 192, "model": "enx.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sja": {"name": "Epena", "native_name": "Epena", "char_limit": 192, "model": "sja.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "myv": {"name": "Erzya", "native_name": "Эрзянь", "char_limit": 72, "model": "myv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "mcq": {"name": "Ese", "native_name": "Ese", "char_limit": 192, "model": "mcq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ese": {"name": "Ese Ejja", "native_name": "Ese Ejja", "char_limit": 192, "model": "ese.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "evn": {"name": "Evenki", "native_name": "Эвенки", "char_limit": 72, "model": "evn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "eza": {"name": "Ezaa", "native_name": "Ezaa", "char_limit": 192, "model": "eza.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ewe": {"name": "Éwé", "native_name": "Éwé", "char_limit": 72, "model": "ewe.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "fal": {"name": "Fali, South", "native_name": "Fali", "char_limit": 192, "model": "fal.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "fao": {"name": "Faroese", "native_name": "Føroyskt", "char_limit": 72, "model": "fao.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "far": {"name": "Fataleka", "native_name": "Fataleka", "char_limit": 192, "model": "far.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "fij": {"name": "Fijian", "native_name": "Na Vosa Vakaviti", "char_limit": 72, "model": 
"fij.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "fin": {"name": "Finnish", "native_name": "Suomi", "char_limit": 72, "model": "fi_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "fon": {"name": "Fon", "native_name": "Fon", "char_limit": 192, "model": "fon.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "frd": {"name": "Fordata", "native_name": "Fordata", "char_limit": 192, "model": "frd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ful": {"name": "Fulah", "native_name": "Fulfulde", "char_limit": 72, "model": "ful.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "flr": {"name": "Fuliiru", "native_name": "Fuliiru", "char_limit": 192, "model": "flr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gau": {"name": "Gadaba, Mudhili", "native_name": "Gadaba", "char_limit": 192, "model": "gau.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gbk": {"name": "Gaddi", "native_name": "Gaddi", "char_limit": 192, "model": "gbk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gag-script_cyrillic": {"name": "Gagauz - Cyrillic", "native_name": "Гагаузча", "char_limit": 72, "model": "gag-script_cyrillic.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "gag-script_latin": {"name": "Gagauz - Latin", "native_name": "Gagauz", "char_limit": 72, "model": "gag-script_latin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gbi": {"name": "Galela", "native_name": "Galela", "char_limit": 192, "model": "gbi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gmv": {"name": "Gamo", "native_name": "Gamo", "char_limit": 192, "model": "gmv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lug": {"name": "Ganda", "native_name": "Luganda", "char_limit": 72, "model": "lug.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pwg": {"name": "Gapapaiwa", "native_name": "Gapapaiwa", "char_limit": 192, "model": "pwg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gbm": {"name": "Garhwali", "native_name": "गढ़वाळी", "char_limit": 192, "model": "gbm.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "cab": {"name": "Garifuna", "native_name": "Garifuna", "char_limit": 192, "model": "cab.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "grt": {"name": "Garo", "native_name": "Garo", "char_limit": 192, "model": "grt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", 
"...", "“", "”", "/"]}, - "krs": {"name": "Gbaya", "native_name": "Gbaya", "char_limit": 192, "model": "krs.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gso": {"name": "Gbaya, Southwest", "native_name": "Gbaya", "char_limit": 192, "model": "gso.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nlg": {"name": "Gela", "native_name": "Gela", "char_limit": 192, "model": "nlg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gej": {"name": "Gen", "native_name": "Gen", "char_limit": 192, "model": "gej.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gri": {"name": "Ghari", "native_name": "Ghari", "char_limit": 192, "model": "gri.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kik": {"name": "Gikuyu", "native_name": "Gĩkũyũ", "char_limit": 72, "model": "kik.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "acd": {"name": "Gikyode", "native_name": "Gikyode", "char_limit": 192, "model": "acd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "glk": {"name": "Gilaki", "native_name": "گیلکی", "char_limit": 192, "model": "glk.tar.gz", "punctuation": ["،", ";\n", "؟", "!", "«", "»", "(", ")", "-", "؛", "..."]}, - "gof-script_latin": {"name": "Gofa", "native_name": "Gofa", "char_limit": 192, "model": "gof-script_latin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gog": {"name": "Gogo", "native_name": "Gogo", "char_limit": 192, "model": "gog.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gkn": {"name": "Gokana", "native_name": "Gokana", "char_limit": 192, "model": "gkn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "wsg": {"name": "Gondi, Adilabad", "native_name": "Gondi", "char_limit": 192, "model": "wsg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gjn": {"name": "Gonja", "native_name": "Gonja", "char_limit": 192, "model": "gjn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gqr": {"name": "Gor", "native_name": "Gor", "char_limit": 192, "model": "gqr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gor": {"name": "Gorontalo", "native_name": "Gorontalo", "char_limit": 192, "model": "gor.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gux": {"name": "Gourmanchéma", "native_name": "Gourmanchéma", "char_limit": 192, "model": "gux.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gbo": {"name": "Grebo, Northern", "native_name": "Grebo", "char_limit": 192, "model": "gbo.tar.gz", "punctuation": [",", ";\n", 
":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ell": {"name": "Greek", "native_name": "Ελληνικά", "char_limit": 72, "model": "ell.tar.gz", "punctuation": [";\n", ",", "!", ";", "…"]}, - "grc": {"name": "Greek, Ancient", "native_name": "Ἑλληνική", "char_limit": 192, "model": "grc.tar.gz", "punctuation": [";\n", ",", "!", ";", "…"]}, - "guh": {"name": "Guahibo", "native_name": "Guahibo", "char_limit": 192, "model": "guh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gub": {"name": "Guajajára", "native_name": "Guajajára", "char_limit": 72, "model": "gub.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "grn": {"name": "Guarani", "native_name": "Avañe'ẽ", "char_limit": 72, "model": "grn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gyr": {"name": "Guarayu", "native_name": "Guarayu", "char_limit": 192, "model": "gyr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "guo": {"name": "Guayabero", "native_name": "Guayabero", "char_limit": 192, "model": "guo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gde": {"name": "Gude", "native_name": "Gude", "char_limit": 192, "model": "gde.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "guj": {"name": "Gujarati", "native_name": "ગુજરાતી", "char_limit": 192, "model": "guj.tar.gz", "punctuation": [";\n", ",", "!", "?", "…"]}, - "gvl": {"name": "Gulay", "native_name": "Gulay", "char_limit": 192, "model": "gvl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "guk": {"name": "Gumuz", "native_name": "Gumuz", "char_limit": 192, "model": "guk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rub": {"name": "Gungu", "native_name": "Gungu", "char_limit": 192, "model": "rub.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dah": {"name": "Gwahatike", "native_name": "Gwahatike", "char_limit": 192, "model": "dah.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gwr": {"name": "Gwere", "native_name": "Gwere", "char_limit": 192, "model": "gwr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gwi": {"name": "Gwich’in", "native_name": "Gwich’in", "char_limit": 72, "model": "gwi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hat": {"name": "Haitian Creole", "native_name": "Kreyòl Ayisyen", "char_limit": 72, "model": "hat.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hlb": {"name": "Halbi", "native_name": "Halbi", "char_limit": 192, "model": "hlb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "amf": {"name": 
"Hamer-Banna", "native_name": "Hamer-Banna", "char_limit": 192, "model": "amf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hag": {"name": "Hanga", "native_name": "Hanga", "char_limit": 192, "model": "hag.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hnn": {"name": "Hanunoo", "native_name": "Hanunoo", "char_limit": 192, "model": "hnn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bgc": {"name": "Haryanvi", "native_name": "हरियाणवी", "char_limit": 72, "model": "bgc.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "had": {"name": "Hatam", "native_name": "Hatam", "char_limit": 192, "model": "had.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hau": {"name": "Hausa", "native_name": "Hausa", "char_limit": 72, "model": "hau.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hwc": {"name": "Hawaii Pidgin", "native_name": "Hawai‘i Creole English", "char_limit": 72, "model": "hwc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hvn": {"name": "Hawu", "native_name": "Hawu", "char_limit": 192, "model": "hvn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hay": {"name": "Haya", "native_name": "Haya", "char_limit": 192, "model": "hay.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xed": {"name": "Hdi", "native_name": "Hdi", "char_limit": 192, "model": "xed.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "heb": {"name": "Hebrew", "native_name": "עברית", "char_limit": 72, "model": "heb.tar.gz", "punctuation": [".״", ",", "!", "?", "…"]}, - "heh": {"name": "Hehe", "native_name": "Hehe", "char_limit": 192, "model": "heh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hil": {"name": "Hiligaynon", "native_name": "Ilonggo", "char_limit": 72, "model": "hil.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hif": {"name": "Hindi, Fiji", "native_name": "फ़िजी हिंदी", "char_limit": 192, "model": "hif.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "hns": {"name": "Hindustani, Sarnami", "native_name": "सरनामी", "char_limit": 192, "model": "hns.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "hoc": {"name": "Ho", "native_name": "हो", "char_limit": 192, "model": "hoc.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "hoy": {"name": "Holiya", "native_name": "Holiya", "char_limit": 192, "model": "hoy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hus-dialect_westernpotosino": {"name": "Huastec - Western Potosino", "native_name": "Teenek", "char_limit": 72, "model": 
"hus-dialect_westernpotosino.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hus-dialect_centralveracruz": {"name": "Huastec - Central Veracruz", "native_name": "Teenek", "char_limit": 72, "model": "hus-dialect_centralveracruz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "huv": {"name": "Huave, San Mateo del Mar", "native_name": "Ombeayiüts", "char_limit": 72, "model": "huv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hui": {"name": "Huli", "native_name": "Huli", "char_limit": 192, "model": "hui.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hap": {"name": "Hupla", "native_name": "Hupla", "char_limit": 192, "model": "hap.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "iba": {"name": "Iban", "native_name": "Iban", "char_limit": 192, "model": "iba.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "isl": {"name": "Icelandic", "native_name": "Íslenska", "char_limit": 192, "model": "isl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dbj": {"name": "Ida’an", "native_name": "Ida’an", "char_limit": 192, "model": "dbj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ifa": {"name": "Ifugao, Amganad", "native_name": "Ifugao", "char_limit": 192, "model": "ifa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ifb": {"name": "Ifugao, Batad", "native_name": "Ifugao", "char_limit": 192, "model": "ifb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ifu": {"name": "Ifugao, Mayoyao", "native_name": "Ifugao", "char_limit": 192, "model": "ifu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ifk": {"name": "Ifugao, Tuwali", "native_name": "Ifugao", "char_limit": 192, "model": "ifk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ife": {"name": "Ifè", "native_name": "Ifè", "char_limit": 192, "model": "ife.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ign": {"name": "Ignaciano", "native_name": "Ignaciano", "char_limit": 192, "model": "ign.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ikk": {"name": "Ika", "native_name": "Ika", "char_limit": 192, "model": "ikk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "iqw": {"name": "Ikwo", "native_name": "Ikwo", "char_limit": 192, "model": "iqw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ilb": {"name": "Ila", "native_name": "Ila", "char_limit": 192, "model": "ilb.tar.gz", 
"punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ilo": {"name": "Ilocano", "native_name": "Ilocano", "char_limit": 72, "model": "ilo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "imo": {"name": "Imbongu", "native_name": "Imbongu", "char_limit": 192, "model": "imo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "inb": {"name": "Inga", "native_name": "Inga", "char_limit": 192, "model": "inb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ipi": {"name": "Ipili", "native_name": "Ipili", "char_limit": 192, "model": "ipi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "irk": {"name": "Iraqw", "native_name": "Iraqw", "char_limit": 192, "model": "irk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "icr": {"name": "Islander English Creole", "native_name": "Islander Creole", "char_limit": 192, "model": "icr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "itv": {"name": "Itawit", "native_name": "Itawit", "char_limit": 192, "model": "itv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "itl": {"name": "Itelmen", "native_name": "Itelmen", "char_limit": 192, "model": "itl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "atg": {"name": "Ivbie North-Okpela-Arhe", "native_name": "Ivbie North-Okpela-Arhe", "char_limit": 192, "model": "atg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ixl-dialect_sanjuancotzal": {"name": "Ixil - San Juan Cotzal", "native_name": "Ixil", "char_limit": 192, "model": "ixl-dialect_sanjuancotzal.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ixl-dialect_sangasparchajul": {"name": "Ixil - San Gaspar Chajul", "native_name": "Ixil", "char_limit": 192, "model": "ixl-dialect_sangasparchajul.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ixl-dialect_santamarianebaj": {"name": "Ixil - Santa Maria Nebaj", "native_name": "Ixil", "char_limit": 192, "model": "ixl-dialect_santamarianebaj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nca": {"name": "Iyo", "native_name": "Iyo", "char_limit": 192, "model": "nca.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "izr": {"name": "Izere", "native_name": "Izere", "char_limit": 192, "model": "izr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "izz": {"name": "Izii", "native_name": "Izii", "char_limit": 192, "model": "izz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "jac": 
{"name": "Jakalteko", "native_name": "Jakalteko", "char_limit": 192, "model": "jac.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "jam": {"name": "Jamaican English Creole", "native_name": "Patois", "char_limit": 72, "model": "jam.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "jvn": {"name": "Javanese, Suriname", "native_name": "Basa Jawa Suriname", "char_limit": 192, "model": "jvn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kac": {"name": "Jingpho", "native_name": "Jingpho", "char_limit": 192, "model": "kac.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dyo": {"name": "Jola-Fonyi", "native_name": "Joola Foñi", "char_limit": 72, "model": "dyo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "csk": {"name": "Jola-Kasa", "native_name": "Joola Kasa", "char_limit": 72, "model": "csk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "adh": {"name": "Jopadhola", "native_name": "Jopadhola", "char_limit": 72, "model": "adh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "jun": {"name": "Juang", "native_name": "Juang", "char_limit": 192, "model": "jun.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "jbu": {"name": "Jukun Takum", "native_name": "Jukun Takum", "char_limit": 192, "model": "jbu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dyu": {"name": "Jula", "native_name": "Julakan", "char_limit": 72, "model": "dyu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bex": {"name": "Jur Modo", "native_name": "Jur Modo", "char_limit": 192, "model": "bex.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "juy": {"name": "Juray", "native_name": "Juray", "char_limit": 192, "model": "juy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gna": {"name": "Kaansa", "native_name": "Kaansa", "char_limit": 192, "model": "gna.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "urb": {"name": "Kaapor", "native_name": "Kaapor", "char_limit": 192, "model": "urb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kbp": {"name": "Kabiyè", "native_name": "Kabiyè", "char_limit": 72, "model": "kbp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cwa": {"name": "Kabwa", "native_name": "Kabwa", "char_limit": 192, "model": "cwa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dtp": {"name": "Kadazan Dusun", "native_name": "Kadazan Dusun", "char_limit": 72, 
"model": "dtp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kbr": {"name": "Kafa", "native_name": "Kafa", "char_limit": 192, "model": "kbr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cgc": {"name": "Kagayanen", "native_name": "Kagayanen", "char_limit": 192, "model": "cgc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kki": {"name": "Kagulu", "native_name": "Kagulu", "char_limit": 192, "model": "kki.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kzf": {"name": "Kaili, Da’a", "native_name": "Kaili Da’a", "char_limit": 72, "model": "kzf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lew": {"name": "Kaili, Ledo", "native_name": "Kaili Ledo", "char_limit": 72, "model": "lew.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cbr": {"name": "Kakataibo-Kashibo", "native_name": "Kakataibo-Kashibo", "char_limit": 192, "model": "cbr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kkj": {"name": "Kako", "native_name": "Kako", "char_limit": 192, "model": "kkj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "keo": {"name": "Kakwa", "native_name": "Kakwa", "char_limit": 192, "model": "keo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kqe": {"name": "Kalagan", "native_name": "Kalagan", "char_limit": 192, "model": "kqe.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kak": {"name": "Kalanguya", "native_name": "Kalanguya", "char_limit": 192, "model": "kak.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kyb": {"name": "Kalinga, Butbut", "native_name": "Kalinga Butbut", "char_limit": 192, "model": "kyb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "knb": {"name": "Kalinga, Lubuagan", "native_name": "Kalinga Lubuagan", "char_limit": 192, "model": "knb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kmd": {"name": "Kalinga, Majukayang", "native_name": "Kalinga Majukayang", "char_limit": 192, "model": "kmd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kml": {"name": "Kalinga, Tanudan", "native_name": "Kalinga Tanudan", "char_limit": 192, "model": "kml.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ify": {"name": "Kallahan, Keley-i", "native_name": "Kallahan Keley-i", "char_limit": 192, "model": "ify.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xal": {"name": "Kalmyk-Oirat", "native_name": "Хальмг", "char_limit": 
192, "model": "xal.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "kbq": {"name": "Kamano", "native_name": "Kamano", "char_limit": 192, "model": "kbq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kay": {"name": "Kamayurá", "native_name": "Kamayurá", "char_limit": 192, "model": "kay.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ktb": {"name": "Kambaata", "native_name": "Kambaata", "char_limit": 192, "model": "ktb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hig": {"name": "Kamwe", "native_name": "Kamwe", "char_limit": 192, "model": "hig.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gam": {"name": "Kandawo", "native_name": "Kandawo", "char_limit": 192, "model": "gam.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cbu": {"name": "Kandozi-Chapra", "native_name": "Kandozi-Chapra", "char_limit": 192, "model": "cbu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xnr": {"name": "Kangri", "native_name": "Kangri", "char_limit": 192, "model": "xnr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kmu": {"name": "Kanite", "native_name": "Kanite", "char_limit": 192, "model": "kmu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kne": {"name": "Kankanaey", "native_name": "Kankanaey", "char_limit": 192, "model": "kne.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kan": {"name": "Kannada", "native_name": "ಕನ್ನಡ", "char_limit": 192, "model": "kan.tar.gz", "punctuation": ["।", ",", "!", "?", "…"]}, - "kby": {"name": "Kanuri, Manga", "native_name": "Kanuri", "char_limit": 192, "model": "kby.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pam": {"name": "Kapampangan", "native_name": "Kapampangan", "char_limit": 192, "model": "pam.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cak-dialect_santamaríadejesús": {"name": "Kaqchikel - Santa María de Jesús", "native_name": "Kaqchikel", "char_limit": 192, "model": "cak-dialect_santamaríadejesús.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cak-dialect_southcentral": {"name": "Kaqchikel - dialect South Central", "native_name": "Kaqchikel", "char_limit": 192, "model": "cak-dialect_southcentral.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cak-dialect_yepocapa": {"name": "Kaqchikel - dialect Yepocapa", "native_name": "Kaqchikel", "char_limit": 192, "model": "cak-dialect_yepocapa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cak-dialect_western": {"name": 
"Kaqchikel - dialect Western", "native_name": "Kaqchikel", "char_limit": 192, "model": "cak-dialect_western.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cak-dialect_santodomingoxenacoj": {"name": "Kaqchikel - dialect Santo Domingo Xenacoj", "native_name": "Kaqchikel", "char_limit": 192, "model": "cak-dialect_santodomingoxenacoj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cak-dialect_central": {"name": "Kaqchikel - Dialect Central", "native_name": "Kaqchikel", "char_limit": 192, "model": "cak-dialect_central.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xrb": {"name": "Karaboro, Eastern", "native_name": "Karaboro", "char_limit": 192, "model": "xrb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "krc": {"name": "Karachay-Balkar", "native_name": "Къарачай-Малкъар", "char_limit": 192, "model": "krc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "kaa": {"name": "Karakalpak", "native_name": "Qaraqalpaq", "char_limit": 72, "model": "kaa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "krl": {"name": "Karelian", "native_name": "Karjala", "char_limit": 72, "model": "krl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pww": {"name": "Karen, Pwo Northern", "native_name": "Pwo Karen", "char_limit": 192, "model": "pww.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xsm": {"name": "Kasem", "native_name": "Kasem", "char_limit": 192, "model": "xsm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cbs": {"name": "Kashinawa", "native_name": "Kashinawa", "char_limit": 192, "model": "cbs.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pss": {"name": "Kaulong", "native_name": "Kaulong", "char_limit": 192, "model": "pss.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kxf": {"name": "Kawyaw", "native_name": "Kawyaw", "char_limit": 192, "model": "kxf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kyz": {"name": "Kayabí", "native_name": "Kayabí", "char_limit": 192, "model": "kyz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kyu": {"name": "Kayah, Western", "native_name": "Kayah", "char_limit": 192, "model": "kyu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "txu": {"name": "Kayapó", "native_name": "Kayapó", "char_limit": 72, "model": "txu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kaz": {"name": "Kazakh", "native_name": "Қазақ тілі", "char_limit": 192, "model": "kaz.tar.gz", "punctuation": [",", ";\n", 
":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "ndp": {"name": "Kebu", "native_name": "Kebu", "char_limit": 192, "model": "ndp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kbo": {"name": "Keliko", "native_name": "Keliko", "char_limit": 192, "model": "kbo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kyq": {"name": "Kenga", "native_name": "Kenga", "char_limit": 192, "model": "kyq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ken": {"name": "Kenyang", "native_name": "Kenyang", "char_limit": 192, "model": "ken.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ker": {"name": "Kera", "native_name": "Kera", "char_limit": 192, "model": "ker.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xte": {"name": "Ketengban", "native_name": "Ketengban", "char_limit": 192, "model": "xte.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kyg": {"name": "Keyagana", "native_name": "Keyagana", "char_limit": 192, "model": "kyg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kjh": {"name": "Khakas", "native_name": "Хакас тілі", "char_limit": 192, "model": "kjh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "kca": {"name": "Khanty", "native_name": "Ханты", "char_limit": 72, "model": "kca.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "khm": {"name": "Khmer", "native_name": "ភាសាខ្មែរ", "char_limit": 41, "model": "khm.tar.gz", "punctuation": ["។", ",", "!", "?", "…"]}, - "kxm": {"name": "Khmer, Northern", "native_name": "ភាសាខ្មែរ, ភាគខាងជើង", "char_limit": 41, "model": "kxm.tar.gz", "punctuation": ["។", ",", "!", "?", "…"]}, - "kjg": {"name": "Khmu", "native_name": "ຂະມູ", "char_limit": 192, "model": "kjg.tar.gz", "punctuation": [";\n", ",", "!", "?", "…"]}, - "nyf": {"name": "Kigiryama", "native_name": "Kigiryama", "char_limit": 192, "model": "nyf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kij": {"name": "Kilivila", "native_name": "Kilivila", "char_limit": 192, "model": "kij.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kia": {"name": "Kim", "native_name": "Kim", "char_limit": 192, "model": "kia.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kqr": {"name": "Kimaragang", "native_name": "Kimaragang", "char_limit": 192, "model": "kqr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kqp": {"name": "Kimré", "native_name": "Kimré", "char_limit": 192, "model": "kqp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "krj": {"name": "Kinaray-a", "native_name": "Kinaray-a", 
"char_limit": 192, "model": "krj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zga": {"name": "Kinga", "native_name": "Kinga", "char_limit": 192, "model": "zga.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kin": {"name": "Kinyarwanda", "native_name": "Ikinyarwanda", "char_limit": 72, "model": "kin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pkb": {"name": "Kipfokomo", "native_name": "Kipfokomo", "char_limit": 192, "model": "pkb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "geb": {"name": "Kire", "native_name": "Kire", "char_limit": 192, "model": "geb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gil": {"name": "Kiribati", "native_name": "Taetae ni Kiribati", "char_limit": 72, "model": "gil.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kje": {"name": "Kisar", "native_name": "Kisar", "char_limit": 192, "model": "kje.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kss": {"name": "Kisi, Southern", "native_name": "Kisi", "char_limit": 192, "model": "kss.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "thk": {"name": "Kitharaka", "native_name": "Kitharaka", "char_limit": 192, "model": "thk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "klu": {"name": "Klao", "native_name": "Klao", "char_limit": 192, "model": "klu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kyo": {"name": "Klon", "native_name": "Klon", "char_limit": 192, "model": "kyo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kog": {"name": "Kogi", "native_name": "Kogi", "char_limit": 192, "model": "kog.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kfb": {"name": "Kolami, Northwestern", "native_name": "Kolami", "char_limit": 192, "model": "kfb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kpv": {"name": "Komi-Zyrian", "native_name": "Коми", "char_limit": 72, "model": "kpv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "bbo": {"name": "Konabéré", "native_name": "Konabéré", "char_limit": 192, "model": "bbo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xon": {"name": "Konkomba", "native_name": "Konkomba", "char_limit": 192, "model": "xon.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kma": {"name": "Konni", "native_name": "Konni", "char_limit": 192, "model": "kma.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", 
"]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kno": {"name": "Kono", "native_name": "Kono", "char_limit": 192, "model": "kno.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kxc": {"name": "Konso", "native_name": "Konso", "char_limit": 192, "model": "kxc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ozm": {"name": "Koonzime", "native_name": "Koonzime", "char_limit": 192, "model": "ozm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kqy": {"name": "Koorete", "native_name": "Koorete", "char_limit": 192, "model": "kqy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "coe": {"name": "Koreguaje", "native_name": "Koreguaje", "char_limit": 192, "model": "coe.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kpq": {"name": "Korupun-Sela", "native_name": "Korupun-Sela", "char_limit": 192, "model": "kpq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kpy": {"name": "Koryak", "native_name": "Курил", "char_limit": 72, "model": "kpy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "kyf": {"name": "Kouya", "native_name": "Kouya", "char_limit": 192, "model": "kyf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kff-script_telugu": {"name": "Koya", "native_name": "కోయా", "char_limit": 192, "model": "kff-script_telugu.tar.gz", "punctuation": [";\n", ",", "!", "?", "…"]}, - "kri": {"name": "Krio", "native_name": "Krio", "char_limit": 192, "model": "kri.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rop": {"name": "Kriol", "native_name": "Kriol", "char_limit": 192, "model": "rop.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ktj": {"name": "Krumen, Plapo", "native_name": "Krumen, Plapo", "char_limit": 192, "model": "ktj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ted": {"name": "Krumen, Tepo", "native_name": "Krumen, Tepo", "char_limit": 192, "model": "ted.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "krr": {"name": "Krung", "native_name": "Krung", "char_limit": 192, "model": "krr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kdt": {"name": "Kuay", "native_name": "Kuay", "char_limit": 192, "model": "kdt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kez": {"name": "Kukele", "native_name": "Kukele", "char_limit": 192, "model": "kez.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cul": {"name": "Kulina", "native_name": "Kulina", "char_limit": 192, "model": "cul.tar.gz", "punctuation": [",", ";\n", ":", 
";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kle": {"name": "Kulung", "native_name": "Kulung", "char_limit": 192, "model": "kle.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kdi": {"name": "Kumam", "native_name": "Kumam", "char_limit": 192, "model": "kdi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kue": {"name": "Kuman", "native_name": "Kuman", "char_limit": 192, "model": "kue.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kum": {"name": "Kumyk", "native_name": "Къумукъ", "char_limit": 72, "model": "kum.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "kvn": {"name": "Kuna, Border", "native_name": "Kuna, Border", "char_limit": 192, "model": "kvn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cuk": {"name": "Kuna, San Blas", "native_name": "Kuna, San Blas", "char_limit": 192, "model": "cuk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kdn": {"name": "Kunda", "native_name": "Kunda", "char_limit": 192, "model": "kdn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xuo": {"name": "Kuo", "native_name": "Kuo", "char_limit": 192, "model": "xuo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "key": {"name": "Kupia", "native_name": "Kupia", "char_limit": 192, "model": "key.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kpz": {"name": "Kupsapiiny", "native_name": "Kupsapiiny", "char_limit": 192, "model": "kpz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "knk": {"name": "Kuranko", "native_name": "Kuranko", "char_limit": 192, "model": "knk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kmr-script_latin": {"name": "Kurdish, Northern - Latin", "native_name": "Kurmancî", "char_limit": 72, "model": "kmr-script_latin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kmr-script_arabic": {"name": "Kurdish, Northern - Arabic", "native_name": "كورمانجي", "char_limit": 192, "model": "kmr-script_arabic.tar.gz", "punctuation": ["،", ";\n", "؟", "!", "«", "»", "(", ")", "-", "؛", "..."]}, - "kmr-script_cyrillic": {"name": "Kurdish, Northern - Cyrillic", "native_name": "Курманджи", "char_limit": 72, "model": "kmr-script_cyrillic.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "xua": {"name": "Kurumba, Alu", "native_name": "Kurumba", "char_limit": 192, "model": "xua.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kru": {"name": "Kurux", "native_name": "कुड़ुख", "char_limit": 192, "model": "kru.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", 
"—", "॥", "..."]}, - "kus": {"name": "Kusaal", "native_name": "Kusaal", "char_limit": 192, "model": "kus.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kub": {"name": "Kutep", "native_name": "Kutep", "char_limit": 192, "model": "kub.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kdc": {"name": "Kutu", "native_name": "Kutu", "char_limit": 192, "model": "kdc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kxv": {"name": "Kuvi", "native_name": "Kuvi", "char_limit": 192, "model": "kxv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "blh": {"name": "Kuwaa", "native_name": "Kuwaa", "char_limit": 192, "model": "blh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cwt": {"name": "Kuwaataay", "native_name": "Kuwaataay", "char_limit": 192, "model": "cwt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kwd": {"name": "Kwaio", "native_name": "Kwaio", "char_limit": 192, "model": "kwd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tnk": {"name": "Kwamera", "native_name": "Kwamera", "char_limit": 192, "model": "tnk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kwf": {"name": "Kwara’ae", "native_name": "Kwara’ae", "char_limit": 192, "model": "kwf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cwe": {"name": "Kwere", "native_name": "Kwere", "char_limit": 192, "model": "cwe.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kyc": {"name": "Kyaka", "native_name": "Kyaka", "char_limit": 192, "model": "kyc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tye": {"name": "Kyanga", "native_name": "Kyanga", "char_limit": 192, "model": "tye.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kir": {"name": "Kyrgyz", "native_name": "Кыргызча", "char_limit": 72, "model": "kir.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "quc-dialect_north": {"name": "K’iche’ - dialect North", "native_name": "K’iche’", "char_limit": 192, "model": "quc-dialect_north.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "quc-dialect_east": {"name": "K’iche’ - dialect East", "native_name": "K’iche’", "char_limit": 192, "model": "quc-dialect_east.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "quc-dialect_central": {"name": "K’iche’ - dialect Central", "native_name": "K’iche’", "char_limit": 192, "model": "quc-dialect_central.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - 
"lac": {"name": "Lacandon", "native_name": "Lacandon", "char_limit": 192, "model": "lac.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lsi": {"name": "Lacid", "native_name": "Lacid", "char_limit": 192, "model": "lsi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lbj": {"name": "Ladakhi", "native_name": "Ladakhi", "char_limit": 192, "model": "lbj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lhu": {"name": "Lahu", "native_name": "Lahu", "char_limit": 192, "model": "lhu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "las": {"name": "Lama", "native_name": "Lama", "char_limit": 192, "model": "las.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lam": {"name": "Lamba", "native_name": "Lamba", "char_limit": 192, "model": "lam.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lns": {"name": "Lamnso’", "native_name": "Lamnso’", "char_limit": 192, "model": "lns.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ljp": {"name": "Lampung Api", "native_name": "Lampung Api", "char_limit": 192, "model": "ljp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "laj": {"name": "Lango", "native_name": "Lango", "char_limit": 192, "model": "laj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lao": {"name": "Lao", "native_name": "ລາວ", "char_limit": 72, "model": "lao.tar.gz", "punctuation": ["।", "།", "?", "!", "…"]}, - "lat": {"name": "Latin", "native_name": "Latina", "char_limit": 192, "model": "lat.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lav": {"name": "Latvian", "native_name": "Latviešu", "char_limit": 192, "model": "lav.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "law": {"name": "Lauje", "native_name": "Lauje", "char_limit": 192, "model": "law.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lcp": {"name": "Lawa, Western", "native_name": "Lawa", "char_limit": 192, "model": "lcp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lzz": {"name": "Laz", "native_name": "ლაზური", "char_limit": 192, "model": "lzz.tar.gz", "punctuation": [";\n", ",", "!", "?", "…"]}, - "lln": {"name": "Lele", "native_name": "Lele", "char_limit": 192, "model": "lln.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lef": {"name": "Lelemi", "native_name": "Lelemi", "char_limit": 192, "model": "lef.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "acf": {"name": "Lesser Antillean French Creole", "native_name": "Kwéyòl", 
"char_limit": 192, "model": "acf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lww": {"name": "Lewo", "native_name": "Lewo", "char_limit": 192, "model": "lww.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mhx": {"name": "Lhao Vo", "native_name": "Lhao Vo", "char_limit": 192, "model": "mhx.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "eip": {"name": "Lik", "native_name": "Lik", "char_limit": 192, "model": "eip.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lia": {"name": "Limba, West-Central", "native_name": "Limba", "char_limit": 192, "model": "lia.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lif": {"name": "Limbu", "native_name": "ᤕᤠᤰᤌᤢᤱ", "char_limit": 192, "model": "lif.tar.gz", "punctuation": ["।", "?", "!", "…"]}, - "onb": {"name": "Lingao", "native_name": "Lingao", "char_limit": 192, "model": "onb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lis": {"name": "Lisu", "native_name": "ꓡꓲꓢꓳ", "char_limit": 192, "model": "lis.tar.gz", "punctuation": ["꓾", "꓿", "!", "?", "…"]}, - "loq": {"name": "Lobala", "native_name": "Lobala", "char_limit": 192, "model": "loq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lob": {"name": "Lobi", "native_name": "Lobi", "char_limit": 192, "model": "lob.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yaz": {"name": "Lokaa", "native_name": "Lokaa", "char_limit": 192, "model": "yaz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lok": {"name": "Loko", "native_name": "Loko", "char_limit": 192, "model": "lok.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "llg": {"name": "Lole", "native_name": "Lole", "char_limit": 192, "model": "llg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ycl": {"name": "Lolopo", "native_name": "Lolopo", "char_limit": 192, "model": "ycl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lom": {"name": "Loma", "native_name": "Loma", "char_limit": 192, "model": "lom.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ngl": {"name": "Lomwe", "native_name": "Lomwe", "char_limit": 192, "model": "ngl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lon": {"name": "Lomwe, Malawi", "native_name": "Lomwe", "char_limit": 192, "model": "lon.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lex": {"name": "Luang", "native_name": "Luang", "char_limit": 192, "model": "lex.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", 
"[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lgg": {"name": "Lugbara", "native_name": "Lugbara", "char_limit": 192, "model": "lgg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ruf": {"name": "Luguru", "native_name": "Luguru", "char_limit": 192, "model": "ruf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dop": {"name": "Lukpa", "native_name": "Lukpa", "char_limit": 192, "model": "dop.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lnd": {"name": "Lundayeh", "native_name": "Lundayeh", "char_limit": 192, "model": "lnd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ndy": {"name": "Lutos", "native_name": "Lutos", "char_limit": 192, "model": "ndy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lwo": {"name": "Luwo", "native_name": "Luwo", "char_limit": 192, "model": "lwo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lee": {"name": "Lyélé", "native_name": "Lyélé", "char_limit": 72, "model": "lee.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mev": {"name": "Maan", "native_name": "Maan", "char_limit": 192, "model": "mev.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mfz": {"name": "Mabaan", "native_name": "Mabaan", "char_limit": 192, "model": "mfz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "jmc": {"name": "Machame", "native_name": "Machame", "char_limit": 192, "model": "jmc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "myy": {"name": "Macuna", "native_name": "Macuna", "char_limit": 192, "model": "myy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mbc": {"name": "Macushi", "native_name": "Macushi", "char_limit": 192, "model": "mbc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mda": {"name": "Mada", "native_name": "Mada", "char_limit": 192, "model": "mda.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mad": {"name": "Madura", "native_name": "Madura", "char_limit": 192, "model": "mad.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mag": {"name": "Magahi", "native_name": "Magahi", "char_limit": 192, "model": "mag.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ayz": {"name": "Mai Brat", "native_name": "Mai Brat", "char_limit": 192, "model": "ayz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mai": {"name": "Maithili", "native_name": "मैथिली", "char_limit": 192, "model": "mai.tar.gz", 
"punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "mca": {"name": "Maka", "native_name": "Maka", "char_limit": 192, "model": "mca.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mcp": {"name": "Makaa", "native_name": "Makaa", "char_limit": 192, "model": "mcp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mak": {"name": "Makasar", "native_name": "Makasar", "char_limit": 192, "model": "mak.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "vmw": {"name": "Makhuwa", "native_name": "Makhuwa", "char_limit": 192, "model": "vmw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mgh": {"name": "Makhuwa-Meetto", "native_name": "Makhuwa-Meetto", "char_limit": 192, "model": "mgh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kde": {"name": "Makonde", "native_name": "Makonde", "char_limit": 192, "model": "kde.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mlg": {"name": "Malagasy", "native_name": "Malagasy", "char_limit": 192, "model": "mlg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zlm": {"name": "Malay", "native_name": "Bahasa Melayu", "char_limit": 72, "model": "zlm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pse": {"name": "Malay, Central", "native_name": "Bahasa Melayu Tengah", "char_limit": 72, "model": "pse.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mkn": {"name": "Malay, Kupang", "native_name": "Bahasa Melayu Kupang", "char_limit": 192, "model": "mkn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xmm": {"name": "Malay, Manado", "native_name": "Bahasa Melayu Manado", "char_limit": 192, "model": "xmm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mal": {"name": "Malayalam", "native_name": "മലയാളം", "char_limit": 192, "model": "mal.tar.gz", "punctuation": ["।", ",", "!", "?", "…"]}, - "xdy": {"name": "Malayic Dayak", "native_name": "Dayak Melayu", "char_limit": 72, "model": "xdy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "div": {"name": "Maldivian", "native_name": "ދިވެހި", "char_limit": 41, "model": "div.tar.gz", "punctuation": [";\n", ",", "!", "?", "…"]}, - "mdy": {"name": "Male", "native_name": "Male", "char_limit": 192, "model": "mdy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mup": {"name": "Malvi", "native_name": "Malvi", "char_limit": 192, "model": "mup.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mam-dialect_central": {"name": "Mam - dialect Central", "native_name": "Mam", "char_limit": 192, "model": 
"mam-dialect_central.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mam-dialect_northern": {"name": "Mam - dialect Northern", "native_name": "Mam", "char_limit": 192, "model": "mam-dialect_northern.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mam-dialect_southern": {"name": "Mam - dialect Southern", "native_name": "Mam", "char_limit": 192, "model": "mam-dialect_southern.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mam-dialect_western": {"name": "Mam - dialect Western", "native_name": "Mam", "char_limit": 192, "model": "mam-dialect_western.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mqj": {"name": "Mamasa", "native_name": "Mamasa", "char_limit": 192, "model": "mqj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mcu": {"name": "Mambila, Cameroon", "native_name": "Mambila", "char_limit": 192, "model": "mcu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mzk": {"name": "Mambila, Nigeria", "native_name": "Mambila", "char_limit": 192, "model": "mzk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "maw": {"name": "Mampruli", "native_name": "Mampruli", "char_limit": 192, "model": "maw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mjl": {"name": "Mandeali", "native_name": "Mandeali", "char_limit": 192, "model": "mjl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mnk": {"name": "Mandinka", "native_name": "Mandinka", "char_limit": 192, "model": "mnk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mge": {"name": "Mango", "native_name": "Mango", "char_limit": 192, "model": "mge.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mbh": {"name": "Mangseng", "native_name": "Mangseng", "char_limit": 192, "model": "mbh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "knf": {"name": "Mankanya", "native_name": "Mankanya", "char_limit": 192, "model": "knf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mjv": {"name": "Mannan", "native_name": "Mannan", "char_limit": 192, "model": "mjv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mbt": {"name": "Manobo, Matigsalug", "native_name": "Manobo", "char_limit": 192, "model": "mbt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "obo": {"name": "Manobo, Obo", "native_name": "Manobo", "char_limit": 192, "model": "obo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mbb": 
{"name": "Manobo, Western Bukidnon", "native_name": "Manobo", "char_limit": 192, "model": "mbb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mzj": {"name": "Manya", "native_name": "Manya", "char_limit": 192, "model": "mzj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sjm": {"name": "Mapun", "native_name": "Mapun", "char_limit": 192, "model": "sjm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mrw": {"name": "Maranao", "native_name": "Maranao", "char_limit": 192, "model": "mrw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mar": {"name": "Marathi", "native_name": "मराठी", "char_limit": 192, "model": "mar.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "mpg": {"name": "Marba", "native_name": "Marba", "char_limit": 192, "model": "mpg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mhr": {"name": "Mari, Meadow", "native_name": "Марий", "char_limit": 192, "model": "mhr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "enb": {"name": "Markweeta", "native_name": "Markweeta", "char_limit": 192, "model": "enb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mah": {"name": "Marshallese", "native_name": "Kajin M̧ajeļ", "char_limit": 192, "model": "mah.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "myx": {"name": "Masaaba", "native_name": "Masaaba", "char_limit": 192, "model": "myx.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "klv": {"name": "Maskelynes", "native_name": "Maskelynes", "char_limit": 192, "model": "klv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mfh": {"name": "Matal", "native_name": "Matal", "char_limit": 192, "model": "mfh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "met": {"name": "Mato", "native_name": "Mato", "char_limit": 192, "model": "met.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mcb": {"name": "Matsigenka", "native_name": "Matsigenka", "char_limit": 192, "model": "mcb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mop": {"name": "Maya, Mopán", "native_name": "Mopán", "char_limit": 192, "model": "mop.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yua": {"name": "Maya, Yucatec", "native_name": "Yucateco", "char_limit": 192, "model": "yua.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mfy": {"name": "Mayo", "native_name": "Mayo", "char_limit": 192, "model": "mfy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", 
"(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "maz": {"name": "Mazahua, Central", "native_name": "Mazahua", "char_limit": 72, "model": "maz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "vmy": {"name": "Mazatec, Ayautla", "native_name": "Ayautla", "char_limit": 192, "model": "vmy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "maq": {"name": "Mazatec, Chiquihuitlán", "native_name": "Chiquihuitlán", "char_limit": 192, "model": "maq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mzi": {"name": "Mazatec, Ixcatlán", "native_name": "Ixcatlán", "char_limit": 192, "model": "mzi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "maj": {"name": "Mazatec, Jalapa de Díaz", "native_name": "Jalapa de Díaz", "char_limit": 41, "model": "maj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "maa-dialect_sanantonio": {"name": "Mazatec, San Jerónimo Tecóatl - dialect San Antonio", "native_name": "San Jerónimo Tecóatl", "char_limit": 41, "model": "maa-dialect_sanantonio.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "maa-dialect_sanjerónimo": {"name": "Mazatec, San Jerónimo Tecóatl - dialect San Jerónimo", "native_name": "San Jerónimo Tecóatl", "char_limit": 41, "model": "maa-dialect_sanjerónimo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mhy": {"name": "Ma’anyan", "native_name": "Ma’anyan", "char_limit": 192, "model": "mhy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mhi": {"name": "Ma’di", "native_name": "Ma’di", "char_limit": 192, "model": "mhi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zmz": {"name": "Mbandja", "native_name": "Mbandja", "char_limit": 192, "model": "zmz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "myb": {"name": "Mbay", "native_name": "Mbay", "char_limit": 192, "model": "myb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gai": {"name": "Mbore", "native_name": "Mbore", "char_limit": 192, "model": "gai.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mqb": {"name": "Mbuko", "native_name": "Mbuko", "char_limit": 192, "model": "mqb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mbu": {"name": "Mbula-Bwazza", "native_name": "Mbula-Bwazza", "char_limit": 72, "model": "mbu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "med": {"name": "Melpa", "native_name": "Melpa", "char_limit": 192, "model": "med.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "men": 
{"name": "Mende", "native_name": "Mende", "char_limit": 192, "model": "men.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mee": {"name": "Mengen", "native_name": "Mengen", "char_limit": 192, "model": "mee.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mwv": {"name": "Mentawai", "native_name": "Mentawai", "char_limit": 192, "model": "mwv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "meq": {"name": "Merey", "native_name": "Merey", "char_limit": 192, "model": "meq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zim": {"name": "Mesme", "native_name": "Mesme", "char_limit": 192, "model": "zim.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mgo": {"name": "Meta’", "native_name": "Meta’", "char_limit": 192, "model": "mgo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mej": {"name": "Meyah", "native_name": "Meyah", "char_limit": 192, "model": "mej.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mpp": {"name": "Migabac", "native_name": "Migabac", "char_limit": 192, "model": "mpp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "min": {"name": "Minangkabau", "native_name": "Minangkabau", "char_limit": 72, "model": "min.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gum": {"name": "Misak", "native_name": "Misak", "char_limit": 192, "model": "gum.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mpx": {"name": "Misima-Panaeati", "native_name": "Misima-Panaeati", "char_limit": 192, "model": "mpx.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mco": {"name": "Mixe, Coatlán", "native_name": "Coatlán", "char_limit": 192, "model": "mco.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mxq": {"name": "Mixe, Juquila", "native_name": "Juquila", "char_limit": 192, "model": "mxq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pxm": {"name": "Mixe, Quetzaltepec", "native_name": "Mixe, Quetzaltepec", "char_limit": 192, "model": "pxm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mto": {"name": "Mixe, Totontepec", "native_name": "Mixe, Totontepec", "char_limit": 192, "model": "mto.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mim": {"name": "Mixtec, Alacatlatzala", "native_name": "Mixtec, Alacatlatzala", "char_limit": 192, "model": "mim.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xta": {"name": "Mixtec, Alcozauca", 
"native_name": "Mixtec, Alcozauca", "char_limit": 192, "model": "xta.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mbz": {"name": "Mixtec, Amoltepec", "native_name": "Mixtec, Amoltepec", "char_limit": 192, "model": "mbz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mip": {"name": "Mixtec, Apasco-Apoala", "native_name": "Mixtec, Apasco-Apoala", "char_limit": 192, "model": "mip.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mib": {"name": "Mixtec, Atatlahuca", "native_name": "Mixtec, Atatlahuca", "char_limit": 192, "model": "mib.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "miy": {"name": "Mixtec, Ayutla", "native_name": "Mixtec, Ayutla", "char_limit": 192, "model": "miy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mih": {"name": "Mixtec, Chayuco", "native_name": "Mixtec, Chayuco", "char_limit": 192, "model": "mih.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "miz": {"name": "Mixtec, Coatzospan", "native_name": "Mixtec, Coatzospan", "char_limit": 192, "model": "miz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xtd": {"name": "Mixtec, Diuxi-Tilantongo", "native_name": "Mixtec, Diuxi-Tilantongo", "char_limit": 192, "model": "xtd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mxt": {"name": "Mixtec, Jamiltepec", "native_name": "Mixtec, Jamiltepec", "char_limit": 192, "model": "mxt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xtm": {"name": "Mixtec, Magdalena Peñasco", "native_name": "Mixtec, Magdalena Peñasco", "char_limit": 192, "model": "xtm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mxv": {"name": "Mixtec, Metlatónoc", "native_name": "Mixtec, Metlatónoc", "char_limit": 192, "model": "mxv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xtn": {"name": "Mixtec, Northern Tlaxiaco", "native_name": "Mixtec, Northern Tlaxiaco", "char_limit": 192, "model": "xtn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mie": {"name": "Mixtec, Ocotepec", "native_name": "Mixtec, Ocotepec", "char_limit": 192, "model": "mie.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mil": {"name": "Mixtec, Peñoles", "native_name": "Mixtec, Peñoles", "char_limit": 192, "model": "mil.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mio": {"name": "Mixtec, Pinotepa Nacional", "native_name": "Mixtec, Pinotepa Nacional", "char_limit": 192, "model": "mio.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - 
"mdv": {"name": "Mixtec, Santa Lucía Monteverde", "native_name": "Mixtec, Santa Lucía Monteverde", "char_limit": 192, "model": "mdv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mza": {"name": "Mixtec, Santa María Zacatepec", "native_name": "Mixtec, Santa María Zacatepec", "char_limit": 192, "model": "mza.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mit": {"name": "Mixtec, Southern Puebla", "native_name": "Mixtec, Southern Puebla", "char_limit": 192, "model": "mit.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mxb": {"name": "Mixtec, Tezoatlán", "native_name": "Mixtec, Tezoatlán", "char_limit": 192, "model": "mxb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mpm": {"name": "Mixtec, Yosondúa", "native_name": "Mixtec, Yosondúa", "char_limit": 192, "model": "mpm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "soy": {"name": "Miyobe", "native_name": "Miyobe", "char_limit": 192, "model": "soy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cmo-script_latin": {"name": "Mnong, Central - Latin", "native_name": "Mnong, Central", "char_limit": 192, "model": "cmo-script_latin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cmo-script_khmer": {"name": "Mnong, Central - Khmer", "native_name": "Mnong, Central", "char_limit": 192, "model": "cmo-script_khmer.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mfq": {"name": "Moba", "native_name": "Moba", "char_limit": 192, "model": "mfq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "old": {"name": "Mochi", "native_name": "Mochi", "char_limit": 192, "model": "old.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mfk": {"name": "Mofu, North", "native_name": "Mofu, North", "char_limit": 192, "model": "mfk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mif": {"name": "Mofu-Gudur", "native_name": "Mofu-Gudur", "char_limit": 192, "model": "mif.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mkl": {"name": "Mokole", "native_name": "Mokole", "char_limit": 192, "model": "mkl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mox": {"name": "Molima", "native_name": "Molima", "char_limit": 192, "model": "mox.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "myl": {"name": "Moma", "native_name": "Moma", "char_limit": 192, "model": "myl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mqf": {"name": "Momuna", "native_name": "Momuna", "char_limit": 192, "model": 
"mqf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mnw": {"name": "Mon", "native_name": "မွန်", "char_limit": 72, "model": "mnw.tar.gz", "punctuation": ["။", "၊", "!", "?", "…"]}, - "mon": {"name": "Mongolian", "native_name": "Монгол", "char_limit": 192, "model": "mon.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "mog": {"name": "Mongondow", "native_name": "Mongondow", "char_limit": 192, "model": "mog.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mfe": {"name": "Morisyen", "native_name": "Morisyen", "char_limit": 192, "model": "mfe.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mor": {"name": "Moro", "native_name": "Moro", "char_limit": 192, "model": "mor.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mqn": {"name": "Moronene", "native_name": "Moronene", "char_limit": 192, "model": "mqn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mgd": {"name": "Moru", "native_name": "Moru", "char_limit": 192, "model": "mgd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mtj": {"name": "Moskona", "native_name": "Moskona", "char_limit": 192, "model": "mtj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cmr": {"name": "Mro-Khimi", "native_name": "Mro-Khimi", "char_limit": 192, "model": "cmr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mtd": {"name": "Mualang", "native_name": "Mualang", "char_limit": 192, "model": "mtd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bmr": {"name": "Muinane", "native_name": "Muinane", "char_limit": 192, "model": "bmr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "moz": {"name": "Mukulu", "native_name": "Mukulu", "char_limit": 192, "model": "moz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mzm": {"name": "Mumuye", "native_name": "Mumuye", "char_limit": 192, "model": "mzm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mnb": {"name": "Muna", "native_name": "Muna", "char_limit": 192, "model": "mnb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mnf": {"name": "Mundani", "native_name": "Mundani", "char_limit": 192, "model": "mnf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "unr": {"name": "Mundari", "native_name": "Mundari", "char_limit": 192, "model": "unr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "fmu": {"name": "Muria, Far Western", "native_name": "Muria, Far Western", 
"char_limit": 72, "model": "fmu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mur": {"name": "Murle", "native_name": "Murle", "char_limit": 192, "model": "mur.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tih": {"name": "Murut, Timugon", "native_name": "Murut, Timugon", "char_limit": 192, "model": "tih.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "muv": {"name": "Muthuvan", "native_name": "Muthuvan", "char_limit": 192, "model": "muv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "muy": {"name": "Muyang", "native_name": "Muyang", "char_limit": 192, "model": "muy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sur": {"name": "Mwaghavul", "native_name": "Mwaghavul", "char_limit": 192, "model": "sur.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "moa": {"name": "Mwan", "native_name": "Mwan", "char_limit": 192, "model": "moa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "wmw": {"name": "Mwani", "native_name": "Mwani", "char_limit": 192, "model": "wmw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tnr": {"name": "Ménik", "native_name": "Ménik", "char_limit": 192, "model": "tnr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "miq": {"name": "Mískito", "native_name": "Mískito", "char_limit": 192, "model": "miq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mos": {"name": "Mòoré", "native_name": "Mòoré", "char_limit": 192, "model": "mos.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "muh": {"name": "Mündü", "native_name": "Mündü", "char_limit": 192, "model": "muh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nas": {"name": "Naasioi", "native_name": "Naasioi", "char_limit": 192, "model": "nas.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mbj": {"name": "Nadëb", "native_name": "Nadëb", "char_limit": 72, "model": "mbj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nfr": {"name": "Nafaanra", "native_name": "Nafaanra", "char_limit": 192, "model": "nfr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kfw": {"name": "Naga, Kharam", "native_name": "Naga, Kharam", "char_limit": 192, "model": "kfw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nst": {"name": "Naga, Tangshang", "native_name": "Naga, Tangshang", "char_limit": 192, "model": "nst.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", 
")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nag": {"name": "Nagamese", "native_name": "Nagamese", "char_limit": 192, "model": "nag.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nch": {"name": "Nahuatl, Central Huasteca", "native_name": "Nāhuatl Central Huasteca", "char_limit": 72, "model": "nch.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nhe": {"name": "Nahuatl, Eastern Huasteca", "native_name": "Nāhuatl Eastern Huastec", "char_limit": 72, "model": "nhe.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ngu": {"name": "Nahuatl, Guerrero", "native_name": "Nāhuatl Guerrero", "char_limit": 192, "model": "ngu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "azz": {"name": "Nahuatl, Highland Puebla", "native_name": "Nāhuatl Puebla Alta", "char_limit": 72, "model": "azz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nhx": {"name": "Nahuatl, Isthmus-Mecayapan", "native_name": "Nāhuatl Istmo Mecayapan", "char_limit": 192, "model": "nhx.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ncl": {"name": "Nahuatl, Michoacán", "native_name": "Nāhuatl Michoacán", "char_limit": 192, "model": "ncl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nhy": {"name": "Nahuatl, Northern Oaxaca", "native_name": "Nāhuatl Oaxaca Norte", "char_limit": 192, "model": "nhy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ncj": {"name": "Nahuatl, Northern Puebla", "native_name": "Nāhuatl Puebla Norte", "char_limit": 72, "model": "ncj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nsu": {"name": "Nahuatl, Sierra Negra", "native_name": "Nāhuatl Sierra Negra", "char_limit": 192, "model": "nsu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "npl": {"name": "Nahuatl, Southeastern Puebla", "native_name": "Nāhuatl Sureste Puebla", "char_limit": 72, "model": "npl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nuz": {"name": "Nahuatl, Tlamacazapa", "native_name": "Nāhuatl Tlamacazapa", "char_limit": 72, "model": "nuz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nhw": {"name": "Nahuatl, Western Huasteca", "native_name": "Nahuatl, Western Huasteca", "char_limit": 72, "model": "nhw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nhi": {"name": "Nahuatl, Zacatlán-Ahuacatlán-Tepetzintla", "native_name": "Nāhuatl Zacatlán-Ahuacatlán-Tepetzintla", "char_limit": 41, "model": "nhi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nlc": {"name": "Nalca", "native_name": "Nalca", "char_limit": 192, 
"model": "nlc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nab": {"name": "Nambikuára, Southern", "native_name": "Nambikuára Meridional", "char_limit": 192, "model": "nab.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gld": {"name": "Nanai", "native_name": "Нанай", "char_limit": 192, "model": "gld.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "nnb": {"name": "Nande", "native_name": "Nande", "char_limit": 192, "model": "nnb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "npy": {"name": "Napu", "native_name": "Napu", "char_limit": 192, "model": "npy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pbb": {"name": "Nasa", "native_name": "Nasa Yuwe", "char_limit": 192, "model": "pbb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ntm": {"name": "Nateni", "native_name": "Nateni", "char_limit": 192, "model": "ntm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nmz": {"name": "Nawdm", "native_name": "Nawdm", "char_limit": 192, "model": "nmz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "naw": {"name": "Nawuri", "native_name": "Nawuri", "char_limit": 192, "model": "naw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nxq": {"name": "Naxi", "native_name": "纳西语", "char_limit": 192, "model": "nxq.tar.gz", "punctuation": ["。", ",", "!", "?", "…"]}, - "ndj": {"name": "Ndamba", "native_name": "Ndamba", "char_limit": 192, "model": "ndj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ndz": {"name": "Ndogo", "native_name": "Ndogo", "char_limit": 192, "model": "ndz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ndv": {"name": "Ndut", "native_name": "Ndut", "char_limit": 192, "model": "ndv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "new": {"name": "Newar", "native_name": "नेपाल भाषा", "char_limit": 192, "model": "new.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "nij": {"name": "Ngaju", "native_name": "Ngaju", "char_limit": 192, "model": "nij.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sba": {"name": "Ngambay", "native_name": "Ngambay", "char_limit": 192, "model": "sba.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gng": {"name": "Ngangam", "native_name": "Ngangam", "char_limit": 192, "model": "gng.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nga": {"name": "Ngbaka", "native_name": "Ngbaka", "char_limit": 192, "model": "nga.tar.gz", "punctuation": [",", 
";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nnq": {"name": "Ngindo", "native_name": "Ngindo", "char_limit": 192, "model": "nnq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ngp": {"name": "Ngulu", "native_name": "Ngulu", "char_limit": 192, "model": "ngp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gym": {"name": "Ngäbere", "native_name": "Ngäbere", "char_limit": 192, "model": "gym.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kdj": {"name": "Ng’akarimojong", "native_name": "Ng’akarimojong", "char_limit": 192, "model": "kdj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nia": {"name": "Nias", "native_name": "Nias", "char_limit": 192, "model": "nia.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nim": {"name": "Nilamba", "native_name": "Nilamba", "char_limit": 192, "model": "nim.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nin": {"name": "Ninzo", "native_name": "Ninzo", "char_limit": 192, "model": "nin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nko": {"name": "Nkonya", "native_name": "Nkonya", "char_limit": 192, "model": "nko.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nog": {"name": "Nogai", "native_name": "Nogai", "char_limit": 192, "model": "nog.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lem": {"name": "Nomaande", "native_name": "Nomaande", "char_limit": 192, "model": "lem.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "not": {"name": "Nomatsigenga", "native_name": "Nomatsigenga", "char_limit": 192, "model": "not.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nhu": {"name": "Noone", "native_name": "Noone", "char_limit": 192, "model": "nhu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nob": {"name": "Norwegian Bokmål", "native_name": "norsk bokmål", "char_limit": 192, "model": "nb_core_news_md", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bud": {"name": "Ntcham", "native_name": "Ntcham", "char_limit": 192, "model": "bud.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nus": {"name": "Nuer", "native_name": "Nuer", "char_limit": 192, "model": "nus.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yas": {"name": "Nugunu", "native_name": "Nugunu", "char_limit": 192, "model": "yas.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nnw": 
{"name": "Nuni, Southern", "native_name": "Nuni, Southern", "char_limit": 192, "model": "nnw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nwb": {"name": "Nyabwa", "native_name": "Nyabwa", "char_limit": 192, "model": "nwb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nyy": {"name": "Nyakyusa-Ngonde", "native_name": "Nyakyusa-Ngonde", "char_limit": 192, "model": "nyy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nyn": {"name": "Nyankore", "native_name": "Nyankore", "char_limit": 192, "model": "nyn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rim": {"name": "Nyaturu", "native_name": "Nyaturu", "char_limit": 192, "model": "rim.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lid": {"name": "Nyindrou", "native_name": "Nyindrou", "char_limit": 192, "model": "lid.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nuj": {"name": "Nyole", "native_name": "Nyole", "char_limit": 192, "model": "nuj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nyo": {"name": "Nyoro", "native_name": "Nyoro", "char_limit": 192, "model": "nyo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nzi": {"name": "Nzema", "native_name": "Nzema", "char_limit": 192, "model": "nzi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ann": {"name": "Obolo", "native_name": "Obolo", "char_limit": 192, "model": "ann.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ory": {"name": "Odia", "native_name": "ଓଡ଼ିଆ", "char_limit": 192, "model": "ory.tar.gz", "punctuation": ["।", ",", "!", "?", "…"]}, - "ojb-script_latin": {"name": "Ojibwa, Northwestern - Latin", "native_name": "Ojibwa", "char_limit": 192, "model": "ojb-script_latin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ojb-script_syllabics": {"name": "Ojibwa, Northwestern - Syllabics", "native_name": "ᐊᒋᒧᐎᓐ", "char_limit": 41, "model": "ojb-script_syllabics.tar.gz", "punctuation": [",", ";\n", "?", "!", "(", ")", ":", ";", "—", "-", "“", "”", "..."]}, - "oku": {"name": "Oku", "native_name": "Oku", "char_limit": 192, "model": "oku.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bsc": {"name": "Oniyan", "native_name": "Oniyan", "char_limit": 192, "model": "bsc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bdu": {"name": "Oroko", "native_name": "Oroko", "char_limit": 192, "model": "bdu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "orm": {"name": "Oromo", "native_name": "Oromoo", "char_limit": 192, "model": "orm.tar.gz", "punctuation": [",", ";\n", ":", 
";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ury": {"name": "Orya", "native_name": "Orya", "char_limit": 192, "model": "ury.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "oss": {"name": "Ossetic", "native_name": "Ирон", "char_limit": 192, "model": "oss.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "ote": {"name": "Otomi, Mezquital", "native_name": "Hñähñu", "char_limit": 192, "model": "ote.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "otq": {"name": "Otomi, Querétaro", "native_name": "Ñañhö", "char_limit": 192, "model": "otq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "stn": {"name": "Owa", "native_name": "Owa", "char_limit": 192, "model": "stn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sig": {"name": "Paasaal", "native_name": "Paasaal", "char_limit": 192, "model": "sig.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kfx": {"name": "Pahari, Kullu", "native_name": "कुल्लू पहाड़ी", "char_limit": 72, "model": "kfx.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "bfz": {"name": "Pahari, Mahasu", "native_name": "महासू पहाड़ी", "char_limit": 192, "model": "bfz.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "sey": {"name": "Paicoca", "native_name": "Paicoca", "char_limit": 192, "model": "sey.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pao": {"name": "Paiute, Northern", "native_name": "Numu", "char_limit": 192, "model": "pao.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pau": {"name": "Palauan", "native_name": "Palauan", "char_limit": 192, "model": "pau.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pce": {"name": "Palaung, Ruching", "native_name": "Ruching", "char_limit": 192, "model": "pce.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "plw": {"name": "Palawano, Brooke’s Point", "native_name": "Palawano", "char_limit": 192, "model": "plw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pmf": {"name": "Pamona", "native_name": "Pamona", "char_limit": 192, "model": "pmf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pag": {"name": "Pangasinan", "native_name": "Pangasinan", "char_limit": 192, "model": "pag.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pap": {"name": "Papiamentu", "native_name": "Papiamentu", "char_limit": 192, "model": "pap.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "prf": {"name": "Paranan", "native_name": "Paranan", 
"char_limit": 192, "model": "prf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pab": {"name": "Parecís", "native_name": "Haliti", "char_limit": 192, "model": "pab.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pbi": {"name": "Parkwa", "native_name": "Parkwa", "char_limit": 192, "model": "pbi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pbc": {"name": "Patamona", "native_name": "Patamona", "char_limit": 192, "model": "pbc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pad": {"name": "Paumarí", "native_name": "Paumarí", "char_limit": 192, "model": "pad.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ata": {"name": "Pele-Ata", "native_name": "Pele-Ata", "char_limit": 192, "model": "ata.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pez": {"name": "Penan, Eastern", "native_name": "Penan", "char_limit": 192, "model": "pez.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "peg": {"name": "Pengo", "native_name": "Pengo", "char_limit": 192, "model": "peg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pcm": {"name": "Pidgin, Nigerian", "native_name": "Naijá", "char_limit": 192, "model": "pcm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pis": {"name": "Pijin", "native_name": "Pijin", "char_limit": 192, "model": "pis.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pny": {"name": "Pinyin", "native_name": "Pinyin", "char_limit": 192, "model": "pny.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pir": {"name": "Piratapuyo", "native_name": "Piratapuyo", "char_limit": 192, "model": "pir.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pjt": {"name": "Pitjantjatjara", "native_name": "Pitjantjatjara", "char_limit": 192, "model": "pjt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "poy": {"name": "Pogolo", "native_name": "Pogolo", "char_limit": 192, "model": "poy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pps": {"name": "Popoloca, San Luís Temalacayuca", "native_name": "Popoloca de San Luís Temalacayuca", "char_limit": 200, "model": "pps.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pls": {"name": "Popoloca, San Marcos Tlacoyalco", "native_name": "Popoloca de San Marcos Tlacoyalco", "char_limit": 200, "model": "pls.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "poi": {"name": "Popoluca, Highland", 
"native_name": "Popoluca de la Sierra", "char_limit": 200, "model": "poi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "poh-dialect_eastern": {"name": "Poqomchi’ - dialect Eastern", "native_name": "Poqomchi’", "char_limit": 200, "model": "poh-dialect_eastern.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "poh-dialect_western": {"name": "Poqomchi’ - dialect Western", "native_name": "Poqomchi’", "char_limit": 200, "model": "poh-dialect_western.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "prt": {"name": "Prai", "native_name": "Prai", "char_limit": 200, "model": "prt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pui": {"name": "Puinave", "native_name": "Puinave", "char_limit": 200, "model": "pui.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pan": {"name": "Punjabi, Eastern", "native_name": "ਪੰਜਾਬੀ", "char_limit": 192, "model": "pan.tar.gz", "punctuation": ["।", "،", "؟"]}, - "tsz": {"name": "Purepecha", "native_name": "Purépecha", "char_limit": 200, "model": "tsz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "suv": {"name": "Puroik", "native_name": "Puroik", "char_limit": 200, "model": "suv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lme": {"name": "Pévé", "native_name": "Pévé", "char_limit": 200, "model": "lme.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "quy": {"name": "Quechua, Ayacucho", "native_name": "Runasimi", "char_limit": 72, "model": "quy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qvc": {"name": "Quechua, Cajamarca", "native_name": "Runasimi", "char_limit": 72, "model": "qvc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "quz": {"name": "Quechua, Cusco", "native_name": "Runasimi", "char_limit": 72, "model": "quz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qve": {"name": "Quechua, Eastern Apurímac", "native_name": "Runasimi", "char_limit": 72, "model": "qve.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qub": {"name": "Quechua, Huallaga", "native_name": "Runasimi", "char_limit": 72, "model": "qub.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qvh": {"name": "Quechua, Huamalíes-Dos de Mayo Huánuco", "native_name": "Runasimi", "char_limit": 72, "model": "qvh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qwh": {"name": "Quechua, Huaylas Ancash", "native_name": "Runasimi", "char_limit": 72, "model": "qwh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - 
"qvw": {"name": "Quechua, Huaylla Wanca", "native_name": "Runasimi", "char_limit": 72, "model": "qvw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "quf": {"name": "Quechua, Lambayeque", "native_name": "Runasimi", "char_limit": 72, "model": "quf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qvm": {"name": "Quechua, Margos-Yarowilca-Lauricocha", "native_name": "Runasimi", "char_limit": 72, "model": "qvm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qul": {"name": "Quechua, North Bolivian", "native_name": "Runasimi", "char_limit": 72, "model": "qul.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qvn": {"name": "Quechua, North Junín", "native_name": "Runasimi", "char_limit": 72, "model": "qvn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qxn": {"name": "Quechua, Northern Conchucos Ancash", "native_name": "Runasimi", "char_limit": 72, "model": "qxn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qxh": {"name": "Quechua, Panao", "native_name": "Runasimi", "char_limit": 72, "model": "qxh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qvs": {"name": "Quechua, San Martín", "native_name": "Runasimi", "char_limit": 72, "model": "qvs.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "quh": {"name": "Quechua, South Bolivian", "native_name": "Runasimi", "char_limit": 72, "model": "quh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qxo": {"name": "Quechua, Southern Conchucos", "native_name": "Runasimi", "char_limit": 72, "model": "qxo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qxr": {"name": "Quichua, Cañar Highland", "native_name": "Runasimi", "char_limit": 72, "model": "qxr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qvo": {"name": "Quichua, Napo", "native_name": "Runasimi", "char_limit": 72, "model": "qvo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qvz": {"name": "Quichua, Northern Pastaza", "native_name": "Runasimi", "char_limit": 72, "model": "qvz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "qxl": {"name": "Quichua, Salasaca Highland", "native_name": "Runasimi", "char_limit": 72, "model": "qxl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "quw": {"name": "Quichua, Tena Lowland", "native_name": "Runasimi", "char_limit": 72, "model": "quw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kjb": {"name": "Q’anjob’al", "native_name": "Q’anjob’al", "char_limit": 200, "model": 
"kjb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kek": {"name": "Q’eqchi’", "native_name": "Q’eqchi’", "char_limit": 200, "model": "kek.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rah": {"name": "Rabha", "native_name": "Rabha", "char_limit": 200, "model": "rah.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rjs": {"name": "Rajbanshi", "native_name": "Rajbanshi", "char_limit": 192, "model": "rjs.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rai": {"name": "Ramoaaina", "native_name": "Ramoaaina", "char_limit": 192, "model": "rai.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lje": {"name": "Rampi", "native_name": "Rampi", "char_limit": 192, "model": "lje.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rnl": {"name": "Ranglong", "native_name": "Ranglong", "char_limit": 192, "model": "rnl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rkt": {"name": "Rangpuri", "native_name": "Rangpuri", "char_limit": 192, "model": "rkt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rap": {"name": "Rapa Nui", "native_name": "Rapa Nui", "char_limit": 192, "model": "rap.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yea": {"name": "Ravula", "native_name": "Ravula", "char_limit": 192, "model": "yea.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "raw": {"name": "Rawang", "native_name": "Rawang", "char_limit": 192, "model": "raw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rej": {"name": "Rejang", "native_name": "Rejang", "char_limit": 192, "model": "rej.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rel": {"name": "Rendille", "native_name": "Rendille", "char_limit": 192, "model": "rel.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ril": {"name": "Riang Lang", "native_name": "Riang Lang", "char_limit": 192, "model": "ril.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "iri": {"name": "Rigwe", "native_name": "Rigwe", "char_limit": 192, "model": "iri.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rgu": {"name": "Rikou", "native_name": "Rikou", "char_limit": 192, "model": "rgu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rhg": {"name": "Rohingya", "native_name": "Ruáingga", "char_limit": 192, "model": "rhg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", 
"...", "“", "”", "/"]}, - "rmc-script_latin": {"name": "Romani, Carpathian - Latin", "native_name": "Romani Čhib", "char_limit": 72, "model": "rmc-script_latin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rmc-script_cyrillic": {"name": "Romani, Carpathian - Cyrillic", "native_name": "Романи Чхиб", "char_limit": 41, "model": "rmc-script_cyrillic.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "rmo": {"name": "Romani, Sinte", "native_name": "Romanes", "char_limit": 192, "model": "rmo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rmy-script_latin": {"name": "Romani, Vlax - Latin", "native_name": "Romani Čhib", "char_limit": 72, "model": "rmy-script_latin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rmy-script_cyrillic": {"name": "Romani, Vlax - Cyrillic", "native_name": "Романи Чхиб", "char_limit": 41, "model": "rmy-script_cyrillic.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "ron": {"name": "Romanian", "native_name": "Română", "char_limit": 192, "model": "ron.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rol": {"name": "Romblomanon", "native_name": "Romblomanon", "char_limit": 192, "model": "rol.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cla": {"name": "Ron", "native_name": "Ron", "char_limit": 192, "model": "cla.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rng": {"name": "Ronga", "native_name": "Ronga", "char_limit": 192, "model": "rng.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rug": {"name": "Roviana", "native_name": "Roviana", "char_limit": 192, "model": "rug.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "run": {"name": "Rundi", "native_name": "Ikirundi", "char_limit": 192, "model": "run.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lsm": {"name": "Saamya-Gwe", "native_name": "Saamya-Gwe", "char_limit": 192, "model": "lsm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "spy": {"name": "Sabaot", "native_name": "Sabaot", "char_limit": 192, "model": "spy.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sck": {"name": "Sadri", "native_name": "Sadri", "char_limit": 192, "model": "sck.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "saj": {"name": "Sahu", "native_name": "Sahu", "char_limit": 192, "model": "saj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sch": {"name": "Sakachep", "native_name": "Sakachep", "char_limit": 192, "model": "sch.tar.gz", "punctuation": [",", ";\n", ":", 
";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sml": {"name": "Sama, Central", "native_name": "Sama", "char_limit": 192, "model": "sml.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xsb": {"name": "Sambal", "native_name": "Sambal", "char_limit": 192, "model": "xsb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sbl": {"name": "Sambal, Botolan", "native_name": "Sambal Botolan", "char_limit": 192, "model": "sbl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "saq": {"name": "Samburu", "native_name": "Samburu", "char_limit": 192, "model": "saq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sbd": {"name": "Samo, Southern", "native_name": "Samo", "char_limit": 192, "model": "sbd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "smo": {"name": "Samoan", "native_name": "Gagana fa'a Samoa", "char_limit": 72, "model": "smo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rav": {"name": "Sampang", "native_name": "Sampang", "char_limit": 192, "model": "rav.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sxn": {"name": "Sangir", "native_name": "Sangir", "char_limit": 192, "model": "sxn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sag": {"name": "Sango", "native_name": "Sängö", "char_limit": 192, "model": "sag.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sbp": {"name": "Sangu", "native_name": "Sangu", "char_limit": 192, "model": "sbp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xsu": {"name": "Sanumá", "native_name": "Sanumá", "char_limit": 192, "model": "xsu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "srm": {"name": "Saramaccan", "native_name": "Saramaccan", "char_limit": 192, "model": "srm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sas": {"name": "Sasak", "native_name": "Sasak", "char_limit": 192, "model": "sas.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "apb": {"name": "Sa’a", "native_name": "Sa’a", "char_limit": 192, "model": "apb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sgw": {"name": "Sebat Bet Gurage", "native_name": "Sebat Bet Gurage", "char_limit": 192, "model": "sgw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tvw": {"name": "Sedoa", "native_name": "Sedoa", "char_limit": 192, "model": "tvw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - 
"lip": {"name": "Sekpele", "native_name": "Sekpele", "char_limit": 192, "model": "lip.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "slu": {"name": "Selaru", "native_name": "Selaru", "char_limit": 192, "model": "slu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "snw": {"name": "Selee", "native_name": "Selee", "char_limit": 192, "model": "snw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sea": {"name": "Semai", "native_name": "Semai", "char_limit": 192, "model": "sea.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sza": {"name": "Semelai", "native_name": "Semelai", "char_limit": 192, "model": "sza.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "seh": {"name": "Sena", "native_name": "Sena", "char_limit": 192, "model": "seh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "crs": {"name": "Seychelles French Creole", "native_name": "Kreol Seselwa", "char_limit": 192, "model": "crs.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ksb": {"name": "Shambala", "native_name": "Kishambala", "char_limit": 72, "model": "ksb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "shn": {"name": "Shan", "native_name": "Shan", "char_limit": 192, "model": "shn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sho": {"name": "Shanga", "native_name": "Shanga", "char_limit": 192, "model": "sho.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mcd": {"name": "Sharanahua", "native_name": "Sharanahua", "char_limit": 192, "model": "mcd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cbt": {"name": "Shawi", "native_name": "Shawi", "char_limit": 192, "model": "cbt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xsr": {"name": "Sherpa", "native_name": "ཤར་པཱ", "char_limit": 192, "model": "xsr.tar.gz", "punctuation": ["།", "༄", "༅", "?", "…"]}, - "shk": {"name": "Shilluk", "native_name": "Shilluk", "char_limit": 192, "model": "shk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "shp": {"name": "Shipibo-Conibo", "native_name": "Shipibo-Conibo", "char_limit": 72, "model": "shp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sna": {"name": "Shona", "native_name": "ChiShona", "char_limit": 72, "model": "sna.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cjs": {"name": "Shor", "native_name": "Шор тили", "char_limit": 192, "model": "cjs.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", 
"„", "“", "”", "..."]}, - "jiv": {"name": "Shuar", "native_name": "Shuar", "char_limit": 192, "model": "jiv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "snp": {"name": "Siane", "native_name": "Siane", "char_limit": 192, "model": "snp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sya": {"name": "Siang", "native_name": "Siang", "char_limit": 192, "model": "sya.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sid": {"name": "Sidamo", "native_name": "Sidamo", "char_limit": 192, "model": "sid.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "snn": {"name": "Siona", "native_name": "Siona", "char_limit": 192, "model": "snn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sri": {"name": "Siriano", "native_name": "Siriano", "char_limit": 192, "model": "sri.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "srx": {"name": "Sirmauri", "native_name": "Sirmauri", "char_limit": 192, "model": "srx.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sil": {"name": "Sisaala, Tumulung", "native_name": "Sisaala, Tumulung", "char_limit": 192, "model": "sil.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sld": {"name": "Sissala", "native_name": "Sissala", "char_limit": 192, "model": "sld.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "akp": {"name": "Siwu", "native_name": "Siwu", "char_limit": 192, "model": "akp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xog": {"name": "Soga", "native_name": "Soga", "char_limit": 192, "model": "xog.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "som": {"name": "Somali", "native_name": "Soomaali", "char_limit": 192, "model": "som.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bmu": {"name": "Somba-Siawari", "native_name": "Somba-Siawari", "char_limit": 192, "model": "bmu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "khq": {"name": "Songhay, Koyra Chiini", "native_name": "Songhay, Koyra Chiini", "char_limit": 192, "model": "khq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ses": {"name": "Songhay, Koyraboro Senni", "native_name": "Songhay, Koyraboro Senni", "char_limit": 192, "model": "ses.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mnx": {"name": "Sougb", "native_name": "Sougb", "char_limit": 192, "model": "mnx.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "srn": {"name": "Sranan 
Tongo", "native_name": "Sranan Tongo", "char_limit": 192, "model": "srn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sxb": {"name": "Suba", "native_name": "Suba", "char_limit": 192, "model": "sxb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "suc": {"name": "Subanon, Western", "native_name": "Subanon, Western", "char_limit": 192, "model": "suc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tgo": {"name": "Sudest", "native_name": "Sudest", "char_limit": 192, "model": "tgo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "suk": {"name": "Sukuma", "native_name": "Sukuma", "char_limit": 192, "model": "suk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sun": {"name": "Sunda", "native_name": "Basa Sunda", "char_limit": 192, "model": "sun.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "suz": {"name": "Sunwar", "native_name": "Sunwar", "char_limit": 192, "model": "suz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sgj": {"name": "Surgujia", "native_name": "Surgujia", "char_limit": 192, "model": "sgj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sus": {"name": "Susu", "native_name": "Susu", "char_limit": 192, "model": "sus.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "swh": {"name": "Swahili", "native_name": "Kiswahili", "char_limit": 192, "model": "swh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "swe": {"name": "Swedish", "native_name": "Svenska", "char_limit": 192, "model": "swe.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "syl": {"name": "Sylheti", "native_name": "Sylheti", "char_limit": 192, "model": "syl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dyi": {"name": "Sénoufo, Djimini", "native_name": "Sénoufo, Djimini", "char_limit": 192, "model": "dyi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "myk": {"name": "Sénoufo, Mamara", "native_name": "Sénoufo, Mamara", "char_limit": 192, "model": "myk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "spp": {"name": "Sénoufo, Supyire", "native_name": "Sénoufo, Supyire", "char_limit": 192, "model": "spp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tap": {"name": "Taabwa", "native_name": "Taabwa", "char_limit": 192, "model": "tap.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tby": {"name": "Tabaru", "native_name": "Tabaru", "char_limit": 192, 
"model": "tby.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tna": {"name": "Tacana", "native_name": "Tacana", "char_limit": 192, "model": "tna.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "shi": {"name": "Tachelhit", "native_name": "Tashelḥiyt", "char_limit": 192, "model": "shi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "klw": {"name": "Tado", "native_name": "Tado", "char_limit": 192, "model": "klw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tgl": {"name": "Tagalog", "native_name": "Tagalog", "char_limit": 192, "model": "tgl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tbk": {"name": "Tagbanwa, Calamian", "native_name": "Tagbanwa", "char_limit": 192, "model": "tbk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tgj": {"name": "Tagin", "native_name": "Tagin", "char_limit": 192, "model": "tgj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "blt": {"name": "Tai Dam", "native_name": "Táy Dăm", "char_limit": 72, "model": "blt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tbg": {"name": "Tairora, North", "native_name": "Tairora", "char_limit": 192, "model": "tbg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "omw": {"name": "Tairora, South", "native_name": "Tairora", "char_limit": 192, "model": "omw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tgk": {"name": "Tajik", "native_name": "Тоҷикӣ", "char_limit": 192, "model": "tgk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "tdj": {"name": "Tajio", "native_name": "Tajio", "char_limit": 192, "model": "tdj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tbc": {"name": "Takia", "native_name": "Takia", "char_limit": 192, "model": "tbc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tlj": {"name": "Talinga-Bwisi", "native_name": "Talinga-Bwisi", "char_limit": 192, "model": "tlj.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tly": {"name": "Talysh", "native_name": "Толыши", "char_limit": 72, "model": "tly.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "ttq-script_tifinagh": {"name": "Tamajaq, Tawallammat", "native_name": "ⵜⴰⵎⴰⵌⴰⵇ", "char_limit": 41, "model": "ttq-script_tifinagh.tar.gz", "punctuation": [";\n", ",", "!", "?", "…"]}, - "taj": {"name": "Tamang, Eastern", "native_name": "तामाङ", "char_limit": 72, "model": "taj.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "taq": {"name": "Tamasheq", "native_name": 
"ⵜⴰⵎⴰⵛⵍⵈⵜ", "char_limit": 192, "model": "taq.tar.gz", "punctuation": [";\n", ",", "!", "?", "…"]}, - "tpm": {"name": "Tampulma", "native_name": "Tampulma", "char_limit": 192, "model": "tpm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tgp": {"name": "Tangoa", "native_name": "Tangoa", "char_limit": 192, "model": "tgp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tnn": {"name": "Tanna, North", "native_name": "Tanna", "char_limit": 192, "model": "tnn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tac": {"name": "Tarahumara, Western", "native_name": "Tarahumara", "char_limit": 192, "model": "tac.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rif-script_latin": {"name": "Tarifit - Latin", "native_name": "Tarifit", "char_limit": 192, "model": "rif-script_latin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rif-script_arabic": {"name": "Tarifit - Arabic", "native_name": "ⵜⴰⵔⵉⴼⵉⵜ", "char_limit": 72, "model": "rif-script_arabic.tar.gz", "punctuation": [";\n", ",", "!", "?", "…"]}, - "tat": {"name": "Tatar", "native_name": "татар теле", "char_limit": 192, "model": "tat.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "tav": {"name": "Tatuyo", "native_name": "Tatuyo", "char_limit": 192, "model": "tav.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "twb": {"name": "Tawbuid", "native_name": "Tawbuid", "char_limit": 192, "model": "twb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tbl": {"name": "Tboli", "native_name": "Tboli", "char_limit": 192, "model": "tbl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kps": {"name": "Tehit", "native_name": "Tehit", "char_limit": 192, "model": "kps.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "twe": {"name": "Teiwa", "native_name": "Teiwa", "char_limit": 192, "model": "twe.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ttc": {"name": "Tektiteko", "native_name": "Tektiteko", "char_limit": 192, "model": "ttc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kdh": {"name": "Tem", "native_name": "Tem", "char_limit": 192, "model": "kdh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tes": {"name": "Tengger", "native_name": "Tengger", "char_limit": 192, "model": "tes.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tex": {"name": "Tennet", "native_name": "Tennet", "char_limit": 192, "model": "tex.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tee": {"name": "Tepehua, 
Huehuetla", "native_name": "Tepehua", "char_limit": 192, "model": "tee.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tpp": {"name": "Tepehua, Pisaflores", "native_name": "Tepehua Pisaflores", "char_limit": 192, "model": "tpp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tpt": {"name": "Tepehua, Tlachichilco", "native_name": "Tepehua Tlachichilco", "char_limit": 192, "model": "tpt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "stp": {"name": "Tepehuan, Southeastern", "native_name": "Tepehuan Southeastern", "char_limit": 192, "model": "stp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tfr": {"name": "Teribe", "native_name": "Teribe", "char_limit": 192, "model": "tfr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "twu": {"name": "Termanu", "native_name": "Termanu", "char_limit": 192, "model": "twu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ter": {"name": "Terêna", "native_name": "Terêna", "char_limit": 192, "model": "ter.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tew": {"name": "Tewa", "native_name": "Tewa", "char_limit": 192, "model": "tew.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tha": {"name": "Thai", "native_name": "ไทย", "char_limit": 72, "model": "tha.tar.gz", "punctuation": ["।", "๚", "!", "?", "…"]}, - "nod": {"name": "Thai, Northern", "native_name": "คำเมือง", "char_limit": 192, "model": "nod.tar.gz", "punctuation": ["।", "๚", "!", "?", "…"]}, - "thl": {"name": "Tharu, Dangaura", "native_name": "थारू", "char_limit": 72, "model": "thl.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "tem": {"name": "Themne", "native_name": "Themne", "char_limit": 192, "model": "tem.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "adx": {"name": "Tibetan, Amdo", "native_name": "ཨ་མདོ", "char_limit": 192, "model": "adx.tar.gz", "punctuation": ["།", "༄", "༅", "༈", "…"]}, - "bod": {"name": "Tibetan, Central", "native_name": "བོད", "char_limit": 192, "model": "bod.tar.gz", "punctuation": ["།", "༄", "༅", "༈", "…"]}, - "khg": {"name": "Tibetan, Khams", "native_name": "ཁམས", "char_limit": 192, "model": "khg.tar.gz", "punctuation": ["།", "༄", "༅", "༈", "…"]}, - "tca": {"name": "Ticuna", "native_name": "Ticuna", "char_limit": 192, "model": "tca.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tir": {"name": "Tigrigna", "native_name": "ትግርኛ", "char_limit": 72, "model": "tir.tar.gz", "punctuation": ["።", "፣", "!", "?", "…"]}, - "txq": {"name": "Tii", "native_name": "Tii", "char_limit": 192, "model": "txq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tik": {"name": "Tikar", "native_name": "Tikar", "char_limit": 192, "model": "tik.tar.gz", "punctuation": [",", ";\n", 
":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "dgr": {"name": "Tlicho", "native_name": "Tlicho", "char_limit": 192, "model": "dgr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tob": {"name": "Toba", "native_name": "Toba", "char_limit": 192, "model": "tob.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tmf": {"name": "Toba-Maskoy", "native_name": "Toba-Maskoy", "char_limit": 192, "model": "tmf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tng": {"name": "Tobanga", "native_name": "Tobanga", "char_limit": 192, "model": "tng.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tlb": {"name": "Tobelo", "native_name": "Tobelo", "char_limit": 192, "model": "tlb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ood": {"name": "Tohono O’odham", "native_name": "Tohono O’odham", "char_limit": 192, "model": "ood.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tpi": {"name": "Tok Pisin", "native_name": "Tok Pisin", "char_limit": 192, "model": "tpi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "jic": {"name": "Tol", "native_name": "Tol", "char_limit": 192, "model": "jic.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lbw": {"name": "Tolaki", "native_name": "Tolaki", "char_limit": 192, "model": "lbw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "txa": {"name": "Tombonuo", "native_name": "Tombonuo", "char_limit": 192, "model": "txa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tom": {"name": "Tombulu", "native_name": "Tombulu", "char_limit": 192, "model": "tom.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "toh": {"name": "Tonga", "native_name": "Tonga", "char_limit": 192, "model": "toh.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tnt": {"name": "Tontemboan", "native_name": "Tontemboan", "char_limit": 192, "model": "tnt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sda": {"name": "Toraja-Sa’dan", "native_name": "Toraja-Sa’dan", "char_limit": 192, "model": "sda.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tcs": {"name": "Torres Strait Creole", "native_name": "Torres Strait Creole", "char_limit": 192, "model": "tcs.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "toc": {"name": "Totonac, Coyutla", "native_name": "Totonac, Coyutla", "char_limit": 192, "model": "toc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", 
"{", "}", "—", "-", "...", "“", "”", "/"]}, - "tos": {"name": "Totonac, Highland", "native_name": "Totonac, Highland", "char_limit": 192, "model": "tos.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "neb": {"name": "Toura", "native_name": "Toura", "char_limit": 192, "model": "neb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "trn": {"name": "Trinitario", "native_name": "Trinitario", "char_limit": 192, "model": "trn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "trs": {"name": "Triqui, Chicahuaxtla", "native_name": "Triqui, Chicahuaxtla", "char_limit": 192, "model": "trs.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "trc": {"name": "Triqui, Copala", "native_name": "Triqui, Copala", "char_limit": 192, "model": "trc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tri": {"name": "Trió", "native_name": "Trió", "char_limit": 192, "model": "tri.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cof": {"name": "Tsafiki", "native_name": "Tsafiki", "char_limit": 192, "model": "cof.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tkr": {"name": "Tsakhur", "native_name": "Tsakhur", "char_limit": 192, "model": "tkr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kdl": {"name": "Tsikimba", "native_name": "Tsikimba", "char_limit": 192, "model": "kdl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cas": {"name": "Tsimané", "native_name": "Tsimané", "char_limit": 192, "model": "cas.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tso": {"name": "Tsonga", "native_name": "Tsonga", "char_limit": 192, "model": "tso.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tuo": {"name": "Tucano", "native_name": "Tucano", "char_limit": 192, "model": "tuo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "iou": {"name": "Tuma-Irumu", "native_name": "Tuma-Irumu", "char_limit": 192, "model": "iou.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tmc": {"name": "Tumak", "native_name": "Tumak", "char_limit": 192, "model": "tmc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tuf": {"name": "Tunebo, Central", "native_name": "Tunebo, Central", "char_limit": 192, "model": "tuf.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tuk-script_latin": {"name": "Turkmen - Latin", "native_name": "Türkmençe", "char_limit": 192, "model": "tuk-script_latin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", 
"}", "—", "-", "...", "“", "”", "/"]}, - "tuk-script_arabic": {"name": "Turkmen - Arabic", "native_name": "تركمن", "char_limit": 72, "model": "tuk-script_arabic.tar.gz", "punctuation": ["،", ";\n", "؟", "!", "«", "»", "(", ")", "-", "؛", "..."]}, - "bov": {"name": "Tuwuli", "native_name": "Tuwuli", "char_limit": 192, "model": "bov.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tue": {"name": "Tuyuca", "native_name": "Tuyuca", "char_limit": 192, "model": "tue.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kcg": {"name": "Tyap", "native_name": "Tyap", "char_limit": 192, "model": "kcg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tzh-dialect_bachajón": {"name": "Tzeltal - dialect Bachajón", "native_name": "Tzeltal", "char_limit": 192, "model": "tzh-dialect_bachajón.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tzh-dialect_tenejapa": {"name": "Tzeltal - dialect Tenejapa", "native_name": "Tzeltal", "char_limit": 192, "model": "tzh-dialect_tenejapa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tzo-dialect_chenalhó": {"name": "Tzotzil - dialect Chenalhó", "native_name": "Tzotzil", "char_limit": 192, "model": "tzo-dialect_chenalhó.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tzo-dialect_chamula": {"name": "Tzotzil - dialect Chamula", "native_name": "Tzotzil", "char_limit": 192, "model": "tzo-dialect_chamula.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tzj-dialect_western": {"name": "Tz’utujil - dialect Western", "native_name": "Tz’utujil", "char_limit": 192, "model": "tzj-dialect_western.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tzj-dialect_eastern": {"name": "Tz’utujil - dialect Eastern", "native_name": "Tz’utujil", "char_limit": 192, "model": "tzj-dialect_eastern.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "aoz": {"name": "Uab Meto", "native_name": "Uab Meto", "char_limit": 192, "model": "aoz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "udm": {"name": "Udmurt", "native_name": "Udmurt", "char_limit": 192, "model": "udm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "udu": {"name": "Uduk", "native_name": "Uduk", "char_limit": 192, "model": "udu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ukr": {"name": "Ukrainian", "native_name": "Українська", "char_limit": 72, "model": "ukr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "ppk": {"name": "Uma", "native_name": "Uma", "char_limit": 192, "model": "ppk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ubu": 
{"name": "Umbu-Ungu", "native_name": "Umbu-Ungu", "char_limit": 192, "model": "ubu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "urk": {"name": "Urak Lawoi’", "native_name": "Urak Lawoi’", "char_limit": 192, "model": "urk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ura": {"name": "Urarina", "native_name": "Urarina", "char_limit": 192, "model": "ura.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "urt": {"name": "Urat", "native_name": "Urat", "char_limit": 192, "model": "urt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "urd-script_devanagari": {"name": "Urdu - Devanagari", "native_name": "उर्दू", "char_limit": 72, "model": "urd-script_devanagari.tar.gz", "punctuation": ["।", ",", ";\n", ":", "?", "!", "(", ")", "—", "॥", "..."]}, - "urd-script_arabic": {"name": "Urdu - Arabic", "native_name": "اردو", "char_limit": 72, "model": "urd-script_arabic.tar.gz", "punctuation": ["،", ";\n", "؟", "!", "«", "»", "(", ")", "-", "؛", "..."]}, - "urd-script_latin": {"name": "Urdu - Latin", "native_name": "Urdu", "char_limit": 192, "model": "urd-script_latin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "upv": {"name": "Uripiv-Wala-Rano-Atchin", "native_name": "Uripiv-Wala-Rano-Atchin", "char_limit": 192, "model": "upv.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "usp": {"name": "Uspanteko", "native_name": "Uspanteko", "char_limit": 192, "model": "usp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "uig-script_arabic": {"name": "Uyghur - Arabic", "native_name": "ئۇيغۇر", "char_limit": 72, "model": "uig-script_arabic.tar.gz", "punctuation": ["،", ";\n", "؟", "!", "«", "»", "(", ")", "-", "؛", "..."]}, - "uig-script_cyrillic": {"name": "Uyghur - Cyrillic", "native_name": "Уйғур", "char_limit": 72, "model": "uig-script_cyrillic.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "uzb-script_cyrillic": {"name": "Uzbek", "native_name": "Ўзбек", "char_limit": 72, "model": "uzb-script_cyrillic.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "vag": {"name": "Vagla", "native_name": "Vagla", "char_limit": 192, "model": "vag.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bav": {"name": "Vengo", "native_name": "Vengo", "char_limit": 192, "model": "bav.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "vid": {"name": "Vidunda", "native_name": "Vidunda", "char_limit": 192, "model": "vid.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "vie": {"name": "Vietnamese", "native_name": "Tiếng Việt", "char_limit": 192, "model": "vie.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "vif": {"name": "Vili", 
"native_name": "Vili", "char_limit": 192, "model": "vif.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "vun": {"name": "Vunjo", "native_name": "Vunjo", "char_limit": 192, "model": "vun.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "vut": {"name": "Vute", "native_name": "Vute", "char_limit": 192, "model": "vut.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "prk": {"name": "Wa, Parauk", "native_name": "Wa, Parauk", "char_limit": 192, "model": "prk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "wwa": {"name": "Waama", "native_name": "Waama", "char_limit": 192, "model": "wwa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "rro": {"name": "Waima", "native_name": "Waima", "char_limit": 192, "model": "rro.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "bao": {"name": "Waimaha", "native_name": "Waimaha", "char_limit": 192, "model": "bao.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "waw": {"name": "Waiwai", "native_name": "Waiwai", "char_limit": 192, "model": "waw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "lgl": {"name": "Wala", "native_name": "Wala", "char_limit": 192, "model": "lgl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "wlx": {"name": "Wali", "native_name": "Wali", "char_limit": 192, "model": "wlx.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cou": {"name": "Wamey", "native_name": "Wamey", "char_limit": 192, "model": "cou.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hub": {"name": "Wampís", "native_name": "Wampís", "char_limit": 192, "model": "hub.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gvc": {"name": "Wanano", "native_name": "Wanano", "char_limit": 192, "model": "gvc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "mfi": {"name": "Wandala", "native_name": "Wandala", "char_limit": 192, "model": "mfi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "wap": {"name": "Wapishana", "native_name": "Wapishana", "char_limit": 192, "model": "wap.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "wba": {"name": "Warao", "native_name": "Warao", "char_limit": 192, "model": "wba.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "war": {"name": "Waray-Waray", "native_name": "Waray-Waray", "char_limit": 192, "model": "war.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", 
"{", "}", "—", "-", "...", "“", "”", "/"]}, - "way": {"name": "Wayana", "native_name": "Wayana", "char_limit": 192, "model": "way.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "guc": {"name": "Wayuu", "native_name": "Wayuu", "char_limit": 192, "model": "guc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "cym": {"name": "Welsh", "native_name": "Cymraeg", "char_limit": 192, "model": "cym.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kvw": {"name": "Wersing", "native_name": "Wersing", "char_limit": 192, "model": "kvw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tnp": {"name": "Whitesands", "native_name": "Whitesands", "char_limit": 192, "model": "tnp.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "hto": {"name": "Witoto, Minika", "native_name": "Witoto, Minika", "char_limit": 192, "model": "hto.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "huu": {"name": "Witoto, Murui", "native_name": "Witoto, Murui", "char_limit": 192, "model": "huu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "wal-script_latin": {"name": "Wolaytta - Latin", "native_name": "Wolaytta", "char_limit": 192, "model": "wal-script_latin.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "wal-script_ethiopic": {"name": "Wolaytta - Ethiopic", "native_name": "ወላይታ", "char_limit": 192, "model": "wal-script_ethiopic.tar.gz", "punctuation": ["።", "፣", "፤", "፧", "…"]}, - "wlo": {"name": "Wolio", "native_name": "Wolio", "char_limit": 192, "model": "wlo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "noa": {"name": "Woun Meu", "native_name": "Woun Meu", "char_limit": 192, "model": "noa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "wob": {"name": "Wè Northern", "native_name": "Wè", "char_limit": 192, "model": "wob.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "kao": {"name": "Xaasongaxango", "native_name": "Xaasongaxango", "char_limit": 192, "model": "kao.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "xer": {"name": "Xerénte", "native_name": "Xerénte", "char_limit": 192, "model": "xer.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yad": {"name": "Yagua", "native_name": "Yagua", "char_limit": 192, "model": "yad.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yka": {"name": "Yakan", "native_name": "Yakan", "char_limit": 192, "model": "yka.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "sah": {"name": "Yakut", "native_name": 
"Саха", "char_limit": 192, "model": "sah.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "«", "»", "—", "-", "(", ")", "…", "„", "“", "”", "..."]}, - "yba": {"name": "Yala", "native_name": "Yala", "char_limit": 192, "model": "yba.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yli": {"name": "Yali, Angguruk", "native_name": "Yali, Angguruk", "char_limit": 192, "model": "yli.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "nlk": {"name": "Yali, Ninia", "native_name": "Yali, Ninia", "char_limit": 192, "model": "nlk.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yal": {"name": "Yalunka", "native_name": "Yalunka", "char_limit": 192, "model": "yal.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yam": {"name": "Yamba", "native_name": "Yamba", "char_limit": 192, "model": "yam.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yat": {"name": "Yambeta", "native_name": "Yambeta", "char_limit": 192, "model": "yat.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "jmd": {"name": "Yamdena", "native_name": "Yamdena", "char_limit": 192, "model": "jmd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "tao": {"name": "Yami", "native_name": "Yami", "char_limit": 192, "model": "tao.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yaa": {"name": "Yaminahua", "native_name": "Yaminahua", "char_limit": 192, "model": "yaa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ame": {"name": "Yanesha’", "native_name": "Yanesha’", "char_limit": 192, "model": "ame.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "guu": {"name": "Yanomamö", "native_name": "Yanomamö", "char_limit": 192, "model": "guu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yao": {"name": "Yao", "native_name": "Yao", "char_limit": 192, "model": "yao.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yre": {"name": "Yaouré", "native_name": "Yaouré", "char_limit": 192, "model": "yre.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yva": {"name": "Yawa", "native_name": "Yawa", "char_limit": 192, "model": "yva.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ybb": {"name": "Yemba", "native_name": "Yemba", "char_limit": 192, "model": "ybb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pib": {"name": "Yine", "native_name": "Yine", "char_limit": 192, "model": "pib.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", 
"—", "-", "...", "“", "”", "/"]}, - "byr": {"name": "Yipma", "native_name": "Yipma", "char_limit": 192, "model": "byr.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "pil": {"name": "Yom", "native_name": "Yom", "char_limit": 192, "model": "pil.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ycn": {"name": "Yucuna", "native_name": "Yucuna", "char_limit": 192, "model": "ycn.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ess": {"name": "Yupik, Saint Lawrence Island", "native_name": "Yupigestun", "char_limit": 72, "model": "ess.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "yuz": {"name": "Yuracare", "native_name": "Yuracare", "char_limit": 192, "model": "yuz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "atb": {"name": "Zaiwa", "native_name": "Zaiwa", "char_limit": 192, "model": "atb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zne": {"name": "Zande", "native_name": "Zande", "char_limit": 192, "model": "zne.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zaq": {"name": "Zapotec, Aloápam", "native_name": "Aloápam Zapotec", "char_limit": 72, "model": "zaq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zpo": {"name": "Zapotec, Amatlán", "native_name": "Amatlán Zapotec", "char_limit": 72, "model": "zpo.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zad": {"name": "Zapotec, Cajonos", "native_name": "Cajonos Zapotec", "char_limit": 72, "model": "zad.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zpc": {"name": "Zapotec, Choapan", "native_name": "Choapan Zapotec", "char_limit": 72, "model": "zpc.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zca": {"name": "Zapotec, Coatecas Altas", "native_name": "Coatecas Altas Zapotec", "char_limit": 72, "model": "zca.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zpg": {"name": "Zapotec, Guevea de Humboldt", "native_name": "Guevea de Humboldt Zapotec", "char_limit": 192, "model": "zpg.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zai": {"name": "Zapotec, Isthmus", "native_name": "Isthmus Zapotec", "char_limit": 72, "model": "zai.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zpl": {"name": "Zapotec, Lachixío", "native_name": "Lachixío Zapotec", "char_limit": 72, "model": "zpl.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zam": {"name": "Zapotec, Miahuatlán", "native_name": "Miahuatlán Zapotec", "char_limit": 72, "model": "zam.tar.gz", 
"punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zaw": {"name": "Zapotec, Mitla", "native_name": "Mitla Zapotec", "char_limit": 72, "model": "zaw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zpm": {"name": "Zapotec, Mixtepec", "native_name": "Mixtepec Zapotec", "char_limit": 72, "model": "zpm.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zac": {"name": "Zapotec, Ocotlán", "native_name": "Ocotlán Zapotec", "char_limit": 72, "model": "zac.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zao": {"name": "Zapotec, Ozolotepec", "native_name": "Ozolotepec Zapotec", "char_limit": 72, "model": "zao.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ztq": {"name": "Zapotec, Quioquitani-Quierí", "native_name": "Quioquitani-Quierí Zapotec", "char_limit": 192, "model": "ztq.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zar": {"name": "Zapotec, Rincón", "native_name": "Rincón Zapotec", "char_limit": 72, "model": "zar.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zpt": {"name": "Zapotec, San Vicente Coatlán", "native_name": "San Vicente Coatlán Zapotec", "char_limit": 192, "model": "zpt.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zpi": {"name": "Zapotec, Santa María Quiegolani", "native_name": "Santa María Quiegolani Zapotec", "char_limit": 192, "model": "zpi.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zas": {"name": "Zapotec, Santo Domingo Albarradas", "native_name": "Santo Domingo Albarradas Zapotec", "char_limit": 192, "model": "zas.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zaa": {"name": "Zapotec, Sierra de Juárez", "native_name": "Sierra de Juárez Zapotec", "char_limit": 192, "model": "zaa.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zpz": {"name": "Zapotec, Texmelucan", "native_name": "Texmelucan Zapotec", "char_limit": 72, "model": "zpz.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zab": {"name": "Zapotec, Western Tlacolula Valley", "native_name": "Western Tlacolula Valley Zapotec", "char_limit": 192, "model": "zab.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zpu": {"name": "Zapotec, Yalálag", "native_name": "Yalálag Zapotec", "char_limit": 72, "model": "zpu.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zae": {"name": "Zapotec, Yareni", "native_name": "Yareni Zapotec", "char_limit": 72, "model": "zae.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zty": {"name": "Zapotec, 
Yatee", "native_name": "Yatee Zapotec", "char_limit": 72, "model": "zty.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zav": {"name": "Zapotec, Yatzachi", "native_name": "Yatzachi Zapotec", "char_limit": 72, "model": "zav.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zza": {"name": "Zaza", "native_name": "Zazaki", "char_limit": 72, "model": "zza.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zyb": {"name": "Zhuang, Yongbei", "native_name": "Yongbei Zhuang", "char_limit": 72, "model": "zyb.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "ziw": {"name": "Zigula", "native_name": "Zigula", "char_limit": 192, "model": "ziw.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "zos": {"name": "Zoque, Francisco León", "native_name": "Francisco León Zoque", "char_limit": 72, "model": "zos.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]}, - "gnd": {"name": "Zulgo-Gemzek", "native_name": "Zulgo-Gemzek", "char_limit": 192, "model": "gnd.tar.gz", "punctuation": [",", ";\n", ":", ";", "?", "!", "\"", "(", ")", "[", "]", "{", "}", "—", "-", "...", "“", "”", "/"]} -} diff --git a/lib/tokenizer.py b/lib/tokenizer.py deleted file mode 100644 index 9dd3e13b6a109f41b8b51d67b0794b7a90d6f7bb..0000000000000000000000000000000000000000 --- a/lib/tokenizer.py +++ /dev/null @@ -1,906 +0,0 @@ -import logging -import os -import re -import textwrap -from functools import cached_property - -import torch -from num2words import num2words -from spacy.lang.ar import Arabic -from spacy.lang.en import English -from spacy.lang.es import Spanish -from spacy.lang.hi import Hindi -from spacy.lang.ja import Japanese -from spacy.lang.zh import Chinese -from tokenizers import Tokenizer - -from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words - -logger = logging.getLogger(__name__) - - -def get_spacy_lang(lang): - """Return Spacy language used for sentence splitting.""" - if lang == "zh": - return Chinese() - elif lang == "ja": - return Japanese() - elif lang == "ar": - return Arabic() - elif lang == "es": - return Spanish() - elif lang == "hi": - return Hindi() - else: - # For most languages, English does the job - return English() - - -def split_sentence(text, lang, text_split_length=250): - """Preprocess the input text""" - text_splits = [] - if text_split_length is not None and len(text) >= text_split_length: - text_splits.append("") - nlp = get_spacy_lang(lang) - nlp.add_pipe("sentencizer") - doc = nlp(text) - for sentence in doc.sents: - if len(text_splits[-1]) + len(str(sentence)) <= text_split_length: - # if the last sentence + the current sentence is less than the text_split_length - # then add the current sentence to the last sentence - text_splits[-1] += " " + str(sentence) - text_splits[-1] = text_splits[-1].lstrip() - elif len(str(sentence)) > text_split_length: - # if the current sentence is greater than the text_split_length - for line in textwrap.wrap( - str(sentence), - width=text_split_length, - drop_whitespace=True, - break_on_hyphens=False, - tabsize=1, - ): - text_splits.append(str(line)) - else: - 
text_splits.append(str(sentence)) - - if len(text_splits) > 1: - if text_splits[0] == "": - del text_splits[0] - else: - text_splits = [text.lstrip()] - - return text_splits - - -_whitespace_re = re.compile(r"\s+") - -# List of (regular expression, replacement) pairs for abbreviations: -_abbreviations = { - "en": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - ("mrs", "misess"), - ("mr", "mister"), - ("dr", "doctor"), - ("st", "saint"), - ("co", "company"), - ("jr", "junior"), - ("maj", "major"), - ("gen", "general"), - ("drs", "doctors"), - ("rev", "reverend"), - ("lt", "lieutenant"), - ("hon", "honorable"), - ("sgt", "sergeant"), - ("capt", "captain"), - ("esq", "esquire"), - ("ltd", "limited"), - ("col", "colonel"), - ("ft", "fort"), - ] - ], - "es": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - ("sra", "señora"), - ("sr", "señor"), - ("dr", "doctor"), - ("dra", "doctora"), - ("st", "santo"), - ("co", "compañía"), - ("jr", "junior"), - ("ltd", "limitada"), - ] - ], - "fr": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - ("mme", "madame"), - ("mr", "monsieur"), - ("dr", "docteur"), - ("st", "saint"), - ("co", "compagnie"), - ("jr", "junior"), - ("ltd", "limitée"), - ] - ], - "de": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - ("fr", "frau"), - ("dr", "doktor"), - ("st", "sankt"), - ("co", "firma"), - ("jr", "junior"), - ] - ], - "pt": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - ("sra", "senhora"), - ("sr", "senhor"), - ("dr", "doutor"), - ("dra", "doutora"), - ("st", "santo"), - ("co", "companhia"), - ("jr", "júnior"), - ("ltd", "limitada"), - ] - ], - "it": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - # ("sig.ra", "signora"), - ("sig", "signore"), - ("dr", "dottore"), - ("st", "santo"), - ("co", "compagnia"), - ("jr", "junior"), - ("ltd", "limitata"), - ] - ], - "pl": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - ("p", "pani"), - ("m", "pan"), - ("dr", "doktor"), - ("sw", "święty"), - ("jr", "junior"), - ] - ], - "ar": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - # There are not many common abbreviations in Arabic as in English. - ] - ], - "zh": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts. - ] - ], - "cs": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - ("dr", "doktor"), # doctor - ("ing", "inženýr"), # engineer - ("p", "pan"), # Could also map to pani for woman but no easy way to do it - # Other abbreviations would be specialized and not as common. - ] - ], - "ru": [ - (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1]) - for x in [ - ("г-жа", "госпожа"), # Mrs. - ("г-н", "господин"), # Mr. - ("д-р", "доктор"), # doctor - # Other abbreviations are less common or specialized. - ] - ], - "nl": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - ("dhr", "de heer"), # Mr. - ("mevr", "mevrouw"), # Mrs. - ("dr", "dokter"), # doctor - ("jhr", "jonkheer"), # young lord or nobleman - # Dutch uses more abbreviations, but these are the most common ones. - ] - ], - "tr": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - ("b", "bay"), # Mr. - ("byk", "büyük"), # büyük - ("dr", "doktor"), # doctor - # Add other Turkish abbreviations here if needed. - ] - ], - "hu": [ - (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) - for x in [ - ("dr", "doktor"), # doctor - ("b", "bácsi"), # Mr. - ("nőv", "nővér"), # nurse - # Add other Hungarian abbreviations here if needed. - ] - ], - "ko": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - # Korean doesn't typically use abbreviations in the same way as Latin-based scripts. - ] - ], - "hi": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - # Hindi doesn't typically use abbreviations in the same way as Latin-based scripts. - ] - - ], - "vi": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - ("ông", "ông"), # Mr. - ("bà", "bà"), # Mrs. - ("dr", "bác sĩ"), # doctor - ("ts", "tiến sĩ"), # PhD - ("st", "số thứ tự"), # ordinal - ] - ], -} - - -def expand_abbreviations_multilingual(text, lang="en"): - for regex, replacement in _abbreviations[lang]: - text = re.sub(regex, replacement, text) - return text - - -_symbols_multilingual = { - "en": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " and "), - ("@", " at "), - ("%", " percent "), - ("#", " hash "), - ("$", " dollar "), - ("£", " pound "), - ("°", " degree "), - ] - ], - "es": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " y "), - ("@", " arroba "), - ("%", " por ciento "), - ("#", " numeral "), - ("$", " dolar "), - ("£", " libra "), - ("°", " grados "), - ] - ], - "fr": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " et "), - ("@", " arobase "), - ("%", " pour cent "), - ("#", " dièse "), - ("$", " dollar "), - ("£", " livre "), - ("°", " degrés "), - ] - ], - "de": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " und "), - ("@", " at "), - ("%", " prozent "), - ("#", " raute "), - ("$", " dollar "), - ("£", " pfund "), - ("°", " grad "), - ] - ], - "pt": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " e "), - ("@", " arroba "), - ("%", " por cento "), - ("#", " cardinal "), - ("$", " dólar "), - ("£", " libra "), - ("°", " graus "), - ] - ], - "it": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " e "), - ("@", " chiocciola "), - ("%", " per cento "), - ("#", " cancelletto "), - ("$", " dollaro "), - ("£", " sterlina "), - ("°", " gradi "), - ] - ], - "pl": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " i "), - ("@", " małpa "), - ("%", " procent "), - ("#", " krzyżyk "), - ("$", " dolar "), - ("£", " funt "), - ("°", " stopnie "), - ] - ], - "ar": [ - # Arabic - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " و "), - ("@", " على "), - ("%", " في المئة "), - ("#", " رقم "), - ("$", " دولار "), - ("£", " جنيه "), - ("°", " درجة "), - ] - ], - "zh": [ - # Chinese - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " 和 "), - ("@", " 在 "), - ("%", " 百分之 "), - ("#", " 号 "), - ("$", " 美元 "), - ("£", " 英镑 "), - ("°", " 度 "), - ] - ], - "cs": [ - # Czech - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " a "), - ("@", " na "), - ("%", " procento "), - ("#", " křížek "), - ("$", " dolar "), - ("£", " libra "), - ("°", " stupně "), - ] - ], - "ru": [ - # Russian - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " и "), - ("@", " собака "), - ("%", " процентов "), - ("#", " номер "), - ("$", " доллар "), - ("£", " фунт "), - ("°", " градус "), 
- ] - ], - "nl": [ - # Dutch - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " en "), - ("@", " bij "), - ("%", " procent "), - ("#", " hekje "), - ("$", " dollar "), - ("£", " pond "), - ("°", " graden "), - ] - ], - "tr": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " ve "), - ("@", " at "), - ("%", " yüzde "), - ("#", " diyez "), - ("$", " dolar "), - ("£", " sterlin "), - ("°", " derece "), - ] - ], - "hu": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " és "), - ("@", " kukac "), - ("%", " százalék "), - ("#", " kettőskereszt "), - ("$", " dollár "), - ("£", " font "), - ("°", " fok "), - ] - ], - "ko": [ - # Korean - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " 그리고 "), - ("@", " 에 "), - ("%", " 퍼센트 "), - ("#", " 번호 "), - ("$", " 달러 "), - ("£", " 파운드 "), - ("°", " 도 "), - ] - ], - "hi": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " और "), - ("@", " ऐट दी रेट "), - ("%", " प्रतिशत "), - ("#", " हैश "), - ("$", " डॉलर "), - ("£", " पाउंड "), - ("°", " डिग्री "), - ] - ], - "vi": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) - for x in [ - ("&", " và "), # and - ("@", " a còng "), # at - ("%", " phần trăm "), # percent - ("#", " dấu thăng "), # hash - ("$", " đô la "), # dollar - ("£", " bảng Anh "), # pound - ("°", " độ "), # degree - ] - ], -} - - -def expand_symbols_multilingual(text, lang="en"): - for regex, replacement in _symbols_multilingual[lang]: - text = re.sub(regex, replacement, text) - text = text.replace("  ", " ") # Ensure there are no double spaces - return text.strip() - - -_ordinal_re = { - "en": re.compile(r"([0-9]+)(st|nd|rd|th)"), - "es": re.compile(r"([0-9]+)(º|ª|er|o|a|os|as)"), - "fr": re.compile(r"([0-9]+)(º|ª|er|re|e|ème)"), - "de": re.compile(r"([0-9]+)(st|nd|rd|th|º|ª|\.(?=\s|$))"), - "pt": re.compile(r"([0-9]+)(º|ª|o|a|os|as)"), - "it": re.compile(r"([0-9]+)(º|°|ª|o|a|i|e)"), - "pl": re.compile(r"([0-9]+)(º|ª|st|nd|rd|th)"), - "ar": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"), - "cs": re.compile(r"([0-9]+)\.(?=\s|$)"), # In Czech, a dot is often used after the number to indicate ordinals. - "ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"), - "nl": re.compile(r"([0-9]+)(de|ste|e)"), - "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü)"), - "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|ödik|ödike|ik)"), - "ko": re.compile(r"([0-9]+)(번째|번|차|째)"), - "hi": re.compile(r"([0-9]+)(st|nd|rd|th)"), # To check - "vi": re.compile(r"([0-9]+)(th|thứ)?"), # Matches "1", "thứ 1", "2", "thứ 2" -} -_number_re = re.compile(r"[0-9]+") -_currency_re = { - "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"), - "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"), - "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"), -} - -_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b") -_dot_number_re = re.compile(r"\b\d{1,3}(\.\d{3})*(\,\d+)?\b") -_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)") - - -def _remove_commas(m): - text = m.group(0) - if "," in text: - text = text.replace(",", "") - return text - - -def _remove_dots(m): - text = m.group(0) - if "."
in text: - text = text.replace(".", "") - return text - - -def _expand_decimal_point(m, lang="en"): - amount = m.group(1).replace(",", ".") - return num2words(float(amount), lang=lang if lang != "cs" else "cz") - - -def _expand_currency(m, lang="en", currency="USD"): - amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", ".")))) - full_amount = num2words(amount, to="currency", currency=currency, lang=lang if lang != "cs" else "cz") - - and_equivalents = { - "en": ", ", - "es": " con ", - "fr": " et ", - "de": " und ", - "pt": " e ", - "it": " e ", - "pl": ", ", - "cs": ", ", - "ru": ", ", - "nl": ", ", - "ar": ", ", - "tr": ", ", - "hu": ", ", - "ko": ", ", - "hi": ", ", - } - - if amount.is_integer(): - last_and = full_amount.rfind(and_equivalents[lang]) - if last_and != -1: - full_amount = full_amount[:last_and] - - return full_amount - - -def _expand_ordinal(m, lang="en"): - return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz") - - -def _expand_number(m, lang="en"): - return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz") - - -def expand_numbers_multilingual(text, lang="en"): - if lang == "zh": - text = zh_num2words()(text) - else: - if lang in ["en", "ru"]: - text = re.sub(_comma_number_re, _remove_commas, text) - else: - text = re.sub(_dot_number_re, _remove_dots, text) - try: - text = re.sub(_currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text) - text = re.sub(_currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text) - text = re.sub(_currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text) - except: - pass - if lang != "tr": - text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text) - text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text) - text = re.sub(_number_re, lambda m: _expand_number(m, lang), text) - return text - - -def lowercase(text): - return text.lower() - - -def collapse_whitespace(text): - return re.sub(_whitespace_re, " ", text) - - -def multilingual_cleaners(text, lang): - text = text.replace('"', "") - if lang == "tr": - text = text.replace("İ", "i") - text = text.replace("Ö", "ö") - text = text.replace("Ü", "ü") - text = lowercase(text) - text = expand_numbers_multilingual(text, lang) - text = expand_abbreviations_multilingual(text, lang) - text = expand_symbols_multilingual(text, lang=lang) - text = collapse_whitespace(text) - return text - - -def basic_cleaners(text): - """Basic pipeline that lowercases and collapses whitespace without transliteration.""" - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def chinese_transliterate(text): - try: - import pypinyin - except ImportError as e: - raise ImportError("Chinese requires: pypinyin") from e - return "".join( - [p[0] for p in pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)] - ) - - -def japanese_cleaners(text, katsu): - text = katsu.romaji(text) - text = lowercase(text) - return text - - -def korean_transliterate(text): - try: - from hangul_romanize import Transliter - from hangul_romanize.rule import academic - except ImportError as e: - raise ImportError("Korean requires: hangul_romanize") from e - r = Transliter(academic) - return r.translit(text) - - -DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tokenizer.json") - - -class VoiceBpeTokenizer: - def __init__(self, vocab_file=None): - self.tokenizer = None - if vocab_file is not None: - self.tokenizer 
= Tokenizer.from_file(vocab_file) - self.char_limits = { - "en": 250, - "de": 253, - "fr": 273, - "es": 239, - "it": 213, - "pt": 203, - "pl": 224, - "zh": 82, - "ar": 166, - "cs": 186, - "ru": 182, - "nl": 251, - "tr": 226, - "ja": 71, - "hu": 224, - "ko": 95, - "hi": 150, - "vi": 250, - } - - @cached_property - def katsu(self): - import cutlet - - return cutlet.Cutlet() - - def check_input_length(self, txt, lang): - lang = lang.split("-")[0] # remove the region - limit = self.char_limits.get(lang, 250) - if len(txt) > limit: - logger.warning( - "The text length exceeds the character limit of %d for language '%s', this might cause truncated audio.", - limit, - lang, - ) - - def preprocess_text(self, txt, lang): - if lang in {"ar", "cs", "de", "en", "es", "fr", "hi", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko", "vi"}: - txt = multilingual_cleaners(txt, lang) - if lang == "zh": - txt = chinese_transliterate(txt) - if lang == "ko": - txt = korean_transliterate(txt) - elif lang == "ja": - txt = japanese_cleaners(txt, self.katsu) - else: - raise NotImplementedError(f"Language '{lang}' is not supported.") - return txt - - def encode(self, txt, lang): - lang = lang.split("-")[0] # remove the region - self.check_input_length(txt, lang) - txt = self.preprocess_text(txt, lang) - lang = "zh-cn" if lang == "zh" else lang - txt = f"[{lang}]{txt}" - txt = txt.replace(" ", "[SPACE]") - return self.tokenizer.encode(txt).ids - - def decode(self, seq): - if isinstance(seq, torch.Tensor): - seq = seq.cpu().numpy() - txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "") - txt = txt.replace("[SPACE]", " ") - txt = txt.replace("[STOP]", "") - txt = txt.replace("[UNK]", "") - return txt - - def __len__(self): - return self.tokenizer.get_vocab_size() - - def get_number_tokens(self): - return max(self.tokenizer.get_vocab().values()) + 1 - - -def test_expand_numbers_multilingual(): - test_cases = [ - # English - ("In 12.5 seconds.", "In twelve point five seconds.", "en"), - ("There were 50 soldiers.", "There were fifty soldiers.", "en"), - ("This is a 1st test", "This is a first test", "en"), - ("That will be $20 sir.", "That will be twenty dollars sir.", "en"), - ("That will be 20€ sir.", "That will be twenty euro sir.", "en"), - ("That will be 20.15€ sir.", "That will be twenty euro, fifteen cents sir.", "en"), - ("That's 100,000.5.", "That's one hundred thousand point five.", "en"), - # French - ("En 12,5 secondes.", "En douze virgule cinq secondes.", "fr"), - ("Il y avait 50 soldats.", "Il y avait cinquante soldats.", "fr"), - ("Ceci est un 1er test", "Ceci est un premier test", "fr"), - ("Cela vous fera $20 monsieur.", "Cela vous fera vingt dollars monsieur.", "fr"), - ("Cela vous fera 20€ monsieur.", "Cela vous fera vingt euros monsieur.", "fr"), - ("Cela vous fera 20,15€ monsieur.", "Cela vous fera vingt euros et quinze centimes monsieur.", "fr"), - ("Ce sera 100.000,5.", "Ce sera cent mille virgule cinq.", "fr"), - # German - ("In 12,5 Sekunden.", "In zwölf Komma fünf Sekunden.", "de"), - ("Es gab 50 Soldaten.", "Es gab fünfzig Soldaten.", "de"), - ("Dies ist ein 1. 
Test", "Dies ist ein erste Test", "de"), # Issue with gender - ("Das macht $20 Herr.", "Das macht zwanzig Dollar Herr.", "de"), - ("Das macht 20€ Herr.", "Das macht zwanzig Euro Herr.", "de"), - ("Das macht 20,15€ Herr.", "Das macht zwanzig Euro und fünfzehn Cent Herr.", "de"), - # Spanish - ("En 12,5 segundos.", "En doce punto cinco segundos.", "es"), - ("Había 50 soldados.", "Había cincuenta soldados.", "es"), - ("Este es un 1er test", "Este es un primero test", "es"), - ("Eso le costará $20 señor.", "Eso le costará veinte dólares señor.", "es"), - ("Eso le costará 20€ señor.", "Eso le costará veinte euros señor.", "es"), - ("Eso le costará 20,15€ señor.", "Eso le costará veinte euros con quince céntimos señor.", "es"), - # Italian - ("In 12,5 secondi.", "In dodici virgola cinque secondi.", "it"), - ("C'erano 50 soldati.", "C'erano cinquanta soldati.", "it"), - ("Questo è un 1° test", "Questo è un primo test", "it"), - ("Ti costerà $20 signore.", "Ti costerà venti dollari signore.", "it"), - ("Ti costerà 20€ signore.", "Ti costerà venti euro signore.", "it"), - ("Ti costerà 20,15€ signore.", "Ti costerà venti euro e quindici centesimi signore.", "it"), - # Portuguese - ("Em 12,5 segundos.", "Em doze vírgula cinco segundos.", "pt"), - ("Havia 50 soldados.", "Havia cinquenta soldados.", "pt"), - ("Este é um 1º teste", "Este é um primeiro teste", "pt"), - ("Isso custará $20 senhor.", "Isso custará vinte dólares senhor.", "pt"), - ("Isso custará 20€ senhor.", "Isso custará vinte euros senhor.", "pt"), - ( - "Isso custará 20,15€ senhor.", - "Isso custará vinte euros e quinze cêntimos senhor.", - "pt", - ), # "cêntimos" should be "centavos" num2words issue - # Polish - ("W 12,5 sekundy.", "W dwanaście przecinek pięć sekundy.", "pl"), - ("Było 50 żołnierzy.", "Było pięćdziesiąt żołnierzy.", "pl"), - ("To będzie kosztować 20€ panie.", "To będzie kosztować dwadzieścia euro panie.", "pl"), - ("To będzie kosztować 20,15€ panie.", "To będzie kosztować dwadzieścia euro, piętnaście centów panie.", "pl"), - # Arabic - ("في الـ 12,5 ثانية.", "في الـ اثنا عشر , خمسون ثانية.", "ar"), - ("كان هناك 50 جنديًا.", "كان هناك خمسون جنديًا.", "ar"), - # ("ستكون النتيجة $20 يا سيد.", 'ستكون النتيجة عشرون دولار يا سيد.', 'ar'), # $ and € are mising from num2words - # ("ستكون النتيجة 20€ يا سيد.", 'ستكون النتيجة عشرون يورو يا سيد.', 'ar'), - # Czech - ("Za 12,5 vteřiny.", "Za dvanáct celá pět vteřiny.", "cs"), - ("Bylo tam 50 vojáků.", "Bylo tam padesát vojáků.", "cs"), - ("To bude stát 20€ pane.", "To bude stát dvacet euro pane.", "cs"), - ("To bude 20.15€ pane.", "To bude dvacet euro, patnáct centů pane.", "cs"), - # Russian - ("Через 12.5 секунды.", "Через двенадцать запятая пять секунды.", "ru"), - ("Там было 50 солдат.", "Там было пятьдесят солдат.", "ru"), - ("Это будет 20.15€ сэр.", "Это будет двадцать евро, пятнадцать центов сэр.", "ru"), - ("Это будет стоить 20€ господин.", "Это будет стоить двадцать евро господин.", "ru"), - # Dutch - ("In 12,5 seconden.", "In twaalf komma vijf seconden.", "nl"), - ("Er waren 50 soldaten.", "Er waren vijftig soldaten.", "nl"), - ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"), - ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"), - # Chinese (Simplified) - ("在12.5秒内", "在十二点五秒内", "zh"), - ("有50名士兵", "有五十名士兵", "zh"), - # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work - # ("那将是20€先生", '那将是二十欧元先生', 'zh'), - # Turkish - # ("12,5 saniye içinde.", 'On iki virgül beş saniye içinde.', 'tr'), # decimal doesn't work for TR - 
("50 asker vardı.", "elli asker vardı.", "tr"), - ("Bu 1. test", "Bu birinci test", "tr"), - # ("Bu 100.000,5.", 'Bu yüz bin virgül beş.', 'tr'), - # Hungarian - ("12,5 másodperc alatt.", "tizenkettő egész öt tized másodperc alatt.", "hu"), - ("50 katona volt.", "ötven katona volt.", "hu"), - ("Ez az 1. teszt", "Ez az első teszt", "hu"), - # Korean - ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"), - ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"), - ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"), - # Hindi - ("12.5 सेकंड में।", "साढ़े बारह सेकंड में।", "hi"), - ("50 सैनिक थे।", "पचास सैनिक थे।", "hi"), - ] - for a, b, lang in test_cases: - out = expand_numbers_multilingual(a, lang=lang) - assert out == b, f"'{out}' vs '{b}'" - - -def test_abbreviations_multilingual(): - test_cases = [ - # English - ("Hello Mr. Smith.", "Hello mister Smith.", "en"), - ("Dr. Jones is here.", "doctor Jones is here.", "en"), - # Spanish - ("Hola Sr. Garcia.", "Hola señor Garcia.", "es"), - ("La Dra. Martinez es muy buena.", "La doctora Martinez es muy buena.", "es"), - # French - ("Bonjour Mr. Dupond.", "Bonjour monsieur Dupond.", "fr"), - ("Mme. Moreau est absente aujourd'hui.", "madame Moreau est absente aujourd'hui.", "fr"), - # German - ("Frau Dr. Müller ist sehr klug.", "Frau doktor Müller ist sehr klug.", "de"), - # Portuguese - ("Olá Sr. Silva.", "Olá senhor Silva.", "pt"), - ("Dra. Costa, você está disponível?", "doutora Costa, você está disponível?", "pt"), - # Italian - ("Buongiorno, Sig. Rossi.", "Buongiorno, signore Rossi.", "it"), - # ("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'), # Issue with matching that pattern - # Polish - ("Dzień dobry, P. Kowalski.", "Dzień dobry, pani Kowalski.", "pl"), - ("M. Nowak, czy mogę zadać pytanie?", "pan Nowak, czy mogę zadać pytanie?", "pl"), - # Czech - ("P. Novák", "pan Novák", "cs"), - ("Dr. Vojtěch", "doktor Vojtěch", "cs"), - # Dutch - ("Dhr. Jansen", "de heer Jansen", "nl"), - ("Mevr. de Vries", "mevrouw de Vries", "nl"), - # Russian - ("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов.", "ru"), - ("Д-р Смирнов здесь, чтобы увидеть вас.", "доктор Смирнов здесь, чтобы увидеть вас.", "ru"), - # Turkish - ("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz.", "tr"), - ("Dr. Ayşe burada.", "doktor Ayşe burada.", "tr"), - # Hungarian - ("Dr. 
Szabó itt van.", "doktor Szabó itt van.", "hu"), - ] - - for a, b, lang in test_cases: - out = expand_abbreviations_multilingual(a, lang=lang) - assert out == b, f"'{out}' vs '{b}'" - - -def test_symbols_multilingual(): - test_cases = [ - ("I have 14% battery", "I have 14 percent battery", "en"), - ("Te veo @ la fiesta", "Te veo arroba la fiesta", "es"), - ("J'ai 14° de fièvre", "J'ai 14 degrés de fièvre", "fr"), - ("Die Rechnung beträgt £ 20", "Die Rechnung beträgt pfund 20", "de"), - ("O meu email é ana&joao@gmail.com", "O meu email é ana e joao arroba gmail.com", "pt"), - ("linguaggio di programmazione C#", "linguaggio di programmazione C cancelletto", "it"), - ("Moja temperatura to 36.6°", "Moja temperatura to 36.6 stopnie", "pl"), - ("Mám 14% baterie", "Mám 14 procento baterie", "cs"), - ("Těším se na tebe @ party", "Těším se na tebe na party", "cs"), - ("У меня 14% заряда", "У меня 14 процентов заряда", "ru"), - ("Я буду @ дома", "Я буду собака дома", "ru"), - ("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"), - ("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"), - ("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"), - ("我的电量为 14%", "我的电量为 14 百分之", "zh"), - ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"), - ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"), - ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"), - ("मेरे पास 14% बैटरी है।", "मेरे पास चौदह प्रतिशत बैटरी है।", "hi"), - ] - - for a, b, lang in test_cases: - out = expand_symbols_multilingual(a, lang=lang) - assert out == b, f"'{out}' vs '{b}'" - - -if __name__ == "__main__": - test_expand_numbers_multilingual() - test_abbreviations_multilingual() - test_symbols_multilingual() - diff --git a/packages.txt b/packages.txt deleted file mode 100644 index 8209068d470674a6230c4097425c47bda5e93aa9..0000000000000000000000000000000000000000 --- a/packages.txt +++ /dev/null @@ -1,7 +0,0 @@ -wget -git -calibre -ffmpeg -libmecab-dev -mecab -mecab-ipadic \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index aa09586fbdca1cf97d5c5fb09055d3bec9b26474..0000000000000000000000000000000000000000 --- a/pyproject.toml +++ /dev/null @@ -1,44 +0,0 @@ -[build-system] -name = "ebook2audiobook" -version = "2.0.0" -requires = ["setuptools >= 64"] -build-backend = "setuptools.build_meta" - -[project] -name = "ebook2audiobook" -version = "2.0.0" -description = "Convert eBooks to audiobooks with chapters and metadata" -authors = [ - { name = "Drew Thomasson" } -] -dependencies = [ - "beautifulsoup4", - "coqui-tts", - "cutlet", - "deep_translator", - "docker", - "ebooklib", - "gensim", - "gradio>=4.44", - "hangul-romanize", - "indic-nlp-library", - "iso-639", - "jieba", - "pydub", - "pypinyin", - "ray", - "transformers", - "translate", - "tqdm" -] -readme = "README.md" -requires-python = ">=3.10,<3.13" -classifiers = [ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", -] -scripts = { "ebook2audiobook" = "app:main" } - -[project.urls] -"Homepage" = "https://github.com/DrewThomasson/ebook2audiobook" \ No newline at end of file diff --git a/readme/README_CN.md b/readme/README_CN.md deleted file mode 100644 index ede6d7b45cf95d568c00ba27d0c89d84f9f07d27..0000000000000000000000000000000000000000 --- a/readme/README_CN.md +++ /dev/null @@ -1,420 +0,0 @@ -# 📚 ebook2audiobook - -使用Calibre和Coqui XTTS将电子书转换为包含章节和元数据的有声读物。支持可选的语音克隆和多种语言! 
-> [!IMPORTANT] -**This tool is intended only for non-DRM, legally acquired eBooks.** -The authors accept no responsibility for misuse of this software or for any legal consequences arising from it. -Please use this tool responsibly and in accordance with all applicable laws. - -#### 🖥️ Web GUI Interface -![demo_web_gui](https://github.com/user-attachments/assets/85af88a7-05dd-4a29-91de-76a14cf5ef06) - -
- Click to view images of the Web GUI -[Web GUI screenshot] -[Web GUI screenshot] -[Web GUI screenshot] -
- -## 🌟 Features - -- 📖 Converts eBooks to text format using Calibre. -- 📚 Splits the eBook into chapters for organized audio. -- 🎙️ High-quality text-to-speech conversion with Coqui XTTS. -- 🗣️ Optional voice cloning using your own voice file. -- 🌍 Supports multiple languages (English by default). -- 🖥️ Runs on as little as 4 GB of RAM. - -## 🛠️ Requirements - -- Python 3.10 -- `coqui-tts` Python package -- Calibre (for eBook conversion) -- FFmpeg (for audiobook creation) -- Optional: a custom voice file for voice cloning - -### 🔧 Installation Instructions - -1. **Install Python 3.x** from [Python.org](https://www.python.org/downloads/). - -2. **Install Calibre**: - - **Ubuntu**: `sudo apt-get install -y calibre` - - **macOS**: `brew install calibre` - - **Windows** (Admin Powershell): `choco install calibre` - -3. **Install FFmpeg**: - - **Ubuntu**: `sudo apt-get install -y ffmpeg` - - **macOS**: `brew install ffmpeg` - - **Windows** (Admin Powershell): `choco install ffmpeg` - -4. **Optional: Install Mecab** (for non-Latin languages): - - **Ubuntu**: `sudo apt-get install -y mecab libmecab-dev mecab-ipadic-utf8` - - **macOS**: `brew install mecab`, `brew install mecab-ipadic` - - **Windows**: [mecab-website-to-install-manually](https://taku910.github.io/mecab/#download) (Note: Japanese support is limited) - -5. **Install Python packages**: - ```bash - pip install coqui-tts==0.24.2 pydub nltk beautifulsoup4 ebooklib tqdm gradio==4.44.0 - - python -m nltk.downloader punkt - python -m nltk.downloader punkt_tab - ``` - - **For non-Latin languages**: - ```bash - pip install mecab mecab-python3 unidic - - python -m unidic download - ``` - -## 🌐 Supported Languages - -- **English (en)** -- **Spanish (es)** -- **French (fr)** -- **German (de)** -- **Italian (it)** -- **Portuguese (pt)** -- **Polish (pl)** -- **Turkish (tr)** -- **Russian (ru)** -- **Dutch (nl)** -- **Czech (cs)** -- **Arabic (ar)** -- **Chinese (zh-cn)** -- **Japanese (ja)** -- **Hungarian (hu)** -- **Korean (ko)** - -Specify the language code when running the script in headless mode. -## 🚀 Usage - -### 🖥️ Launch the Gradio Web Interface - -1. **Run the script**: - ```bash - python app.py - ``` - -2. **Open the web app**: Click the URL shown in the terminal to access the web app and convert eBooks. -3. 
-3. **Public link**: Add `--share True` to the end of the command, like this: `python app.py --share True`
-- **[For more parameters]**: use the `-h` flag, e.g. `python app.py -h`
-
-### 📝 Basic Headless Usage
-
-```bash
-python app.py --headless True --ebook <path_to_ebook_file> --voice [path_to_voice_file] --language [language_code]
-```
-
-- **<path_to_ebook_file>**: Path to the eBook file.
-- **[path_to_voice_file]**: Optional, a voice file for the conversion.
-- **[language_code]**: Optional, the language for the conversion.
-- **[For more parameters]**: use the `-h` flag, e.g. `python app.py -h`
-
-### 🧩 Headless Usage with a Custom XTTS Model
-
-```bash
-python app.py --headless True --use_custom_model True --ebook <ebook_file_path> --voice <target_voice_file_path> --language <language> --custom_model <custom_model_path> --custom_config <custom_config_path> --custom_vocab <custom_vocab_path>
-```
-
-- **<ebook_file_path>**: Path to the eBook file.
-- **<target_voice_file_path>**: Optional, a voice file for the conversion.
-- **<language>**: Optional, the language for the conversion.
-- **<custom_model_path>**: Path to `model.pth`.
-- **<custom_config_path>**: Path to `config.json`.
-- **<custom_vocab_path>**: Path to `vocab.json`.
-- **[For more parameters]**: use the `-h` flag, e.g. `python app.py -h`
-
-### 🧩 Headless Usage with a Custom Fine-Tuned XTTS Model 🌐
-
-```bash
-python app.py --headless True --use_custom_model True --ebook <ebook_file_path> --voice <target_voice_file_path> --language <language> --custom_model_url <custom_model_URL>
-```
-
-- **<ebook_file_path>**: Path to the eBook file.
-- **<target_voice_file_path>**: Optional, a voice file for the conversion.
-- **<language>**: Optional, the language for the conversion.
-- **<custom_model_URL>**: URL of a zip archive of the model folder. For example,
-  [xtts_David_Attenborough_fine_tune](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/tree/main) `https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true`
-- **[For more parameters]**: use the `-h` flag, e.g. `python app.py -h`
-
-### 🔍 Detailed guide with the full list of parameters
-```bash
-python app.py -h
-```
-- This prints the following:
-```bash
-usage: app.py [-h] [--share] [--headless [HEADLESS]] [--ebook EBOOK]
-              [--ebooks_dir [EBOOKS_DIR]] [--voice VOICE] [--language LANGUAGE]
-              [--device {cpu,gpu}] [--use_custom_model] [--custom_model CUSTOM_MODEL]
-              [--custom_config CUSTOM_CONFIG] [--custom_vocab CUSTOM_VOCAB]
-              [--custom_model_url CUSTOM_MODEL_URL] [--temperature TEMPERATURE]
-              [--length_penalty LENGTH_PENALTY]
-              [--repetition_penalty REPETITION_PENALTY] [--top_k TOP_K] [--top_p TOP_P]
-              [--speed SPEED] [--enable_text_splitting]
-
-Convert eBooks to Audiobooks using a Text-to-Speech model. You can either launch the Gradio interface or run the script in headless mode for direct conversion.
-
-options:
-  -h, --help            show this help message and exit
-  --share               Enable a public shareable Gradio link. Defaults to False.
-  --headless [HEADLESS]
-                        Run in headless mode. Defaults to True if the flag is present without a value, False otherwise.
-  --ebook EBOOK         Path to the ebook file for conversion. Required in headless mode.
-  --ebooks_dir [EBOOKS_DIR]
-                        Path to the directory containing ebooks for batch conversion. Defaults to './ebooks' if 'default' value is provided.
-  --voice VOICE         Path to the target voice file for TTS. Optional, uses a default voice if not provided.
-  --language LANGUAGE   Language for the audiobook conversion. Options: en, es, fr, de, it, pt, pl, tr, ru, nl, cs, ar, zh-cn, ja, hu, ko. Defaults to English (en).
-  --device {cpu,gpu}    Type of processor unit for the audiobook conversion. Defaults to cpu.
-  --use_custom_model    Use a custom TTS model. Defaults to False. Must be True to use custom models.
-  --custom_model CUSTOM_MODEL
-                        Path to the custom model file (.pth). Required if using a custom model.
-  --custom_config CUSTOM_CONFIG
-                        Path to the custom config file (config.json). Required if using a custom model.
-  --custom_vocab CUSTOM_VOCAB
-                        Path to the custom vocab file (vocab.json). Required if using a custom model.
-  --custom_model_url CUSTOM_MODEL_URL
-                        URL to download the custom model as a zip file. Optional, but will be used if provided.
-                        Examples include David Attenborough's model: 'https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true'.
-                        More XTTS fine-tunes can be found on my Hugging Face at 'https://huggingface.co/drewThomasson'.
-  --temperature TEMPERATURE
-                        Temperature for the model. Defaults to 0.65. Higher temperatures lead to more creative outputs.
-  --length_penalty LENGTH_PENALTY
-                        A length penalty applied to the autoregressive decoder. Defaults to 1.0. Not applied to custom models.
-  --repetition_penalty REPETITION_PENALTY
-                        A penalty that prevents the autoregressive decoder from repeating itself. Defaults to 2.0.
-  --top_k TOP_K         Top-k sampling. Lower values mean more likely outputs and increased audio generation speed. Defaults to 50.
-  --top_p TOP_P         Top-p sampling. Lower values mean more likely outputs and increased audio generation speed. Defaults to 0.8.
-  --speed SPEED         Speed factor for the speech generation. Defaults to 1.0.
-  --enable_text_splitting
-                        Enable splitting text into sentences. Defaults to False.
-
-Example usage:
-Windows:
-    headless:
-    ./ebook2audiobook.cmd --headless --ebook 'path_to_ebook' --voice 'path_to_voice' --language en --use_custom_model --custom_model 'model.zip' --custom_config config.json --custom_vocab vocab.json
-    Graphic Interface:
-    ./ebook2audiobook.cmd
-Linux/Mac:
-    headless:
-    ./ebook2audiobook.sh --headless --ebook 'path_to_ebook' --voice 'path_to_voice' --language en --use_custom_model --custom_model 'model.zip' --custom_config config.json --custom_vocab vocab.json
-    Graphic Interface:
-    ./ebook2audiobook.sh
-```
-
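-For instance, a single headless run that combines the sampling flags documented above might look like the following sketch (the ebook and voice paths are placeholders to replace with your own files):
-
-```bash
-# Hypothetical example: tune sampling while converting one book (paths are placeholders)
-python app.py --headless --ebook ./ebooks/my_book.epub \
-    --voice ./voices/eng/adult/male/Curt_24khz.wav --language en \
-    --temperature 0.65 --top_k 50 --top_p 0.8 --speed 1.0
-```
-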
- ⚠️ Legacy usage instructions
-
-## 🚀 Usage
-
------> `ebook2audiobookXTTS/legacy/`
-
-### 🖥️ Web Interface
-
-1. **Run the script**:
-   ```bash
-   python custom_model_ebook2audiobookXTTS_gradio.py
-   ```
-
-2. **Open the web app**: Click the URL shown in the terminal to access the web app and convert eBooks.
-
-### 📝 Basic Usage
-
-```bash
-python ebook2audiobook.py <path_to_ebook_file> [path_to_voice_file] [language_code]
-```
-
-- **<path_to_ebook_file>**: Path to the eBook file.
-- **[path_to_voice_file]**: Optional, a voice file for the conversion.
-- **[language_code]**: Optional, the language for the conversion.
-
-### 🧩 Custom XTTS Model
-
-```bash
-python custom_model_ebook2audiobookXTTS.py <ebook_file_path> <target_voice_file_path> <language> <custom_model_path> <custom_config_path> <custom_vocab_path>
-```
-
-- **<ebook_file_path>**: Path to the eBook file.
-- **<target_voice_file_path>**: Optional, a voice file for the conversion.
-- **<language>**: Optional, the language for the conversion.
-- **<custom_model_path>**: Path to `model.pth`.
-- **<custom_config_path>**: Path to `config.json`.
-- **<custom_vocab_path>**: Path to `vocab.json`.
-
-
-### 🐳 Using Docker
-
-You can also run the eBook-to-audiobook converter with Docker. This method ensures consistency across environments and simplifies setup.
-
-#### 🚀 Running the Docker Container
-
-To run the Docker container and launch the Gradio interface, use the following command:
-
-
-Run with CPU only
-```powershell
-docker run -it --rm -p 7860:7860 --platform=linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py
-```
-
-Run with GPU acceleration (Nvidia GPUs only)
-```powershell
-docker run -it --rm --gpus all -p 7860:7860 --platform=linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py
-```
-
-This command starts the Gradio interface on port 7860 (localhost:7860)
-- For more options, such as running Docker in headless mode or exposing a public Gradio link, add the `-h` flag after `app.py` in the Docker launch command
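-For example, to expose a public Gradio link directly from the container, you can append the `--share` flag documented in the help output (a sketch; it reuses the same image and port mapping as above):
-
-```bash
-# Hypothetical example: same container as above, plus a public Gradio link
-docker run -it --rm -p 7860:7860 --platform=linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py --share True
-```
-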
- Example of using Docker in headless mode, or modifying anything with extra parameters + full guide
-
-## Example of using Docker in headless mode
-
-First, docker pull the latest image
-```bash
-docker pull athomasson2/ebook2audiobookxtts:huggingface
-```
-
-- Before running this command, create a directory named "input-folder" in your current directory; it will be mounted, and it is where you place input files for the Docker image
-```bash
-mkdir input-folder && mkdir Audiobooks
-```
-
-- In the command below, replace **YOUR_INPUT_FILE.TXT** with the name of the input file you created
-
-```bash
-docker run -it --rm \
-    -v $(pwd)/input-folder:/home/user/app/input_folder \
-    -v $(pwd)/Audiobooks:/home/user/app/Audiobooks \
-    --platform linux/amd64 \
-    athomasson2/ebook2audiobookxtts:huggingface \
-    python app.py --headless True --ebook /home/user/app/input_folder/YOUR_INPUT_FILE.TXT
-```
-
-- That should be it!
-
-- The output audiobooks will be in the Audiobooks folder, which is also in the local directory where you ran this docker command
-
-
-## To get the help output for the other parameters this program has, you can run the following
-
-```bash
-docker run -it --rm \
-    --platform linux/amd64 \
-    athomasson2/ebook2audiobookxtts:huggingface \
-    python app.py -h
-
-```
-
-
-This will output the following
-
-```bash
-user/app/ebook2audiobookXTTS/input-folder -v $(pwd)/Audiobooks:/home/user/app/ebook2audiobookXTTS/Audiobooks --memory="4g" --network none --platform linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py -h
-starting...
-usage: app.py [-h] [--share SHARE] [--headless HEADLESS] [--ebook EBOOK] [--voice VOICE]
-              [--language LANGUAGE] [--use_custom_model USE_CUSTOM_MODEL]
-              [--custom_model CUSTOM_MODEL] [--custom_config CUSTOM_CONFIG]
-              [--custom_vocab CUSTOM_VOCAB] [--custom_model_url CUSTOM_MODEL_URL]
-              [--temperature TEMPERATURE] [--length_penalty LENGTH_PENALTY]
-              [--repetition_penalty REPETITION_PENALTY] [--top_k TOP_K] [--top_p TOP_P]
-              [--speed SPEED] [--enable_text_splitting ENABLE_TEXT_SPLITTING]
-
-Convert eBooks to Audiobooks using a Text-to-Speech model. You can either launch the
-Gradio interface or run the script in headless mode for direct conversion.
-
-options:
-  -h, --help            show this help message and exit
-  --share SHARE         Set to True to enable a public shareable Gradio link. Defaults
-                        to False.
-  --headless HEADLESS   Set to True to run in headless mode without the Gradio
-                        interface. Defaults to False.
-  --ebook EBOOK         Path to the ebook file for conversion. Required in headless
-                        mode.
-  --voice VOICE         Path to the target voice file for TTS. Optional, uses a default
-                        voice if not provided.
-  --language LANGUAGE   Language for the audiobook conversion. Options: en, es, fr, de,
-                        it, pt, pl, tr, ru, nl, cs, ar, zh-cn, ja, hu, ko. Defaults to
-                        English (en).
-  --use_custom_model USE_CUSTOM_MODEL
-                        Set to True to use a custom TTS model. Defaults to False. Must
-                        be True to use custom models, otherwise you'll get an error.
-  --custom_model CUSTOM_MODEL
-                        Path to the custom model file (.pth). Required if using a custom
-                        model.
-  --custom_config CUSTOM_CONFIG
-                        Path to the custom config file (config.json). Required if using
-                        a custom model.
-  --custom_vocab CUSTOM_VOCAB
-                        Path to the custom vocab file (vocab.json). Required if using a
-                        custom model.
-  --custom_model_url CUSTOM_MODEL_URL
-                        URL to download the custom model as a zip file. Optional, but
-                        will be used if provided. Examples include David Attenborough's
-                        model: 'https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true'.
-                        More XTTS fine-tunes can be found on my Hugging Face at
-                        'https://huggingface.co/drewThomasson'.
-  --temperature TEMPERATURE
-                        Temperature for the model. Defaults to 0.65. Higher temperatures
-                        will lead to more creative outputs, i.e. more hallucinations.
-                        Lower temperatures will be more monotone outputs, i.e. fewer
-                        hallucinations.
-  --length_penalty LENGTH_PENALTY
-                        A length penalty applied to the autoregressive decoder. Defaults
-                        to 1.0. Not applied to custom models.
-  --repetition_penalty REPETITION_PENALTY
-                        A penalty that prevents the autoregressive decoder from
-                        repeating itself. Defaults to 2.0.
-  --top_k TOP_K         Top-k sampling. Lower values mean more likely outputs and
-                        increased audio generation speed. Defaults to 50.
-  --top_p TOP_P         Top-p sampling. Lower values mean more likely outputs and
-                        increased audio generation speed. Defaults to 0.8.
-  --speed SPEED         Speed factor for the speech generation, i.e. how fast the
-                        narrator will speak. Defaults to 1.0.
-  --enable_text_splitting ENABLE_TEXT_SPLITTING
-                        Enable splitting text into sentences. Defaults to True.
-
-Example: python script.py --headless --ebook path_to_ebook --voice path_to_voice
---language en --use_custom_model True --custom_model model.pth --custom_config
-config.json --custom_vocab vocab.json
-```
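-As a concrete illustration of the custom-model flags above, a headless container run that pulls a fine-tuned model by URL might look like the following sketch (the input file name is a placeholder, and this older syntax takes explicit True values):
-
-```bash
-# Hypothetical example: headless run with a fine-tuned XTTS model fetched by URL
-docker run -it --rm \
-    -v $(pwd)/input-folder:/home/user/app/input_folder \
-    --platform linux/amd64 \
-    athomasson2/ebook2audiobookxtts:huggingface \
-    python app.py --headless True --use_custom_model True \
-    --ebook /home/user/app/input_folder/YOUR_INPUT_FILE.TXT \
-    --custom_model_url 'https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true'
-```
-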
-
-#### 🖥️ Docker GUI
-![demo_web_gui](https://github.com/user-attachments/assets/85af88a7-05dd-4a29-91de-76a14cf5ef06)
-
- Click to see images of the Web interface
-image
-image
-image
-
-
-
-### 🛠️ About Custom XTTS Models
-
-Models built to handle a specific voice better. Check out my Hugging Face page [here](https://huggingface.co/drewThomasson).
-
-To use a custom model, paste the link to a "Finished_model_files.zip" file, like this:
-
-[David Attenborough fine tuned Finished_model_files.zip](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true)
-
-
-
-
-More details can be found on the [Docker Hub page](https://hub.docker.com/repository/docker/athomasson2/ebook2audiobookxtts/general).
-
-## 🌐 Fine-Tuned XTTS Models
-
-To find XTTS models that have already been fine-tuned, visit [Hugging Face](https://huggingface.co/drewThomasson) 🌐 and search for models containing the keywords "xtts fine tune".
-
-## 🎥 Demos
-
-https://github.com/user-attachments/assets/8486603c-38b1-43ce-9639-73757dfb1031
-
-## 🤗 [Huggingface space demo](https://huggingface.co/spaces/drewThomasson/ebook2audiobookXTTS)
-- The Huggingface space runs on the free CPU tier, so expect it to be very slow or to time out, haha; just don't give it big files
-- Best to duplicate the space or run it locally.
-## 📚 Supported eBook Formats
-
-- `.epub`, `.pdf`, `.mobi`, `.txt`, `.html`, `.rtf`, `.chm`, `.lit`, `.pdb`, `.fb2`, `.odt`, `.cbr`, `.cbz`, `.prc`, `.lrf`, `.pml`, `.snb`, `.cbc`, `.rb`, `.tcr`
-- **Best results**: `.epub` or `.mobi` for automatic chapter detection.
-
-## 📂 Output
-
-- Creates an `.m4b` file with metadata and chapters.
-- **Example**: ![Example](https://github.com/DrewThomasson/VoxNovel/blob/dc5197dff97252fa44c391dc0596902d71278a88/readme_files/example_in_app.jpeg)
diff --git a/readme/README_RU.md b/readme/README_RU.md
deleted file mode 100644
index 95dbde82a19c8a20f7625d3358e002987a71797e..0000000000000000000000000000000000000000
--- a/readme/README_RU.md
+++ /dev/null
@@ -1,391 +0,0 @@
-# 📚 ebook2audiobook
-
-Convert eBooks to audiobooks with chapters and metadata preserved, using the Calibre and XTTS engines. Supports optional voice cloning and multiple languages!
-> [!IMPORTANT]
-**This tool is intended only for non-DRM, legally acquired eBooks.**
-The authors are not responsible for any misuse of this software or for any resulting legal consequences.
-Use this tool responsibly and in accordance with all applicable laws.
-
-
-#### 🖥️ Web interface
-![demo_web_gui](https://github.com/user-attachments/assets/85af88a7-05dd-4a29-91de-76a14cf5ef06)
-
- More images of the Web interface
-image
-image
-image
-
-
-
-## README.md
-- en [English](README.md)
-- zh_CN [简体中文](readme/README_CN.md)
-- ru [Русский](readme/README_RU.md)
-
-
-## 🌟 Features
-
-- 📖 Converts eBooks to text format with Calibre.
-- 📚 Splits the eBook into chapters for the audio format.
-- 🎙️ High-quality text-to-speech with Coqui XTTS.
-- 🗣️ Optional voice cloning based on your voice file.
-- 🌍 Multi-language support (English by default).
-- 🖥️ Only 4 GB of RAM is needed to run.
-
-## 🤗 [Demo on Huggingface](https://huggingface.co/spaces/drewThomasson/ebook2audiobookXTTS)
-- The Huggingface space runs on the free CPU tier, so don't expect high processing speed or an absence of timeout messages. Don't even try to process large files.
-- It is best to duplicate the space or run the app locally.
-
-## Free Google Colab [![Free Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DrewThomasson/ebook2audiobookXTTS/blob/main/Notebooks/colab_ebook2audiobookxtts.ipynb)
-
-
-## 🛠️ Requirements
-
-- Python 3.10
-- `coqui-tts` Python package
-- Calibre (for eBook conversion)
-- FFmpeg (for audiobook creation)
-- Optional: your own voice file for narration
-
-
-### 🔧 Installation
-
-1. **Install Python 3.x** from [Python.org](https://www.python.org/downloads/).
-
-2. **Install Calibre**:
-   - **Ubuntu**: `sudo apt-get install -y calibre`
-   - **macOS**: `brew install calibre`
-   - **Windows** (Admin Powershell): `choco install calibre`
-
-3. **Install FFmpeg**:
-   - **Ubuntu**: `sudo apt-get install -y ffmpeg`
-   - **macOS**: `brew install ffmpeg`
-   - **Windows** (Admin Powershell): `choco install ffmpeg`
-
-4. **Optional: install Mecab** (for non-Latin languages):
-   - **Ubuntu**: `sudo apt-get install -y mecab libmecab-dev mecab-ipadic-utf8`
-   - **macOS**: `brew install mecab`, `brew install mecab-ipadic`
-   - **Windows**: [mecab-website-to-install-manually](https://taku910.github.io/mecab/#download) (Note: Japanese support is limited)
-
-5. **Install the Python packages**:
-   ```bash
-   pip install coqui-tts==0.24.2 pydub nltk beautifulsoup4 ebooklib tqdm gradio==4.44.0
-
-   python -m nltk.downloader punkt
-   python -m nltk.downloader punkt_tab
-   ```
-
-   **For non-Latin languages**:
-   ```bash
-   pip install mecab mecab-python3 unidic
-
-   python -m unidic download
-   ```
-
-## 🌐 Supported Languages
-
-- **English (en)**
-- **Spanish (es)**
-- **French (fr)**
-- **German (de)**
-- **Italian (it)**
-- **Portuguese (pt)**
-- **Polish (pl)**
-- **Turkish (tr)**
-- **Russian (ru)**
-- **Dutch (nl)**
-- **Czech (cs)**
-- **Arabic (ar)**
-- **Chinese (zh-cn)**
-- **Japanese (ja)**
-- **Hungarian (hu)**
-- **Korean (ko)**
-
-Specify the code of the desired language when running in headless mode (on the command line).
-## 🚀 Usage
-
-### 🖥️ Launching the Gradio Web Interface
-
-1. **Run the script**:
-   ```bash
-   python app.py
-   ```
-
-2. **Open the web app**: Click the link that appears in the terminal to access the web app and convert eBooks.
-3. **For network access**: add `--share True` to the end of the command, like this: `python app.py --share True`
-- **[For more parameters]**: use the `-h` flag, e.g. `python app.py -h`
-
-### 📝 Typical Headless Usage
-
-```bash
-python app.py --headless True --ebook <path_to_ebook_file> --voice [path_to_voice_file] --language [language_code]
-```
-
-- **<path_to_ebook_file>**: path to the eBook file.
-- **[path_to_voice_file]**: path to a voice sample, for optional voice cloning for the narration.
-- **[language_code]**: optionally, choose the language.
-- **[For more parameters]**: use the `-h` flag, e.g. `python app.py -h`
-
-### 🧩 Headless Usage with a Custom XTTS Model
-
-```bash
-python app.py --headless True --use_custom_model True --ebook <ebook_file_path> --voice <target_voice_file_path> --language <language> --custom_model <custom_model_path> --custom_config <custom_config_path> --custom_vocab <custom_vocab_path>
-```
-
-- **<ebook_file_path>**: path to the eBook file.
-- **<target_voice_file_path>**: path to a voice sample, for optional cloning.
-- **<language>**: optionally, choose the language.
-- **<custom_model_path>**: path to `model.pth`.
-- **<custom_config_path>**: path to `config.json`.
-- **<custom_vocab_path>**: path to `vocab.json`.
-- **[For more parameters]**: use the `-h` flag, e.g. `python app.py -h`
-
-
-### 🧩 Headless Usage with a Custom XTTS Model via a Zip URL of an XTTS Fine-Tune 🌐
-
-```bash
-python app.py --headless True --use_custom_model True --ebook <ebook_file_path> --voice <target_voice_file_path> --language <language> --custom_model_url <custom_model_URL>
-```
-
-- **<ebook_file_path>**: path to the eBook file.
-- **<target_voice_file_path>**: path to a voice sample, for optional cloning.
-- **<language>**: optionally, choose the language.
-- **<custom_model_URL>**: URL of a zip archive of the model folder. For example, [xtts_David_Attenborough_fine_tune](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/tree/main) `https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true`
-- A custom model still needs a reference voice audio file:
-[reference voice audio file of David Attenborough](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/blob/main/ref.wav)
-- **[For more parameters]**: use the `-h` flag, e.g. `python app.py -h`
-
-### 🔍 For a detailed list of all parameters, use
-```bash
-python app.py -h
-```
-- It prints roughly the following list of options:
-```bash
-usage: app.py [-h] [--share SHARE] [--headless HEADLESS] [--ebook EBOOK] [--voice VOICE]
-              [--language LANGUAGE] [--use_custom_model USE_CUSTOM_MODEL]
-              [--custom_model CUSTOM_MODEL] [--custom_config CUSTOM_CONFIG]
-              [--custom_vocab CUSTOM_VOCAB] [--custom_model_url CUSTOM_MODEL_URL]
-              [--temperature TEMPERATURE] [--length_penalty LENGTH_PENALTY]
-              [--repetition_penalty REPETITION_PENALTY] [--top_k TOP_K] [--top_p TOP_P]
-              [--speed SPEED] [--enable_text_splitting ENABLE_TEXT_SPLITTING]
-
-Convert eBooks to audiobooks using a Text-to-Speech (TTS) model. You can either use the
-Gradio interface or run the script in headless mode (command line) for direct conversion.
-
-options:
-  -h, --help            show this help message and exit
-  --share SHARE         Set to True to enable public access to the Gradio web interface. Defaults to False.
-  --headless HEADLESS   Set to True to use headless mode. Defaults to False.
-  --ebook EBOOK         Path to the eBook to convert. Required in headless mode.
-  --voice VOICE         Path to the target voice file for TTS (text-to-speech). Optional; a default voice is used if no path is given.
-  --language LANGUAGE   Language for the audiobook conversion. Options: en, es, fr, de,
-                        it, pt, pl, tr, ru, nl, cs, ar, zh-cn, ja, hu, ko. Defaults to English (en).
-  --use_custom_model USE_CUSTOM_MODEL
-                        Set to True to use a custom TTS model. Defaults to False. Must be set to
-                        True to use a custom model, otherwise an error occurs.
-  --custom_model CUSTOM_MODEL
-                        Path to the custom model file (.pth). Required if using a custom model.
-  --custom_config CUSTOM_CONFIG
-                        Path to the custom model's config file (config.json). Required if using a custom model.
-  --custom_vocab CUSTOM_VOCAB
-                        Path to the custom model's vocab file (vocab.json). Required if using a custom model.
-  --custom_model_url CUSTOM_MODEL_URL
-                        URL to download the custom model as a zip archive. Optional, but used if provided.
-                        Examples include David Attenborough's model: 'https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true'. More fine-tuned XTTS models can be found on Hugging Face at 'https://huggingface.co/drewThomasson'.
-  --temperature TEMPERATURE
-                        Temperature for the model. Defaults to 0.65. Higher temperatures make the synthesized voice more creative, with more hallucinations; lower values make it more monotone and calm.
-  --length_penalty LENGTH_PENALTY
-                        A length penalty applied to the autoregressive decoder. Defaults to 1.0. Not applied to custom models.
-  --repetition_penalty REPETITION_PENALTY
-                        A penalty that prevents the autoregressive decoder from repeating itself. Defaults to 2.0.
-  --top_k TOP_K         Top-k sampling. Lower values mean more likely outputs and faster audio generation. Defaults to 50.
-  --top_p TOP_P         Top-p sampling. Lower values mean more likely outputs and faster audio generation. Defaults to 0.8.
-  --speed SPEED         Speed factor for the narration; higher values make the narrator read faster. Defaults to 1.0.
-  --enable_text_splitting ENABLE_TEXT_SPLITTING
-                        Enables splitting the text into sentences. Defaults to True.
-
-Example: python script.py --headless --ebook path_to_ebook --voice path_to_voice --language en --use_custom_model True --custom_model model.pth --custom_config config.json --custom_vocab vocab.json
-```
-
-
-
-### 🐳 Using Docker
-
-You can also use Docker to run the eBook-to-audiobook converter. This method ensures consistency across different environments and simplifies setup.
-
-#### 🚀 Running the Docker Container
-
-To run the Docker container and launch the Gradio interface, use the following command:
-
-
-Run using the CPU only
-```powershell
-docker run -it --rm -p 7860:7860 --platform=linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py
-```
-
-Run with GPU acceleration (NVIDIA graphics cards only)
-```powershell
-docker run -it --rm --gpus all -p 7860:7860 --platform=linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py
-```
-
-This command starts the Gradio interface on port 7860 (localhost:7860)
-- For more information about the commands available in headless mode, or about exposing Gradio on the network, add the `-h` flag after the `app.py` command in the Docker command line
-
- Example of using Docker in headless mode or modifying parameters + full guide
-
-## Example of using Docker in headless mode
-
-- First, pull the latest version of the container
-```bash
-docker pull athomasson2/ebook2audiobookxtts:huggingface
-```
-
-- Before running the command, create a directory named "input-folder" in the current folder; it will be mounted for use. Place the files that should be visible to the Docker image in this folder
-```bash
-mkdir input-folder && mkdir Audiobooks
-```
-
-- In the command below, replace **YOUR_INPUT_FILE.TXT** with the name of the file to narrate
-
-```bash
-docker run -it --rm \
-    -v $(pwd)/input-folder:/home/user/app/input_folder \
-    -v $(pwd)/Audiobooks:/home/user/app/Audiobooks \
-    --platform linux/amd64 \
-    athomasson2/ebook2audiobookxtts:huggingface \
-    python app.py --headless True --ebook /home/user/app/input_folder/YOUR_INPUT_FILE.TXT
-```
-
-- And that is all!
-
-- The narrated audiobook will appear in the Audiobooks folder, which is created in the local directory where you launched Docker
-
-
-## To get help on this program's other parameters, run the following command
-
-```bash
-docker run -it --rm \
-    --platform linux/amd64 \
-    athomasson2/ebook2audiobookxtts:huggingface \
-    python app.py -h
-
-```
-
-
-and the output will be as follows
-
-```bash
-user/app/ebook2audiobookXTTS/input-folder -v $(pwd)/Audiobooks:/home/user/app/ebook2audiobookXTTS/Audiobooks --memory="4g" --network none --platform linux/amd64 athomasson2/ebook2audiobookxtts:huggingface python app.py -h
-starting...
-Convert eBooks to audiobooks using a Text-to-Speech (TTS) model. You can either use the
-Gradio interface or run the script in headless mode (command line) for direct conversion.
-
-options:
-  -h, --help            show this help message and exit
-  --share SHARE         Set to True to enable public access to the Gradio web interface. Defaults to False.
-  --headless HEADLESS   Set to True to use headless mode. Defaults to False.
-  --ebook EBOOK         Path to the eBook to convert. Required in headless mode.
-  --voice VOICE         Path to the target voice file for TTS (text-to-speech). Optional; a default voice is used if no path is given.
-  --language LANGUAGE   Language for the audiobook conversion. Options: en, es, fr, de,
-                        it, pt, pl, tr, ru, nl, cs, ar, zh-cn, ja, hu, ko. Defaults to English (en).
-  --use_custom_model USE_CUSTOM_MODEL
-                        Set to True to use a custom TTS model. Defaults to False. Must be set to
-                        True to use a custom model, otherwise an error occurs.
-  --custom_model CUSTOM_MODEL
-                        Path to the custom model file (.pth). Required if using a custom model.
-  --custom_config CUSTOM_CONFIG
-                        Path to the custom model's config file (config.json). Required if using a custom model.
-  --custom_vocab CUSTOM_VOCAB
-                        Path to the custom model's vocab file (vocab.json). Required if using a custom model.
-  --custom_model_url CUSTOM_MODEL_URL
-                        URL to download the custom model as a zip archive. Optional, but used if provided.
-                        Examples include David Attenborough's model: 'https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true'.
-                        More fine-tuned XTTS models can be found on Hugging Face at 'https://huggingface.co/drewThomasson'.
-  --temperature TEMPERATURE
-                        Temperature for the model. Defaults to 0.65. Higher temperatures make the
-                        synthesized voice more creative, with more hallucinations; lower values make
-                        it more monotone and calm.
-  --length_penalty LENGTH_PENALTY
-                        A length penalty applied to the autoregressive decoder. Defaults to 1.0.
-                        Not applied to custom models.
-  --repetition_penalty REPETITION_PENALTY
-                        A penalty that prevents the autoregressive decoder from repeating itself.
-                        Defaults to 2.0.
-  --top_k TOP_K         Top-k sampling. Lower values mean more likely outputs and faster audio
-                        generation. Defaults to 50.
-  --top_p TOP_P         Top-p sampling. Lower values mean more likely outputs and faster audio
-                        generation. Defaults to 0.8.
-  --speed SPEED         Speed factor for the narration; higher values make the narrator read
-                        faster. Defaults to 1.0.
-  --enable_text_splitting ENABLE_TEXT_SPLITTING
-                        Enables splitting the text into sentences. Defaults to True.
-
-Example: python script.py --headless --ebook path_to_ebook --voice path_to_voice --language en --use_custom_model True --custom_model model.pth --custom_config config.json --custom_vocab vocab.json
-```
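-Putting the pieces above together, a GPU-accelerated headless run with both folders mounted might look like the following sketch (it assumes the NVIDIA container toolkit is installed; the input file name is a placeholder):
-
-```bash
-# Hypothetical example: GPU headless conversion of a Russian-language ebook
-docker run -it --rm --gpus all \
-    -v $(pwd)/input-folder:/home/user/app/input_folder \
-    -v $(pwd)/Audiobooks:/home/user/app/Audiobooks \
-    --platform linux/amd64 \
-    athomasson2/ebook2audiobookxtts:huggingface \
-    python app.py --headless True --language ru \
-    --ebook /home/user/app/input_folder/YOUR_INPUT_FILE.TXT
-```
-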
-
-#### 🖥️ Docker Web Interface
-![demo_web_gui](https://github.com/user-attachments/assets/85af88a7-05dd-4a29-91de-76a14cf5ef06)
-
- Click to view images of the Web interface
-image
-image
-image
-
-
-
-### 🛠️ For Custom XTTS Models
-
-Models are built for better use with a specific voice. Check out the various models on the Hugging Face page [here](https://huggingface.co/drewThomasson).
-
-To use a custom model, use a link to the model's `Finished_model_files.zip` archive, for example:
-[David Attenborough fine-tuned voice Finished_model_files.zip](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/resolve/main/Finished_model_files.zip?download=true)
-
-A custom model also needs a reference voice file:
-[voice file of David Attenborough](https://huggingface.co/drewThomasson/xtts_David_Attenborough_fine_tune/blob/main/ref.wav)
-
-
-
-More information can be found on the [Docker Hub page](https://hub.docker.com/repository/docker/athomasson2/ebook2audiobookxtts/general).
-
-## 🌐 Fine-Tuned XTTS Models
-
-To find already fine-tuned XTTS models, visit [this Hugging Face page](https://huggingface.co/drewThomasson) 🌐 and look for models with "xtts fine tune" in their names.
-
-## 🎥 Demos
-
-Rainy day voice
-
-https://github.com/user-attachments/assets/8486603c-38b1-43ce-9639-73757dfb1031
-
-David Attenborough voice
-
-https://github.com/user-attachments/assets/47c846a7-9e51-4eb9-844a-7460402a20a8
-
-
-## 🤗 [Demo on a Huggingface space](https://huggingface.co/spaces/drewThomasson/ebook2audiobookXTTS)
-- Huggingface spaces run on the free CPU tier, so execution is very slow and timeout errors are frequent. Don't try to convert large files.
-- It is best to clone the space or run it locally.
-
-## Free Google Colab [![Free Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DrewThomasson/ebook2audiobookXTTS/blob/main/Notebooks/colab_ebook2audiobookxtts.ipynb)
-
-
-
-## 📚 Supported eBook Formats
-
-- **Supported**: `.epub`, `.pdf`, `.mobi`, `.txt`, `.html`, `.rtf`, `.chm`, `.lit`, `.pdb`, `.fb2`, `.odt`, `.cbr`, `.cbz`, `.prc`, `.lrf`, `.pml`, `.snb`, `.cbc`, `.rb`, `.tcr`
-- **Best**: `.epub` or `.mobi` for automatic chapter detection.
-
-## 📂 Output
-
-- Creates an `.m4b` file with metadata and chapters.
-- **Example output**: ![Example](https://github.com/DrewThomasson/VoxNovel/blob/dc5197dff97252fa44c391dc0596902d71278a88/readme_files/example_in_app.jpeg)
-
-## 🛠️ Common Issues:
-- "It's very slow!" - Conversion on CPU alone is slow; the only way to speed it up is an NVIDIA GPU: [Discussion](https://github.com/DrewThomasson/ebook2audiobookXTTS/discussions/19#discussioncomment-10879846). For fast multilingual audio generation, consider the other project [that uses piper-tts](https://github.com/DrewThomasson/ebook2audiobookpiper-tts) instead. (It has no zero-shot voice cloning, though, and the voices are Siri-like in quality, but it is much faster on CPU.)
-- "I have a dependency problem" - Just use Docker. The Docker images are self-contained, including a command-line headless mode and a help flag.
-- "I have a problem with truncated audio!" - Open an issue; the author does not speak every supported language and needs help with automatically splitting text into sentences for the supported languages. 😊
-- "The process is stuck at 30% in the web interface!" - The progress display in the web interface is basic, with only 3 steps; to monitor the process, watch the terminal output, which shows the sentence currently being processed.
-
-## What help is needed! 🙌
-## [Full list here](https://github.com/DrewThomasson/ebook2audiobookXTTS/issues/32)
-- Any help from people who speak the supported languages, to make the sentence splitting more correct.
-- Potential help creating guides for different languages (the author only knows English 😔).
-
-## 🙏 Special thanks
-
-- **Coqui TTS**: [Coqui TTS GitHub](https://github.com/coqui-ai/TTS)
-- **Calibre**: [Calibre Website](https://calibre-ebook.com)
-
-- [@shakenbake15 for the better chapter-saving method](https://github.com/DrewThomasson/ebook2audiobookXTTS/issues/8)
-
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index f090e85999cdba1cead565a4666a0b037e899c79..0000000000000000000000000000000000000000
--- a/requirements.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-beautifulsoup4
-coqui-tts
-cutlet
-deep_translator
-docker
-ebooklib
-gensim
-gradio>=4.44
-hangul-romanize
-indic-nlp-library
-iso-639
-jieba
-mecab
-mecab-python3
-pydub
-pypinyin
-ray
-transformers
-translate
-tqdm
-unidic
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 5bb2cf88c46875454e2d19ad2a4eef78ffa15c81..0000000000000000000000000000000000000000
--- a/setup.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import subprocess
-import sys
-from setuptools import setup, find_packages
-from setuptools.command.develop import develop
-from setuptools.command.install import install
-import os
-
-cwd = os.path.dirname(os.path.abspath(__file__))
-
-with open("README.md", "r", encoding='utf-8') as fh:
-    long_description = fh.read()
-
-with open('requirements.txt') as f:
-    requirements = f.read().splitlines()
-
-class PostInstallCommand(install):
-    def run(self):
-        install.run(self)
-        try:
-            subprocess.run([sys.executable, '-m', 'unidic', 'download'], check=True)
-        except Exception:
-            print("unidic download failed during installation, but it will be re-attempted a different way when the app itself runs.")
-
-
-setup(
-    name='ebook2audiobook',
-    version='2.0.0',
-    python_requires=">=3.10,<3.13",
-    author="Drew Thomasson",
-    description="Convert eBooks to audiobooks with chapters and metadata",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/DrewThomasson/ebook2audiobook",
-    packages=find_packages(),
-    install_requires=requirements,
-    classifiers=[
-        "Programming Language :: Python :: 3",
-        "License :: OSI Approved :: MIT License",
-        "Operating System :: OS Independent",
-    ],
-    include_package_data=True,
-    entry_points={
-        "console_scripts": [
-            "ebook2audiobook = app:main",
-        ],
-    },
-    cmdclass={
-        'install': PostInstallCommand,
-    }
-)
diff --git a/tools/convert_24khz_to_16khz.bat b/tools/convert_24khz_to_16khz.bat
deleted file mode 100644
index 9881019563669c7224cf932732b1014da7ea1cea..0000000000000000000000000000000000000000
--- a/tools/convert_24khz_to_16khz.bat
+++ /dev/null
@@ -1,27 +0,0 @@
-@echo off
-setlocal enabledelayedexpansion
-
-:: Set the path to FFmpeg
-set FFmpegPath="C:\path\to\ffmpeg.exe"
-
-:: Root directory to start the search
-set RootDir=.
-
-:: Step 1: Find and delete _22khz.wav files
-for /r "%RootDir%" %%F in (*_22khz.wav) do (
-    echo Deleting "%%F"
-    del "%%F"
-)
-
-:: Step 2: Find _24khz.wav files and convert them to _16khz.wav
-for /r "%RootDir%" %%F in (*_24khz.wav) do (
-    set "InputFile=%%F"
-    set "OutputFile=%%~dpF%%~nF"
-    set "OutputFile=!OutputFile:_24khz=_16khz!.wav"
-
-    echo Converting "!InputFile!" to "!OutputFile!"
-    %FFmpegPath% -i "!InputFile!" -ar 16000 "!OutputFile!"
-)
-
-echo Done!
-pause
diff --git a/tools/convert_to_gif.sh b/tools/convert_to_gif.sh
deleted file mode 100644
index b7320bd51f0542396d0953be08356abae61d489c..0000000000000000000000000000000000000000
--- a/tools/convert_to_gif.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-# Help function
-function show_help() {
-    echo "Usage: $0 <input_video_file>"
-    echo
-    echo "This script converts a video file to a GIF with the following properties:"
-    echo "  - Frame rate: 10 fps"
-    echo "  - Resolution: 1728x1028 (with padding if needed)"
-    echo
-    echo "Example:"
-    echo "  $0 input.mov"
-    echo
-    echo "The output file will be named 'output.gif'."
-}
-
-# Check if help is requested
-if [[ "$1" == "-h" || "$1" == "--h" || "$1" == "-help" || "$1" == "--help" ]]; then
-    show_help
-    exit 0
-fi
-
-# Check if the input file is provided
-if [ "$#" -ne 1 ]; then
-    echo "Error: Missing input file."
-    echo "Use --help for usage information."
-    exit 1
-fi
-
-# Input video file
-input_file="$1"
-output_file="output.gif"
-
-# Conversion parameters
-fps=10
-width=1728
-height=1028
-
-# Convert the video to GIF
-ffmpeg -i "$input_file" -vf "fps=$fps,scale=${width}:${height}:force_original_aspect_ratio=decrease,pad=${width}:${height}:(ow-iw)/2:(oh-ih)/2" "$output_file"
-
-echo "Conversion complete! Output file: $output_file"
-
diff --git a/tools/generate_ebooks.py b/tools/generate_ebooks.py
deleted file mode 100644
index ee6547d4b0f4e3df5696d8f6ec41607a5d1a9d58..0000000000000000000000000000000000000000
--- a/tools/generate_ebooks.py
+++ /dev/null
@@ -1,84 +0,0 @@
-import os
-import sys
-import subprocess
-
-from iso639 import languages
-from deep_translator import GoogleTranslator
-from tqdm import tqdm
-
-parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
-sys.path.append(parent_dir)
-
-# Your language mapping dictionary from lang.py
-from lib.lang import language_mapping
-
-env = os.environ.copy()
-env["PYTHONIOENCODING"] = "utf-8"
-env["LANG"] = "en_US.UTF-8"
-
-# Base text to be translated
-base_text = "This is a test from the result of text file to audiobook conversion."
-
-# Output directory
-output_dir = "../ebooks/tests"
-if not os.path.exists(output_dir):
-    os.makedirs(output_dir)
-
-# Path to your base cover image (adjust the path accordingly)
-base_cover_image = "../ebooks/tests/__cover.jpg"
-
-# List to keep track of languages that failed
-failed_languages = []
-
-# Loop over languages with a progress bar
-for lang_code, lang_info in tqdm(language_mapping.items(), desc="Processing languages"):
-    try:
-        lang_iso = lang_code
-        language_array = languages.get(part3=lang_code)
-        if language_array and language_array.part1:
-            lang_iso = language_array.part1
-        if lang_iso == "zh":
-            lang_iso = "zh-CN"
-        # Translate the text
-        translated_text = GoogleTranslator(source='en', target=lang_iso).translate(base_text)
-        print(f"\nTranslated text for {lang_info['name']} ({lang_iso}): {translated_text}")
-
-        # Write the translated text to a txt file
-        txt_filename = f"test_{lang_code}.txt"
-        txt_filepath = os.path.join(output_dir, txt_filename)
-        with open(txt_filepath, 'w', encoding='utf-8') as f:
-            f.write(translated_text)
-
-        # Prepare the ebook-convert command
-        azw3_filename = f"test_{lang_code}.azw3"
-        azw3_filepath = os.path.join(output_dir, azw3_filename)
-
-        title = f"Ebook {lang_info['name']} Test"
-        authors = "Dev Team"
-        language = lang_iso
-
-        command = [
-            "ebook-convert",
-            txt_filepath,
-            azw3_filepath,
-            "--cover", base_cover_image,
-            "--title", title,
-            "--authors", authors,
-            "--language", language,
-            "--input-encoding", "utf-8"
-        ]
-
-        result = subprocess.run(command, env=env, text=True, encoding="utf-8")
-        print(f"Ebook generated for {lang_info['name']} at {azw3_filepath}\n")
-
-    except Exception as e:
-        print(f"Error: language {lang_code} not supported!")
-        failed_languages.append(lang_code)
-        continue
-
-# After processing all languages, output the list of languages that failed
-if failed_languages:
-    print("\nThe following languages could not be processed:")
-    for lang_code in failed_languages:
-        lang_name = language_mapping[lang_code]['name']
-        print(f"- {lang_name} ({lang_code})")
diff --git a/voices/eng/adult/female/Jennifer_16khz.wav b/voices/eng/adult/female/Jennifer_16khz.wav
deleted file mode 100644
index e8c9a637bc4ab47fa77ff41c9824294728ad7936..0000000000000000000000000000000000000000
Binary files a/voices/eng/adult/female/Jennifer_16khz.wav and /dev/null differ
diff --git a/voices/eng/adult/female/Jennifer_16khz_16khz.wav b/voices/eng/adult/female/Jennifer_16khz_16khz.wav
deleted file mode 100644
index e8c9a637bc4ab47fa77ff41c9824294728ad7936..0000000000000000000000000000000000000000
Binary files a/voices/eng/adult/female/Jennifer_16khz_16khz.wav and /dev/null differ
diff --git a/voices/eng/adult/female/Jennifer_24khz.wav b/voices/eng/adult/female/Jennifer_24khz.wav
deleted file mode 100644
index 1dca441cb10fd8d82b8e901d7c31351d31f22d03..0000000000000000000000000000000000000000
Binary files a/voices/eng/adult/female/Jennifer_24khz.wav and /dev/null differ
diff --git a/voices/eng/adult/female/jenifer_22khz.wav b/voices/eng/adult/female/jenifer_22khz.wav
deleted file mode 100644
index c2e384cfe9e24770533e3108a9848425162aa0d8..0000000000000000000000000000000000000000
Binary files a/voices/eng/adult/female/jenifer_22khz.wav and /dev/null differ
diff --git a/voices/eng/adult/female/jenifer_24khz.wav b/voices/eng/adult/female/jenifer_24khz.wav
deleted file mode 100644
index 1dca441cb10fd8d82b8e901d7c31351d31f22d03..0000000000000000000000000000000000000000
Binary files a/voices/eng/adult/female/jenifer_24khz.wav and /dev/null differ diff
--git a/voices/eng/adult/male/AiExplained_16khz.wav b/voices/eng/adult/male/AiExplained_16khz.wav deleted file mode 100644 index 1ff45f781703cb56b4c236735a5ec59a78dbc852..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/AiExplained_16khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/AiExplained_24khz.wav b/voices/eng/adult/male/AiExplained_24khz.wav deleted file mode 100644 index ce87d9cc18ed45946ca67c7082834894b0060f45..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/AiExplained_24khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/BobOdenkirk_16khz.wav b/voices/eng/adult/male/BobOdenkirk_16khz.wav deleted file mode 100644 index cf2fb7dd178038b81d6bc60a0349c430e0704a62..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/BobOdenkirk_16khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/BobOdenkirk_22khz.wav b/voices/eng/adult/male/BobOdenkirk_22khz.wav deleted file mode 100644 index 380ac047aaa84e73b6f50603378be86583004d5e..0000000000000000000000000000000000000000 --- a/voices/eng/adult/male/BobOdenkirk_22khz.wav +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:20891bd4c3740cae7eca166719937a157d83099f5ba6169efce99b453aa7abfd -size 1031136 diff --git a/voices/eng/adult/male/BobOdenkirk_24khz.wav b/voices/eng/adult/male/BobOdenkirk_24khz.wav deleted file mode 100644 index 34b81433504727575c52c45884edc22b58abec62..0000000000000000000000000000000000000000 --- a/voices/eng/adult/male/BobOdenkirk_24khz.wav +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:336b71f92f7150d79638a1c82262432b78857ed0027c04e1ac86e29c7d7dc4f9 -size 1122318 diff --git a/voices/eng/adult/male/BobRoss_16khz.wav b/voices/eng/adult/male/BobRoss_16khz.wav deleted file mode 100644 index 114bf26522664c6946abfede97e612e76f2d67fe..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/BobRoss_16khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/BobRoss_22khz.wav b/voices/eng/adult/male/BobRoss_22khz.wav deleted file mode 100644 index 93e0d688c151de6d793e87c9d8f244d4bdb2019f..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/BobRoss_22khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/BobRoss_24khz.wav b/voices/eng/adult/male/BobRoss_24khz.wav deleted file mode 100644 index 120ebc4cac315ed4020e5aa61eb777d11e186720..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/BobRoss_24khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/BryanCranston_16khz.wav b/voices/eng/adult/male/BryanCranston_16khz.wav deleted file mode 100644 index 4580f26ab189893202641406d0ad788d0d37570a..0000000000000000000000000000000000000000 --- a/voices/eng/adult/male/BryanCranston_16khz.wav +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3c3ca3649ee4b3b0cd3f2bebc8c13f5b7c77334f69f8ff5f4a129b717f1812ed -size 1562638 diff --git a/voices/eng/adult/male/BryanCranston_22khz.wav b/voices/eng/adult/male/BryanCranston_22khz.wav deleted file mode 100644 index 4eaa09925ed4d8bd1f8be8f10399558aa8e7ca4f..0000000000000000000000000000000000000000 --- a/voices/eng/adult/male/BryanCranston_22khz.wav +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:881a6806a777631c5f395c57e04efc7eccd8d4d349c6fbd8d28665d1cef396d5 -size 2153480 diff --git a/voices/eng/adult/male/BryanCranston_24khz.wav 
b/voices/eng/adult/male/BryanCranston_24khz.wav deleted file mode 100644 index ba5970bd652e8891ee6c325c4f56d025d3754c73..0000000000000000000000000000000000000000 --- a/voices/eng/adult/male/BryanCranston_24khz.wav +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c6692327d48ea479e12ea6b1c43babb2325c0a9f3f1952054c2af35f4aab8b76 -size 2343918 diff --git a/voices/eng/adult/male/Curt_16khz.wav b/voices/eng/adult/male/Curt_16khz.wav deleted file mode 100644 index e80de4290d9d90c803be0c2d77d8a91c0e278151..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/Curt_16khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/Curt_24khz.wav b/voices/eng/adult/male/Curt_24khz.wav deleted file mode 100644 index 3f64ecd450b4f91fcabb9011639590e4e5b69d1d..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/Curt_24khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/DeathPuss&Boots_16khz.wav b/voices/eng/adult/male/DeathPuss&Boots_16khz.wav deleted file mode 100644 index ccb483a01180c6f8f61d86679c2923b6b5b3a20c..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/DeathPuss&Boots_16khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/DeathPuss&Boots_22khz.wav b/voices/eng/adult/male/DeathPuss&Boots_22khz.wav deleted file mode 100644 index 735c556ea616597fcdb3e28e0ab5b7267ebcd881..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/DeathPuss&Boots_22khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/DeathPuss&Boots_24khz.wav b/voices/eng/adult/male/DeathPuss&Boots_24khz.wav deleted file mode 100644 index 4a65f470a281c5fb01bc57d0f2cae16da5eb54d0..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/DeathPuss&Boots_24khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/GhostMW2_16khz.wav b/voices/eng/adult/male/GhostMW2_16khz.wav deleted file mode 100644 index fbde7bb8058a0b136f756c056ac6f707ea040149..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/GhostMW2_16khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/GhostMW2_24khz.wav b/voices/eng/adult/male/GhostMW2_24khz.wav deleted file mode 100644 index 47d8cdae7323a7f19098354f75d7e8017374872d..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/GhostMW2_24khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/Ghost_MW2_22khz.wav b/voices/eng/adult/male/Ghost_MW2_22khz.wav deleted file mode 100644 index 55476804065ee88a8751859a61cfbbb895f3d8d9..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/Ghost_MW2_22khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/Ghost_MW2_24khz.wav b/voices/eng/adult/male/Ghost_MW2_24khz.wav deleted file mode 100644 index 47d8cdae7323a7f19098354f75d7e8017374872d..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/Ghost_MW2_24khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/JhonMulaney_16khz.wav b/voices/eng/adult/male/JhonMulaney_16khz.wav deleted file mode 100644 index fafafcb2b5fb4514cee9f2285d22e1cf432a5533..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/JhonMulaney_16khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/JhonMulaney_22khz.wav b/voices/eng/adult/male/JhonMulaney_22khz.wav deleted file mode 100644 index 10881bc137ff9c7caf6a90f63dd4422913e29672..0000000000000000000000000000000000000000 Binary files 
a/voices/eng/adult/male/JhonMulaney_22khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/JhonMulaney_24khz.wav b/voices/eng/adult/male/JhonMulaney_24khz.wav deleted file mode 100644 index d67ff5281a85f00e7df6a3677b5cf0ad0f69193f..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/JhonMulaney_24khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/MorganFreeman_16khz.wav b/voices/eng/adult/male/MorganFreeman_16khz.wav deleted file mode 100644 index a56c76466394a7cdb7a31e4cbb24af1112feb9b6..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/MorganFreeman_16khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/MorganFreeman_24khz.wav b/voices/eng/adult/male/MorganFreeman_24khz.wav deleted file mode 100644 index a0ff4ee803d69b7083dad9951ff0a695c62823be..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/MorganFreeman_24khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/Morgan_Freeman_22khz.wav b/voices/eng/adult/male/Morgan_Freeman_22khz.wav deleted file mode 100644 index e12e0fc531e77a6b3831bd2d9c95b13e22d996e3..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/Morgan_Freeman_22khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/Morgan_Freeman_24khz.wav b/voices/eng/adult/male/Morgan_Freeman_24khz.wav deleted file mode 100644 index a0ff4ee803d69b7083dad9951ff0a695c62823be..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/Morgan_Freeman_24khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/WhisperSalemASMR_16khz.wav b/voices/eng/adult/male/WhisperSalemASMR_16khz.wav deleted file mode 100644 index 4376c6b84d7c023ce85bc5fa0fe83e7b39136697..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/WhisperSalemASMR_16khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/WhisperSalemASMR_22khz.wav b/voices/eng/adult/male/WhisperSalemASMR_22khz.wav deleted file mode 100644 index 384443d1e6333311fc3d7609372e1c4dd5e886b6..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/WhisperSalemASMR_22khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/WhisperSalemASMR_24khz.wav b/voices/eng/adult/male/WhisperSalemASMR_24khz.wav deleted file mode 100644 index ec71a3af43842cfbee0f57edfd3d357880423533..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/WhisperSalemASMR_24khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/ai_explained_22khz.wav b/voices/eng/adult/male/ai_explained_22khz.wav deleted file mode 100644 index 808351e4bb7738beb0475b7f00f21097a307382c..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/ai_explained_22khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/ai_explained_24khz.wav b/voices/eng/adult/male/ai_explained_24khz.wav deleted file mode 100644 index ce87d9cc18ed45946ca67c7082834894b0060f45..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/ai_explained_24khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/curt_22khz.wav b/voices/eng/adult/male/curt_22khz.wav deleted file mode 100644 index 99837119a81fa105bcca47dc79ecbf59ed6865bd..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/curt_22khz.wav and /dev/null differ diff --git a/voices/eng/adult/male/curt_24khz.wav b/voices/eng/adult/male/curt_24khz.wav deleted file mode 100644 index 
3f64ecd450b4f91fcabb9011639590e4e5b69d1d..0000000000000000000000000000000000000000 Binary files a/voices/eng/adult/male/curt_24khz.wav and /dev/null differ diff --git a/voices/eng/elder/male/DavidAttenborough_16khz.wav b/voices/eng/elder/male/DavidAttenborough_16khz.wav deleted file mode 100644 index 069b046b5659b5d03998b704abe53cf7cfa93598..0000000000000000000000000000000000000000 Binary files a/voices/eng/elder/male/DavidAttenborough_16khz.wav and /dev/null differ diff --git a/voices/eng/elder/male/DavidAttenborough_22khz.wav b/voices/eng/elder/male/DavidAttenborough_22khz.wav deleted file mode 100644 index e57a9b40ddf23da7229c66b79390864a4795c2f8..0000000000000000000000000000000000000000 --- a/voices/eng/elder/male/DavidAttenborough_22khz.wav +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:197f043744f728545d47d18662694e82b40ea124476c7e4c78a2e54aecf3b204 -size 1054068 diff --git a/voices/eng/elder/male/DavidAttenborough_24khz.wav b/voices/eng/elder/male/DavidAttenborough_24khz.wav deleted file mode 100644 index 4044945db4e363dc3fc6d0ba744bcaac0c334269..0000000000000000000000000000000000000000 --- a/voices/eng/elder/male/DavidAttenborough_24khz.wav +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:066ff87be82723a2aeef01da26357ab2746ce5e7b863a62479b37255404dc093 -size 1147278 diff --git a/voices/eng/elder/male/JhonButlerASMR_16khz.wav b/voices/eng/elder/male/JhonButlerASMR_16khz.wav deleted file mode 100644 index c5a03c588d3ab042324911287ab440a46ad5e9cd..0000000000000000000000000000000000000000 Binary files a/voices/eng/elder/male/JhonButlerASMR_16khz.wav and /dev/null differ diff --git a/voices/eng/elder/male/JhonButlerASMR_22khz.wav b/voices/eng/elder/male/JhonButlerASMR_22khz.wav deleted file mode 100644 index 2b9538c51efd998ce5d9a71ba4681fbd84e33777..0000000000000000000000000000000000000000 --- a/voices/eng/elder/male/JhonButlerASMR_22khz.wav +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:771650b040d485e0687fe40379472d27c4a5c5ef154acfc8f0ae696bba48df7a -size 1063770 diff --git a/voices/eng/elder/male/JhonButlerASMR_24khz.wav b/voices/eng/elder/male/JhonButlerASMR_24khz.wav deleted file mode 100644 index 8a048d5bcd732c75ac149df2ba7c285f8f0749c7..0000000000000000000000000000000000000000 --- a/voices/eng/elder/male/JhonButlerASMR_24khz.wav +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fbe36977362327bc9fd04cbbd14e40b9a79c9740f8dda4eb5bbdbe444b9f98b1 -size 1157838 diff --git a/voices/eng/elder/male/RainyDayHeadSpace_16khz.wav b/voices/eng/elder/male/RainyDayHeadSpace_16khz.wav deleted file mode 100644 index ccc73fdba066e08147aa2067aac094cf6ce8f608..0000000000000000000000000000000000000000 Binary files a/voices/eng/elder/male/RainyDayHeadSpace_16khz.wav and /dev/null differ diff --git a/voices/eng/elder/male/RainyDayHeadSpace_24khz.wav b/voices/eng/elder/male/RainyDayHeadSpace_24khz.wav deleted file mode 100644 index dad5979bbb8b9a91c744f8c4d1dc921bf26bc7a4..0000000000000000000000000000000000000000 Binary files a/voices/eng/elder/male/RainyDayHeadSpace_24khz.wav and /dev/null differ diff --git a/voices/eng/elder/male/RainyDay_HeadSpace_22khz.wav b/voices/eng/elder/male/RainyDay_HeadSpace_22khz.wav deleted file mode 100644 index 3a746356b8c8ad00e5ceeb2635a35b5a0f3470fd..0000000000000000000000000000000000000000 Binary files a/voices/eng/elder/male/RainyDay_HeadSpace_22khz.wav and /dev/null differ diff --git 
a/voices/eng/elder/male/RainyDay_HeadSpace_24khz.wav b/voices/eng/elder/male/RainyDay_HeadSpace_24khz.wav deleted file mode 100644 index dad5979bbb8b9a91c744f8c4d1dc921bf26bc7a4..0000000000000000000000000000000000000000 Binary files a/voices/eng/elder/male/RainyDay_HeadSpace_24khz.wav and /dev/null differ diff --git a/voices/fra/adult/female/beatrice_16khz.wav b/voices/fra/adult/female/beatrice_16khz.wav deleted file mode 100644 index 0d4e241295be7741d30b7db847e32edcd7e91b29..0000000000000000000000000000000000000000 Binary files a/voices/fra/adult/female/beatrice_16khz.wav and /dev/null differ diff --git a/voices/fra/adult/female/beatrice_22khz.wav b/voices/fra/adult/female/beatrice_22khz.wav deleted file mode 100644 index 6134bfb5d1f595a3cc1a1bf41d2d561a50195484..0000000000000000000000000000000000000000 Binary files a/voices/fra/adult/female/beatrice_22khz.wav and /dev/null differ diff --git a/voices/fra/adult/female/beatrice_24khz.wav b/voices/fra/adult/female/beatrice_24khz.wav deleted file mode 100644 index 26ecab5abd697ad8d13721f8ece9a9438fa701c9..0000000000000000000000000000000000000000 Binary files a/voices/fra/adult/female/beatrice_24khz.wav and /dev/null differ diff --git a/voices/fra/adult/male/andre_16khz.wav b/voices/fra/adult/male/andre_16khz.wav deleted file mode 100644 index 6e7ba49da8d014c3f5a71e4a52ffceb2b20bdcbe..0000000000000000000000000000000000000000 Binary files a/voices/fra/adult/male/andre_16khz.wav and /dev/null differ diff --git a/voices/fra/adult/male/andre_22khz.wav b/voices/fra/adult/male/andre_22khz.wav deleted file mode 100644 index 4370c8a96dc9d42eb1971a56bc78cb5569bf29b6..0000000000000000000000000000000000000000 Binary files a/voices/fra/adult/male/andre_22khz.wav and /dev/null differ diff --git a/voices/fra/adult/male/andre_24khz.wav b/voices/fra/adult/male/andre_24khz.wav deleted file mode 100644 index 463b4a50e558f5cf275db0aecb7586c1f3fc7a52..0000000000000000000000000000000000000000 Binary files a/voices/fra/adult/male/andre_24khz.wav and /dev/null differ diff --git a/voices/fra/elder/male/default_voice.wav b/voices/fra/elder/male/default_voice.wav deleted file mode 100644 index 2b7a9d258d8c6a6cbf48db3d71352d828b58f559..0000000000000000000000000000000000000000 --- a/voices/fra/elder/male/default_voice.wav +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9d6a53b631748766bae3197b0452005724c91281bc142d1f0f0639ece5d53eb3 -size 1702860 diff --git a/voices/fra/teen/male/default_voice.wav b/voices/fra/teen/male/default_voice.wav deleted file mode 100644 index a3266ece3fc729c209e8b690d0182b50f38f6d86..0000000000000000000000000000000000000000 Binary files a/voices/fra/teen/male/default_voice.wav and /dev/null differ