{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "5292a160", "metadata": {}, "outputs": [], "source": [ "import re\n", "import numpy as np\n", "\n", "from bs4 import BeautifulSoup\n", "from nltk import tokenize, download\n", "from textwrap import TextWrapper" ] }, { "cell_type": "code", "execution_count": 3, "id": "68609a77", "metadata": {}, "outputs": [], "source": [ "# file_path = '1232-h.htm'\n", "file_path = ''" ] }, { "cell_type": "code", "execution_count": 4, "id": "5c526c9b", "metadata": {}, "outputs": [], "source": [ "download('punkt', quiet=True)\n", "wrapper = TextWrapper(140, fix_sentence_endings=True)" ] }, { "cell_type": "code", "execution_count": 5, "id": "d4732304", "metadata": {}, "outputs": [], "source": [ "def preprocess(file):\n", " input_text = BeautifulSoup(file, \"html.parser\").text\n", " text_list = []\n", " for paragraph in input_text.split('\\n'):\n", " paragraph = paragraph.replace('—', '-')\n", " paragraph = paragraph.replace(' .', '')\n", " paragraph = re.sub(r'[^\\x00-\\x7f]', \"\", paragraph)\n", " paragraph = re.sub(r'x0f', \" \", paragraph)\n", " sentences = tokenize.sent_tokenize(paragraph)\n", "\n", " sentence_list = []\n", " for sentence in sentences:\n", " if not re.search('[a-zA-Z]', sentence):\n", " sentence = ''\n", " wrapped_sentences = wrapper.wrap(sentence)\n", " sentence_list.append(wrapped_sentences)\n", " trunc_sentences = [phrase for sublist in sentence_list for phrase in sublist]\n", " text_list.append(trunc_sentences)\n", " text_list = [text for sentences in text_list for text in sentences]\n", " return text_list" ] }, { "cell_type": "code", "execution_count": null, "id": "3045665a", "metadata": {}, "outputs": [], "source": [ "def read_html(file):\n", " corpus = preprocess(file)\n", " return corpus" ] }, { "cell_type": "code", "execution_count": null, "id": "e18be118", "metadata": {}, "outputs": [], "source": [ "with open(file_path, 'r') as f:\n", " ebook_upload = f.read()\n", "corpus = read_html(ebook_upload)" ] }, { "cell_type": "code", "execution_count": 11, "id": "ece1c7d3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1, 2)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.shape(corpus)" ] }, { "cell_type": "code", "execution_count": 12, "id": "dc7e4010", "metadata": {}, "outputs": [ { "ename": "IndexError", "evalue": "list index out of range", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn [12], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mcorpus\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m]\u001b[49m\n", "\u001b[0;31mIndexError\u001b[0m: list index out of range" ] } ], "source": [ "corpus[0][2]" ] }, { "cell_type": "code", "execution_count": 13, "id": "6cb47a2d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['Predict Testing Text File',\n", " 'Audiobook Gen is a tool that allows the users to generate an audio file from an ebook or other document.']]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "corpus" ] }, { "cell_type": "code", "execution_count": null, "id": "8508b073", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "d11031c7", "metadata": {}, 
"outputs": [], "source": [ "assert title == \"1232-h\"\n", "assert np.shape(corpus) == (1, 5476)\n", "assert corpus[0][0] == 'The Project Gutenberg eBook of The Prince, by Nicolo Machiavelli'\n", "assert corpus[0][2] == 'This eBook is for the use of anyone anywhere in the United States and'" ] }, { "cell_type": "code", "execution_count": null, "id": "0c57eec6", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 2, "id": "af281267", "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "from bs4 import BeautifulSoup\n", "from nltk import tokenize, download\n", "from textwrap import TextWrapper\n", "from stqdm import stqdm" ] }, { "cell_type": "code", "execution_count": 6, "id": "676ce437", "metadata": {}, "outputs": [], "source": [ "download('punkt', quiet=True)\n", "wrapper = TextWrapper(140, fix_sentence_endings=True)\n", "file_path = 'test.txt'" ] }, { "cell_type": "code", "execution_count": 7, "id": "4d278f8e", "metadata": {}, "outputs": [], "source": [ "def preprocess_text(file):\n", " input_text = BeautifulSoup(file, \"html.parser\").text\n", " text_list = []\n", " for paragraph in input_text.split('\\n'):\n", " paragraph = paragraph.replace('—', '-')\n", " paragraph = paragraph.replace(' .', '')\n", " paragraph = re.sub(r'[^\\x00-\\x7f]', \"\", paragraph)\n", " paragraph = re.sub(r'x0f', \" \", paragraph)\n", " sentences = tokenize.sent_tokenize(paragraph)\n", "\n", " sentence_list = []\n", " for sentence in sentences:\n", " if not re.search('[a-zA-Z]', sentence):\n", " sentence = ''\n", " wrapped_sentences = wrapper.wrap(sentence)\n", " sentence_list.append(wrapped_sentences)\n", " trunc_sentences = [phrase for sublist in sentence_list for phrase in sublist]\n", " text_list.append(trunc_sentences)\n", " text_list = [text for sentences in text_list for text in sentences]\n", " return text_list" ] }, { "cell_type": "code", "execution_count": 8, "id": "f67e0184", "metadata": {}, "outputs": [], "source": [ "with open(file_path, 'r') as uploaded_file:\n", " file = uploaded_file.read()\n", " text = preprocess_text(file)" ] }, { "cell_type": "code", "execution_count": 10, "id": "0bd67797", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Testing Text File \\n\\nWith generated random Lorem Ipsum and other unexpected characters!\\n\\nLink to generator repo!\\n\\n此行是对非英语字符的测试\\n\\nLorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Proin fermentum leo vel orci porta non pulvinar. Pretium lectus quam id leo in vitae turpis massa sed. Donec ac odio tempor orci dapibus. Feugiat in ante metus dictum at tempor. Elementum tempus egestas sed sed risus. Adipiscing commodo elit at imperdiet dui accumsan sit. Placerat orci nulla pellentesque dignissim enim. Posuere lorem ipsum dolor sit. Id ornare arcu odio ut sem. Purus faucibus ornare suspendisse sed nisi lacus sed. Ac turpis egestas sed tempus urna et pharetra pharetra massa. Morbi quis commodo odio aenean. Malesuada proin libero nunc consequat interdum. Ut placerat orci nulla pellentesque dignissim enim sit. Elit at imperdiet dui accumsan sit amet.\\n\\nBuilt to test various characters and other possible inputs to the silero model.\\n\\nHere are some Chinese characters: 此行是对非英语字符的测试.\\n\\nThere are 24 letters in the Greek alphabet. The vowels: are α, ε, η, ι, ο, ω, υ. 
All the rest are consonants.\\n\\nWe can also test for mathematical symbols: ∫, ∇, ∞, δ, ε, X̄, %, √ ,a, ±, ÷, +, = ,-.\\n\\nFinally, here are some emoticons: ☺️🙂😊😀😁☹️🙁😞😟😣😖😨😧😦😱😫😩.'" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "file" ] }, { "cell_type": "code", "execution_count": 9, "id": "064aa16b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Testing Text File',\n", " 'With generated random Lorem Ipsum and other unexpected characters!',\n", " 'Link to generator repo!',\n", " 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.',\n", " 'Proin fermentum leo vel orci porta non pulvinar.',\n", " 'Pretium lectus quam id leo in vitae turpis massa sed.',\n", " 'Donec ac odio tempor orci dapibus.',\n", " 'Feugiat in ante metus dictum at tempor.',\n", " 'Elementum tempus egestas sed sed risus.',\n", " 'Adipiscing commodo elit at imperdiet dui accumsan sit.',\n", " 'Placerat orci nulla pellentesque dignissim enim.',\n", " 'Posuere lorem ipsum dolor sit.',\n", " 'Id ornare arcu odio ut sem.',\n", " 'Purus faucibus ornare suspendisse sed nisi lacus sed.',\n", " 'Ac turpis egestas sed tempus urna et pharetra pharetra massa.',\n", " 'Morbi quis commodo odio aenean.',\n", " 'Malesuada proin libero nunc consequat interdum.',\n", " 'Ut placerat orci nulla pellentesque dignissim enim sit.',\n", " 'Elit at imperdiet dui accumsan sit amet.',\n", " 'Built to test various characters and other possible inputs to the silero model.',\n", " 'Here are some Chinese characters: .',\n", " 'There are 24 letters in the Greek alphabet.',\n", " 'The vowels: are , , , , , , .',\n", " 'All the rest are consonants.',\n", " 'We can also test for mathematical symbols: , , , , , X, %, ,a, , , +, = ,-.',\n", " 'Finally, here are some emoticons: .']" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text" ] }, { "cell_type": "code", "execution_count": 22, "id": "3e8e7965", "metadata": {}, "outputs": [], "source": [ "with open('test_processed.txt', 'w') as output_file:\n", " for line in text:\n", " output_file.write(line)\n", " output_file.write('\\n')" ] }, { "cell_type": "code", "execution_count": 26, "id": "2aa4c8ff", "metadata": {}, "outputs": [], "source": [ "with open('test_processed.txt', 'r') as process_file:\n", " out_file = [line.strip() for line in process_file.readlines()]" ] }, { "cell_type": "code", "execution_count": 27, "id": "c483fb65", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Testing Text File',\n", " 'With generated random Lorem Ipsum and other unexpected characters!',\n", " 'Link to generator repo!',\n", " 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.',\n", " 'Proin fermentum leo vel orci porta non pulvinar.',\n", " 'Pretium lectus quam id leo in vitae turpis massa sed.',\n", " 'Donec ac odio tempor orci dapibus.',\n", " 'Feugiat in ante metus dictum at tempor.',\n", " 'Elementum tempus egestas sed sed risus.',\n", " 'Adipiscing commodo elit at imperdiet dui accumsan sit.',\n", " 'Placerat orci nulla pellentesque dignissim enim.',\n", " 'Posuere lorem ipsum dolor sit.',\n", " 'Id ornare arcu odio ut sem.',\n", " 'Purus faucibus ornare suspendisse sed nisi lacus sed.',\n", " 'Ac turpis egestas sed tempus urna et pharetra pharetra massa.',\n", " 'Morbi quis commodo odio aenean.',\n", " 'Malesuada proin libero nunc consequat interdum.',\n", " 
'Ut placerat orci nulla pellentesque dignissim enim sit.',\n", " 'Elit at imperdiet dui accumsan sit amet.',\n", " 'Built to test various characters and other possible inputs to the silero model.',\n", " 'Here are some Chinese characters: .',\n", " 'There are 24 letters in the Greek alphabet.',\n", " 'The vowels: are , , , , , , .',\n", " 'All the rest are consonants.',\n", " 'We can also test for mathematical symbols: , , , , , X, %, ,a, , , +, = ,-.',\n", " 'Finally, here are some emoticons: .']" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "out_file" ] }, { "cell_type": "code", "execution_count": null, "id": "65646961", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }