Spaces:

mkutarna
/

audiobook_gen

Build error

App Files Files Community

mkutarna committed on Dec 9, 2022

Commit

985fb81

•

1 Parent(s): ab137d9

Created function to replace symbols with text equivalent

Browse files

Files changed (2) hide show

notebooks/audiobook_gen_silero.ipynb +12 -77
src/file_readers.py +26 -0

notebooks/audiobook_gen_silero.ipynb CHANGED Viewed

@@ -45,7 +45,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -79,7 +79,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -96,17 +96,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using cache found in /home/mkutarna/.cache/torch/hub/snakers4_silero-models_master\n"
-     ]
-    }
-   ],
    "source": [
     "language = 'en'\n",
     "model_id = 'v3_en'\n",
@@ -130,7 +122,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -152,7 +144,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -167,8 +159,6 @@
     "\n",
     "    download('punkt')\n",
     "    wrapper = TextWrapper(max_char_len, fix_sentence_endings=True)\n",
-    "    \n",
-    "    p = inflect.engine()\n",
     "\n",
     "    book = epub.read_epub(ebook_path)\n",
     "\n",
@@ -208,32 +198,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[nltk_data] Downloading package punkt to /home/mkutarna/nltk_data...\n",
-      "[nltk_data]   Package punkt is already up-to-date!\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "cb413d9ca55b4607924cc598502c00fe",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/11 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
    "source": [
     "ebook, title = read_ebook(ebook_path)"
    ]
@@ -247,22 +214,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Title of ebook (path name):the_picture_of_dorian_gray\n",
-      "\n",
-      "First line of the ebook:The Project Gutenberg eBook of The Picture of Dorian Gray, by Oscar Wilde\n",
-      "\n",
-      "First paragraph (truncated for display): \n",
-      " ['CHAPTER I.', 'The studio was filled with the rich odour of roses, and when the light summer wind stirred amidst the trees of the garden, there came', 'through the open door the heavy scent of the lilac, or the more delicate perfume of the pink-flowering thorn.', 'From the corner of the divan of Persian saddle-bags on which he was lying, smoking, as was his custom, innumerable cigarettes, Lord Henry', 'Wotton could just catch the gleam of the honey-sweet and honey-coloured blossoms of a laburnum, whose tremulous branches seemed hardly able']\n"
-     ]
-    }
-   ],
    "source": [
     "print(f'Title of ebook (path name):{title}\\n')\n",
     "print(f'First line of the ebook:{ebook[0][0]}\\n')\n",
@@ -271,28 +225,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['The Project Gutenberg eBook of The Picture of Dorian Gray, by Oscar Wilde',\n",
-       " 'This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no',\n",
-       " 'restrictions whatsoever.',\n",
-       " 'You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at',\n",
-       " 'www.gutenberg.org.',\n",
-       " 'If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.',\n",
-       " 'Title: The Picture of Dorian Gray',\n",
-       " 'Author: Oscar Wilde',\n",
-       " 'Release Date: October, one thousand, nine hundred and ninety-four [eBook #one hundred and seventy-four]']"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "ebook[0][:9]"
    ]

   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "language = 'en'\n",
     "model_id = 'v3_en'\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "\n",
     "    download('punkt')\n",
     "    wrapper = TextWrapper(max_char_len, fix_sentence_endings=True)\n",
     "\n",
     "    book = epub.read_epub(ebook_path)\n",
     "\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "ebook, title = read_ebook(ebook_path)"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "print(f'Title of ebook (path name):{title}\\n')\n",
     "print(f'First line of the ebook:{ebook[0][0]}\\n')\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "ebook[0][:9]"
    ]

src/file_readers.py CHANGED Viewed

@@ -51,6 +51,7 @@ def preprocess_text(file):
         for sentence in sentences:
             if any(chr.isdigit() for chr in sentence):
                 sentence = extract_replace(sentence)
             if not re.search('[a-zA-Z]', sentence):
                 sentence = ''
             wrapped_sentences = wrapper.wrap(sentence)
@@ -117,6 +118,31 @@ def find_num_index(entry_string):
     return result1
 def read_pdf(file):
     """
     Invokes PyPDF2 PdfReader to extract main body text from PDF file_like input,

         for sentence in sentences:
             if any(chr.isdigit() for chr in sentence):
                 sentence = extract_replace(sentence)
+            sentence = replace_symbols(sentence)
             if not re.search('[a-zA-Z]', sentence):
                 sentence = ''
             wrapped_sentences = wrapper.wrap(sentence)
     return result1
def replace_symbols(text):
    """
    Replaces symbols in a string with their spoken-word equivalents, so the
    text-to-speech step receives words instead of bare punctuation.

    Most symbols are replaced wherever they occur. The hyphen ('-') and the
    letter 'x' are only read as the operators "minus" and "times" when neither
    neighbouring character is a letter; this keeps ordinary words ("box"),
    hyphenated words ("honey-sweet"), spelled-out numbers ("ninety-four") and
    double-hyphen dashes ("wait--what") intact.

    Parameters
    ----------
    text : str
        The sentence (or any string) to process.

    Returns
    -------
    text : str
        The input with every matched symbol replaced by its word form. Each
        replacement is padded with a space on both sides.
    """
    import re

    # Symbols that are spelled out unconditionally.
    symbol_map = {
        '+': ' plus ',
        '—': ' dash ',
        '=': ' equals ',
        '≈': ' approximately equal to ',
        '*': ' times ',
        '%': ' percent ',
        '/': ' divided by ',
        '#': ' number ',
        '@': ' at ',
        '&': ' ampersand ',
        '°': ' degrees ',
    }
    # Characters that are usually part of a word and only act as operators
    # when they are not touching a letter (e.g. "3x4", "5 - 3", "-5").
    operator_map = {
        '-': ' minus ',
        'x': ' times ',
    }
    replacements = {**symbol_map, **operator_map}

    # [^\W\d_] matches any Unicode letter; the look-arounds reject a match
    # when a letter sits directly before or after the candidate operator.
    letter = r'[^\W\d_]'
    always = '|'.join(re.escape(key) for key in symbol_map)
    contextual = (
        '(?<!' + letter + ')['
        + ''.join(re.escape(key) for key in operator_map)
        + '](?!' + letter + ')'
    )

    # One combined pattern => a single pass over the text, so inserted words
    # (e.g. the "x" in "approximately") are never re-scanned.
    symbol_regex = re.compile(always + '|' + contextual)
    return symbol_regex.sub(lambda match: replacements[match.group()], text)
 def read_pdf(file):
     """
     Invokes PyPDF2 PdfReader to extract main body text from PDF file_like input,