Spaces:
Build error
Build error
Added function to replace numerals with text equivalent
Browse files- notebooks/audiobook_gen_silero.ipynb +79 -14
- requirements.txt +1 -0
- src/file_readers.py +58 -0
notebooks/audiobook_gen_silero.ipynb
CHANGED
@@ -45,7 +45,7 @@
|
|
45 |
},
|
46 |
{
|
47 |
"cell_type": "code",
|
48 |
-
"execution_count":
|
49 |
"metadata": {},
|
50 |
"outputs": [],
|
51 |
"source": [
|
@@ -79,7 +79,7 @@
|
|
79 |
},
|
80 |
{
|
81 |
"cell_type": "code",
|
82 |
-
"execution_count":
|
83 |
"metadata": {},
|
84 |
"outputs": [],
|
85 |
"source": [
|
@@ -96,9 +96,17 @@
|
|
96 |
},
|
97 |
{
|
98 |
"cell_type": "code",
|
99 |
-
"execution_count":
|
100 |
"metadata": {},
|
101 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
"source": [
|
103 |
"language = 'en'\n",
|
104 |
"model_id = 'v3_en'\n",
|
@@ -122,7 +130,7 @@
|
|
122 |
},
|
123 |
{
|
124 |
"cell_type": "code",
|
125 |
-
"execution_count":
|
126 |
"metadata": {},
|
127 |
"outputs": [],
|
128 |
"source": [
|
@@ -144,7 +152,7 @@
|
|
144 |
},
|
145 |
{
|
146 |
"cell_type": "code",
|
147 |
-
"execution_count":
|
148 |
"metadata": {},
|
149 |
"outputs": [],
|
150 |
"source": [
|
@@ -159,12 +167,14 @@
|
|
159 |
"\n",
|
160 |
" download('punkt')\n",
|
161 |
" wrapper = TextWrapper(max_char_len, fix_sentence_endings=True)\n",
|
|
|
|
|
162 |
"\n",
|
163 |
" book = epub.read_epub(ebook_path)\n",
|
164 |
"\n",
|
165 |
" ebook_title = book.get_metadata('DC', 'title')[0][0]\n",
|
166 |
" ebook_title = ebook_title.lower().replace(' ', '_')\n",
|
167 |
-
"\n",
|
168 |
" corpus = []\n",
|
169 |
" for item in tqdm(list(book.get_items())):\n",
|
170 |
" if item.get_type() == ebooklib.ITEM_DOCUMENT:\n",
|
@@ -198,9 +208,32 @@
|
|
198 |
},
|
199 |
{
|
200 |
"cell_type": "code",
|
201 |
-
"execution_count":
|
202 |
"metadata": {},
|
203 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
"source": [
|
205 |
"ebook, title = read_ebook(ebook_path)"
|
206 |
]
|
@@ -214,9 +247,22 @@
|
|
214 |
},
|
215 |
{
|
216 |
"cell_type": "code",
|
217 |
-
"execution_count":
|
218 |
"metadata": {},
|
219 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
"source": [
|
221 |
"print(f'Title of ebook (path name):{title}\\n')\n",
|
222 |
"print(f'First line of the ebook:{ebook[0][0]}\\n')\n",
|
@@ -225,11 +271,30 @@
|
|
225 |
},
|
226 |
{
|
227 |
"cell_type": "code",
|
228 |
-
"execution_count":
|
229 |
"metadata": {},
|
230 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
"source": [
|
232 |
-
"ebook[0:
|
233 |
]
|
234 |
},
|
235 |
{
|
|
|
45 |
},
|
46 |
{
|
47 |
"cell_type": "code",
|
48 |
+
"execution_count": 1,
|
49 |
"metadata": {},
|
50 |
"outputs": [],
|
51 |
"source": [
|
|
|
79 |
},
|
80 |
{
|
81 |
"cell_type": "code",
|
82 |
+
"execution_count": 2,
|
83 |
"metadata": {},
|
84 |
"outputs": [],
|
85 |
"source": [
|
|
|
96 |
},
|
97 |
{
|
98 |
"cell_type": "code",
|
99 |
+
"execution_count": 3,
|
100 |
"metadata": {},
|
101 |
+
"outputs": [
|
102 |
+
{
|
103 |
+
"name": "stderr",
|
104 |
+
"output_type": "stream",
|
105 |
+
"text": [
|
106 |
+
"Using cache found in /home/mkutarna/.cache/torch/hub/snakers4_silero-models_master\n"
|
107 |
+
]
|
108 |
+
}
|
109 |
+
],
|
110 |
"source": [
|
111 |
"language = 'en'\n",
|
112 |
"model_id = 'v3_en'\n",
|
|
|
130 |
},
|
131 |
{
|
132 |
"cell_type": "code",
|
133 |
+
"execution_count": 4,
|
134 |
"metadata": {},
|
135 |
"outputs": [],
|
136 |
"source": [
|
|
|
152 |
},
|
153 |
{
|
154 |
"cell_type": "code",
|
155 |
+
"execution_count": 8,
|
156 |
"metadata": {},
|
157 |
"outputs": [],
|
158 |
"source": [
|
|
|
167 |
"\n",
|
168 |
" download('punkt')\n",
|
169 |
" wrapper = TextWrapper(max_char_len, fix_sentence_endings=True)\n",
|
170 |
+
" \n",
|
171 |
+
" p = inflect.engine()\n",
|
172 |
"\n",
|
173 |
" book = epub.read_epub(ebook_path)\n",
|
174 |
"\n",
|
175 |
" ebook_title = book.get_metadata('DC', 'title')[0][0]\n",
|
176 |
" ebook_title = ebook_title.lower().replace(' ', '_')\n",
|
177 |
+
" \n",
|
178 |
" corpus = []\n",
|
179 |
" for item in tqdm(list(book.get_items())):\n",
|
180 |
" if item.get_type() == ebooklib.ITEM_DOCUMENT:\n",
|
|
|
208 |
},
|
209 |
{
|
210 |
"cell_type": "code",
|
211 |
+
"execution_count": 9,
|
212 |
"metadata": {},
|
213 |
+
"outputs": [
|
214 |
+
{
|
215 |
+
"name": "stderr",
|
216 |
+
"output_type": "stream",
|
217 |
+
"text": [
|
218 |
+
"[nltk_data] Downloading package punkt to /home/mkutarna/nltk_data...\n",
|
219 |
+
"[nltk_data] Package punkt is already up-to-date!\n"
|
220 |
+
]
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"data": {
|
224 |
+
"application/vnd.jupyter.widget-view+json": {
|
225 |
+
"model_id": "cb413d9ca55b4607924cc598502c00fe",
|
226 |
+
"version_major": 2,
|
227 |
+
"version_minor": 0
|
228 |
+
},
|
229 |
+
"text/plain": [
|
230 |
+
" 0%| | 0/11 [00:00<?, ?it/s]"
|
231 |
+
]
|
232 |
+
},
|
233 |
+
"metadata": {},
|
234 |
+
"output_type": "display_data"
|
235 |
+
}
|
236 |
+
],
|
237 |
"source": [
|
238 |
"ebook, title = read_ebook(ebook_path)"
|
239 |
]
|
|
|
247 |
},
|
248 |
{
|
249 |
"cell_type": "code",
|
250 |
+
"execution_count": 10,
|
251 |
"metadata": {},
|
252 |
+
"outputs": [
|
253 |
+
{
|
254 |
+
"name": "stdout",
|
255 |
+
"output_type": "stream",
|
256 |
+
"text": [
|
257 |
+
"Title of ebook (path name):the_picture_of_dorian_gray\n",
|
258 |
+
"\n",
|
259 |
+
"First line of the ebook:The Project Gutenberg eBook of The Picture of Dorian Gray, by Oscar Wilde\n",
|
260 |
+
"\n",
|
261 |
+
"First paragraph (truncated for display): \n",
|
262 |
+
" ['CHAPTER I.', 'The studio was filled with the rich odour of roses, and when the light summer wind stirred amidst the trees of the garden, there came', 'through the open door the heavy scent of the lilac, or the more delicate perfume of the pink-flowering thorn.', 'From the corner of the divan of Persian saddle-bags on which he was lying, smoking, as was his custom, innumerable cigarettes, Lord Henry', 'Wotton could just catch the gleam of the honey-sweet and honey-coloured blossoms of a laburnum, whose tremulous branches seemed hardly able']\n"
|
263 |
+
]
|
264 |
+
}
|
265 |
+
],
|
266 |
"source": [
|
267 |
"print(f'Title of ebook (path name):{title}\\n')\n",
|
268 |
"print(f'First line of the ebook:{ebook[0][0]}\\n')\n",
|
|
|
271 |
},
|
272 |
{
|
273 |
"cell_type": "code",
|
274 |
+
"execution_count": 11,
|
275 |
"metadata": {},
|
276 |
+
"outputs": [
|
277 |
+
{
|
278 |
+
"data": {
|
279 |
+
"text/plain": [
|
280 |
+
"['The Project Gutenberg eBook of The Picture of Dorian Gray, by Oscar Wilde',\n",
|
281 |
+
" 'This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no',\n",
|
282 |
+
" 'restrictions whatsoever.',\n",
|
283 |
+
" 'You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at',\n",
|
284 |
+
" 'www.gutenberg.org.',\n",
|
285 |
+
" 'If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.',\n",
|
286 |
+
" 'Title: The Picture of Dorian Gray',\n",
|
287 |
+
" 'Author: Oscar Wilde',\n",
|
288 |
+
" 'Release Date: October, one thousand, nine hundred and ninety-four [eBook #one hundred and seventy-four]']"
|
289 |
+
]
|
290 |
+
},
|
291 |
+
"execution_count": 11,
|
292 |
+
"metadata": {},
|
293 |
+
"output_type": "execute_result"
|
294 |
+
}
|
295 |
+
],
|
296 |
"source": [
|
297 |
+
"ebook[0][:9]"
|
298 |
]
|
299 |
},
|
300 |
{
|
requirements.txt
CHANGED
@@ -3,5 +3,6 @@ streamlit
|
|
3 |
ebooklib
|
4 |
PyPDF2
|
5 |
bs4
|
|
|
6 |
nltk
|
7 |
stqdm
|
|
|
3 |
ebooklib
|
4 |
PyPDF2
|
5 |
bs4
|
6 |
+
inflect
|
7 |
nltk
|
8 |
stqdm
|
src/file_readers.py
CHANGED
@@ -49,6 +49,8 @@ def preprocess_text(file):
|
|
49 |
|
50 |
sentence_list = []
|
51 |
for sentence in sentences:
|
|
|
|
|
52 |
if not re.search('[a-zA-Z]', sentence):
|
53 |
sentence = ''
|
54 |
wrapped_sentences = wrapper.wrap(sentence)
|
@@ -59,6 +61,62 @@ def preprocess_text(file):
|
|
59 |
return text_list
|
60 |
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
def read_pdf(file):
|
63 |
"""
|
64 |
Invokes PyPDF2 PdfReader to extract main body text from PDF file_like input,
|
|
|
49 |
|
50 |
sentence_list = []
|
51 |
for sentence in sentences:
|
52 |
+
if any(chr.isdigit() for chr in sentence):
|
53 |
+
sentence = extract_replace(sentence)
|
54 |
if not re.search('[a-zA-Z]', sentence):
|
55 |
sentence = ''
|
56 |
wrapped_sentences = wrapper.wrap(sentence)
|
|
|
61 |
return text_list
|
62 |
|
63 |
|
64 |
+
def extract_replace(entry_string):
    """
    Replaces every number written in digits with its spelled-out English
    equivalent (e.g. '174' -> 'one hundred and seventy-four').

    Number boundaries come from find_num_index, which returns a flat list of
    inclusive [first, last, first, last, ...] indices. Each located span is
    passed to inflect's number_to_words and substituted back into the text.

    Parameters
    ----------
    entry_string : str
        Sentence that may contain numbers written in digits.

    Returns
    -------
    str
        The sentence with digit runs replaced by words. A string without any
        digit characters is returned unchanged.
    """
    # Nothing to convert: return early instead of indexing an empty span list
    # (which previously raised IndexError).
    if not any(char.isdigit() for char in entry_string):
        return entry_string

    # Imported lazily so the dependency is only needed when there is work to do.
    import inflect

    engine = inflect.engine()
    spans = find_num_index(entry_string)

    # Spans are located once on the untouched input, then the output is
    # assembled left to right, so no index shifts while replacing.
    pieces = []
    cursor = 0
    for first, last in zip(spans[0::2], spans[1::2]):
        pieces.append(entry_string[cursor:first])
        pieces.append(engine.number_to_words(entry_string[first:last + 1]))
        cursor = last + 1
    pieces.append(entry_string[cursor:])

    return ''.join(pieces)
|
88 |
+
|
89 |
+
|
90 |
+
def find_num_index(entry_string):
    """
    Locates every number written in digits inside a string.

    Digit characters whose positions differ by at most 2 are grouped into the
    same number, so a single separator between digits (as in '1,000' or '3.5')
    does not split the number. Note that the same rule also merges '1 2' into
    one span.

    Parameters
    ----------
    entry_string : str
        Text to scan for digit characters.

    Returns
    -------
    list of int
        Flat, even-length list [first_0, last_0, first_1, last_1, ...] holding
        the inclusive first and last index of each number, in order of
        appearance. Empty list when the string contains no digits.
    """
    digit_positions = [
        position for position, char in enumerate(entry_string) if char.isdigit()
    ]

    # No digits at all: report "no numbers" as an empty list rather than a
    # string sentinel, so the return type is always a list of ints.
    if not digit_positions:
        return []

    boundaries = [digit_positions[0]]

    # A gap of more than 2 between consecutive digit positions closes the
    # current number and opens the next one.
    for previous, current in zip(digit_positions, digit_positions[1:]):
        if current - previous > 2:
            boundaries.append(previous)
            boundaries.append(current)

    boundaries.append(digit_positions[-1])

    return boundaries
|
118 |
+
|
119 |
+
|
120 |
def read_pdf(file):
|
121 |
"""
|
122 |
Invokes PyPDF2 PdfReader to extract main body text from PDF file_like input,
|