mkutarna committed on
Commit
ab137d9
1 Parent(s): 82b2b89

Added function to replace numerals with text equivalent

Browse files
notebooks/audiobook_gen_silero.ipynb CHANGED
@@ -45,7 +45,7 @@
45
  },
46
  {
47
  "cell_type": "code",
48
- "execution_count": null,
49
  "metadata": {},
50
  "outputs": [],
51
  "source": [
@@ -79,7 +79,7 @@
79
  },
80
  {
81
  "cell_type": "code",
82
- "execution_count": null,
83
  "metadata": {},
84
  "outputs": [],
85
  "source": [
@@ -96,9 +96,17 @@
96
  },
97
  {
98
  "cell_type": "code",
99
- "execution_count": null,
100
  "metadata": {},
101
- "outputs": [],
 
 
 
 
 
 
 
 
102
  "source": [
103
  "language = 'en'\n",
104
  "model_id = 'v3_en'\n",
@@ -122,7 +130,7 @@
122
  },
123
  {
124
  "cell_type": "code",
125
- "execution_count": null,
126
  "metadata": {},
127
  "outputs": [],
128
  "source": [
@@ -144,7 +152,7 @@
144
  },
145
  {
146
  "cell_type": "code",
147
- "execution_count": null,
148
  "metadata": {},
149
  "outputs": [],
150
  "source": [
@@ -159,12 +167,14 @@
159
  "\n",
160
  " download('punkt')\n",
161
  " wrapper = TextWrapper(max_char_len, fix_sentence_endings=True)\n",
 
 
162
  "\n",
163
  " book = epub.read_epub(ebook_path)\n",
164
  "\n",
165
  " ebook_title = book.get_metadata('DC', 'title')[0][0]\n",
166
  " ebook_title = ebook_title.lower().replace(' ', '_')\n",
167
- "\n",
168
  " corpus = []\n",
169
  " for item in tqdm(list(book.get_items())):\n",
170
  " if item.get_type() == ebooklib.ITEM_DOCUMENT:\n",
@@ -198,9 +208,32 @@
198
  },
199
  {
200
  "cell_type": "code",
201
- "execution_count": null,
202
  "metadata": {},
203
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  "source": [
205
  "ebook, title = read_ebook(ebook_path)"
206
  ]
@@ -214,9 +247,22 @@
214
  },
215
  {
216
  "cell_type": "code",
217
- "execution_count": null,
218
  "metadata": {},
219
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  "source": [
221
  "print(f'Title of ebook (path name):{title}\\n')\n",
222
  "print(f'First line of the ebook:{ebook[0][0]}\\n')\n",
@@ -225,11 +271,30 @@
225
  },
226
  {
227
  "cell_type": "code",
228
- "execution_count": null,
229
  "metadata": {},
230
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  "source": [
232
- "ebook[0:3]"
233
  ]
234
  },
235
  {
 
45
  },
46
  {
47
  "cell_type": "code",
48
+ "execution_count": 1,
49
  "metadata": {},
50
  "outputs": [],
51
  "source": [
 
79
  },
80
  {
81
  "cell_type": "code",
82
+ "execution_count": 2,
83
  "metadata": {},
84
  "outputs": [],
85
  "source": [
 
96
  },
97
  {
98
  "cell_type": "code",
99
+ "execution_count": 3,
100
  "metadata": {},
101
+ "outputs": [
102
+ {
103
+ "name": "stderr",
104
+ "output_type": "stream",
105
+ "text": [
106
+ "Using cache found in /home/mkutarna/.cache/torch/hub/snakers4_silero-models_master\n"
107
+ ]
108
+ }
109
+ ],
110
  "source": [
111
  "language = 'en'\n",
112
  "model_id = 'v3_en'\n",
 
130
  },
131
  {
132
  "cell_type": "code",
133
+ "execution_count": 4,
134
  "metadata": {},
135
  "outputs": [],
136
  "source": [
 
152
  },
153
  {
154
  "cell_type": "code",
155
+ "execution_count": 8,
156
  "metadata": {},
157
  "outputs": [],
158
  "source": [
 
167
  "\n",
168
  " download('punkt')\n",
169
  " wrapper = TextWrapper(max_char_len, fix_sentence_endings=True)\n",
170
+ " \n",
171
+ " p = inflect.engine()\n",
172
  "\n",
173
  " book = epub.read_epub(ebook_path)\n",
174
  "\n",
175
  " ebook_title = book.get_metadata('DC', 'title')[0][0]\n",
176
  " ebook_title = ebook_title.lower().replace(' ', '_')\n",
177
+ " \n",
178
  " corpus = []\n",
179
  " for item in tqdm(list(book.get_items())):\n",
180
  " if item.get_type() == ebooklib.ITEM_DOCUMENT:\n",
 
208
  },
209
  {
210
  "cell_type": "code",
211
+ "execution_count": 9,
212
  "metadata": {},
213
+ "outputs": [
214
+ {
215
+ "name": "stderr",
216
+ "output_type": "stream",
217
+ "text": [
218
+ "[nltk_data] Downloading package punkt to /home/mkutarna/nltk_data...\n",
219
+ "[nltk_data] Package punkt is already up-to-date!\n"
220
+ ]
221
+ },
222
+ {
223
+ "data": {
224
+ "application/vnd.jupyter.widget-view+json": {
225
+ "model_id": "cb413d9ca55b4607924cc598502c00fe",
226
+ "version_major": 2,
227
+ "version_minor": 0
228
+ },
229
+ "text/plain": [
230
+ " 0%| | 0/11 [00:00<?, ?it/s]"
231
+ ]
232
+ },
233
+ "metadata": {},
234
+ "output_type": "display_data"
235
+ }
236
+ ],
237
  "source": [
238
  "ebook, title = read_ebook(ebook_path)"
239
  ]
 
247
  },
248
  {
249
  "cell_type": "code",
250
+ "execution_count": 10,
251
  "metadata": {},
252
+ "outputs": [
253
+ {
254
+ "name": "stdout",
255
+ "output_type": "stream",
256
+ "text": [
257
+ "Title of ebook (path name):the_picture_of_dorian_gray\n",
258
+ "\n",
259
+ "First line of the ebook:The Project Gutenberg eBook of The Picture of Dorian Gray, by Oscar Wilde\n",
260
+ "\n",
261
+ "First paragraph (truncated for display): \n",
262
+ " ['CHAPTER I.', 'The studio was filled with the rich odour of roses, and when the light summer wind stirred amidst the trees of the garden, there came', 'through the open door the heavy scent of the lilac, or the more delicate perfume of the pink-flowering thorn.', 'From the corner of the divan of Persian saddle-bags on which he was lying, smoking, as was his custom, innumerable cigarettes, Lord Henry', 'Wotton could just catch the gleam of the honey-sweet and honey-coloured blossoms of a laburnum, whose tremulous branches seemed hardly able']\n"
263
+ ]
264
+ }
265
+ ],
266
  "source": [
267
  "print(f'Title of ebook (path name):{title}\\n')\n",
268
  "print(f'First line of the ebook:{ebook[0][0]}\\n')\n",
 
271
  },
272
  {
273
  "cell_type": "code",
274
+ "execution_count": 11,
275
  "metadata": {},
276
+ "outputs": [
277
+ {
278
+ "data": {
279
+ "text/plain": [
280
+ "['The Project Gutenberg eBook of The Picture of Dorian Gray, by Oscar Wilde',\n",
281
+ " 'This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no',\n",
282
+ " 'restrictions whatsoever.',\n",
283
+ " 'You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at',\n",
284
+ " 'www.gutenberg.org.',\n",
285
+ " 'If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.',\n",
286
+ " 'Title: The Picture of Dorian Gray',\n",
287
+ " 'Author: Oscar Wilde',\n",
288
+ " 'Release Date: October, one thousand, nine hundred and ninety-four [eBook #one hundred and seventy-four]']"
289
+ ]
290
+ },
291
+ "execution_count": 11,
292
+ "metadata": {},
293
+ "output_type": "execute_result"
294
+ }
295
+ ],
296
  "source": [
297
+ "ebook[0][:9]"
298
  ]
299
  },
300
  {
requirements.txt CHANGED
@@ -3,5 +3,6 @@ streamlit
3
  ebooklib
4
  PyPDF2
5
  bs4
 
6
  nltk
7
  stqdm
 
3
  ebooklib
4
  PyPDF2
5
  bs4
6
+ inflect
7
  nltk
8
  stqdm
src/file_readers.py CHANGED
@@ -49,6 +49,8 @@ def preprocess_text(file):
49
 
50
  sentence_list = []
51
  for sentence in sentences:
 
 
52
  if not re.search('[a-zA-Z]', sentence):
53
  sentence = ''
54
  wrapped_sentences = wrapper.wrap(sentence)
@@ -59,6 +61,62 @@ def preprocess_text(file):
59
  return text_list
60
 
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  def read_pdf(file):
63
  """
64
  Invokes PyPDF2 PdfReader to extract main body text from PDF file_like input,
 
49
 
50
  sentence_list = []
51
  for sentence in sentences:
52
+ if any(chr.isdigit() for chr in sentence):
53
+ sentence = extract_replace(sentence)
54
  if not re.search('[a-zA-Z]', sentence):
55
  sentence = ''
56
  wrapped_sentences = wrapper.wrap(sentence)
 
61
  return text_list
62
 
63
 
64
def extract_replace(entry_string):
    """
    Replaces every number written in digits inside a string with its
    spelled-out text equivalent, e.g. '174' becomes
    'one hundred and seventy-four'.

    Which characters form a single number is decided by find_num_index;
    each span it reports is handed to inflect's number_to_words and the
    returned words are spliced back in place of the digits.

    Parameters:
        entry_string: the sentence to convert.

    Returns:
        A new string with all digit-written numbers replaced by words.
        Input without any digit characters is returned unchanged.
    """
    import inflect

    # Nothing to convert. Returning early also avoids indexing into a span
    # list that holds no (first, last) pair.
    if not any(char.isdigit() for char in entry_string):
        return entry_string

    engine = inflect.engine()

    # Flat list [first_0, last_0, first_1, last_1, ...] of inclusive indexes.
    spans = find_num_index(entry_string)

    # The spelled-out words never contain digits, so the spans computed on
    # the original string stay valid; build the result in one pass instead
    # of rescanning the whole sentence after every replacement.
    pieces = []
    cursor = 0
    for first, last in zip(spans[::2], spans[1::2]):
        pieces.append(entry_string[cursor:first])
        pieces.append(engine.number_to_words(entry_string[first:last + 1]))
        cursor = last + 1
    pieces.append(entry_string[cursor:])

    return ''.join(pieces)
+
89
+
90
def find_num_index(entry_string):
    """
    Locates every number written in digits inside a string.

    Digit characters whose indexes differ by at most 2 (adjacent digits, or
    digits separated by one other character such as the comma in '1,994')
    are treated as parts of the same number.

    Parameters:
        entry_string: the sentence to scan.

    Returns:
        A flat, even-length list of inclusive indexes
        [first_0, last_0, first_1, last_1, ...], one (first, last) pair per
        number. The list is empty when the string contains no digits.
    """
    # Indexes of all digit characters, in ascending order.
    digit_positions = [
        index for index, char in enumerate(entry_string) if char.isdigit()
    ]
    if not digit_positions:
        return []

    # The first digit always opens the first number.
    bounds = [digit_positions[0]]

    # A jump of more than 2 between neighbouring digit indexes closes the
    # current number and opens the next one.
    for current, following in zip(digit_positions, digit_positions[1:]):
        if following - current > 2:
            bounds.append(current)
            bounds.append(following)

    # The last digit always closes the last number.
    bounds.append(digit_positions[-1])

    return bounds
+
119
+
120
  def read_pdf(file):
121
  """
122
  Invokes PyPDF2 PdfReader to extract main body text from PDF file_like input,