mkutarna committed on
Commit
985fb81
1 Parent(s): ab137d9

Created function to replace symbols with text equivalent

Browse files
notebooks/audiobook_gen_silero.ipynb CHANGED
@@ -45,7 +45,7 @@
45
  },
46
  {
47
  "cell_type": "code",
48
- "execution_count": 1,
49
  "metadata": {},
50
  "outputs": [],
51
  "source": [
@@ -79,7 +79,7 @@
79
  },
80
  {
81
  "cell_type": "code",
82
- "execution_count": 2,
83
  "metadata": {},
84
  "outputs": [],
85
  "source": [
@@ -96,17 +96,9 @@
96
  },
97
  {
98
  "cell_type": "code",
99
- "execution_count": 3,
100
  "metadata": {},
101
- "outputs": [
102
- {
103
- "name": "stderr",
104
- "output_type": "stream",
105
- "text": [
106
- "Using cache found in /home/mkutarna/.cache/torch/hub/snakers4_silero-models_master\n"
107
- ]
108
- }
109
- ],
110
  "source": [
111
  "language = 'en'\n",
112
  "model_id = 'v3_en'\n",
@@ -130,7 +122,7 @@
130
  },
131
  {
132
  "cell_type": "code",
133
- "execution_count": 4,
134
  "metadata": {},
135
  "outputs": [],
136
  "source": [
@@ -152,7 +144,7 @@
152
  },
153
  {
154
  "cell_type": "code",
155
- "execution_count": 8,
156
  "metadata": {},
157
  "outputs": [],
158
  "source": [
@@ -167,8 +159,6 @@
167
  "\n",
168
  " download('punkt')\n",
169
  " wrapper = TextWrapper(max_char_len, fix_sentence_endings=True)\n",
170
- " \n",
171
- " p = inflect.engine()\n",
172
  "\n",
173
  " book = epub.read_epub(ebook_path)\n",
174
  "\n",
@@ -208,32 +198,9 @@
208
  },
209
  {
210
  "cell_type": "code",
211
- "execution_count": 9,
212
  "metadata": {},
213
- "outputs": [
214
- {
215
- "name": "stderr",
216
- "output_type": "stream",
217
- "text": [
218
- "[nltk_data] Downloading package punkt to /home/mkutarna/nltk_data...\n",
219
- "[nltk_data] Package punkt is already up-to-date!\n"
220
- ]
221
- },
222
- {
223
- "data": {
224
- "application/vnd.jupyter.widget-view+json": {
225
- "model_id": "cb413d9ca55b4607924cc598502c00fe",
226
- "version_major": 2,
227
- "version_minor": 0
228
- },
229
- "text/plain": [
230
- " 0%| | 0/11 [00:00<?, ?it/s]"
231
- ]
232
- },
233
- "metadata": {},
234
- "output_type": "display_data"
235
- }
236
- ],
237
  "source": [
238
  "ebook, title = read_ebook(ebook_path)"
239
  ]
@@ -247,22 +214,9 @@
247
  },
248
  {
249
  "cell_type": "code",
250
- "execution_count": 10,
251
  "metadata": {},
252
- "outputs": [
253
- {
254
- "name": "stdout",
255
- "output_type": "stream",
256
- "text": [
257
- "Title of ebook (path name):the_picture_of_dorian_gray\n",
258
- "\n",
259
- "First line of the ebook:The Project Gutenberg eBook of The Picture of Dorian Gray, by Oscar Wilde\n",
260
- "\n",
261
- "First paragraph (truncated for display): \n",
262
- " ['CHAPTER I.', 'The studio was filled with the rich odour of roses, and when the light summer wind stirred amidst the trees of the garden, there came', 'through the open door the heavy scent of the lilac, or the more delicate perfume of the pink-flowering thorn.', 'From the corner of the divan of Persian saddle-bags on which he was lying, smoking, as was his custom, innumerable cigarettes, Lord Henry', 'Wotton could just catch the gleam of the honey-sweet and honey-coloured blossoms of a laburnum, whose tremulous branches seemed hardly able']\n"
263
- ]
264
- }
265
- ],
266
  "source": [
267
  "print(f'Title of ebook (path name):{title}\\n')\n",
268
  "print(f'First line of the ebook:{ebook[0][0]}\\n')\n",
@@ -271,28 +225,9 @@
271
  },
272
  {
273
  "cell_type": "code",
274
- "execution_count": 11,
275
  "metadata": {},
276
- "outputs": [
277
- {
278
- "data": {
279
- "text/plain": [
280
- "['The Project Gutenberg eBook of The Picture of Dorian Gray, by Oscar Wilde',\n",
281
- " 'This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no',\n",
282
- " 'restrictions whatsoever.',\n",
283
- " 'You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at',\n",
284
- " 'www.gutenberg.org.',\n",
285
- " 'If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.',\n",
286
- " 'Title: The Picture of Dorian Gray',\n",
287
- " 'Author: Oscar Wilde',\n",
288
- " 'Release Date: October, one thousand, nine hundred and ninety-four [eBook #one hundred and seventy-four]']"
289
- ]
290
- },
291
- "execution_count": 11,
292
- "metadata": {},
293
- "output_type": "execute_result"
294
- }
295
- ],
296
  "source": [
297
  "ebook[0][:9]"
298
  ]
 
45
  },
46
  {
47
  "cell_type": "code",
48
+ "execution_count": null,
49
  "metadata": {},
50
  "outputs": [],
51
  "source": [
 
79
  },
80
  {
81
  "cell_type": "code",
82
+ "execution_count": null,
83
  "metadata": {},
84
  "outputs": [],
85
  "source": [
 
96
  },
97
  {
98
  "cell_type": "code",
99
+ "execution_count": null,
100
  "metadata": {},
101
+ "outputs": [],
 
 
 
 
 
 
 
 
102
  "source": [
103
  "language = 'en'\n",
104
  "model_id = 'v3_en'\n",
 
122
  },
123
  {
124
  "cell_type": "code",
125
+ "execution_count": null,
126
  "metadata": {},
127
  "outputs": [],
128
  "source": [
 
144
  },
145
  {
146
  "cell_type": "code",
147
+ "execution_count": null,
148
  "metadata": {},
149
  "outputs": [],
150
  "source": [
 
159
  "\n",
160
  " download('punkt')\n",
161
  " wrapper = TextWrapper(max_char_len, fix_sentence_endings=True)\n",
 
 
162
  "\n",
163
  " book = epub.read_epub(ebook_path)\n",
164
  "\n",
 
198
  },
199
  {
200
  "cell_type": "code",
201
+ "execution_count": null,
202
  "metadata": {},
203
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  "source": [
205
  "ebook, title = read_ebook(ebook_path)"
206
  ]
 
214
  },
215
  {
216
  "cell_type": "code",
217
+ "execution_count": null,
218
  "metadata": {},
219
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  "source": [
221
  "print(f'Title of ebook (path name):{title}\\n')\n",
222
  "print(f'First line of the ebook:{ebook[0][0]}\\n')\n",
 
225
  },
226
  {
227
  "cell_type": "code",
228
+ "execution_count": null,
229
  "metadata": {},
230
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  "source": [
232
  "ebook[0][:9]"
233
  ]
src/file_readers.py CHANGED
@@ -51,6 +51,7 @@ def preprocess_text(file):
51
  for sentence in sentences:
52
  if any(chr.isdigit() for chr in sentence):
53
  sentence = extract_replace(sentence)
 
54
  if not re.search('[a-zA-Z]', sentence):
55
  sentence = ''
56
  wrapped_sentences = wrapper.wrap(sentence)
@@ -117,6 +118,31 @@ def find_num_index(entry_string):
117
  return result1
118
 
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  def read_pdf(file):
121
  """
122
  Invokes PyPDF2 PdfReader to extract main body text from PDF file_like input,
 
51
  for sentence in sentences:
52
  if any(chr.isdigit() for chr in sentence):
53
  sentence = extract_replace(sentence)
54
+ sentence = replace_symbols(sentence)
55
  if not re.search('[a-zA-Z]', sentence):
56
  sentence = ''
57
  wrapped_sentences = wrapper.wrap(sentence)
 
118
  return result1
119
 
120
 
121
def replace_symbols(text):
    """
    Replaces symbols in a string with their spoken-word equivalents.

    Each replacement is padded with a space on both sides, so the returned
    string may contain runs of consecutive spaces.

    Two symbols are only replaced in a suitable context, so that ordinary
    prose is left intact:
      - 'x' becomes ' times ' only when it stands alone between digits,
        whitespace, or the ends of the string ('3x4', 'three x four'),
        never when it is a letter inside a word ('box', 'x-ray').
      - '-' becomes ' minus ' unless it joins two letters, so hyphenated
        words ('honey-sweet', 'ninety-four') are preserved.

    Parameters
    ----------
    text : str
        The string in which symbols are to be replaced.

    Returns
    -------
    text : str
        The string with every matched symbol replaced by its word form.

    """
    import re

    symbol_map = {
        '+': ' plus ',
        '-': ' minus ',
        '—': ' dash ',
        '=': ' equals ',
        '≈': ' approximately equal to ',
        '*': ' times ',
        'x': ' times ',
        '%': ' percent ',
        '/': ' divided by ',
        '#': ' number ',
        '@': ' at ',
        '&': ' ampersand ',
        '°': ' degrees ',
    }

    # Symbols that double as ordinary text need zero-width context checks.
    # The lookarounds consume nothing, so match.group() is still the bare
    # symbol and can be used directly as the key into symbol_map.
    # `[^\d\s]` = any character that is neither a digit nor whitespace;
    # `[^\W\d_]` = any (Unicode) letter.
    context_patterns = {
        'x': r'(?<![^\d\s])x(?![^\d\s])',
        '-': r'-(?![^\W\d_])|(?<![^\W\d_])-',
    }

    symbol_regex = re.compile('|'.join(
        '(?:' + context_patterns.get(key, re.escape(key)) + ')'
        for key in symbol_map
    ))

    return symbol_regex.sub(lambda match: symbol_map[match.group()], text)
144
+
145
+
146
  def read_pdf(file):
147
  """
148
  Invokes PyPDF2 PdfReader to extract main body text from PDF file_like input,