File size: 2,309 Bytes
ba43e42 45dfe77 ba43e42 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import os
import re
import unicodedata
def strip_accents(text: str) -> str:
    """Return ``text`` with combining accent marks removed.

    The text is decomposed with Unicode NFD normalization and every
    character in the ``Mn`` (nonspacing mark) category is dropped. The
    result is not recomposed afterwards.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept_chars = [
        char for char in decomposed
        if unicodedata.category(char) != 'Mn'
    ]
    return ''.join(kept_chars)
def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Load every ``.txt`` file in a directory into one large string.

    Files are read in sorted name order so the result is reproducible
    (``os.listdir`` returns entries in arbitrary order). The contents of
    each file are followed by a newline.

    Args:
        corpus_directory: Directory to scan. Subdirectories are skipped,
            not descended into.
        file_names: Currently ignored; kept so that existing callers which
            pass it continue to work.

    Returns:
        The concatenated contents of all matching files, or ``""`` when
        no file matches.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    pieces = []
    for file_name in sorted(os.listdir(corpus_directory)):
        # Require the name to END in ".txt"; a substring test would also
        # accept names such as "notes.txt.bak".
        if not file_name.endswith(".txt"):
            continue
        file_path = os.path.join(corpus_directory, file_name)
        if os.path.isdir(file_path):
            continue
        # Decode explicitly instead of relying on the platform's locale
        # default, which differs between machines.
        with open(file_path, "r", encoding="utf-8") as file:
            pieces.append(file.read())
    # Join once instead of growing a string with repeated "+=".
    return "".join(piece + "\n" for piece in pieces)
def load_single_raw_text_file(file_name):
    """Load a single text file into one string.

    Args:
        file_name: Path of the file to read.

    Returns:
        The file's contents followed by one extra newline.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly instead of relying on the platform's locale
    # default, which differs between machines.
    with open(file_name, "r", encoding="utf-8") as file:
        return file.read() + "\n"
# A token is a run of word characters and apostrophes. Inside a character
# class "|" is a literal pipe rather than alternation, so it is not listed
# here; otherwise "a|b" would come back as a single token.
word_regex = r"[\w']+"


def tokenize(text):
    """Return every word-like run found in ``text``, in order.

    A run consists of word characters (``\\w``) and apostrophes; anything
    else, including punctuation and whitespace, acts as a separator and is
    discarded.

    Args:
        text: The string to split into tokens.

    Returns:
        A list of token strings; empty when ``text`` contains no match.
    """
    return re.findall(word_regex, text)
def preprocess(text):
    """Lower-case, de-accent and filter whitespace-separated tokens.

    Designed for English punctuation, on text whose words and punctuation
    marks are already separated by whitespace.

    Args:
        text: The pre-separated input string.

    Returns:
        A list of the tokens that start with a word character, an
        apostrophe, or one of ``. , ? !``; all other tokens are dropped.
    """
    text = strip_accents(text).lower()
    # "|" inside [...] is a literal pipe, not alternation, so it is left out
    # of both character classes; otherwise tokens beginning with "|" would
    # pass the filter.
    # NOTE(review): match() only anchors at the start of the token, so a
    # token such as "foo#bar" is still kept. Confirm whether a full-token
    # match was intended before tightening this.
    keep_pattern = re.compile(r"[\w']+|[.,?!]")
    # split() with no argument splits on any whitespace run, so tabs and
    # newlines no longer end up embedded inside tokens.
    return [token for token in text.split() if keep_pattern.match(token)]
def pad(text: list, num_padding: int):
    """Insert runs of ``<s>`` markers at sentence boundaries.

    ``num_padding`` markers are placed at the very start of the output and
    again directly after every end-of-sentence token, including the last
    one.

    Args:
        text: The tokens to pad, as a list of strings.
        num_padding: How many ``<s>`` markers make up each run.

    Returns:
        A new list containing the original tokens plus the markers.
    """
    marker_run = ["<s>"] * num_padding
    # Start with the padding that precedes the first sentence.
    padded = list(marker_run)
    for token in text:
        padded.append(token)
        # A sentence-final mark is followed by a fresh run of markers.
        # REPLACE IF YOUR LANGUAGE USES DIFFERENT END PUNCTUATION
        if token in (".", "?", "!"):
            padded.extend(marker_run)
    return padded
|