Spaces:
Running
Running
import random | |
import requests | |
import os, glob | |
# english literature | |
books = [ | |
'https://www.gutenberg.org/cache/epub/1513/pg1513.txt', | |
'https://www.gutenberg.org/files/2701/2701-0.txt', | |
'https://www.gutenberg.org/cache/epub/84/pg84.txt', | |
'https://www.gutenberg.org/cache/epub/2641/pg2641.txt', | |
'https://www.gutenberg.org/cache/epub/1342/pg1342.txt', | |
'https://www.gutenberg.org/cache/epub/100/pg100.txt' | |
] | |
#default english | |
# allowed_chars = ' abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()-_+=\"\':;[]{}/<>,.`~\n\\' | |
#german | |
allowed_chars = ' aäbcdefghijklmnoöpqrsßtuüvwxyzABCDEFGHIJKLMNOÖPQRSTUÜVWXYZ0123456789!@#$%^&*()-_+=\"\':;[]{}/<>,.`~\n\\' | |
def download_book(book): | |
return requests.get(book).content.decode('utf-8') | |
def filter_data(data): | |
print('Filtering data') | |
return ''.join([char for char in data if char in allowed_chars]) | |
def load_books(fromfolder=False): | |
text_data = [] | |
if fromfolder: | |
current_working_directory = os.getcwd() | |
print(current_working_directory) | |
path = 'text' | |
for filename in glob.glob(os.path.join(path, '*.txt')): | |
with open(os.path.join(os.getcwd(), filename), 'r') as f: # open in readonly mode | |
print(f'Loading {filename}') | |
text_data.append(filter_data(str(f.read()))) | |
else: | |
print(f'Loading {len(books)} books into ram') | |
for book in books: | |
text_data.append(filter_data(str(download_book(book)))) | |
print('Loaded books') | |
return ' '.join(text_data) | |
def random_split_chunk(data, size=14): | |
data = data.split(' ') | |
index = random.randrange(0, len(data)) | |
return ' '.join(data[index:index+size]) | |