# Seq2seq (Assamese -> English) data pipeline: a Dataset wrapper that
# tokenizes and pads sentence pairs, and a Keras Sequence dataloader
# that stacks them into training batches.
import re
import string
from string import digits

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm
class Dataset:
    """Index-addressable view over a parallel corpus.

    For each index i, yields a (encoder_seq, decoder_inp_seq,
    decoder_out_seq) triple of int32 arrays, each tokenized and
    post-padded/truncated to ``max_len``.

    Assumes ``data`` has columns 'ass' (source text), 'eng_inp'
    (decoder input) and 'eng_out' (decoder target) -- inferred from the
    column names; confirm against the preprocessing step.
    """

    def __init__(self, data, tknizer_ass, tknizer_eng, max_len):
        self.encoder_inps = data['ass'].values
        self.decoder_inps = data['eng_inp'].values
        self.decoder_outs = data['eng_out'].values
        self.tknizer_eng = tknizer_eng
        self.tknizer_ass = tknizer_ass
        self.max_len = max_len

    def __getitem__(self, i):
        # texts_to_sequences expects a list of texts, hence the [..] wrap.
        # Use locals rather than instance attributes: __getitem__ should
        # not mutate shared state per lookup.
        enc_seq = self.tknizer_ass.texts_to_sequences([self.encoder_inps[i]])
        dec_inp_seq = self.tknizer_eng.texts_to_sequences([self.decoder_inps[i]])
        dec_out_seq = self.tknizer_eng.texts_to_sequences([self.decoder_outs[i]])
        # Pad/truncate to a fixed length so batches stack cleanly.
        enc_seq = pad_sequences(enc_seq, maxlen=self.max_len, dtype='int32', padding='post')
        dec_inp_seq = pad_sequences(dec_inp_seq, maxlen=self.max_len, dtype='int32', padding='post')
        dec_out_seq = pad_sequences(dec_out_seq, maxlen=self.max_len, dtype='int32', padding='post')
        return enc_seq, dec_inp_seq, dec_out_seq

    def __len__(self):
        """Number of sentence pairs (required by the batch generator)."""
        return len(self.encoder_inps)
class Dataloder(tf.keras.utils.Sequence):
    """Keras Sequence that batches items from a ``Dataset``.

    Bug fixed: ``__getitem__`` previously indexed the dataset directly
    with the batch position, so the permutation produced by
    ``on_epoch_end`` was never consulted and shuffling had no effect.
    Batches are now gathered through ``self.indexes``.
    """

    def __init__(self, dataset, batch_size=1):
        self.dataset = dataset
        self.batch_size = batch_size
        # Index indirection so on_epoch_end() can reshuffle the order.
        self.indexes = np.arange(len(self.dataset.encoder_inps))

    def __getitem__(self, i):
        start = i * self.batch_size
        stop = (i + 1) * self.batch_size
        # Gather via the (possibly shuffled) index permutation.
        data = [self.dataset[self.indexes[j]] for j in range(start, stop)]
        batch = [np.squeeze(np.stack(samples, axis=1), axis=0) for samples in zip(*data)]
        # Shape: ([source_seq, target_inp_seq], target_out_seq); sequences
        # are already tokenized and padded by the Dataset.
        return tuple([[batch[0], batch[1]], batch[2]])

    def __len__(self):
        # Required by model.fit; drops the last partial batch.
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        # Reshuffle so each epoch sees the data in a new order.
        self.indexes = np.random.permutation(self.indexes)