# Provenance (Hugging Face file-page residue, commented out so the file parses):
# uploaded by rbgo, commit "add all files", 550b30c
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense,Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import string
from string import digits
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from tqdm import tqdm
class Dataset:
    """Map-style dataset of parallel Assamese/English sentence pairs.

    Expects `data` to be a DataFrame with columns 'ass' (encoder input),
    'eng_inp' (decoder input) and 'eng_out' (decoder target) -- presumably
    the eng_* columns carry start/end tokens from preprocessing; TODO confirm.

    Each item is tokenized with the supplied Keras tokenizers and padded
    (post) to `max_len`.
    """

    def __init__(self, data, tknizer_ass, tknizer_eng, max_len):
        self.encoder_inps = data['ass'].values
        self.decoder_inps = data['eng_inp'].values
        self.decoder_outs = data['eng_out'].values
        self.tknizer_eng = tknizer_eng
        self.tknizer_ass = tknizer_ass
        self.max_len = max_len

    def __getitem__(self, i):
        """Return (encoder_seq, decoder_inp_seq, decoder_out_seq) for row i.

        Each element is a (1, max_len) int32 array, post-padded/truncated.
        """
        # texts_to_sequences expects a list of texts, hence the [...] wrapper.
        # Fixed: the original stored every intermediate on `self`, mutating
        # shared instance state on each item access -- wasteful, and unsafe if
        # the enclosing Keras Sequence is read from multiple workers.
        encoder_seq = self.tknizer_ass.texts_to_sequences([self.encoder_inps[i]])
        decoder_inp_seq = self.tknizer_eng.texts_to_sequences([self.decoder_inps[i]])
        decoder_out_seq = self.tknizer_eng.texts_to_sequences([self.decoder_outs[i]])
        encoder_seq = pad_sequences(encoder_seq, maxlen=self.max_len, dtype='int32', padding='post')
        decoder_inp_seq = pad_sequences(decoder_inp_seq, maxlen=self.max_len, dtype='int32', padding='post')
        decoder_out_seq = pad_sequences(decoder_out_seq, maxlen=self.max_len, dtype='int32', padding='post')
        return encoder_seq, decoder_inp_seq, decoder_out_seq

    def __len__(self):
        # Number of samples; required by Keras generator-style training.
        return len(self.encoder_inps)
class Dataloder(tf.keras.utils.Sequence):
    """Batch loader over a Dataset for model.fit.

    Yields ([encoder_batch, decoder_inp_batch], decoder_out_batch), where
    each array is (batch_size, max_len).

    Bug fixed: the original never used `self.indexes` inside __getitem__,
    so the permutation done in on_epoch_end had no effect on the batches
    actually served. Lookups now go through the index permutation.
    """

    def __init__(self, dataset, batch_size=1):
        self.dataset = dataset
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.dataset.encoder_inps))

    def __getitem__(self, i):
        """Return batch i, assembled through the (possibly shuffled) indexes."""
        start = i * self.batch_size
        stop = (i + 1) * self.batch_size
        data = [self.dataset[self.indexes[j]] for j in range(start, stop)]
        # Each sample component is (1, max_len); stacking along axis=1 gives
        # (1, batch, max_len) and squeezing axis 0 yields (batch, max_len).
        batch = [np.squeeze(np.stack(samples, axis=1), axis=0) for samples in zip(*data)]
        # ([source, decoder_input], decoder_target) -- already sequences.
        return tuple([[batch[0], batch[1]], batch[2]])

    def __len__(self):
        # Number of full batches; a trailing partial batch is dropped.
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        # Reshuffle sample order between epochs (now actually effective).
        self.indexes = np.random.permutation(self.indexes)