from datasets import load_dataset, DatasetDict
from hazm import sent_tokenize
from normalizer import normalize
class Prep_dataset:
    """Load the Persian (fa) OSCAR corpus and prepare it for training."""

    def __init__(self, subsample=False, *args, **kwargs):
        raw_dataset = load_dataset("oscar", "unshuffled_deduplicated_fa")
        if subsample:
            # Keep only the first 100 examples (as a "train" split) for quick experiments.
            sample = raw_dataset["train"].select(range(100))
            self.raw_dataset = DatasetDict({"train": sample})
        else:
            self.raw_dataset = raw_dataset
    def _normalize(self, example):
        # Normalize the document text with the project's normalizer module.
        example["text"] = normalize(example["text"])
        return example
    def prepare_dataset(self):
        # Keep only documents longer than 500 characters ...
        big_dataset = self.raw_dataset.filter(lambda x: len(x["text"]) > 500)
        # ... that also contain more than two sentences (hazm sentence tokenizer).
        rich_sent_dataset = big_dataset.filter(lambda x: len(sent_tokenize(x["text"])) > 2)
        # Normalize the text of every surviving example.
        normalized_dataset = rich_sent_dataset.map(self._normalize)
        return normalized_dataset
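

# Usage sketch (illustrative addition, not part of the original module): assumes
# the local `normalizer` module is importable and the OSCAR download succeeds.
if __name__ == "__main__":
    prep = Prep_dataset(subsample=True)        # work on a 100-example slice
    dataset = prep.prepare_dataset()           # filter short/sparse docs, then normalize
    print(dataset)                             # DatasetDict with a "train" split
    print(dataset["train"][0]["text"][:200])   # peek at one normalized document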