m3hrdadfi commited on
Commit
4350a5a
1 Parent(s): 1809a17

Remove extra file

Browse files
Files changed (1) hide show
  1. src/prep_dataset.py +0 -29
src/prep_dataset.py DELETED
@@ -1,29 +0,0 @@
1
- from datasets import load_dataset, DatasetDict
2
- from hazm import sent_tokenize
3
- from normalizer import normalize
4
-
5
-
6
class Prep_dataset:
    """Load the Persian OSCAR corpus and prepare it for LM training.

    Wraps `datasets.load_dataset("oscar", "unshuffled_deduplicated_fa")`
    and exposes a filtering/normalization pipeline over its splits.
    """

    def __init__(self, subsample=False, *args, **kwargs):
        """Load the corpus.

        Args:
            subsample: when True, keep only the first 100 examples of the
                ``train`` split (useful for quick smoke tests).
        """
        # NOTE: plain string, not an f-string — the original used an
        # f-string with no placeholders.
        raw_dataset = load_dataset("oscar", "unshuffled_deduplicated_fa")
        if subsample:
            # Build the subsampled DatasetDict directly instead of the
            # original copy()/pop()/rename round-trip, which produced the
            # same single-split result.
            self.raw_dataset = DatasetDict(
                {"train": raw_dataset["train"].select(range(100))}
            )
        else:
            self.raw_dataset = raw_dataset

    def _normalize(self, example):
        """`.map` callback: normalize the ``text`` field of one example."""
        example["text"] = normalize(example["text"])
        return example

    def preprare_dataset(self):
        """Filter out short / sentence-poor documents, then normalize.

        Keeps examples whose raw text exceeds 500 characters and contains
        more than two sentences (per hazm's ``sent_tokenize``), then maps
        :meth:`_normalize` over the survivors.

        Returns:
            The filtered and normalized dataset (same container type as
            ``self.raw_dataset``).
        """
        big_dataset = self.raw_dataset.filter(lambda x: len(x["text"]) > 500)
        rich_sent_dataset = big_dataset.filter(
            lambda x: len(sent_tokenize(x["text"])) > 2
        )
        return rich_sent_dataset.map(self._normalize)

    # Backward-compatible, correctly spelled alias for the typo'd
    # ``preprare_dataset`` name; existing callers keep working.
    prepare_dataset = preprare_dataset