Remove extra file
src/prep_dataset.py +0 -29
src/prep_dataset.py
DELETED
@@ -1,29 +0,0 @@
-from datasets import load_dataset, DatasetDict
-from hazm import sent_tokenize
-from normalizer import normalize
-
-
-class Prep_dataset:
-
-    def __init__(self, subsample=False, *args, **kwargs):
-        raw_dataset = load_dataset("oscar", f"unshuffled_deduplicated_fa")
-        if subsample:
-            sample_dataset = raw_dataset.copy()
-            sample_dataset["sample"] = sample_dataset["train"].select(range(100))
-            sample_dataset.pop("train")
-            sample_dataset["train"] = sample_dataset.pop("sample")
-            final = DatasetDict(sample_dataset)
-            self.raw_dataset = final
-        else:
-            self.raw_dataset = raw_dataset
-
-    def _normalize(self, example):
-        example["text"] = normalize(example["text"])
-        return example
-
-    def preprare_dataset(self):
-        big_dataset = self.raw_dataset.filter(lambda x: len(x["text"]) > 500)
-        richSent_dataset = big_dataset.filter(lambda x: len(sent_tokenize(x["text"])) > 2)
-        normalized_dataset = richSent_dataset.map(self._normalize)
-
-        return normalized_dataset
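For reference, a minimal usage sketch of the class this commit deletes, assuming src/ was on the import path and that `normalizer` is a local module providing a `normalize(text)` function (neither assumption is shown in this diff). The method-name typo `preprare_dataset` is in the original code.

# Hypothetical usage of the removed class (assumes the deleted
# src/prep_dataset.py is importable and `normalizer` is available).
from prep_dataset import Prep_dataset

prep = Prep_dataset(subsample=True)  # keeps a 100-example slice of the Persian OSCAR train split
dataset = prep.preprare_dataset()    # drops texts <= 500 chars or with <= 2 sentences, then normalizes
print(dataset)                       # DatasetDict with the filtered, normalized "train" split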