Fix data preparation
Browse files
- src/preparaing_recipe_nlg_dataset.py +8 -3
- src/run.sh +2 -1
src/preparaing_recipe_nlg_dataset.py
CHANGED
@@ -81,6 +81,7 @@ def main():
|
|
81 |
cache_dir=data_args.cache_dir
|
82 |
)
|
83 |
|
|
|
84 |
def cleaning(text, item_type="ner"):
|
85 |
# NOTE: DO THE CLEANING LATER
|
86 |
text = normalizer(text, do_lowercase=True)
|
@@ -92,9 +93,9 @@ def main():
|
|
92 |
ingredients = item_dict["ingredients"]
|
93 |
steps = item_dict["directions"]
|
94 |
|
95 |
-
condition_1 = filter_by_item(ner,
|
96 |
-
condition_2 = filter_by_length(title,
|
97 |
-
condition_3 = filter_by_item(ingredients,
|
98 |
condition_4 = filter_by_item(steps, 2)
|
99 |
condition_5 = filter_by_steps(" ".join(steps))
|
100 |
|
@@ -140,6 +141,10 @@ def main():
|
|
140 |
data_dict.append(item)
|
141 |
|
142 |
data_df = pd.DataFrame(data_dict)
|
|
|
|
|
|
|
|
|
143 |
train, test = train_test_split(data_df, test_size=0.05, random_state=101)
|
144 |
|
145 |
train = train.reset_index(drop=True)
|
|
|
81 |
cache_dir=data_args.cache_dir
|
82 |
)
|
83 |
|
84 |
+
|
85 |
def cleaning(text, item_type="ner"):
|
86 |
# NOTE: DO THE CLEANING LATER
|
87 |
text = normalizer(text, do_lowercase=True)
|
|
|
93 |
ingredients = item_dict["ingredients"]
|
94 |
steps = item_dict["directions"]
|
95 |
|
96 |
+
condition_1 = filter_by_item(ner, 3)
|
97 |
+
condition_2 = filter_by_length(title, 3)
|
98 |
+
condition_3 = filter_by_item(ingredients, 3)
|
99 |
condition_4 = filter_by_item(steps, 2)
|
100 |
condition_5 = filter_by_steps(" ".join(steps))
|
101 |
|
|
|
141 |
data_dict.append(item)
|
142 |
|
143 |
data_df = pd.DataFrame(data_dict)
|
144 |
+
|
145 |
+
logger.info(f"Preparation - [before] consists of {len(dataset[subset])} records!")
|
146 |
+
logger.info(f"Preparation - [after] consists of {len(data_df)} records!")
|
147 |
+
|
148 |
train, test = train_test_split(data_df, test_size=0.05, random_state=101)
|
149 |
|
150 |
train = train.reset_index(drop=True)
|
src/run.sh
CHANGED
@@ -52,4 +52,5 @@ python run_ed_recipe_nlg.py \
|
|
52 |
--do_train \
|
53 |
--do_eval \
|
54 |
--overwrite_output_dir \
|
55 |
-
--predict_with_generate
|
|
|
|
52 |
--do_train \
|
53 |
--do_eval \
|
54 |
--overwrite_output_dir \
|
55 |
+
--predict_with_generate \
|
56 |
+
--push_to_hub
|