Add cleaner, fix data preparation
- src/data_utils.py +71 -0
- src/preparaing_recipe_nlg_dataset.py +62 -16
- src/run.sh +1 -1
- src/run_ed_recipe_nlg.py +0 -1
src/data_utils.py
ADDED
@@ -0,0 +1,71 @@
+from nltk.tokenize import wordpunct_tokenize as word_tokenize
+from nltk.tokenize import sent_tokenize
+
+import re
+import six
+import textwrap
+
+_whitelist = r"[0-9a-z\,\.\/\<\>]+"
+_regex = "0-9a-z\,\.\/\<\>"
+
+
+def filter_by_lang_regex(text, ratio=0.7, regex="0-9a-z\,\.\/\<\>"):
+    candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text), flags=re.IGNORECASE).replace(" ", "")
+    text = text.replace(" ", "")
+
+    return (len(candidate_text) / len(text)) > ratio
+
+
+def filter_by_num_tokens(text, gt=64):
+    return len(word_tokenize(text)) > gt
+
+
+def filter_by_num_sents(text, gt=2):
+    return len(sent_tokenize(text)) > gt
+
+
+def filter_by_steps(text):
+    return re.search('(step|mix all)', text, re.IGNORECASE) is not None
+
+
+def filter_by_length(text, gt=40):
+    return len(text) > gt
+
+
+def filter_by_item(item_list, gt=4):
+    return len(item_list) > gt
+
+
+def chars_to_preserve(sentence, whitelist):
+    try:
+        tokenized = re.findall(whitelist, sentence, re.IGNORECASE)
+        return " ".join(tokenized)
+    except Exception as error:
+        print(
+            textwrap.dedent(
+                f"""
+                Bad characters range {whitelist},
+                {error}
+                """
+            )
+        )
+        raise
+
+
+def normalizer(text, whitelist=r"[0-9a-z\,\.\/\<\>]+", do_lowercase=False):
+    if do_lowercase:
+        text = text.lower()
+
+    text = chars_to_preserve(text, whitelist=whitelist)
+    text = " ".join([word.strip() for word in text.split() if word.strip()])
+    text = text.strip()
+
+    return text
+
+# _text = "Crust, Peanut Butter}Melt <sep> 1/2Butter, 2 c. Eggs, Filling, Semi- Sweet Chocolate Chips, Milk, Butter, " \
+#         "Frosting"
+# out = normalizer(_text)
+# print(out)
+#
+# _text = "step ... "
+# print(re.search('(step|mix all)', _text, re.IGNORECASE) != None)
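For reference, a quick sketch of how the new data_utils helpers compose on a raw, RecipeNLG-style string. The sample text and printed values below are illustrative only; the thresholds are the module defaults shown above.

    from data_utils import filter_by_lang_regex, filter_by_num_tokens, filter_by_steps, normalizer

    text = "Step 1: Mix all ingredients, 1/2 c. butter <sep> bake 20 min."

    print(filter_by_lang_regex(text))   # True: nearly every character is in the 0-9a-z,./<> whitelist
    print(filter_by_steps(text))        # True: matches "step" / "mix all", case-insensitive
    print(filter_by_num_tokens(text))   # False: well under the 64-token default
    print(normalizer(text, do_lowercase=True))
    # step 1 mix all ingredients, 1/2 c. butter <sep> bake 20 min.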
src/preparaing_recipe_nlg_dataset.py
CHANGED
@@ -5,6 +5,7 @@ import sys
 from dataclasses import dataclass, field

 import pandas as pd
+from sklearn.model_selection import train_test_split
 from tqdm import tqdm
 from typing import Dict, List, Optional, Tuple

@@ -13,6 +14,16 @@ from transformers import (
     HfArgumentParser,
 )

+from data_utils import (
+    filter_by_lang_regex,
+    filter_by_steps,
+    filter_by_length,
+    filter_by_item,
+    filter_by_num_sents,
+    filter_by_num_tokens,
+    normalizer
+)
+
 logger = logging.getLogger(__name__)


@@ -72,40 +83,75 @@ def main():

     def cleaning(text, item_type="ner"):
         # NOTE: DO THE CLEANING LATER
+        text = normalizer(text, do_lowercase=True)
         return text

     def recipe_preparation(item_dict):
+        ner = item_dict["ner"]
+        title = item_dict["title"]
+        ingredients = item_dict["ingredients"]
+        steps = item_dict["directions"]
+
+        condition_1 = filter_by_item(ner, 4)
+        condition_2 = filter_by_length(title, 10)
+        condition_3 = filter_by_item(ingredients, 4)
+        condition_4 = filter_by_item(steps, 2)
+        condition_5 = filter_by_steps(" ".join(steps))
+
+        if not all([condition_1, condition_2, condition_3, condition_4, condition_5]):
             return None

+        ner = ", ".join(ner)
+        ingredients = " <sep> ".join(ingredients)
+        steps = " <sep> ".join(steps)
+
+        # Cleaning
+        ner = cleaning(ner, "ner")
+        title = cleaning(title, "title")
+        ingredients = cleaning(ingredients, "ingredients")
+        steps = cleaning(steps, "steps")

         return {
             "inputs": ner,
+            "targets": f"title: {title} <section> ingredients: {ingredients} <section> directions: {steps}"
         }

+    if len(dataset.keys()) > 1:
+        for subset in dataset.keys():
+            data_dict = []
+            for item in tqdm(dataset[subset], position=0, total=len(dataset[subset])):
+                item = recipe_preparation(item)
+                if item:
+                    data_dict.append(item)
+
+            data_df = pd.DataFrame(data_dict)
+            logger.info(f"Preparation of [{subset}] set consists of {len(data_df)} records!")
+
+            output_path = os.path.join(data_args.output_dir, f"{subset}.csv")
+            os.makedirs(os.path.dirname(output_path), exist_ok=True)
+            data_df.to_csv(output_path, sep="\t", encoding="utf-8", index=False)
+            logger.info(f"Data saved here {output_path}")
+    else:
         data_dict = []
+        subset = list(dataset.keys())[0]
         for item in tqdm(dataset[subset], position=0, total=len(dataset[subset])):
             item = recipe_preparation(item)
             if item:
                 data_dict.append(item)

         data_df = pd.DataFrame(data_dict)
+        train, test = train_test_split(data_df, test_size=0.05, random_state=101)
+
+        train = train.reset_index(drop=True)
+        test = test.reset_index(drop=True)
+
+        logger.info(f"Preparation of [train] set consists of {len(train)} records!")
+        logger.info(f"Preparation of [test] set consists of {len(test)} records!")

+        os.makedirs(data_args.output_dir, exist_ok=True)
+        train.to_csv(os.path.join(data_args.output_dir, "train.csv"), sep="\t", encoding="utf-8", index=False)
+        test.to_csv(os.path.join(data_args.output_dir, "test.csv"), sep="\t", encoding="utf-8", index=False)
+        logger.info(f"Data saved here {data_args.output_dir}")


 if __name__ == '__main__':
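To make the new target format concrete, here is a rough, made-up example of what recipe_preparation would return for a toy RecipeNLG item that passes all five filters (field values invented for illustration):

    # Hypothetical RecipeNLG-style record, invented for illustration.
    item = {
        "title": "peanut butter bars",
        "ner": ["peanut butter", "butter", "sugar", "eggs", "chocolate chips"],
        "ingredients": ["1 c. peanut butter", "1/2 c. butter", "2 c. sugar", "2 eggs", "1 c. chocolate chips"],
        "directions": ["Step 1: cream butter and sugar.", "Step 2: mix all remaining ingredients.", "Bake 25 minutes."],
    }

    # recipe_preparation(item) should then return approximately:
    expected = {
        "inputs": "peanut butter, butter, sugar, eggs, chocolate chips",
        "targets": "title: peanut butter bars <section> "
                   "ingredients: 1 c. peanut butter <sep> 1/2 c. butter <sep> 2 c. sugar <sep> 2 eggs <sep> 1 c. chocolate chips <section> "
                   "directions: step 1 cream butter and sugar. <sep> step 2 mix all remaining ingredients. <sep> bake 25 minutes.",
    }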
src/run.sh
CHANGED
@@ -35,7 +35,7 @@ python run_ed_recipe_nlg.py \
 --max_target_length="$MAX_TARGET_LENGTH" \
 --model_name_or_path="$MODEL_NAME_OR_PATH" \
 --extra_tokens="" \
+--special_tokens="<sep>,<section>" \
 --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE \
 --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \
 --gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS \
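For context, a minimal sketch of how a comma-separated --special_tokens value like the one above is typically registered with a transformers tokenizer. This is not taken from run_ed_recipe_nlg.py (its argument handling is outside this diff), and the checkpoint name is a placeholder.

    # Illustrative only; mirrors the --special_tokens flag in run.sh.
    from transformers import AutoTokenizer

    special_tokens = "<sep>,<section>".split(",")

    tokenizer = AutoTokenizer.from_pretrained("t5-base")  # placeholder checkpoint
    tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

    # Registering new tokens grows the vocabulary, so the model's token
    # embeddings generally need to stay in sync with len(tokenizer).
    print(len(tokenizer))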
src/run_ed_recipe_nlg.py
CHANGED
@@ -409,7 +409,6 @@ def main():
         config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
     )

-    model.resize_token_embeddings(len(tokenizer))
     if model.config.decoder_start_token_id is None:
         raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
