Fraser
/

to_delete

English

program-synthesis

Model card Files Files and versions Community

Fraser-Greenlee commited on Jan 15, 2022

Commit

0d62ce8

1 Parent(s): b30aa98

dataset script

Browse files

Files changed (1) hide show

program_synthesis.py +99 -0

program_synthesis.py ADDED Viewed

	@@ -0,0 +1,99 @@

+"""Program Synthesis dataset from dreamcoder. https://github.com/ellisk42/ec"""
+from random import choice, shuffle
+import datasets
+from dreamcoder.domains.text.makeTextTasks import makeTasks as textMakeTasks
+from dreamcoder.domains.list.main import main as listMakeTasks
+_DESCRIPTION = """\
+Generated program synthesis datasets used to train dreamcoder.
+"""
+_FEATURES = datasets.Features(
+    {
+        "description": datasets.Value("string"),
+        "input": datasets.Value("string"),
+        "output": datasets.Value("string"),
+        "types": datasets.Value("string")
+    }
+)
+_HOMEPAGE = "https://github.com/ellisk42/ec"
+_LICENSE = "MIT License"
+_MAX_STEPS = 10
+class infIterator:
+    def __init__(self, make_mthd):
+        self.make_mthd = make_mthd
+        self.i = None
+    def reset(self):
+        tasks = self.make_mthd()
+        rows = []
+        for task in tasks:
+            base = {
+                'types': str(task.request),
+                "description": task.name,
+            }
+            for (inp, outp) in task.examples:
+                rows.append(dict(input=str(inp), output=str(outp), **base))
+        shuffle(rows)
+        self.rows = rows
+        self.i = 0
+    def step(self):
+        if self.i is None:
+            self.reset()
+        row = self.rows[self.i]
+        self.i += 1
+        if self.i >= len(self.rows):
+            self.reset()
+        return row
+class ProgramSynthesis(datasets.GeneratorBasedBuilder):
+    """Program Synthesis dataset from dreamcoder."""
+    VERSION = datasets.Version("1.1.0")
+    BUILDER_CONFIGS = [
+        datasets.BuilderConfig(name="text", version=VERSION, description="Text tasks."),
+        datasets.BuilderConfig(name="list", version=VERSION, description="List tasks."),
+        datasets.BuilderConfig(name="all", version=VERSION, description="All tasks at once."),
+    ]
+    DEFAULT_CONFIG_NAME = "all"
+    def _info(self):
+        # self.config.name shows config used
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=_FEATURES,
+            supervised_keys=("input", "output"),
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+        )
+    def _split_generators(self, dl_manager):
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+            ),
+        ]
+    def _generate_examples(self):
+        task_samples = {
+            'text': infIterator(textMakeTasks),
+            'list': infIterator(listMakeTasks)
+        }
+        for key in range(_MAX_STEPS):
+            if self.config.name == 'all':
+                dataset_type = choice(task_samples.keys())
+            else:
+                dataset_type = self.config.name
+            yield key, task_samples[dataset_type].step()