Fraser-Greenlee
committed on
Commit
·
0d62ce8
1
Parent(s):
b30aa98
dataset script
Browse files- program_synthesis.py +99 -0
program_synthesis.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Program Synthesis dataset from dreamcoder. https://github.com/ellisk42/ec"""
|
2 |
+
from random import choice, shuffle
|
3 |
+
import datasets
|
4 |
+
|
5 |
+
from dreamcoder.domains.text.makeTextTasks import makeTasks as textMakeTasks
|
6 |
+
from dreamcoder.domains.list.main import main as listMakeTasks
|
7 |
+
|
8 |
+
|
9 |
+
# Free-text summary handed to ``datasets.DatasetInfo``.
_DESCRIPTION = "Generated program synthesis datasets used to train dreamcoder.\n"

# Every example is a flat record made of four string columns.
_FEATURES = datasets.Features(
    {
        column: datasets.Value("string")
        for column in ("description", "input", "output", "types")
    }
)

_HOMEPAGE = "https://github.com/ellisk42/ec"

_LICENSE = "MIT License"

# Upper bound of the ``range`` that drives example generation.
_MAX_STEPS = 10
|
26 |
+
|
27 |
+
|
28 |
+
class infIterator:
    """Endless, reshuffling cursor over the examples of generated tasks.

    ``make_mthd`` is called without arguments and must return an iterable of
    task objects exposing ``request``, ``name`` and ``examples`` (an iterable
    of ``(input, output)`` pairs).  Each example becomes one row dict with the
    keys ``input``, ``output``, ``types`` and ``description``.  When every row
    of the current batch has been handed out, the next call to ``step``
    generates and shuffles a fresh batch.
    """

    def __init__(self, make_mthd):
        self.make_mthd = make_mthd
        # Rows of the current batch; filled by ``reset``.
        self.rows = []
        # Index of the next row to return; ``None`` until the first batch exists.
        self.i = None

    def reset(self):
        """Generate a new batch of tasks, flatten it into rows and shuffle them."""
        rows = []
        for task in self.make_mthd():
            # Fields shared by every example of the same task.
            base = {
                'types': str(task.request),
                "description": task.name,
            }
            for inp, outp in task.examples:
                rows.append(dict(input=str(inp), output=str(outp), **base))

        shuffle(rows)
        self.rows = rows
        self.i = 0

    def step(self):
        """Return the next row, generating a new batch only when one is needed.

        Raises:
            IndexError: if ``make_mthd`` produced no examples at all.
        """
        # Refill lazily: regenerating right after the last row was handed out
        # would run ``make_mthd`` even when no further row is ever requested.
        if self.i is None or self.i >= len(self.rows):
            self.reset()
        if not self.rows:
            raise IndexError("make_mthd produced no examples to iterate over")
        row = self.rows[self.i]
        self.i += 1
        return row
|
57 |
+
|
58 |
+
|
59 |
+
class ProgramSynthesis(datasets.GeneratorBasedBuilder):
    """Program Synthesis dataset from dreamcoder.

    Three configurations are offered: ``text``, ``list`` and ``all`` (the
    default), where ``all`` picks one of the two task families at random for
    every generated example.
    """

    VERSION = datasets.Version("1.1.0")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="text", version=VERSION, description="Text tasks."),
        datasets.BuilderConfig(name="list", version=VERSION, description="List tasks."),
        datasets.BuilderConfig(name="all", version=VERSION, description="All tasks at once."),
    ]
    DEFAULT_CONFIG_NAME = "all"

    def _info(self):
        """Return the dataset metadata; it is the same for every configuration."""
        # self.config.name shows config used
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=_FEATURES,
            supervised_keys=("input", "output"),
            homepage=_HOMEPAGE,
            license=_LICENSE,
        )

    def _split_generators(self, dl_manager):
        """Declare a single TRAIN split; ``dl_manager`` is not used here."""
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
            ),
        ]

    def _generate_examples(self):
        """Yield ``_MAX_STEPS`` pairs of ``(key, row)`` for the active config."""
        # NOTE(review): ``listMakeTasks`` is ``main`` from
        # dreamcoder.domains.list.main - confirm it returns an iterable of
        # tasks when called without arguments.
        task_samples = {
            'text': infIterator(textMakeTasks),
            'list': infIterator(listMakeTasks)
        }
        # random.choice needs an indexable sequence; a dict_keys view raises
        # TypeError on Python 3, so materialise the keys once up front.
        dataset_types = list(task_samples)
        for key in range(_MAX_STEPS):
            if self.config.name == 'all':
                dataset_type = choice(dataset_types)
            else:
                dataset_type = self.config.name

            yield key, task_samples[dataset_type].step()
|