Fraser-Greenlee committed on
Commit
0d62ce8
·
1 Parent(s): b30aa98

dataset script

Browse files
Files changed (1) hide show
  1. program_synthesis.py +99 -0
program_synthesis.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Program Synthesis dataset from dreamcoder. https://github.com/ellisk42/ec"""
2
+ from random import choice, shuffle
3
+ import datasets
4
+
5
+ from dreamcoder.domains.text.makeTextTasks import makeTasks as textMakeTasks
6
+ from dreamcoder.domains.list.main import main as listMakeTasks
7
+
8
+
9
# Human-readable summary shown in the dataset card / DatasetInfo.
_DESCRIPTION = "Generated program synthesis datasets used to train dreamcoder.\n"

# Every column is stored as a plain string.
_FEATURES = datasets.Features(
    {
        column: datasets.Value("string")
        for column in ("description", "input", "output", "types")
    }
)

_HOMEPAGE = "https://github.com/ellisk42/ec"

_LICENSE = "MIT License"

# Number of examples yielded by one pass of the generator.
_MAX_STEPS = 10
26
+
27
+
28
class infIterator:
    """Endlessly cycle over shuffled ``(input, output)`` example rows.

    ``make_mthd`` is a zero-argument callable returning an iterable of tasks.
    Each task must expose ``request``, ``name`` and ``examples`` (an iterable
    of ``(input, output)`` pairs).  Every example becomes one row dict with
    the keys ``input``, ``output``, ``types`` and ``description``; all values
    except ``description`` are passed through ``str``.

    When the current batch of rows has been fully consumed, the next call to
    ``step()`` asks ``make_mthd`` for a fresh batch and reshuffles it, so
    ``step()`` can be called indefinitely.  The object also implements the
    iterator protocol (``next(it)`` is equivalent to ``it.step()``).
    """

    def __init__(self, make_mthd):
        self.make_mthd = make_mthd
        self.rows = []
        # ``None`` means "no batch loaded": the next ``step()`` builds one.
        self.i = None

    def __iter__(self):
        return self

    def __next__(self):
        return self.step()

    def reset(self):
        """Rebuild the row list from a fresh ``make_mthd()`` call and shuffle it."""
        rows = []
        for task in self.make_mthd():
            base = {
                "types": str(task.request),
                "description": task.name,
            }
            rows.extend(
                dict(input=str(inp), output=str(outp), **base)
                for inp, outp in task.examples
            )

        shuffle(rows)
        self.rows = rows
        self.i = 0

    def step(self):
        """Return the next row, loading a new shuffled batch when needed.

        Raises:
            IndexError: if ``make_mthd()`` produced no examples at all.
        """
        if self.i is None:
            self.reset()
        if not self.rows:
            # Leave the iterator unloaded so a later call tries again.
            self.i = None
            raise IndexError("make_mthd() produced no examples to sample from")

        row = self.rows[self.i]
        self.i += 1
        if self.i >= len(self.rows):
            # Defer regeneration until another row is actually requested;
            # building a batch can be expensive and may never be needed.
            self.i = None
        return row
57
+
58
+
59
class ProgramSynthesis(datasets.GeneratorBasedBuilder):
    """Program Synthesis dataset from dreamcoder.

    Three configurations are offered: ``text`` and ``list`` draw rows from a
    single task family, while ``all`` (the default) picks a family at random
    for every row.
    """

    VERSION = datasets.Version("1.1.0")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="text", version=VERSION, description="Text tasks."),
        datasets.BuilderConfig(name="list", version=VERSION, description="List tasks."),
        datasets.BuilderConfig(name="all", version=VERSION, description="All tasks at once."),
    ]
    DEFAULT_CONFIG_NAME = "all"

    def _info(self):
        """Return the dataset metadata; it is the same for every config."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=_FEATURES,
            supervised_keys=("input", "output"),
            homepage=_HOMEPAGE,
            license=_LICENSE,
        )

    def _split_generators(self, dl_manager):
        """Declare a single TRAIN split; ``dl_manager`` is unused here."""
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
            ),
        ]

    def _generate_examples(self):
        """Yield ``_MAX_STEPS`` ``(key, row)`` pairs for the active config."""
        # NOTE(review): ``listMakeTasks`` is imported from
        # ``dreamcoder.domains.list.main.main`` -- confirm that callable
        # returns a task list when invoked without arguments.
        task_samples = {
            "text": infIterator(textMakeTasks),
            "list": infIterator(listMakeTasks),
        }
        # ``random.choice`` needs an indexable sequence; a ``dict_keys`` view
        # is not subscriptable on Python 3, so materialise the names once.
        dataset_types = list(task_samples)

        for key in range(_MAX_STEPS):
            if self.config.name == "all":
                dataset_type = choice(dataset_types)
            else:
                dataset_type = self.config.name

            yield key, task_samples[dataset_type].step()