Jami Pekkanen committed on
Commit
a18679a
1 Parent(s): e4732c7

Add all .py files

Files changed (3)
  1. neural_networks.py +31 -0
  2. tokenization.py +273 -0
  3. utils.py +203 -0
neural_networks.py ADDED
@@ -0,0 +1,31 @@
+ import marimo
+
+ __generated_with = "0.6.8"
+ app = marimo.App()
+
+
+ @app.cell
+ def __():
+     import marimo as mo
+     return mo,
+
+
+ @app.cell
+ def __(mo):
+     mo.md(
+         rf"""
+ # Neural network models
+
+ **TODO**, please check back later.
+ """
+     )
+     return
+
+
+ @app.cell
+ def __():
+     return
+
+
+ if __name__ == "__main__":
+     app.run()
tokenization.py ADDED
@@ -0,0 +1,273 @@
+ import marimo
+
+ __generated_with = "0.6.8"
+ app = marimo.App()
+
+
+ @app.cell
+ def __():
+     import marimo as mo
+
+     from pprint import pformat
+     from collections import defaultdict
+     import utils as U
+
+     U.init_output
+     return U, defaultdict, mo, pformat
+
+
+ @app.cell
+ def __(mo):
+     mo.md(
+         rf"""
+ # Tokenization
+ ---
+ In the [previous notebook](?file=basics.py) we split the text roughly into words. But the models don't care about what the pieces of text are, and we can split them any way we want. In this notebook we can try out different tokenizations and see how they affect the model's behavior.
+
+ Now the text will separate tokens with alternating colored backgrounds, so they are easier to tell apart. We also print out the newline characters and show spaces as underlined.
+
+ Last time we also did a trick where spaces `' '` were not tokens. In the following we'll treat them as separate tokens too.
+
+ Now you should see some familiar lyrics tokenized like this:
+ """
+     )
+     return
+
+
+ @app.cell
+ def __(U):
+     class Tokenizer:
+         def tokens_to_strings(self, tokens):
+             return map(self.token_to_string, tokens)
+
+         def detokenize(self, tokens):
+             strings = self.tokens_to_strings(tokens)
+             return ''.join(strings)
+
+         def token_to_string(self, s):
+             return s
+
+     class HackyWordTokenizer(Tokenizer):
+         def __call__(self, s):
+             return s.split(' ')
+
+         def tokens_to_strings(self, tokens):
+             for token in tokens:
+                 yield token
+                 # TODO: Shouldn't yield last space
+                 yield ' '
+
+     import re
+     class WordTokenizer(Tokenizer):
+         def __call__(self, s):
+             out = re.split('( +|\n+)', s)
+             return [t for t in out if t]
+
+     class CharacterTokenizer(Tokenizer):
+         def __call__(self, s):
+             return list(s)
+
+     import transformers
+
+     #_BASE_MODEL="EleutherAI/pythia-14m"
+     _BASE_MODEL="facebook/opt-125m"
+     class SubwordTokenizer(Tokenizer):
+         def __init__(self):
+             self._tok = transformers.AutoTokenizer.from_pretrained(_BASE_MODEL)
+
+         def __call__(self, s):
+             # Using strings instead of ids to avoid confusion
+             token_ids = self._tok(s)['input_ids']
+             return [self._tok.decode([id]) for id in token_ids]
+
+
+     tokenizers = {
+         "Word": WordTokenizer(),
+         "Character": CharacterTokenizer(),
+         "Subword": SubwordTokenizer(),
+     }
+
+     languages = {
+         "English": U.blowin_text,
+         "Finnish": U.blowin_text_finnish,
+         "German": U.blowin_text_german,
+     }
+
+
+     return (
+         CharacterTokenizer,
+         HackyWordTokenizer,
+         SubwordTokenizer,
+         Tokenizer,
+         WordTokenizer,
+         languages,
+         re,
+         tokenizers,
+         transformers,
+     )
+
+
+ @app.cell
+ def __(languages, mo):
+     language_selector = mo.ui.dropdown(options=languages, value="English", allow_select_none=False)
+
+     random_seed_slider = mo.ui.slider(start=1, value=1, stop=30, full_width=False, show_value=True, label="Variation (random seed)")
+     return language_selector, random_seed_slider
+
+
+ @app.cell
+ def __():
+     return
+
+
+ @app.cell
+ def __(mo):
+     #corpus_text_first_line = corpus_text.strip().split('\n')[0]
+
+     tokenizer_texts = {
+         "Word tokenizer": mo.md("""
+ The word tokenizer splits the text into individual words. This tends to generate somewhat legible text even with short context lengths. However, it can't create new words!
+
+ This is not so bad in English, which has quite few inflections. However, in synthetic and agglutinative languages like Finnish this is a big problem, as you can form new words that have never been uttered in the history of the world!
+ """),
+         "Character tokenizer": mo.md("""
+ The character tokenizer splits the text into individual characters. With this we can create new words, but especially with shorter context lengths it produces total gibberish!
+
+ A tradeoff between word tokenization and character tokenization is **subword tokenization**. Here common strings, like English words and Finnish inflections, are typically represented as a single token, but the tokenization also includes individual characters.
+ """),
+         "Subword tokenizer": mo.md(f"""
+ The subword tokenizer tries to split the text into commonly occurring strings, such as words, but it can "fall back" to smaller strings, including single characters. Typically most common English words are individual tokens. Subwords like Finnish inflections or syllables may also get their own tokens.
+
+ A common method for subword tokenization is [Byte Pair Encoding](https://en.wikipedia.org/wiki/Byte_pair_encoding). The tokenizer in this example uses that method, and is in fact the same tokenizer that was used for GPT-2.
+
+ You may notice that for English the resulting tokenization is not that different from the word tokenization. A major difference is that spaces are included in the tokens. However, see what happens if you switch to the Finnish or German translation of the lyrics.
+ """),
+     }
+
+     context_length_slider = mo.ui.slider(start=1, value=2, stop=10, full_width=False, label="Context length", show_value=True)
+
+     tokenizer_tabs = mo.ui.tabs(
+         tokenizer_texts,
+         value="Word"
+     )
+
+     return context_length_slider, tokenizer_tabs, tokenizer_texts
+
+
+ @app.cell
+ def __(tokenizer_tabs, tokenizers):
+     tokenizer_type = tokenizer_tabs.value.split()[0]
+     tokenizer = tokenizers[tokenizer_type]
+     return tokenizer, tokenizer_type
+
+
+ @app.cell
+ def __(
+     U,
+     context_length_slider,
+     language_selector,
+     tokenizer,
+     tokenizer_type,
+ ):
+     corpus_text = language_selector.value
+     context_length = context_length_slider.value
+     corpus_tokens = tokenizer(corpus_text)
+     print(tokenizer, tokenizer_type, corpus_tokens)
+     vocabulary = U.corpus_to_vocabulary(corpus_tokens)
+
+     return context_length, corpus_text, corpus_tokens, vocabulary
+
+
+ @app.cell
+ def __():
+     return
+
+
+ @app.cell
+ def __(U, context_length, corpus_tokens, tokenizer):
+     next_tokens = U.get_next_token_table(corpus_tokens, context_length)
+     U.tokens_out(corpus_tokens, tokenizer)
+     return next_tokens,
+
+
+ @app.cell
+ def __(mo):
+     mo.md(rf"With the tabs below, you can select different tokenizers. As you change the tokenizer, the results below change automatically. Go through the different tokenizers and observe how they change the results!")
+     return
+
+
+ @app.cell
+ def __(mo, tokenizer_tabs):
+     mo.md(
+         f"""
+ ## Tokenizer selection
+ ---
+ <div style="height: 20em; overflow: auto;">
+ {tokenizer_tabs}
+ </div>
+ """
+     )
+     return
+
+
+ @app.cell
+ def __(language_selector, mo):
+     mo.md(rf"Lyrics language {language_selector} (Translation by Google Translate)")
+     return
+
+
+ @app.cell
+ def __(mo):
+     mo.md(
+         f"""
+ ## Playground (watch this change!)
+ ---
+ """
+     )
+     return
+
+
+ @app.cell
+ def __(
+     U,
+     context_length_slider,
+     corpus_tokens,
+     mo,
+     next_tokens,
+     random_seed_slider,
+     tokenizer,
+ ):
+     gen_seed = random_seed_slider.value
+     gen_tokens = U.generate_tokens(next_tokens, seed=gen_seed)
+
+     gen_ui = mo.vstack([
+         U.tokens_out(gen_tokens, tokenizer),
+         mo.hstack([context_length_slider, random_seed_slider])
+     ])
+
+     mo.ui.tabs({
+         "Random generated": gen_ui,
+         "Tokenized original": U.tokens_out(corpus_tokens, tokenizer),
+         #"Follower graph": U.plot_follower_context_graph(next_tokens),
+         #"Vocabulary": U.python_out(vocabulary),
+         #"Next token table": U.python_out(dict(next_tokens)),
+     })
+     return gen_seed, gen_tokens, gen_ui
+
+
+ @app.cell
+ def __(mo):
+     mo.md(
+         rf"""
+ ---
+ In the next notebook, we'll learn the basics of neural networks and how they can be used to create more flexible and scalable language models.
+
+ [Continue to Neural Networks >](?file=neural_networks.py)
+ """
+     )
+     return
+
+
+ if __name__ == "__main__":
+     app.run()
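
As a quick illustration of the subword (BPE) tokenization discussed in the notebook text above, here is a minimal sketch that mirrors what the SubwordTokenizer cell does; the example sentence is taken from the lyrics and is only illustrative, the exact pieces depend on the facebook/opt-125m vocabulary, and running it requires the transformers package plus a model download:

import transformers

# Same model as _BASE_MODEL in tokenization.py above.
tok = transformers.AutoTokenizer.from_pretrained("facebook/opt-125m")

# Encode, then decode each id separately to see the subword pieces,
# which is exactly what SubwordTokenizer.__call__ does.
ids = tok("how many roads must a man walk down")["input_ids"]
print([tok.decode([i]) for i in ids])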
utils.py ADDED
@@ -0,0 +1,203 @@
+ import marimo as mo
+
+ from pprint import pformat
+ from collections import defaultdict, Counter
+ import random
+
+ _pre_box_height = "10em"
+ _font_size = "12px"
+ def pre_box(text):
+     return mo.Html(f"""
+ <pre class="pre_out_box" style="overflow: auto; height: {_pre_box_height}; font-size: {_font_size};">
+ {text}
+
+ </pre>""")
+
+ def python_out(code):
+     return mo.Html(f"""
+ <pre class="python_out_box" style="overflow: auto; height: {_pre_box_height}; font-size: {_font_size};">
+ {pformat(code, sort_dicts=False, compact=True)}
+
+ </pre>""")
+
+ def tokens_out(tokens, tokenizer):
+     out = ""
+     for i, string in enumerate(tokenizer.tokens_to_strings(tokens)):
+         #colors = ["rgb(20, 184, 166)", "rgb(245, 158, 11)"]
+         colors = [
+             "#2b9a66",
+             #"#26997b",
+             "#00749e",
+             "#dc3e42",
+         ]
+         #colors = "#d1f0fa", "#ffcdce"
+         colors = "var(--sky-3)", "var(--red-3)", "var(--amber-3)"
+         color = colors[i%len(colors)]
+         # TODO: Be more general!
+         if string == ' ':
+             decoration = "underline"
+         else:
+             decoration = "none"
+
+         n_newlines = string.count('\n')
+         string = string.replace("\n", "\\n")
+         string += "\n"*n_newlines
+
+         out += f'<span style="background-color: {color}; text-decoration: {decoration}">{string}</span>'
+     out = f'<div style="overflow: auto; height: {_pre_box_height};">{out}</div>'
+     return pre_box(out)
+
+ def corpus_to_vocabulary(tokens):
+     # Using dict instead of set to keep the order
+     return list({w: None for w in tokens}.keys())
+
+ init_output = mo.Html(f"""
+ <style>
+ .python_out_box {{
+     overflow: auto !important;
+     max-height: {_pre_box_height};
+     font-size: 12px;
+ }}
+
+ .pre_out_box {{
+     overflow: auto !important;
+     height: {_pre_box_height};
+     font-size: 12px;
+ }}
+ </style>
+ """)
+ init_output = None
+
+ def graph_out(svg):
+     return mo.Html(f"""
+ <div style="overflow: auto; max-height: 32em;">
+ {svg}
+ </div>
+ """)
+
+ def plot_follower_graph(next_words):
+     import pydot
+
+     graph = pydot.Dot("follower_graph", ordering="in")
+     def mangle(s):
+         #if isinstance(s, tuple) and len(s) == 1:
+         # s = s[0]
+         return repr(s).replace(r'\n', r'\\n')
+     for context, followers in next_words.items():
+         graph.add_node(pydot.Node(mangle(context)))
+         for follower in followers:
+             edge = graph.add_edge(pydot.Edge(mangle(context), mangle(follower)))
+             # A bit of a hack
+             #if hasattr(followers, 'get'):
+             # edge.set_label(followers.get(follower))
+             #else:
+             # count = None
+
+     svg = graph.create_svg().decode('utf-8')
+     return graph_out(svg)
+
+ def plot_follower_context_graph(next_words):
+     # TODO: This is fugly. Use dot
+     import pydot
+
+     graph = pydot.Dot("follower_graph", ordering="in", strict=True)
+     def mangle(s):
+         #if isinstance(s, tuple) and len(s) == 1:
+         # s = s[0]
+         return repr(s).replace(r'\n', r'\\n')
+     for context, followers in next_words.items():
+         #graph.add_node(pydot.Node(mangle(context)))
+         for follower in followers:
+             # A bit of a hack
+             #edge = graph.add_edge(pydot.Edge(mangle(context), mangle(follower)))
+             new_context = (*context[1:], follower)
+         for follower in next_words.get(context, []):
+             follower_context = (*context[1:], follower)
+             graph.add_edge(pydot.Edge(
+                 mangle(context),
+                 mangle(follower_context),
+                 label=mangle(follower)
+             ))
+
+     svg = graph.create_svg().decode('utf-8')
+     return graph_out(svg)
+
+ def generate_tokens(next_words, context=None, max_tokens=200, seed=3):
+     rng = random.Random(seed)
+
+     if context is None:
+         context = next(iter(next_words.keys()))
+         yield from context
+
+     for i in range(max_tokens):
+         candidates = next_words.get(context, None)
+         if not candidates: return
+
+         choices, counts = zip(*candidates.items())
+         if not choices: return
+         next_word = rng.choice(choices)
+         if next_word == '\n\n': return
+         yield next_word
+         context = (*context[1:], next_word)
+
+ # Doing this more succinctly now
+ def get_ngrams(tokens, n):
+     for i in range(len(tokens) - n + 1):
+         yield tokens[i:i+n]
+
+ def get_next_token_table(tokens, context_length, table=None):
+     if table is None:
+         table = defaultdict(Counter)
+     for *context, next_token in get_ngrams(tokens, context_length + 1):
+         table[tuple(context)][next_token] += 1
+
+     return table
+
+ happy_birthday_text = """
+ Happy birthday to you
+ Happy birthday to you
+ Happy birthday dear Dave
+ Happy birthday to you
+ """
+
+ blowin_text = """
+ Yes, and how many roads must a man walk down, before you call him a man?
+ And how many seas must a white dove sail, before she sleeps in the sand?
+ Yes, and how many times must the cannonballs fly, before they're forever banned?
+
+ Yes, and how many years must a mountain exist, before it is washed to the sea?
+ And how many years can some people exist, before they're allowed to be free?
+ Yes, and how many times can a man turn his head, and pretend that he just doesn't see?
+
+ Yes, and how many times must a man look up, before he can see the sky?
+ And how many ears must one man have, before he can hear people cry?
+ Yes, and how many deaths will it take 'til he knows, that too many people have died?
+ """
+
+ blowin_text_finnish = """
+ Niin, ja kuinka monta tietä miehen täytyy kävellä, ennen kuin kutsut häntä mieheksi?
+ Ja kuinka monta merta valkoisen kyyhkysen täytyy purjehtia, ennen kuin se nukkuu hiekkaan?
+ Kyllä, ja kuinka monta kertaa kanuunankuulat täytyy lentää, ennen kuin ne on ikuisesti kielletty?
+
+ Kyllä, ja kuinka monta vuotta vuoren on oltava olemassa, ennen kuin se huuhtoutuu mereen?
+ Ja kuinka monta vuotta jotkut ihmiset voivat olla olemassa ennen kuin he saavat olla vapaita?
+ Kyllä, ja kuinka monta kertaa ihminen voi kääntää päätään ja teeskennellä, ettei hän vain näe?
+
+ Kyllä, ja kuinka monta kertaa miehen täytyy katsoa ylös, ennen kuin hän voi nähdä taivaan?
+ Ja kuinka monta korvaa yhdellä ihmisellä pitää olla, ennen kuin hän voi kuulla ihmisten itkevän?
+ Kyllä, ja kuinka monta kuolemaa kestää, ennen kuin hän tietää, että liian monta ihmistä on kuollut?
+ """
+
+ blowin_text_german = """
+ Ja, und wie viele Wege muss ein Mann gehen, bevor man ihn einen Mann nennt?
+ Und wie viele Meere muss eine weiße Taube durchsegeln, bevor sie im Sand schläft?
+ Ja, und wie oft müssen die Kanonenkugeln fliegen, bevor sie für immer verboten werden?
+
+ Ja, und wie viele Jahre muss ein Berg existieren, bevor er ins Meer gespült wird?
+ Und wie viele Jahre können manche Menschen existieren, bevor sie frei sein dürfen?
+ Ja, und wie oft kann ein Mann den Kopf drehen und so tun, als würde er einfach nichts sehen?
+
+ Ja, und wie oft muss ein Mensch nach oben schauen, bevor er den Himmel sehen kann?
+ Und wie viele Ohren muss ein Mann haben, bevor er Menschen weinen hören kann?
+ Ja, und wie viele Todesfälle wird es dauern, bis er weiß, dass zu viele Menschen gestorben sind?
+ """
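
A minimal usage sketch for the n-gram helpers above, run outside marimo (it assumes utils.py is importable and that marimo is installed, since utils.py imports it; the plain whitespace split is only for illustration):

import utils as U

tokens = U.happy_birthday_text.split()
table = U.get_next_token_table(tokens, context_length=2)
# table maps each 2-token context to a Counter of observed next tokens,
# e.g. ('Happy', 'birthday') -> Counter({'to': 3, 'dear': 1})
print(' '.join(U.generate_tokens(table, seed=1)))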