AlienKevin commited on
Commit
a54c5b6
1 Parent(s): 2bed565

Upload 113 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +4 -0
  2. CyberCan.dict +0 -0
  3. CyberCan.xlsx +3 -0
  4. abc_rare_char_mapping.txt +360 -0
  5. checkpoint-11000/config.json +76 -0
  6. checkpoint-11000/generation_config.json +13 -0
  7. checkpoint-11000/optimizer.pt +3 -0
  8. checkpoint-11000/pytorch_model.bin +3 -0
  9. checkpoint-11000/rng_state.pth +3 -0
  10. checkpoint-11000/scheduler.pt +3 -0
  11. checkpoint-11000/trainer_state.json +456 -0
  12. checkpoint-11000/training_args.bin +3 -0
  13. commercial_baselines/bing.can +0 -0
  14. commercial_baselines/bing.key +1 -0
  15. commercial_baselines/bing.man +0 -0
  16. commercial_baselines/bing_translator.ipynb +234 -0
  17. commercial_baselines/lihkg.filtered.man +0 -0
  18. commercial_baselines/load_can.ipynb +58 -0
  19. finetune.ipynb +0 -0
  20. load_abc.ipynb +964 -0
  21. load_lihkg.ipynb +242 -0
  22. load_mined_bitext.ipynb +175 -0
  23. para/.DS_Store +0 -0
  24. para/dev/.DS_Store +0 -0
  25. para/dev/dev.can +0 -0
  26. para/dev/dev.man +0 -0
  27. para/dev/dev.norm.can +0 -0
  28. para/test/.DS_Store +0 -0
  29. para/test/test.can +0 -0
  30. para/test/test.man +0 -0
  31. para/test/test.norm.can +0 -0
  32. para/test/test.typos.can +0 -0
  33. para/test/test.typos.man +0 -0
  34. process_novels.ipynb +104 -0
  35. runs/Apr16_10-10-56_Kevins-MacBook-Pro-4.local/1681654257.025384/events.out.tfevents.1681654257.Kevins-MacBook-Pro-4.local.13638.1 +3 -0
  36. runs/Apr16_10-10-56_Kevins-MacBook-Pro-4.local/events.out.tfevents.1681654257.Kevins-MacBook-Pro-4.local.13638.0 +3 -0
  37. test.ipynb +0 -0
  38. test.pred.130K.new.12000.man +0 -0
  39. test.pred.130K.new.6000.man +0 -0
  40. test.pred.130K.old.man +0 -0
  41. test.pred.16K.man +0 -0
  42. test.pred.175K.12000.bidir.man +0 -0
  43. test.pred.80K.man +0 -0
  44. test.pred.bing.11000.man +0 -0
  45. test.pred.bing.man +0 -0
  46. test.typos.pred.130K.new.12000.man +0 -0
  47. test.typos.pred.130K.old.12000.man +0 -0
  48. test.typos.pred.170K.mined.6000.man +0 -0
  49. test.typos.pred.175K.12000.bidir.man +0 -0
  50. test.typos.pred.80K.7000.man +0 -0
.gitattributes CHANGED
@@ -32,3 +32,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ CyberCan.xlsx filter=lfs diff=lfs merge=lfs -text
36
+ train/lihkg.can filter=lfs diff=lfs merge=lfs -text
37
+ train/train.can filter=lfs diff=lfs merge=lfs -text
38
+ train/train.man filter=lfs diff=lfs merge=lfs -text
CyberCan.dict ADDED
The diff for this file is too large to render. See raw diff
 
CyberCan.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:459bafd61fd02b74b1e94388298ed81623e7641711abb06841fe122c3cfdab1e
3
+ size 2680859
abc_rare_char_mapping.txt ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 𠹺 埋 388
2
+ 噖 琴 162
3
+ 𡁵 緊 157
4
+ 𠶧 掂 88
5
+ 嚫 親 88
6
+ 屘 尾 57
7
+ 衭 褲 47
8
+ 贃 賺 43
9
+ 說 説 35
10
+ 𧵳 蝕 30
11
+ 歳 歲 27
12
+ 𢫏 冚 27
13
+ 𨶙 撚 25
14
+ 癐 攰 25
15
+ 𦡆 窟 25
16
+ 𨃩 跣 24
17
+ 况 況 21
18
+ 内 內 19
19
+ 𢵌 隊 19
20
+ 𦧺 賴 18
21
+ 𠹌 掕 18
22
+ 爲 為 16
23
+ 𢱑 搲 16
24
+ 𡁯 嘟 15
25
+ 𠱓 詭 14
26
+ 𠵿 披 14
27
+ 踹 踩 13
28
+ 㗇 蝦 13
29
+ 𠾴 唪 13
30
+ 嗍 索 13
31
+ 𧘹 太 13
32
+ 𠹳 傑 12
33
+ 𠹭 哥 12
34
+ 脫 脱 12
35
+ 䁪 眨 11
36
+ 𧨾 氹 11
37
+ 掬 谷 11
38
+ 𠸐 襟 11
39
+ 啥 沙 11
40
+ 𠱃 凹 10
41
+ 噔 等 10
42
+ 捹 揈 10
43
+ 𠹻 陣 10
44
+ 𠼻 基 10
45
+ 噠 達 10
46
+ 𨳊 鳩 10
47
+ 𢲲 晾 9
48
+ 𨉖 匿 9
49
+ 躭 耽 9
50
+ 䠋 卑 9
51
+ 嘮 勞 9
52
+ 啽 罨 9
53
+ 滮 標 8
54
+ 㧻 篤 8
55
+ 𧶄 爭 8
56
+ 𦛚 淰 8
57
+ 撠 棘 8
58
+ 呡 抆 8
59
+ 睸 瞇 8
60
+ 𠰲 嗝 8
61
+ 𥔿 冚 8
62
+ 唎 脷 8
63
+ 𠸊 佮 8
64
+ 𬜐 舔 8
65
+ 蔥 葱 8
66
+ B B 7
67
+ 𢯊 扚 7
68
+ 𫫃 𡃀 7
69
+ 銹 鏽 7
70
+ 㓤 拮 7
71
+ 䁯 瞌 7
72
+ 啉 林 7
73
+ 臥 卧 7
74
+ 𠓼 攝 7
75
+ 稅 税 7
76
+  趟 7
77
+ 喴 依 7
78
+ 噱 ??? 7
79
+ 𡄯 ??? 6
80
+ 揤 擳 6
81
+ 𢤹 啹 6
82
+  噏 6
83
+ 鷄 雞 6
84
+  ??? 6
85
+ 𦣇 籮 6
86
+ 齧 咬 6
87
+ 𠮨 乃 6
88
+  啤 6
89
+ 𡀝 稔 6
90
+ 婄 蓓 6
91
+ 𠼱 累 6
92
+ 𠱂 腯 5
93
+ 磧 石責 5
94
+ 𠰋 㕭 5
95
+ 𡂖 戾 5
96
+ 擏 擎 5
97
+ 𥋇 掌 5
98
+ 揢 咳 5
99
+ 㨆 冧 5
100
+ 𠾍 棄 5
101
+ 兌 兑 5
102
+ 𢺳 掹 5
103
+ 坺 擗 5
104
+ 鍚 錫 5
105
+ 𣘚 ??? 5
106
+ 𪘁 ??? 5
107
+ 𨳍 七 5
108
+ 嗙 o旁 5
109
+ 𠼰 ??? 5
110
+ 𨳒 屌 4
111
+ 唿 篋 4
112
+ 𣳼 ??? 4
113
+ 𦂥 ??? 4
114
+ 溚 塔 4
115
+ 囋 ??? 4
116
+ 瀄 吱 4
117
+ 𠌥 ??? 4
118
+ 𢫦 ??? 4
119
+ 𢶍 ??? 4
120
+ 𠲵 ??? 4
121
+ 䉺 ??? 4
122
+ 炕 ??? 4
123
+ 𢴈 撻 4
124
+ 𡲢 ??? 4
125
+ 𥅈 立 4
126
+ 𬧊 甩 4
127
+ 簕 勒 4
128
+ 査 查 4
129
+ 𩜠 岩 4
130
+ 𫬿 ??? 4
131
+ 𠜱 卑刂 4
132
+ 嚬 顰 4
133
+ 𠹹 ??? 4
134
+ 𦉘 ??? 4
135
+ 唦 沙 4
136
+ 㨘 扌省 4
137
+ 𡄽 瀉 4
138
+ 熗 槍 4
139
+ 𡁷 ??? 4
140
+ 𠿬 ??? 4
141
+ 咜 叱 4
142
+ 𠸏 茄 4
143
+ 𡁸 ??? 4
144
+ 𡃵 ??? 4
145
+ 𪚩 ??? 4
146
+ D D 4
147
+ Q Q 4
148
+ 𨆯 ??? 3
149
+ 啗 啖 3
150
+ 蔸 艹兜 3
151
+ 舗 鋪 3
152
+ 囪 窗 3
153
+ 艔 ??? 3
154
+ 洩 ??? 3
155
+ 𢵧 ??? 3
156
+ 菓 果 3
157
+ 䪴 ??? 3
158
+ 䆲 ??? 3
159
+ 痱 ??? 3
160
+ 趿 拖 3
161
+ 𠮩 ??? 3
162
+ 搉 確 3
163
+ 矋 矖 3
164
+ 𠻗 ??? 3
165
+ 𢲈 ??? 3
166
+ 潞 氵路 3
167
+ 沬 ??? 3
168
+ 揇 扌南 3
169
+ 齃 曷 3
170
+ 𡃤 賴 3
171
+ 𡃶 ??? 3
172
+ 瀟 ??? 3
173
+ 軨 ??? 3
174
+ 鉻 ??? 3
175
+  ??? 3
176
+ 㿭 斥 3
177
+ 𢵄 ??? 3
178
+ 㗲 ??? 3
179
+ 𢫕 ??? 3
180
+ 𢰸 ??? 3
181
+ 葫 ??? 3
182
+ 咔 ??? 3
183
+ 嚎 ??? 3
184
+ 嗿 ??? 3
185
+ 咈 o弗 3
186
+ 咾 嚕 3
187
+  ??? 3
188
+ 𠵈 妹 3
189
+ 吥 o不 3
190
+ 𠾭 ??? 3
191
+ 𠾵 ??? 3
192
+ 朘 俊 3
193
+ 觥 黃 3
194
+ 㩧 扌暴 2
195
+ 焙 ??? 2
196
+ 兀 ??? 2
197
+ 䭤 ??? 2
198
+ 饊 ??? 2
199
+ [ ??? 2
200
+ ] ??? 2
201
+ 炖 ??? 2
202
+ 争 爭 2
203
+ 䁓 ??? 2
204
+ 𡂝 ??? 2
205
+ 𩬎 壬 2
206
+ 鈒 閘 2
207
+ 亁 乾 2
208
+ 炠 灬甲 2
209
+ 摼 ??? 2
210
+ 𠺬 ??? 2
211
+ 𠵉 ??? 2
212
+ 蝄 ??? 2
213
+  ??? 2
214
+ 蔫 艹焉 2
215
+ 㘉 ??? 2
216
+ 荏 ??? 2
217
+ 墘 土乾 2
218
+ 嗏 搽 2
219
+ 呣 o母 2
220
+ 曚 矇 2
221
+ 壬 ??? 2
222
+ 揅 研 2
223
+ 溼 濕 2
224
+ 囓 咬 2
225
+ 嚙 咬 2
226
+ 枴 拐 2
227
+ 𡃀 ??? 2
228
+ 饑 ??? 2
229
+ 䏭 ??? 2
230
+ 挼 挪 2
231
+ 掱 ??? 2
232
+ 咑 打 2
233
+ 芙 ??? 2
234
+ 𦂗 ??? 2
235
+ 舦 軚 2
236
+ 𢶤 扌靴 2
237
+ 翡 ??? 2
238
+ 翠 ??? 2
239
+ 酡 酉它 2
240
+ 𫭊 ??? 2
241
+ 煀 火屈 2
242
+ 耙 ??? 2
243
+ 𠿭 滑 2
244
+ 鉤 鈎 2
245
+ 𠻘 ??? 2
246
+ 脽 離 2
247
+ 焊 ??? 2
248
+ 唊 o夾 2
249
+ 胅 ⺼失 2
250
+ 翕 ??? 2
251
+ 摜 摔 2
252
+ 僚 ??? 1
253
+ 𩗴 ??? 1
254
+ 毡 ??? 1
255
+ 跤 ??? 1
256
+ 梧 ??? 1
257
+ 痄 疒乍 1
258
+ 卟 卜 1
259
+ 劄 札 1
260
+ 𠶜 制 1
261
+ 睜 ??? 1
262
+ 迹 跡 1
263
+ 揃 扌前 1
264
+ 唨 o阻 1
265
+ 謢 護 1
266
+ 菻 麻 1
267
+ 𣚺 ??? 1
268
+ 鷓 庶鳥 1
269
+ 鴣 古鳥 1
270
+ 强 ??? 1
271
+ 𠾶 ??? 1
272
+ 𡆀 轟 1
273
+ 拫 扌艮 1
274
+ 𠼮 偽 1
275
+ 汞 ??? 1
276
+ 㤿 ??? 1
277
+ 厴 ??? 1
278
+ 𥀬 ??? 1
279
+ 牯 ??? 1
280
+ 𡇙 ??? 1
281
+ 讕 賴 1
282
+ 𠿫 ??? 1
283
+ 瘺 婁 1
284
+ 骲 骨包 1
285
+ 𫲭 ??? 1
286
+ 瓏 玉龍 1
287
+ 繚 ??? 1
288
+ 撿 ??? 1
289
+ 跀 ⻊月 1
290
+ 𢛴 掹 1
291
+ 蝻 虫南 1
292
+ 赧 羞赤 1
293
+ 𪙛 甩 1
294
+  ??? 1
295
+ 檳 ??? 1
296
+ 潲 餿 1
297
+ 𢶠 ??? 1
298
+ 秧 ??? 1
299
+ 蒔 ??? 1
300
+ 炩 灬令 1
301
+ 㩋 ??? 1
302
+ 饅 ??? 1
303
+ 鍍 ??? 1
304
+ 𢚖 ??? 1
305
+ 𧊅 虫另 1
306
+  ??? 1
307
+ 篸 ??? 1
308
+ 𩟔 ??? 1
309
+ 撍 賺 1
310
+ 栗 ??? 1
311
+  ??? 1
312
+ 𡆇 ??? 1
313
+ 杧 芒 1
314
+ 榛 ??? 1
315
+ 蠄 虫禽 1
316
+ 蟧 ??? 1
317
+ 嘶 ??? 1
318
+ 梆 ??? 1
319
+ 竪 豎 1
320
+ 騾 ??? 1
321
+ 矺 ??? 1
322
+ 堀 ??? 1
323
+ 麝 ??? 1
324
+ 慪 嘔 1
325
+ 撴 扌敦 1
326
+ 哾 啜 1
327
+ 𠳖 ??? 1
328
+ 洌 冽 1
329
+ 霹 ??? 1
330
+ 𠾼 ??? 1
331
+ 𬦠 ??? 1
332
+ 𤌍 ??? 1
333
+ 𬧯 ??? 1
334
+ 厠 廁 1
335
+ 㖡 ??? 1
336
+ 跁 ⻊巴 1
337
+ 鉎 ??? 1
338
+ 𧣈 ??? 1
339
+ 𠳏 ??? 1
340
+ 㹃 非 1
341
+ 𧝞 ??? 1
342
+ 𡀞 ??? 1
343
+ 㦒 ??? 1
344
+ 𩩍 娉 1
345
+ 𢱢 ??? 1
346
+ 鍟 ??? 1
347
+ 煱 ??? 1
348
+ 撘 搭 1
349
+ 閱 ??? 1
350
+ 橇 喬 1
351
+ 籽 ??? 1
352
+ 庵 ??? 1
353
+ 厨 ??? 1
354
+ 疴 屙 1
355
+ 豹 ??? 1
356
+ 杠 槓 1
357
+ 咘 o布 1
358
+ 裡 ??? 1
359
+ 熏 燻 1
360
+  ??? 1
checkpoint-11000/config.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Ayaka/bart-base-cantonese",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "gelu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 101,
12
+ "classif_dropout": 0.1,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_ffn_dim": 3072,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 101,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 12,
23
+ "encoder_ffn_dim": 3072,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 102,
27
+ "forced_eos_token_id": 102,
28
+ "gradient_checkpointing": false,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1",
32
+ "2": "LABEL_2"
33
+ },
34
+ "init_std": 0.02,
35
+ "is_encoder_decoder": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_length": 64,
42
+ "max_position_embeddings": 512,
43
+ "min_length": 3,
44
+ "model_type": "bart",
45
+ "no_repeat_ngram_size": 3,
46
+ "normalize_before": false,
47
+ "normalize_embedding": true,
48
+ "num_beams": 4,
49
+ "num_hidden_layers": 6,
50
+ "pad_token_id": 0,
51
+ "scale_embedding": false,
52
+ "task_specific_params": {
53
+ "summarization": {
54
+ "length_penalty": 1.0,
55
+ "max_length": 128,
56
+ "min_length": 12,
57
+ "num_beams": 4
58
+ },
59
+ "summarization_cnn": {
60
+ "length_penalty": 2.0,
61
+ "max_length": 142,
62
+ "min_length": 56,
63
+ "num_beams": 4
64
+ },
65
+ "summarization_xsum": {
66
+ "length_penalty": 1.0,
67
+ "max_length": 62,
68
+ "min_length": 11,
69
+ "num_beams": 6
70
+ }
71
+ },
72
+ "torch_dtype": "float32",
73
+ "transformers_version": "4.27.4",
74
+ "use_cache": true,
75
+ "vocab_size": 12660
76
+ }
checkpoint-11000/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 101,
3
+ "decoder_start_token_id": 101,
4
+ "early_stopping": true,
5
+ "eos_token_id": 102,
6
+ "forced_eos_token_id": 102,
7
+ "max_length": 64,
8
+ "min_length": 3,
9
+ "no_repeat_ngram_size": 3,
10
+ "num_beams": 4,
11
+ "pad_token_id": 0,
12
+ "transformers_version": "4.27.4"
13
+ }
checkpoint-11000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c35fcf880925d6524a4d69c6153d76494bf292e183492aef621926fb1a3339f3
3
+ size 878171525
checkpoint-11000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41d33ab2cbebaa3b1b34e65ceeb91b9ac3e55d369daa79e96c105e90f84610c8
3
+ size 439148829
checkpoint-11000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c7e530afd0f80bdec827d3df914776b44f41c60dd27c32dc14b519e239a533e
3
+ size 13553
checkpoint-11000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20f836947b7cda13930ff3f199cca6c84444150a1c16e3cdc68ff9b8ba9995f2
3
+ size 627
checkpoint-11000/trainer_state.json ADDED
@@ -0,0 +1,456 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.38198354265916,
5
+ "global_step": 11000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.04,
12
+ "learning_rate": 5e-06,
13
+ "loss": 1.7639,
14
+ "step": 200
15
+ },
16
+ {
17
+ "epoch": 0.09,
18
+ "learning_rate": 1e-05,
19
+ "loss": 0.4886,
20
+ "step": 400
21
+ },
22
+ {
23
+ "epoch": 0.13,
24
+ "learning_rate": 1.5e-05,
25
+ "loss": 0.2481,
26
+ "step": 600
27
+ },
28
+ {
29
+ "epoch": 0.17,
30
+ "learning_rate": 2e-05,
31
+ "loss": 0.1724,
32
+ "step": 800
33
+ },
34
+ {
35
+ "epoch": 0.22,
36
+ "learning_rate": 2.5e-05,
37
+ "loss": 0.1417,
38
+ "step": 1000
39
+ },
40
+ {
41
+ "epoch": 0.22,
42
+ "eval_bleu": 15.49859933982253,
43
+ "eval_chrf": 14.312711083470898,
44
+ "eval_loss": 0.37249916791915894,
45
+ "eval_runtime": 140.7211,
46
+ "eval_samples_per_second": 4.548,
47
+ "eval_steps_per_second": 0.071,
48
+ "step": 1000
49
+ },
50
+ {
51
+ "epoch": 0.26,
52
+ "learning_rate": 3e-05,
53
+ "loss": 0.1238,
54
+ "step": 1200
55
+ },
56
+ {
57
+ "epoch": 0.3,
58
+ "learning_rate": 3.5e-05,
59
+ "loss": 0.1094,
60
+ "step": 1400
61
+ },
62
+ {
63
+ "epoch": 0.35,
64
+ "learning_rate": 4e-05,
65
+ "loss": 0.1004,
66
+ "step": 1600
67
+ },
68
+ {
69
+ "epoch": 0.39,
70
+ "learning_rate": 4.5e-05,
71
+ "loss": 0.0889,
72
+ "step": 1800
73
+ },
74
+ {
75
+ "epoch": 0.43,
76
+ "learning_rate": 5e-05,
77
+ "loss": 0.0843,
78
+ "step": 2000
79
+ },
80
+ {
81
+ "epoch": 0.43,
82
+ "eval_bleu": 13.930224375278131,
83
+ "eval_chrf": 15.927172187426772,
84
+ "eval_loss": 0.3602243661880493,
85
+ "eval_runtime": 171.4695,
86
+ "eval_samples_per_second": 3.732,
87
+ "eval_steps_per_second": 0.058,
88
+ "step": 2000
89
+ },
90
+ {
91
+ "epoch": 0.48,
92
+ "learning_rate": 4.952584163110479e-05,
93
+ "loss": 0.0795,
94
+ "step": 2200
95
+ },
96
+ {
97
+ "epoch": 0.52,
98
+ "learning_rate": 4.905168326220958e-05,
99
+ "loss": 0.0748,
100
+ "step": 2400
101
+ },
102
+ {
103
+ "epoch": 0.56,
104
+ "learning_rate": 4.8577524893314366e-05,
105
+ "loss": 0.0705,
106
+ "step": 2600
107
+ },
108
+ {
109
+ "epoch": 0.61,
110
+ "learning_rate": 4.8103366524419156e-05,
111
+ "loss": 0.0665,
112
+ "step": 2800
113
+ },
114
+ {
115
+ "epoch": 0.65,
116
+ "learning_rate": 4.7629208155523946e-05,
117
+ "loss": 0.0625,
118
+ "step": 3000
119
+ },
120
+ {
121
+ "epoch": 0.65,
122
+ "eval_bleu": 20.903227705509728,
123
+ "eval_chrf": 19.94358238391718,
124
+ "eval_loss": 0.363214910030365,
125
+ "eval_runtime": 179.3923,
126
+ "eval_samples_per_second": 3.568,
127
+ "eval_steps_per_second": 0.056,
128
+ "step": 3000
129
+ },
130
+ {
131
+ "epoch": 0.69,
132
+ "learning_rate": 4.7155049786628736e-05,
133
+ "loss": 0.0607,
134
+ "step": 3200
135
+ },
136
+ {
137
+ "epoch": 0.74,
138
+ "learning_rate": 4.6680891417733527e-05,
139
+ "loss": 0.0584,
140
+ "step": 3400
141
+ },
142
+ {
143
+ "epoch": 0.78,
144
+ "learning_rate": 4.620673304883831e-05,
145
+ "loss": 0.0578,
146
+ "step": 3600
147
+ },
148
+ {
149
+ "epoch": 0.82,
150
+ "learning_rate": 4.57325746799431e-05,
151
+ "loss": 0.0549,
152
+ "step": 3800
153
+ },
154
+ {
155
+ "epoch": 0.87,
156
+ "learning_rate": 4.525841631104789e-05,
157
+ "loss": 0.0531,
158
+ "step": 4000
159
+ },
160
+ {
161
+ "epoch": 0.87,
162
+ "eval_bleu": 17.338938784234383,
163
+ "eval_chrf": 15.930568457831859,
164
+ "eval_loss": 0.36546987295150757,
165
+ "eval_runtime": 170.3063,
166
+ "eval_samples_per_second": 3.758,
167
+ "eval_steps_per_second": 0.059,
168
+ "step": 4000
169
+ },
170
+ {
171
+ "epoch": 0.91,
172
+ "learning_rate": 4.478425794215268e-05,
173
+ "loss": 0.0521,
174
+ "step": 4200
175
+ },
176
+ {
177
+ "epoch": 0.95,
178
+ "learning_rate": 4.431009957325747e-05,
179
+ "loss": 0.0513,
180
+ "step": 4400
181
+ },
182
+ {
183
+ "epoch": 1.0,
184
+ "learning_rate": 4.383594120436226e-05,
185
+ "loss": 0.05,
186
+ "step": 4600
187
+ },
188
+ {
189
+ "epoch": 1.04,
190
+ "learning_rate": 4.3361782835467044e-05,
191
+ "loss": 0.0462,
192
+ "step": 4800
193
+ },
194
+ {
195
+ "epoch": 1.08,
196
+ "learning_rate": 4.2887624466571834e-05,
197
+ "loss": 0.0442,
198
+ "step": 5000
199
+ },
200
+ {
201
+ "epoch": 1.08,
202
+ "eval_bleu": 35.5303321748609,
203
+ "eval_chrf": 30.398609275779588,
204
+ "eval_loss": 0.3768843710422516,
205
+ "eval_runtime": 147.673,
206
+ "eval_samples_per_second": 4.334,
207
+ "eval_steps_per_second": 0.068,
208
+ "step": 5000
209
+ },
210
+ {
211
+ "epoch": 1.13,
212
+ "learning_rate": 4.2413466097676624e-05,
213
+ "loss": 0.0437,
214
+ "step": 5200
215
+ },
216
+ {
217
+ "epoch": 1.17,
218
+ "learning_rate": 4.1939307728781414e-05,
219
+ "loss": 0.0436,
220
+ "step": 5400
221
+ },
222
+ {
223
+ "epoch": 1.21,
224
+ "learning_rate": 4.1465149359886204e-05,
225
+ "loss": 0.0428,
226
+ "step": 5600
227
+ },
228
+ {
229
+ "epoch": 1.26,
230
+ "learning_rate": 4.099099099099099e-05,
231
+ "loss": 0.0418,
232
+ "step": 5800
233
+ },
234
+ {
235
+ "epoch": 1.3,
236
+ "learning_rate": 4.051683262209578e-05,
237
+ "loss": 0.0408,
238
+ "step": 6000
239
+ },
240
+ {
241
+ "epoch": 1.3,
242
+ "eval_bleu": 40.96986293672358,
243
+ "eval_chrf": 35.0063576863817,
244
+ "eval_loss": 0.38005733489990234,
245
+ "eval_runtime": 82.4647,
246
+ "eval_samples_per_second": 7.761,
247
+ "eval_steps_per_second": 0.121,
248
+ "step": 6000
249
+ },
250
+ {
251
+ "epoch": 1.34,
252
+ "learning_rate": 4.004267425320057e-05,
253
+ "loss": 0.041,
254
+ "step": 6200
255
+ },
256
+ {
257
+ "epoch": 1.39,
258
+ "learning_rate": 3.956851588430536e-05,
259
+ "loss": 0.0385,
260
+ "step": 6400
261
+ },
262
+ {
263
+ "epoch": 1.43,
264
+ "learning_rate": 3.909435751541015e-05,
265
+ "loss": 0.0397,
266
+ "step": 6600
267
+ },
268
+ {
269
+ "epoch": 1.47,
270
+ "learning_rate": 3.862019914651494e-05,
271
+ "loss": 0.0384,
272
+ "step": 6800
273
+ },
274
+ {
275
+ "epoch": 1.52,
276
+ "learning_rate": 3.814604077761973e-05,
277
+ "loss": 0.0389,
278
+ "step": 7000
279
+ },
280
+ {
281
+ "epoch": 1.52,
282
+ "eval_bleu": 41.51574989819788,
283
+ "eval_chrf": 35.55197531009423,
284
+ "eval_loss": 0.38628411293029785,
285
+ "eval_runtime": 89.9685,
286
+ "eval_samples_per_second": 7.114,
287
+ "eval_steps_per_second": 0.111,
288
+ "step": 7000
289
+ },
290
+ {
291
+ "epoch": 1.56,
292
+ "learning_rate": 3.767188240872452e-05,
293
+ "loss": 0.038,
294
+ "step": 7200
295
+ },
296
+ {
297
+ "epoch": 1.6,
298
+ "learning_rate": 3.719772403982931e-05,
299
+ "loss": 0.0374,
300
+ "step": 7400
301
+ },
302
+ {
303
+ "epoch": 1.65,
304
+ "learning_rate": 3.67235656709341e-05,
305
+ "loss": 0.0359,
306
+ "step": 7600
307
+ },
308
+ {
309
+ "epoch": 1.69,
310
+ "learning_rate": 3.624940730203888e-05,
311
+ "loss": 0.0358,
312
+ "step": 7800
313
+ },
314
+ {
315
+ "epoch": 1.73,
316
+ "learning_rate": 3.577524893314367e-05,
317
+ "loss": 0.0359,
318
+ "step": 8000
319
+ },
320
+ {
321
+ "epoch": 1.73,
322
+ "eval_bleu": 23.208736406312035,
323
+ "eval_chrf": 23.97795821953749,
324
+ "eval_loss": 0.3921656310558319,
325
+ "eval_runtime": 182.5523,
326
+ "eval_samples_per_second": 3.506,
327
+ "eval_steps_per_second": 0.055,
328
+ "step": 8000
329
+ },
330
+ {
331
+ "epoch": 1.78,
332
+ "learning_rate": 3.530109056424846e-05,
333
+ "loss": 0.0348,
334
+ "step": 8200
335
+ },
336
+ {
337
+ "epoch": 1.82,
338
+ "learning_rate": 3.482693219535325e-05,
339
+ "loss": 0.0352,
340
+ "step": 8400
341
+ },
342
+ {
343
+ "epoch": 1.86,
344
+ "learning_rate": 3.435277382645804e-05,
345
+ "loss": 0.0351,
346
+ "step": 8600
347
+ },
348
+ {
349
+ "epoch": 1.91,
350
+ "learning_rate": 3.3878615457562826e-05,
351
+ "loss": 0.0345,
352
+ "step": 8800
353
+ },
354
+ {
355
+ "epoch": 1.95,
356
+ "learning_rate": 3.3404457088667616e-05,
357
+ "loss": 0.0337,
358
+ "step": 9000
359
+ },
360
+ {
361
+ "epoch": 1.95,
362
+ "eval_bleu": 41.547921684162176,
363
+ "eval_chrf": 35.46471050376956,
364
+ "eval_loss": 0.40451329946517944,
365
+ "eval_runtime": 89.4039,
366
+ "eval_samples_per_second": 7.159,
367
+ "eval_steps_per_second": 0.112,
368
+ "step": 9000
369
+ },
370
+ {
371
+ "epoch": 1.99,
372
+ "learning_rate": 3.2930298719772407e-05,
373
+ "loss": 0.0343,
374
+ "step": 9200
375
+ },
376
+ {
377
+ "epoch": 2.04,
378
+ "learning_rate": 3.24561403508772e-05,
379
+ "loss": 0.0309,
380
+ "step": 9400
381
+ },
382
+ {
383
+ "epoch": 2.08,
384
+ "learning_rate": 3.198198198198199e-05,
385
+ "loss": 0.0296,
386
+ "step": 9600
387
+ },
388
+ {
389
+ "epoch": 2.12,
390
+ "learning_rate": 3.150782361308677e-05,
391
+ "loss": 0.0291,
392
+ "step": 9800
393
+ },
394
+ {
395
+ "epoch": 2.17,
396
+ "learning_rate": 3.103366524419156e-05,
397
+ "loss": 0.0295,
398
+ "step": 10000
399
+ },
400
+ {
401
+ "epoch": 2.17,
402
+ "eval_bleu": 41.51485442459467,
403
+ "eval_chrf": 35.46553158852993,
404
+ "eval_loss": 0.4056099057197571,
405
+ "eval_runtime": 89.2092,
406
+ "eval_samples_per_second": 7.174,
407
+ "eval_steps_per_second": 0.112,
408
+ "step": 10000
409
+ },
410
+ {
411
+ "epoch": 2.21,
412
+ "learning_rate": 3.055950687529635e-05,
413
+ "loss": 0.0285,
414
+ "step": 10200
415
+ },
416
+ {
417
+ "epoch": 2.25,
418
+ "learning_rate": 3.008534850640114e-05,
419
+ "loss": 0.0293,
420
+ "step": 10400
421
+ },
422
+ {
423
+ "epoch": 2.3,
424
+ "learning_rate": 2.9611190137505927e-05,
425
+ "loss": 0.0285,
426
+ "step": 10600
427
+ },
428
+ {
429
+ "epoch": 2.34,
430
+ "learning_rate": 2.9137031768610717e-05,
431
+ "loss": 0.0288,
432
+ "step": 10800
433
+ },
434
+ {
435
+ "epoch": 2.38,
436
+ "learning_rate": 2.8662873399715508e-05,
437
+ "loss": 0.0279,
438
+ "step": 11000
439
+ },
440
+ {
441
+ "epoch": 2.38,
442
+ "eval_bleu": 41.75820932324433,
443
+ "eval_chrf": 35.69247581900476,
444
+ "eval_loss": 0.41462868452072144,
445
+ "eval_runtime": 152.578,
446
+ "eval_samples_per_second": 4.195,
447
+ "eval_steps_per_second": 0.066,
448
+ "step": 11000
449
+ }
450
+ ],
451
+ "max_steps": 23090,
452
+ "num_train_epochs": 5,
453
+ "total_flos": 2.682581278261248e+16,
454
+ "trial_name": null,
455
+ "trial_params": null
456
+ }
checkpoint-11000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87fcb457b09c1b3e7d50dad4c575e4da4493a663f81ee73d6fe8065b4e460e1a
3
+ size 3643
commercial_baselines/bing.can ADDED
The diff for this file is too large to render. See raw diff
 
commercial_baselines/bing.key ADDED
@@ -0,0 +1 @@
 
 
1
+ 99b56b71eab141dfad4e5a4789e958e8
commercial_baselines/bing.man ADDED
The diff for this file is too large to render. See raw diff
 
commercial_baselines/bing_translator.ipynb ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import requests, uuid\n",
10
+ "\n",
11
+ "# Add your key and endpoint\n",
12
+ "with open(\"bing.key\", \"r\") as key_file:\n",
13
+ " key = key_file.read()\n",
14
+ "endpoint = \"https://api.cognitive.microsofttranslator.com\"\n",
15
+ "\n",
16
+ "# location, also known as region.\n",
17
+ "# required if you're using a multi-service or regional (not global) resource. It can be found in the Azure portal on the Keys and Endpoint page.\n",
18
+ "location = \"eastus\"\n",
19
+ "\n",
20
+ "path = '/translate'\n",
21
+ "constructed_url = endpoint + path\n",
22
+ "\n",
23
+ "params = {\n",
24
+ " 'api-version': '3.0',\n",
25
+ " 'from': 'yue',\n",
26
+ " 'to': ['zh-Hant']\n",
27
+ "}\n",
28
+ "\n",
29
+ "headers = {\n",
30
+ " 'Ocp-Apim-Subscription-Key': key,\n",
31
+ " # location required if you're using a multi-service or regional (not global) resource.\n",
32
+ " 'Ocp-Apim-Subscription-Region': location,\n",
33
+ " 'Content-type': 'application/json',\n",
34
+ " 'X-ClientTraceId': str(uuid.uuid4())\n",
35
+ "}\n",
36
+ "\n",
37
+ "# https://stackoverflow.com/a/312464/6798201\n",
38
+ "def chunks(lst, n):\n",
39
+ " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n",
40
+ " for i in range(0, len(lst), n):\n",
41
+ " yield lst[i:i + n]"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 5,
47
+ "metadata": {},
48
+ "outputs": [
49
+ {
50
+ "ename": "KeyboardInterrupt",
51
+ "evalue": "",
52
+ "output_type": "error",
53
+ "traceback": [
54
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
55
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
56
+ "\u001b[1;32m/Users/kevin/Dev/classes/winter2023/eecs487/canto_mando_bart_bitext_typo_augment_full_bing/commercial_baselines/bing_translator.ipynb Cell 2\u001b[0m in \u001b[0;36m1\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/kevin/Dev/classes/winter2023/eecs487/canto_mando_bart_bitext_typo_augment_full_bing/commercial_baselines/bing_translator.ipynb#W1sZmlsZQ%3D%3D?line=12'>13</a>\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39m../test.typos.pred.bing.man\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39ma+\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m output_file:\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/kevin/Dev/classes/winter2023/eecs487/canto_mando_bart_bitext_typo_augment_full_bing/commercial_baselines/bing_translator.ipynb#W1sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m \u001b[39mfor\u001b[39;00m chunk \u001b[39min\u001b[39;00m chunks(body, \u001b[39m500\u001b[39m):\n\u001b[0;32m---> <a href='vscode-notebook-cell:/Users/kevin/Dev/classes/winter2023/eecs487/canto_mando_bart_bitext_typo_augment_full_bing/commercial_baselines/bing_translator.ipynb#W1sZmlsZQ%3D%3D?line=14'>15</a>\u001b[0m request \u001b[39m=\u001b[39m requests\u001b[39m.\u001b[39;49mpost(constructed_url, params\u001b[39m=\u001b[39;49mparams, headers\u001b[39m=\u001b[39;49mheaders, json\u001b[39m=\u001b[39;49mchunk)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/kevin/Dev/classes/winter2023/eecs487/canto_mando_bart_bitext_typo_augment_full_bing/commercial_baselines/bing_translator.ipynb#W1sZmlsZQ%3D%3D?line=15'>16</a>\u001b[0m response \u001b[39m=\u001b[39m request\u001b[39m.\u001b[39mjson()\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/kevin/Dev/classes/winter2023/eecs487/canto_mando_bart_bitext_typo_augment_full_bing/commercial_baselines/bing_translator.ipynb#W1sZmlsZQ%3D%3D?line=16'>17</a>\u001b[0m \u001b[39mfor\u001b[39;00m line \u001b[39min\u001b[39;00m response:\n",
57
+ "File \u001b[0;32m~/.pyenv/versions/3.10.6/lib/python3.10/site-packages/requests/api.py:115\u001b[0m, in \u001b[0;36mpost\u001b[0;34m(url, data, json, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mpost\u001b[39m(url, data\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, json\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m 104\u001b[0m \u001b[39mr\u001b[39m\u001b[39m\"\"\"Sends a POST request.\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \n\u001b[1;32m 106\u001b[0m \u001b[39m :param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[39m :rtype: requests.Response\u001b[39;00m\n\u001b[1;32m 113\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 115\u001b[0m \u001b[39mreturn\u001b[39;00m request(\u001b[39m\"\u001b[39;49m\u001b[39mpost\u001b[39;49m\u001b[39m\"\u001b[39;49m, url, data\u001b[39m=\u001b[39;49mdata, json\u001b[39m=\u001b[39;49mjson, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
58
+ "File \u001b[0;32m~/.pyenv/versions/3.10.6/lib/python3.10/site-packages/requests/api.py:59\u001b[0m, in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[39m# By using the 'with' statement we are sure the session is closed, thus we\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \u001b[39m# avoid leaving sockets open which can trigger a ResourceWarning in some\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[39m# cases, and look like a memory leak in others.\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[39mwith\u001b[39;00m sessions\u001b[39m.\u001b[39mSession() \u001b[39mas\u001b[39;00m session:\n\u001b[0;32m---> 59\u001b[0m \u001b[39mreturn\u001b[39;00m session\u001b[39m.\u001b[39;49mrequest(method\u001b[39m=\u001b[39;49mmethod, url\u001b[39m=\u001b[39;49murl, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
59
+ "File \u001b[0;32m~/.pyenv/versions/3.10.6/lib/python3.10/site-packages/requests/sessions.py:587\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 582\u001b[0m send_kwargs \u001b[39m=\u001b[39m {\n\u001b[1;32m 583\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtimeout\u001b[39m\u001b[39m\"\u001b[39m: timeout,\n\u001b[1;32m 584\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mallow_redirects\u001b[39m\u001b[39m\"\u001b[39m: allow_redirects,\n\u001b[1;32m 585\u001b[0m }\n\u001b[1;32m 586\u001b[0m send_kwargs\u001b[39m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 587\u001b[0m resp \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msend(prep, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49msend_kwargs)\n\u001b[1;32m 589\u001b[0m \u001b[39mreturn\u001b[39;00m resp\n",
60
+ "File \u001b[0;32m~/.pyenv/versions/3.10.6/lib/python3.10/site-packages/requests/sessions.py:701\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 698\u001b[0m start \u001b[39m=\u001b[39m preferred_clock()\n\u001b[1;32m 700\u001b[0m \u001b[39m# Send the request\u001b[39;00m\n\u001b[0;32m--> 701\u001b[0m r \u001b[39m=\u001b[39m adapter\u001b[39m.\u001b[39;49msend(request, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 703\u001b[0m \u001b[39m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[1;32m 704\u001b[0m elapsed \u001b[39m=\u001b[39m preferred_clock() \u001b[39m-\u001b[39m start\n",
61
+ "File \u001b[0;32m~/.pyenv/versions/3.10.6/lib/python3.10/site-packages/requests/adapters.py:489\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 487\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 488\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m chunked:\n\u001b[0;32m--> 489\u001b[0m resp \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39;49murlopen(\n\u001b[1;32m 490\u001b[0m method\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mmethod,\n\u001b[1;32m 491\u001b[0m url\u001b[39m=\u001b[39;49murl,\n\u001b[1;32m 492\u001b[0m body\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mbody,\n\u001b[1;32m 493\u001b[0m headers\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mheaders,\n\u001b[1;32m 494\u001b[0m redirect\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m 495\u001b[0m assert_same_host\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m 496\u001b[0m preload_content\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m 497\u001b[0m decode_content\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m 498\u001b[0m retries\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmax_retries,\n\u001b[1;32m 499\u001b[0m timeout\u001b[39m=\u001b[39;49mtimeout,\n\u001b[1;32m 500\u001b[0m )\n\u001b[1;32m 502\u001b[0m \u001b[39m# Send the request.\u001b[39;00m\n\u001b[1;32m 503\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 504\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(conn, \u001b[39m\"\u001b[39m\u001b[39mproxy_pool\u001b[39m\u001b[39m\"\u001b[39m):\n",
62
+ "File \u001b[0;32m~/.pyenv/versions/3.10.6/lib/python3.10/site-packages/urllib3/connectionpool.py:703\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 700\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_prepare_proxy(conn)\n\u001b[1;32m 702\u001b[0m \u001b[39m# Make the request on the httplib connection object.\u001b[39;00m\n\u001b[0;32m--> 703\u001b[0m httplib_response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_request(\n\u001b[1;32m 704\u001b[0m conn,\n\u001b[1;32m 705\u001b[0m method,\n\u001b[1;32m 706\u001b[0m url,\n\u001b[1;32m 707\u001b[0m timeout\u001b[39m=\u001b[39;49mtimeout_obj,\n\u001b[1;32m 708\u001b[0m body\u001b[39m=\u001b[39;49mbody,\n\u001b[1;32m 709\u001b[0m headers\u001b[39m=\u001b[39;49mheaders,\n\u001b[1;32m 710\u001b[0m chunked\u001b[39m=\u001b[39;49mchunked,\n\u001b[1;32m 711\u001b[0m )\n\u001b[1;32m 713\u001b[0m \u001b[39m# If we're going to release the connection in ``finally:``, then\u001b[39;00m\n\u001b[1;32m 714\u001b[0m \u001b[39m# the response doesn't need to know about the connection. Otherwise\u001b[39;00m\n\u001b[1;32m 715\u001b[0m \u001b[39m# it will also try to release it and we'll have a double-release\u001b[39;00m\n\u001b[1;32m 716\u001b[0m \u001b[39m# mess.\u001b[39;00m\n\u001b[1;32m 717\u001b[0m response_conn \u001b[39m=\u001b[39m conn \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m release_conn \u001b[39melse\u001b[39;00m \u001b[39mNone\u001b[39;00m\n",
63
+ "File \u001b[0;32m~/.pyenv/versions/3.10.6/lib/python3.10/site-packages/urllib3/connectionpool.py:449\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39mgetresponse()\n\u001b[1;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[1;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[1;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[0;32m--> 449\u001b[0m six\u001b[39m.\u001b[39;49mraise_from(e, \u001b[39mNone\u001b[39;49;00m)\n\u001b[1;32m 450\u001b[0m \u001b[39mexcept\u001b[39;00m (SocketTimeout, BaseSSLError, SocketError) \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 451\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_raise_timeout(err\u001b[39m=\u001b[39me, url\u001b[39m=\u001b[39murl, timeout_value\u001b[39m=\u001b[39mread_timeout)\n",
64
+ "File \u001b[0;32m<string>:3\u001b[0m, in \u001b[0;36mraise_from\u001b[0;34m(value, from_value)\u001b[0m\n",
65
+ "File \u001b[0;32m~/.pyenv/versions/3.10.6/lib/python3.10/site-packages/urllib3/connectionpool.py:444\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mTypeError\u001b[39;00m:\n\u001b[1;32m 442\u001b[0m \u001b[39m# Python 3\u001b[39;00m\n\u001b[1;32m 443\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39;49mgetresponse()\n\u001b[1;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[1;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[1;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[1;32m 449\u001b[0m six\u001b[39m.\u001b[39mraise_from(e, \u001b[39mNone\u001b[39;00m)\n",
66
+ "File \u001b[0;32m~/.pyenv/versions/3.10.6/lib/python3.10/http/client.py:1374\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1372\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 1373\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 1374\u001b[0m response\u001b[39m.\u001b[39;49mbegin()\n\u001b[1;32m 1375\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mConnectionError\u001b[39;00m:\n\u001b[1;32m 1376\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclose()\n",
67
+ "File \u001b[0;32m~/.pyenv/versions/3.10.6/lib/python3.10/http/client.py:318\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 316\u001b[0m \u001b[39m# read until we get a non-100 response\u001b[39;00m\n\u001b[1;32m 317\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[0;32m--> 318\u001b[0m version, status, reason \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_read_status()\n\u001b[1;32m 319\u001b[0m \u001b[39mif\u001b[39;00m status \u001b[39m!=\u001b[39m CONTINUE:\n\u001b[1;32m 320\u001b[0m \u001b[39mbreak\u001b[39;00m\n",
68
+ "File \u001b[0;32m~/.pyenv/versions/3.10.6/lib/python3.10/http/client.py:279\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_read_status\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[0;32m--> 279\u001b[0m line \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfp\u001b[39m.\u001b[39;49mreadline(_MAXLINE \u001b[39m+\u001b[39;49m \u001b[39m1\u001b[39;49m), \u001b[39m\"\u001b[39m\u001b[39miso-8859-1\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 280\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(line) \u001b[39m>\u001b[39m _MAXLINE:\n\u001b[1;32m 281\u001b[0m \u001b[39mraise\u001b[39;00m LineTooLong(\u001b[39m\"\u001b[39m\u001b[39mstatus line\u001b[39m\u001b[39m\"\u001b[39m)\n",
69
+ "File \u001b[0;32m~/.pyenv/versions/3.10.6/lib/python3.10/socket.py:705\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m 704\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 705\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sock\u001b[39m.\u001b[39;49mrecv_into(b)\n\u001b[1;32m 706\u001b[0m \u001b[39mexcept\u001b[39;00m timeout:\n\u001b[1;32m 707\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_timeout_occurred \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n",
70
+ "File \u001b[0;32m~/.pyenv/versions/3.10.6/lib/python3.10/ssl.py:1274\u001b[0m, in \u001b[0;36mSSLSocket.recv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1270\u001b[0m \u001b[39mif\u001b[39;00m flags \u001b[39m!=\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m 1271\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 1272\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m\n\u001b[1;32m 1273\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m)\n\u001b[0;32m-> 1274\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mread(nbytes, buffer)\n\u001b[1;32m 1275\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 1276\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39mrecv_into(buffer, nbytes, flags)\n",
71
+ "File \u001b[0;32m~/.pyenv/versions/3.10.6/lib/python3.10/ssl.py:1130\u001b[0m, in \u001b[0;36mSSLSocket.read\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1128\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 1129\u001b[0m \u001b[39mif\u001b[39;00m buffer \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m-> 1130\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sslobj\u001b[39m.\u001b[39;49mread(\u001b[39mlen\u001b[39;49m, buffer)\n\u001b[1;32m 1131\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 1132\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_sslobj\u001b[39m.\u001b[39mread(\u001b[39mlen\u001b[39m)\n",
72
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
73
+ ]
74
+ }
75
+ ],
76
+ "source": [
77
+ "\"\"\"\n",
78
+ "Translate test sentences\n",
79
+ "\"\"\"\n",
80
+ "\n",
81
+ "# You can pass more than one object in body.\n",
82
+ "with open(\"../para/test/test.typos.can\", \"r\") as input_file:\n",
83
+ " body = [{'text': line} for line in input_file.read().splitlines()]\n",
84
+ "\n",
85
+ "# Clear previous outputs\n",
86
+ "open(\"../test.typos.pred.bing.man\", 'w').close()\n",
87
+ "\n",
88
+ "# Split translation request into chunks of 500 lines (10,000 character limit per request)\n",
89
+ "with open(\"../test.typos.pred.bing.man\", \"a+\") as output_file:\n",
90
+ " for chunk in chunks(body, 500):\n",
91
+ " request = requests.post(constructed_url, params=params, headers=headers, json=chunk)\n",
92
+ " response = request.json()\n",
93
+ " for line in response:\n",
94
+ " output_file.write(line['translations'][0]['text'] + \"\\n\")"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 17,
100
+ "metadata": {},
101
+ "outputs": [
102
+ {
103
+ "name": "stderr",
104
+ "output_type": "stream",
105
+ "text": [
106
+ " 0%| | 0/110 [00:01<?, ?it/s]\n"
107
+ ]
108
+ },
109
+ {
110
+ "ename": "KeyboardInterrupt",
111
+ "evalue": "",
112
+ "output_type": "error",
113
+ "traceback": [
114
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
115
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
116
+ "\u001b[1;32m/Users/kevin/Dev/classes/winter2023/eecs487/canto_mando_bart_bitext_typo_augment_full_bing/commercial_baselines/bing_translator.ipynb Cell 3\u001b[0m in \u001b[0;36m2\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/kevin/Dev/classes/winter2023/eecs487/canto_mando_bart_bitext_typo_augment_full_bing/commercial_baselines/bing_translator.ipynb#W2sZmlsZQ%3D%3D?line=20'>21</a>\u001b[0m output_file\u001b[39m.\u001b[39mwrite(line[\u001b[39m'\u001b[39m\u001b[39mtranslations\u001b[39m\u001b[39m'\u001b[39m][\u001b[39m0\u001b[39m][\u001b[39m'\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m+\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/kevin/Dev/classes/winter2023/eecs487/canto_mando_bart_bitext_typo_augment_full_bing/commercial_baselines/bing_translator.ipynb#W2sZmlsZQ%3D%3D?line=21'>22</a>\u001b[0m \u001b[39m# Slow down because of hourly request limit for free tier\u001b[39;00m\n\u001b[0;32m---> <a href='vscode-notebook-cell:/Users/kevin/Dev/classes/winter2023/eecs487/canto_mando_bart_bitext_typo_augment_full_bing/commercial_baselines/bing_translator.ipynb#W2sZmlsZQ%3D%3D?line=22'>23</a>\u001b[0m time\u001b[39m.\u001b[39;49msleep(\u001b[39m5\u001b[39;49m)\n",
117
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
118
+ ]
119
+ }
120
+ ],
121
+ "source": [
122
+ "\"\"\"\n",
123
+ "Translate training sentences\n",
124
+ "\"\"\"\n",
125
+ "\n",
126
+ "# You can pass more than one object in body.\n",
127
+ "with open(\"bing.can\", \"r\") as input_file:\n",
128
+ " body = [{'text': line} for line in input_file.read().splitlines()]\n",
129
+ "\n",
130
+ "# Clear previous outputs\n",
131
+ "open(\"bing.man\", 'w').close()\n",
132
+ "\n",
133
+ "from tqdm import tqdm\n",
134
+ "import time\n",
135
+ "\n",
136
+ "# Split translation request into chunks of 400 lines (10,000 character limit per request)\n",
137
+ "with open(\"bing.man\", \"a+\") as output_file:\n",
138
+ " for chunk in tqdm(list(chunks(body, 400))):\n",
139
+ " request = requests.post(constructed_url, params=params, headers=headers, json=chunk)\n",
140
+ " response = request.json()\n",
141
+ " for line in response:\n",
142
+ " output_file.write(line['translations'][0]['text'] + \"\\n\")\n",
143
+ " # Slow down because of hourly request limit for free tier\n",
144
+ " time.sleep(5)\n"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 18,
150
+ "metadata": {},
151
+ "outputs": [
152
+ {
153
+ "name": "stderr",
154
+ "output_type": "stream",
155
+ "text": [
156
+ "100%|██████████| 38/38 [03:33<00:00, 5.61s/it]\n"
157
+ ]
158
+ }
159
+ ],
160
+ "source": [
161
+ "# You can pass more than one object in body.\n",
162
+ "with open(\"../train/abc.can\", \"r\") as input_file:\n",
163
+ " body = [{'text': line} for line in input_file.read().splitlines() if len(line) >= 5]\n",
164
+ "\n",
165
+ "from tqdm import tqdm\n",
166
+ "import time\n",
167
+ "\n",
168
+ "# Split translation request into chunks of 400 lines (10,000 character limit per request)\n",
169
+ "with open(\"bing.man\", \"a+\") as output_file:\n",
170
+ " for chunk in tqdm(list(chunks(body, 400))):\n",
171
+ " request = requests.post(constructed_url, params=params, headers=headers, json=chunk)\n",
172
+ " response = request.json()\n",
173
+ " for line in response:\n",
174
+ " output_file.write(line['translations'][0]['text'] + \"\\n\")\n",
175
+ " # Slow down because of hourly request limit for free tier\n",
176
+ " time.sleep(5)"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": 3,
182
+ "metadata": {},
183
+ "outputs": [
184
+ {
185
+ "name": "stderr",
186
+ "output_type": "stream",
187
+ "text": [
188
+ "100%|██████████| 352/352 [21:28<00:00, 3.66s/it]\n"
189
+ ]
190
+ }
191
+ ],
192
+ "source": [
193
+ "# You can pass more than one object in body.\n",
194
+ "with open(\"../train/lihkg.filtered.can\", \"r\") as input_file:\n",
195
+ " body = [{'text': line} for line in input_file.read().splitlines()]\n",
196
+ "\n",
197
+ "from tqdm import tqdm\n",
198
+ "import time\n",
199
+ "\n",
200
+ "# Split translation request into chunks of 400 lines (10,000 character limit per request)\n",
201
+ "with open(\"lihkg.filtered.man\", \"w+\") as output_file:\n",
202
+ " for chunk in tqdm(list(chunks(body, 400))):\n",
203
+ " request = requests.post(constructed_url, params=params, headers=headers, json=chunk)\n",
204
+ " response = request.json()\n",
205
+ " for line in response:\n",
206
+ " output_file.write(line['translations'][0]['text'] + \"\\n\")\n",
207
+ " # Slow down because of hourly request limit for free tier\n",
208
+ " time.sleep(3)"
209
+ ]
210
+ }
211
+ ],
212
+ "metadata": {
213
+ "kernelspec": {
214
+ "display_name": "Python 3",
215
+ "language": "python",
216
+ "name": "python3"
217
+ },
218
+ "language_info": {
219
+ "codemirror_mode": {
220
+ "name": "ipython",
221
+ "version": 3
222
+ },
223
+ "file_extension": ".py",
224
+ "mimetype": "text/x-python",
225
+ "name": "python",
226
+ "nbconvert_exporter": "python",
227
+ "pygments_lexer": "ipython3",
228
+ "version": "3.10.6"
229
+ },
230
+ "orig_nbformat": 4
231
+ },
232
+ "nbformat": 4,
233
+ "nbformat_minor": 2
234
+ }
commercial_baselines/lihkg.filtered.man ADDED
The diff for this file is too large to render. See raw diff
 
commercial_baselines/load_can.ipynb ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 7,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Number of Cantonese sentences: 58492\n"
13
+ ]
14
+ }
15
+ ],
16
+ "source": [
17
+ "can_sents = []\n",
18
+ "\n",
19
+ "with open(\"../train/common_voice.can\", \"r\") as common_voice_file, open(\"../train/wordshk.can\", \"r\") as wordshk_file,\\\n",
20
+ " open(\"../train/novels.can\", \"r\") as novels_file,\\\n",
21
+ " open(\"../train/abc.can\", \"r\") as abc_file:\n",
22
+ " lines = common_voice_file.read().splitlines() + wordshk_file.read().splitlines() + novels_file.read().splitlines() + abc_file.read().splitlines()\n",
23
+ " for line in lines:\n",
24
+ " if len(line) >= 5:\n",
25
+ " can_sents.append(line)\n",
26
+ "\n",
27
+ "print(\"Number of Cantonese sentences: \", len(can_sents))\n",
28
+ "\n",
29
+ "with open(\"bing.can\", \"w\") as f:\n",
30
+ " for sent in can_sents:\n",
31
+ " f.write(sent + \"\\n\")\n",
32
+ " f.flush()\n"
33
+ ]
34
+ }
35
+ ],
36
+ "metadata": {
37
+ "kernelspec": {
38
+ "display_name": "Python 3",
39
+ "language": "python",
40
+ "name": "python3"
41
+ },
42
+ "language_info": {
43
+ "codemirror_mode": {
44
+ "name": "ipython",
45
+ "version": 3
46
+ },
47
+ "file_extension": ".py",
48
+ "mimetype": "text/x-python",
49
+ "name": "python",
50
+ "nbconvert_exporter": "python",
51
+ "pygments_lexer": "ipython3",
52
+ "version": "3.10.6"
53
+ },
54
+ "orig_nbformat": 4
55
+ },
56
+ "nbformat": 4,
57
+ "nbformat_minor": 2
58
+ }
finetune.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
load_abc.ipynb ADDED
@@ -0,0 +1,964 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 33,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "def normalize_punctuations(line: str) -> str:\n",
10
+ " # Replace all English punctuations with Chinese ones\n",
11
+ " line = line.replace(\",\", \",\").replace(\"!\", \"!\").replace(\"?\", \"?\")\\\n",
12
+ " .replace(\":\", \":\").replace(\";\", \";\").replace(\"(\", \"(\").replace(\")\", \")\")\n",
13
+ " return line"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 34,
19
+ "metadata": {},
20
+ "outputs": [
21
+ {
22
+ "name": "stdout",
23
+ "output_type": "stream",
24
+ "text": [
25
+ "Got 14838 Cantonese sentences with length >= 5\n"
26
+ ]
27
+ }
28
+ ],
29
+ "source": [
30
+ "import re\n",
31
+ "from functools import reduce\n",
32
+ "\n",
33
+ "can_sentence_start = re.compile(r\"[0-9]*hz \")\n",
34
+ "can_lines = []\n",
35
+ "\n",
36
+ "with open(\"train/abc/abc_cantonese_index_00001_to_04587_line_1_to_4575.xml\", \"r\") as abc_file1,\\\n",
37
+ "open(\"train/abc/abc_cantonese_index_04588_to_09175_line_4576_to_9150.xml\", \"r\") as abc_file2,\\\n",
38
+ " open(\"train/abc/abc_cantonese_index_09176_to_13775_line_9151_to_13725.xml\", \"r\") as abc_file3,\\\n",
39
+ " open(\"train/abc/abc_cantonese_index_13776_to_FE99FD5B4E37BE32_line_13726_to_18302.xml\", \"r\") as abc_file4:\n",
40
+ " lines = reduce(lambda lines, file: lines + file.read().splitlines(), [abc_file1, abc_file2, abc_file3, abc_file4], [])\n",
41
+ " for line in lines:\n",
42
+ " match = can_sentence_start.match(line)\n",
43
+ " if match and not \"(empty band???)\" in line:\n",
44
+ " line = line[match.end():].strip()\n",
45
+ " if len(line) >= 5:\n",
46
+ " can_lines.append(normalize_punctuations(line))\n",
47
+ "\n",
48
+ "print(\"Got {} Cantonese sentences with length >= 5\".format(len(can_lines)))"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 35,
54
+ "metadata": {},
55
+ "outputs": [
56
+ {
57
+ "name": "stdout",
58
+ "output_type": "stream",
59
+ "text": [
60
+ "Found 4527 common Cantonese characters\n"
61
+ ]
62
+ }
63
+ ],
64
+ "source": [
65
+ "common_can_charset = set()\n",
66
+ "\n",
67
+ "with open(\"train/wordshk.can\", \"r\") as wordshk_file:\n",
68
+ " for c in wordshk_file.read():\n",
69
+ " common_can_charset.add(c)\n",
70
+ "\n",
71
+ "print(f\"Found {len(common_can_charset)} common Cantonese characters\")"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 36,
77
+ "metadata": {},
78
+ "outputs": [
79
+ {
80
+ "name": "stdout",
81
+ "output_type": "stream",
82
+ "text": [
83
+ "Found 365 rare Cantonese characters\n",
84
+ "𠹺 388\n",
85
+ "噖 162\n",
86
+ "𡁵 157\n",
87
+ "𠶧 88\n",
88
+ "嚫 88\n",
89
+ "屘 57\n",
90
+ "衭 47\n",
91
+ "贃 43\n",
92
+ "說 35\n",
93
+ "𧵳 30\n",
94
+ "歳 27\n",
95
+ "𢫏 27\n",
96
+ "𨶙 25\n",
97
+ "癐 25\n",
98
+ "𦡆 25\n",
99
+ "𨃩 24\n",
100
+ "况 21\n",
101
+ "内 19\n",
102
+ "𢵌 19\n",
103
+ "𦧺 18\n",
104
+ "𠹌 18\n",
105
+ "爲 16\n",
106
+ "𢱑 16\n",
107
+ "𡁯 15\n",
108
+ "𠱓 14\n",
109
+ "𠵿 14\n",
110
+ "踹 13\n",
111
+ "㗇 13\n",
112
+ "𠾴 13\n",
113
+ "嗍 13\n",
114
+ "𧘹 13\n",
115
+ "𠹳 12\n",
116
+ "𠹭 12\n",
117
+ "脫 12\n",
118
+ "䁪 11\n",
119
+ "𧨾 11\n",
120
+ "掬 11\n",
121
+ "𠸐 11\n",
122
+ "啥 11\n",
123
+ "𠱃 10\n",
124
+ "噔 10\n",
125
+ "捹 10\n",
126
+ "𠹻 10\n",
127
+ "𠼻 10\n",
128
+ "噠 10\n",
129
+ "𨳊 10\n",
130
+ "𢲲 9\n",
131
+ "𨉖 9\n",
132
+ "躭 9\n",
133
+ "䠋 9\n",
134
+ "嘮 9\n",
135
+ "啽 9\n",
136
+ "滮 8\n",
137
+ "㧻 8\n",
138
+ "𧶄 8\n",
139
+ "𦛚 8\n",
140
+ "撠 8\n",
141
+ "呡 8\n",
142
+ "睸 8\n",
143
+ "𠰲 8\n",
144
+ "𥔿 8\n",
145
+ "唎 8\n",
146
+ "𠸊 8\n",
147
+ "𬜐 8\n",
148
+ "蔥 8\n",
149
+ "呱 8\n",
150
+ "B 7\n",
151
+ "𢯊 7\n",
152
+ "𫫃 7\n",
153
+ "𢝵 7\n",
154
+ "銹 7\n",
155
+ "㓤 7\n",
156
+ "䁯 7\n",
157
+ "啉 7\n",
158
+ "臥 7\n",
159
+ "𠓼 7\n",
160
+ "稅 7\n",
161
+ " 7\n",
162
+ "喴 7\n",
163
+ "噱 7\n",
164
+ "衛 6\n",
165
+ "𡄯 6\n",
166
+ "揤 6\n",
167
+ "𢤹 6\n",
168
+ " 6\n",
169
+ "鷄 6\n",
170
+ "湴 6\n",
171
+ " 6\n",
172
+ "𦣇 6\n",
173
+ "齧 6\n",
174
+ "𠮨 6\n",
175
+ " 6\n",
176
+ "𡀝 6\n",
177
+ "婄 6\n",
178
+ "𠼱 6\n",
179
+ "𠱂 5\n",
180
+ "磧 5\n",
181
+ "𠰋 5\n",
182
+ "𡂖 5\n",
183
+ "浭 5\n",
184
+ "擏 5\n",
185
+ "𥋇 5\n",
186
+ "揢 5\n",
187
+ "㨆 5\n",
188
+ "𠾍 5\n",
189
+ "兌 5\n",
190
+ "𢺳 5\n",
191
+ "坺 5\n",
192
+ "鍚 5\n",
193
+ "𣘚 5\n",
194
+ "𪘁 5\n",
195
+ "𨳍 5\n",
196
+ "嗙 5\n",
197
+ "𠼰 5\n",
198
+ "𨳒 4\n",
199
+ "唿 4\n",
200
+ "𣳼 4\n",
201
+ "𦂥 4\n",
202
+ "溚 4\n",
203
+ "囋 4\n",
204
+ "瀄 4\n",
205
+ "𠌥 4\n",
206
+ "𢫦 4\n",
207
+ "𢶍 4\n",
208
+ "𠲵 4\n",
209
+ "䉺 4\n",
210
+ "炕 4\n",
211
+ "𢴈 4\n",
212
+ "𡲢 4\n",
213
+ "𥅈 4\n",
214
+ "𬧊 4\n",
215
+ "簕 4\n",
216
+ "査 4\n",
217
+ "𩜠 4\n",
218
+ "𫬿 4\n",
219
+ "𠜱 4\n",
220
+ "嚬 4\n",
221
+ "𠹹 4\n",
222
+ "𦉘 4\n",
223
+ "唦 4\n",
224
+ "㨘 4\n",
225
+ "𡄽 4\n",
226
+ "熗 4\n",
227
+ "𡁷 4\n",
228
+ "𠿬 4\n",
229
+ "咜 4\n",
230
+ "𠸏 4\n",
231
+ "𡁸 4\n",
232
+ "𡃵 4\n",
233
+ "𪚩 4\n",
234
+ "D 4\n",
235
+ "Q 4\n",
236
+ "𨆯 3\n",
237
+ "啗 3\n",
238
+ "蔸 3\n",
239
+ "舗 3\n",
240
+ "囪 3\n",
241
+ "艔 3\n",
242
+ "洩 3\n",
243
+ "𢵧 3\n",
244
+ "菓 3\n",
245
+ "䪴 3\n",
246
+ "䆲 3\n",
247
+ "痱 3\n",
248
+ "趿 3\n",
249
+ "𠮩 3\n",
250
+ "搉 3\n",
251
+ "矋 3\n",
252
+ "𠻗 3\n",
253
+ "𢲈 3\n",
254
+ "潞 3\n",
255
+ "沬 3\n",
256
+ "揇 3\n",
257
+ "齃 3\n",
258
+ "𡃤 3\n",
259
+ "𡃶 3\n",
260
+ "瀟 3\n",
261
+ "軨 3\n",
262
+ "鉻 3\n",
263
+ " 3\n",
264
+ "㿭 3\n",
265
+ "𢵄 3\n",
266
+ "㗲 3\n",
267
+ "𢫕 3\n",
268
+ "𢰸 3\n",
269
+ "葫 3\n",
270
+ "咔 3\n",
271
+ "嚎 3\n",
272
+ "嗿 3\n",
273
+ "咈 3\n",
274
+ "咾 3\n",
275
+ " 3\n",
276
+ "𠵈 3\n",
277
+ "吥 3\n",
278
+ "𠾭 3\n",
279
+ "𠾵 3\n",
280
+ "朘 3\n",
281
+ "觥 3\n",
282
+ "㩧 2\n",
283
+ "焙 2\n",
284
+ "兀 2\n",
285
+ "䭤 2\n",
286
+ "饊 2\n",
287
+ "[ 2\n",
288
+ "] 2\n",
289
+ "炖 2\n",
290
+ "争 2\n",
291
+ "䁓 2\n",
292
+ "𡂝 2\n",
293
+ "𩬎 2\n",
294
+ "鈒 2\n",
295
+ "亁 2\n",
296
+ "炠 2\n",
297
+ "摼 2\n",
298
+ "𠺬 2\n",
299
+ "𠵉 2\n",
300
+ "蝄 2\n",
301
+ " 2\n",
302
+ "蔫 2\n",
303
+ "㘉 2\n",
304
+ "荏 2\n",
305
+ "墘 2\n",
306
+ "嗏 2\n",
307
+ "呣 2\n",
308
+ "曚 2\n",
309
+ "壬 2\n",
310
+ "揅 2\n",
311
+ "溼 2\n",
312
+ "囓 2\n",
313
+ "嚙 2\n",
314
+ "枴 2\n",
315
+ "𡃀 2\n",
316
+ "饑 2\n",
317
+ "䏭 2\n",
318
+ "挼 2\n",
319
+ "掱 2\n",
320
+ "咑 2\n",
321
+ "芙 2\n",
322
+ "𦂗 2\n",
323
+ "舦 2\n",
324
+ "𢶤 2\n",
325
+ "翡 2\n",
326
+ "翠 2\n",
327
+ "酡 2\n",
328
+ "𫭊 2\n",
329
+ "煀 2\n",
330
+ "耙 2\n",
331
+ "𠿭 2\n",
332
+ "鉤 2\n",
333
+ "𠻘 2\n",
334
+ "脽 2\n",
335
+ "焊 2\n",
336
+ "唊 2\n",
337
+ "胅 2\n",
338
+ "翕 2\n",
339
+ "摜 2\n",
340
+ "僚 1\n",
341
+ "𩗴 1\n",
342
+ "毡 1\n",
343
+ "跤 1\n",
344
+ "梧 1\n",
345
+ "痄 1\n",
346
+ "卟 1\n",
347
+ "劄 1\n",
348
+ "𠶜 1\n",
349
+ "睜 1\n",
350
+ "迹 1\n",
351
+ "揃 1\n",
352
+ "唨 1\n",
353
+ "謢 1\n",
354
+ "菻 1\n",
355
+ "𣚺 1\n",
356
+ "鷓 1\n",
357
+ "鴣 1\n",
358
+ "强 1\n",
359
+ "𠾶 1\n",
360
+ "𡆀 1\n",
361
+ "拫 1\n",
362
+ "𠼮 1\n",
363
+ "汞 1\n",
364
+ "㤿 1\n",
365
+ "厴 1\n",
366
+ "𥀬 1\n",
367
+ "牯 1\n",
368
+ "𡇙 1\n",
369
+ "讕 1\n",
370
+ "𠿫 1\n",
371
+ "瘺 1\n",
372
+ "骲 1\n",
373
+ "𫲭 1\n",
374
+ "瓏 1\n",
375
+ "繚 1\n",
376
+ "撿 1\n",
377
+ "跀 1\n",
378
+ "𢛴 1\n",
379
+ "蝻 1\n",
380
+ "赧 1\n",
381
+ "𪙛 1\n",
382
+ " 1\n",
383
+ "檳 1\n",
384
+ "潲 1\n",
385
+ "𢶠 1\n",
386
+ "秧 1\n",
387
+ "蒔 1\n",
388
+ "炩 1\n",
389
+ "㩋 1\n",
390
+ "饅 1\n",
391
+ "鍍 1\n",
392
+ "𢚖 1\n",
393
+ "𧊅 1\n",
394
+ " 1\n",
395
+ "篸 1\n",
396
+ "𩟔 1\n",
397
+ "撍 1\n",
398
+ "栗 1\n",
399
+ " 1\n",
400
+ "𡆇 1\n",
401
+ "杧 1\n",
402
+ "榛 1\n",
403
+ "蠄 1\n",
404
+ "蟧 1\n",
405
+ "嘶 1\n",
406
+ "梆 1\n",
407
+ "竪 1\n",
408
+ "騾 1\n",
409
+ "矺 1\n",
410
+ "堀 1\n",
411
+ "麝 1\n",
412
+ "慪 1\n",
413
+ "撴 1\n",
414
+ "哾 1\n",
415
+ "𠳖 1\n",
416
+ "洌 1\n",
417
+ "霹 1\n",
418
+ "𠾼 1\n",
419
+ "𬦠 1\n",
420
+ "𤌍 1\n",
421
+ "𬧯 1\n",
422
+ "厠 1\n",
423
+ "㖡 1\n",
424
+ "跁 1\n",
425
+ "鉎 1\n",
426
+ "𧣈 1\n",
427
+ "𠳏 1\n",
428
+ "㹃 1\n",
429
+ "𧝞 1\n",
430
+ "𡀞 1\n",
431
+ "㦒 1\n",
432
+ "𩩍 1\n",
433
+ "𢱢 1\n",
434
+ "鍟 1\n",
435
+ "煱 1\n",
436
+ "撘 1\n",
437
+ "閱 1\n",
438
+ "橇 1\n",
439
+ "籽 1\n",
440
+ "庵 1\n",
441
+ "厨 1\n",
442
+ "疴 1\n",
443
+ "豹 1\n",
444
+ "杠 1\n",
445
+ "咘 1\n",
446
+ "裡 1\n",
447
+ "熏 1\n",
448
+ " 1\n"
449
+ ]
450
+ }
451
+ ],
452
+ "source": [
453
+ "from collections import defaultdict\n",
454
+ "\n",
455
+ "rare_can_charset = defaultdict(int)\n",
456
+ "for line in can_lines:\n",
457
+ " for c in line:\n",
458
+ " if not c in common_can_charset:\n",
459
+ " rare_can_charset[c] += 1\n",
460
+ "\n",
461
+ "print(f\"Found {len(rare_can_charset)} rare Cantonese characters\")\n",
462
+ "\n",
463
+ "charset_sort_by_freq = dict(sorted(rare_can_charset.items(), key=lambda item: -item[1]))\n",
464
+ "for c, freq in charset_sort_by_freq.items():\n",
465
+ " print(c, freq)"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "code",
470
+ "execution_count": 46,
471
+ "metadata": {},
472
+ "outputs": [
473
+ {
474
+ "name": "stdout",
475
+ "output_type": "stream",
476
+ "text": [
477
+ "Found 12360 normalized mappings\n"
478
+ ]
479
+ }
480
+ ],
481
+ "source": [
482
+ "char_to_normalized_char = {}\n",
483
+ "\n",
484
+ "with open(\"zh_char2str_mapping.txt\", \"r\") as input_file:\n",
485
+ " for line in input_file.read().splitlines():\n",
486
+ " [c, n] = line.split(\"\\t\")\n",
487
+ " char_to_normalized_char[c] = n\n",
488
+ "\n",
489
+ "print(\"Found {} normalized mappings\".format(len(char_to_normalized_char)))"
490
+ ]
491
+ },
492
+ {
493
+ "cell_type": "code",
494
+ "execution_count": 49,
495
+ "metadata": {},
496
+ "outputs": [
497
+ {
498
+ "name": "stdout",
499
+ "output_type": "stream",
500
+ "text": [
501
+ "𠹺\t埋\t388\n",
502
+ "噖\t琴\t162\n",
503
+ "𡁵\t緊\t157\n",
504
+ "𠶧\t掂\t88\n",
505
+ "嚫\t親\t88\n",
506
+ "屘\t尾\t57\n",
507
+ "衭\t衤夫\t47\n",
508
+ "贃\t賺\t43\n",
509
+ "說\t???\t35\n",
510
+ "𧵳\t???\t30\n",
511
+ "歳\t歲\t27\n",
512
+ "𢫏\t全\t27\n",
513
+ "𨶙\t能\t25\n",
514
+ "癐\t???\t25\n",
515
+ "𦡆\t???\t25\n",
516
+ "𨃩\t⻊扇\t24\n",
517
+ "况\t???\t21\n",
518
+ "内\t內\t19\n",
519
+ "𢵌\t扌隊\t19\n",
520
+ "𦧺\t賴\t18\n",
521
+ "𠹌\t o能\t18\n",
522
+ "爲\t為\t16\n",
523
+ "𢱑\t抓\t16\n",
524
+ "𡁯\t???\t15\n",
525
+ "𠱓\t詭\t14\n",
526
+ "𠵿\t披\t14\n",
527
+ "踹\t???\t13\n",
528
+ "㗇\t???\t13\n",
529
+ "𠾴\t棒\t13\n",
530
+ "嗍\t索\t13\n",
531
+ "𧘹\t太\t13\n",
532
+ "𠹳\t傑\t12\n",
533
+ "𠹭\t???\t12\n",
534
+ "脫\t???\t12\n",
535
+ "䁪\t???\t11\n",
536
+ "𧨾\t氹\t11\n",
537
+ "掬\t???\t11\n",
538
+ "𠸐\t???\t11\n",
539
+ "啥\t???\t11\n",
540
+ "𠱃\t o凹\t10\n",
541
+ "噔\t o登\t10\n",
542
+ "捹\t扌奔\t10\n",
543
+ "𠹻\t???\t10\n",
544
+ "𠼻\t基\t10\n",
545
+ "噠\t???\t10\n",
546
+ "𨳊\t九\t10\n",
547
+ "𢲲\t???\t9\n",
548
+ "𨉖\t???\t9\n",
549
+ "躭\t耽\t9\n",
550
+ "䠋\t卑\t9\n",
551
+ "嘮\t???\t9\n",
552
+ "啽\t o弇\t9\n",
553
+ "滮\t氵彪\t8\n",
554
+ "㧻\t扌涿\t8\n",
555
+ "𧶄\t???\t8\n",
556
+ "𦛚\t???\t8\n",
557
+ "撠\t扌戟\t8\n",
558
+ "呡\t o吻\t8\n",
559
+ "睸\t目眉\t8\n",
560
+ "𠰲\t???\t8\n",
561
+ "𥔿\t???\t8\n",
562
+ "唎\t脷\t8\n",
563
+ "𠸊\t???\t8\n",
564
+ "𬜐\t???\t8\n",
565
+ "蔥\t葱\t8\n",
566
+ "呱\t???\t8\n",
567
+ "B\t???\t7\n",
568
+ "𢯊\t扌的\t7\n",
569
+ "𫫃\t???\t7\n",
570
+ "𢝵\t???\t7\n",
571
+ "銹\t鏽\t7\n",
572
+ "㓤\t吉刂\t7\n",
573
+ "䁯\t???\t7\n",
574
+ "啉\t o林\t7\n",
575
+ "臥\t???\t7\n",
576
+ "𠓼\t???\t7\n",
577
+ "稅\t???\t7\n",
578
+ "\t???\t7\n",
579
+ "喴\t o威\t7\n",
580
+ "噱\t???\t7\n",
581
+ "衛\t???\t6\n",
582
+ "𡄯\t???\t6\n",
583
+ "揤\t扌即\t6\n",
584
+ "𢤹\t???\t6\n",
585
+ "\t???\t6\n",
586
+ "鷄\t雞\t6\n",
587
+ "湴\t氵並\t6\n",
588
+ "\t???\t6\n",
589
+ "𦣇\t???\t6\n",
590
+ "齧\t咬\t6\n",
591
+ "𠮨\t乃\t6\n",
592
+ "\t???\t6\n",
593
+ "𡀝\t???\t6\n",
594
+ "婄\t蓓\t6\n",
595
+ "𠼱\t累\t6\n",
596
+ "𠱂\t???\t5\n",
597
+ "磧\t石責\t5\n",
598
+ "𠰋\t???\t5\n",
599
+ "𡂖\t???\t5\n",
600
+ "浭\t氵更\t5\n",
601
+ "擏\t擎\t5\n",
602
+ "𥋇\t掌\t5\n",
603
+ "揢\t扌客\t5\n",
604
+ "㨆\t扌林\t5\n",
605
+ "𠾍\t棄\t5\n",
606
+ "兌\t???\t5\n",
607
+ "𢺳\t???\t5\n",
608
+ "坺\t土拔\t5\n",
609
+ "鍚\t???\t5\n",
610
+ "𣘚\t???\t5\n",
611
+ "𪘁\t???\t5\n",
612
+ "𨳍\t七\t5\n",
613
+ "嗙\t o旁\t5\n",
614
+ "𠼰\t???\t5\n",
615
+ "𨳒\t小\t4\n",
616
+ "唿\t篋\t4\n",
617
+ "𣳼\t???\t4\n",
618
+ "𦂥\t???\t4\n",
619
+ "溚\t塔\t4\n",
620
+ "囋\t???\t4\n",
621
+ "瀄\t吱\t4\n",
622
+ "𠌥\t???\t4\n",
623
+ "𢫦\t???\t4\n",
624
+ "𢶍\t???\t4\n",
625
+ "𠲵\t???\t4\n",
626
+ "䉺\t米\t4\n",
627
+ "炕\t???\t4\n",
628
+ "𢴈\t撻\t4\n",
629
+ "𡲢\t???\t4\n",
630
+ "𥅈\t立\t4\n",
631
+ "𬧊\t???\t4\n",
632
+ "簕\t勒\t4\n",
633
+ "査\t查\t4\n",
634
+ "𩜠\t岩\t4\n",
635
+ "𫬿\t???\t4\n",
636
+ "𠜱\t卑刂\t4\n",
637
+ "嚬\t顰\t4\n",
638
+ "𠹹\t???\t4\n",
639
+ "𦉘\t???\t4\n",
640
+ "唦\t o沙\t4\n",
641
+ "㨘\t扌省\t4\n",
642
+ "𡄽\t瀉\t4\n",
643
+ "熗\t槍\t4\n",
644
+ "𡁷\t???\t4\n",
645
+ "𠿬\t???\t4\n",
646
+ "咜\t叱\t4\n",
647
+ "𠸏\t茄\t4\n",
648
+ "𡁸\t???\t4\n",
649
+ "𡃵\t???\t4\n",
650
+ "𪚩\t???\t4\n",
651
+ "D\t???\t4\n",
652
+ "Q\t???\t4\n",
653
+ "𨆯\t???\t3\n",
654
+ "啗\t啖\t3\n",
655
+ "蔸\t艹兜\t3\n",
656
+ "舗\t鋪\t3\n",
657
+ "囪\t窗\t3\n",
658
+ "艔\t???\t3\n",
659
+ "洩\t???\t3\n",
660
+ "𢵧\t???\t3\n",
661
+ "菓\t果\t3\n",
662
+ "䪴\t???\t3\n",
663
+ "䆲\t???\t3\n",
664
+ "痱\t???\t3\n",
665
+ "趿\t拖\t3\n",
666
+ "𠮩\t???\t3\n",
667
+ "搉\t確\t3\n",
668
+ "矋\t矖\t3\n",
669
+ "𠻗\t???\t3\n",
670
+ "𢲈\t???\t3\n",
671
+ "潞\t氵路\t3\n",
672
+ "沬\t???\t3\n",
673
+ "揇\t扌南\t3\n",
674
+ "齃\t曷\t3\n",
675
+ "𡃤\t賴\t3\n",
676
+ "𡃶\t???\t3\n",
677
+ "瀟\t???\t3\n",
678
+ "軨\t???\t3\n",
679
+ "鉻\t???\t3\n",
680
+ "\t???\t3\n",
681
+ "㿭\t斥\t3\n",
682
+ "𢵄\t???\t3\n",
683
+ "㗲\t???\t3\n",
684
+ "𢫕\t???\t3\n",
685
+ "𢰸\t???\t3\n",
686
+ "葫\t???\t3\n",
687
+ "咔\t???\t3\n",
688
+ "嚎\t???\t3\n",
689
+ "嗿\t???\t3\n",
690
+ "咈\t o弗\t3\n",
691
+ "咾\t嚕\t3\n",
692
+ "\t???\t3\n",
693
+ "𠵈\t妹\t3\n",
694
+ "吥\t o不\t3\n",
695
+ "𠾭\t???\t3\n",
696
+ "𠾵\t???\t3\n",
697
+ "朘\t俊\t3\n",
698
+ "觥\t黃\t3\n",
699
+ "㩧\t扌暴\t2\n",
700
+ "焙\t???\t2\n",
701
+ "兀\t???\t2\n",
702
+ "䭤\t???\t2\n",
703
+ "饊\t???\t2\n",
704
+ "[\t???\t2\n",
705
+ "]\t???\t2\n",
706
+ "炖\t???\t2\n",
707
+ "争\t爭\t2\n",
708
+ "䁓\t???\t2\n",
709
+ "𡂝\t???\t2\n",
710
+ "𩬎\t壬\t2\n",
711
+ "鈒\t閘\t2\n",
712
+ "亁\t乾\t2\n",
713
+ "炠\t灬甲\t2\n",
714
+ "摼\t???\t2\n",
715
+ "𠺬\t???\t2\n",
716
+ "𠵉\t???\t2\n",
717
+ "蝄\t???\t2\n",
718
+ "\t???\t2\n",
719
+ "蔫\t艹焉\t2\n",
720
+ "㘉\t???\t2\n",
721
+ "荏\t???\t2\n",
722
+ "墘\t土乾\t2\n",
723
+ "嗏\t搽\t2\n",
724
+ "呣\t o母\t2\n",
725
+ "曚\t矇\t2\n",
726
+ "壬\t???\t2\n",
727
+ "揅\t研\t2\n",
728
+ "溼\t濕\t2\n",
729
+ "囓\t咬\t2\n",
730
+ "嚙\t咬\t2\n",
731
+ "枴\t拐\t2\n",
732
+ "𡃀\t???\t2\n",
733
+ "饑\t???\t2\n",
734
+ "䏭\t???\t2\n",
735
+ "挼\t挪\t2\n",
736
+ "掱\t???\t2\n",
737
+ "咑\t打\t2\n",
738
+ "芙\t???\t2\n",
739
+ "𦂗\t???\t2\n",
740
+ "舦\t軚\t2\n",
741
+ "𢶤\t扌靴\t2\n",
742
+ "翡\t???\t2\n",
743
+ "翠\t???\t2\n",
744
+ "酡\t酉它\t2\n",
745
+ "𫭊\t???\t2\n",
746
+ "煀\t火屈\t2\n",
747
+ "耙\t???\t2\n",
748
+ "𠿭\t滑\t2\n",
749
+ "鉤\t鈎\t2\n",
750
+ "𠻘\t???\t2\n",
751
+ "脽\t離\t2\n",
752
+ "焊\t???\t2\n",
753
+ "唊\t o夾\t2\n",
754
+ "胅\t⺼失\t2\n",
755
+ "翕\t???\t2\n",
756
+ "摜\t摔\t2\n",
757
+ "僚\t???\t1\n",
758
+ "𩗴\t???\t1\n",
759
+ "毡\t???\t1\n",
760
+ "跤\t???\t1\n",
761
+ "梧\t???\t1\n",
762
+ "痄\t疒乍\t1\n",
763
+ "卟\t卜\t1\n",
764
+ "劄\t札\t1\n",
765
+ "𠶜\t制\t1\n",
766
+ "睜\t???\t1\n",
767
+ "迹\t跡\t1\n",
768
+ "揃\t扌前\t1\n",
769
+ "唨\t o阻\t1\n",
770
+ "謢\t護\t1\n",
771
+ "菻\t麻\t1\n",
772
+ "𣚺\t???\t1\n",
773
+ "鷓\t庶鳥\t1\n",
774
+ "鴣\t古鳥\t1\n",
775
+ "强\t???\t1\n",
776
+ "𠾶\t???\t1\n",
777
+ "𡆀\t轟\t1\n",
778
+ "拫\t扌艮\t1\n",
779
+ "𠼮\t偽\t1\n",
780
+ "汞\t???\t1\n",
781
+ "㤿\t???\t1\n",
782
+ "厴\t???\t1\n",
783
+ "𥀬\t???\t1\n",
784
+ "牯\t???\t1\n",
785
+ "𡇙\t???\t1\n",
786
+ "讕\t賴\t1\n",
787
+ "𠿫\t???\t1\n",
788
+ "瘺\t婁\t1\n",
789
+ "骲\t骨包\t1\n",
790
+ "𫲭\t???\t1\n",
791
+ "瓏\t玉龍\t1\n",
792
+ "繚\t???\t1\n",
793
+ "撿\t???\t1\n",
794
+ "跀\t⻊月\t1\n",
795
+ "𢛴\t掹\t1\n",
796
+ "蝻\t虫南\t1\n",
797
+ "赧\t羞赤\t1\n",
798
+ "𪙛\t甩\t1\n",
799
+ "\t???\t1\n",
800
+ "檳\t???\t1\n",
801
+ "潲\t餿\t1\n",
802
+ "𢶠\t???\t1\n",
803
+ "秧\t???\t1\n",
804
+ "蒔\t???\t1\n",
805
+ "炩\t灬令\t1\n",
806
+ "㩋\t???\t1\n",
807
+ "饅\t???\t1\n",
808
+ "鍍\t???\t1\n",
809
+ "𢚖\t???\t1\n",
810
+ "𧊅\t虫另\t1\n",
811
+ "\t???\t1\n",
812
+ "篸\t???\t1\n",
813
+ "𩟔\t???\t1\n",
814
+ "撍\t賺\t1\n",
815
+ "栗\t???\t1\n",
816
+ "\t???\t1\n",
817
+ "𡆇\t???\t1\n",
818
+ "杧\t芒\t1\n",
819
+ "榛\t???\t1\n",
820
+ "蠄\t虫禽\t1\n",
821
+ "蟧\t???\t1\n",
822
+ "嘶\t???\t1\n",
823
+ "梆\t???\t1\n",
824
+ "竪\t豎\t1\n",
825
+ "騾\t???\t1\n",
826
+ "矺\t???\t1\n",
827
+ "堀\t???\t1\n",
828
+ "麝\t???\t1\n",
829
+ "慪\t嘔\t1\n",
830
+ "撴\t扌敦\t1\n",
831
+ "哾\t啜\t1\n",
832
+ "𠳖\t???\t1\n",
833
+ "洌\t冽\t1\n",
834
+ "霹\t???\t1\n",
835
+ "𠾼\t???\t1\n",
836
+ "𬦠\t???\t1\n",
837
+ "𤌍\t???\t1\n",
838
+ "𬧯\t???\t1\n",
839
+ "厠\t廁\t1\n",
840
+ "㖡\t???\t1\n",
841
+ "跁\t⻊巴\t1\n",
842
+ "鉎\t???\t1\n",
843
+ "𧣈\t???\t1\n",
844
+ "𠳏\t???\t1\n",
845
+ "㹃\t非\t1\n",
846
+ "𧝞\t???\t1\n",
847
+ "𡀞\t???\t1\n",
848
+ "㦒\t???\t1\n",
849
+ "𩩍\t娉\t1\n",
850
+ "𢱢\t???\t1\n",
851
+ "鍟\t???\t1\n",
852
+ "煱\t???\t1\n",
853
+ "撘\t搭\t1\n",
854
+ "閱\t???\t1\n",
855
+ "橇\t喬\t1\n",
856
+ "籽\t???\t1\n",
857
+ "庵\t???\t1\n",
858
+ "厨\t???\t1\n",
859
+ "疴\t屙\t1\n",
860
+ "豹\t???\t1\n",
861
+ "杠\t槓\t1\n",
862
+ "咘\t o布\t1\n",
863
+ "裡\t???\t1\n",
864
+ "熏\t燻\t1\n",
865
+ "\t???\t1\n"
866
+ ]
867
+ }
868
+ ],
869
+ "source": [
870
+ "for c, freq in charset_sort_by_freq.items():\n",
871
+ " if c in char_to_normalized_char:\n",
872
+ " print(c + \"\\t\" + char_to_normalized_char[c] + \"\\t\" + str(freq))\n",
873
+ " else:\n",
874
+ " print(c + \"\\t\" + \"???\" + \"\\t\" + str(freq))"
875
+ ]
876
+ },
877
+ {
878
+ "cell_type": "code",
879
+ "execution_count": 57,
880
+ "metadata": {},
881
+ "outputs": [
882
+ {
883
+ "name": "stdout",
884
+ "output_type": "stream",
885
+ "text": [
886
+ "Loaded 177 normalization mappings\n",
887
+ "Sample of first 10 highest frequency mappings:\n",
888
+ "[('𠹺', '埋'), ('噖', '琴'), ('𡁵', '緊'), ('𠶧', '掂'), ('嚫', '親'), ('屘', '尾'), ('衭', '褲'), ('贃', '賺'), ('說', '説'), ('𧵳', '蝕')]\n"
889
+ ]
890
+ }
891
+ ],
892
+ "source": [
893
+ "abc_mapping = {}\n",
894
+ "\n",
895
+ "with open(\"abc_rare_char_mapping.txt\", \"r\") as input_file:\n",
896
+ " for line in input_file.read().splitlines():\n",
897
+ " [c, n, freq] = line.split(\"\\t\")\n",
898
+ " if len(n) == 1:\n",
899
+ " abc_mapping[c] = n\n",
900
+ "\n",
901
+ "print(\"Loaded {} normalization mappings\".format(len(abc_mapping)))\n",
902
+ "print(\"Sample of first 10 highest frequency mappings:\")\n",
903
+ "print(list(abc_mapping.items())[:10])"
904
+ ]
905
+ },
906
+ {
907
+ "cell_type": "code",
908
+ "execution_count": 58,
909
+ "metadata": {},
910
+ "outputs": [],
911
+ "source": [
912
+ "# replace all occurrences of rare characters with normalized ones\n",
913
+ "def normalize_abc(line: str) -> str:\n",
914
+ " for c, n in abc_mapping.items():\n",
915
+ " line = line.replace(c, n)\n",
916
+ " line = line.replace(\"而𠺢\", \"而家\").replace(\"依𠺢\", \"依家\")\n",
917
+ " return line"
918
+ ]
919
+ },
920
+ {
921
+ "cell_type": "code",
922
+ "execution_count": 59,
923
+ "metadata": {},
924
+ "outputs": [],
925
+ "source": [
926
+ "with open(\"train/abc.can\", \"w+\") as output_file:\n",
927
+ " for line in can_lines:\n",
928
+ " output_file.write(normalize_abc(line) + \"\\n\")\n"
929
+ ]
930
+ },
931
+ {
932
+ "cell_type": "code",
933
+ "execution_count": null,
934
+ "metadata": {},
935
+ "outputs": [],
936
+ "source": [
937
+ "\n",
938
+ " \n"
939
+ ]
940
+ }
941
+ ],
942
+ "metadata": {
943
+ "kernelspec": {
944
+ "display_name": "Python 3",
945
+ "language": "python",
946
+ "name": "python3"
947
+ },
948
+ "language_info": {
949
+ "codemirror_mode": {
950
+ "name": "ipython",
951
+ "version": 3
952
+ },
953
+ "file_extension": ".py",
954
+ "mimetype": "text/x-python",
955
+ "name": "python",
956
+ "nbconvert_exporter": "python",
957
+ "pygments_lexer": "ipython3",
958
+ "version": "3.10.6"
959
+ },
960
+ "orig_nbformat": 4
961
+ },
962
+ "nbformat": 4,
963
+ "nbformat_minor": 2
964
+ }
load_lihkg.ipynb ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "text/html": [
11
+ "<div>\n",
12
+ "<style scoped>\n",
13
+ " .dataframe tbody tr th:only-of-type {\n",
14
+ " vertical-align: middle;\n",
15
+ " }\n",
16
+ "\n",
17
+ " .dataframe tbody tr th {\n",
18
+ " vertical-align: top;\n",
19
+ " }\n",
20
+ "\n",
21
+ " .dataframe thead th {\n",
22
+ " text-align: right;\n",
23
+ " }\n",
24
+ "</style>\n",
25
+ "<table border=\"1\" class=\"dataframe\">\n",
26
+ " <thead>\n",
27
+ " <tr style=\"text-align: right;\">\n",
28
+ " <th></th>\n",
29
+ " <th>Words</th>\n",
30
+ " <th>Frequency</th>\n",
31
+ " </tr>\n",
32
+ " </thead>\n",
33
+ " <tbody>\n",
34
+ " <tr>\n",
35
+ " <th>0</th>\n",
36
+ " <td>有</td>\n",
37
+ " <td>51227728</td>\n",
38
+ " </tr>\n",
39
+ " <tr>\n",
40
+ " <th>1</th>\n",
41
+ " <td>我</td>\n",
42
+ " <td>43798085</td>\n",
43
+ " </tr>\n",
44
+ " <tr>\n",
45
+ " <th>2</th>\n",
46
+ " <td>一</td>\n",
47
+ " <td>43159170</td>\n",
48
+ " </tr>\n",
49
+ " <tr>\n",
50
+ " <th>3</th>\n",
51
+ " <td>的</td>\n",
52
+ " <td>40916482</td>\n",
53
+ " </tr>\n",
54
+ " <tr>\n",
55
+ " <th>4</th>\n",
56
+ " <td>你</td>\n",
57
+ " <td>30897176</td>\n",
58
+ " </tr>\n",
59
+ " <tr>\n",
60
+ " <th>...</th>\n",
61
+ " <td>...</td>\n",
62
+ " <td>...</td>\n",
63
+ " </tr>\n",
64
+ " <tr>\n",
65
+ " <th>133207</th>\n",
66
+ " <td>黎明網</td>\n",
67
+ " <td>12</td>\n",
68
+ " </tr>\n",
69
+ " <tr>\n",
70
+ " <th>133208</th>\n",
71
+ " <td>黎錦華</td>\n",
72
+ " <td>12</td>\n",
73
+ " </tr>\n",
74
+ " <tr>\n",
75
+ " <th>133209</th>\n",
76
+ " <td>墨包</td>\n",
77
+ " <td>12</td>\n",
78
+ " </tr>\n",
79
+ " <tr>\n",
80
+ " <th>133210</th>\n",
81
+ " <td>點晒穴</td>\n",
82
+ " <td>12</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>133211</th>\n",
86
+ " <td>齋頂</td>\n",
87
+ " <td>12</td>\n",
88
+ " </tr>\n",
89
+ " </tbody>\n",
90
+ "</table>\n",
91
+ "<p>133212 rows × 2 columns</p>\n",
92
+ "</div>"
93
+ ],
94
+ "text/plain": [
95
+ " Words Frequency\n",
96
+ "0 有 51227728\n",
97
+ "1 我 43798085\n",
98
+ "2 一 43159170\n",
99
+ "3 的 40916482\n",
100
+ "4 你 30897176\n",
101
+ "... ... ...\n",
102
+ "133207 黎明網 12\n",
103
+ "133208 黎錦華 12\n",
104
+ "133209 墨包 12\n",
105
+ "133210 點晒穴 12\n",
106
+ "133211 齋頂 12\n",
107
+ "\n",
108
+ "[133212 rows x 2 columns]"
109
+ ]
110
+ },
111
+ "execution_count": 3,
112
+ "metadata": {},
113
+ "output_type": "execute_result"
114
+ }
115
+ ],
116
+ "source": [
117
+ "import pandas as pd\n",
118
+ "\n",
119
+ "# Load Excel file and convert to dictionary\n",
120
+ "df = pd.read_excel('CyberCan.xlsx')\n",
121
+ "\n",
122
+ "df"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": 33,
128
+ "metadata": {},
129
+ "outputs": [],
130
+ "source": [
131
+ "with open(\"CyberCan.dict\", \"w+\") as output_file:\n",
132
+ " for index, row in df.iterrows():\n",
133
+ " word = str(row['Words']).strip()\n",
134
+ " if not \" \" in word:\n",
135
+ " output_file.write(word + \" \" + str(row['Frequency']) + \"\\n\")\n",
136
+ " output_file.flush()\n"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 36,
142
+ "metadata": {},
143
+ "outputs": [
144
+ {
145
+ "name": "stdout",
146
+ "output_type": "stream",
147
+ "text": [
148
+ "Total words: 132895\n"
149
+ ]
150
+ }
151
+ ],
152
+ "source": [
153
+ "puncts = [\",\", \"。\", \"!\", \"?\", \"「\", \"」\", \":\"]\n",
154
+ "cybercan_words = set()\n",
155
+ "\n",
156
+ "for word in list(df['Words'].values) + puncts:\n",
157
+ " cybercan_words.add(word)\n",
158
+ "\n",
159
+ "print(\"Total words: {}\".format(len(cybercan_words)))"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": 37,
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": [
168
+ "import jieba\n",
169
+ "jieba.set_dictionary(\"CyberCan.dict\")"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": 42,
175
+ "metadata": {},
176
+ "outputs": [
177
+ {
178
+ "name": "stdout",
179
+ "output_type": "stream",
180
+ "text": [
181
+ "Total filtered lines: 140590\n"
182
+ ]
183
+ }
184
+ ],
185
+ "source": [
186
+ "import re\n",
187
+ "\n",
188
+ "alnum = re.compile(\"[a-zA-Z0-9]\")\n",
189
+ "filtered_lines = []\n",
190
+ "\n",
191
+ "with open(\"train/lihkg.can\", \"r\") as input_file:\n",
192
+ " for line in input_file.read().splitlines():\n",
193
+ " line = line.replace(\" \", \"\")\n",
194
+ " if len(line) < 10:\n",
195
+ " continue\n",
196
+ " if len(line) >= 64:\n",
197
+ " continue\n",
198
+ " if alnum.search(line):\n",
199
+ " continue\n",
200
+ " tokens = list(jieba.cut(line))\n",
201
+ " found_rare_word = False\n",
202
+ " for token in tokens:\n",
203
+ " if not token in cybercan_words:\n",
204
+ " found_rare_word = True\n",
205
+ " # print(\"Found rare word: {}\".format(token))\n",
206
+ " break\n",
207
+ " if found_rare_word:\n",
208
+ " continue\n",
209
+ " filtered_lines.append(line)\n",
210
+ "\n",
211
+ "print(\"Total filtered lines: {}\".format(len(filtered_lines)))\n",
212
+ "\n",
213
+ "with open(\"train/lihkg.filtered.can\", \"w+\") as output_file:\n",
214
+ " for line in filtered_lines:\n",
215
+ " output_file.write(line + \"\\n\")\n",
216
+ " output_file.flush()"
217
+ ]
218
+ }
219
+ ],
220
+ "metadata": {
221
+ "kernelspec": {
222
+ "display_name": "Python 3",
223
+ "language": "python",
224
+ "name": "python3"
225
+ },
226
+ "language_info": {
227
+ "codemirror_mode": {
228
+ "name": "ipython",
229
+ "version": 3
230
+ },
231
+ "file_extension": ".py",
232
+ "mimetype": "text/x-python",
233
+ "name": "python",
234
+ "nbconvert_exporter": "python",
235
+ "pygments_lexer": "ipython3",
236
+ "version": "3.10.6"
237
+ },
238
+ "orig_nbformat": 4
239
+ },
240
+ "nbformat": 4,
241
+ "nbformat_minor": 2
242
+ }
load_mined_bitext.ipynb ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "text/html": [
11
+ "<div>\n",
12
+ "<style scoped>\n",
13
+ " .dataframe tbody tr th:only-of-type {\n",
14
+ " vertical-align: middle;\n",
15
+ " }\n",
16
+ "\n",
17
+ " .dataframe tbody tr th {\n",
18
+ " vertical-align: top;\n",
19
+ " }\n",
20
+ "\n",
21
+ " .dataframe thead th {\n",
22
+ " text-align: right;\n",
23
+ " }\n",
24
+ "</style>\n",
25
+ "<table border=\"1\" class=\"dataframe\">\n",
26
+ " <thead>\n",
27
+ " <tr style=\"text-align: right;\">\n",
28
+ " <th></th>\n",
29
+ " <th>input_text</th>\n",
30
+ " <th>target_text</th>\n",
31
+ " </tr>\n",
32
+ " </thead>\n",
33
+ " <tbody>\n",
34
+ " <tr>\n",
35
+ " <th>0</th>\n",
36
+ " <td>我要求的是法律上的澄清</td>\n",
37
+ " <td>我係要求……呢啲係好清楚嘅法律上嘅澄清呀</td>\n",
38
+ " </tr>\n",
39
+ " <tr>\n",
40
+ " <th>1</th>\n",
41
+ " <td>每晚由七點半,到十一點半</td>\n",
42
+ " <td>誒,由七點半就做到十一點半</td>\n",
43
+ " </tr>\n",
44
+ " <tr>\n",
45
+ " <th>2</th>\n",
46
+ " <td>梁頌恒議員,你是否要繼續發言</td>\n",
47
+ " <td>梁頌恆議員呢,係咪繼續係發言</td>\n",
48
+ " </tr>\n",
49
+ " <tr>\n",
50
+ " <th>3</th>\n",
51
+ " <td>可以怎樣稱呼我?我只知道整條街都稱我「大家姐」,因為我最大,年紀最大</td>\n",
52
+ " <td>可以點叫我呀?呢度成條街叫我大家姐,因為我最大,年紀最大吖嘛</td>\n",
53
+ " </tr>\n",
54
+ " <tr>\n",
55
+ " <th>4</th>\n",
56
+ " <td>至於他的答覆能否回應你剛才的提問,我並不能夠提出任何意見</td>\n",
57
+ " <td>噉呢,就對於佢能唔能夠達到你頭先提問嗰個嘅要求呢,我就唔能夠作出任何嘅意見</td>\n",
58
+ " </tr>\n",
59
+ " <tr>\n",
60
+ " <th>...</th>\n",
61
+ " <td>...</td>\n",
62
+ " <td>...</td>\n",
63
+ " </tr>\n",
64
+ " <tr>\n",
65
+ " <th>35872</th>\n",
66
+ " <td>他曾在2006及2007年擔任暑期實習生</td>\n",
67
+ " <td>2006~2007學年寒暑假間亦試過將學校整大兼修容過</td>\n",
68
+ " </tr>\n",
69
+ " <tr>\n",
70
+ " <th>35873</th>\n",
71
+ " <td>克里莫尼迪茲戰爭</td>\n",
72
+ " <td>克里米亞戰爭</td>\n",
73
+ " </tr>\n",
74
+ " <tr>\n",
75
+ " <th>35874</th>\n",
76
+ " <td>產卵後親魚迴歸大海</td>\n",
77
+ " <td>海潮遇返失敗多年嘅生母</td>\n",
78
+ " </tr>\n",
79
+ " <tr>\n",
80
+ " <th>35875</th>\n",
81
+ " <td>學校規模冠絕全馬。</td>\n",
82
+ " <td>學校嘅運動水平可謂全區之冠。</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>35876</th>\n",
86
+ " <td>黃龍溪鎮也逐漸由繁忙的碼頭轉變為安靜的江邊場鎮。</td>\n",
87
+ " <td>而九龍寨城到海邊碼頭就慢慢變成市集。</td>\n",
88
+ " </tr>\n",
89
+ " </tbody>\n",
90
+ "</table>\n",
91
+ "<p>35877 rows × 2 columns</p>\n",
92
+ "</div>"
93
+ ],
94
+ "text/plain": [
95
+ " input_text \\\n",
96
+ "0 我要求的是法律上的澄清 \n",
97
+ "1 每晚由七點半,到十一點半 \n",
98
+ "2 梁頌恒議員,你是否要繼續發言 \n",
99
+ "3 可以怎樣稱呼我?我只知道整條街都稱我「大家姐」,因為我最大,年紀最大 \n",
100
+ "4 至於他的答覆能否回應你剛才的提問,我並不能夠提出任何意見 \n",
101
+ "... ... \n",
102
+ "35872 他曾在2006及2007年擔任暑期實習生 \n",
103
+ "35873 克里莫尼迪茲戰爭 \n",
104
+ "35874 產卵後親魚迴歸大海 \n",
105
+ "35875 學校規模冠絕全馬。 \n",
106
+ "35876 黃龍溪鎮也逐漸由繁忙的碼頭轉變為安靜的江邊場鎮。 \n",
107
+ "\n",
108
+ " target_text \n",
109
+ "0 我係要求……呢啲係好清楚嘅法律上嘅澄清呀 \n",
110
+ "1 誒,由七點半就做到十一點半 \n",
111
+ "2 梁頌恆議員呢,係咪繼續係發言 \n",
112
+ "3 可以點叫我呀?呢度成條街叫我大家姐,因為我最大,年紀最大吖嘛 \n",
113
+ "4 噉呢,就對於佢能唔能夠達到你頭先提問嗰個嘅要求呢,我就唔能夠作出任何嘅意見 \n",
114
+ "... ... \n",
115
+ "35872 2006~2007學年寒暑假間亦試過將學校整大兼修容過 \n",
116
+ "35873 克里米亞戰爭 \n",
117
+ "35874 海潮遇返失敗多年嘅生母 \n",
118
+ "35875 學校嘅運動水平可謂全區之冠。 \n",
119
+ "35876 而九龍寨城到海邊碼頭就慢慢變成市集。 \n",
120
+ "\n",
121
+ "[35877 rows x 2 columns]"
122
+ ]
123
+ },
124
+ "execution_count": 1,
125
+ "metadata": {},
126
+ "output_type": "execute_result"
127
+ }
128
+ ],
129
+ "source": [
130
+ "import pandas as pd\n",
131
+ "\n",
132
+ "df = pd.read_pickle(\"yue_zh_combined36k.pkl\")\n",
133
+ "df"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": 2,
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "df = df.reset_index() # make sure indexes pair with number of rows\n",
143
+ "\n",
144
+ "with open(\"train/mined_bitext.can\", \"w+\") as can_file, open(\"train/mined_bitext.man\", \"w+\") as man_file:\n",
145
+ " for index, row in df.iterrows():\n",
146
+ " man_file.write(row['input_text'] + \"\\n\")\n",
147
+ " can_file.write(row['target_text'] + \"\\n\")\n",
148
+ " man_file.flush()\n",
149
+ " can_file.flush()"
150
+ ]
151
+ }
152
+ ],
153
+ "metadata": {
154
+ "kernelspec": {
155
+ "display_name": "Python 3",
156
+ "language": "python",
157
+ "name": "python3"
158
+ },
159
+ "language_info": {
160
+ "codemirror_mode": {
161
+ "name": "ipython",
162
+ "version": 3
163
+ },
164
+ "file_extension": ".py",
165
+ "mimetype": "text/x-python",
166
+ "name": "python",
167
+ "nbconvert_exporter": "python",
168
+ "pygments_lexer": "ipython3",
169
+ "version": "3.10.6"
170
+ },
171
+ "orig_nbformat": 4
172
+ },
173
+ "nbformat": 4,
174
+ "nbformat_minor": 2
175
+ }
para/.DS_Store ADDED
Binary file (10.2 kB). View file
 
para/dev/.DS_Store ADDED
Binary file (6.15 kB). View file
 
para/dev/dev.can ADDED
The diff for this file is too large to render. See raw diff
 
para/dev/dev.man ADDED
The diff for this file is too large to render. See raw diff
 
para/dev/dev.norm.can ADDED
The diff for this file is too large to render. See raw diff
 
para/test/.DS_Store ADDED
Binary file (6.15 kB). View file
 
para/test/test.can ADDED
The diff for this file is too large to render. See raw diff
 
para/test/test.man ADDED
The diff for this file is too large to render. See raw diff
 
para/test/test.norm.can ADDED
The diff for this file is too large to render. See raw diff
 
para/test/test.typos.can ADDED
The diff for this file is too large to render. See raw diff
 
para/test/test.typos.man ADDED
The diff for this file is too large to render. See raw diff
 
process_novels.ipynb ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 45,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# https://cloud.tencent.com/developer/article/2197062\n",
10
+ "import re\n",
11
+ "def cut_sent(input):\n",
12
+ " lines = []\n",
13
+ " i = 0\n",
14
+ " line = \"\"\n",
15
+ " while i < len(input):\n",
16
+ " if input[i] == \"「\":\n",
17
+ " if len(line) > 0:\n",
18
+ " lines.append(line)\n",
19
+ " line = \"\"\n",
20
+ " line += input[i]\n",
21
+ " i += 1\n",
22
+ " while i < len(input) and input[i] != \"」\":\n",
23
+ " line += input[i]\n",
24
+ " i += 1\n",
25
+ " if i < len(input):\n",
26
+ " line += input[i]\n",
27
+ " lines.append(line)\n",
28
+ " line = \"\"\n",
29
+ " else:\n",
30
+ " line += input[i]\n",
31
+ " i += 1\n",
32
+ " if len(line) > 0:\n",
33
+ " lines.append(line)\n",
34
+ " sents = []\n",
35
+ " for line in lines:\n",
36
+ " if line.startswith(\"「\"):\n",
37
+ " if len(sents) > 0 and not re.match(\"[。!?\\?]\", sents[-1][-1]):\n",
38
+ " sents[-1] += line\n",
39
+ " else:\n",
40
+ " sents.append(line)\n",
41
+ " else:\n",
42
+ " line = re.sub('([。!?\\?])([^”’」])', r\"\\1\\n\\2\", line) # 单字符断句符\n",
43
+ " line = re.sub('(\\.{6})([^”’」])', r\"\\1\\n\\2\", line) # 英文省略号\n",
44
+ " line = re.sub('(\\…{2})([^”’」])', r\"\\1\\n\\2\", line) # 中文省略号\n",
45
+ " line = re.sub('([。!?\\?][”’」])([^,。!?\\?])', r'\\1\\n\\2', line)\n",
46
+ " # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\\n放到双引号后,注意前面的几句都小心保留了双引号\n",
47
+ " line = line.rstrip() # 段尾如果有多余的\\n就去掉它\n",
48
+ " # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。\n",
49
+ " lines = line.split(\"\\n\")\n",
50
+ " if len(sents) > 0 and re.search(\"[^。!?\\?][”’」]$\", sents[-1]):\n",
51
+ " sents[-1] += lines[0]\n",
52
+ " sents.extend(lines[1:])\n",
53
+ " else:\n",
54
+ " sents.extend(lines)\n",
55
+ " return sents"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 47,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "novel_lines = []\n",
65
+ "\n",
66
+ "with open(\"train/little_prince.txt\", \"r\") as input_file:\n",
67
+ " for line in input_file.read().splitlines():\n",
68
+ " if len(line) > 0:\n",
69
+ " novel_lines.extend(cut_sent(line))\n",
70
+ "\n",
71
+ "with open(\"train/animal_farm.txt\", \"r\") as input_file:\n",
72
+ " for line in input_file.read().splitlines():\n",
73
+ " if len(line) > 0:\n",
74
+ " novel_lines.extend(cut_sent(line))\n",
75
+ "\n",
76
+ "with open(\"train/novels.can\", \"w+\") as output_file:\n",
77
+ " for line in novel_lines:\n",
78
+ " output_file.write(line + \"\\n\")"
79
+ ]
80
+ }
81
+ ],
82
+ "metadata": {
83
+ "kernelspec": {
84
+ "display_name": "Python 3",
85
+ "language": "python",
86
+ "name": "python3"
87
+ },
88
+ "language_info": {
89
+ "codemirror_mode": {
90
+ "name": "ipython",
91
+ "version": 3
92
+ },
93
+ "file_extension": ".py",
94
+ "mimetype": "text/x-python",
95
+ "name": "python",
96
+ "nbconvert_exporter": "python",
97
+ "pygments_lexer": "ipython3",
98
+ "version": "3.10.6"
99
+ },
100
+ "orig_nbformat": 4
101
+ },
102
+ "nbformat": 4,
103
+ "nbformat_minor": 2
104
+ }
runs/Apr16_10-10-56_Kevins-MacBook-Pro-4.local/1681654257.025384/events.out.tfevents.1681654257.Kevins-MacBook-Pro-4.local.13638.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2efe09742567e4a2f40b79868bd8ebebc12e7a6734df43fc6edc6193593a3bc
3
+ size 6023
runs/Apr16_10-10-56_Kevins-MacBook-Pro-4.local/events.out.tfevents.1681654257.Kevins-MacBook-Pro-4.local.13638.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abdd99ecb35970605a6d76b6185ecd57edb0fec7cdfe79396226b0391096a415
3
+ size 20134
test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
test.pred.130K.new.12000.man ADDED
The diff for this file is too large to render. See raw diff
 
test.pred.130K.new.6000.man ADDED
The diff for this file is too large to render. See raw diff
 
test.pred.130K.old.man ADDED
The diff for this file is too large to render. See raw diff
 
test.pred.16K.man ADDED
The diff for this file is too large to render. See raw diff
 
test.pred.175K.12000.bidir.man ADDED
The diff for this file is too large to render. See raw diff
 
test.pred.80K.man ADDED
The diff for this file is too large to render. See raw diff
 
test.pred.bing.11000.man ADDED
The diff for this file is too large to render. See raw diff
 
test.pred.bing.man ADDED
The diff for this file is too large to render. See raw diff
 
test.typos.pred.130K.new.12000.man ADDED
The diff for this file is too large to render. See raw diff
 
test.typos.pred.130K.old.12000.man ADDED
The diff for this file is too large to render. See raw diff
 
test.typos.pred.170K.mined.6000.man ADDED
The diff for this file is too large to render. See raw diff
 
test.typos.pred.175K.12000.bidir.man ADDED
The diff for this file is too large to render. See raw diff
 
test.typos.pred.80K.7000.man ADDED
The diff for this file is too large to render. See raw diff