AlienKevin committed
Commit a54c5b6
Parent(s): 2bed565

Upload 113 files

This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +4 -0
- CyberCan.dict +0 -0
- CyberCan.xlsx +3 -0
- abc_rare_char_mapping.txt +360 -0
- checkpoint-11000/config.json +76 -0
- checkpoint-11000/generation_config.json +13 -0
- checkpoint-11000/optimizer.pt +3 -0
- checkpoint-11000/pytorch_model.bin +3 -0
- checkpoint-11000/rng_state.pth +3 -0
- checkpoint-11000/scheduler.pt +3 -0
- checkpoint-11000/trainer_state.json +456 -0
- checkpoint-11000/training_args.bin +3 -0
- commercial_baselines/bing.can +0 -0
- commercial_baselines/bing.key +1 -0
- commercial_baselines/bing.man +0 -0
- commercial_baselines/bing_translator.ipynb +234 -0
- commercial_baselines/lihkg.filtered.man +0 -0
- commercial_baselines/load_can.ipynb +58 -0
- finetune.ipynb +0 -0
- load_abc.ipynb +964 -0
- load_lihkg.ipynb +242 -0
- load_mined_bitext.ipynb +175 -0
- para/.DS_Store +0 -0
- para/dev/.DS_Store +0 -0
- para/dev/dev.can +0 -0
- para/dev/dev.man +0 -0
- para/dev/dev.norm.can +0 -0
- para/test/.DS_Store +0 -0
- para/test/test.can +0 -0
- para/test/test.man +0 -0
- para/test/test.norm.can +0 -0
- para/test/test.typos.can +0 -0
- para/test/test.typos.man +0 -0
- process_novels.ipynb +104 -0
- runs/Apr16_10-10-56_Kevins-MacBook-Pro-4.local/1681654257.025384/events.out.tfevents.1681654257.Kevins-MacBook-Pro-4.local.13638.1 +3 -0
- runs/Apr16_10-10-56_Kevins-MacBook-Pro-4.local/events.out.tfevents.1681654257.Kevins-MacBook-Pro-4.local.13638.0 +3 -0
- test.ipynb +0 -0
- test.pred.130K.new.12000.man +0 -0
- test.pred.130K.new.6000.man +0 -0
- test.pred.130K.old.man +0 -0
- test.pred.16K.man +0 -0
- test.pred.175K.12000.bidir.man +0 -0
- test.pred.80K.man +0 -0
- test.pred.bing.11000.man +0 -0
- test.pred.bing.man +0 -0
- test.typos.pred.130K.new.12000.man +0 -0
- test.typos.pred.130K.old.12000.man +0 -0
- test.typos.pred.170K.mined.6000.man +0 -0
- test.typos.pred.175K.12000.bidir.man +0 -0
- test.typos.pred.80K.7000.man +0 -0
.gitattributes
CHANGED
@@ -32,3 +32,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+CyberCan.xlsx filter=lfs diff=lfs merge=lfs -text
+train/lihkg.can filter=lfs diff=lfs merge=lfs -text
+train/train.can filter=lfs diff=lfs merge=lfs -text
+train/train.man filter=lfs diff=lfs merge=lfs -text
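The four added lines route the new spreadsheet and large training text files through Git LFS, so the repository stores small pointer files instead of the full blobs. As a tiny sketch (assuming it is run from the repository root), the LFS-tracked patterns can be listed straight from this file:

# Sketch: list the patterns that .gitattributes routes through Git LFS.
with open(".gitattributes") as f:
    lfs_patterns = [line.split()[0] for line in f if "filter=lfs" in line]
print(lfs_patterns)  # now includes CyberCan.xlsx, train/lihkg.can, train/train.can and train/train.man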
CyberCan.dict
ADDED
The diff for this file is too large to render.
See raw diff
CyberCan.xlsx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:459bafd61fd02b74b1e94388298ed81623e7641711abb06841fe122c3cfdab1e
size 2680859
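These three lines are only a Git LFS pointer; the actual 2.6 MB spreadsheet is fetched by the LFS filter configured in .gitattributes. As a rough sketch (assuming the file has been pulled locally and pandas with openpyxl is installed; the sheet layout is not visible in this diff, so nothing is assumed about its columns), it could be inspected like this:

# Hypothetical inspection of CyberCan.xlsx after the LFS object has been pulled.
import pandas as pd

df = pd.read_excel("CyberCan.xlsx")  # reading .xlsx requires openpyxl
print(df.shape)
print(df.columns.tolist())
print(df.head())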
abc_rare_char_mapping.txt
ADDED
@@ -0,0 +1,360 @@
𠹺 埋 388
噖 琴 162
𡁵 緊 157
𠶧 掂 88
嚫 親 88
屘 尾 57
衭 褲 47
贃 賺 43
說 説 35
𧵳 蝕 30
歳 歲 27
𢫏 冚 27
𨶙 撚 25
癐 攰 25
𦡆 窟 25
𨃩 跣 24
况 況 21
内 內 19
𢵌 隊 19
𦧺 賴 18
𠹌 掕 18
爲 為 16
𢱑 搲 16
𡁯 嘟 15
𠱓 詭 14
𠵿 披 14
踹 踩 13
㗇 蝦 13
𠾴 唪 13
嗍 索 13
𧘹 太 13
𠹳 傑 12
𠹭 哥 12
脫 脱 12
䁪 眨 11
𧨾 氹 11
掬 谷 11
𠸐 襟 11
啥 沙 11
𠱃 凹 10
噔 等 10
捹 揈 10
𠹻 陣 10
𠼻 基 10
噠 達 10
𨳊 鳩 10
𢲲 晾 9
𨉖 匿 9
躭 耽 9
䠋 卑 9
嘮 勞 9
啽 罨 9
滮 標 8
㧻 篤 8
𧶄 爭 8
𦛚 淰 8
撠 棘 8
呡 抆 8
睸 瞇 8
𠰲 嗝 8
𥔿 冚 8
唎 脷 8
𠸊 佮 8
𬜐 舔 8
蔥 葱 8
B B 7
𢯊 扚 7
𫫃 𡃀 7
銹 鏽 7
㓤 拮 7
䁯 瞌 7
啉 林 7
臥 卧 7
𠓼 攝 7
稅 税 7
趟 7
喴 依 7
噱 ??? 7
𡄯 ??? 6
揤 擳 6
𢤹 啹 6
噏 6
鷄 雞 6
??? 6
𦣇 籮 6
齧 咬 6
𠮨 乃 6
啤 6
𡀝 稔 6
婄 蓓 6
𠼱 累 6
𠱂 腯 5
磧 石責 5
𠰋 㕭 5
𡂖 戾 5
擏 擎 5
𥋇 掌 5
揢 咳 5
㨆 冧 5
𠾍 棄 5
兌 兑 5
𢺳 掹 5
坺 擗 5
鍚 錫 5
𣘚 ??? 5
𪘁 ??? 5
𨳍 七 5
嗙 o旁 5
𠼰 ??? 5
𨳒 屌 4
唿 篋 4
𣳼 ??? 4
𦂥 ??? 4
溚 塔 4
囋 ??? 4
瀄 吱 4
𠌥 ??? 4
𢫦 ??? 4
𢶍 ??? 4
𠲵 ??? 4
䉺 ??? 4
炕 ??? 4
𢴈 撻 4
𡲢 ??? 4
𥅈 立 4
𬧊 甩 4
簕 勒 4
査 查 4
𩜠 岩 4
𫬿 ??? 4
𠜱 卑刂 4
嚬 顰 4
𠹹 ??? 4
𦉘 ??? 4
唦 沙 4
㨘 扌省 4
𡄽 瀉 4
熗 槍 4
𡁷 ??? 4
𠿬 ??? 4
咜 叱 4
𠸏 茄 4
𡁸 ??? 4
𡃵 ??? 4
𪚩 ??? 4
D D 4
Q Q 4
𨆯 ??? 3
啗 啖 3
蔸 艹兜 3
舗 鋪 3
囪 窗 3
艔 ??? 3
洩 ??? 3
𢵧 ??? 3
菓 果 3
䪴 ??? 3
䆲 ??? 3
痱 ??? 3
趿 拖 3
𠮩 ??? 3
搉 確 3
矋 矖 3
𠻗 ??? 3
𢲈 ??? 3
潞 氵路 3
沬 ??? 3
揇 扌南 3
齃 曷 3
𡃤 賴 3
𡃶 ??? 3
瀟 ??? 3
軨 ??? 3
鉻 ??? 3
??? 3
㿭 斥 3
𢵄 ??? 3
㗲 ??? 3
𢫕 ??? 3
𢰸 ??? 3
葫 ??? 3
咔 ??? 3
嚎 ??? 3
嗿 ??? 3
咈 o弗 3
咾 嚕 3
??? 3
𠵈 妹 3
吥 o不 3
𠾭 ??? 3
𠾵 ??? 3
朘 俊 3
觥 黃 3
㩧 扌暴 2
焙 ??? 2
兀 ??? 2
䭤 ??? 2
饊 ??? 2
[ ??? 2
] ??? 2
炖 ??? 2
争 爭 2
䁓 ??? 2
𡂝 ??? 2
𩬎 壬 2
鈒 閘 2
亁 乾 2
炠 灬甲 2
摼 ??? 2
𠺬 ??? 2
𠵉 ??? 2
蝄 ??? 2
??? 2
蔫 艹焉 2
㘉 ??? 2
荏 ??? 2
墘 土乾 2
嗏 搽 2
呣 o母 2
曚 矇 2
壬 ??? 2
揅 研 2
溼 濕 2
囓 咬 2
嚙 咬 2
枴 拐 2
𡃀 ??? 2
饑 ??? 2
䏭 ??? 2
挼 挪 2
掱 ??? 2
咑 打 2
芙 ??? 2
𦂗 ??? 2
舦 軚 2
𢶤 扌靴 2
翡 ??? 2
翠 ??? 2
酡 酉它 2
𫭊 ??? 2
煀 火屈 2
耙 ??? 2
𠿭 滑 2
鉤 鈎 2
𠻘 ??? 2
脽 離 2
焊 ??? 2
唊 o夾 2
胅 ⺼失 2
翕 ??? 2
摜 摔 2
僚 ??? 1
𩗴 ??? 1
毡 ??? 1
跤 ??? 1
梧 ??? 1
痄 疒乍 1
卟 卜 1
劄 札 1
𠶜 制 1
睜 ??? 1
迹 跡 1
揃 扌前 1
唨 o阻 1
謢 護 1
菻 麻 1
𣚺 ??? 1
鷓 庶鳥 1
鴣 古鳥 1
强 ??? 1
𠾶 ??? 1
𡆀 轟 1
拫 扌艮 1
𠼮 偽 1
汞 ??? 1
㤿 ??? 1
厴 ??? 1
𥀬 ??? 1
牯 ??? 1
𡇙 ??? 1
讕 賴 1
𠿫 ??? 1
瘺 婁 1
骲 骨包 1
𫲭 ??? 1
瓏 玉龍 1
繚 ??? 1
撿 ??? 1
跀 ⻊月 1
𢛴 掹 1
蝻 虫南 1
赧 羞赤 1
𪙛 甩 1
??? 1
檳 ??? 1
潲 餿 1
𢶠 ??? 1
秧 ??? 1
蒔 ??? 1
炩 灬令 1
㩋 ??? 1
饅 ??? 1
鍍 ??? 1
𢚖 ??? 1
𧊅 虫另 1
??? 1
篸 ??? 1
𩟔 ??? 1
撍 賺 1
栗 ??? 1
??? 1
𡆇 ??? 1
杧 芒 1
榛 ??? 1
蠄 虫禽 1
蟧 ??? 1
嘶 ??? 1
梆 ??? 1
竪 豎 1
騾 ??? 1
矺 ??? 1
堀 ??? 1
麝 ??? 1
慪 嘔 1
撴 扌敦 1
哾 啜 1
𠳖 ??? 1
洌 冽 1
霹 ??? 1
𠾼 ??? 1
𬦠 ??? 1
𤌍 ??? 1
𬧯 ??? 1
厠 廁 1
㖡 ??? 1
跁 ⻊巴 1
鉎 ??? 1
𧣈 ??? 1
𠳏 ??? 1
㹃 非 1
𧝞 ??? 1
𡀞 ??? 1
㦒 ??? 1
𩩍 娉 1
𢱢 ??? 1
鍟 ??? 1
煱 ??? 1
撘 搭 1
閱 ??? 1
橇 喬 1
籽 ??? 1
庵 ??? 1
厨 ??? 1
疴 屙 1
豹 ??? 1
杠 槓 1
咘 o布 1
裡 ??? 1
熏 燻 1
??? 1
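Each line appears to hold a rare character, a replacement (or ??? when none was chosen), and a frequency count, separated by whitespace; some replacements describe a composed glyph with two components (e.g. 扌南). A minimal sketch of how such a mapping could be loaded and used to normalize Cantonese text (the exact use inside this repo is an assumption):

# Minimal sketch: load abc_rare_char_mapping.txt and substitute rare characters.
# Assumes the three whitespace-separated fields are (rare_char, replacement, count);
# entries whose replacement is "???" or missing are left untouched.
def load_mapping(path="abc_rare_char_mapping.txt"):
    mapping = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if len(fields) == 3 and fields[1] != "???":
                rare, replacement, _count = fields
                mapping[rare] = replacement
    return mapping

def normalize(text, mapping):
    return "".join(mapping.get(ch, ch) for ch in text)

mapping = load_mapping()
print(normalize("噖日", mapping))  # e.g. 噖 -> 琴 per the table above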
checkpoint-11000/config.json
ADDED
@@ -0,0 +1,76 @@
{
  "_name_or_path": "Ayaka/bart-base-cantonese",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 101,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 101,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 102,
  "forced_eos_token_id": 102,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_length": 64,
  "max_position_embeddings": 512,
  "min_length": 3,
  "model_type": "bart",
  "no_repeat_ngram_size": 3,
  "normalize_before": false,
  "normalize_embedding": true,
  "num_beams": 4,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "scale_embedding": false,
  "task_specific_params": {
    "summarization": {
      "length_penalty": 1.0,
      "max_length": 128,
      "min_length": 12,
      "num_beams": 4
    },
    "summarization_cnn": {
      "length_penalty": 2.0,
      "max_length": 142,
      "min_length": 56,
      "num_beams": 4
    },
    "summarization_xsum": {
      "length_penalty": 1.0,
      "max_length": 62,
      "min_length": 11,
      "num_beams": 6
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.27.4",
  "use_cache": true,
  "vocab_size": 12660
}
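The config describes a 6-encoder/6-decoder BART (d_model 768, vocab_size 12660) initialised from Ayaka/bart-base-cantonese. A minimal sketch for loading this checkpoint with Hugging Face transformers and translating one sentence (the checkpoint folder contains no tokenizer files, so reusing the base model's tokenizer is an assumption; the sample sentence is only illustrative):

# Sketch: load checkpoint-11000 and translate one Cantonese sentence to Mandarin.
from transformers import AutoTokenizer, BartForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("Ayaka/bart-base-cantonese")  # assumed to match the checkpoint
model = BartForConditionalGeneration.from_pretrained("checkpoint-11000")

inputs = tokenizer("佢哋琴日冇嚟", return_tensors="pt")
outputs = model.generate(**inputs, num_beams=4, max_length=64)  # mirrors generation_config.json below
print(tokenizer.decode(outputs[0], skip_special_tokens=True))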
checkpoint-11000/generation_config.json
ADDED
@@ -0,0 +1,13 @@
{
  "bos_token_id": 101,
  "decoder_start_token_id": 101,
  "early_stopping": true,
  "eos_token_id": 102,
  "forced_eos_token_id": 102,
  "max_length": 64,
  "min_length": 3,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 0,
  "transformers_version": "4.27.4"
}
checkpoint-11000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c35fcf880925d6524a4d69c6153d76494bf292e183492aef621926fb1a3339f3
size 878171525
checkpoint-11000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:41d33ab2cbebaa3b1b34e65ceeb91b9ac3e55d369daa79e96c105e90f84610c8
size 439148829
checkpoint-11000/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c7e530afd0f80bdec827d3df914776b44f41c60dd27c32dc14b519e239a533e
size 13553
checkpoint-11000/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:20f836947b7cda13930ff3f199cca6c84444150a1c16e3cdc68ff9b8ba9995f2
size 627
checkpoint-11000/trainer_state.json
ADDED
@@ -0,0 +1,456 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.38198354265916,
  "global_step": 11000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.04, "learning_rate": 5e-06, "loss": 1.7639, "step": 200},
    {"epoch": 0.09, "learning_rate": 1e-05, "loss": 0.4886, "step": 400},
    {"epoch": 0.13, "learning_rate": 1.5e-05, "loss": 0.2481, "step": 600},
    {"epoch": 0.17, "learning_rate": 2e-05, "loss": 0.1724, "step": 800},
    {"epoch": 0.22, "learning_rate": 2.5e-05, "loss": 0.1417, "step": 1000},
    {"epoch": 0.22, "eval_bleu": 15.49859933982253, "eval_chrf": 14.312711083470898, "eval_loss": 0.37249916791915894, "eval_runtime": 140.7211, "eval_samples_per_second": 4.548, "eval_steps_per_second": 0.071, "step": 1000},
    {"epoch": 0.26, "learning_rate": 3e-05, "loss": 0.1238, "step": 1200},
    {"epoch": 0.3, "learning_rate": 3.5e-05, "loss": 0.1094, "step": 1400},
    {"epoch": 0.35, "learning_rate": 4e-05, "loss": 0.1004, "step": 1600},
    {"epoch": 0.39, "learning_rate": 4.5e-05, "loss": 0.0889, "step": 1800},
    {"epoch": 0.43, "learning_rate": 5e-05, "loss": 0.0843, "step": 2000},
    {"epoch": 0.43, "eval_bleu": 13.930224375278131, "eval_chrf": 15.927172187426772, "eval_loss": 0.3602243661880493, "eval_runtime": 171.4695, "eval_samples_per_second": 3.732, "eval_steps_per_second": 0.058, "step": 2000},
    {"epoch": 0.48, "learning_rate": 4.952584163110479e-05, "loss": 0.0795, "step": 2200},
    {"epoch": 0.52, "learning_rate": 4.905168326220958e-05, "loss": 0.0748, "step": 2400},
    {"epoch": 0.56, "learning_rate": 4.8577524893314366e-05, "loss": 0.0705, "step": 2600},
    {"epoch": 0.61, "learning_rate": 4.8103366524419156e-05, "loss": 0.0665, "step": 2800},
    {"epoch": 0.65, "learning_rate": 4.7629208155523946e-05, "loss": 0.0625, "step": 3000},
    {"epoch": 0.65, "eval_bleu": 20.903227705509728, "eval_chrf": 19.94358238391718, "eval_loss": 0.363214910030365, "eval_runtime": 179.3923, "eval_samples_per_second": 3.568, "eval_steps_per_second": 0.056, "step": 3000},
    {"epoch": 0.69, "learning_rate": 4.7155049786628736e-05, "loss": 0.0607, "step": 3200},
    {"epoch": 0.74, "learning_rate": 4.6680891417733527e-05, "loss": 0.0584, "step": 3400},
    {"epoch": 0.78, "learning_rate": 4.620673304883831e-05, "loss": 0.0578, "step": 3600},
    {"epoch": 0.82, "learning_rate": 4.57325746799431e-05, "loss": 0.0549, "step": 3800},
    {"epoch": 0.87, "learning_rate": 4.525841631104789e-05, "loss": 0.0531, "step": 4000},
    {"epoch": 0.87, "eval_bleu": 17.338938784234383, "eval_chrf": 15.930568457831859, "eval_loss": 0.36546987295150757, "eval_runtime": 170.3063, "eval_samples_per_second": 3.758, "eval_steps_per_second": 0.059, "step": 4000},
    {"epoch": 0.91, "learning_rate": 4.478425794215268e-05, "loss": 0.0521, "step": 4200},
    {"epoch": 0.95, "learning_rate": 4.431009957325747e-05, "loss": 0.0513, "step": 4400},
    {"epoch": 1.0, "learning_rate": 4.383594120436226e-05, "loss": 0.05, "step": 4600},
    {"epoch": 1.04, "learning_rate": 4.3361782835467044e-05, "loss": 0.0462, "step": 4800},
    {"epoch": 1.08, "learning_rate": 4.2887624466571834e-05, "loss": 0.0442, "step": 5000},
    {"epoch": 1.08, "eval_bleu": 35.5303321748609, "eval_chrf": 30.398609275779588, "eval_loss": 0.3768843710422516, "eval_runtime": 147.673, "eval_samples_per_second": 4.334, "eval_steps_per_second": 0.068, "step": 5000},
    {"epoch": 1.13, "learning_rate": 4.2413466097676624e-05, "loss": 0.0437, "step": 5200},
    {"epoch": 1.17, "learning_rate": 4.1939307728781414e-05, "loss": 0.0436, "step": 5400},
    {"epoch": 1.21, "learning_rate": 4.1465149359886204e-05, "loss": 0.0428, "step": 5600},
    {"epoch": 1.26, "learning_rate": 4.099099099099099e-05, "loss": 0.0418, "step": 5800},
    {"epoch": 1.3, "learning_rate": 4.051683262209578e-05, "loss": 0.0408, "step": 6000},
    {"epoch": 1.3, "eval_bleu": 40.96986293672358, "eval_chrf": 35.0063576863817, "eval_loss": 0.38005733489990234, "eval_runtime": 82.4647, "eval_samples_per_second": 7.761, "eval_steps_per_second": 0.121, "step": 6000},
    {"epoch": 1.34, "learning_rate": 4.004267425320057e-05, "loss": 0.041, "step": 6200},
    {"epoch": 1.39, "learning_rate": 3.956851588430536e-05, "loss": 0.0385, "step": 6400},
    {"epoch": 1.43, "learning_rate": 3.909435751541015e-05, "loss": 0.0397, "step": 6600},
    {"epoch": 1.47, "learning_rate": 3.862019914651494e-05, "loss": 0.0384, "step": 6800},
    {"epoch": 1.52, "learning_rate": 3.814604077761973e-05, "loss": 0.0389, "step": 7000},
    {"epoch": 1.52, "eval_bleu": 41.51574989819788, "eval_chrf": 35.55197531009423, "eval_loss": 0.38628411293029785, "eval_runtime": 89.9685, "eval_samples_per_second": 7.114, "eval_steps_per_second": 0.111, "step": 7000},
    {"epoch": 1.56, "learning_rate": 3.767188240872452e-05, "loss": 0.038, "step": 7200},
    {"epoch": 1.6, "learning_rate": 3.719772403982931e-05, "loss": 0.0374, "step": 7400},
    {"epoch": 1.65, "learning_rate": 3.67235656709341e-05, "loss": 0.0359, "step": 7600},
    {"epoch": 1.69, "learning_rate": 3.624940730203888e-05, "loss": 0.0358, "step": 7800},
    {"epoch": 1.73, "learning_rate": 3.577524893314367e-05, "loss": 0.0359, "step": 8000},
    {"epoch": 1.73, "eval_bleu": 23.208736406312035, "eval_chrf": 23.97795821953749, "eval_loss": 0.3921656310558319, "eval_runtime": 182.5523, "eval_samples_per_second": 3.506, "eval_steps_per_second": 0.055, "step": 8000},
    {"epoch": 1.78, "learning_rate": 3.530109056424846e-05, "loss": 0.0348, "step": 8200},
    {"epoch": 1.82, "learning_rate": 3.482693219535325e-05, "loss": 0.0352, "step": 8400},
    {"epoch": 1.86, "learning_rate": 3.435277382645804e-05, "loss": 0.0351, "step": 8600},
    {"epoch": 1.91, "learning_rate": 3.3878615457562826e-05, "loss": 0.0345, "step": 8800},
    {"epoch": 1.95, "learning_rate": 3.3404457088667616e-05, "loss": 0.0337, "step": 9000},
    {"epoch": 1.95, "eval_bleu": 41.547921684162176, "eval_chrf": 35.46471050376956, "eval_loss": 0.40451329946517944, "eval_runtime": 89.4039, "eval_samples_per_second": 7.159, "eval_steps_per_second": 0.112, "step": 9000},
    {"epoch": 1.99, "learning_rate": 3.2930298719772407e-05, "loss": 0.0343, "step": 9200},
    {"epoch": 2.04, "learning_rate": 3.24561403508772e-05, "loss": 0.0309, "step": 9400},
    {"epoch": 2.08, "learning_rate": 3.198198198198199e-05, "loss": 0.0296, "step": 9600},
    {"epoch": 2.12, "learning_rate": 3.150782361308677e-05, "loss": 0.0291, "step": 9800},
    {"epoch": 2.17, "learning_rate": 3.103366524419156e-05, "loss": 0.0295, "step": 10000},
    {"epoch": 2.17, "eval_bleu": 41.51485442459467, "eval_chrf": 35.46553158852993, "eval_loss": 0.4056099057197571, "eval_runtime": 89.2092, "eval_samples_per_second": 7.174, "eval_steps_per_second": 0.112, "step": 10000},
    {"epoch": 2.21, "learning_rate": 3.055950687529635e-05, "loss": 0.0285, "step": 10200},
    {"epoch": 2.25, "learning_rate": 3.008534850640114e-05, "loss": 0.0293, "step": 10400},
    {"epoch": 2.3, "learning_rate": 2.9611190137505927e-05, "loss": 0.0285, "step": 10600},
    {"epoch": 2.34, "learning_rate": 2.9137031768610717e-05, "loss": 0.0288, "step": 10800},
    {"epoch": 2.38, "learning_rate": 2.8662873399715508e-05, "loss": 0.0279, "step": 11000},
    {"epoch": 2.38, "eval_bleu": 41.75820932324433, "eval_chrf": 35.69247581900476, "eval_loss": 0.41462868452072144, "eval_runtime": 152.578, "eval_samples_per_second": 4.195, "eval_steps_per_second": 0.066, "step": 11000}
  ],
  "max_steps": 23090,
  "num_train_epochs": 5,
  "total_flos": 2.682581278261248e+16,
  "trial_name": null,
  "trial_params": null
}
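log_history interleaves training-loss entries (every 200 steps) with evaluation entries (every 1000 steps). A small sketch for pulling the BLEU/chrF curve out of this file:

# Sketch: extract the eval curve from checkpoint-11000/trainer_state.json.
import json

with open("checkpoint-11000/trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_bleu" in entry:
        print(f"step {entry['step']:>5}: BLEU {entry['eval_bleu']:.2f}, chrF {entry['eval_chrf']:.2f}")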
checkpoint-11000/training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:87fcb457b09c1b3e7d50dad4c575e4da4493a663f81ee73d6fe8065b4e460e1a
size 3643
commercial_baselines/bing.can
ADDED
The diff for this file is too large to render.
See raw diff
commercial_baselines/bing.key
ADDED
@@ -0,0 +1 @@
99b56b71eab141dfad4e5a4789e958e8
commercial_baselines/bing.man
ADDED
The diff for this file is too large to render.
See raw diff
commercial_baselines/bing_translator.ipynb
ADDED
@@ -0,0 +1,234 @@
Cell 1 (code):

import requests, uuid

# Add your key and endpoint
with open("bing.key", "r") as key_file:
    key = key_file.read()
endpoint = "https://api.cognitive.microsofttranslator.com"

# location, also known as region.
# required if you're using a multi-service or regional (not global) resource. It can be found in the Azure portal on the Keys and Endpoint page.
location = "eastus"

path = '/translate'
constructed_url = endpoint + path

params = {
    'api-version': '3.0',
    'from': 'yue',
    'to': ['zh-Hant']
}

headers = {
    'Ocp-Apim-Subscription-Key': key,
    # location required if you're using a multi-service or regional (not global) resource.
    'Ocp-Apim-Subscription-Region': location,
    'Content-type': 'application/json',
    'X-ClientTraceId': str(uuid.uuid4())
}

# https://stackoverflow.com/a/312464/6798201
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

Cell 2 (code):

"""
Translate test sentences
"""

# You can pass more than one object in body.
with open("../para/test/test.typos.can", "r") as input_file:
    body = [{'text': line} for line in input_file.read().splitlines()]

# Clear previous outputs
open("../test.typos.pred.bing.man", 'w').close()

# Split translation request into chunks of 500 lines (10,000 character limit per request)
with open("../test.typos.pred.bing.man", "a+") as output_file:
    for chunk in chunks(body, 500):
        request = requests.post(constructed_url, params=params, headers=headers, json=chunk)
        response = request.json()
        for line in response:
            output_file.write(line['translations'][0]['text'] + "\n")

Output: the run was stopped by hand (KeyboardInterrupt traceback inside requests.post).

Cell 3 (code):

"""
Translate training sentences
"""

# You can pass more than one object in body.
with open("bing.can", "r") as input_file:
    body = [{'text': line} for line in input_file.read().splitlines()]

# Clear previous outputs
open("bing.man", 'w').close()

from tqdm import tqdm
import time

# Split translation request into chunks of 400 lines (10,000 character limit per request)
with open("bing.man", "a+") as output_file:
    for chunk in tqdm(list(chunks(body, 400))):
        request = requests.post(constructed_url, params=params, headers=headers, json=chunk)
        response = request.json()
        for line in response:
            output_file.write(line['translations'][0]['text'] + "\n")
        # Slow down because of hourly request limit for free tier
        time.sleep(5)

Output: 0%| | 0/110 [00:01<?, ?it/s], then stopped by hand (KeyboardInterrupt during time.sleep).

Cell 4 (code):

# You can pass more than one object in body.
with open("../train/abc.can", "r") as input_file:
    body = [{'text': line} for line in input_file.read().splitlines() if len(line) >= 5]

from tqdm import tqdm
import time

# Split translation request into chunks of 400 lines (10,000 character limit per request)
with open("bing.man", "a+") as output_file:
    for chunk in tqdm(list(chunks(body, 400))):
        request = requests.post(constructed_url, params=params, headers=headers, json=chunk)
        response = request.json()
        for line in response:
            output_file.write(line['translations'][0]['text'] + "\n")
        # Slow down because of hourly request limit for free tier
        time.sleep(5)

Output: 100%|██████████| 38/38 [03:33<00:00, 5.61s/it]

Cell 5 (code):

# You can pass more than one object in body.
with open("../train/lihkg.filtered.can", "r") as input_file:
    body = [{'text': line} for line in input_file.read().splitlines()]

from tqdm import tqdm
import time

# Split translation request into chunks of 400 lines (10,000 character limit per request)
with open("lihkg.filtered.man", "w+") as output_file:
    for chunk in tqdm(list(chunks(body, 400))):
        request = requests.post(constructed_url, params=params, headers=headers, json=chunk)
        response = request.json()
        for line in response:
            output_file.write(line['translations'][0]['text'] + "\n")
        # Slow down because of hourly request limit for free tier
        time.sleep(3)

Output: 100%|██████████| 352/352 [21:28<00:00, 3.66s/it]

Notebook metadata: Python 3 kernel, Python 3.10.6, nbformat 4.
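The notebook throttles requests with fixed time.sleep calls to stay under the free-tier quota. As an alternative sketch (not taken from the notebook; it assumes the Translator endpoint signals throttling with HTTP 429 and, optionally, a Retry-After header), the same chunked POST could back off only when it is actually rate limited:

# Sketch: retry a chunk on HTTP 429 instead of sleeping a fixed amount after every request.
import time
import requests

def translate_chunk(url, params, headers, chunk, max_retries=5):
    for attempt in range(max_retries):
        response = requests.post(url, params=params, headers=headers, json=chunk)
        if response.status_code != 429:
            response.raise_for_status()
            return response.json()
        # Back off using Retry-After when present, otherwise exponentially.
        wait = int(response.headers.get("Retry-After", 2 ** attempt))
        time.sleep(wait)
    raise RuntimeError("still throttled after retries")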
commercial_baselines/lihkg.filtered.man
ADDED
The diff for this file is too large to render.
See raw diff
commercial_baselines/load_can.ipynb
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 7,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stdout",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"Number of Cantonese sentences: 58492\n"
|
13 |
+
]
|
14 |
+
}
|
15 |
+
],
|
16 |
+
"source": [
|
17 |
+
"can_sents = []\n",
|
18 |
+
"\n",
|
19 |
+
"with open(\"../train/common_voice.can\", \"r\") as common_voice_file, open(\"../train/wordshk.can\", \"r\") as wordshk_file,\\\n",
|
20 |
+
" open(\"../train/novels.can\", \"r\") as novels_file,\\\n",
|
21 |
+
" open(\"../train/abc.can\", \"r\") as abc_file:\n",
|
22 |
+
" lines = common_voice_file.read().splitlines() + wordshk_file.read().splitlines() + novels_file.read().splitlines() + abc_file.read().splitlines()\n",
|
23 |
+
" for line in lines:\n",
|
24 |
+
" if len(line) >= 5:\n",
|
25 |
+
" can_sents.append(line)\n",
|
26 |
+
"\n",
|
27 |
+
"print(\"Number of Cantonese sentences: \", len(can_sents))\n",
|
28 |
+
"\n",
|
29 |
+
"with open(\"bing.can\", \"w\") as f:\n",
|
30 |
+
" for sent in can_sents:\n",
|
31 |
+
" f.write(sent + \"\\n\")\n",
|
32 |
+
" f.flush()\n"
|
33 |
+
]
|
34 |
+
}
|
35 |
+
],
|
36 |
+
"metadata": {
|
37 |
+
"kernelspec": {
|
38 |
+
"display_name": "Python 3",
|
39 |
+
"language": "python",
|
40 |
+
"name": "python3"
|
41 |
+
},
|
42 |
+
"language_info": {
|
43 |
+
"codemirror_mode": {
|
44 |
+
"name": "ipython",
|
45 |
+
"version": 3
|
46 |
+
},
|
47 |
+
"file_extension": ".py",
|
48 |
+
"mimetype": "text/x-python",
|
49 |
+
"name": "python",
|
50 |
+
"nbconvert_exporter": "python",
|
51 |
+
"pygments_lexer": "ipython3",
|
52 |
+
"version": "3.10.6"
|
53 |
+
},
|
54 |
+
"orig_nbformat": 4
|
55 |
+
},
|
56 |
+
"nbformat": 4,
|
57 |
+
"nbformat_minor": 2
|
58 |
+
}
|
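The cell above builds bing.can by chaining four open() calls. An equivalent, slightly shorter sketch with pathlib, assuming the same ../train/*.can sources and the same 5-character minimum length:

from pathlib import Path

can_sents = []
for name in ["common_voice.can", "wordshk.can", "novels.can", "abc.can"]:
    # Keep only sentences with at least 5 characters, as in the original cell.
    lines = Path("../train", name).read_text().splitlines()
    can_sents += [line for line in lines if len(line) >= 5]

with open("bing.can", "w") as f:
    f.write("\n".join(can_sents) + "\n")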
finetune.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
load_abc.ipynb
ADDED
@@ -0,0 +1,964 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 33,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"def normalize_punctuations(line: str) -> str:\n",
|
10 |
+
" # Replace all English punctuations with Chinese ones\n",
|
11 |
+
" line = line.replace(\",\", \",\").replace(\"!\", \"!\").replace(\"?\", \"?\")\\\n",
|
12 |
+
" .replace(\":\", \":\").replace(\";\", \";\").replace(\"(\", \"(\").replace(\")\", \")\")\n",
|
13 |
+
" return line"
|
14 |
+
]
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"cell_type": "code",
|
18 |
+
"execution_count": 34,
|
19 |
+
"metadata": {},
|
20 |
+
"outputs": [
|
21 |
+
{
|
22 |
+
"name": "stdout",
|
23 |
+
"output_type": "stream",
|
24 |
+
"text": [
|
25 |
+
"Got 14838 Cantonese sentences with length >= 5\n"
|
26 |
+
]
|
27 |
+
}
|
28 |
+
],
|
29 |
+
"source": [
|
30 |
+
"import re\n",
|
31 |
+
"from functools import reduce\n",
|
32 |
+
"\n",
|
33 |
+
"can_sentence_start = re.compile(r\"[0-9]*hz \")\n",
|
34 |
+
"can_lines = []\n",
|
35 |
+
"\n",
|
36 |
+
"with open(\"train/abc/abc_cantonese_index_00001_to_04587_line_1_to_4575.xml\", \"r\") as abc_file1,\\\n",
|
37 |
+
"open(\"train/abc/abc_cantonese_index_04588_to_09175_line_4576_to_9150.xml\", \"r\") as abc_file2,\\\n",
|
38 |
+
" open(\"train/abc/abc_cantonese_index_09176_to_13775_line_9151_to_13725.xml\", \"r\") as abc_file3,\\\n",
|
39 |
+
" open(\"train/abc/abc_cantonese_index_13776_to_FE99FD5B4E37BE32_line_13726_to_18302.xml\", \"r\") as abc_file4:\n",
|
40 |
+
" lines = reduce(lambda lines, file: lines + file.read().splitlines(), [abc_file1, abc_file2, abc_file3, abc_file4], [])\n",
|
41 |
+
" for line in lines:\n",
|
42 |
+
" match = can_sentence_start.match(line)\n",
|
43 |
+
" if match and not \"(empty band???)\" in line:\n",
|
44 |
+
" line = line[match.end():].strip()\n",
|
45 |
+
" if len(line) >= 5:\n",
|
46 |
+
" can_lines.append(normalize_punctuations(line))\n",
|
47 |
+
"\n",
|
48 |
+
"print(\"Got {} Cantonese sentences with length >= 5\".format(len(can_lines)))"
|
49 |
+
]
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"cell_type": "code",
|
53 |
+
"execution_count": 35,
|
54 |
+
"metadata": {},
|
55 |
+
"outputs": [
|
56 |
+
{
|
57 |
+
"name": "stdout",
|
58 |
+
"output_type": "stream",
|
59 |
+
"text": [
|
60 |
+
"Found 4527 common Cantonese characters\n"
|
61 |
+
]
|
62 |
+
}
|
63 |
+
],
|
64 |
+
"source": [
|
65 |
+
"common_can_charset = set()\n",
|
66 |
+
"\n",
|
67 |
+
"with open(\"train/wordshk.can\", \"r\") as wordshk_file:\n",
|
68 |
+
" for c in wordshk_file.read():\n",
|
69 |
+
" common_can_charset.add(c)\n",
|
70 |
+
"\n",
|
71 |
+
"print(f\"Found {len(common_can_charset)} common Cantonese characters\")"
|
72 |
+
]
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"cell_type": "code",
|
76 |
+
"execution_count": 36,
|
77 |
+
"metadata": {},
|
78 |
+
"outputs": [
|
79 |
+
{
|
80 |
+
"name": "stdout",
|
81 |
+
"output_type": "stream",
|
82 |
+
"text": [
|
83 |
+
"Found 365 rare Cantonese characters\n",
|
84 |
+
"𠹺 388\n",
|
85 |
+
"噖 162\n",
|
86 |
+
"𡁵 157\n",
|
87 |
+
"𠶧 88\n",
|
88 |
+
"嚫 88\n",
|
89 |
+
"屘 57\n",
|
90 |
+
"衭 47\n",
|
91 |
+
"贃 43\n",
|
92 |
+
"說 35\n",
|
93 |
+
"𧵳 30\n",
|
94 |
+
"歳 27\n",
|
95 |
+
"𢫏 27\n",
|
96 |
+
"𨶙 25\n",
|
97 |
+
"癐 25\n",
|
98 |
+
"𦡆 25\n",
|
99 |
+
"𨃩 24\n",
|
100 |
+
"况 21\n",
|
101 |
+
"内 19\n",
|
102 |
+
"𢵌 19\n",
|
103 |
+
"𦧺 18\n",
|
104 |
+
"𠹌 18\n",
|
105 |
+
"爲 16\n",
|
106 |
+
"𢱑 16\n",
|
107 |
+
"𡁯 15\n",
|
108 |
+
"𠱓 14\n",
|
109 |
+
"𠵿 14\n",
|
110 |
+
"踹 13\n",
|
111 |
+
"㗇 13\n",
|
112 |
+
"𠾴 13\n",
|
113 |
+
"嗍 13\n",
|
114 |
+
"𧘹 13\n",
|
115 |
+
"𠹳 12\n",
|
116 |
+
"𠹭 12\n",
|
117 |
+
"脫 12\n",
|
118 |
+
"䁪 11\n",
|
119 |
+
"𧨾 11\n",
|
120 |
+
"掬 11\n",
|
121 |
+
"𠸐 11\n",
|
122 |
+
"啥 11\n",
|
123 |
+
"𠱃 10\n",
|
124 |
+
"噔 10\n",
|
125 |
+
"捹 10\n",
|
126 |
+
"𠹻 10\n",
|
127 |
+
"𠼻 10\n",
|
128 |
+
"噠 10\n",
|
129 |
+
"𨳊 10\n",
|
130 |
+
"𢲲 9\n",
|
131 |
+
"𨉖 9\n",
|
132 |
+
"躭 9\n",
|
133 |
+
"䠋 9\n",
|
134 |
+
"嘮 9\n",
|
135 |
+
"啽 9\n",
|
136 |
+
"滮 8\n",
|
137 |
+
"㧻 8\n",
|
138 |
+
"𧶄 8\n",
|
139 |
+
"𦛚 8\n",
|
140 |
+
"撠 8\n",
|
141 |
+
"呡 8\n",
|
142 |
+
"睸 8\n",
|
143 |
+
"𠰲 8\n",
|
144 |
+
"𥔿 8\n",
|
145 |
+
"唎 8\n",
|
146 |
+
"𠸊 8\n",
|
147 |
+
"𬜐 8\n",
|
148 |
+
"蔥 8\n",
|
149 |
+
"呱 8\n",
|
150 |
+
"B 7\n",
|
151 |
+
"𢯊 7\n",
|
152 |
+
"𫫃 7\n",
|
153 |
+
"𢝵 7\n",
|
154 |
+
"銹 7\n",
|
155 |
+
"㓤 7\n",
|
156 |
+
"䁯 7\n",
|
157 |
+
"啉 7\n",
|
158 |
+
"臥 7\n",
|
159 |
+
"𠓼 7\n",
|
160 |
+
"稅 7\n",
|
161 |
+
" 7\n",
|
162 |
+
"喴 7\n",
|
163 |
+
"噱 7\n",
|
164 |
+
"衛 6\n",
|
165 |
+
"𡄯 6\n",
|
166 |
+
"揤 6\n",
|
167 |
+
"𢤹 6\n",
|
168 |
+
" 6\n",
|
169 |
+
"鷄 6\n",
|
170 |
+
"湴 6\n",
|
171 |
+
" 6\n",
|
172 |
+
"𦣇 6\n",
|
173 |
+
"齧 6\n",
|
174 |
+
"𠮨 6\n",
|
175 |
+
" 6\n",
|
176 |
+
"𡀝 6\n",
|
177 |
+
"婄 6\n",
|
178 |
+
"𠼱 6\n",
|
179 |
+
"𠱂 5\n",
|
180 |
+
"磧 5\n",
|
181 |
+
"𠰋 5\n",
|
182 |
+
"𡂖 5\n",
|
183 |
+
"浭 5\n",
|
184 |
+
"擏 5\n",
|
185 |
+
"𥋇 5\n",
|
186 |
+
"揢 5\n",
|
187 |
+
"㨆 5\n",
|
188 |
+
"𠾍 5\n",
|
189 |
+
"兌 5\n",
|
190 |
+
"𢺳 5\n",
|
191 |
+
"坺 5\n",
|
192 |
+
"鍚 5\n",
|
193 |
+
"𣘚 5\n",
|
194 |
+
"𪘁 5\n",
|
195 |
+
"𨳍 5\n",
|
196 |
+
"嗙 5\n",
|
197 |
+
"𠼰 5\n",
|
198 |
+
"𨳒 4\n",
|
199 |
+
"唿 4\n",
|
200 |
+
"𣳼 4\n",
|
201 |
+
"𦂥 4\n",
|
202 |
+
"溚 4\n",
|
203 |
+
"囋 4\n",
|
204 |
+
"瀄 4\n",
|
205 |
+
"𠌥 4\n",
|
206 |
+
"𢫦 4\n",
|
207 |
+
"𢶍 4\n",
|
208 |
+
"𠲵 4\n",
|
209 |
+
"䉺 4\n",
|
210 |
+
"炕 4\n",
|
211 |
+
"𢴈 4\n",
|
212 |
+
"𡲢 4\n",
|
213 |
+
"𥅈 4\n",
|
214 |
+
"𬧊 4\n",
|
215 |
+
"簕 4\n",
|
216 |
+
"査 4\n",
|
217 |
+
"𩜠 4\n",
|
218 |
+
"𫬿 4\n",
|
219 |
+
"𠜱 4\n",
|
220 |
+
"嚬 4\n",
|
221 |
+
"𠹹 4\n",
|
222 |
+
"𦉘 4\n",
|
223 |
+
"唦 4\n",
|
224 |
+
"㨘 4\n",
|
225 |
+
"𡄽 4\n",
|
226 |
+
"熗 4\n",
|
227 |
+
"𡁷 4\n",
|
228 |
+
"𠿬 4\n",
|
229 |
+
"咜 4\n",
|
230 |
+
"𠸏 4\n",
|
231 |
+
"𡁸 4\n",
|
232 |
+
"𡃵 4\n",
|
233 |
+
"𪚩 4\n",
|
234 |
+
"D 4\n",
|
235 |
+
"Q 4\n",
|
236 |
+
"𨆯 3\n",
|
237 |
+
"啗 3\n",
|
238 |
+
"蔸 3\n",
|
239 |
+
"舗 3\n",
|
240 |
+
"囪 3\n",
|
241 |
+
"艔 3\n",
|
242 |
+
"洩 3\n",
|
243 |
+
"𢵧 3\n",
|
244 |
+
"菓 3\n",
|
245 |
+
"䪴 3\n",
|
246 |
+
"䆲 3\n",
|
247 |
+
"痱 3\n",
|
248 |
+
"趿 3\n",
|
249 |
+
"𠮩 3\n",
|
250 |
+
"搉 3\n",
|
251 |
+
"矋 3\n",
|
252 |
+
"𠻗 3\n",
|
253 |
+
"𢲈 3\n",
|
254 |
+
"潞 3\n",
|
255 |
+
"沬 3\n",
|
256 |
+
"揇 3\n",
|
257 |
+
"齃 3\n",
|
258 |
+
"𡃤 3\n",
|
259 |
+
"𡃶 3\n",
|
260 |
+
"瀟 3\n",
|
261 |
+
"軨 3\n",
|
262 |
+
"鉻 3\n",
|
263 |
+
" 3\n",
|
264 |
+
"㿭 3\n",
|
265 |
+
"𢵄 3\n",
|
266 |
+
"㗲 3\n",
|
267 |
+
"𢫕 3\n",
|
268 |
+
"𢰸 3\n",
|
269 |
+
"葫 3\n",
|
270 |
+
"咔 3\n",
|
271 |
+
"嚎 3\n",
|
272 |
+
"嗿 3\n",
|
273 |
+
"咈 3\n",
|
274 |
+
"咾 3\n",
|
275 |
+
" 3\n",
|
276 |
+
"𠵈 3\n",
|
277 |
+
"吥 3\n",
|
278 |
+
"𠾭 3\n",
|
279 |
+
"𠾵 3\n",
|
280 |
+
"朘 3\n",
|
281 |
+
"觥 3\n",
|
282 |
+
"㩧 2\n",
|
283 |
+
"焙 2\n",
|
284 |
+
"兀 2\n",
|
285 |
+
"䭤 2\n",
|
286 |
+
"饊 2\n",
|
287 |
+
"[ 2\n",
|
288 |
+
"] 2\n",
|
289 |
+
"炖 2\n",
|
290 |
+
"争 2\n",
|
291 |
+
"䁓 2\n",
|
292 |
+
"𡂝 2\n",
|
293 |
+
"𩬎 2\n",
|
294 |
+
"鈒 2\n",
|
295 |
+
"亁 2\n",
|
296 |
+
"炠 2\n",
|
297 |
+
"摼 2\n",
|
298 |
+
"𠺬 2\n",
|
299 |
+
"𠵉 2\n",
|
300 |
+
"蝄 2\n",
|
301 |
+
" 2\n",
|
302 |
+
"蔫 2\n",
|
303 |
+
"㘉 2\n",
|
304 |
+
"荏 2\n",
|
305 |
+
"墘 2\n",
|
306 |
+
"嗏 2\n",
|
307 |
+
"呣 2\n",
|
308 |
+
"曚 2\n",
|
309 |
+
"壬 2\n",
|
310 |
+
"揅 2\n",
|
311 |
+
"溼 2\n",
|
312 |
+
"囓 2\n",
|
313 |
+
"嚙 2\n",
|
314 |
+
"枴 2\n",
|
315 |
+
"𡃀 2\n",
|
316 |
+
"饑 2\n",
|
317 |
+
"䏭 2\n",
|
318 |
+
"挼 2\n",
|
319 |
+
"掱 2\n",
|
320 |
+
"咑 2\n",
|
321 |
+
"芙 2\n",
|
322 |
+
"𦂗 2\n",
|
323 |
+
"舦 2\n",
|
324 |
+
"𢶤 2\n",
|
325 |
+
"翡 2\n",
|
326 |
+
"翠 2\n",
|
327 |
+
"酡 2\n",
|
328 |
+
"𫭊 2\n",
|
329 |
+
"煀 2\n",
|
330 |
+
"耙 2\n",
|
331 |
+
"𠿭 2\n",
|
332 |
+
"鉤 2\n",
|
333 |
+
"𠻘 2\n",
|
334 |
+
"脽 2\n",
|
335 |
+
"焊 2\n",
|
336 |
+
"唊 2\n",
|
337 |
+
"胅 2\n",
|
338 |
+
"翕 2\n",
|
339 |
+
"摜 2\n",
|
340 |
+
"僚 1\n",
|
341 |
+
"𩗴 1\n",
|
342 |
+
"毡 1\n",
|
343 |
+
"跤 1\n",
|
344 |
+
"梧 1\n",
|
345 |
+
"痄 1\n",
|
346 |
+
"卟 1\n",
|
347 |
+
"劄 1\n",
|
348 |
+
"𠶜 1\n",
|
349 |
+
"睜 1\n",
|
350 |
+
"迹 1\n",
|
351 |
+
"揃 1\n",
|
352 |
+
"唨 1\n",
|
353 |
+
"謢 1\n",
|
354 |
+
"菻 1\n",
|
355 |
+
"𣚺 1\n",
|
356 |
+
"鷓 1\n",
|
357 |
+
"鴣 1\n",
|
358 |
+
"强 1\n",
|
359 |
+
"𠾶 1\n",
|
360 |
+
"𡆀 1\n",
|
361 |
+
"拫 1\n",
|
362 |
+
"𠼮 1\n",
|
363 |
+
"汞 1\n",
|
364 |
+
"㤿 1\n",
|
365 |
+
"厴 1\n",
|
366 |
+
"𥀬 1\n",
|
367 |
+
"牯 1\n",
|
368 |
+
"𡇙 1\n",
|
369 |
+
"讕 1\n",
|
370 |
+
"𠿫 1\n",
|
371 |
+
"瘺 1\n",
|
372 |
+
"骲 1\n",
|
373 |
+
"𫲭 1\n",
|
374 |
+
"瓏 1\n",
|
375 |
+
"繚 1\n",
|
376 |
+
"撿 1\n",
|
377 |
+
"跀 1\n",
|
378 |
+
"𢛴 1\n",
|
379 |
+
"蝻 1\n",
|
380 |
+
"赧 1\n",
|
381 |
+
"𪙛 1\n",
|
382 |
+
" 1\n",
|
383 |
+
"檳 1\n",
|
384 |
+
"潲 1\n",
|
385 |
+
"𢶠 1\n",
|
386 |
+
"秧 1\n",
|
387 |
+
"蒔 1\n",
|
388 |
+
"炩 1\n",
|
389 |
+
"㩋 1\n",
|
390 |
+
"饅 1\n",
|
391 |
+
"鍍 1\n",
|
392 |
+
"𢚖 1\n",
|
393 |
+
"𧊅 1\n",
|
394 |
+
" 1\n",
|
395 |
+
"篸 1\n",
|
396 |
+
"𩟔 1\n",
|
397 |
+
"撍 1\n",
|
398 |
+
"栗 1\n",
|
399 |
+
" 1\n",
|
400 |
+
"𡆇 1\n",
|
401 |
+
"杧 1\n",
|
402 |
+
"榛 1\n",
|
403 |
+
"蠄 1\n",
|
404 |
+
"蟧 1\n",
|
405 |
+
"嘶 1\n",
|
406 |
+
"梆 1\n",
|
407 |
+
"竪 1\n",
|
408 |
+
"騾 1\n",
|
409 |
+
"矺 1\n",
|
410 |
+
"堀 1\n",
|
411 |
+
"麝 1\n",
|
412 |
+
"慪 1\n",
|
413 |
+
"撴 1\n",
|
414 |
+
"哾 1\n",
|
415 |
+
"𠳖 1\n",
|
416 |
+
"洌 1\n",
|
417 |
+
"霹 1\n",
|
418 |
+
"𠾼 1\n",
|
419 |
+
"𬦠 1\n",
|
420 |
+
"𤌍 1\n",
|
421 |
+
"𬧯 1\n",
|
422 |
+
"厠 1\n",
|
423 |
+
"㖡 1\n",
|
424 |
+
"跁 1\n",
|
425 |
+
"鉎 1\n",
|
426 |
+
"𧣈 1\n",
|
427 |
+
"𠳏 1\n",
|
428 |
+
"㹃 1\n",
|
429 |
+
"𧝞 1\n",
|
430 |
+
"𡀞 1\n",
|
431 |
+
"㦒 1\n",
|
432 |
+
"𩩍 1\n",
|
433 |
+
"𢱢 1\n",
|
434 |
+
"鍟 1\n",
|
435 |
+
"煱 1\n",
|
436 |
+
"撘 1\n",
|
437 |
+
"閱 1\n",
|
438 |
+
"橇 1\n",
|
439 |
+
"籽 1\n",
|
440 |
+
"庵 1\n",
|
441 |
+
"厨 1\n",
|
442 |
+
"疴 1\n",
|
443 |
+
"豹 1\n",
|
444 |
+
"杠 1\n",
|
445 |
+
"咘 1\n",
|
446 |
+
"裡 1\n",
|
447 |
+
"熏 1\n",
|
448 |
+
" 1\n"
|
449 |
+
]
|
450 |
+
}
|
451 |
+
],
|
452 |
+
"source": [
|
453 |
+
"from collections import defaultdict\n",
|
454 |
+
"\n",
|
455 |
+
"rare_can_charset = defaultdict(int)\n",
|
456 |
+
"for line in can_lines:\n",
|
457 |
+
" for c in line:\n",
|
458 |
+
" if not c in common_can_charset:\n",
|
459 |
+
" rare_can_charset[c] += 1\n",
|
460 |
+
"\n",
|
461 |
+
"print(f\"Found {len(rare_can_charset)} rare Cantonese characters\")\n",
|
462 |
+
"\n",
|
463 |
+
"charset_sort_by_freq = dict(sorted(rare_can_charset.items(), key=lambda item: -item[1]))\n",
|
464 |
+
"for c, freq in charset_sort_by_freq.items():\n",
|
465 |
+
" print(c, freq)"
|
466 |
+
]
|
467 |
+
},
|
468 |
+
{
|
469 |
+
"cell_type": "code",
|
470 |
+
"execution_count": 46,
|
471 |
+
"metadata": {},
|
472 |
+
"outputs": [
|
473 |
+
{
|
474 |
+
"name": "stdout",
|
475 |
+
"output_type": "stream",
|
476 |
+
"text": [
|
477 |
+
"Found 12360 normalized mappings\n"
|
478 |
+
]
|
479 |
+
}
|
480 |
+
],
|
481 |
+
"source": [
|
482 |
+
"char_to_normalized_char = {}\n",
|
483 |
+
"\n",
|
484 |
+
"with open(\"zh_char2str_mapping.txt\", \"r\") as input_file:\n",
|
485 |
+
" for line in input_file.read().splitlines():\n",
|
486 |
+
" [c, n] = line.split(\"\\t\")\n",
|
487 |
+
" char_to_normalized_char[c] = n\n",
|
488 |
+
"\n",
|
489 |
+
"print(\"Found {} normalized mappings\".format(len(char_to_normalized_char)))"
|
490 |
+
]
|
491 |
+
},
|
492 |
+
{
|
493 |
+
"cell_type": "code",
|
494 |
+
"execution_count": 49,
|
495 |
+
"metadata": {},
|
496 |
+
"outputs": [
|
497 |
+
{
|
498 |
+
"name": "stdout",
|
499 |
+
"output_type": "stream",
|
500 |
+
"text": [
|
501 |
+
"𠹺\t埋\t388\n",
|
502 |
+
"噖\t琴\t162\n",
|
503 |
+
"𡁵\t緊\t157\n",
|
504 |
+
"𠶧\t掂\t88\n",
|
505 |
+
"嚫\t親\t88\n",
|
506 |
+
"屘\t尾\t57\n",
|
507 |
+
"衭\t衤夫\t47\n",
|
508 |
+
"贃\t賺\t43\n",
|
509 |
+
"說\t???\t35\n",
|
510 |
+
"𧵳\t???\t30\n",
|
511 |
+
"歳\t歲\t27\n",
|
512 |
+
"𢫏\t全\t27\n",
|
513 |
+
"𨶙\t能\t25\n",
|
514 |
+
"癐\t???\t25\n",
|
515 |
+
"𦡆\t???\t25\n",
|
516 |
+
"𨃩\t⻊扇\t24\n",
|
517 |
+
"况\t???\t21\n",
|
518 |
+
"内\t內\t19\n",
|
519 |
+
"𢵌\t扌隊\t19\n",
|
520 |
+
"𦧺\t賴\t18\n",
|
521 |
+
"𠹌\t o能\t18\n",
|
522 |
+
"爲\t為\t16\n",
|
523 |
+
"𢱑\t抓\t16\n",
|
524 |
+
"𡁯\t???\t15\n",
|
525 |
+
"𠱓\t詭\t14\n",
|
526 |
+
"𠵿\t披\t14\n",
|
527 |
+
"踹\t???\t13\n",
|
528 |
+
"㗇\t???\t13\n",
|
529 |
+
"𠾴\t棒\t13\n",
|
530 |
+
"嗍\t索\t13\n",
|
531 |
+
"𧘹\t太\t13\n",
|
532 |
+
"𠹳\t傑\t12\n",
|
533 |
+
"𠹭\t???\t12\n",
|
534 |
+
"脫\t???\t12\n",
|
535 |
+
"䁪\t???\t11\n",
|
536 |
+
"𧨾\t氹\t11\n",
|
537 |
+
"掬\t???\t11\n",
|
538 |
+
"𠸐\t???\t11\n",
|
539 |
+
"啥\t???\t11\n",
|
540 |
+
"𠱃\t o凹\t10\n",
|
541 |
+
"噔\t o登\t10\n",
|
542 |
+
"捹\t扌奔\t10\n",
|
543 |
+
"𠹻\t???\t10\n",
|
544 |
+
"𠼻\t基\t10\n",
|
545 |
+
"噠\t???\t10\n",
|
546 |
+
"𨳊\t九\t10\n",
|
547 |
+
"𢲲\t???\t9\n",
|
548 |
+
"𨉖\t???\t9\n",
|
549 |
+
"躭\t耽\t9\n",
|
550 |
+
"䠋\t卑\t9\n",
|
551 |
+
"嘮\t???\t9\n",
|
552 |
+
"啽\t o弇\t9\n",
|
553 |
+
"滮\t氵彪\t8\n",
|
554 |
+
"㧻\t扌涿\t8\n",
|
555 |
+
"𧶄\t???\t8\n",
|
556 |
+
"𦛚\t???\t8\n",
|
557 |
+
"撠\t扌戟\t8\n",
|
558 |
+
"呡\t o吻\t8\n",
|
559 |
+
"睸\t目眉\t8\n",
|
560 |
+
"𠰲\t???\t8\n",
|
561 |
+
"𥔿\t???\t8\n",
|
562 |
+
"唎\t脷\t8\n",
|
563 |
+
"𠸊\t???\t8\n",
|
564 |
+
"𬜐\t???\t8\n",
|
565 |
+
"蔥\t葱\t8\n",
|
566 |
+
"呱\t???\t8\n",
|
567 |
+
"B\t???\t7\n",
|
568 |
+
"𢯊\t扌的\t7\n",
|
569 |
+
"𫫃\t???\t7\n",
|
570 |
+
"𢝵\t???\t7\n",
|
571 |
+
"銹\t鏽\t7\n",
|
572 |
+
"㓤\t吉刂\t7\n",
|
573 |
+
"䁯\t???\t7\n",
|
574 |
+
"啉\t o林\t7\n",
|
575 |
+
"臥\t???\t7\n",
|
576 |
+
"𠓼\t???\t7\n",
|
577 |
+
"稅\t???\t7\n",
|
578 |
+
"\t???\t7\n",
|
579 |
+
"喴\t o威\t7\n",
|
580 |
+
"噱\t???\t7\n",
|
581 |
+
"衛\t???\t6\n",
|
582 |
+
"𡄯\t???\t6\n",
|
583 |
+
"揤\t扌即\t6\n",
|
584 |
+
"𢤹\t???\t6\n",
|
585 |
+
"\t???\t6\n",
|
586 |
+
"鷄\t雞\t6\n",
|
587 |
+
"湴\t氵並\t6\n",
|
588 |
+
"\t???\t6\n",
|
589 |
+
"𦣇\t???\t6\n",
|
590 |
+
"齧\t咬\t6\n",
|
591 |
+
"𠮨\t乃\t6\n",
|
592 |
+
"\t???\t6\n",
|
593 |
+
"𡀝\t???\t6\n",
|
594 |
+
"婄\t蓓\t6\n",
|
595 |
+
"𠼱\t累\t6\n",
|
596 |
+
"𠱂\t???\t5\n",
|
597 |
+
"磧\t石責\t5\n",
|
598 |
+
"𠰋\t???\t5\n",
|
599 |
+
"𡂖\t???\t5\n",
|
600 |
+
"浭\t氵更\t5\n",
|
601 |
+
"擏\t擎\t5\n",
|
602 |
+
"𥋇\t掌\t5\n",
|
603 |
+
"揢\t扌客\t5\n",
|
604 |
+
"㨆\t扌林\t5\n",
|
605 |
+
"𠾍\t棄\t5\n",
|
606 |
+
"兌\t???\t5\n",
|
607 |
+
"𢺳\t???\t5\n",
|
608 |
+
"坺\t土拔\t5\n",
|
609 |
+
"鍚\t???\t5\n",
|
610 |
+
"𣘚\t???\t5\n",
|
611 |
+
"𪘁\t???\t5\n",
|
612 |
+
"𨳍\t七\t5\n",
|
613 |
+
"嗙\t o旁\t5\n",
|
614 |
+
"𠼰\t???\t5\n",
|
615 |
+
"𨳒\t小\t4\n",
|
616 |
+
"唿\t篋\t4\n",
|
617 |
+
"𣳼\t???\t4\n",
|
618 |
+
"𦂥\t???\t4\n",
|
619 |
+
"溚\t塔\t4\n",
|
620 |
+
"囋\t???\t4\n",
|
621 |
+
"瀄\t吱\t4\n",
|
622 |
+
"𠌥\t???\t4\n",
|
623 |
+
"𢫦\t???\t4\n",
|
624 |
+
"𢶍\t???\t4\n",
|
625 |
+
"𠲵\t???\t4\n",
|
626 |
+
"䉺\t米\t4\n",
|
627 |
+
"炕\t???\t4\n",
|
628 |
+
"𢴈\t撻\t4\n",
|
629 |
+
"𡲢\t???\t4\n",
|
630 |
+
"𥅈\t立\t4\n",
|
631 |
+
"𬧊\t???\t4\n",
|
632 |
+
"簕\t勒\t4\n",
|
633 |
+
"査\t查\t4\n",
|
634 |
+
"𩜠\t岩\t4\n",
|
635 |
+
"𫬿\t???\t4\n",
|
636 |
+
"𠜱\t卑刂\t4\n",
|
637 |
+
"嚬\t顰\t4\n",
|
638 |
+
"𠹹\t???\t4\n",
|
639 |
+
"𦉘\t???\t4\n",
|
640 |
+
"唦\t o沙\t4\n",
|
641 |
+
"㨘\t扌省\t4\n",
|
642 |
+
"𡄽\t瀉\t4\n",
|
643 |
+
"熗\t槍\t4\n",
|
644 |
+
"𡁷\t???\t4\n",
|
645 |
+
"𠿬\t???\t4\n",
|
646 |
+
"咜\t叱\t4\n",
|
647 |
+
"𠸏\t茄\t4\n",
|
648 |
+
"𡁸\t???\t4\n",
|
649 |
+
"𡃵\t???\t4\n",
|
650 |
+
"𪚩\t???\t4\n",
|
651 |
+
"D\t???\t4\n",
|
652 |
+
"Q\t???\t4\n",
|
653 |
+
"𨆯\t???\t3\n",
|
654 |
+
"啗\t啖\t3\n",
|
655 |
+
"蔸\t艹兜\t3\n",
|
656 |
+
"舗\t鋪\t3\n",
|
657 |
+
"囪\t窗\t3\n",
|
658 |
+
"艔\t???\t3\n",
|
659 |
+
"洩\t???\t3\n",
|
660 |
+
"𢵧\t???\t3\n",
|
661 |
+
"菓\t果\t3\n",
|
662 |
+
"䪴\t???\t3\n",
|
663 |
+
"䆲\t???\t3\n",
|
664 |
+
"痱\t???\t3\n",
|
665 |
+
"趿\t拖\t3\n",
|
666 |
+
"𠮩\t???\t3\n",
|
667 |
+
"搉\t確\t3\n",
|
668 |
+
"矋\t矖\t3\n",
|
669 |
+
"𠻗\t???\t3\n",
|
670 |
+
"𢲈\t???\t3\n",
|
671 |
+
"潞\t氵路\t3\n",
|
672 |
+
"沬\t???\t3\n",
|
673 |
+
"揇\t扌南\t3\n",
|
674 |
+
"齃\t曷\t3\n",
|
675 |
+
"𡃤\t賴\t3\n",
|
676 |
+
"𡃶\t???\t3\n",
|
677 |
+
"瀟\t???\t3\n",
|
678 |
+
"軨\t???\t3\n",
|
679 |
+
"鉻\t???\t3\n",
|
680 |
+
"\t???\t3\n",
|
681 |
+
"㿭\t斥\t3\n",
|
682 |
+
"𢵄\t???\t3\n",
|
683 |
+
"㗲\t???\t3\n",
|
684 |
+
"𢫕\t???\t3\n",
|
685 |
+
"𢰸\t???\t3\n",
|
686 |
+
"葫\t???\t3\n",
|
687 |
+
"咔\t???\t3\n",
|
688 |
+
"嚎\t???\t3\n",
|
689 |
+
"嗿\t???\t3\n",
|
690 |
+
"咈\t o弗\t3\n",
|
691 |
+
"咾\t嚕\t3\n",
|
692 |
+
"\t???\t3\n",
|
693 |
+
"𠵈\t妹\t3\n",
|
694 |
+
"吥\t o不\t3\n",
|
695 |
+
"𠾭\t???\t3\n",
|
696 |
+
"𠾵\t???\t3\n",
|
697 |
+
"朘\t俊\t3\n",
|
698 |
+
"觥\t黃\t3\n",
|
699 |
+
"㩧\t扌暴\t2\n",
|
700 |
+
"焙\t???\t2\n",
|
701 |
+
"兀\t???\t2\n",
|
702 |
+
"䭤\t???\t2\n",
|
703 |
+
"饊\t???\t2\n",
|
704 |
+
"[\t???\t2\n",
|
705 |
+
"]\t???\t2\n",
|
706 |
+
"炖\t???\t2\n",
|
707 |
+
"争\t爭\t2\n",
|
708 |
+
"䁓\t???\t2\n",
|
709 |
+
"𡂝\t???\t2\n",
|
710 |
+
"𩬎\t壬\t2\n",
|
711 |
+
"鈒\t閘\t2\n",
|
712 |
+
"亁\t乾\t2\n",
|
713 |
+
"炠\t灬甲\t2\n",
|
714 |
+
"摼\t???\t2\n",
|
715 |
+
"𠺬\t???\t2\n",
|
716 |
+
"𠵉\t???\t2\n",
|
717 |
+
"蝄\t???\t2\n",
|
718 |
+
"\t???\t2\n",
|
719 |
+
"蔫\t艹焉\t2\n",
|
720 |
+
"㘉\t???\t2\n",
|
721 |
+
"荏\t???\t2\n",
|
722 |
+
"墘\t土乾\t2\n",
|
723 |
+
"嗏\t搽\t2\n",
|
724 |
+
"呣\t o母\t2\n",
|
725 |
+
"曚\t矇\t2\n",
|
726 |
+
"壬\t???\t2\n",
|
727 |
+
"揅\t研\t2\n",
|
728 |
+
"溼\t濕\t2\n",
|
729 |
+
"囓\t咬\t2\n",
|
730 |
+
"嚙\t咬\t2\n",
|
731 |
+
"枴\t拐\t2\n",
|
732 |
+
"𡃀\t???\t2\n",
|
733 |
+
"饑\t???\t2\n",
|
734 |
+
"䏭\t???\t2\n",
|
735 |
+
"挼\t挪\t2\n",
|
736 |
+
"掱\t???\t2\n",
|
737 |
+
"咑\t打\t2\n",
|
738 |
+
"芙\t???\t2\n",
|
739 |
+
"𦂗\t???\t2\n",
|
740 |
+
"舦\t軚\t2\n",
|
741 |
+
"𢶤\t扌靴\t2\n",
|
742 |
+
"翡\t???\t2\n",
|
743 |
+
"翠\t???\t2\n",
|
744 |
+
"酡\t酉它\t2\n",
|
745 |
+
"𫭊\t???\t2\n",
|
746 |
+
"煀\t火屈\t2\n",
|
747 |
+
"耙\t???\t2\n",
|
748 |
+
"𠿭\t滑\t2\n",
|
749 |
+
"鉤\t鈎\t2\n",
|
750 |
+
"𠻘\t???\t2\n",
|
751 |
+
"脽\t離\t2\n",
|
752 |
+
"焊\t???\t2\n",
|
753 |
+
"唊\t o夾\t2\n",
|
754 |
+
"胅\t⺼失\t2\n",
|
755 |
+
"翕\t???\t2\n",
|
756 |
+
"摜\t摔\t2\n",
|
757 |
+
"僚\t???\t1\n",
|
758 |
+
"𩗴\t???\t1\n",
|
759 |
+
"毡\t???\t1\n",
|
760 |
+
"跤\t???\t1\n",
|
761 |
+
"梧\t???\t1\n",
|
762 |
+
"痄\t疒乍\t1\n",
|
763 |
+
"卟\t卜\t1\n",
|
764 |
+
"劄\t札\t1\n",
|
765 |
+
"𠶜\t制\t1\n",
|
766 |
+
"睜\t???\t1\n",
|
767 |
+
"迹\t跡\t1\n",
|
768 |
+
"揃\t扌前\t1\n",
|
769 |
+
"唨\t o阻\t1\n",
|
770 |
+
"謢\t護\t1\n",
|
771 |
+
"菻\t麻\t1\n",
|
772 |
+
"𣚺\t???\t1\n",
|
773 |
+
"鷓\t庶鳥\t1\n",
|
774 |
+
"鴣\t古鳥\t1\n",
|
775 |
+
"强\t???\t1\n",
|
776 |
+
"𠾶\t???\t1\n",
|
777 |
+
"𡆀\t轟\t1\n",
|
778 |
+
"拫\t扌艮\t1\n",
|
779 |
+
"𠼮\t偽\t1\n",
|
780 |
+
"汞\t???\t1\n",
|
781 |
+
"㤿\t???\t1\n",
|
782 |
+
"厴\t???\t1\n",
|
783 |
+
"𥀬\t???\t1\n",
|
784 |
+
"牯\t???\t1\n",
|
785 |
+
"𡇙\t???\t1\n",
|
786 |
+
"讕\t賴\t1\n",
|
787 |
+
"𠿫\t???\t1\n",
|
788 |
+
"瘺\t婁\t1\n",
|
789 |
+
"骲\t骨包\t1\n",
|
790 |
+
"𫲭\t???\t1\n",
|
791 |
+
"瓏\t玉龍\t1\n",
|
792 |
+
"繚\t???\t1\n",
|
793 |
+
"撿\t???\t1\n",
|
794 |
+
"跀\t⻊月\t1\n",
|
795 |
+
"𢛴\t掹\t1\n",
|
796 |
+
"蝻\t虫南\t1\n",
|
797 |
+
"赧\t羞赤\t1\n",
|
798 |
+
"𪙛\t甩\t1\n",
|
799 |
+
"\t???\t1\n",
|
800 |
+
"檳\t???\t1\n",
|
801 |
+
"潲\t餿\t1\n",
|
802 |
+
"𢶠\t???\t1\n",
|
803 |
+
"秧\t???\t1\n",
|
804 |
+
"蒔\t???\t1\n",
|
805 |
+
"炩\t灬令\t1\n",
|
806 |
+
"㩋\t???\t1\n",
|
807 |
+
"饅\t???\t1\n",
|
808 |
+
"鍍\t???\t1\n",
|
809 |
+
"𢚖\t???\t1\n",
|
810 |
+
"𧊅\t虫另\t1\n",
|
811 |
+
"\t???\t1\n",
|
812 |
+
"篸\t???\t1\n",
|
813 |
+
"𩟔\t???\t1\n",
|
814 |
+
"撍\t賺\t1\n",
|
815 |
+
"栗\t???\t1\n",
|
816 |
+
"\t???\t1\n",
|
817 |
+
"𡆇\t???\t1\n",
|
818 |
+
"杧\t芒\t1\n",
|
819 |
+
"榛\t???\t1\n",
|
820 |
+
"蠄\t虫禽\t1\n",
|
821 |
+
"蟧\t???\t1\n",
|
822 |
+
"嘶\t???\t1\n",
|
823 |
+
"梆\t???\t1\n",
|
824 |
+
"竪\t豎\t1\n",
|
825 |
+
"騾\t???\t1\n",
|
826 |
+
"矺\t???\t1\n",
|
827 |
+
"堀\t???\t1\n",
|
828 |
+
"麝\t???\t1\n",
|
829 |
+
"慪\t嘔\t1\n",
|
830 |
+
"撴\t扌敦\t1\n",
|
831 |
+
"哾\t啜\t1\n",
|
832 |
+
"𠳖\t???\t1\n",
|
833 |
+
"洌\t冽\t1\n",
|
834 |
+
"霹\t???\t1\n",
|
835 |
+
"𠾼\t???\t1\n",
|
836 |
+
"𬦠\t???\t1\n",
|
837 |
+
"𤌍\t???\t1\n",
|
838 |
+
"𬧯\t???\t1\n",
|
839 |
+
"厠\t廁\t1\n",
|
840 |
+
"㖡\t???\t1\n",
|
841 |
+
"跁\t⻊巴\t1\n",
|
842 |
+
"鉎\t???\t1\n",
|
843 |
+
"𧣈\t???\t1\n",
|
844 |
+
"𠳏\t???\t1\n",
|
845 |
+
"㹃\t非\t1\n",
|
846 |
+
"𧝞\t???\t1\n",
|
847 |
+
"𡀞\t???\t1\n",
|
848 |
+
"㦒\t???\t1\n",
|
849 |
+
"𩩍\t娉\t1\n",
|
850 |
+
"𢱢\t???\t1\n",
|
851 |
+
"鍟\t???\t1\n",
|
852 |
+
"煱\t???\t1\n",
|
853 |
+
"撘\t搭\t1\n",
|
854 |
+
"閱\t???\t1\n",
|
855 |
+
"橇\t喬\t1\n",
|
856 |
+
"籽\t???\t1\n",
|
857 |
+
"庵\t???\t1\n",
|
858 |
+
"厨\t???\t1\n",
|
859 |
+
"疴\t屙\t1\n",
|
860 |
+
"豹\t???\t1\n",
|
861 |
+
"杠\t槓\t1\n",
|
862 |
+
"咘\t o布\t1\n",
|
863 |
+
"裡\t???\t1\n",
|
864 |
+
"熏\t燻\t1\n",
|
865 |
+
"\t???\t1\n"
|
866 |
+
]
|
867 |
+
}
|
868 |
+
],
|
869 |
+
"source": [
|
870 |
+
"for c, freq in charset_sort_by_freq.items():\n",
|
871 |
+
" if c in char_to_normalized_char:\n",
|
872 |
+
" print(c + \"\\t\" + char_to_normalized_char[c] + \"\\t\" + str(freq))\n",
|
873 |
+
" else:\n",
|
874 |
+
" print(c + \"\\t\" + \"???\" + \"\\t\" + str(freq))"
|
875 |
+
]
|
876 |
+
},
|
877 |
+
{
|
878 |
+
"cell_type": "code",
|
879 |
+
"execution_count": 57,
|
880 |
+
"metadata": {},
|
881 |
+
"outputs": [
|
882 |
+
{
|
883 |
+
"name": "stdout",
|
884 |
+
"output_type": "stream",
|
885 |
+
"text": [
|
886 |
+
"Loaded 177 normalization mappings\n",
|
887 |
+
"Sample of first 10 highest frequency mappings:\n",
|
888 |
+
"[('𠹺', '埋'), ('噖', '琴'), ('𡁵', '緊'), ('𠶧', '掂'), ('嚫', '親'), ('屘', '尾'), ('衭', '褲'), ('贃', '賺'), ('說', '説'), ('𧵳', '蝕')]\n"
|
889 |
+
]
|
890 |
+
}
|
891 |
+
],
|
892 |
+
"source": [
|
893 |
+
"abc_mapping = {}\n",
|
894 |
+
"\n",
|
895 |
+
"with open(\"abc_rare_char_mapping.txt\", \"r\") as input_file:\n",
|
896 |
+
" for line in input_file.read().splitlines():\n",
|
897 |
+
" [c, n, freq] = line.split(\"\\t\")\n",
|
898 |
+
" if len(n) == 1:\n",
|
899 |
+
" abc_mapping[c] = n\n",
|
900 |
+
"\n",
|
901 |
+
"print(\"Loaded {} normalization mappings\".format(len(abc_mapping)))\n",
|
902 |
+
"print(\"Sample of first 10 highest frequency mappings:\")\n",
|
903 |
+
"print(list(abc_mapping.items())[:10])"
|
904 |
+
]
|
905 |
+
},
|
906 |
+
{
|
907 |
+
"cell_type": "code",
|
908 |
+
"execution_count": 58,
|
909 |
+
"metadata": {},
|
910 |
+
"outputs": [],
|
911 |
+
"source": [
|
912 |
+
"# replace all occurence of rare characters with normalized ones\n",
|
913 |
+
"def normalize_abc(line: str) -> str:\n",
|
914 |
+
" for c, n in abc_mapping.items():\n",
|
915 |
+
" line = line.replace(c, n)\n",
|
916 |
+
" line = line.replace(\"而𠺢\", \"而家\").replace(\"依𠺢\", \"依家\")\n",
|
917 |
+
" return line"
|
918 |
+
]
|
919 |
+
},
|
920 |
+
{
|
921 |
+
"cell_type": "code",
|
922 |
+
"execution_count": 59,
|
923 |
+
"metadata": {},
|
924 |
+
"outputs": [],
|
925 |
+
"source": [
|
926 |
+
"with open(\"train/abc.can\", \"w+\") as output_file:\n",
|
927 |
+
" for line in can_lines:\n",
|
928 |
+
" output_file.write(normalize_abc(line) + \"\\n\")\n"
|
929 |
+
]
|
930 |
+
},
|
931 |
+
{
|
932 |
+
"cell_type": "code",
|
933 |
+
"execution_count": null,
|
934 |
+
"metadata": {},
|
935 |
+
"outputs": [],
|
936 |
+
"source": [
|
937 |
+
"\n",
|
938 |
+
" \n"
|
939 |
+
]
|
940 |
+
}
|
941 |
+
],
|
942 |
+
"metadata": {
|
943 |
+
"kernelspec": {
|
944 |
+
"display_name": "Python 3",
|
945 |
+
"language": "python",
|
946 |
+
"name": "python3"
|
947 |
+
},
|
948 |
+
"language_info": {
|
949 |
+
"codemirror_mode": {
|
950 |
+
"name": "ipython",
|
951 |
+
"version": 3
|
952 |
+
},
|
953 |
+
"file_extension": ".py",
|
954 |
+
"mimetype": "text/x-python",
|
955 |
+
"name": "python",
|
956 |
+
"nbconvert_exporter": "python",
|
957 |
+
"pygments_lexer": "ipython3",
|
958 |
+
"version": "3.10.6"
|
959 |
+
},
|
960 |
+
"orig_nbformat": 4
|
961 |
+
},
|
962 |
+
"nbformat": 4,
|
963 |
+
"nbformat_minor": 2
|
964 |
+
}
|
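normalize_abc above calls str.replace once per rare character for every line. Because the mapping keeps only single-character replacements (the len(n) == 1 filter), the same normalization can be done in one pass with a translation table. This is a sketch under that assumption, with normalize_abc_fast as a hypothetical name, not a change made in the notebook:

# Assumes every key in abc_mapping is a single code point (true for the rare
# characters listed above); values are single characters by construction.
abc_table = str.maketrans(abc_mapping)

def normalize_abc_fast(line: str) -> str:
    # One pass over the line, then the two multi-character fixes
    # kept from the original cell.
    return line.translate(abc_table).replace("而𠺢", "而家").replace("依𠺢", "依家")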
load_lihkg.ipynb
ADDED
@@ -0,0 +1,242 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 3,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"data": {
|
10 |
+
"text/html": [
|
11 |
+
"<div>\n",
|
12 |
+
"<style scoped>\n",
|
13 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
14 |
+
" vertical-align: middle;\n",
|
15 |
+
" }\n",
|
16 |
+
"\n",
|
17 |
+
" .dataframe tbody tr th {\n",
|
18 |
+
" vertical-align: top;\n",
|
19 |
+
" }\n",
|
20 |
+
"\n",
|
21 |
+
" .dataframe thead th {\n",
|
22 |
+
" text-align: right;\n",
|
23 |
+
" }\n",
|
24 |
+
"</style>\n",
|
25 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
26 |
+
" <thead>\n",
|
27 |
+
" <tr style=\"text-align: right;\">\n",
|
28 |
+
" <th></th>\n",
|
29 |
+
" <th>Words</th>\n",
|
30 |
+
" <th>Frequency</th>\n",
|
31 |
+
" </tr>\n",
|
32 |
+
" </thead>\n",
|
33 |
+
" <tbody>\n",
|
34 |
+
" <tr>\n",
|
35 |
+
" <th>0</th>\n",
|
36 |
+
" <td>有</td>\n",
|
37 |
+
" <td>51227728</td>\n",
|
38 |
+
" </tr>\n",
|
39 |
+
" <tr>\n",
|
40 |
+
" <th>1</th>\n",
|
41 |
+
" <td>我</td>\n",
|
42 |
+
" <td>43798085</td>\n",
|
43 |
+
" </tr>\n",
|
44 |
+
" <tr>\n",
|
45 |
+
" <th>2</th>\n",
|
46 |
+
" <td>一</td>\n",
|
47 |
+
" <td>43159170</td>\n",
|
48 |
+
" </tr>\n",
|
49 |
+
" <tr>\n",
|
50 |
+
" <th>3</th>\n",
|
51 |
+
" <td>的</td>\n",
|
52 |
+
" <td>40916482</td>\n",
|
53 |
+
" </tr>\n",
|
54 |
+
" <tr>\n",
|
55 |
+
" <th>4</th>\n",
|
56 |
+
" <td>你</td>\n",
|
57 |
+
" <td>30897176</td>\n",
|
58 |
+
" </tr>\n",
|
59 |
+
" <tr>\n",
|
60 |
+
" <th>...</th>\n",
|
61 |
+
" <td>...</td>\n",
|
62 |
+
" <td>...</td>\n",
|
63 |
+
" </tr>\n",
|
64 |
+
" <tr>\n",
|
65 |
+
" <th>133207</th>\n",
|
66 |
+
" <td>黎明網</td>\n",
|
67 |
+
" <td>12</td>\n",
|
68 |
+
" </tr>\n",
|
69 |
+
" <tr>\n",
|
70 |
+
" <th>133208</th>\n",
|
71 |
+
" <td>黎錦華</td>\n",
|
72 |
+
" <td>12</td>\n",
|
73 |
+
" </tr>\n",
|
74 |
+
" <tr>\n",
|
75 |
+
" <th>133209</th>\n",
|
76 |
+
" <td>墨包</td>\n",
|
77 |
+
" <td>12</td>\n",
|
78 |
+
" </tr>\n",
|
79 |
+
" <tr>\n",
|
80 |
+
" <th>133210</th>\n",
|
81 |
+
" <td>點晒穴</td>\n",
|
82 |
+
" <td>12</td>\n",
|
83 |
+
" </tr>\n",
|
84 |
+
" <tr>\n",
|
85 |
+
" <th>133211</th>\n",
|
86 |
+
" <td>齋頂</td>\n",
|
87 |
+
" <td>12</td>\n",
|
88 |
+
" </tr>\n",
|
89 |
+
" </tbody>\n",
|
90 |
+
"</table>\n",
|
91 |
+
"<p>133212 rows × 2 columns</p>\n",
|
92 |
+
"</div>"
|
93 |
+
],
|
94 |
+
"text/plain": [
|
95 |
+
" Words Frequency\n",
|
96 |
+
"0 有 51227728\n",
|
97 |
+
"1 我 43798085\n",
|
98 |
+
"2 一 43159170\n",
|
99 |
+
"3 的 40916482\n",
|
100 |
+
"4 你 30897176\n",
|
101 |
+
"... ... ...\n",
|
102 |
+
"133207 黎明網 12\n",
|
103 |
+
"133208 黎錦華 12\n",
|
104 |
+
"133209 墨包 12\n",
|
105 |
+
"133210 點晒穴 12\n",
|
106 |
+
"133211 齋頂 12\n",
|
107 |
+
"\n",
|
108 |
+
"[133212 rows x 2 columns]"
|
109 |
+
]
|
110 |
+
},
|
111 |
+
"execution_count": 3,
|
112 |
+
"metadata": {},
|
113 |
+
"output_type": "execute_result"
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"source": [
|
117 |
+
"import pandas as pd\n",
|
118 |
+
"\n",
|
119 |
+
"# Load Excel file and convert to dictionary\n",
|
120 |
+
"df = pd.read_excel('CyberCan.xlsx')\n",
|
121 |
+
"\n",
|
122 |
+
"df"
|
123 |
+
]
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"cell_type": "code",
|
127 |
+
"execution_count": 33,
|
128 |
+
"metadata": {},
|
129 |
+
"outputs": [],
|
130 |
+
"source": [
|
131 |
+
"with open(\"CyberCan.dict\", \"w+\") as output_file:\n",
|
132 |
+
" for index, row in df.iterrows():\n",
|
133 |
+
" word = str(row['Words']).strip()\n",
|
134 |
+
" if not \" \" in word:\n",
|
135 |
+
" output_file.write(word + \" \" + str(row['Frequency']) + \"\\n\")\n",
|
136 |
+
" output_file.flush()\n"
|
137 |
+
]
|
138 |
+
},
|
139 |
+
{
|
140 |
+
"cell_type": "code",
|
141 |
+
"execution_count": 36,
|
142 |
+
"metadata": {},
|
143 |
+
"outputs": [
|
144 |
+
{
|
145 |
+
"name": "stdout",
|
146 |
+
"output_type": "stream",
|
147 |
+
"text": [
|
148 |
+
"Total words: 132895\n"
|
149 |
+
]
|
150 |
+
}
|
151 |
+
],
|
152 |
+
"source": [
|
153 |
+
"puncts = [\",\", \"。\", \"!\", \"?\", \"「\", \"」\", \":\"]\n",
|
154 |
+
"cybercan_words = set()\n",
|
155 |
+
"\n",
|
156 |
+
"for word in list(df['Words'].values) + puncts:\n",
|
157 |
+
" cybercan_words.add(word)\n",
|
158 |
+
"\n",
|
159 |
+
"print(\"Total words: {}\".format(len(cybercan_words)))"
|
160 |
+
]
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"cell_type": "code",
|
164 |
+
"execution_count": 37,
|
165 |
+
"metadata": {},
|
166 |
+
"outputs": [],
|
167 |
+
"source": [
|
168 |
+
"import jieba\n",
|
169 |
+
"jieba.set_dictionary(\"CyberCan.dict\")"
|
170 |
+
]
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"cell_type": "code",
|
174 |
+
"execution_count": 42,
|
175 |
+
"metadata": {},
|
176 |
+
"outputs": [
|
177 |
+
{
|
178 |
+
"name": "stdout",
|
179 |
+
"output_type": "stream",
|
180 |
+
"text": [
|
181 |
+
"Total filtered lines: 140590\n"
|
182 |
+
]
|
183 |
+
}
|
184 |
+
],
|
185 |
+
"source": [
|
186 |
+
"import re\n",
|
187 |
+
"\n",
|
188 |
+
"alnum = re.compile(\"[a-zA-Z0-9]\")\n",
|
189 |
+
"filtered_lines = []\n",
|
190 |
+
"\n",
|
191 |
+
"with open(\"train/lihkg.can\", \"r\") as input_file:\n",
|
192 |
+
" for line in input_file.read().splitlines():\n",
|
193 |
+
" line = line.replace(\" \", \"\")\n",
|
194 |
+
" if len(line) < 10:\n",
|
195 |
+
" continue\n",
|
196 |
+
" if len(line) >= 64:\n",
|
197 |
+
" continue\n",
|
198 |
+
" if alnum.search(line):\n",
|
199 |
+
" continue\n",
|
200 |
+
" tokens = list(jieba.cut(line))\n",
|
201 |
+
" found_rare_word = False\n",
|
202 |
+
" for token in tokens:\n",
|
203 |
+
" if not token in cybercan_words:\n",
|
204 |
+
" found_rare_word = True\n",
|
205 |
+
" # print(\"Found rare word: {}\".format(token))\n",
|
206 |
+
" break\n",
|
207 |
+
" if found_rare_word:\n",
|
208 |
+
" continue\n",
|
209 |
+
" filtered_lines.append(line)\n",
|
210 |
+
"\n",
|
211 |
+
"print(\"Total filtered lines: {}\".format(len(filtered_lines)))\n",
|
212 |
+
"\n",
|
213 |
+
"with open(\"train/lihkg.filtered.can\", \"w+\") as output_file:\n",
|
214 |
+
" for line in filtered_lines:\n",
|
215 |
+
" output_file.write(line + \"\\n\")\n",
|
216 |
+
" output_file.flush()"
|
217 |
+
]
|
218 |
+
}
|
219 |
+
],
|
220 |
+
"metadata": {
|
221 |
+
"kernelspec": {
|
222 |
+
"display_name": "Python 3",
|
223 |
+
"language": "python",
|
224 |
+
"name": "python3"
|
225 |
+
},
|
226 |
+
"language_info": {
|
227 |
+
"codemirror_mode": {
|
228 |
+
"name": "ipython",
|
229 |
+
"version": 3
|
230 |
+
},
|
231 |
+
"file_extension": ".py",
|
232 |
+
"mimetype": "text/x-python",
|
233 |
+
"name": "python",
|
234 |
+
"nbconvert_exporter": "python",
|
235 |
+
"pygments_lexer": "ipython3",
|
236 |
+
"version": "3.10.6"
|
237 |
+
},
|
238 |
+
"orig_nbformat": 4
|
239 |
+
},
|
240 |
+
"nbformat": 4,
|
241 |
+
"nbformat_minor": 2
|
242 |
+
}
|
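The cells above write CyberCan.dict in jieba's main-dictionary format (one "word frequency" pair per line) and then drop every LIHKG line that segments into a token outside the CyberCan vocabulary. A condensed sketch of that filter, with keep_line as a hypothetical helper name and the punctuation whitelist, length limits, and alphanumeric check omitted for brevity:

import jieba

jieba.set_dictionary("CyberCan.dict")  # use CyberCan as the segmentation dictionary

with open("CyberCan.dict", "r") as dict_file:
    cybercan_words = {line.split(" ")[0] for line in dict_file}

def keep_line(line: str) -> bool:
    # Keep a line only if every jieba token is a known CyberCan word.
    return all(token in cybercan_words for token in jieba.cut(line))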
load_mined_bitext.ipynb
ADDED
@@ -0,0 +1,175 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"data": {
|
10 |
+
"text/html": [
|
11 |
+
"<div>\n",
|
12 |
+
"<style scoped>\n",
|
13 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
14 |
+
" vertical-align: middle;\n",
|
15 |
+
" }\n",
|
16 |
+
"\n",
|
17 |
+
" .dataframe tbody tr th {\n",
|
18 |
+
" vertical-align: top;\n",
|
19 |
+
" }\n",
|
20 |
+
"\n",
|
21 |
+
" .dataframe thead th {\n",
|
22 |
+
" text-align: right;\n",
|
23 |
+
" }\n",
|
24 |
+
"</style>\n",
|
25 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
26 |
+
" <thead>\n",
|
27 |
+
" <tr style=\"text-align: right;\">\n",
|
28 |
+
" <th></th>\n",
|
29 |
+
" <th>input_text</th>\n",
|
30 |
+
" <th>target_text</th>\n",
|
31 |
+
" </tr>\n",
|
32 |
+
" </thead>\n",
|
33 |
+
" <tbody>\n",
|
34 |
+
" <tr>\n",
|
35 |
+
" <th>0</th>\n",
|
36 |
+
" <td>我要求的是法律上的澄清</td>\n",
|
37 |
+
" <td>我係要求……呢啲係好清楚嘅法律上嘅澄清呀</td>\n",
|
38 |
+
" </tr>\n",
|
39 |
+
" <tr>\n",
|
40 |
+
" <th>1</th>\n",
|
41 |
+
" <td>每晚由七點半,到十一點半</td>\n",
|
42 |
+
" <td>誒,由七點半就做到十一點半</td>\n",
|
43 |
+
" </tr>\n",
|
44 |
+
" <tr>\n",
|
45 |
+
" <th>2</th>\n",
|
46 |
+
" <td>梁頌恒議員,你是否要繼續發言</td>\n",
|
47 |
+
" <td>梁頌恆議員呢,係咪繼續係發言</td>\n",
|
48 |
+
" </tr>\n",
|
49 |
+
" <tr>\n",
|
50 |
+
" <th>3</th>\n",
|
51 |
+
" <td>可以怎樣稱呼我?我只知道整條街都稱我「大家姐」,因為我最大,年紀最大</td>\n",
|
52 |
+
" <td>可以點叫我呀?呢度成條街叫我大家姐,因為我最大,年紀最大吖嘛</td>\n",
|
53 |
+
" </tr>\n",
|
54 |
+
" <tr>\n",
|
55 |
+
" <th>4</th>\n",
|
56 |
+
" <td>至於他的答覆能否回應你剛才的提問,我並不能夠提出任何意見</td>\n",
|
57 |
+
" <td>噉呢,就對於佢能唔能夠達到你頭先提問嗰個嘅要求呢,我就唔能夠作出任何嘅意見</td>\n",
|
58 |
+
" </tr>\n",
|
59 |
+
" <tr>\n",
|
60 |
+
" <th>...</th>\n",
|
61 |
+
" <td>...</td>\n",
|
62 |
+
" <td>...</td>\n",
|
63 |
+
" </tr>\n",
|
64 |
+
" <tr>\n",
|
65 |
+
" <th>35872</th>\n",
|
66 |
+
" <td>他曾在2006及2007年擔任暑期實習生</td>\n",
|
67 |
+
" <td>2006~2007學年寒暑假間亦試過將學校整大兼修容過</td>\n",
|
68 |
+
" </tr>\n",
|
69 |
+
" <tr>\n",
|
70 |
+
" <th>35873</th>\n",
|
71 |
+
" <td>克里莫尼迪茲戰爭</td>\n",
|
72 |
+
" <td>克里米亞戰爭</td>\n",
|
73 |
+
" </tr>\n",
|
74 |
+
" <tr>\n",
|
75 |
+
" <th>35874</th>\n",
|
76 |
+
" <td>產卵後親魚迴歸大海</td>\n",
|
77 |
+
" <td>海潮遇返失敗多年嘅生母</td>\n",
|
78 |
+
" </tr>\n",
|
79 |
+
" <tr>\n",
|
80 |
+
" <th>35875</th>\n",
|
81 |
+
" <td>學校規模冠絕全馬。</td>\n",
|
82 |
+
" <td>學校嘅運動水平可謂全區之冠。</td>\n",
|
83 |
+
" </tr>\n",
|
84 |
+
" <tr>\n",
|
85 |
+
" <th>35876</th>\n",
|
86 |
+
" <td>黃龍溪鎮也逐漸由繁忙的碼頭轉變為安靜的江邊場鎮。</td>\n",
|
87 |
+
" <td>而九龍寨城到海邊碼頭就慢慢變成市集。</td>\n",
|
88 |
+
" </tr>\n",
|
89 |
+
" </tbody>\n",
|
90 |
+
"</table>\n",
|
91 |
+
"<p>35877 rows × 2 columns</p>\n",
|
92 |
+
"</div>"
|
93 |
+
],
|
94 |
+
"text/plain": [
|
95 |
+
" input_text \\\n",
|
96 |
+
"0 我要求的是法律上的澄清 \n",
|
97 |
+
"1 每晚由七點半,到十一點半 \n",
|
98 |
+
"2 梁頌恒議員,你是否要繼續發言 \n",
|
99 |
+
"3 可以怎樣稱呼我?我只知道整條街都稱我「大家姐」,因為我最大,年紀最大 \n",
|
100 |
+
"4 至於他的答覆能否回應你剛才的提問,我並不能夠提出任何意見 \n",
|
101 |
+
"... ... \n",
|
102 |
+
"35872 他曾在2006及2007年擔任暑期實習生 \n",
|
103 |
+
"35873 克里莫尼迪茲戰爭 \n",
|
104 |
+
"35874 產卵後親魚迴歸大海 \n",
|
105 |
+
"35875 學校規模冠絕全馬。 \n",
|
106 |
+
"35876 黃龍溪鎮也逐漸由繁忙的碼頭轉變為安靜的江邊場鎮。 \n",
|
107 |
+
"\n",
|
108 |
+
" target_text \n",
|
109 |
+
"0 我係要求……呢啲係好清楚嘅法律上嘅澄清呀 \n",
|
110 |
+
"1 誒,由七點半就做到十一點半 \n",
|
111 |
+
"2 梁頌恆議員呢,係咪繼續係發言 \n",
|
112 |
+
"3 可以點叫我呀?呢度成條街叫我大家姐,因為我最大,年紀最大吖嘛 \n",
|
113 |
+
"4 噉呢,就對於佢能唔能夠達到你頭���提問嗰個嘅要求呢,我就唔能夠作出任何嘅意見 \n",
|
114 |
+
"... ... \n",
|
115 |
+
"35872 2006~2007學年寒暑假間亦試過將學校整大兼修容過 \n",
|
116 |
+
"35873 克里米亞戰爭 \n",
|
117 |
+
"35874 海潮遇返失敗多年嘅生母 \n",
|
118 |
+
"35875 學校嘅運動水平可謂全區之冠。 \n",
|
119 |
+
"35876 而九龍寨城到海邊碼頭就慢慢變成市集。 \n",
|
120 |
+
"\n",
|
121 |
+
"[35877 rows x 2 columns]"
|
122 |
+
]
|
123 |
+
},
|
124 |
+
"execution_count": 1,
|
125 |
+
"metadata": {},
|
126 |
+
"output_type": "execute_result"
|
127 |
+
}
|
128 |
+
],
|
129 |
+
"source": [
|
130 |
+
"import pandas as pd\n",
|
131 |
+
"\n",
|
132 |
+
"df = pd.read_pickle(\"yue_zh_combined36k.pkl\")\n",
|
133 |
+
"df"
|
134 |
+
]
|
135 |
+
},
|
136 |
+
{
|
137 |
+
"cell_type": "code",
|
138 |
+
"execution_count": 2,
|
139 |
+
"metadata": {},
|
140 |
+
"outputs": [],
|
141 |
+
"source": [
|
142 |
+
"df = df.reset_index() # make sure indexes pair with number of rows\n",
|
143 |
+
"\n",
|
144 |
+
"with open(\"train/mined_bitext.can\", \"w+\") as can_file, open(\"train/mined_bitext.man\", \"w+\") as man_file:\n",
|
145 |
+
" for index, row in df.iterrows():\n",
|
146 |
+
" man_file.write(row['input_text'] + \"\\n\")\n",
|
147 |
+
" can_file.write(row['target_text'] + \"\\n\")\n",
|
148 |
+
" man_file.flush()\n",
|
149 |
+
" can_file.flush()"
|
150 |
+
]
|
151 |
+
}
|
152 |
+
],
|
153 |
+
"metadata": {
|
154 |
+
"kernelspec": {
|
155 |
+
"display_name": "Python 3",
|
156 |
+
"language": "python",
|
157 |
+
"name": "python3"
|
158 |
+
},
|
159 |
+
"language_info": {
|
160 |
+
"codemirror_mode": {
|
161 |
+
"name": "ipython",
|
162 |
+
"version": 3
|
163 |
+
},
|
164 |
+
"file_extension": ".py",
|
165 |
+
"mimetype": "text/x-python",
|
166 |
+
"name": "python",
|
167 |
+
"nbconvert_exporter": "python",
|
168 |
+
"pygments_lexer": "ipython3",
|
169 |
+
"version": "3.10.6"
|
170 |
+
},
|
171 |
+
"orig_nbformat": 4
|
172 |
+
},
|
173 |
+
"nbformat": 4,
|
174 |
+
"nbformat_minor": 2
|
175 |
+
}
|
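The export above assumes every dataframe cell is a single line; any embedded newline in input_text or target_text would silently break the line-by-line alignment of the two files. A quick sanity check after writing them, using the same train/mined_bitext.* paths, might look like this:

# Verify that the Mandarin and Cantonese sides stay aligned line by line.
with open("train/mined_bitext.man") as man_file, open("train/mined_bitext.can") as can_file:
    man_lines = man_file.read().splitlines()
    can_lines = can_file.read().splitlines()

assert len(man_lines) == len(can_lines), "Mandarin/Cantonese sides are misaligned"
print(f"{len(man_lines)} aligned sentence pairs")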
para/.DS_Store
ADDED
Binary file (10.2 kB). View file
|
|
para/dev/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
para/dev/dev.can
ADDED
The diff for this file is too large to render.
See raw diff
|
|
para/dev/dev.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
para/dev/dev.norm.can
ADDED
The diff for this file is too large to render.
See raw diff
|
|
para/test/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
para/test/test.can
ADDED
The diff for this file is too large to render.
See raw diff
|
|
para/test/test.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
para/test/test.norm.can
ADDED
The diff for this file is too large to render.
See raw diff
|
|
para/test/test.typos.can
ADDED
The diff for this file is too large to render.
See raw diff
|
|
para/test/test.typos.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
process_novels.ipynb
ADDED
@@ -0,0 +1,104 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 45,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"# https://cloud.tencent.com/developer/article/2197062\n",
|
10 |
+
"import re\n",
|
11 |
+
"def cut_sent(input):\n",
|
12 |
+
" lines = []\n",
|
13 |
+
" i = 0\n",
|
14 |
+
" line = \"\"\n",
|
15 |
+
" while i < len(input):\n",
|
16 |
+
" if input[i] == \"「\":\n",
|
17 |
+
" if len(line) > 0:\n",
|
18 |
+
" lines.append(line)\n",
|
19 |
+
" line = \"\"\n",
|
20 |
+
" line += input[i]\n",
|
21 |
+
" i += 1\n",
|
22 |
+
" while i < len(input) and input[i] != \"」\":\n",
|
23 |
+
" line += input[i]\n",
|
24 |
+
" i += 1\n",
|
25 |
+
" if i < len(input):\n",
|
26 |
+
" line += input[i]\n",
|
27 |
+
" lines.append(line)\n",
|
28 |
+
" line = \"\"\n",
|
29 |
+
" else:\n",
|
30 |
+
" line += input[i]\n",
|
31 |
+
" i += 1\n",
|
32 |
+
" if len(line) > 0:\n",
|
33 |
+
" lines.append(line)\n",
|
34 |
+
" sents = []\n",
|
35 |
+
" for line in lines:\n",
|
36 |
+
" if line.startswith(\"「\"):\n",
|
37 |
+
" if len(sents) > 0 and not re.match(\"[。!?\\?]\", sents[-1][-1]):\n",
|
38 |
+
" sents[-1] += line\n",
|
39 |
+
" else:\n",
|
40 |
+
" sents.append(line)\n",
|
41 |
+
" else:\n",
|
42 |
+
" line = re.sub('([。!?\\?])([^”’」])', r\"\\1\\n\\2\", line) # 单字符断句符\n",
|
43 |
+
" line = re.sub('(\\.{6})([^”’」])', r\"\\1\\n\\2\", line) # 英文省略号\n",
|
44 |
+
" line = re.sub('(\\…{2})([^”’」])', r\"\\1\\n\\2\", line) # 中文省略号\n",
|
45 |
+
" line = re.sub('([。!?\\?][”’」])([^,。!?\\?])', r'\\1\\n\\2', line)\n",
|
46 |
+
" # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\\n放到双引号后,注意前面的几句都小心保留了双引号\n",
|
47 |
+
" line = line.rstrip() # 段尾如果有多余的\\n就去掉它\n",
|
48 |
+
" # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。\n",
|
49 |
+
" lines = line.split(\"\\n\")\n",
|
50 |
+
" if len(sents) > 0 and re.search(\"[^。!?\\?][”’」]$\", sents[-1]):\n",
|
51 |
+
" sents[-1] += lines[0]\n",
|
52 |
+
" sents.extend(lines[1:])\n",
|
53 |
+
" else:\n",
|
54 |
+
" sents.extend(lines)\n",
|
55 |
+
" return sents"
|
56 |
+
]
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"cell_type": "code",
|
60 |
+
"execution_count": 47,
|
61 |
+
"metadata": {},
|
62 |
+
"outputs": [],
|
63 |
+
"source": [
|
64 |
+
"novel_lines = []\n",
|
65 |
+
"\n",
|
66 |
+
"with open(\"train/little_prince.txt\", \"r\") as input_file:\n",
|
67 |
+
" for line in input_file.read().splitlines():\n",
|
68 |
+
" if len(line) > 0:\n",
|
69 |
+
" novel_lines.extend(cut_sent(line))\n",
|
70 |
+
"\n",
|
71 |
+
"with open(\"train/animal_farm.txt\", \"r\") as input_file:\n",
|
72 |
+
" for line in input_file.read().splitlines():\n",
|
73 |
+
" if len(line) > 0:\n",
|
74 |
+
" novel_lines.extend(cut_sent(line))\n",
|
75 |
+
"\n",
|
76 |
+
"with open(\"train/novels.can\", \"w+\") as output_file:\n",
|
77 |
+
" for line in novel_lines:\n",
|
78 |
+
" output_file.write(line + \"\\n\")"
|
79 |
+
]
|
80 |
+
}
|
81 |
+
],
|
82 |
+
"metadata": {
|
83 |
+
"kernelspec": {
|
84 |
+
"display_name": "Python 3",
|
85 |
+
"language": "python",
|
86 |
+
"name": "python3"
|
87 |
+
},
|
88 |
+
"language_info": {
|
89 |
+
"codemirror_mode": {
|
90 |
+
"name": "ipython",
|
91 |
+
"version": 3
|
92 |
+
},
|
93 |
+
"file_extension": ".py",
|
94 |
+
"mimetype": "text/x-python",
|
95 |
+
"name": "python",
|
96 |
+
"nbconvert_exporter": "python",
|
97 |
+
"pygments_lexer": "ipython3",
|
98 |
+
"version": "3.10.6"
|
99 |
+
},
|
100 |
+
"orig_nbformat": 4
|
101 |
+
},
|
102 |
+
"nbformat": 4,
|
103 |
+
"nbformat_minor": 2
|
104 |
+
}
|
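cut_sent above first pulls out 「…」 quotations as whole units and then splits the remaining text after sentence-final punctuation. A small usage example; the sample sentence and the split shown are illustrative assumptions, not output copied from the notebook:

sample = "佢話:「今日好凍。」跟住就走咗。你信唔信?我唔信!"
for sent in cut_sent(sample):
    print(sent)
# Illustrative split: the quotation stays attached to the clause that introduces it,
# and the rest is divided after 。, !, ? and ?:
#   佢話:「今日好凍。」
#   跟住就走咗。
#   你信唔信?
#   我唔信!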
runs/Apr16_10-10-56_Kevins-MacBook-Pro-4.local/1681654257.025384/events.out.tfevents.1681654257.Kevins-MacBook-Pro-4.local.13638.1
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a2efe09742567e4a2f40b79868bd8ebebc12e7a6734df43fc6edc6193593a3bc
|
3 |
+
size 6023
|
runs/Apr16_10-10-56_Kevins-MacBook-Pro-4.local/events.out.tfevents.1681654257.Kevins-MacBook-Pro-4.local.13638.0
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:abdd99ecb35970605a6d76b6185ecd57edb0fec7cdfe79396226b0391096a415
|
3 |
+
size 20134
|
test.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.pred.130K.new.12000.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.pred.130K.new.6000.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.pred.130K.old.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.pred.16K.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.pred.175K.12000.bidir.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.pred.80K.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.pred.bing.11000.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.pred.bing.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.typos.pred.130K.new.12000.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.typos.pred.130K.old.12000.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.typos.pred.170K.mined.6000.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.typos.pred.175K.12000.bidir.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.typos.pred.80K.7000.man
ADDED
The diff for this file is too large to render.
See raw diff
|
|