### **Data Preprocessing**


In [1]:
%%capture
!pip install datasets
!pip install transformers
!pip install starcc
!pip install sacrebleu

In [1]:
import random

random.seed(42)

In [2]:
from collections import defaultdict
from random import randint, choice
import math

typo_table = defaultdict(list)

with open("typos.txt", "r") as typos_file:
 for line in typos_file.read().splitlines():
 [correct, typo] = line.split("\t")
 typo_table[correct].append(typo)

def insert_typos(s: str) -> str:
 max_num_typos = randint(1, max(math.ceil(len(s) * 0.05), 1))
 num_typos = 0
 corrects = list(typo_table.keys())
 while num_typos < max_num_typos and len(corrects) > 0:
 correct = choice(corrects)
 corrects.remove(correct)
 typo = choice(typo_table[correct])
 if s.count(correct) > 0:
 num_typos += 1
 s = s.replace(correct, typo)
 return s

insert_typos("唔該你細聲啲,我喺度做緊嘢,使唔使講呢啲嘢。")

'唔該你細聲啲,我喺度做緊野,使唔使講呢啲野。'

In [3]:
abc_mapping = {}

with open("abc_rare_char_mapping.txt", "r") as input_file:
 for line in input_file.read().splitlines():
 [c, n, freq] = line.split("\t")
 if len(n) == 1:
 abc_mapping[c] = n

print("Loaded {} normalization mappings".format(len(abc_mapping)))

Loaded 177 normalization mappings


In [4]:
# replace all occurence of rare characters with normalized ones
def normalize_abc(line: str) -> str:
 for c, n in abc_mapping.items():
 line = line.replace(c, n)
 line = line.replace("而𠺢", "而家").replace("依𠺢", "依家")
 return line

In [5]:
# Normalize test and validation sets

with open("para/test/test.can", "r") as input_file:
 with open("para/test/test.norm.can", "w+") as output_file:
 for line in input_file.read().splitlines():
 output_file.write(normalize_abc(line) + "\n")

with open("para/dev/dev.can", "r") as input_file:
 with open("para/dev/dev.norm.can", "w+") as output_file:
 for line in input_file.read().splitlines():
 output_file.write(normalize_abc(line) + "\n")

In [6]:
# Insert typos to test set
num_typo_lines = 0
with open("para/test/test.norm.can", "r") as can_file, open("para/test/test.man", "r") as man_file,\
 open("para/test/test.typos.can", "w+") as can_output,\
 open("para/test/test.typos.man", "w+") as man_output:
 for can_line, man_line in zip(can_file, man_file):
 can_line_with_typos = insert_typos(can_line)
 if can_line_with_typos != can_line:
 num_typo_lines += 1
 can_output.write(can_line_with_typos)
 man_output.write(man_line)

print(f"Inserted typos into {num_typo_lines} lines")

Inserted typos into 4309 lines


In [7]:
import json
from StarCC import PresetConversion
convert = PresetConversion(src='cn', dst='hk', with_phrase=False)

with open("train/botisan.json", "r") as botisan_file, open("train/tatoeba.tsv", "r") as tatoeba_file,\
 open("train/train.can", "w+") as can_file, open("train/train.man", "w+") as man_file:
 for line in botisan_file.readlines():
 translation = json.loads(line)["translation"]
 if "係度" in translation["yue"] or "係到" in translation["yue"]: # This is a common typo that wasn't filtered out
 continue
 can = translation["yue"].replace("⠀", "").strip()
 man = convert(translation["zh"].replace("⠀", "").strip())
 can_file.write(can + "\n")
 man_file.write(man + "\n")
 can_typo = insert_typos(can)
 if can_typo != can:
 can_file.write(can_typo + "\n")
 man_file.write(man + "\n")
 
 for line in tatoeba_file.read().splitlines():
 [_, can, _, man] = line.split("\t")
 if "係度" in can or "係到" in can: # This is a common typo that wasn't filtered out
 continue
 can_file.write(can + "\n")
 man_file.write(convert(man) + "\n")
 can_typo = insert_typos(can)
 if can_typo != can:
 can_file.write(can_typo + "\n")
 man_file.write(convert(man) + "\n")


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/kk/n4ff6h1n3t170b1m4zv09yf40000gn/T/jieba.cache
Loading model cost 0.364 seconds.
Prefix dict has been built successfully.


In [8]:
with open("train/train.can", "a+") as can_train_file, open("train/train.man", "a+") as man_train_file,\
 open("commercial_baselines/bing.can", "r") as can_bing_file, open("commercial_baselines/bing.man", "r") as man_bing_file:
 for can_line, man_line in zip(can_bing_file.read().splitlines(), man_bing_file.read().splitlines()):
 can_train_file.write(can_line + "\n")
 man_train_file.write(man_line + "\n")
 can_typo = insert_typos(can_line)
 if can_typo != can_line:
 can_train_file.write(can_typo + "\n")
 man_train_file.write(man_line + "\n")


In [9]:
with open("train/train.can", "a+") as can_train_file, open("train/train.man", "a+") as man_train_file,\
 open("train/lihkg.filtered.can", "r") as can_lihkg_file, open("commercial_baselines/lihkg.filtered.man", "r") as man_lihkg_file:
 for can_line, man_line in zip(can_lihkg_file.read().splitlines(), man_lihkg_file.read().splitlines()):
 if "係度" in can_line or "係到" in can_line: # This is a common typo that wasn't filtered out
 continue
 can_train_file.write(can_line + "\n")
 man_train_file.write(man_line + "\n")
 can_typo = insert_typos(can_line)
 if can_typo != can_line and random.random() < 0.2:
 can_train_file.write(can_typo + "\n")
 man_train_file.write(man_line + "\n")


In [10]:
from datasets import Dataset

train_data = None
val_data = None
test_data = None

with open("train/train.can", "r") as can_file, open("train/train.man", "r") as man_file:
 train_data = Dataset.from_dict({"can": can_file.read().splitlines(), "man": man_file.read().splitlines()}).shuffle(seed=42).flatten_indices()
 print(f"Loaded training data.")
 print(f"First line: {train_data[0]}")

with open("para/dev/dev.norm.can", "r") as can_file, open("para/dev/dev.man", "r") as man_file:
 val_data = Dataset.from_dict({"can": can_file.read().splitlines(), "man": man_file.read().splitlines()})
 print(f"Loaded validation data.")
 print(f"First line: {val_data[0]}")

with open("para/test/test.norm.can", "r") as can_file, open("para/test/test.man", "r") as man_file:
 test_data = Dataset.from_dict({"can": can_file.read().splitlines(), "man": man_file.read().splitlines()})
 print(f"Loaded test data.")
 print(f"First line: {test_data[0]}")

with open("para/test/test.typos.can", "r") as can_file, open("para/test/test.typos.man", "r") as man_file:
 test_typos_data = Dataset.from_dict({"can": can_file.read().splitlines(), "man": man_file.read().splitlines()})
 print(f"Loaded test data with typos.")
 print(f"First line: {test_typos_data[0]}")


Flattening the indices: 0%| | 0/295518 [00:00, ? examples/s]

Loaded training data.
First line: {'can': '畫叮噹既第一步就係畫圓圈加一點。', 'man': '畫叮噹的第一步就是畫圓圈加一點。'}
Loaded validation data.
First line: {'can': '啲咁耐就攪掂嘞,真係掯', 'man': '他一會兒工夫就弄好了,真神'}
Loaded test data.
First line: {'can': '筷子放喺你嘅右便', 'man': '筷子放在你的右邊'}
Loaded test data with typos.
First line: {'can': '筷子放喺你既右便', 'man': '筷子放在你的右邊'}


In [11]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('Ayaka/bart-base-cantonese')
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

In [13]:
batch_size=64 # change to 16 for full training
encoder_max_length=64
decoder_max_length=64

def process_data_to_model_inputs(batch):
 # tokenize the inputs and labels
 inputs = tokenizer(batch["can"], padding="max_length", truncation=True, max_length=encoder_max_length)
 outputs = tokenizer(batch["man"], padding="max_length", truncation=True, max_length=decoder_max_length)

 batch["input_ids"] = inputs.input_ids
# batch["attention_mask"] = inputs.attention_mask
# batch["decoder_input_ids"] = outputs.input_ids
# batch["decoder_attention_mask"] = outputs.attention_mask
 batch["labels"] = outputs.input_ids.copy()

 # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
 # We have to make sure that the PAD token is ignored
 batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

 return batch

# only use 32 training examples for notebook - DELETE LINE FOR FULL TRAINING
# train_data = train_data.select(range(32))

train_data = train_data.map(
 process_data_to_model_inputs, 
 batched=True, 
 batch_size=batch_size,
)
train_data.set_format(
 # type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
 type="torch", columns=["input_ids", "labels"],
)


# only use 16 training examples for notebook - DELETE LINE FOR FULL TRAINING
# val_data = val_data.select(range(16))

val_data = val_data.map(
 process_data_to_model_inputs, 
 batched=True, 
 batch_size=batch_size,
)
val_data.set_format(
 # type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
 type="torch", columns=["input_ids", "labels"],
)

Map: 0%| | 0/295518 [00:00, ? examples/s]

Map: 0%| | 0/6502 [00:00, ? examples/s]

### **Warm-starting the Encoder-Decoder Model**

In [14]:
from transformers import BartForConditionalGeneration

bart = BartForConditionalGeneration.from_pretrained('Ayaka/bart-base-cantonese')

In [15]:
# set special tokens
bart.config.decoder_start_token_id = tokenizer.bos_token_id
bart.config.eos_token_id = tokenizer.eos_token_id
bart.config.pad_token_id = tokenizer.pad_token_id

# sensible parameters for beam search
bart.config.max_length = 64
bart.config.min_length = 3
# bart.config.no_repeat_ngram_size = 3
bart.config.early_stopping = True
# bart.config.length_penalty = 2.0
bart.config.num_beams = 4

### **Fine-Tuning Warm-Started Encoder-Decoder Models**

In [16]:
import sacrebleu

def compute_metrics(pred):
 # print("Start compute_metrics")
 labels_ids = pred.label_ids
 pred_ids = pred.predictions

 # all unnecessary tokens are removed
 pred_str = [s.replace(" ", "") for s in tokenizer.batch_decode(pred_ids, skip_special_tokens=True)]
 print("Predicted:", pred_str)
 labels_ids[labels_ids == -100] = tokenizer.pad_token_id
 label_str = [s.replace(" ", "") for s in tokenizer.batch_decode(labels_ids, skip_special_tokens=True)]
 print("Target:", label_str)

 bleu = sacrebleu.BLEU(trg_lang="zh")
 bleu_score = bleu.corpus_score(pred_str, [label_str]).score

 chrf = sacrebleu.CHRF()
 chrf_score = chrf.corpus_score(pred_str, [label_str]).score

 return {
 "bleu": bleu_score,
 "chrf": chrf_score,
 }

In [17]:
# Test CharBLEU
predictions = ['我啲咁耐就攪掂嘞,真係掯', '你幾時得嚟吖']
references = ['我啲咁耐就攪掂嘞,真係掯', '你幾時得嚟吖']

bleu = sacrebleu.BLEU(trg_lang="zh")
bleu_score = bleu.corpus_score(predictions, [references]).score
assert bleu_score - 100 <= 1e-9

Cool! Finally, we start training.

In [18]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

import warnings
warnings.filterwarnings('ignore')

# set training arguments - these params are not really tuned, feel free to change
training_args = Seq2SeqTrainingArguments(
 output_dir="./",
 per_device_train_batch_size=batch_size,
 per_device_eval_batch_size=batch_size,
 predict_with_generate=True,
 use_mps_device=True,
 do_train=True,
 evaluation_strategy="steps",
 do_eval=True,
 logging_steps=200, # set to 1000 for full training
 save_steps=1000, # set to 500 for full training
 eval_steps=1000, # set to 8000 for full training
 warmup_steps=2000, # set to 2000 for full training
 overwrite_output_dir=True,
 save_total_limit=8,
 fp16=False,
 num_train_epochs=5,
)

# instantiate trainer
trainer = Seq2SeqTrainer(
 model=bart,
 args=training_args,
 compute_metrics=compute_metrics,
 train_dataset=train_data,
 eval_dataset=val_data.select(range(batch_size * 10)),
)
trainer.train()

 0%| | 0/23090 [00:00, ?it/s]

{'loss': 1.7639, 'learning_rate': 5e-06, 'epoch': 0.04}
{'loss': 0.4886, 'learning_rate': 1e-05, 'epoch': 0.09}
{'loss': 0.2481, 'learning_rate': 1.5e-05, 'epoch': 0.13}
{'loss': 0.1724, 'learning_rate': 2e-05, 'epoch': 0.17}
{'loss': 0.1417, 'learning_rate': 2.5e-05, 'epoch': 0.22}


 0%| | 0/10 [00:00, ?it/s]

Predicted: ['啲咁耐就攪掂了,真的掯', '幾幾嚟吖', '爸爸爸一眼', '鬼氣氣,說話好像雞啄不斷那', '我呃咗我好多錢', '錢錢錢將啲錢畀佢', '給給我給了三次錢你', '戴戴帽紅帽帽', '的有多重', '攰攰攰攰到死!', '何何有幾何吖,冇乜幾何啫', '呢度得一個人咋', '佢好', '你你同你講過嘅喇,係咪先', '廣廣廣東話讀歪啲就係普通話嘞', '有有乜順嘢吖?', '看過三次戲', '我我', '我我書我', '我逼我叫你選他做班長', '筆筆', '你你你好叻呀!', '去去黃去', '我我搶我三粒糖', '做做做功課', '我我錢', '枱枱將張枱抹一抹', '我借五蚊你,你連句多謝都冇', '啲將將啲衫洗咗', '人寫字你不好在度喐喐', '寫寫信給', '同同敵同同敵人搏命', '我我', '我間我賺咗我五蚊', '我我', '我借你五蚊,遲還還給你,', '呢這呢個老闆仲欠伙記三個月人工', '你你你就勸佢唔好喊喇!', '你別逼我做這樣的事!', '你這樣做法,正一攞苦來辛!', '嗰嗰嗰啲嘢出嚟', '我當佢好人添!', '我我請吃', '我我', '我教你十年英文', '生得真醜怪', '生得真醜樣', '真冇解八點鐘仲未有人嚟', '條條條命凍過水', '事', '生生得像你', '我麻你一件事', '會會會?會?', '乜嘢都唔想要', '教教幾十年書', '咗咗洗咗啲衫', '被打', '送一筆', '我教咗呢一科十年', '書書', '我你我問你一個問題', '亞郭叫我幫他太太找師傅教車', '一一唞氣一邊傾偈', '啤啤仔好得', '去去唔去?', '套套得吖', '要要要??', '你你這你你話幾咁靚呢', '你他你睇佢個樣幾得敕', '度這呢度冇乜好玩', '你叫什麼名啊?', '你你你們喺度攪攪震,人家都冇辦法休息喇', '去去你唔去呀?', '在在在佢企喺出便', '了了了你你躝咗去邊度?', '他這個人鬼五馬六的事零多', '他他佢好打得', '他他他沒錯吖,為什麼說他呢?', '他他佢嘅作風這麼鬼馬,好人都有限嘞', '你話怎呢?', '你你想搵你傾', '你都大仔', '佢買嘢好醃嘅', '你以為咁著著', '生生得好靚', '算不算算算了算算算得算算尖算算的算算不算算菜算算好算算了算得

 0%| | 0/10 [00:00, ?it/s]

Predicted: ['這這這一這一早多點就這點這就快這早這多這快那些很幾不難', '幾幾時得嚟吖', '他他爸爸一眼', '他這氣他他說他他說別,他他他氣氣氣說說說他他說他氣說氣,他別說,說說說氣氣氣說說他說說', '他我我我騙佢呃錢它值這她了給價是此', '錢將將錢給將他', '我我給給我給了三三三我我給三給給我了了我給', '戴戴了紅色的', '的有有多重', '攰累攰攰累累', '有有幾何吖,冇乜幾何啫', '呢這人此一其在他得我依是只而的們個不任', '他他他他好好他', '你你你你已你你早你已已你,,你,已,,,了你,,我,,已了,了,,是了,是,,確,,當你,了已,了了,你了,已', '廣廣東話讀歪點就是普通話了', '有有什麼順東西吖?', '過次不看三多好是經一了那年已下很睇這十小', '我我我', '他我我他我我三我書我三我書書我他我我我他他他我三我我書我我三他我他他我三三我他三我書三我三書', '他我我我我他我他叫我叫逼我要要我逼我叫他他他逼我我我他我我叫我叫他叫叫我要他叫我', '送送筆他', '你你估你好聰啊!', '他他佢嗌亞黃去', '我他我我我他搶佢我搶我他我佢強我搶我佢我我我搶我我他他我他搶我搶搶我他我我強我我要我值我', '做做完點功課', '我我我啲錢', '枱把一桌張這子用拿那椅的給在弄檯將裡房店', '我我借五元你,你連句謝謝都冇', '將啲衫用', '人別寫字你不要在度喐', '寫寫寫封信給媽媽', '同敵人拚命', '我我我', '那間店我賺了我五塊', '我我我借我錢', '我借你五塊,遲早還還給你的', '這這這呢個老闆仲欠伙記三個月工資', '你就勸勸勸他不要哭!', '你你別我做這的的你你別!', '你這樣做法,正一拿苦來辛!', '嗰嗰拎嗰啲嘢出嚟', '我當他好人添!', '我我請吃', '我我對筷子', '我我教咗你十年英文', '生醜醜醜生醜生醜醜醜生醜長醜長生生生得醜醜長醜醜別醜醜得醜生生醜長生長醜生長長生醜生得生醜別生醜', '生醜醜醜生得醜醜醜生醜得醜長醜醜長生醜醜樣醜醜得長醜長長醜生長醜得真醜樣', '真真冇解八點鐘仲沒', '條條命凍過水', '事事了事了', '生長生得像你', '我我麻你事', '下下午開會嗎?', '乜乜鬼嘢都唔想要', '教教教幾教幾教教教幾幾教了幾十年書', '咗洗咗

 0%| | 0/10 [00:00, ?it/s]

Predicted: ['這這這麼久就搞定了,真', '幾時得嚟吖', '他望他爸爸一眼', '他那氣氣,說話跟雞啄不斷似', '他騙我我很多錢', '錢啲錢錢他', '我給了三次錢你', '戴戴帽戴帽帽帽', '有有的有多重', '累累累累攰攰累累累攰累攰攰攰累攰攰累忙累累忙悶累累疲累累活累累苦累忙累悶忙攰累活我累累我!', '有幾何吖,沒有什麼幾何啫', '這這這他一他人', '他他他好他他', '你你你跟你你跟你你我你我我你你已你你我你,你,我,我已我,,,我跟你説,,了你,,吧,,你了,,是,,已,,', '廣東話讀歪點就是普通話了', '有什順東?', '看多三戲次是過演好不一那下很電這經行小年', '我我你借', '他我他我我我三我我他我我我三我三三三我書三三書我我書我三本書三本我我', '他逼我叫我選他做班長', '筆筆他', '你你你很你你嗎!', '他他他去', '他強搶我我三粒糖', '做完功功的功課', '我我錢', '桌把把', '我借五塊你,你連五塊毫毫毫子你,我我五毫你,五毫子,你,', '一啲衣點啲啲啲點啲啲衣啲啲衫啲衣', '人人寫字你不別在這裡動', '信信信寫寫媽媽', '敵敵敵敵人敵人人敵敵', '我拿給我', '那間商店賺了我五塊', '你我我我錢', '我借你五塊,遲早還是要還給你的', '這這老還還這員還還還員工三個月工資', '你就勸他不要哭了!', '你逼我做這的', '你這樣做法,正一拿苦來辛!', '那拎嗰啲嘢出嚟', '我當他好人添!', '我我請', '筷我筷我對筷', '我教我教英我教', '長長醜醜醜長醜醜得醜醜', '長長生長長', '真沒解八點鐘還沒有人來', '條命命命條命命命條條命條命凍命命', '事無無謂搞', '長生生生生像生生得像你生生像像像你像像生像你', '我我麻我我一件', '下下午開會嗎', '什東什什什東是什東什什都什都什東東什都都都什都不都都不想想都都想要', '教教幾十年書', '洗咗啲衣', '給他給給給他打了一了', '筆筆筆', '我教我教了一', '我本書', '我我我你你我問你一問題', '郭我我幫他太太找師傅教車', '一涼涼一邊聊天', '這這這小啤孩好可', '你去不去嗎?', '這這套戲看得啊', '你你要不?', '你你你說挺這想呢', '你他他的樣子挺得敕'

 0%| | 0/10 [00:00, ?it/s]

Predicted: ['點這,真', '幾幾時得嚟吖', '他了他爸爸一眼', '他氣,說話跟啄不斷一', '他騙我很錢', '錢錢把將送將', '我給了三錢你', '帽戴戴戴', '有有有多重', '累累累累攰活很悶死!', '有幾何吖,沒什麼幾何啫', '這這他一個人咋', '他他他', '你跟你說過的,,,是,', '廣廣話讀歪點就是普話了', '有什東東???', '看三次戲', '我我我', '我我他她不給', '我逼我選他做班長', '筆筆給送他', '你你你好啊!', '他他他他佢喊去', '我搶了我三顆糖', '做做功', '錢我我啲', '桌桌把一給將拿把把把桌把將把張把這把給把一把要把整把蓋把面把用把弄把讓把', '我借五塊你,你連錢都沒', '點衣', '人寫字你別在這裡動', '信信寫信寫媽', '敵跟', '我我我我', '那間商店賺了我五元', '我借錢錢', '我借五元,遲還還給你的', '這這老還還員工三個月工資', '你就勸他別哭!', '你我逼我做這的事!', '你這樣做法,正一拿苦來辛!', '那那拿把這拎拎出那那拿', '我當他好人添!', '我我我請', '筷筷', '我我你我我十年英語', '長長長醜', '醜醜醜長醜', '真沒解八點還沒有人來', '條', '事要搞不沒有了了', '長長生你', '我麻你一件事', '下下下下午不多?', '什東東都不想要', '教教幾十年書', '洗咗啲衣服', '他他打了一', '筆筆給送給送送給送送贈給他送', '我教了這一科十年', '我我我本書', '我問你一問題', '我我幫他太太找師傅教車', '涼涼一哪誰談聊傾', '小這呢小', '你去不?', '這套這呢演一戲我', '你你不您這?啊吖嗎你你你?啊你啊?你你嗎呀你你啊你你要嗎你??你?你?你啊?啊你啊', '你你說多好呢', '你他的樣挺', '這裡沒什好玩', '你叫什?名?', '你你人不別在,在你在在在這的在好在很在不在人在生在怎我是你在在你你你', '你不嗎?', '他他他他她在佢站', '你了哪?', '他他他人故事的事多', '他他打得', '他沒錯啊,為什麼說他呢?', '他的作風這麼,好人也有限了', '你這呢?呢', '他', '你你你大', '他他買東他買買東買東的', '你以為這麼著數嗎', 

 0%| | 0/10 [00:00, ?it/s]

Predicted: ['這麼久就辦好了,真膩', '什時得嚟啊', '他望了他爸爸一眼', '他非常生氣,說話跟雞啄不斷一樣', '他騙了我很多錢', '錢他', '我給了三次錢你', '戴戴戴帽帽帽戴帽', '的有多重', '累死了!', '有幾何吖,沒什麼幾何而已', '這他一個人咋', '他很漂亮', '跟你說過的了,是不是先', '廣東話讀歪點就是普通話了', '有什麼順東西吖?', '看過三次戲', '我我借給你還不把這讓是', '我三本書我', '他逼我叫你選他做班長', '送枝筆他', '你猜你很聰明啊!', '他亞黃去', '他硬搶了我三粒糖', '做功課', '我的錢', '把桌子抹一抹', '我借五塊你,你連五塊謝謝都沒', '把這些衣服洗了', '人寫字你不在這裡動', '封信給媽', '跟敵人拚命', '我我', '那間商店賺了我五塊', '你借我錢', '我借你五塊,遲早還是要還給你的', '這個老闆還欠員工三個月工資', '你就勸一下他別哭了!', '你別逼我做這樣的事!', '你這樣做法,正一拿苦嚟辛!', '那那些東出來', '我當他好人添!', '我請你吃飯', '我對筷子', '我教了你十年英語', '長生得真醜', '長得真醜', '真沒解八點鐘還沒有人來', '條命凍過水', '別搞這麼多東西了', '長得像你', '我麻煩你一件事', '下午開會嗎?', '什東東都不想要', '教了幾十年書', '洗咗啲衣服', '給他打了一下', '送枝筆給他', '我教了這一科十年', '我書書', '我問你一個問題', '亞郭叫我幫他太太找師傅教車', '一邊涼涼一邊聊天', '這這個嬰孩孩兒好可', '你去不去嗎?', '這套戲看得啊', '你要不要啊?', '你說多麼漂亮呢', '你看他的樣子挺得敕', '這裡沒什好玩', '你叫什麼名字呀?', '你們在這裡搗亂,人家也沒辦法休息了', '你不去嗎?', '他站在外面', '你滾了去哪裡?', '他這個人無關的的事特多', '他很打得', '他沒錯啊,為什麼說他呢?', '他的作風這麼奇怪,好人也有限了', '你說怎麼辦呢?', '他想找你聊', '你也大兒了', '他買東西很尖的', '你以為這麼便宜嗎', '他長得好帥', '他不算腌尖', '他動到就看醫生吃藥', '是啊

 0%| | 0/10 [00:00, ?it/s]

Predicted: ['這麼久就辦好了,真膩', '什麼時候得來啊', '他望了他爸爸一眼', '他非常生氣,說話跟雞啄不斷一樣', '他騙了我很多錢', '把錢給他', '我給了三次錢你', '戴了頂紅色的帽子', '的有多重', '累死了!', '有幾何吖,沒什麼幾何而已', '這裡只有他一個人而已', '他很漂亮', '跟你說過的了,是不是先', '廣東話讀歪點就是普通話了', '有什麼順東西吖?', '看過三次電影', '你借錢我', '他給我三本書', '他逼我叫你選他做班長', '送枝筆他', '你以為你好聰明啊!', '他喊亞黃去', '他強迫搶了我三顆糖', '做完功課', '我的錢', '把張桌子擦一擦', '我借五塊你,你連句謝謝都沒有', '把啲衣服洗了', '人家寫字你不要在這裡動動貢', '寫封信給媽媽', '跟敵人拼命', '拿杯茶給我', '那間商店賺了我五塊', '你借我錢', '我借你五塊,遲早還是要還給你的', '這個老闆還欠員工三個月工資', '你就勸一下他別哭了!', '你別逼我做這樣的事!', '你這樣做法,正一拿苦來辛!', '拿那些東西出來', '我當他好人添!', '我請你吃飯', '我對筷子', '我教了你十年英語', '長得真醜', '長得真醜', '真沒解八點鐘還沒有人來', '條命沒指望', '沒必要搞這麼多嘢了', '長得像你', '我麻煩你一件事', '下午開會嗎?', '什麼鬼東西都不想要', '教咗幾十年書', '洗咗點衣服', '給他打了一下', '送枝筆給他', '我教了這一科十年', '我本書', '我問你一個問題', '亞郭叫我幫他太太找師傅教車', '一邊乘涼一邊聊天', '這個嬰兒孩子好可愛', '你去不去嗎?', '這套電影看得啊', '你要不要啊?', '你說多麼漂亮呢', '你看他的樣子挺得敕', '這這裡沒什好玩', '你叫什麼名字呀?', '你們在這裡搗亂,人家也沒辦法休息了', '你不去嗎?', '他站在外面', '你滾了去哪裡?', '他這個人烏七八糟的事特別多', '他好打得', '他沒錯啊,為什麼說他呢?', '他的作風這麼搞怪,好人也有限了', '你說怎麼辦呢?', '他想找你聊一下', '你也大兒子了', '他買東西很腌尖的', '你以為這麼著數嗎', '他長得很漂亮

 0%| | 0/10 [00:00, ?it/s]

Predicted: ['這些這麼久就辦好了,真膩', '什麼時候得來吖', '他望了他爸爸一眼', '他極其客氣,說話跟雞啄不斷一樣', '他騙了我很多錢', '把錢給他', '我給了三次錢你', '戴了頂紅色的帽子', '的有多重', '累死了!', '有幾何吖,沒什麼幾何而已', '這裡只有他一個人而已', '他很漂亮', '跟你說過的了,是不是先', '廣東話讀歪點就是普通話了', '有什麼順東西吖?', '看過三次戲', '你借錢我', '他給我三本書', '他逼我叫你選他做班長', '送枝筆他', '你猜你很聰明啊!', '他喊亞黃去', '他強迫搶了我三粒糖', '做完功課', '我的錢', '把桌子擦一擦', '我借五塊你,你連句謝謝都沒有', '把這些衣服洗了', '人家寫字你不要在這裡動動貢', '寫封信給媽媽', '跟敵人拚命', '拿杯茶給我', '那間商店賺了我五塊', '你借我錢', '我借你五塊,遲早還是要還給你的', '這個老闆還欠員工三個月工資', '你就勸一下他別哭了!', '你別逼我做這樣的事!', '你這樣做法,正一拿苦來辛!', '拿那些東西出來', '我當他好人添!', '我請你吃飯', '我對筷子', '我教了你十年英語', '長得真醜', '長得真醜', '真沒說明八點鐘還沒有有人來', '條命沒指望', '沒必要搞這麼多嘢了', '長得像你', '我麻煩你一件事', '下午開會嗎?', '什麼鬼東西都不想要', '教咗幾十年書', '洗了點衣服', '給他打了一下', '送枝筆給他', '我教了這一科十年', '我本書', '我問你一個問題', '亞郭叫我幫他太太找師傅教車', '一邊洗澡一邊聊天', '這個泰迪孩子很可愛', '你去不去嗎?', '這套電影看得啊', '你要不要啊?', '你說多麼漂亮呢', '你看他的樣子挺得敕', '這裡沒什麼好玩', '你叫什麼名字呀?', '你們在這裡搗亂,人家也沒辦法休息了', '你不去嗎?', '他站在外面', '你滾了去哪裡?', '他這個人烏七八糟的事特別多', '他很能打', '他沒錯啊,為什麼說他呢?', '他的作風這麼搞怪,好人也有限了', '你說怎麼辦呢?', '他想找你談一下', '你也大兒子了', '他買東西很腌尖的', '你以為這麼著數嗎', '他長得很

 0%| | 0/10 [00:00, ?it/s]

Predicted: ['這麼久就辦定了,真膩', '幾哪哪什這誰怎你不得很有隨好的我生那啥到時', '他爸爸他爸一眼', '他極其生氣,說話跟雞啄不斷一樣', '他騙了我很多錢', '錢錢他', '了給了三錢你', '了頂紅色的帽子', '的有多重', '累死了!', '有幾何啊,沒什麼幾何而已', '他一人而', '他很漂亮', '跟你說過的了,是不是先', '音廣廣不的自這粵一說文香給很其把哪學就生多', '什什什順東東?', '看過三次電影', '你借錢我', '我我他一給她這不學結誰其讓說把的所還那辯自', '他逼我叫你選他做班長', '送枝筆他', '你你你好你吧!你!!', '他亞黃去', '他強搶了我三顆糖', '做完功課', '我的錢', '把桌子擦一擦', '我借五塊你,你連句謝謝都沒有', '把這些衣服洗了', '人家寫字你不要在這裡動一下', '信給寫寄生一說這不的讓整自把哪我文', '跟敵人拚命', '我我我', '那間店賺了我五塊', '你借我錢', '我借你五塊,遲早還是要還給你的', '這個老闆還欠員工三個月工資', '你就勸一下他別哭了!', '我逼我做這樣的東西!', '你這樣做法,正一攞苦嚟辛!', '那那那東出來', '我他他他我了!', '我請你吃飯', '我對筷子', '我了你十年英', '醜醜長生得子漂不養白兒這髒裝的你那把臭怪醜', '醜樣醜長生得子不臭漂兒這的巴裝那你哪個', '怎真真對不實說這認誰你我有一因確的其還自就', '條命沒指望', '別了東東了', '長像你', '我麻麻你一件事', '會下不這多一哪你給很的開把那:說上在是有午', '什東東都不想要', '了幾年書', '洗了點衣服', '被他打了一下', '筆筆他他', '我了這一科十年', '我書書', '我你一個問題', '亞郭叫我幫他太太找師傅教車', '涼熱一哪誰這邊那不的讓生給你對說有還人很涼', '這小這兒生子孩人的不我依個你嬰一很把哇其有', '去你你不哪這您那把哦生給的我好:啊一得讓說', '這套戲看得啊', '要不要???', '你別這樣呢', '他的樣子挺得敕', '這這裡沒什好玩', '你什什什麼名嗎?', '你們在這裡搗亂,人家也沒辦法休息了', '你不去嗎?', '他站在外面', '哪哪了去哪了?', '他這人烏樣的事特別多'

 0%| | 0/10 [00:00, ?it/s]

Predicted: ['這些這麼久就辦好了,真膩', '什麼時候得來啊', '他看了他爸爸一眼', '他極氣,說話跟雞啄不斷一樣', '他騙了我很多錢', '把錢給他', '我給了三次錢你', '戴了頂紅色的帽子', '的有多重', '累死了!', '有幾何吖,沒什麼幾何而已', '這裡只有他一個人而已', '他很漂亮', '跟你說過嘅,是不是先', '廣東話讀歪點就是普通話了', '有什麼順東西吖?', '看過三次電影', '你借錢我', '他給我三本書', '他逼我叫你選他做班長', '送筆他', '你猜你很聰明啊!', '他喊亞黃去', '他硬搶了我三顆糖', '做完功課', '我的錢', '把桌子擦一擦', '我借五塊你,你連句謝謝都沒有', '把衣服洗了', '人家寫字你不要在這裡動動的', '寫封信給媽媽', '跟敵人拚命', '拿杯茶給我', '那間商店賺了我五塊', '你借我錢', '我借你五塊,遲早還是要還給你的', '這個老闆還欠員工三個月工資', '你就勸一下他別哭了!', '你別逼我做這樣的事!', '你這樣做法,正一攞苦嚟辛!', '拿那些東西出來', '我當他好人添!', '我請你吃飯', '我對筷子', '我教了你十年英語', '長得真醜', '長得真醜', '真沒解八點鐘還沒有人來', '條命沒指望', '沒必要搞這麼多嘢了', '長得像你', '我麻煩你一件事', '下午開會嗎?', '什麼東西都不想要', '教咗幾十年書', '洗了點衣服', '給他打了一下', '送筆給他', '我教了這一科十年', '我本書', '我問你一個問題', '亞郭叫我幫他太太找師傅教車', '一邊乘涼一邊聊天', '這個嬰兒兒子很可愛', '你去不去嗎?', '這套電影看得啊', '你要不要啊?', '你說多麼漂亮呢', '你看他的樣子挺得敕', '這裡沒什麼好玩', '你叫什麼名字呀?', '你們在搗亂,人家也沒辦法休息了', '你不去嗎?', '他站在外面', '你滾了去哪裡?', '他這個人烏七八糟的事特別多', '他很能打', '他沒錯啊,為什麼說他呢?', '他的作風這麼搞怪,好人也有限了', '你說怎麼辦呢?', '他想找你談一下', '你也大兒子了', '他買東西很腌尖的', '你以為這麼著數嗎', '他長得很漂亮', '他不算腌尖'

 0%| | 0/10 [00:00, ?it/s]

Predicted: ['這些這麼久就辦好了,真膩', '什麼時候得來啊', '他看了他爸爸一眼', '他極氣,說話跟沒完沒了一樣', '他騙了我很多錢', '把錢給他', '我給了三次錢你', '戴了頂紅色的帽子', '的有多重', '累死了!', '有幾何啊,沒什麼幾何而已', '這裡只有他一個人而已', '他很漂亮', '跟你說過的了,是不是先', '廣東話讀歪點就是普通話了', '有什麼順東西啊?', '看過三次戲', '你借錢我', '他給我三本書', '他逼我叫你選他做班長', '送筆他', '你猜你很聰明啊!', '他喊亞黃去', '他強迫搶了我三粒糖', '做完功課', '我的錢', '把桌子擦一擦', '我借五塊你,你連句謝謝都沒有', '把這些衣服洗了', '人家寫字你不要在這裡動不動', '寫封信給媽媽', '跟敵人拚命', '拿杯茶給我', '那間商店賺了我五塊', '你借我錢', '我借你五塊,遲早還是要還給你的', '這個老闆還欠員工三個月工資', '你就勸一下他別哭了!', '你別逼我做這樣的事!', '你這樣做法,正一拿苦嚟辛!', '拿那些東西出來', '我當他好人呢!', '我請你吃飯', '我對筷子', '我教了你十年英語', '長得真醜', '長得真醜', '真沒解八點鐘還沒有有人來', '條命沒指望', '沒必要搞這麼多嘢了', '長得像你', '我麻煩你一件事', '下午開會嗎?', '什麼鬼東西都不想要', '教了幾十年書', '洗了點衣服', '給他打了一下', '送筆給他', '我教了這一科十年', '我本書', '我問你一個問題', '亞郭叫我幫他太太找師傅教車', '一邊乘涼一邊聊天', '這個嬰兒孩子很可愛', '你去不去嗎?', '這套電影看得啊', '你要不要啊?', '你說多麼漂亮呢', '你看他的樣子挺得敕', '這裡沒什麼好玩', '你叫什麼名字?', '你們在這裡搗亂,人家也沒辦法休息了', '你不去嗎?', '他站在外面', '你滾了去哪裡?', '他這個人烏七八糟的事特別多', '他很能打', '他沒錯啊,為什麼說他呢?', '他的作風這麼搞怪,好人也有限了', '你說怎麼辦呢?', '他想找你談一下', '你也大兒子了', '他買東西很腌尖的', '你以為這麼著數嗎', '他長得很漂亮', '

 0%| | 0/10 [00:00, ?it/s]

Predicted: ['這些這麼久就辦好了,真膩', '什麼時候得來啊', '他看了他爸爸一眼', '他非常生氣,說話跟沒完沒了一樣', '他騙了我很多錢', '把錢給他', '我給了三次錢你', '戴了頂紅色的帽子', '的有多重', '累死了!', '有幾何啊,沒什麼幾何而已', '這裡只有他一個人而已', '他很漂亮', '跟你說過的了,是不是先', '廣東話讀歪點就是普通話了', '有什麼順東西啊?', '看過三次電影', '你借錢我', '我他他給她有不說佢其比你被把又大一還多所結', '他逼我叫你選他做班長', '送筆他', '你猜你很聰明嗎!', '他喊亞黃去', '他強迫搶了我三粒糖', '做完功課', '我的錢', '把桌子擦一擦', '我借五塊你,你連句謝謝都沒有', '把這些衣服洗了', '人家寫字你不要在這裡動不動', '寫封信給媽媽', '跟敵人拚命', '拿杯茶給我', '那間商店賺了我五塊', '你借我錢', '我借你五塊,遲早還是要還給你的', '這個老闆還欠員工三個月工資', '你就勸一下他別哭了!', '你別逼我做這樣的事情!', '你這樣做法,正一拿苦來辛!', '拿那些東西出來', '我當他好人添!', '我請你吃飯', '我對筷子', '我教了你十年英語', '長得真醜', '長得真醜', '真沒解八點鐘還沒有人來', '條命沒指望', '沒必要搞這麼多東西了', '長得像你', '我麻煩你一件事', '下午開會嗎?', '什麼東西都不想要', '教了幾十年書', '洗了點衣服', '被他打了一下', '送筆給他', '我教了這一科十年', '我本書', '我問你一個問題', '亞郭叫我幫他太太找師傅教車', '一邊乘涼一邊聊天', '這個嬰兒孩子很可愛', '你去不去嗎?', '這套電影看得', '你要不要啊?', '你說多麼漂亮呢', '你看他的樣子挺得敕', '這不有其好多你是得很玩依又在點大呢到幾哪這', '你叫什麼名字呀?', '你們在這裡搗亂,人家也沒辦法休息了', '你不去嗎?', '他站在外面', '你滾了去哪裡?', '他這個人烏七八糟的事特別多', '他很能打', '他沒錯啊,為什麼說他呢?', '他的作風這麼搞怪,好人也有限了', '你說怎麼辦呢?', '他想找你聊一下', '你也大兒子了', '買他

 0%| | 0/10 [00:00, ?it/s]

Predicted: ['這麼久就辦好了,真膩', '什麼時候得來啊', '他看了他爸爸一眼', '他極氣,說話跟沒完沒了一樣', '他騙了我很多錢', '把錢給他', '給你我一哪這有不說那把比多是點讓好還得就給', '戴了頂紅色的帽子', '的有多重', '累死了!', '有幾何啊,沒什麼幾何而已', '這裡只有他一個人而已', '他很漂亮', '跟你說過的了,是不是先', '廣東話讀歪點就是普通話了', '有什麼順東西吖?', '看過三次戲', '你借錢我', '我他他她給有不說其佢一比把得你大被又自那多', '他逼我叫你選他做班長', '送筆他', '你猜你很聰明啊!', '他喊亞黃去', '他強迫搶了我三粒糖', '做完功課', '我的錢', '把桌子擦一擦', '我借五塊你,你連句謝謝都沒有', '把這些衣服洗了', '人家寫字你不要在這裡動動貢', '寫封信給媽媽', '跟敵人拚命', '杯拿把你給有我點說一哪整怎好這要得喝我給給給拿給', '那間商店賺了我五塊', '你借我錢', '我借你五塊,遲早還是要還給你的', '這個老闆還欠員工三個月工資', '你就勸一下他別哭了!', '你別逼我做這樣的事!', '你這樣做法,正一拿苦來辛!', '拿拿拎那把整你哪說有就不點怎得要是又以提一', '我當他好人添!', '我請你吃飯', '我對筷子', '我教了你十年英語', '長長醜得生有個不整著哪多好那又大矮養真臭比', '長長醜得生有醜得醜樣臭矮樣醜醜醜得醜長醜生得醜生醜得得生得得生生生醜臭', '真沒解八點鐘還沒有人來', '條命沒指望', '搞要別不無有把沒好就很又整哪多得你用說幾搞', '長得像你', '我麻煩你一件事', '下午開會嗎?', '東什怎把哪點什有是有什什什怎什哪什點不整你那一的這東', '教了幾十年書', '洗了點衣服', '被他打了一下', '筆送送給有說把這一要生帶得不點哪買請那的自', '教我我一學給這有哪那很不整說考多就好大點又', '我本書', '我問你一個問題', '亞郭叫我幫他太太找師傅教車', '涼涼一哪邊誰這有要怎那給點說不得多在好就可', '小這個有不整好哇哪點依下一得就是很的把多小', '你去不去嗎?', '這套電影能看啊', '你要不要啊?', '你說多麼漂亮呢', '你看他的樣子挺得敕', '這不有得其哪好

 0%| | 0/10 [00:00, ?it/s]

Predicted: ['這麼久就辦好了,真膩', '什麼時候得來啊', '他看了他爸爸一眼', '他極其客氣,說話跟沒完沒了一樣', '他騙了我很多錢', '把錢給他', '給我我你哪這有不一說多比點得把是好要很就那', '戴了頂紅色的帽子', '的有多重', '累死了!', '有幾何啊,沒什麼幾何而已', '這裡只有他一個人而已', '他很漂亮', '跟你說過的了,是不是先', '廣東話讀歪點就是普通話了', '有什麼順東西啊?', '看過三次戲', '你借錢我', '我他他有她佢給其不說大得比把你又自被一如多', '他逼我叫你選他做班長', '送筆他', '你猜你很聰明嗎!', '他喊亞黃去', '他強迫搶了我三粒糖', '做完功課', '我的錢', '把桌子擦一擦', '我借五塊你,你連句謝謝都沒有', '把這些衣服洗了', '人家寫字你不要在這裡動動亂跑', '寫封信給媽媽', '跟敵人拚命', '杯拿給喝把有給給拿給給拿給有整說哪你點這得要我好一', '那間商店賺了我五塊', '你借我錢', '我借你五塊,遲早還是要還給你的', '這個老闆還欠員工三個月工資', '你就勸一下他別哭了!', '你別逼我做這樣的事!', '你這樣做法,正一攞苦來辛!', '拿那些東西出來', '我當他好人呢!', '我請你吃飯', '我對筷子', '我教了你十年英語', '長長長得醜生得很臭有不哪整好多奶養真大比點要', '長長醜得生有醜得醜醜醜樣臭矮不長醜得得得醜生得生生醜生得生生醜哪哪', '真沒解八點鐘還沒有人來', '條命沒指望', '沒必要搞這麼多嘢了', '長得像你', '我麻煩你一件事', '下午開會嗎?', '東什怎哪把點都有是這不整你乜好要大自的說很', '教了幾十年書', '洗了點衣服', '被他打了一下', '筆送送給有把說這要哪得生自帶一點不買請大可', '我教了這一科十年', '我本書', '我問你一個問題', '亞郭叫我幫他太太找師傅教車', '涼涼邊哪一誰這要有怎給說點那可得不多在人好', '小這這有整哇不好大哪依點得就生很你要多把個', '你去不去嗎?', '這套電影能看啊', '你要不要啊?', '你說多麼漂亮呢', '你看他的樣子挺得敕', '這不有得其哪好很呢是多依你在點大玩就幾又這', '你叫什麼名字呀?', '你們在搗亂,人家也沒辦

KeyboardInterrupt: 

### **Evaluation**

Awesome, we finished training our dummy model. Let's now evaluated the model on the test data. We make use of the dataset's handy `.map()` function to generate a summary of each sample of the test data.

In [19]:
# map data correctly
def generate_summary(model, pred_file, batch):
 # Tokenizer will automatically set [BOS] [EOS]
 # cut off at BERT max length 512
 inputs = tokenizer(batch["can"], padding="max_length", truncation=True, max_length=128, return_tensors="pt")
 input_ids = inputs.input_ids.to('mps')
 attention_mask = inputs.attention_mask.to('mps')

 outputs = model.generate(input_ids, attention_mask=attention_mask)

 # all special tokens including will be removed
 output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
 batch["pred"] = [s.replace(" ", "") for s in output_str]
 for man, pred in zip(batch["man"], batch["pred"]):
 pred_file.write("P: " + pred + "\n")
 pred_file.write("T: " + man + "\n\n")
 return batch

In [20]:
from transformers import BertTokenizer, BartForConditionalGeneration

tokenizer = BertTokenizer.from_pretrained('Ayaka/bart-base-cantonese')
model = BartForConditionalGeneration.from_pretrained("./checkpoint-11000").to('mps')

batch_size = 64 # change to 64 for full evaluation

In [21]:
pred_file = open("test.typos.pred.bing.11000.man", "w+")

results = test_typos_data.map(lambda batch: generate_summary(model, pred_file, batch), batched=True, batch_size=batch_size)

pred_file.flush()

Map: 0%| | 0/4309 [00:00, ? examples/s]

In [26]:
pred_file = open("test.pred.bing.11000.man", "w+")

results = test_data.map(lambda batch: generate_summary(model, pred_file, batch), batched=True, batch_size=batch_size)

pred_file.flush()

Map: 0%| | 0/6502 [00:00, ? examples/s]

In [29]:
import sacrebleu

def eval_metrics(label_str, pred_str):
 bleu = sacrebleu.BLEU(trg_lang="zh")
 bleu_score = bleu.corpus_score(label_str, [pred_str]).score
 chrf = sacrebleu.CHRF()
 chrf_score = chrf.corpus_score(label_str, [pred_str]).score
 return {"bleu": bleu_score, "chrf": chrf_score}

def calculate_metrics(pred_file_path):
 # Normal eval
 pred_str = []
 label_str = []
 with open(pred_file_path, "r") as pred_file:
 pred_lines = pred_file.read().splitlines()
 for pred_line in pred_lines:
 if pred_line.startswith("P:"):
 pred_str.append(pred_line.removeprefix("P: "))
 elif pred_line.startswith("T:"):
 label_str.append(pred_line.removeprefix("T: "))

 full_metrics = eval_metrics(label_str, pred_str)
 print(f"Full BLEU: {full_metrics['bleu']}")
 print(f"Full CHRF: {full_metrics['chrf']}")

 # Length > 15
 pred_str = []
 label_str = []
 with open(pred_file_path, "r") as pred_file:
 pred_lines = pred_file.read().splitlines()
 i = 0
 num_bitext = 0
 while i < len(pred_lines):
 if pred_lines[i].startswith("P:") and len(pred_lines[i + 1]) - 3 <= 15:
 num_bitext += 1
 i += 3
 elif pred_lines[i].startswith("P:"):
 num_bitext += 1
 pred_str.append(pred_lines[i].removeprefix("P: "))
 i += 1
 elif pred_lines[i].startswith("T:"):
 label_str.append(pred_lines[i].removeprefix("T: "))
 i += 1
 else:
 i += 1
 print(f"Kept {len(pred_str)} out of {num_bitext} lines with length > 15")
 long_metrics = eval_metrics(label_str, pred_str)
 print(f"Long BLEU: {long_metrics['bleu']}")
 print(f"Long CHRF: {long_metrics['chrf']}")

print("BART: 16K")
calculate_metrics("test.pred.16K.man")
print("")
 
print("Base Phrase table: 80K")
calculate_metrics("test.pred.80K.man")
print("")

print("Base Phrase table with typo augmentation: 130K")
calculate_metrics("test.pred.130K.old.man")
print("")

print("Full Phrase table with typo augmentation: 130K")
calculate_metrics("test.pred.130K.new.12000.man")
print("")

print("Full Phrase table, bidirectional with typo augmentation: 175K")
calculate_metrics("test.pred.175K.12000.bidir.man")
print("")

print("Bing with typo augmentation: 300K")
calculate_metrics("test.pred.bing.11000.man")

BART: 16K
Full BLEU: 1.0193819910663349e-05
Full CHRF: 0.2511563711401644
Kept 1022 out of 6489 lines with length > 15
Long BLEU: 0.0001133003812423589
Long CHRF: 0.2785323244848631

Base Phrase table: 80K
Full BLEU: 29.764865629969385
Full CHRF: 26.37265669702092
Kept 1021 out of 6485 lines with length > 15
Long BLEU: 30.332747246990724
Long CHRF: 27.85622174467189

Base Phrase table with typo augmentation: 130K
Full BLEU: 29.6230711726356
Full CHRF: 26.304954605724934
Kept 1021 out of 6480 lines with length > 15
Long BLEU: 30.161200446446298
Long CHRF: 27.775833644294366

Full Phrase table with typo augmentation: 130K
Full BLEU: 28.735741304040417
Full CHRF: 25.482035653550916
Kept 1026 out of 6502 lines with length > 15
Long BLEU: 29.470470986318464
Long CHRF: 27.043012476120676

Full Phrase table, bidirectional with typo augmentation: 175K
Full BLEU: 28.96935496818149
Full CHRF: 25.83725955850799
Kept 1026 out of 6502 lines with length > 15
Long BLEU: 29.550611277731083
Long CHRF: 

In [25]:
print("[TYPOS] Base Phrase table: 80K")
calculate_metrics("test.typos.pred.80K.7000.man")
print("")

print("[TYPOS] Base Phrase table with typo augmentation: 130K")
calculate_metrics("test.typos.pred.130K.old.12000.man")
print("")

print("[TYPOS] Full Phrase table with typo augmentation: 130K")
calculate_metrics("test.typos.pred.130K.new.12000.man")
print("")

print("[TYPOS] Full Phrase table, bidirectional with typo augmentation: 175K")
calculate_metrics("test.typos.pred.175K.12000.bidir.man")
print("")

print("[TYPOS] Full Phrase table, mined bitext with typo augmentation: 170K")
calculate_metrics("test.typos.pred.170K.mined.6000.man")
print("")

print("[TYPOS] Bing with typo augmentation: 100K")
calculate_metrics("test.typos.pred.bing.8000.man")
print("")

print("[TYPOS] Bing with typo augmentation: 300K")
calculate_metrics("test.typos.pred.bing.11000.man")

[TYPOS] Base Phrase table: 80K
Full BLEU: 25.67903890146884
Full CHRF: 23.006809067678784
Kept 847 out of 4309 lines with length > 15
Long BLEU: 27.23221590137431
Long CHRF: 25.098827658569146

[TYPOS] Base Phrase table with typo augmentation: 130K
Full BLEU: 28.852904211703883
Full CHRF: 25.54057487338131
Kept 847 out of 4309 lines with length > 15
Long BLEU: 29.451457762359148
Long CHRF: 26.957779099041645

[TYPOS] Full Phrase table with typo augmentation: 130K
Full BLEU: 27.67252498656469
Full CHRF: 24.507617808716358
Kept 847 out of 4309 lines with length > 15
Long BLEU: 28.665473655102208
Long CHRF: 26.19398178032962

[TYPOS] Full Phrase table, bidirectional with typo augmentation: 175K
Full BLEU: 27.968586584774183
Full CHRF: 24.8085921158302
Kept 832 out of 4237 lines with length > 15
Long BLEU: 28.742461772965285
Long CHRF: 26.311239247095287

[TYPOS] Full Phrase table, mined bitext with typo augmentation: 170K
Full BLEU: 25.70538675709905
Full CHRF: 22.979575970979766
Kept 847

In [5]:
# Commercial baseline

def calculate_metrics(pred_file_path, tgt_file_path):
 # Normal eval
 with open(pred_file_path, "r") as pred_file, open(tgt_file_path, "r") as tgt_file:
 pred_str = pred_file.read().splitlines()
 label_str = tgt_file.read().splitlines()
 full_metrics = eval_metrics(label_str, pred_str)
 print(f"Full BLEU: {full_metrics['bleu']}")
 print(f"Full CHRF: {full_metrics['chrf']}")

 # Length > 15
 with open(pred_file_path, "r") as pred_file, open(tgt_file_path, "r") as tgt_file:
 pred_full_str = pred_file.read().splitlines()
 label_full_str = tgt_file.read().splitlines()
 pred_str = []
 label_str = []
 for pred_line, label_line in zip(pred_full_str, label_full_str):
 if len(label_line) > 15:
 pred_str.append(pred_line)
 label_str.append(label_line)
 print(f"Kept {len(pred_str)} out of {len(pred_full_str)} lines with length > 15")
 long_metrics = eval_metrics(label_str, pred_str)
 print(f"Long BLEU: {long_metrics['bleu']}")
 print(f"Long CHRF: {long_metrics['chrf']}")

print("Bing Translate:")
calculate_metrics("test.pred.bing.man", "para/test/test.man")
print("")

print("[TYPOS] Bing Translate:")
calculate_metrics("test.typos.pred.bing.man", "para/test/test.typos.man")
print("")

Bing Translate:
Full BLEU: 38.075374632013514
Full CHRF: 33.38543089520182
Kept 1026 out of 6502 lines with length > 15
Long BLEU: 37.353351481695206
Long CHRF: 34.13833877980428

[TYPOS] Bing Translate:
Full BLEU: 33.92564400445071
Full CHRF: 29.856792221070954
Kept 847 out of 4309 lines with length > 15
Long BLEU: 35.012635782444036
Long CHRF: 32.00001079226985

