KoichiYasuoka
committed on
Commit
•
fd94a1e
1
Parent(s):
5b12b0e
model improved
Browse files
- maker.py +12 -4
- pytorch_model.bin +1 -1
maker.py
CHANGED
@@ -5,7 +5,7 @@ url="https://github.com/KoichiYasuoka/spaCy-Thai"
|
|
5 |
import os
|
6 |
d=os.path.join(os.path.basename(url),"UD_Thai-Corpora")
|
7 |
os.system("test -d {} || git clone --depth=1 {}".format(d,url))
|
8 |
-
s='{if(NF>0)u=u$0"\\n";else{if(u~/\\t0\\troot\\t/)print u>"train.conllu";u=""}}'
|
9 |
os.system("nawk -F'\\t' '{}' {}/*-ud-*.conllu".format(s,d))
|
10 |
class UDgoeswithDataset(object):
|
11 |
def __init__(self,conllu,tokenizer):
|
@@ -33,15 +33,23 @@ class UDgoeswithDataset(object):
|
|
33 |
self.tags.append([dep]+[t if h[j]==i+1 else dep for j,t in enumerate(p)]+[dep,dep])
|
34 |
c=[]
|
35 |
self.label2id={l:i for i,l in enumerate(sorted(label))}
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
__len__=lambda self:len(self.ids)
|
37 |
__getitem__=lambda self,i:{"input_ids":self.ids[i],"labels":[self.label2id[t] for t in self.tags[i]]}
|
38 |
from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DataCollatorForTokenClassification,TrainingArguments,Trainer
|
39 |
tkz=AutoTokenizer.from_pretrained(src)
|
40 |
trainDS=UDgoeswithDataset("train.conllu",tkz)
|
41 |
-
|
|
|
|
|
42 |
cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()})
|
43 |
-
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=32,output_dir="/tmp",overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1)
|
44 |
-
trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=AutoModelForTokenClassification.from_pretrained(src,config=cfg),train_dataset=trainDS)
|
45 |
trn.train()
|
46 |
trn.save_model(tgt)
|
47 |
tkz.save_pretrained(tgt)
|
|
|
5 |
import os
|
6 |
d=os.path.join(os.path.basename(url),"UD_Thai-Corpora")
|
7 |
os.system("test -d {} || git clone --depth=1 {}".format(d,url))
|
8 |
+
s='{if(NF>0)u=u$0"\\n";else{f=FILENAME;if(u~/\\t0\\troot\\t/)print u>(f~/-dev/?"dev":f~/-test/?"test":"train")".conllu";u=""}}'
|
9 |
os.system("nawk -F'\\t' '{}' {}/*-ud-*.conllu".format(s,d))
|
10 |
class UDgoeswithDataset(object):
|
11 |
def __init__(self,conllu,tokenizer):
|
|
|
33 |
self.tags.append([dep]+[t if h[j]==i+1 else dep for j,t in enumerate(p)]+[dep,dep])
|
34 |
c=[]
|
35 |
self.label2id={l:i for i,l in enumerate(sorted(label))}
|
36 |
+
def __call__(*args):
|
37 |
+
label=set(sum([list(t.label2id) for t in args],[]))
|
38 |
+
lid={l:i for i,l in enumerate(sorted(label))}
|
39 |
+
for t in args:
|
40 |
+
t.label2id=lid
|
41 |
+
return lid
|
42 |
__len__=lambda self:len(self.ids)
|
43 |
__getitem__=lambda self,i:{"input_ids":self.ids[i],"labels":[self.label2id[t] for t in self.tags[i]]}
|
44 |
from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DataCollatorForTokenClassification,TrainingArguments,Trainer
|
45 |
tkz=AutoTokenizer.from_pretrained(src)
|
46 |
trainDS=UDgoeswithDataset("train.conllu",tkz)
|
47 |
+
devDS=UDgoeswithDataset("dev.conllu",tkz)
|
48 |
+
testDS=UDgoeswithDataset("test.conllu",tkz)
|
49 |
+
lid=trainDS(devDS,testDS)
|
50 |
cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()})
|
51 |
+
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=32,output_dir="/tmp",overwrite_output_dir=True,save_total_limit=2,evaluation_strategy="epoch",learning_rate=5e-05,warmup_ratio=0.1)
|
52 |
+
trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=AutoModelForTokenClassification.from_pretrained(src,config=cfg),train_dataset=trainDS,eval_dataset=devDS)
|
53 |
trn.train()
|
54 |
trn.save_model(tgt)
|
55 |
tkz.save_pretrained(tgt)
|
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 351720561
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:84a1018e4627ed153c908596557ebbd905a1dedeff2fb076e0ff62175bd55327
|
3 |
size 351720561
|