#! /bin/sh test -f ja_gsd_modern.conllu || curl -LO https://github.com/KoichiYasuoka/SuPar-UniDic/raw/main/suparunidic/suparmodels/ja_gsd_modern.conllu ( if [ -f KoichiYasuoka/Swallow-7b-plus-upos/tokenizer.json ] then cat KoichiYasuoka/Swallow-7b-plus-upos/tokenizer.json else curl -L https://huggingface.co/KoichiYasuoka/Swallow-7b-plus-upos/resolve/main/tokenizer.json fi ) | env LANG=ja_JP.utf8 egrep -v '"[ぁ-ん] [ぁ-ん]",$' > newtokenizer.json TMP=./maker$$.py cat << 'EOF' > $TMP #! /usr/bin/env deepspeed src="KoichiYasuoka/Swallow-7b-plus-upos" tgt="KoichiYasuoka/Swallow-7b-plus-char-upos" from transformers import LlamaTokenizerFast,LlamaForTokenClassification,AutoConfig,DataCollatorForTokenClassification,TrainingArguments,Trainer class UPOSFileDataset(object): def __init__(self,conllu,tokenizer): self.conllu=open(conllu,"r",encoding="utf-8") self.tokenizer=tokenizer self.seeks=[0] self.multiword={} label=set(["SYM"]) s=self.conllu.readline() while s!="": if s=="\n": self.seeks.append(self.conllu.tell()) else: w=s.split("\t") if len(w)==10: if w[0].isdecimal(): label.add(w[3] if w[5]=="_" else w[3]+"|"+w[5]) elif w[0].find("-")>0: t=w[0].split("-") f,j,k=w[1],[],[] for i in range(int(t[0]),int(t[1])+1): w=self.conllu.readline().split("\t") j.append(w[3] if w[5]=="_" else w[3]+"|"+w[5]) k.append(w[1]) p="+".join(j) label.add(p) if p in self.multiword: self.multiword[p][f]=list(k) else: self.multiword[p]={f:list(k)} s=self.conllu.readline() lid={} for i,l in enumerate(sorted(label)): lid[l],lid["B-"+l],lid["I-"+l]=i*3,i*3+1,i*3+2 self.label2id=lid def __call__(*args): lid={l:i for i,l in enumerate(sorted(set(sum([list(t.label2id) for t in args],[]))))} for t in args: t.label2id=lid return lid def __del__(self): self.conllu.close() __len__=lambda self:len(self.seeks)-1 def __getitem__(self,i): self.conllu.seek(self.seeks[i]) form,upos=[],[] while self.conllu.tell()0: t=w[0].split("-") u=[] for j in range(int(t[0]),int(t[1])+1): k=self.conllu.readline().split("\t") u.append(k[3] if k[5]=="_" else k[3]+"|"+k[5]) upos.append("+".join(u)) v=self.tokenizer(form,add_special_tokens=False) i,u=[],[] for j,(x,y) in enumerate(zip(v["input_ids"],upos)): if x!=[]: i+=x u+=[y] if len(x)==1 else ["B-"+y]+["I-"+y]*(len(x)-1) if len(i)