
The script below is adapted from the CLARIN deliverables at http://hdl.handle.net/20.500.12537/301. It loads a pretrained COMBO model, parses a file of sentences, and prints one token per line in the ten-column CoNLL-U format.
from argparse import ArgumentParser

from allennlp.data import tokenizers
from combo.predict import COMBO

parser = ArgumentParser()
parser.add_argument('--parser', help='name or path of the pretrained COMBO model')
parser.add_argument('--infile', help='input file with one sentence per line')
parser.add_argument('--pretokenized', action='store_true')
args = parser.parse_args()
# If your data is pre-tokenized, pass the --pretokenized flag.
# If you have a GPU available, you can add cuda_device=<your-device> to COMBO.from_pretrained.
# The parser expects input in the same format as test_file.txt, i.e. one sentence per line.
if args.pretokenized:
    # Sentence splitter from the Icelandic Tokenizer package
    from Tokenizer.src.tokenizer import split_into_sentences
    # Split on whitespace only, since the tokens are already separated by spaces
    nlp = COMBO.from_pretrained('combo-is-combined-v211',
                                tokenizer=tokenizers.SpacyTokenizer(split_on_spaces=True))
else:
    nlp = COMBO.from_pretrained(args.parser)
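# A minimal sketch of loading the model on a GPU instead, assuming the
# cuda_device argument mentioned in the comment above (0 is the first GPU):
#     nlp = COMBO.from_pretrained(args.parser, cuda_device=0)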
def read_test_file(file):
    """Yield one sentence per line from the input file."""
    with open(file, 'r', encoding='utf-8') as infile:
        for line in infile:
            if args.pretokenized:
                # split_into_sentences yields sentences whose tokens are
                # separated by single spaces
                yield ' '.join(split_into_sentences(line))
            else:
                yield line.rstrip()
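# A hypothetical pre-tokenized input line (tokens separated by single spaces,
# punctuation split off):
#     Hann las bókina í gær .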
test_file = read_test_file(args.infile)
for sent in test_file:
    sentence = nlp(sent)
    # Print the ten tab-separated CoNLL-U columns for each token
    for token in sentence.tokens:
        print(f'{token.id}\t{token.token}\t{token.lemma}\t{token.upostag}\t{token.xpostag}\t'
              f'{token.feats}\t{token.head}\t{token.deprel}\t{token.deps}\t{token.misc}')
    # Blank line between sentences, as in CoNLL-U
    print()
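Assuming the script above is saved as parse.py (that file name, and the input and output file names below, are illustrative), it could be invoked like this; in --pretokenized mode the model name is fixed in the script, so --parser can be omitted:

python parse.py --parser combo-is-combined-v211 --infile test_file.txt > parsed.conllu
python parse.py --infile pretokenized.txt --pretokenized > parsed.conllu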