# combo_parser / parse_file.py
# danielschnell: Initial commit with adapted deliverables from Clarin: http://hdl.handle.net/20.500.12537/301
# (commit 4f09c24)
# Third-party: COMBO neural dependency parser and the AllenNLP tokenizer wrappers.
from combo.predict import COMBO
from allennlp.data import tokenizers
from argparse import ArgumentParser
# Command-line interface:
#   --parser        name/path of the pretrained COMBO model to load
#   --infile        input text file, one sentence per line
#   --pretokenized  treat each input line as already tokenized (space-separated)
parser = ArgumentParser()
parser.add_argument('--parser')
parser.add_argument('--infile')
parser.add_argument('--pretokenized', action='store_true')
args = parser.parse_args()
# If your data is pre-tokenized, you can add the --pretokenized flag
# If you have a GPU available, you can add cuda_device=<your-device> to COMBO.from_pretrained
# The parser expects input in the same format as test_file.txt, i.e. one sentence per line
# Load the COMBO model. For pre-tokenized input, split on spaces instead of
# re-tokenizing so the given token boundaries are respected.
if args.pretokenized:
    from Tokenizer.src.tokenizer import split_into_sentences
    # Honor --parser when supplied; previously it was silently ignored in this
    # branch. Fall back to the combined model for backward compatibility.
    nlp = COMBO.from_pretrained(args.parser or 'combo-is-combined-v211',
                                tokenizer=tokenizers.SpacyTokenizer(split_on_spaces=True))
else:
    nlp = COMBO.from_pretrained(args.parser)
def read_test_file(file, pretokenized=None):
    """Yield one input sentence per line of *file*.

    Args:
        file: path to a UTF-8 text file with one sentence per line.
        pretokenized: if True, run each line through split_into_sentences and
            re-join with spaces; if False, just strip the trailing newline.
            Defaults to the script's --pretokenized flag (module-level args)
            so existing single-argument calls keep working.

    Yields:
        str: one sentence per input line.
    """
    if pretokenized is None:
        # Preserve the original behavior: fall back to the CLI flag.
        pretokenized = args.pretokenized
    with open(file, 'r', encoding='utf-8') as infile:
        for line in infile:
            if pretokenized:
                yield ' '.join(split_into_sentences(line))
            else:
                yield line.rstrip()
# Parse each sentence and print one token per line in CoNLL-U-style columns
# (id, form, lemma, upos, xpos, feats, head, deprel, deps, misc), with a
# blank line separating sentences.
test_file = read_test_file(args.infile)
for sent in test_file:
    sentence = nlp(sent)
    # token.id already carries the token index, so the previous
    # enumerate(..., 1) bound an `index` that was never used.
    for token in sentence.tokens:
        print(f'{token.id}\t{token.token}\t{token.lemma}\t{token.upostag}\t{token.xpostag}\t{token.feats}\t{token.head}\t{token.deprel}\t{token.deps}\t{token.misc}')
    print()