aineid / morph_simplifier.py
grosenthal's picture
rest of stuff
aaffe1f
def parse_verb_morphology(morph):
    """Decode a compact finite-verb tag into its grammatical features.

    The tag is consumed left to right. The first three characters are
    skipped, then come a tense code (``PRES``, ``IMPF``, ``PERF``, ``PLUP``,
    ``FUTP`` or ``FUT``), an optional voice (``ACTIVE`` / ``PASSIVE``), a
    three-letter mood, a person digit and a number letter, for example
    ``V11PRESACTIVEIND3S``.

    Args:
        morph: The tag string to decode.

    Returns:
        A dict with the keys ``pos``, ``tense``, ``voice``, ``mood``,
        ``person`` and ``number``. Any field that cannot be decoded,
        including one that lies past the end of a truncated tag, is
        reported as ``"PROBLEM"`` rather than raising.
    """
    four_letter_tenses = {
        "PRES": "Present",
        "IMPF": "Imperfect",
        "PLUP": "Pluperfect",
        "PERF": "Perfect",
    }
    moods = {
        "SUB": "Subjunctive",
        "IND": "Indicative",
        "IMP": "Imperative",
        "INF": "Infinitive",
    }
    persons = {
        "0": "Undeclined",
        "1": "First",
        "2": "Second",
        "3": "Third",
    }
    numbers = {
        "S": "Singular",
        "P": "Plural",
        "X": "Infinitive",
    }

    # Tense consumption. Every tense code is four characters wide except
    # the plain future, which is three.
    offset = 7
    tense_code = morph[3:7]
    if tense_code in four_letter_tenses:
        tense = four_letter_tenses[tense_code]
    elif tense_code == "FUTP" and morph[3:13] != "FUTPASSIVE":
        # "FUT" followed by "PASSIVE" also starts with "FUTP", so that
        # spelling must be excluded before accepting a future perfect.
        tense = "FuturePerfect"
    elif morph[3:6] == "FUT":
        offset = 6
        tense = "Future"
    elif morph == "V99XXX0X":
        tense = "Undeclined"
    else:
        tense = "PROBLEM"

    # Voice consumption.
    if morph[offset:offset + 6] == "ACTIVE":
        voice = "Active"
        offset += 6
    elif morph[offset:offset + 7] == "PASSIVE":
        voice = "Passive"
        offset += 7
    elif morph[offset:offset + 3] in moods:
        # Deponent verbs - such verbs occur in passive voice but are
        # translated in active voice. The tag carries no voice, so the
        # mood starts right here and nothing is consumed.
        voice = "Active"
    else:
        voice = "PROBLEM"

    # Mood consumption (always advances three characters).
    mood = moods.get(morph[offset:offset + 3], "PROBLEM")
    offset += 3

    # Person and number are read with slices so that a tag that ends early
    # yields "PROBLEM" instead of an IndexError.
    person = persons.get(morph[offset:offset + 1], "PROBLEM")
    offset += 1
    number = numbers.get(morph[offset:offset + 1], "PROBLEM")

    return {
        'pos': "Verb",
        'tense': tense,
        'voice': voice,
        'mood': mood,
        'person': person,
        'number': number
    }
def parse_verb_participle_morphology(morph):
    """Decode a participle tag into case, number, gender and participle type.

    The first six characters are skipped. The next three are the case, then
    one character each for number and gender, and everything after that is
    the participle-type suffix. Unrecognised values become ``"PROBLEM"``.

    Args:
        morph: The tag string, e.g. ``VPAR31NOMSMPRESACTIVEPPL``.

    Returns:
        A dict with the keys ``pos``, ``case``, ``number``, ``gender`` and
        ``participle_type``.

    Raises:
        IndexError: If the tag ends before the number or gender character.
    """
    case_names = {
        'NOM': 'Nominative',
        'GEN': 'Genitive',
        'DAT': 'Dative',
        'ACC': 'Accusative',
        'ABL': 'Ablative',
        'VOC': 'Vocative',
        'LOC': 'Locative',
    }
    number_names = {'S': 'Singular', 'P': 'Plural', 'X': 'Infinitive'}
    gender_names = {
        'M': 'Masculine',
        'F': 'Feminine',
        'N': 'Neuter',
        'X': 'Unknown',
        'C': 'Common',
    }
    # 'Present Active' carries a space, unlike its siblings; the label is
    # emitted verbatim, so it is reproduced exactly.
    type_names = {
        'FUTPPL': 'FutureActive',
        'FUTACTIVEPPL': 'FutureActive',
        'PRESPPL': 'Present Active',
        'PRESACTIVEPPL': 'Present Active',
        'PERFPPL': 'PerfectActive',
        'PERFACTIVEPPL': 'PerfectActive',
        'FUTPASSIVEPPL': 'FuturePassive',
        'PERFPASSIVEPPL': 'PerfectPassive',
    }

    # Fixed layout: case at [6:9], number at [9], gender at [10], type after.
    case_label = case_names.get(morph[6:9], "PROBLEM")
    number_label = number_names.get(morph[9], 'PROBLEM')
    gender_label = gender_names.get(morph[10], 'PROBLEM')
    type_label = type_names.get(morph[11:], 'PROBLEM')

    return {
        'pos': "Participle",
        'case': case_label,
        'number': number_label,
        'gender': gender_label,
        'participle_type': type_label
    }
def parse_noun_morphology(morph):
    """Decode a noun tag into case, number and gender.

    The first three characters are skipped. The case is either a
    three-letter code or a single ``X`` (reported as ``"Undeclined"``),
    followed by one character for number and one for gender. Unrecognised
    values become ``"PROBLEM"``.

    Args:
        morph: The tag string, e.g. ``N11NOMSF``.

    Returns:
        A dict with the keys ``pos``, ``case``, ``number`` and ``gender``.

    Raises:
        IndexError: If the tag ends before a character that must be read.
    """
    cursor = 3

    case_label = {
        'NOM': 'Nominative',
        'GEN': 'Genitive',
        'DAT': 'Dative',
        'ACC': 'Accusative',
        'ABL': 'Ablative',
        'VOC': 'Vocative',
        'LOC': 'Locative',
    }.get(morph[cursor:cursor + 3])
    if case_label is not None:
        cursor += 3
    elif morph[cursor] == 'X':
        # An undeclined case occupies a single character.
        case_label = 'Undeclined'
        cursor += 1
    else:
        case_label = "PROBLEM"
        cursor += 3

    number_label = {
        'S': 'Singular',
        'P': 'Plural',
        'X': 'Infinitive',
    }.get(morph[cursor], 'PROBLEM')
    cursor += 1

    gender_label = {
        'M': 'Masculine',
        'F': 'Feminine',
        'N': 'Neuter',
        'X': 'Unknown',
        'C': 'Common',
    }.get(morph[cursor], 'PROBLEM')

    return {
        'pos': "Noun",
        'case': case_label,
        'number': number_label,
        'gender': gender_label,
    }
def parse_adjective_morphology(morph):
    """Decode an adjective tag into case, number, gender and comparison.

    The first five characters are skipped. The case is either a
    three-letter code or a single ``X`` (reported as ``"Undeclined"``),
    followed by one character for number, one for gender, and the remainder
    of the tag as the degree of comparison. Unrecognised values become
    ``"PROBLEM"``; an unrecognised comparison also prints the whole tag.

    Args:
        morph: The tag string, e.g. ``ADJ11NOMSMPOS``.

    Returns:
        A dict with the keys ``pos``, ``case``, ``number``, ``gender`` and
        ``comparison``.

    Raises:
        IndexError: If the tag ends before a character that must be read.
    """
    case_labels = {
        'NOM': 'Nominative',
        'GEN': 'Genitive',
        'DAT': 'Dative',
        'ACC': 'Accusative',
        'ABL': 'Ablative',
        'VOC': 'Vocative',
        'LOC': 'Locative',
    }
    number_labels = {'S': 'Singular', 'P': 'Plural', 'X': 'Infinitive'}
    gender_labels = {
        'M': 'Masculine',
        'F': 'Feminine',
        'N': 'Neuter',
        'X': 'Unknown',
        'C': 'Common',
    }
    degree_labels = {
        'POS': "Positive",
        'COMP': "Comparative",
        'SUPER': "Superlative",
        'X': 'Unknown',
    }

    pos = 5

    # Case: three letters wide unless it is the one-character 'X' marker.
    width = 3
    v_case = case_labels.get(morph[pos:pos + 3])
    if v_case is None:
        if morph[pos] == 'X':
            v_case = 'Undeclined'
            width = 1
        else:
            v_case = "PROBLEM"
    pos += width

    number = number_labels.get(morph[pos], 'PROBLEM')
    pos += 1

    gender = gender_labels.get(morph[pos], 'PROBLEM')
    pos += 1

    # Whatever is left is the degree of comparison.
    comparison = degree_labels.get(morph[pos:])
    if comparison is None:
        comparison = "PROBLEM"
        print(morph)

    return {
        'pos': "Adjective",
        'case': v_case,
        'number': number,
        'gender': gender,
        'comparison': comparison
    }
# Example pronoun tag: PRON31NOMPM
def parse_pronoun_morphology(morph):
    """Decode a pronoun tag into case, number and gender.

    The first six characters are skipped. The case is either a three-letter
    code or a single ``X`` (reported as ``"Undeclined"``), followed by one
    character for number and one for gender. Unrecognised values become
    ``"PROBLEM"``.

    Args:
        morph: The tag string, e.g. ``PRON31NOMPM``.

    Returns:
        A dict with the keys ``pos``, ``case``, ``number`` and ``gender``.

    Raises:
        IndexError: If the tag ends before a character that must be read.
    """
    three_letter_cases = (
        ('NOM', 'Nominative'),
        ('GEN', 'Genitive'),
        ('DAT', 'Dative'),
        ('ACC', 'Accusative'),
        ('ABL', 'Ablative'),
        ('VOC', 'Vocative'),
        ('LOC', 'Locative'),
    )

    start = 6
    case_code = morph[start:start + 3]
    for code, label in three_letter_cases:
        if case_code == code:
            v_case = label
            after_case = start + 3
            break
    else:
        # No three-letter case matched: either the one-character
        # undeclined marker or something unrecognised.
        if morph[start] == 'X':
            v_case = 'Undeclined'
            after_case = start + 1
        else:
            v_case = "PROBLEM"
            after_case = start + 3

    number_char = morph[after_case]
    number = {'S': 'Singular', 'P': 'Plural', 'X': 'Infinitive'}.get(
        number_char, 'PROBLEM')

    gender_char = morph[after_case + 1]
    gender = {
        'M': 'Masculine',
        'F': 'Feminine',
        'N': 'Neuter',
        'X': 'Unknown',
        'C': 'Common',
    }.get(gender_char, 'PROBLEM')

    return {
        'pos': "Pronoun",
        'case': v_case,
        'number': number,
        'gender': gender,
    }
def parse_preposition_morphology(morph):
    """Decode a preposition tag into the case it carries.

    Everything after the first four characters must be exactly ``ABL`` or
    ``ACC``. Any other suffix is printed and reported as ``"PROBLEM"``.

    Args:
        morph: The tag string, e.g. ``PREPABL``.

    Returns:
        A dict with the keys ``pos`` and ``case``.
    """
    case_names = {'ABL': 'Ablative', 'ACC': 'Accusative'}
    governed = morph[4:]

    if governed not in case_names:
        print(governed)
        return {'pos': "Preposition", 'case': 'PROBLEM'}

    return {'pos': "Preposition", 'case': case_names[governed]}
def parse_adverb_morphology(morph):
    """Decode an adverb tag into its degree of comparison.

    Everything after the first three characters is the comparison suffix.
    A missing suffix counts as ``"Positive"``; an unrecognised one is
    reported as ``"PROBLEM"`` and the whole tag is printed.

    Args:
        morph: The tag string, e.g. ``ADVCOMP``.

    Returns:
        A dict with the keys ``pos`` and ``comparison``.
    """
    degree_names = {
        'POS': "Positive",
        'COMP': "Comparative",
        'SUPER': "Superlative",
        'X': 'Unknown',
    }
    suffix = morph[3:]

    if not suffix:
        # A bare tag with no suffix defaults to the positive degree.
        comparison = "Positive"
    elif suffix in degree_names:
        comparison = degree_names[suffix]
    else:
        comparison = "PROBLEM"
        print(morph)

    return {'pos': "Adverb", 'comparison': comparison}
# Example supine tag: SUPINE31ACCSN (same layout as PRON31NOMPM, with a longer prefix)
def parse_supine_morphology(morph):
    """Decode a supine tag into case, number and gender.

    The first eight characters are skipped. The case is either a
    three-letter code or a single ``X`` (reported as ``"Undeclined"``),
    followed by one character for number and one for gender. Unrecognised
    values become ``"PROBLEM"``.

    Args:
        morph: The tag string to decode.

    Returns:
        A dict with the keys ``pos``, ``case``, ``number`` and ``gender``.

    Raises:
        IndexError: If the tag ends before a character that must be read.
    """
    known_cases = {
        'NOM': 'Nominative',
        'GEN': 'Genitive',
        'DAT': 'Dative',
        'ACC': 'Accusative',
        'ABL': 'Ablative',
        'VOC': 'Vocative',
        'LOC': 'Locative',
    }
    known_numbers = {'S': 'Singular', 'P': 'Plural', 'X': 'Infinitive'}
    known_genders = {
        'M': 'Masculine',
        'F': 'Feminine',
        'N': 'Neuter',
        'X': 'Unknown',
        'C': 'Common',
    }

    idx = 8
    triple = morph[idx:idx + 3]
    if triple in known_cases:
        v_case = known_cases[triple]
        idx += 3
    elif morph[idx] == 'X':
        # The undeclined marker is one character wide.
        v_case = 'Undeclined'
        idx += 1
    else:
        v_case = "PROBLEM"
        idx += 3

    number = known_numbers.get(morph[idx], 'PROBLEM')
    gender = known_genders.get(morph[idx + 1], 'PROBLEM')

    return {
        'pos': "Supine",
        'case': v_case,
        'number': number,
        'gender': gender,
    }
def parse_morphology(morph):
    """Route a morphology tag to the parser for its part of speech.

    Args:
        morph: The tag string to decode.

    Returns:
        ``{}`` for tags shorter than two characters, the matching parser's
        dict for recognised tags, a ``{'pos': ...}`` dict for interjections,
        enclitics, ``OTHER`` and ``CONJ``, and ``None`` for anything else.
    """
    if len(morph) < 2:
        return {}

    # Participles are tested before finite verbs; both start with 'V'.
    if morph.startswith("VPAR"):
        return parse_verb_participle_morphology(morph)
    if morph.startswith('ADJ'):
        return parse_adjective_morphology(morph)
    if morph.startswith('ADV'):
        return parse_adverb_morphology(morph)

    # NOTE(review): only the digits 0-8 are accepted after 'V', so a tag
    # such as V99XXX0X is not routed to the verb parser - confirm intended.
    if morph[0] == 'V' and morph[1] in "012345678":
        return parse_verb_morphology(morph)

    # NOTE(review): every remaining tag starting with 'N' is treated as a
    # noun - confirm no other tag family shares that initial.
    if morph[0] == 'N':
        return parse_noun_morphology(morph)
    if morph.startswith('PRON'):
        return parse_pronoun_morphology(morph)
    if morph.startswith('PREP'):
        return parse_preposition_morphology(morph)
    if morph.startswith('SUPINE'):
        return parse_supine_morphology(morph)
    if morph.startswith('INTERJ'):
        return {'pos': 'Interjection'}

    # The remaining categories must match the whole tag exactly.
    if morph in ('TACKON', 'PREFIX', 'SUFFIX'):
        return {'pos': 'Enclitic'}
    if morph in ('OTHER', 'CONJ'):
        return {'pos': morph.capitalize()}

    return None
def morph_to_string(morph):
    """Flatten a parsed-morphology dict into one underscore-joined label.

    Args:
        morph: A dict with a ``pos`` key plus the feature keys belonging to
            that part of speech, or ``{}`` / ``None``.

    Returns:
        ``""`` for ``{}`` or ``None``; the part of speech followed by its
        features joined with ``_`` for inflected parts of speech; a fixed
        label for enclitics, ``Other``, ``Conj`` and interjections; and
        ``None`` when ``pos`` is not one of the known values.

    Raises:
        KeyError: If ``pos`` or a feature required for it is missing.
    """
    if morph is None or morph == {}:
        return ""

    # Part of speech -> feature keys, in the order they appear in the label.
    inflected_layouts = (
        ('Verb', ('tense', 'voice', 'mood', 'person', 'number')),
        ('Participle', ('case', 'number', 'gender', 'participle_type')),
        ('Noun', ('case', 'number', 'gender')),
        ('Adjective', ('case', 'number', 'gender', 'comparison')),
        ('Pronoun', ('case', 'number', 'gender')),
        ('Preposition', ('case',)),
        ('Adverb', ('comparison',)),
        ('Supine', ('case', 'number', 'gender')),
    )
    # Parts of speech that map to a constant label with no features.
    fixed_labels = (
        ('Enclitic', 'Enclitic'),
        ('Other', 'OTHER'),
        ('Conj', 'Conjunction'),
        ('Interjection', 'Interjection'),
    )

    pos = morph['pos']
    for pos_name, feature_keys in inflected_layouts:
        if pos == pos_name:
            return "_".join([pos_name] + [morph[key] for key in feature_keys])
    for pos_name, label in fixed_labels:
        if pos == pos_name:
            return label
    return None
def simplify_form(morph):
    """Parse a raw morphology tag and return its underscore-joined label.

    Args:
        morph: The tag string to decode.

    Returns:
        Whatever ``morph_to_string`` returns for the parsed tag.
    """
    parsed = parse_morphology(morph)
    return morph_to_string(parsed)