import json
import re
import string
import subprocess
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional

from boltons.cacheutils import cachedproperty
from fastapi import FastAPI
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from transformers import AutoTokenizer

from cltk.alphabet.processes import LatinNormalizeProcess
from cltk.core.data_types import Doc, Pipeline, Process, Word
from cltk.languages.utils import get_lang
from cltk.nlp import NLP
from cltk.text.processes import DefaultPunctuationRemovalProcess
from cltk.tokenizers.lat.lat import LatinWordTokenizer

import morph_simplifier

app = FastAPI()


@dataclass
class LatinWhitakersWordsMorphology(Process):
    """A simple ``Process`` that attaches the stem and morphological
    features of a Latin word using Whitaker's Words.
    """

    language: str = None

    @cachedproperty
    def algorithm(self):
        return None

    def parse_word(self, tup):
        index, word_tup = tup
        word_obj, word_lookup = word_tup
        word_obj.word_lookup = word_lookup
        word_lookup = word_lookup.strip()
        if word_obj.string in [',', ":", "'", '"', ".", ";"] or "UNKNOWN" in word_lookup or "" == word_lookup:
            word_obj.stem = word_obj.string
            word_obj.morph = "OTHER"
            word_obj.case = ""
        else:
            letter_swap = False
            # Irregular Roman numerals such as XIIX.
            if "Bad Roman Numeral?" in word_lookup:
                word_obj.stem = word_obj.string
                word_obj.morph = "NUM20XXXCARD"
                word_obj.case = ""
                return word_obj
            if "WORD_EDIT" in word_lookup:
                letter_swap = True
                word_lookup = word_lookup.replace("WORD_EDIT\n", "")
            # Form of sum/esse.
            if word_lookup[0] == '.':
                word_obj.stem = "esse"
                word_obj.morph = "".join(word_lookup.split("\n")[0].split(" ")[1:]).replace("Late", "").replace("Early", "").replace("N98XXM", "ADV")
                word_obj.case = word_obj.string
                return word_obj
            try:
                # alicuius - "[XXXAO]" starts the second line.
                if word_lookup.split("\n")[1].strip()[0] == "[":
                    sp = word_lookup.split("\n")
                    word_lookup = sp[0] + "\n" + sp[2]
            except Exception as e:
                print(e)
            # i/j, u/v, d/t swap: need to drop another line.
            if word_lookup.split(" ")[0].split(".")[0] == "Word":
                word_lookup = "\n".join(word_lookup.split("\n")[2:])
                letter_swap = True
            # Cardinal number.
            if "CARD" in word_lookup and "." not in word_lookup.split(" ")[0]:
                word_obj.stem = word_lookup.split(" ")[0]
                word_obj.morph = "".join(word_lookup.split("\n")[0].split(" ")[1:]).replace("Late", "").replace("Early", "").replace("N98XXM", "ADV")
                word_obj.case = ""
                return word_obj
            if word_lookup.split(" ")[0].replace(".", "").replace("ivi", "ii").replace("v", "u").replace("j", "i").strip().lower() != word_obj.string.lower().replace("j", "i").replace("v.i", "").replace("ivi", "ii").replace("-", "").replace("v", "u"):
                if word_lookup.split(" ")[0].replace(".", "").strip().lower() == 'special_replace':
                    word_obj.stem = word_obj.string
                    word_obj.morph = "V51PRESACTIVEIND3P"
                    word_obj.case = ""
                    return word_obj
                elif word_lookup.split(" ")[0].replace(".", "").strip().lower() == 'iri_special':
                    word_obj.stem = word_obj.string
                    word_obj.morph = "V31FUTPASSIVEINF0X"
                    word_obj.case = ""
                    return word_obj
                if index != self.l - 1 and not letter_swap:
                    try:
                        word_lookup = word_lookup.split("\n")[2]
                        word_obj.word_lookup = word_lookup
                    except:
                        word_obj.word_lookup = word_lookup
            word_obj.stem = word_lookup.split(" ")[0].split(".")[0]
            word_obj.morph = "".join(word_lookup.split("\n")[0].split(" ")[1:]).replace("Late", "").replace("Early", "").replace("N98XXM", "ADV")
            word_obj.case = word_lookup.split(" ")[0].split(".")[1] if "." in word_lookup.split(" ")[0] else ""
        return word_obj

    def run(self, input_doc: Doc) -> Doc:
        output_doc = deepcopy(input_doc)
        output_doc.words = [word for word in output_doc.words if word is not None and word.string != '-']
        self.l = len(output_doc.words)
        # Run the Whitaker's Words binary over the whole text, then normalize its
        # output so that each word's entry is separated by a blank line.
        raw_output = subprocess.check_output(
            ["./words"],
            input=" ".join([word.string.replace("j", "i") for word in output_doc.words]),
            cwd='./bin/',
            text=True,
        )
        words = (
            re.sub(
                r"SUPINE \+ iri.*\n", "\n\nIRI_SPECIAL ",
                re.sub(
                    r"PPL\+sunt.*\n\nsum|Syncope s => vis *\n\n", "",
                    "\n".join(re.split(r"\n=>|=>\n", raw_output, maxsplit=1)[1].split("\n")[:-6])
                    .replace("MORE - hit RETURN/ENTER to continue\nUnexpected exception in PAUSE", "")
                    .replace("\n*", '\n')
                    .replace("PERF PASSIVE PPL + verb TO_BE => PASSIVE perfect system", "\n\nSPECIAL_REPLACE")
                    .replace("FUT PASSIVE PPL + esse => PRES PASSIVE INF", "\n\nSPECIAL_REPLACE")
                    .replace("\nFUT PASSIVE PPL + verb TO_BE => PASSIVE Periphrastic - should/ought/had to", "\n\nSPECIAL_REPLACE")
                    .replace("\nFUT ACTIVE PPL + verb TO_BE => ACTIVE Periphrastic - about to, going to", "\n\nSPECIAL_REPLACE")
                    .replace("\nFUT PASSIVE PPL + esse => PASSIVE Periphrastic - should/ought/had to", "\n\nSPECIAL_REPLACE")
                    .replace("\nFUT ACT PPL+fuisse => PERF ACT INF Periphrastic - to have been about/going to", "\n\nSPECIAL_REPLACE")
                    .replace("\nFUT PASSIVE PPL + fuisse => PERF PASSIVE INF Periphrastic - about to, going to", "\n\nSPECIAL_REPLACE")
                    .replace("\nFUT ACTIVE PPL + esse => ACTIVE Periphrastic - about to, going to", "\n\nSPECIAL_REPLACE")
                    .replace("\nFUT ACTIVE PPL + esse => PRES Periphastic/FUT ACTIVE INF - be about/going to", "\n\nSPECIAL_REPLACE")
                    .replace("Syncope s => vis\n\n", "WORD_EDIT")
                    .replace("Syncope s => vis \n\n", "WORD_EDIT")
                    .replace("\nSyncope ii => ivi \nSyncopated perfect ivi can drop 'v' without contracting vowel", "WORD_EDIT")
                    .replace("Syncope s => vis \nSyncopated perfect often drops the 'v' and contracts vowel", "WORD_EDIT")
                    .replace("\nPERF PASSIVE PPL + esse => PERF PASSIVE INF", "\n\nSPECIAL_REPLACE"),
                ),
            )
            .replace("\nSlur sub/su~ \nAn initial 'sub' may be rendered by su~", "WORD_EDIT")
            .replace("\nSyncope r => v.r \n\n", "WORD_EDIT")
            .split("\n\n")
        )
        output_tokens = list(map(self.parse_word, enumerate(zip(output_doc.words, words))))
        return output_tokens
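
# Summary of the fields parse_word attaches to each Word (a restatement of the
# logic above, not additional behaviour; the exact tag strings depend on the
# Whitaker's Words output):
#   word.stem  - the headword taken from the first column of the lookup line,
#   word.morph - the remaining columns of that line concatenated into a compact
#                tag (e.g. the literals "V51PRESACTIVEIND3P" and "NUM20XXXCARD"
#                used above), with "Late"/"Early" stripped and "N98XXM" -> "ADV",
#   word.case  - the inflectional ending split off after the "." in the
#                headword, or "" when there is none.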

@dataclass
class LatinTokenizationProcessWithPropers(Process):
    """Latin tokenization ``Process`` with extra enclitic exceptions for
    proper nouns whose endings would otherwise be split off as enclitics.
    """

    @cachedproperty
    def algorithm(self):
        return LatinWordTokenizer()

    def run(self, input_doc: Doc) -> Doc:
        output_doc = deepcopy(input_doc)
        output_doc.words = []
        tokenizer_obj = self.algorithm
        enclitics_exceptions = LatinWordTokenizer.EXCEPTIONS + [
            "beniamin", "mosen", "hegesian", "bitumen", "aaron", "aristomene",
            'disan', 'aran', 'lothan', 'amdan', 'amdan', 'esban', 'iethran',
            'charan', "restitue", "resen",
        ]
        tokens = tokenizer_obj.tokenize(
            output_doc.raw,
            enclitics_exceptions=enclitics_exceptions,
            enclitics=['que', 'n', 'ne', 'ue', 've', 'st'],
        )
        indices = tokenizer_obj.compute_indices(output_doc.raw, tokens)
        for index, token in enumerate(tokens):
            word_obj = Word(
                string=token,
                index_token=index,
                index_char_start=indices[index],
                index_char_stop=indices[index] + len(token),
            )
            output_doc.words.append(word_obj)
        return output_doc


pipe_morph = Pipeline(
    description="A custom Latin pipeline",
    processes=[
        LatinNormalizeProcess,
        LatinTokenizationProcessWithPropers,
        DefaultPunctuationRemovalProcess,
        LatinWhitakersWordsMorphology,
    ],
    language=get_lang("lat"),
)
nlp_morph = NLP(language='lat', custom_pipeline=pipe_morph, suppress_banner=True)


def process_line_morph(line):
    an = nlp_morph.analyze(line.translate(str.maketrans('', '', string.punctuation)).replace('(', '').replace(')', '').replace("“", "").replace("”", "").replace("—", ":"))
    output_line = ""
    for word in an:
        if not word:
            continue
        output_line += word.stem + (" " + word.morph + " " if word.morph != "" else " ")
    return output_line[:-1].replace("\n", "").replace("  ", " ")


def process_line_morph_simplified(line):
    an = nlp_morph.analyze(line.translate(str.maketrans('', '', string.punctuation)).replace('(', '').replace(')', '').replace("“", "").replace("”", "").replace("—", ":"))
    output_line = ""
    for word in an:
        if not word:
            continue
        output_line += word.stem + (" " + morph_simplifier.simplify_form(word.morph) + " " if word.morph != "" else " ")
    return output_line[:-1].replace("\n", "").replace("  ", " ")


def process_line_case(line):
    an = nlp_morph.analyze(line.translate(str.maketrans('', '', string.punctuation)).replace('(', '').replace(')', '').replace("“", "").replace("”", "").replace("—", ":"))
    output_line = ""
    for word in an:
        if not word:
            continue
        output_line += word.stem + (" CASE_" + word.case + " " if word.case != "" else " ")
    return output_line[:-1].replace("\n", "").replace("  ", " ")
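
# The three helpers above produce parallel encodings of a single input line for
# the different translation models (output shapes sketched from parse_word
# above; the concrete tags depend on Whitaker's Words):
#   process_line_morph            -> "stem TAG stem TAG ...", TAG being the
#                                    concatenated Whitaker's code
#   process_line_morph_simplified -> the same, with each tag reduced by
#                                    morph_simplifier.simplify_form
#   process_line_case             -> "stem CASE_<ending> ...", where <ending> is
#                                    the inflectional ending split off by
#                                    Whitaker's Words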

base_tokenizer = AutoTokenizer.from_pretrained("grosenthal/la_en_base")
morph_tokenizer = AutoTokenizer.from_pretrained("grosenthal/la_en_morphology")
morph_simplified_tokenizer = AutoTokenizer.from_pretrained("grosenthal/la_en_morph_simplified")
case_tokenizer = AutoTokenizer.from_pretrained("grosenthal/la_en_case")


def tokenize(tokenizer, text):
    split_text = tokenizer.tokenize(text, truncation=True, max_length=128)
    input_ids = tokenizer(text, truncation=True, max_length=128)['input_ids']
    return {
        "text": split_text,
        "ids": input_ids,
    }


tokenize_base = lambda t: tokenize(base_tokenizer, t)
tokenize_morph = lambda t: tokenize(morph_tokenizer, t)
tokenize_morph_simplified = lambda t: tokenize(morph_simplified_tokenizer, t)
tokenize_case = lambda t: tokenize(case_tokenizer, t)


def process_handler(text):
    print("in handler")
    morph_text = process_line_morph(text)
    morph_simplified_text = process_line_morph_simplified(text)
    case_text = process_line_case(text)
    return {
        'processed_texts': {
            'base': text,
            'morph': morph_text,
            'morph_simplified': morph_simplified_text,
            'split': case_text,
        },
        'tokenized': {
            'base': tokenize_base(text),
            'morph': tokenize_morph(morph_text),
            'morph_simplified': tokenize_morph_simplified(morph_simplified_text),
            'case': tokenize_case(case_text),
        },
    }


@app.get('/process/')
async def process(text: Optional[str] = None):
    if text is not None:
        result = process_handler(text)
        return json.dumps(result)
    # FastAPI ignores Flask-style "(body, status)" tuples, so return an explicit
    # 400 response when the required query parameter is missing.
    return JSONResponse(status_code=400, content={"error": "Missing required parameter 'text'"})


app.mount("/", StaticFiles(directory="static", html=True), name="static")


@app.get("/")
def index() -> FileResponse:
    return FileResponse(path="/app/static/index.html", media_type="text/html")
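
# Example usage (a sketch, assuming this module is saved as app.py, a static/
# directory exists next to it, and the Whitaker's Words binary is available at
# ./bin/words):
#
#   uvicorn app:app --host 0.0.0.0 --port 8000
#   curl "http://localhost:8000/process/?text=arma%20virumque%20cano"
#
# The response bundles the base, morphology-tagged, simplified-morphology and
# case-tagged versions of the input together with their subword tokenizations.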