import json
import re
import string
import subprocess
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional

from boltons.cacheutils import cachedproperty
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from transformers import AutoTokenizer

from cltk.alphabet.processes import LatinNormalizeProcess
from cltk.core.data_types import Doc, Pipeline, Process, Word
from cltk.languages.utils import get_lang
from cltk.nlp import NLP
from cltk.text.processes import DefaultPunctuationRemovalProcess
from cltk.tokenizers.lat.lat import LatinWordTokenizer

import morph_simplifier

app = FastAPI()


@dataclass
class LatinWhitakersWordsMorphology(Process):
    """A simple ``Process`` that derives the stem and morphological features
    of a Latin word using Whitaker's Words."""

    language: str = None

    @cachedproperty
    def algorithm(self):
        return None

    def parse_word(self, tup):
        """Extract stem, morphology tag, and case from one Whitaker's Words lookup block."""
        index, word_tup = tup
        word_obj, word_lookup = word_tup
        word_obj.word_lookup = word_lookup
        word_lookup = word_lookup.strip()
        if word_obj.string in [',', ":", "'", '"', ".", ";"] or "UNKNOWN" in word_lookup or "" == word_lookup:
            # Punctuation and unknown words get a placeholder analysis.
            word_obj.stem = word_obj.string
            word_obj.morph = "OTHER"
            word_obj.case = ""
        else:
            letter_swap = False
            # Non-standard Roman numerals such as XIIX.
            if "Bad Roman Numeral?" in word_lookup:
                word_obj.stem = word_obj.string
                word_obj.morph = "NUM20XXXCARD"
                word_obj.case = ""
                return word_obj
            if "WORD_EDIT" in word_lookup:
                letter_swap = True
                word_lookup = word_lookup.replace("WORD_EDIT\n", "")
            # Forms of sum/esse: the lookup line starts with '.'.
            if word_lookup[0] == '.':
                word_obj.stem = "esse"
                word_obj.morph = "".join(word_lookup.split("\n")[0].split(" ")[1:]).replace("Late", "").replace("Early", "").replace("N98XXM", "ADV")
                word_obj.case = word_obj.string
                return word_obj
            try:
                # e.g. alicuius - a "[XXXAO]" tag starts the second line.
                if word_lookup.split("\n")[1].strip()[0] == "[":
                    sp = word_lookup.split("\n")
                    word_lookup = sp[0] + "\n" + sp[2]
            except Exception as e:
                print(e)
            # i/j, u/v, d/t swaps: drop the two lines describing the edit.
            if word_lookup.split(" ")[0].split(".")[0] == "Word":
                word_lookup = "\n".join(word_lookup.split("\n")[2:])
                letter_swap = True
            # Cardinal numbers.
            if "CARD" in word_lookup and "." not in word_lookup.split(" ")[0]:
                word_obj.stem = word_lookup.split(" ")[0]
                word_obj.morph = "".join(word_lookup.split("\n")[0].split(" ")[1:]).replace("Late", "").replace("Early", "").replace("N98XXM", "ADV")
                word_obj.case = ""
                return word_obj
            # The looked-up form does not match the surface form (after normalizing
            # i/j, u/v, and syncopated perfects): handle the special markers injected
            # in run(), or skip ahead to the next lookup line.
            if word_lookup.split(" ")[0].replace(".", "").replace("ivi", "ii").replace("v", "u").replace("j", "i").strip().lower() != word_obj.string.lower().replace("j", "i").replace("v.i", "").replace("ivi", "ii").replace("-", "").replace("v", "u"):
                if word_lookup.split(" ")[0].replace(".", "").strip().lower() == 'special_replace':
                    word_obj.stem = word_obj.string
                    word_obj.morph = "V51PRESACTIVEIND3P"
                    word_obj.case = ""
                    return word_obj
                elif word_lookup.split(" ")[0].replace(".", "").strip().lower() == 'iri_special':
                    word_obj.stem = word_obj.string
                    word_obj.morph = "V31FUTPASSIVEINF0X"
                    word_obj.case = ""
                    return word_obj
                if index != self.l - 1 and not letter_swap:
                    try:
                        word_lookup = word_lookup.split("\n")[2]
                        word_obj.word_lookup = word_lookup
                    except Exception:
                        word_obj.word_lookup = word_lookup
            word_obj.stem = word_lookup.split(" ")[0].split(".")[0]
            word_obj.morph = "".join(word_lookup.split("\n")[0].split(" ")[1:]).replace("Late", "").replace("Early", "").replace("N98XXM", "ADV")
            word_obj.case = word_lookup.split(" ")[0].split(".")[1] if "." in word_lookup.split(" ")[0] else ""
        return word_obj

    def run(self, input_doc: Doc) -> Doc:
        output_doc = deepcopy(input_doc)
        output_doc.words = [word for word in output_doc.words if word is not None and word.string != '-']
        self.l = len(output_doc.words)
        # Pipe the whole sentence through the Whitaker's Words binary, strip the
        # banner/continuation noise from its output, rewrite multi-line analyses
        # (periphrastics, syncope, etc.) into the SPECIAL_REPLACE / WORD_EDIT /
        # IRI_SPECIAL markers that parse_word() understands, and split the result
        # into one lookup block per token.
        words = (
            re.sub(
                r"SUPINE \+ iri.*\n",
                "\n\nIRI_SPECIAL ",
                re.sub(
                    r"PPL\+sunt.*\n\nsum|Syncope s => vis *\n\n",
                    "",
                    "\n".join(
                        re.split(
                            r"\n=>|=>\n",
                            subprocess.check_output(
                                ["./words"],
                                input=" ".join([word.string.replace("j", "i") for word in output_doc.words]),
                                cwd='./bin/',
                                text=True,
                            ),
                            maxsplit=1,
                        )[1].split("\n")[:-6]
                    )
                    .replace("MORE - hit RETURN/ENTER to continue\nUnexpected exception in PAUSE", "")
                    .replace("\n*", '\n')
                    .replace("PERF PASSIVE PPL + verb TO_BE => PASSIVE perfect system", "\n\nSPECIAL_REPLACE")
                    .replace("FUT PASSIVE PPL + esse => PRES PASSIVE INF", "\n\nSPECIAL_REPLACE")
                    .replace("\nFUT PASSIVE PPL + verb TO_BE => PASSIVE Periphrastic - should/ought/had to", "\n\nSPECIAL_REPLACE")
                    .replace("\nFUT ACTIVE PPL + verb TO_BE => ACTIVE Periphrastic - about to, going to", "\n\nSPECIAL_REPLACE")
                    .replace("\nFUT PASSIVE PPL + esse => PASSIVE Periphrastic - should/ought/had to", "\n\nSPECIAL_REPLACE")
                    .replace("\nFUT ACT PPL+fuisse => PERF ACT INF Periphrastic - to have been about/going to", "\n\nSPECIAL_REPLACE")
                    .replace("\nFUT PASSIVE PPL + fuisse => PERF PASSIVE INF Periphrastic - about to, going to", "\n\nSPECIAL_REPLACE")
                    .replace("\nFUT ACTIVE PPL + esse => ACTIVE Periphrastic - about to, going to", "\n\nSPECIAL_REPLACE")
                    .replace("\nFUT ACTIVE PPL + esse => PRES Periphastic/FUT ACTIVE INF - be about/going to", "\n\nSPECIAL_REPLACE")
                    .replace("Syncope s => vis\n\n", "WORD_EDIT")
                    .replace("Syncope s => vis \n\n", "WORD_EDIT")
                    .replace("\nSyncope ii => ivi \nSyncopated perfect ivi can drop 'v' without contracting vowel", "WORD_EDIT")
                    .replace("Syncope s => vis \nSyncopated perfect often drops the 'v' and contracts vowel", "WORD_EDIT")
                    .replace("\nPERF PASSIVE PPL + esse => PERF PASSIVE INF", "\n\nSPECIAL_REPLACE"),
                ),
            )
            .replace("\nSlur sub/su~ \nAn initial 'sub' may be rendered by su~", "WORD_EDIT")
            .replace("\nSyncope r => v.r \n\n", "WORD_EDIT")
            .split("\n\n")
        )
        # Note: this returns a plain list of parsed Word objects rather than a Doc;
        # the helper functions below iterate over it directly.
        output_tokens = list(map(self.parse_word, enumerate(zip(output_doc.words, words))))
        return output_tokens


@dataclass
class LatinTokenizationProcessWithPropers(Process):
    """Latin tokenization with extra enclitic exceptions for proper nouns."""

    @cachedproperty
    def algorithm(self):
        return LatinWordTokenizer()

    def run(self, input_doc: Doc) -> Doc:
        output_doc = deepcopy(input_doc)
        output_doc.words = []
        tokenizer_obj = self.algorithm
        enclitics_exceptions = LatinWordTokenizer.EXCEPTIONS + [
            "beniamin", "mosen", "hegesian", "bitumen", "aaron", "aristomene",
            'disan', 'aran', 'lothan', 'amdan', 'amdan', 'esban', 'iethran',
            'charan', "restitue", "resen",
        ]
        tokens = tokenizer_obj.tokenize(
            output_doc.raw,
            enclitics_exceptions=enclitics_exceptions,
            enclitics=['que', 'n', 'ne', 'ue', 've', 'st'],
        )
        indices = tokenizer_obj.compute_indices(output_doc.raw, tokens)
        for index, token in enumerate(tokens):
            word_obj = Word(
                string=token,
                index_token=index,
                index_char_start=indices[index],
                index_char_stop=indices[index] + len(token),
            )
            output_doc.words.append(word_obj)
        return output_doc


pipe_morph = Pipeline(
    description="A custom Latin pipeline",
    processes=[
        LatinNormalizeProcess,
        LatinTokenizationProcessWithPropers,
        DefaultPunctuationRemovalProcess,
        LatinWhitakersWordsMorphology,
    ],
    language=get_lang("lat"),
)
nlp_morph = NLP(language='lat', custom_pipeline=pipe_morph, suppress_banner=True)
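# Illustrative use of the pipeline above (a sketch, not executed at import time).
# It assumes the Whitaker's Words binary has been compiled and is available as
# ./bin/words relative to the working directory:
#
#     parsed = nlp_morph.analyze("arma virumque cano")
#     for word in parsed:
#         print(word.string, word.stem, word.morph, word.case)
#
# Each entry is a CLTK Word carrying the stem, morphology tag, and case produced
# by LatinWhitakersWordsMorphology.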
def process_line_morph(line):
    """Return "stem MORPH_TAG" pairs for every word in the line."""
    an = nlp_morph.analyze(line.translate(str.maketrans('', '', string.punctuation)).replace('(', '').replace(')', '').replace("“", "").replace("”", "").replace("—", ":"))
    output_line = ""
    for word in an:
        if not word:
            continue
        output_line += word.stem + (" " + word.morph + " " if word.morph != "" else " ")
    # Strip the trailing space and collapse any double spaces.
    return output_line[:-1].replace("\n", "").replace("  ", " ")


def process_line_morph_simplified(line):
    """Like process_line_morph, but with simplified morphology tags."""
    an = nlp_morph.analyze(line.translate(str.maketrans('', '', string.punctuation)).replace('(', '').replace(')', '').replace("“", "").replace("”", "").replace("—", ":"))
    output_line = ""
    for word in an:
        if not word:
            continue
        output_line += word.stem + (" " + morph_simplifier.simplify_form(word.morph) + " " if word.morph != "" else " ")
    return output_line[:-1].replace("\n", "").replace("  ", " ")


def process_line_case(line):
    """Return "stem CASE_<case>" pairs for every word in the line."""
    an = nlp_morph.analyze(line.translate(str.maketrans('', '', string.punctuation)).replace('(', '').replace(')', '').replace("“", "").replace("”", "").replace("—", ":"))
    output_line = ""
    for word in an:
        if not word:
            continue
        output_line += word.stem + (" CASE_" + word.case + " " if word.case != "" else " ")
    return output_line[:-1].replace("\n", "").replace("  ", " ")


base_tokenizer = AutoTokenizer.from_pretrained("grosenthal/la_en_base")
morph_tokenizer = AutoTokenizer.from_pretrained("grosenthal/la_en_morphology")
morph_simplified_tokenizer = AutoTokenizer.from_pretrained("grosenthal/la_en_morph_simplified")
case_tokenizer = AutoTokenizer.from_pretrained("grosenthal/la_en_case")


def tokenize(tokenizer, text):
    """Tokenize text, returning both the subword strings and the input ids."""
    split_text = tokenizer.tokenize(text, truncation=True, max_length=128)
    input_ids = tokenizer(text, truncation=True, max_length=128)['input_ids']
    return {
        "text": split_text,
        "ids": input_ids,
    }


tokenize_base = lambda t: tokenize(base_tokenizer, t)
tokenize_morph = lambda t: tokenize(morph_tokenizer, t)
tokenize_morph_simplified = lambda t: tokenize(morph_simplified_tokenizer, t)
tokenize_case = lambda t: tokenize(case_tokenizer, t)


def process_handler(text):
    print("in handler")
    morph_text = process_line_morph(text)
    morph_simplified_text = process_line_morph_simplified(text)
    case_text = process_line_case(text)
    return {
        'processed_texts': {
            'base': text,
            'morph': morph_text,
            'morph_simplified': morph_simplified_text,
            'split': case_text,
        },
        'tokenized': {
            'base': tokenize_base(text),
            'morph': tokenize_morph(morph_text),
            'morph_simplified': tokenize_morph_simplified(morph_simplified_text),
            'case': tokenize_case(case_text),
        },
    }


@app.get('/process/')
async def process(text: Optional[str] = None):
    if text is not None:
        result = process_handler(text)
        return json.dumps(result)
    # FastAPI does not interpret Flask-style "(body, status)" tuples,
    # so return an explicit 400 response instead.
    return JSONResponse(status_code=400, content={"error": "Missing required parameter 'text'"})
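# Sketch of how the service might be run and queried locally (the module name
# "main" and the port are assumptions; adjust to the actual filename/deployment):
#
#     uvicorn main:app --port 8000
#     curl "http://localhost:8000/process/?text=Gallia%20est%20omnis%20divisa%20in%20partes%20tres"
#
# The endpoint responds with a JSON-encoded string holding the base, morph,
# morph_simplified, and case variants of the text plus their tokenizations.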