Spaces:

aimlnerd
/

legal-entity-ner-transformers

Runtime error

App Files Files Community

aimlnerd committed on Jan 5, 2024

Commit

1d6faef

1 Parent(s): b9d5349

add

Browse files

Files changed (12) hide show

Dockerfile +14 -0
README.md +4 -5
configuration/.gitkeep +0 -0
configuration/__init__.py +0 -0
configuration/config.py +94 -0
source/services/ner/__init__.py +0 -0
source/services/ner/model/hf_tokenclassification/.gitkeep +0 -0
source/services/ner/steps/steps.py +106 -0
source/services/ner/train/__init__.py +0 -0
source/services/ner/utils/__init__.py +0 -0
tests/.gitkeep +0 -0
tests/ner/.gitkeep +0 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,14 @@

+# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+# Base image: the official Python 3.12.1 image.
+FROM python:3.12.1
+# Every relative path below resolves against /code inside the image.
+WORKDIR /code
+# Copy only the dependency list first, so the pip layer below is rebuilt
+# when requirements.txt changes rather than on every source edit.
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# Copy the rest of the build context into /code.
+COPY . .
+# Serve the ASGI object `app` from module `app.main` on all interfaces, port 7860.
+# NOTE(review): this commit does not add an app/ package - confirm app/main.py exists in the repository.
+# NOTE(review): configuration/config.py loads the spaCy model 'nl_core_news_md' at import time -
+# confirm requirements.txt installs it.
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,11 +1,10 @@
 ---
-title: Legal Entity Ner Transformers
-emoji: 🌖
-colorFrom: green
-colorTo: yellow
 sdk: docker
 pinned: false
-license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Legal Entity Ner Crf
+emoji: 🏆
+colorFrom: red
+colorTo: gray
 sdk: docker
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

configuration/.gitkeep ADDED Viewed

File without changes

configuration/__init__.py ADDED Viewed

File without changes

configuration/config.py ADDED Viewed

	@@ -0,0 +1,94 @@

+from functools import lru_cache
+from typing import ClassVar
+
+import spacy
+from pydantic import BaseSettings, Field
+
+from source.datamodel.annotation_ranking import Weights, WeightCatalog
+from source.datamodel.common import CountryCode, LineOfBusiness
+class Settings(BaseSettings):
+    """Application settings: server parameters, model artefact paths and NER options.
+
+    Every annotated attribute is a pydantic settings field with the default
+    shown here. ``spacy_pretrained_model_nl_md`` is deliberately a ``ClassVar``:
+    it is a loaded pipeline object, not a configurable value, so it must stay
+    out of pydantic's field collection, validation and default copying. It is
+    still reachable as ``settings.spacy_pretrained_model_nl_md``.
+    """
+
+    # --- server ---------------------------------------------------------
+    SERVER_HOST: str = '0.0.0.0'
+    PORT: int = 3000
+    # Explicit annotations keep these two consistent with every other field.
+    STOP_TIMEOUT: int = 120
+    SLEEP_DURATION: float = 1e-4  # 0.1 ms sleep
+    APP_NAME: str = "MIRA MODELS"
+
+    # --- model artefact locations ---------------------------------------
+    MIRA_MODELS_BLOB_PATH: str = "Mira/ml_models"
+    LOCAL_MIRA_MODELS: str = "ml_models"
+    MIRA_INTENT_MODEL: str = "ml_models/intent_classifier/2021-04-09"
+    MARINE_NL_NER_MODEL: str = "ml_models/ner_marine_nl/2021-04-09"
+    MARINE_NL_RB_MODEL: str = "ml_models/ner_marine_nl/rule_based_annotator/rb_annotator.pkl"
+    PROPERTY_NL_NER_MODEL: str = "ml_models/ner_property_nl/ner_v10"
+    PROPERTY_BE_NER_MODEL: str = "ml_models/ner_property_be/ner_v10"
+    # These two name their environment variable explicitly via Field(env=...).
+    PROPERTY_BE_UW_MODEL: str = Field("ml_models/ner_property_be/uw_property_be_dev", env='PROPERTY_BE_UW_MODEL')
+    PROPERTY_NL_UW_MODEL: str = Field("ml_models/ner_property_nl/uw_property_nl_dev", env='PROPERTY_NL_UW_MODEL')
+    ADDRESS_DETECTION_LAXONS: str = "ml_models/address_detection/laxons.json"
+    ADDRESS_DETECTION_TERMS: str = "ml_models/address_detection/terms.json"
+    ADDRESS_DETECTION_BROKER_ADDRESSES: str = "ml_models/address_detection/broker_addresses.json"
+    LAYOUTLM_MODEL: str = "ml_models/layoutlm/layoutlm_model.pth"
+    LAYOUTLM_LABEL_MAPPING: str = "ml_models/layoutlm/labels_mapping.json"
+    LAYOUTLM_TOKENIZER: str = "ml_models/layoutlm/tokenizer"
+    ADDRESS_DETECTION_MAX_LEN: int = 60
+    ADDRESS_INDEX_MIN: int = 40
+    DEEPPARSE_ROOT_DIR: str = "ml_models/deepparse"
+    TSI_THRESHOLD: int = 100000
+
+    # --- CRF models: file path plus the index of each feature inside a token tuple
+    BROKER_MODEL: dict = {
+        'CRF_BROKER_MODEL_PATH': r"source/services/ner_crf/model/crf/30_Nov_2023-14h-broker_pycrf.crfsuite",
+        'WORD_POSITION': 1,
+        #'POS_POSITION': 2,
+        'LEMMA_POSITION': 2,
+        #'NER_POSITION': 3
+        }
+    si_model: dict = {
+        'CRF_SI_MODEL_PATH': r"ml_models/si/crf_23_Jun_2022-11h_inclu_lemma_n_amount_with_eur_gt10k_amount.joblib",
+        'WORD_POSITION': 1,
+        'LEMMA_POSITION': 2,
+        'NER_POSITION': 3,
+        'POS_POSITION': 4
+        }
+
+    # --- shared spaCy pipeline --------------------------------------------
+    # Loaded once, when this class body executes (i.e. at module import).
+    #spacy_pretrained_model_nl_sm = spacy.load('nl_core_news_sm')
+    spacy_pretrained_model_nl_md: ClassVar[spacy.language.Language] = spacy.load('nl_core_news_md')
+
+    # --- LayoutLM options -------------------------------------------------
+    layoutlm_config: dict = {'local_rank': -1,
+                             'overwrite_cache': True,
+                             'max_seq_length': 512,
+                             'model_type': 'layoutlm',
+                             'cls_token_box': [0, 0, 0, 0],
+                             'sep_token_box': [1000, 1000, 1000, 1000],
+                             'pad_token_box': [0, 0, 0, 0]}
+def loss_ratio_params():
+    """Return the connection parameters for the claim-experience risk-level service.
+
+    Each value can be overridden through an environment variable; when the
+    variable is unset the previous hard-coded value is used, so existing
+    callers see no change.
+
+    Returns:
+        tuple: ``(url, login, pw)`` as three strings.
+    """
+    import os
+
+    url = os.environ.get("LOSS_RATIO_URL", "http://0.0.0.0:3000/claim-experience-risk-level/")
+    login = os.environ.get("LOSS_RATIO_LOGIN", "clerk")
+    # SECURITY: the fallback below is a credential committed to source control.
+    # Set LOSS_RATIO_PASSWORD in any real deployment and rotate this value.
+    pw = os.environ.get("LOSS_RATIO_PASSWORD", "asdfgh")
+    return url, login, pw
+@lru_cache()
+def get_weight_catalog():
+    """Build the annotation-ranking weight catalog.
+
+    For the property line of business, registers a 'POLICYHOLDER' and a
+    'BROKER' ``Weights`` entry for Belgium and then for the Netherlands.
+    The function takes no arguments, so ``lru_cache`` returns the catalog
+    built on the first call for every later call.
+
+    Returns:
+        WeightCatalog: the populated catalog.
+    """
+    catalog = WeightCatalog()
+    # (subject, body, attachment) per entity label; identical for both countries.
+    # NOTE(review): the BROKER weights add up to 0.9 while POLICYHOLDER adds up to 1.0 - confirm this is intended.
+    entity_weights = (
+        ('POLICYHOLDER', (0.7, 0.2, 0.1)),
+        ('BROKER', (0.1, 0.6, 0.2)),
+    )
+    # PROPERTY BE WEIGHTS first, then PROPERTY NL WEIGHTS
+    for country in (CountryCode.belgium, CountryCode.netherlands):
+        for entity, (subject, body, attachment) in entity_weights:
+            catalog.set_weights(
+                LineOfBusiness.property, country, entity,
+                Weights(subject=subject, body=body, attachment=attachment))
+    return catalog
+@lru_cache()
+def get_settings():
+    """Return the application ``Settings``.
+
+    The function takes no arguments, so ``lru_cache`` builds ``Settings()``
+    on the first call and hands the same instance back afterwards.
+    """
+    return Settings()
+# Instantiate the settings
+# (this runs at import time, so importing the module constructs Settings immediately)
+settings = get_settings()

source/services/ner/__init__.py ADDED Viewed

File without changes

source/services/ner/model/hf_tokenclassification/.gitkeep ADDED Viewed

File without changes

source/services/ner/steps/steps.py ADDED Viewed

	@@ -0,0 +1,106 @@

+from typing import List, Union
+from spacy.tokens import Doc
+from configuration.config import settings
+from spacy.training import docs_to_json
+import pycrfsuite
+import srsly
+# spaCy pipeline reused from the settings object rather than loaded again in this module.
+nlp = settings.spacy_pretrained_model_nl_md
+# Token keys kept when convert_spacybilou_to_crfsuitebilou flattens spaCy's JSON export.
+DOC2JSON_FT = ["id", "orth", "lemma", "ner"]
+def get_entity_prediction(lst_token: List, pred: List) -> List:
+    """Collect the character span and label of every token not tagged 'O'.
+
+    Only the first document of each argument is read (``lst_token[0]`` and
+    ``pred[0]``); the two sequences are paired positionally with ``zip``.
+
+    Args:
+        lst_token: list of documents, each a list of token tuples whose
+            items 1 and 2 are the start and end character offsets.
+        pred: list of documents, each a list of predicted tag strings.
+
+    Returns:
+        List of ``(start, end, label)`` tuples, where ``label`` is the tag
+        with its first two characters removed (presumably the BILOU prefix
+        such as ``"B-"`` - confirm against the model's tag set).
+        Empty when either argument is empty.
+    """
+    # Guard the [0] lookups below: an empty batch has no first document.
+    if not lst_token or not pred:
+        return []
+    return [
+        (token[1], token[2], tag[2:])
+        for token, tag in zip(lst_token[0], pred[0])
+        if tag != 'O'
+    ]
+def format_prediction(offsets: List, text: str, **kwargs) -> List:
+    """Turn ``(start, end, label)`` spans into prediction dictionaries.
+
+    Args:
+        offsets: sequences whose items 0, 1 and 2 are start, end and label.
+        text: the string the offsets index into.
+        **kwargs: ``model_name`` (optional) is reported as the ``source``;
+            when absent or falsy the source is ``"crf-broker"``.
+
+    Returns:
+        One dict per span with keys ``text``, ``start``, ``end``, ``label``,
+        ``source`` and a fixed ``score`` of 1.0.
+    """
+    source = kwargs.get('model_name') or "crf-broker"
+
+    def _as_record(span):
+        # Index rather than unpack so spans carrying extra items are still accepted.
+        begin, stop = span[0], span[1]
+        return {
+            "text": text[begin:stop],
+            "start": begin,
+            "end": stop,
+            "label": span[2],
+            "source": source,
+            "score": 1.0,
+        }
+
+    return [_as_record(span) for span in offsets]
+def convert_spacybilou_to_crfsuitebilou(doc2json, save_path=False):
+    """Flatten spaCy's JSON export into one list of token tuples per paragraph.
+
+    The result has the shape ``[[(...), (...), ...], [(...), ...], ...]``:
+    each entry of ``doc2json['paragraphs']`` becomes one list, with the
+    tokens of all its sentences concatenated, so the data is at document
+    level rather than sentence level. Each token dict is reduced to the
+    values of the keys listed in ``DOC2JSON_FT``, in the order in which
+    those keys appear in the token dict.
+
+    Example token dict:
+            {
+              "id":10,
+              "orth":"Belgium",
+              "space":"",
+              "tag":"SPEC|deeleigen",
+              "pos":"PROPN",
+              "morph":"",
+              "lemma":"belgium",
+              "head":-1,
+              "dep":"flat",
+              "ner":"O"
+            }
+
+    Args:
+        doc2json: mapping with a 'paragraphs' list; each paragraph has
+            'sentences', each sentence has 'tokens'.
+        save_path: when truthy, the result is also written to this path
+            with ``srsly.write_json``.
+
+    Returns:
+        The list of per-paragraph token-tuple lists.
+    """
+    lst_crf_docs = []
+    for paragraph in doc2json['paragraphs']:
+        document_tokens = []
+        for sentence in paragraph['sentences']:
+            for token in sentence['tokens']:
+                kept_values = tuple(
+                    value for key, value in token.items() if key in DOC2JSON_FT
+                )
+                document_tokens.append(kept_values)
+        lst_crf_docs.append(document_tokens)
+    if save_path:
+        srsly.write_json(save_path, lst_crf_docs)
+    return lst_crf_docs
+def create_raw_data(input_text: str) -> tuple[List, List, Doc]:
+    """Run the spaCy pipeline on ``input_text`` and prepare the CRF inputs.
+
+    Args:
+        input_text: raw text to analyse.
+
+    Returns:
+        A 3-tuple of:
+        - the token-tuple lists produced by
+          ``convert_spacybilou_to_crfsuitebilou`` from the document's JSON export;
+        - a one-element list holding ``(text, start, end)`` for every token,
+          where ``end`` is ``start + len(text)``;
+        - the spaCy ``Doc`` itself.
+    """
+    doc = nlp(input_text)
+    # Character offsets per token, used later to map predictions back onto the text.
+    lst_tokens = [
+        (token.text, token.idx, token.idx + len(token.text)) for token in doc
+    ]
+    doc_json = docs_to_json(doc)
+    lst_data = convert_spacybilou_to_crfsuitebilou(doc2json=doc_json)
+    return lst_data, [lst_tokens], doc
+def token_feature_engineering(raw_data: List, tokenfeatures) -> List:
+    """Apply ``tokenfeatures.sent2features`` to every sequence in ``raw_data``.
+
+    Args:
+        raw_data: list of token sequences.
+        tokenfeatures: object exposing ``sent2features(sequence)``.
+
+    Returns:
+        One ``sent2features`` result per input sequence, in input order.
+    """
+    return [tokenfeatures.sent2features(sequence) for sequence in raw_data]
+def load_crf_model(path: str) -> pycrfsuite.Tagger:
+    """Create a ``pycrfsuite.Tagger``, open the model file at ``path`` and return it.
+
+    Args:
+        path: location of the CRFsuite model file.
+
+    Returns:
+        The tagger with the model opened.
+    """
+    crf_tagger = pycrfsuite.Tagger()
+    crf_tagger.open(path)
+    return crf_tagger
+def predictor(tagger, x: List) -> List:
+    """Tag every feature sequence in ``x``.
+
+    Args:
+        tagger: object exposing ``tag(sequence)`` (a CRF tagger).
+        x: list of feature sequences.
+
+    Returns:
+        One ``tagger.tag`` result per input sequence, in input order.
+    """
+    return [tagger.tag(sequence) for sequence in x]

source/services/ner/train/__init__.py ADDED Viewed

File without changes

source/services/ner/utils/__init__.py ADDED Viewed

File without changes

tests/.gitkeep ADDED Viewed

File without changes

tests/ner/.gitkeep ADDED Viewed

File without changes