Spaces:
Runtime error
Runtime error
add
Browse files- Dockerfile +14 -0
- README.md +4 -5
- configuration/.gitkeep +0 -0
- configuration/__init__.py +0 -0
- configuration/config.py +94 -0
- source/services/ner/__init__.py +0 -0
- source/services/ner/model/hf_tokenclassification/.gitkeep +0 -0
- source/services/ner/steps/steps.py +106 -0
- source/services/ner/train/__init__.py +0 -0
- source/services/ner/utils/__init__.py +0 -0
- tests/.gitkeep +0 -0
- tests/ner/.gitkeep +0 -0
Dockerfile
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
2 |
+
# you will also find guides on how best to write your Dockerfile
|
3 |
+
|
4 |
+
FROM python:3.12.1
|
5 |
+
|
6 |
+
WORKDIR /code
|
7 |
+
|
8 |
+
COPY ./requirements.txt /code/requirements.txt
|
9 |
+
|
10 |
+
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
|
11 |
+
|
12 |
+
COPY . .
|
13 |
+
|
14 |
+
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
@@ -1,11 +1,10 @@
|
|
1 |
---
|
2 |
-
title: Legal Entity Ner
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
-
license: apache-2.0
|
9 |
---
|
10 |
|
11 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Legal Entity Ner Crf
|
3 |
+
emoji: π
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: gray
|
6 |
sdk: docker
|
7 |
pinned: false
|
|
|
8 |
---
|
9 |
|
10 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
configuration/.gitkeep
ADDED
File without changes
|
configuration/__init__.py
ADDED
File without changes
|
configuration/config.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import spacy
|
2 |
+
from functools import lru_cache
|
3 |
+
from pydantic import BaseSettings, Field
|
4 |
+
from source.datamodel.common import CountryCode, LineOfBusiness
|
5 |
+
from source.datamodel.annotation_ranking import Weights, WeightCatalog
|
6 |
+
|
7 |
+
|
8 |
+
class Settings(BaseSettings):
    """Central configuration: server parameters, model artefact paths and model options.

    The class derives from pydantic's ``BaseSettings``; two of the fields
    (``PROPERTY_BE_UW_MODEL`` and ``PROPERTY_NL_UW_MODEL``) explicitly name
    the environment variable they are read from via ``Field(..., env=...)``.
    All paths are plain strings and are not checked for existence here.
    """

    # --- Server / runtime ---
    SERVER_HOST: str = '0.0.0.0'
    PORT: int = 3000
    STOP_TIMEOUT: int = 120
    SLEEP_DURATION: float = 1e-4  # 0.1 ms sleep
    APP_NAME: str = "MIRA MODELS"

    # --- Model artefact locations ---
    MIRA_MODELS_BLOB_PATH: str = "Mira/ml_models"
    LOCAL_MIRA_MODELS: str = "ml_models"
    MIRA_INTENT_MODEL: str = "ml_models/intent_classifier/2021-04-09"
    MARINE_NL_NER_MODEL: str = "ml_models/ner_marine_nl/2021-04-09"
    MARINE_NL_RB_MODEL: str = "ml_models/ner_marine_nl/rule_based_annotator/rb_annotator.pkl"
    PROPERTY_NL_NER_MODEL: str = "ml_models/ner_property_nl/ner_v10"
    PROPERTY_BE_NER_MODEL: str = "ml_models/ner_property_be/ner_v10"
    PROPERTY_BE_UW_MODEL: str = Field("ml_models/ner_property_be/uw_property_be_dev", env='PROPERTY_BE_UW_MODEL')
    PROPERTY_NL_UW_MODEL: str = Field("ml_models/ner_property_nl/uw_property_nl_dev", env='PROPERTY_NL_UW_MODEL')
    ADDRESS_DETECTION_LAXONS: str = "ml_models/address_detection/laxons.json"
    ADDRESS_DETECTION_TERMS: str = "ml_models/address_detection/terms.json"
    ADDRESS_DETECTION_BROKER_ADDRESSES: str = "ml_models/address_detection/broker_addresses.json"
    LAYOUTLM_MODEL: str = "ml_models/layoutlm/layoutlm_model.pth"
    LAYOUTLM_LABEL_MAPPING: str = "ml_models/layoutlm/labels_mapping.json"
    LAYOUTLM_TOKENIZER: str = "ml_models/layoutlm/tokenizer"

    # --- Thresholds and limits ---
    ADDRESS_DETECTION_MAX_LEN: int = 60
    ADDRESS_INDEX_MIN: int = 40
    DEEPPARSE_ROOT_DIR: str = "ml_models/deepparse"
    TSI_THRESHOLD: int = 100000

    # --- CRF models: model file plus the tuple index of each token attribute ---
    BROKER_MODEL: dict = {
        'CRF_BROKER_MODEL_PATH': r"source/services/ner_crf/model/crf/30_Nov_2023-14h-broker_pycrf.crfsuite",
        'WORD_POSITION': 1,
        # 'POS_POSITION': 2,
        'LEMMA_POSITION': 2,
        # 'NER_POSITION': 3
    }
    si_model: dict = {
        'CRF_SI_MODEL_PATH': r"ml_models/si/crf_23_Jun_2022-11h_inclu_lemma_n_amount_with_eur_gt10k_amount.joblib",
        'WORD_POSITION': 1,
        'LEMMA_POSITION': 2,
        'NER_POSITION': 3,
        'POS_POSITION': 4
    }

    # --- spaCy pipeline ---
    # The pipeline is loaded while the class body executes, i.e. when this
    # module is imported; the 'nl_core_news_md' package must be installed.
    # NOTE(review): this attribute has no annotation and holds a spaCy object;
    # pydantic may require `arbitrary_types_allowed` for it - confirm.
    # spacy_pretrained_model_nl_sm = spacy.load('nl_core_news_sm')
    spacy_pretrained_model_nl_md = spacy.load('nl_core_news_md')

    # --- LayoutLM options ---
    layoutlm_config: dict = {
        'local_rank': -1,
        'overwrite_cache': True,
        'max_seq_length': 512,
        'model_type': 'layoutlm',
        'cls_token_box': [0, 0, 0, 0],
        'sep_token_box': [1000, 1000, 1000, 1000],
        'pad_token_box': [0, 0, 0, 0],
    }
|
56 |
+
|
57 |
+
|
58 |
+
def loss_ratio_params():
    """Return the endpoint and credentials of the claim-experience risk-level service.

    Each value may be overridden with an environment variable
    (``LOSS_RATIO_URL``, ``LOSS_RATIO_LOGIN``, ``LOSS_RATIO_PASSWORD``) so
    that deployment-specific credentials do not have to be committed. When a
    variable is not set, the previously hard-coded value is returned, so
    existing callers keep receiving exactly the same tuple.

    Returns:
        tuple: ``(url, login, pw)`` as three strings.
    """
    import os  # local import keeps the block self-contained

    url = os.environ.get("LOSS_RATIO_URL", "http://0.0.0.0:3000/claim-experience-risk-level/")
    login = os.environ.get("LOSS_RATIO_LOGIN", "clerk")
    # NOTE(review): the fallback below is still a credential stored in source;
    # it is kept only for backward compatibility and should be removed once
    # every deployment sets LOSS_RATIO_PASSWORD.
    pw = os.environ.get("LOSS_RATIO_PASSWORD", "asdfgh")
    return url, login, pw
|
63 |
+
|
64 |
+
|
65 |
+
@lru_cache()
def get_weight_catalog():
    """Build the annotation-ranking weight catalog (cached after the first call).

    For the property line of business, the same POLICYHOLDER and BROKER
    weights are registered for Belgium first and then for the Netherlands.

    Returns:
        WeightCatalog: the populated catalog.
    """
    catalog = WeightCatalog()

    # Both countries currently share identical weights, registered in the
    # order Belgium -> Netherlands, POLICYHOLDER -> BROKER.
    for country in (CountryCode.belgium, CountryCode.netherlands):
        catalog.set_weights(
            LineOfBusiness.property, country, 'POLICYHOLDER',
            Weights(subject=0.7, body=0.2, attachment=0.1))
        # NOTE(review): the BROKER weights add up to 0.9 whereas the
        # POLICYHOLDER weights add up to 1.0 - confirm this is intended.
        catalog.set_weights(
            LineOfBusiness.property, country, 'BROKER',
            Weights(subject=0.1, body=0.6, attachment=0.2))

    return catalog
|
86 |
+
|
87 |
+
|
88 |
+
@lru_cache()
def get_settings():
    """Create the ``Settings`` object on first use and return the cached instance afterwards."""
    instance = Settings()
    return instance
|
91 |
+
|
92 |
+
|
93 |
+
# Instantiate the settings
|
94 |
+
settings = get_settings()
|
source/services/ner/__init__.py
ADDED
File without changes
|
source/services/ner/model/hf_tokenclassification/.gitkeep
ADDED
File without changes
|
source/services/ner/steps/steps.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Union
|
2 |
+
|
3 |
+
from spacy.tokens import Doc
|
4 |
+
from configuration.config import settings
|
5 |
+
from spacy.training import docs_to_json
|
6 |
+
import pycrfsuite
|
7 |
+
|
8 |
+
import srsly
|
9 |
+
|
10 |
+
nlp = settings.spacy_pretrained_model_nl_md
|
11 |
+
|
12 |
+
DOC2JSON_FT = ["id", "orth", "lemma", "ner"]
|
13 |
+
|
14 |
+
|
15 |
+
def get_entity_prediction(lst_token: List, pred: List) -> List:
    """Pair the tokens of the first document with their predicted tags.

    Only the first document (index 0) of both arguments is inspected. Tokens
    tagged ``'O'`` are dropped; for every other token a tuple built from
    ``token[1]``, ``token[2]`` and the tag without its first two characters
    is emitted (presumably start offset, end offset and the label with its
    ``B-``/``I-`` style prefix removed - confirm against the tagger output).

    Args:
        lst_token: list of documents, each a list of token tuples.
        pred: list of documents, each a list of predicted tag strings.

    Returns:
        A list of ``(token[1], token[2], label)`` tuples; empty when either
        argument is empty (previously this raised ``IndexError``).
    """
    # Guard against empty input instead of failing on the [0] lookups below.
    if not lst_token or not pred:
        return []

    return [
        (token[1], token[2], tag[2:])
        for token, tag in zip(lst_token[0], pred[0])
        if tag != 'O'
    ]
|
21 |
+
|
22 |
+
|
23 |
+
def format_prediction(offsets: List, text: str, **kwargs) -> List:
    """Turn ``(start, end, label)`` offsets into prediction dictionaries.

    Args:
        offsets: sequence of items whose elements 0, 1 and 2 are the start
            index, end index and label of a predicted span.
        text: the text the offsets refer to; it is sliced with start/end.
        **kwargs: optional ``model_name`` used as the ``"source"`` value;
            when missing or falsy, ``"crf-broker"`` is used.

    Returns:
        One dict per offset with the keys ``text``, ``start``, ``end``,
        ``label``, ``source`` and a fixed ``score`` of 1.0.
    """
    source_name = kwargs.get('model_name') or "crf-broker"

    return [
        {
            "text": text[span[0]:span[1]],
            "start": span[0],
            "end": span[1],
            "label": span[2],
            "source": source_name,
            "score": 1.0,
        }
        for span in offsets
    ]
|
39 |
+
|
40 |
+
|
41 |
+
def convert_spacybilou_to_crfsuitebilou(doc2json, save_path=False):
    """Convert spaCy ``docs_to_json`` output into the nested-list layout fed to the CRF.

    The result has the shape ``[[(), (), ...], [(), (), ...], ...]``: one
    inner list per entry of ``doc2json['paragraphs']`` (one document, e.g.
    one email or one attachment), containing one tuple per token. Sentences
    are flattened, so the model is fed at document level, not sentence level.

    Each tuple holds the values of the token keys listed in ``DOC2JSON_FT``,
    in the order in which they appear in the token dictionary. Example of a
    token dictionary:

        {
            "id":10,
            "orth":"Belgium",
            "space":"",
            "tag":"SPEC|deeleigen",
            "pos":"PROPN",
            "morph":"",
            "lemma":"belgium",
            "head":-1,
            "dep":"flat",
            "ner":"O"
        }

    Args:
        doc2json: dictionary with a ``'paragraphs'`` list whose items have
            ``'sentences'``, each with a ``'tokens'`` list of dictionaries.
        save_path: when truthy, the result is also written to this path
            with ``srsly.write_json``.

    Returns:
        The list of documents described above.
    """
    crf_documents = []
    for paragraph in doc2json['paragraphs']:
        token_rows = []
        for sentence in paragraph['sentences']:
            # Keep only the selected attributes, preserving the dict's key order.
            token_rows.extend([
                tuple(value for key, value in token.items() if key in DOC2JSON_FT)
                for token in sentence['tokens']
            ])
        crf_documents.append(token_rows)

    if save_path:
        srsly.write_json(save_path, crf_documents)
    return crf_documents
|
72 |
+
|
73 |
+
|
74 |
+
def create_raw_data(input_text: str) -> tuple[List, List, Doc]:
    """Run the spaCy pipeline on *input_text* and prepare the CRF inputs.

    Args:
        input_text: raw text to analyse.

    Returns:
        A 3-tuple of:
        - the CRF-formatted token data produced by
          ``convert_spacybilou_to_crfsuitebilou`` from ``docs_to_json``;
        - a one-element list holding the ``(text, start, end)`` character
          span of every token;
        - the spaCy ``Doc`` itself.
    """
    parsed = nlp(input_text)

    # Character offsets per token: start is token.idx, end is start + length.
    token_spans = [
        (tok.text, tok.idx, tok.idx + len(tok.text))
        for tok in parsed
    ]

    crf_rows = convert_spacybilou_to_crfsuitebilou(doc2json=docs_to_json(parsed))
    return crf_rows, [token_spans], parsed
|
83 |
+
|
84 |
+
|
85 |
+
def token_feature_engineering(raw_data: List, tokenfeatures) -> List:
    """Compute the feature representation of every sequence in *raw_data*.

    Args:
        raw_data: iterable of sequences (one per document).
        tokenfeatures: object exposing ``sent2features(sequence)``.

    Returns:
        A list with the result of ``tokenfeatures.sent2features`` for each
        sequence, in input order.
    """
    return [tokenfeatures.sent2features(sequence) for sequence in raw_data]
|
88 |
+
|
89 |
+
|
90 |
+
def load_crf_model(path: str) -> pycrfsuite.Tagger:
    """Create a ``pycrfsuite.Tagger`` and open the model file at *path*.

    Args:
        path: location of the trained CRFsuite model file.

    Returns:
        The tagger with the model opened.
    """
    crf_tagger = pycrfsuite.Tagger()
    crf_tagger.open(path)
    return crf_tagger
|
94 |
+
|
95 |
+
|
96 |
+
def predictor(tagger, x: List) -> List:
    """Run prediction on every feature sequence.

    Args:
        tagger: CRF tagger exposing ``tag(sequence)``.
        x: input data, an iterable of feature sequences.

    Returns:
        A list with the result of ``tagger.tag`` for each sequence, in
        input order.
    """
    predictions = []
    for feature_sequence in x:
        predictions.append(tagger.tag(feature_sequence))
    return predictions
|
source/services/ner/train/__init__.py
ADDED
File without changes
|
source/services/ner/utils/__init__.py
ADDED
File without changes
|
tests/.gitkeep
ADDED
File without changes
|
tests/ner/.gitkeep
ADDED
File without changes
|