adit94 committed on
Commit 846b4a5
Parent: a1c3649

Upload 9 files

helpers/common.py ADDED
File without changes
helpers/entity_extraction_helpers.py ADDED
@@ -0,0 +1,121 @@
+ from constants import DOCUMENT_COLLECTION
+ from openai_constants import ENTITY_EXTRACTION_PROMPT, ENTITY_EXTRACTION_FUNCTION, GPT35_PRICING
+
+
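+ # Runs entity extraction over every document chunk and merges the per-chunk
+ # function outputs into a single dict of lists, accumulating token usage so the
+ # total prompt/completion cost can be reported alongside the entities.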
+ def extract_all_documents(openai_instance, chunks):
+     all_entities = {}
+     all_usage = {}
+     total_prompt_tokens = 0
+     total_completion_tokens = 0
+     print(f"Number of chunks to process :: {len(chunks)}")
+     for chunk_idx, chunk in enumerate(chunks):
+         print(f"Sending request to OpenAI for chunk {chunk_idx}")
+         openai_entities_out = openai_instance.generate_response(ENTITY_EXTRACTION_PROMPT, chunk, ENTITY_EXTRACTION_FUNCTION)
+         print("OpenAI out received")
+         print(openai_entities_out['function_output'])
+         for key, val in openai_entities_out['function_output'].items():
+             print(key, val)
+             if key in all_entities:
+                 if isinstance(val, list):
+                     all_entities[key].extend(val)  # Extend the existing list with the new list
+                 else:
+                     all_entities[key].append(val)  # Append the value to the existing list
+             else:
+                 if isinstance(val, list):
+                     all_entities[key] = val  # Initialize the key with the list
+                 else:
+                     all_entities[key] = [val]
+
+         if 'prompt_tokens' in openai_entities_out['usage']:
+             total_prompt_tokens += openai_entities_out['usage']['prompt_tokens']
+         if 'completion_tokens' in openai_entities_out['usage']:
+             total_completion_tokens += openai_entities_out['usage']['completion_tokens']
+
+     all_usage = {
+         'prompt_tokens': total_prompt_tokens,
+         'completion_tokens': total_completion_tokens,
+         'input_pricing': total_prompt_tokens / 1000 * GPT35_PRICING['input'],
+         'output_pricing': total_completion_tokens / 1000 * GPT35_PRICING['output']
+     }
+     return all_entities, all_usage
+
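+ # End-to-end pipeline for one document: OCR the file, detect and mask PII,
+ # chunk the masked text, then extract insurance terms via OpenAI. The S3 and
+ # MongoDB steps are currently stubbed out or commented.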
+ def process_insurance_document(pii_instance, mongo_instance, openai_instance, ocr_instance,
+                                document_path, document_id):
+     print("---- \nInside Process insurance document function")
+     ## save file to S3
+     document_s3_url = ""
+
+     ## OCR
+     try:
+         document_text = ocr_instance.extract_text_from_document(document_path)
+         ocr_status = "Completed"
+         process_status = "OCR Completed"
+         print("OCR complete")
+     except Exception as ex:
+         document_text = ""
+         ocr_status = str(ex)
+         process_status = f"OCR Failed. {ex}"
+         print(process_status)
+
+     ## save ocr file to S3, add document S3 url
+     ocr_document_s3_url = ""
+
+     ## update ocr_status in db
+     #mongo_instance.update(DOCUMENT_COLLECTION,
+     #                      {'document_id':document_id},
+     #                      {'$set':{'ocr_status':ocr_status, 'document_s3_url':document_s3_url,
+     #                               'ocr_document_s3_url':ocr_document_s3_url, 'process_status':process_status}})
+     print("OCR status updated in db")
+     ## PII entity extraction and masking
+     pii_entities = pii_instance.identify(document_text)
+     print(f"pii entities are :: {pii_entities}")
+     pii_entities = pii_instance.add_mask(pii_entities)
+     print(f"\npii_entities after adding mask :: {pii_entities}")
+     masked_text = pii_instance.anonymize(pii_entities, document_text)
+     print(f"\nPII anonymized text is :: {masked_text}")
+     print("\nPII complete")
+
+     ## OpenAI extraction
+     chunks = ocr_instance.chunk_document(masked_text)
+     openai_entities, all_usage = extract_all_documents(openai_instance, chunks)
+     entity_extraction_status = 'Completed'
+     process_status = 'Document term extraction completed'
+
+     """try:
+         openai_entities, all_usage = extract_all_documents(openai_instance, chunks)
+         entity_extraction_status = 'Completed'
+         process_status = 'Document term extraction completed'
+     except Exception as ex:
+         openai_entities = {}
+         all_usage = {}
+         entity_extraction_status = str(ex)
+         process_status = f"Document term extraction failed. {ex}"
+     """
+
+     #openai_entities_out = {
+     #    'status':"Success",
+     #    'function_output':{},
+     #    'usage':{}
+     #}
+
+     print(f"openai_entities are :: {openai_entities}")
+     print("Request to OpenAI complete")
+     print("----------- \nProcessing complete\n ")
+
+     ## Unmask PII entities in openai entities
+
+
+     ## update entity extraction status in db
+     #mongo_instance.update(DOCUMENT_COLLECTION,
+     #                      {'document_id':document_id},
+     #                      {'$set':{'entity_extraction_status':entity_extraction_status,
+     #                               'entities':openai_entities, 'process_status':process_status}})
+     #print(f"Entities updated in DB")
+
+
+     out = {
+         "entities": openai_entities,
+         "masked_text": masked_text
+     }
+     return out
helpers/pii.py ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ streamlit
+ fastapi
+ pydantic
+ scipy==1.10.1
+ flair
+ presidio-analyzer
+ presidio-anonymizer
+ openai==0.28.1
+ pytesseract
+ pdf2image
+ PyPDF2
+ python-docx
+ nltk
+ pymongo
s3_uiapp.py ADDED
@@ -0,0 +1,53 @@
+ import os
+ import time
+ import uuid
+ import streamlit as st
+
+ from helpers.entity_extraction_helpers import process_insurance_document
+ from services.pii_service import PIIService
+ from services.openai_service import OpenAIService
+ from services.mongo_service import MongoService
+ from services.ocr_service import OCRService
+
+
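+ # Creates the PII, OpenAI, and OCR service singletons once per Streamlit session,
+ # using the session-state key 'a' as an "already initialized" sentinel.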
+ def init_session():
+     print("------------------ Initializing")
+     if 'a' not in st.session_state:
+         st.session_state['pii_instance'] = PIIService()
+         print("PII service initialized")
+         time.sleep(2)
+         st.session_state['openai_instance'] = OpenAIService(st.secrets["OPENAI_KEY"],
+                                                             st.secrets["OPENAI_AZURE_ENDPOINT"],
+                                                             st.secrets["OPENAI_API_VERSION"],
+                                                             st.secrets["DEPLOYMENT_NAME"])
+         print("OpenAI service initialized")
+         time.sleep(2)
+         st.session_state['ocr_instance'] = OCRService()
+         print("OCR service initialized")
+         st.session_state.a = 1
+         print("-----------------------------")
+
+ st.header('', divider='rainbow')
+ st.title("Data extraction")
+ st.header('', divider='rainbow')
+
+ init_session()
+
+ uploaded_doc = st.file_uploader("Upload an insurance document", type=["pdf"])
+
+ if uploaded_doc is not None:
+
+     with open(uploaded_doc.name, "wb") as f:
+         f.write(uploaded_doc.getbuffer())
+
+     document_id = str(uuid.uuid4())
+     print(f"File uploaded :: {uploaded_doc.name} :: {document_id}")
+
+     # mongo_instance is passed as "" because the DB updates inside
+     # process_insurance_document are currently commented out.
+     process_out = process_insurance_document(st.session_state['pii_instance'], "", st.session_state['openai_instance'],
+                                              st.session_state['ocr_instance'], uploaded_doc.name, document_id)
+
+     st.header('Extracted entities !! ', divider='rainbow')
+     st.write(process_out['entities'])
+     st.header('', divider='rainbow')
+
+ ### TO RUN :: streamlit run s3_uiapp.py
services/mongo_service.py ADDED
@@ -0,0 +1,33 @@
+ from pymongo import MongoClient
+ from typing import Optional, List, Dict
+
+ class MongoService:
+     def __init__(self, mongo_url: str, database: str):
+         self.mongo_url = mongo_url
+         self.mongo_client = MongoClient(self.mongo_url)
+         self.mongo_database = self.mongo_client[database]
+
+     def insert(self, collection: str, data: Dict):
+         inserted = self.mongo_database[collection].insert_one(data)
+         return
+
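+     # Fetches documents matching `filter`; fields_to_retrieve optionally narrows
+     # the returned fields via a MongoDB projection.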
+     def get(self, collection: str, filter: Dict, fields_to_retrieve: Optional[List] = None):
+         # A None projection returns full documents; an empty dict would return only _id.
+         fields = {field: 1 for field in fields_to_retrieve} if fields_to_retrieve else None
+         retrieved_data = list(self.mongo_database[collection].find(filter, fields))
+         return retrieved_data
+
+     def update(self, collection: str, filter: Dict, update_value: Dict, many=False):
+         # Example: filter = { "address": { "$regex": "^S" } }
+         #          update_value = { "$set": { "name": "Minnie" } }
+
+         if many:
+             updated = self.mongo_database[collection].update_many(filter, update_value)
+         else:
+             updated = self.mongo_database[collection].update_one(filter, update_value)
+
+         return
services/ocr_service.py ADDED
@@ -0,0 +1,87 @@
+ import os
+ import re
+ import docx
+ import pytesseract
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ from PyPDF2 import PdfReader
+ from pdf2image import convert_from_path
+
+
+ class OCRService:
+     def __init__(self):
+         return
+
+     def extract_ocrless_pdf(self, filepath):
+         # Extracts the embedded text layer from a born-digital PDF (no OCR pass).
+         reader = PdfReader(filepath)
+         extracted_text = ""
+         for page in reader.pages:
+             text = page.extract_text()
+             extracted_text += " "
+             extracted_text += text
+
+         return extracted_text
+
+     def extract_text_from_pdf(self, filepath):
+         # Renders each PDF page to an image and OCRs it with Tesseract.
+         images = convert_from_path(filepath, thread_count=4)
+         full_text = []
+         #config = (r"--oem 2 --psm 7")
+         for image in images:
+             text = pytesseract.image_to_string(image)
+             #text = pytesseract.image_to_string(image, config=config)
+             full_text.append(text)
+         return full_text
+
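+     # Routes extraction by file extension: PDFs go through pdf2image + Tesseract
+     # OCR, Word documents through python-docx, and .txt files are read directly.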
+     def extract_text_from_document(self, filepath):
+         file_ext = os.path.splitext(filepath)[-1].lower()
+         if file_ext in [".pdf"]:
+             text_to_process = self.extract_text_from_pdf(filepath)
+             text_joined = " ".join(text_to_process)
+             #with open(f"{os.path.splitext(filepath)[0]}.txt", "w") as file:
+             #    file.writelines(text_to_process)
+         elif file_ext in [".doc", ".docx"]:
+             doc_content = docx.Document(filepath)
+             text_to_process = [i.text for i in doc_content.paragraphs]
+             text_joined = " \n ".join(text_to_process)
+             #with open(f"{os.path.splitext(filepath)[0]}.txt", "w") as file:
+             #    file.write(text_joined)
+         elif file_ext in [".txt"]:
+             with open(filepath, encoding="utf8") as file:
+                 text_joined = file.read()
+         else:
+             raise ValueError(f"Unsupported file type: {file_ext}")
+
+         return text_joined
+
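+     # Normalizes OCR output: collapses repeated newlines and replaces curly
+     # quotes / escaped quotes with plain double quotes.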
+     def preprocess_document(self, document):
+         # str.replace does not interpret regex patterns, so use re.sub here.
+         document = re.sub(r'\n+', "\n", document)
+         #document = re.sub(r"\s+", " ", document)
+         document = re.sub("“", r"\"", document)
+         document = re.sub("”", r"\"", document)
+         document = re.sub(r"\\\"", "\"", document)
+
+         return document
+
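+     # Greedily packs whole sentences into chunks of at most k words, so no
+     # sentence is split across a chunk boundary.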
+     def chunk_document(self, text, k=1500):
+         sentences = sent_tokenize(text)
+
+         chunks = []
+         current_chunk = []
+         current_word_count = 0
+
+         for sentence in sentences:
+             sentence_words = word_tokenize(sentence)
+             if current_word_count + len(sentence_words) <= k:
+                 current_chunk.append(sentence)
+                 current_word_count += len(sentence_words)
+             else:
+                 chunks.append(" ".join(current_chunk))
+                 current_chunk = [sentence]
+                 current_word_count = len(sentence_words)
+
+         if current_chunk:
+             chunks.append(" ".join(current_chunk))
+
+         # Filter instead of deleting while iterating, which would skip elements.
+         chunks = [chunk for chunk in chunks if len(chunk.split()) >= 2]
+
+         return chunks
services/openai_service.py ADDED
@@ -0,0 +1,44 @@
+ import json
+ import openai
+
+
+ class OpenAIService:
+     def __init__(self, api_key, api_endpoint, api_version, deployment_name):
+         openai.api_key = api_key
+         openai.api_base = api_endpoint
+         openai.api_type = "azure"
+         openai.api_version = api_version
+         self.openai_deployment_name = deployment_name
+
+ def generate_response(self, system_prompt, user_prompt, openai_function):
14
+
15
+ try:
16
+ response = openai.ChatCompletion.create(engine=self.openai_deployment_name,
17
+ messages=[
18
+ {"role": "system", "content": system_prompt},
19
+ {"role": "user", "content": user_prompt},
20
+ ],
21
+ functions = openai_function,
22
+ function_call = {"name": openai_function[0]['name']}
23
+ )
24
+
25
+ openai_output = response["choices"]
26
+ usage = response["usage"].to_dict()
27
+ print(response)
28
+ function_output = json.loads(openai_output[0].message.function_call.arguments, strict=False)
29
+ print(function_output)
30
+ openai_out = {
31
+ 'function_output':function_output,
32
+ 'usage':usage,
33
+ 'status':'Success'
34
+ }
35
+ return openai_out
36
+
37
+ except Exception as ex:
38
+ print(f"Openai generate response exceptin ::: {ex}")
39
+ openai_out = {
40
+ 'function_output':{},
41
+ 'usage':{},
42
+ 'status':ex
43
+ }
44
+ return openai_out
services/pii_service.py ADDED
@@ -0,0 +1,309 @@
+ from pprint import pprint
+ import json
+
+ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+ from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpArtifacts
+ from presidio_analyzer import Pattern, PatternRecognizer
+ from presidio_analyzer.predefined_recognizers import SpacyRecognizer
+ from presidio_analyzer.predefined_recognizers import IbanRecognizer, EmailRecognizer, IpRecognizer,\
+     PhoneRecognizer, UrlRecognizer, DateRecognizer
+
+ from presidio_anonymizer import AnonymizerEngine
+ from presidio_anonymizer.entities import OperatorConfig
+
+ import logging
+ from typing import Optional, List, Tuple, Set
+ from presidio_analyzer import (
+     RecognizerResult,
+     EntityRecognizer,
+     AnalysisExplanation,
+ )
+
+ from flair.data import Sentence
+ from flair.models import SequenceTagger
+
+ ### FlairRecognizer: a custom Presidio recognizer that uses Flair NER for
+ ### names, locations, and organizations.
+
+ class FlairRecognizer(EntityRecognizer):
+
+     ENTITIES = [
+         "LOCATION",
+         "PERSON",
+         "ORGANIZATION",
+         # "MISCELLANEOUS" # - There is no direct correlation with Presidio entities.
+     ]
+
+     DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
+
+     CHECK_LABEL_GROUPS = [
+         ({"LOCATION"}, {"LOC", "LOCATION"}),
+         ({"PERSON"}, {"PER", "PERSON"}),
+         ({"ORGANIZATION"}, {"ORG"}),
+         # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
+     ]
+
+     MODEL_LANGUAGES = {
+         #"en": "flair/ner-english-large",
+         #"es": "flair/ner-spanish-large",
+         "de": "flair/ner-german-large",
+         #"nl": "flair/ner-dutch-large",
+     }
+
+     PRESIDIO_EQUIVALENCES = {
+         "PER": "PERSON",
+         "LOC": "LOCATION",
+         "ORG": "ORGANIZATION",
+         # 'MISC': 'MISCELLANEOUS' # - Probably not PII
+     }
+
+     def __init__(
+         self,
+         supported_language: str = "en",
+         supported_entities: Optional[List[str]] = None,
+         check_label_groups: Optional[Tuple[Set, Set]] = None,
+         model: SequenceTagger = None,
+     ):
+         self.check_label_groups = (
+             check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
+         )
+
+         supported_entities = supported_entities if supported_entities else self.ENTITIES
+         self.model = (
+             model
+             if model
+             else SequenceTagger.load(self.MODEL_LANGUAGES.get(supported_language))
+         )
+
+         super().__init__(
+             supported_entities=supported_entities,
+             supported_language=supported_language,
+             name="Flair Analytics",
+         )
+         print("Flair class initialized")
+
+     def load(self) -> None:
+         """Load the model, not used. Model is loaded during initialization."""
+         pass
+
+     def get_supported_entities(self) -> List[str]:
+         """
+         Return supported entities by this model.
+
+         :return: List of the supported entities.
+         """
+         return self.supported_entities
+
+     # Method that lets Presidio use Flair as an external recognizer.
+     def analyze(
+         self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None
+     ) -> List[RecognizerResult]:
+         """
+         Analyze text using Flair NER.
+
+         :param text: The text for analysis.
+         :param entities: The entity types to look for; falls back to all supported entities.
+         :param nlp_artifacts: Not used by this recognizer.
+         :return: The list of Presidio RecognizerResult constructed from the recognized
+             Flair detections.
+         """
+
+         results = []
+
+         sentences = Sentence(text)
+         self.model.predict(sentences)
+
+         # If no specific list of entities is given, look for all supported ones.
+         if not entities:
+             entities = self.supported_entities
+
+         for entity in entities:
+             if entity not in self.supported_entities:
+                 continue
+
+             for ent in sentences.get_spans("ner"):
+                 if not self.__check_label(
+                     entity, ent.labels[0].value, self.check_label_groups
+                 ):
+                     continue
+                 textual_explanation = self.DEFAULT_EXPLANATION.format(
+                     ent.labels[0].value
+                 )
+                 explanation = self.build_flair_explanation(
+                     round(ent.score, 2), textual_explanation
+                 )
+                 flair_result = self._convert_to_recognizer_result(ent, explanation)
+
+                 results.append(flair_result)
+
+         return results
+
+     def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
+
+         entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
+         flair_score = round(entity.score, 2)
+
+         flair_results = RecognizerResult(
+             entity_type=entity_type,
+             start=entity.start_position,
+             end=entity.end_position,
+             score=flair_score,
+             analysis_explanation=explanation,
+         )
+
+         return flair_results
+
+     def build_flair_explanation(
+         self, original_score: float, explanation: str
+     ) -> AnalysisExplanation:
+         """
+         Create explanation for why this result was detected.
+
+         :param original_score: Score given by this recognizer
+         :param explanation: Explanation string
+         :return:
+         """
+         explanation = AnalysisExplanation(
+             recognizer=self.__class__.__name__,
+             original_score=original_score,
+             textual_explanation=explanation,
+         )
+         return explanation
+
+     @staticmethod
+     def __check_label(
+         entity: str, label: str, check_label_groups: Tuple[Set, Set]
+     ) -> bool:
+         return any(
+             [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
+         )
+
+
+ class PIIService:
+     def __init__(self):
+
+         configuration = {
+             "nlp_engine_name": "spacy",
+             "models": [
+                 {"lang_code": "de", "model_name": "de_core_news_sm"}
+             ],
+         }
+
+         # Create NLP engine based on configuration
+         provider = NlpEngineProvider(nlp_configuration=configuration)
+         nlp_engine = provider.create_engine()
+
+         ## Creating regexes for PatternRecognizers - SWIFT, vehicle number, zipcode, SSN
+         swift_regex = r"\b[A-Z]{4}DE[A-Z0-9]{2}(?:[A-Z0-9]{3})?"
+         vehicle_number_with_hyphen_regex = r"\b[A-ZÄÖÜ]{1,3}-[A-ZÄÖÜ]{1,2}-[0-9]{1,4}"
+         vehicle_number_without_hyphen_regex = r"\b[A-ZÄÖÜ]{1,3}[A-ZÄÖÜ]{1,2}[0-9]{1,4}"
+         german_zipcode_regex = r"\b((?:0[1-46-9]\d{3})|(?:[1-357-9]\d{4})|(?:[4][0-24-9]\d{3})|(?:[6][013-9]\d{3}))\b(?![\d/])"
+         german_ssn_regex = r"\b\d{2}\s?\d{6}\s?[A-Z]\s?\d{3}\b"
+         # Creating Presidio pattern objects
+         vehicle_numbers_pattern1 = Pattern(name="vehicle_pattern", regex=vehicle_number_without_hyphen_regex, score=1)
+         vehicle_numbers_pattern2 = Pattern(name="vehicle_pattern", regex=vehicle_number_with_hyphen_regex, score=1)
+         swift_pattern = Pattern(name="bank_swift_pattern", regex=swift_regex, score=1)
+         germanzipcode_pattern = Pattern(name="german_zip_pattern", regex=german_zipcode_regex, score=1)
+         german_ssn_pattern = Pattern(name="german_ssn_pattern", regex=german_ssn_regex, score=1)
+
+         # Define the recognizers
+         swift_recognizer = PatternRecognizer(supported_entity="SWIFT", supported_language="de", patterns=[swift_pattern])
+         vehicle_number_recognizer = PatternRecognizer(supported_entity="VEHICLE_NUMBER", supported_language="de", patterns=[vehicle_numbers_pattern1, vehicle_numbers_pattern2])
+         germanzip_recognizer = PatternRecognizer(supported_entity="GERMAN_ZIP", supported_language="de", patterns=[germanzipcode_pattern])
+         germanssn_recognizer = PatternRecognizer(supported_entity="GERMAN_SSN", supported_language="de", patterns=[german_ssn_pattern])
+
+         ## Loading flair entity model for person / location identification
+         print("Loading flair")
+         flair_recognizer = FlairRecognizer(supported_language="de")
+         print("Flair loaded")
+
+         registry = RecognizerRegistry()
+         #registry.load_predefined_recognizers()
+         #registry.add_recognizer(SpacyRecognizer(supported_language="de"))
+         #registry.add_recognizer(SpacyRecognizer(supported_language="en"))
+
+         registry.remove_recognizer("SpacyRecognizer")
+         registry.add_recognizer(flair_recognizer)
+
+         registry.add_recognizer(swift_recognizer)
+         registry.add_recognizer(vehicle_number_recognizer)
+         registry.add_recognizer(germanzip_recognizer)
+         registry.add_recognizer(germanssn_recognizer)
+
+         ## Adding predefined recognizers
+         registry.add_recognizer(IbanRecognizer(supported_language="de"))
+         registry.add_recognizer(DateRecognizer(supported_language="de"))
+         registry.add_recognizer(EmailRecognizer(supported_language="de"))
+         registry.add_recognizer(IpRecognizer(supported_language="de"))
+         registry.add_recognizer(PhoneRecognizer(supported_language="de"))
+         registry.add_recognizer(UrlRecognizer(supported_language="de"))
+         print("Recognizer registry loaded")
+
+         self.analyzer = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine, supported_languages=["de"])
+
+         #print(f"Type of recognizers ::\n {self.analyzer.registry.recognizers}")
+         print("PII initialized")
+
+         self.anonymizer = AnonymizerEngine()
+
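+     # Runs the Presidio analyzer over the text and returns the raw entity spans,
+     # including the matched surface form for each hit.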
+     def identify(self, text):
+         results_de = self.analyzer.analyze(
+             text,
+             language='de'
+         )
+
+         #anonymized_results = self.anonymize(results_de, text)
+         entities = []
+
+         for result in results_de:
+             result_dict = result.to_dict()
+             temp_entity = {
+                 "start": result_dict['start'],
+                 "end": result_dict['end'],
+                 "entity_type": result_dict['entity_type'],
+                 "score": result_dict['score'],
+                 "word": text[result_dict['start']:result_dict['end']]
+             }
+             entities.append(temp_entity)
+
+         return {"entities": entities, "text": text}#, "anonymized_text":anonymized_results['text']}
+
+     """def anonymize(self, entities, text):
+         anonymized_results = self.anonymizer.anonymize(
+             text=text,
+             analyzer_results=entities,
+             #operators={"DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"})},
+         )
+         return ""#json.loads(anonymized_results.to_json())"""
+
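+     # Assigns each detected entity a mask token such as PERSON_1 or GERMAN_ZIP_2,
+     # numbering entities separately per entity type.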
+     def add_mask(self, data):
+         masked_data = []
+         entity_count = {}
+
+         for item in data['entities']:
+             entity_type = item['entity_type']
+             suffix = entity_count.get(entity_type, 0) + 1
+             entity_count[entity_type] = suffix
+
+             masked_word = f"{entity_type}_{suffix}"
+             item['mask'] = masked_word
+             masked_data.append(item)
+
+         return masked_data
+
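+     # Replaces entity spans with their mask tokens. Spans are applied in
+     # descending start order so earlier offsets stay valid as the text length
+     # changes with each substitution.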
+     def anonymize(self, entities, text):
+         print("anonymizing")
+         updated_text = text
+         for ent in sorted(entities, key=lambda e: e['start'], reverse=True):
+             updated_text = updated_text[:ent['start']] + " " + ent['mask'] + " " + updated_text[ent['end']:]
+
+         return updated_text
+
+     def remove_overlapping_entities(self, entities):
+         # TODO: drop overlapping spans before masking so substitutions don't collide.
+         return