jfrery-zama
committed on
Commit
β’
df6182e
1
Parent(s):
1fa0767
space working with chatgpt 4
Browse files- .gitignore +2 -1
- anonymize_file_clear.py +61 -0
- app.py +83 -25
- demo_text.txt +1 -10
- fhe_anonymizer.py +29 -48
- files/anonymized_document.txt +6 -50
- files/chatgpt_prompt.txt +9 -0
- files/original_document.txt +6 -50
- original_document_uuid_mapping.json +34 -0
- requirements.txt +2 -1
- utils_demo.py +0 -274
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
-
__pycache__/
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
.venv/
|
anonymize_file_clear.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
import re
|
4 |
+
import uuid
|
5 |
+
from pathlib import Path
|
6 |
+
import gensim
|
7 |
+
from concrete.ml.common.serialization.loaders import load
|
8 |
+
|
9 |
+
def load_models():
    """Load the FastText embedding model and the Concrete ML NER classifier.

    Both model files are expected to live in the same directory as this
    script: ``embedded_model.model`` (gensim FastText) and
    ``cml_xgboost.model`` (serialized Concrete ML model).

    Returns:
        tuple: ``(embeddings_model, fhe_ner_detection)``.
    """
    base_dir = Path(__file__).parent
    embeddings_model = gensim.models.FastText.load(str(base_dir / "embedded_model.model"))
    with open(base_dir / "cml_xgboost.model", "r") as model_file:
        fhe_ner_detection = load(file=model_file)
    return embeddings_model, fhe_ner_detection
|
15 |
+
|
16 |
+
def anonymize_text(text, embeddings_model, fhe_ner_detection):
    """Detect sensitive words in ``text`` and build a token -> UUID mapping.

    The text is split into word and separator tokens. Each word token is
    embedded with the FastText model and scored by the (cleartext) NER
    classifier; a word whose probability of being an entity is >= 0.5 is
    assigned a short identifier (first 8 chars of a UUID4), reused for
    repeated occurrences of the same word.

    Note: the original implementation also accumulated the anonymized token
    stream but never returned it; that dead work has been removed — only the
    mapping is produced, matching the function's contract.

    Args:
        text: Raw document text to scan.
        embeddings_model: gensim FastText model exposing ``wv[token]``.
        fhe_ner_detection: Classifier exposing ``predict_proba``.

    Returns:
        dict: Mapping from each detected sensitive token to its UUID string.
    """
    token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
    tokens = re.findall(token_pattern, text)
    uuid_map = {}

    for token in tokens:
        # Only score word-like tokens; whitespace/punctuation are never entities.
        if not token.strip() or not re.match(r"\w+", token):
            continue
        x = embeddings_model.wv[token][None]  # add batch axis: (1, dim)
        prediction_proba = fhe_ner_detection.predict_proba(x)
        probability = prediction_proba[0][1]
        if probability >= 0.5 and token not in uuid_map:
            uuid_map[token] = str(uuid.uuid4())[:8]

    return uuid_map
|
38 |
+
|
39 |
+
def main():
    """CLI entry point: anonymize one text file and dump its UUID mapping.

    Reads the file given on the command line, runs the cleartext NER-based
    anonymization, and writes ``<input-stem>_uuid_mapping.json`` (sorted,
    pretty-printed) into the current working directory.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Anonymize named entities in a text file and save the mapping "
            "to a JSON file."
        )
    )
    parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
    args = parser.parse_args()

    embeddings_model, fhe_ner_detection = load_models()

    # Read the input file
    with open(args.file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Anonymize the text
    uuid_map = anonymize_text(text, embeddings_model, fhe_ner_detection)

    # Save the UUID mapping to a JSON file
    mapping_path = Path(args.file_path).stem + "_uuid_mapping.json"
    with open(mapping_path, "w", encoding="utf-8") as file:
        json.dump(uuid_map, file, indent=4, sort_keys=True)

    print(f"UUID mapping saved to {mapping_path}")


if __name__ == "__main__":
    main()
|
app.py
CHANGED
@@ -1,13 +1,19 @@
|
|
1 |
"""A Gradio app for anonymizing text data using FHE."""
|
2 |
|
3 |
import gradio as gr
|
4 |
-
import re
|
5 |
from fhe_anonymizer import FHEAnonymizer
|
6 |
import pandas as pd
|
7 |
-
|
|
|
|
|
|
|
8 |
|
9 |
anonymizer = FHEAnonymizer()
|
10 |
|
|
|
|
|
|
|
|
|
11 |
|
12 |
def deidentify_text(input_text):
|
13 |
anonymized_text, identified_words_with_prob = anonymizer(input_text)
|
@@ -22,10 +28,62 @@ def deidentify_text(input_text):
|
|
22 |
return anonymized_text, identified_df
|
23 |
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
# Default demo text from the file
|
26 |
with open("demo_text.txt", "r") as file:
|
27 |
default_demo_text = file.read()
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
demo = gr.Blocks()
|
30 |
|
31 |
with demo:
|
@@ -50,33 +108,23 @@ with demo:
|
|
50 |
with gr.Accordion("What is Encrypted Anonymization?", open=False):
|
51 |
gr.Markdown(
|
52 |
"""
|
53 |
-
Encrypted Anonymization leverages Fully Homomorphic Encryption (FHE) to protect sensitive information during data processing. This approach allows for the anonymization of text data, such as personal identifiers, while ensuring that the data remains encrypted throughout the entire process.
|
54 |
"""
|
55 |
)
|
56 |
|
57 |
-
with gr.Accordion("
|
58 |
-
gr.Markdown(
|
59 |
-
"""
|
60 |
-
Privacy in data processing is critical to protect individuals' personal information from unauthorized access and potential misuse. With the increasing amount of personal data being collected and analyzed, the risks associated with data breaches and identity theft have also risen. By implementing privacy-preserving techniques, such as encrypted anonymization, organizations can safeguard sensitive information, build trust with their customers, and comply with stringent data protection regulations.
|
61 |
-
"""
|
62 |
-
)
|
63 |
|
64 |
-
with gr.Accordion(
|
65 |
-
|
66 |
-
):
|
67 |
-
gr.Markdown(
|
68 |
-
"""
|
69 |
-
Fully Homomorphic Encryption (FHE) enhances data privacy by enabling computations on encrypted data without needing to decrypt it first. This revolutionary technology ensures that sensitive data can be processed and analyzed securely, without exposing it to potential threats. FHE is a game-changer for privacy-preserving computations, allowing for the secure analysis of encrypted data, which is particularly beneficial in sectors like finance, healthcare, and beyond.
|
70 |
-
"""
|
71 |
-
)
|
72 |
|
73 |
-
gr.Markdown(
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
)
|
80 |
|
81 |
with gr.Row():
|
82 |
input_text = gr.Textbox(
|
@@ -88,7 +136,7 @@ with demo:
|
|
88 |
|
89 |
anonymized_text_output = gr.Textbox(label="Anonymized Text", lines=13)
|
90 |
|
91 |
-
identified_words_output = gr.Dataframe(label="Identified Words")
|
92 |
|
93 |
submit_button = gr.Button("Anonymize")
|
94 |
|
@@ -98,6 +146,16 @@ with demo:
|
|
98 |
outputs=[anonymized_text_output, identified_words_output],
|
99 |
)
|
100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
|
102 |
# Launch the app
|
103 |
demo.launch(share=False)
|
|
|
1 |
"""A Gradio app for anonymizing text data using FHE."""
|
2 |
|
3 |
import gradio as gr
|
|
|
4 |
from fhe_anonymizer import FHEAnonymizer
|
5 |
import pandas as pd
|
6 |
+
from openai import OpenAI
|
7 |
+
import os
|
8 |
+
import json
|
9 |
+
import re
|
10 |
|
11 |
anonymizer = FHEAnonymizer()
|
12 |
|
13 |
+
client = OpenAI(
|
14 |
+
api_key=os.environ.get("openaikey"),
|
15 |
+
)
|
16 |
+
|
17 |
|
18 |
def deidentify_text(input_text):
|
19 |
anonymized_text, identified_words_with_prob = anonymizer(input_text)
|
|
|
28 |
return anonymized_text, identified_df
|
29 |
|
30 |
|
31 |
+
def query_chatgpt(anonymized_query):
    """Send the anonymized query to ChatGPT and de-anonymize its answer.

    The system prompt and the anonymized reference document are read from
    ``files/``. The model answers using the anonymized identifiers; each
    identifier in the response is then mapped back to the original word via
    the stored UUID mapping.

    Fixes over the original: removed the dead ``full_prompt`` variable
    (built, printed, but never sent) and the leftover debug ``print`` calls.

    Args:
        anonymized_query: Query text in which sensitive words were already
            replaced by their UUIDs.

    Returns:
        tuple: ``(anonymized_response, deanonymized_response)``.
    """
    with open("files/anonymized_document.txt", "r") as file:
        anonymized_document = file.read()
    with open("files/chatgpt_prompt.txt", "r") as file:
        prompt = file.read()

    # The user message carries both the document and the query.
    query = (
        "Document content:\n```\n"
        + anonymized_document
        + "\n\n```"
        + "Query:\n```\n"
        + anonymized_query
        + "\n```"
    )

    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": query},
        ],
    )
    anonymized_response = completion.choices[0].message.content

    # Invert the token -> UUID mapping to recover original words.
    # TODO: persist the inverse mapping on disk instead of inverting per call.
    with open("original_document_uuid_mapping.json", "r") as file:
        uuid_map = json.load(file)
    inverse_uuid_map = {v: k for k, v in uuid_map.items()}

    # Pattern to identify words and non-words (punctuation, spaces, etc.)
    token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
    tokens = re.findall(token_pattern, anonymized_response)
    processed_tokens = []
    for token in tokens:
        # Keep whitespace/punctuation unchanged; replace known UUIDs.
        if not token.strip() or not re.match(r"\w+", token):
            processed_tokens.append(token)
        else:
            processed_tokens.append(inverse_uuid_map.get(token, token))
    deanonymized_response = "".join(processed_tokens)

    return anonymized_response, deanonymized_response
|
75 |
+
|
76 |
+
|
77 |
# Default demo text from the file
|
78 |
with open("demo_text.txt", "r") as file:
|
79 |
default_demo_text = file.read()
|
80 |
|
81 |
+
with open("files/original_document.txt", "r") as file:
|
82 |
+
original_document = file.read()
|
83 |
+
|
84 |
+
with open("files/anonymized_document.txt", "r") as file:
|
85 |
+
anonymized_document = file.read()
|
86 |
+
|
87 |
demo = gr.Blocks()
|
88 |
|
89 |
with demo:
|
|
|
108 |
with gr.Accordion("What is Encrypted Anonymization?", open=False):
|
109 |
gr.Markdown(
|
110 |
"""
|
111 |
+
Encrypted Anonymization leverages Fully Homomorphic Encryption (FHE) to protect sensitive information during data processing. This approach allows for the anonymization of text data, such as personal identifiers, while ensuring that the data remains encrypted throughout the entire process.
|
112 |
"""
|
113 |
)
|
114 |
|
115 |
+
with gr.Accordion("Original Document", open=False):
|
116 |
+
gr.Markdown(original_document)
|
|
|
|
|
|
|
|
|
117 |
|
118 |
+
with gr.Accordion("Anonymized Document", open=False):
|
119 |
+
gr.Markdown(anonymized_document)
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
|
121 |
+
# gr.Markdown(
|
122 |
+
# """
|
123 |
+
# <p align="center">
|
124 |
+
# <img src="file/images/banner.png">
|
125 |
+
# </p>
|
126 |
+
# """
|
127 |
+
# )
|
128 |
|
129 |
with gr.Row():
|
130 |
input_text = gr.Textbox(
|
|
|
136 |
|
137 |
anonymized_text_output = gr.Textbox(label="Anonymized Text", lines=13)
|
138 |
|
139 |
+
identified_words_output = gr.Dataframe(label="Identified Words", visible=False)
|
140 |
|
141 |
submit_button = gr.Button("Anonymize")
|
142 |
|
|
|
146 |
outputs=[anonymized_text_output, identified_words_output],
|
147 |
)
|
148 |
|
149 |
+
with gr.Row():
|
150 |
+
chatgpt_response_anonymized = gr.Textbox(label="ChatGPT Anonymized Response", lines=13)
|
151 |
+
chatgpt_response_deanonymized = gr.Textbox(label="ChatGPT Deanonymized Response", lines=13)
|
152 |
+
|
153 |
+
chatgpt_button = gr.Button("Query ChatGPT")
|
154 |
+
chatgpt_button.click(
|
155 |
+
query_chatgpt,
|
156 |
+
inputs=[anonymized_text_output],
|
157 |
+
outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
|
158 |
+
)
|
159 |
|
160 |
# Launch the app
|
161 |
demo.launch(share=False)
|
demo_text.txt
CHANGED
@@ -1,10 +1 @@
|
|
1 |
-
|
2 |
-
My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
|
3 |
-
|
4 |
-
On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.
|
5 |
-
|
6 |
-
My passport: 191280342 and my phone number: (212) 555-1234.
|
7 |
-
|
8 |
-
This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
|
9 |
-
|
10 |
-
Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
|
|
|
1 |
+
Who lives in Maine?
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fhe_anonymizer.py
CHANGED
@@ -3,6 +3,8 @@ import re
|
|
3 |
from concrete.ml.deployment import FHEModelClient, FHEModelServer
|
4 |
from pathlib import Path
|
5 |
from concrete.ml.common.serialization.loaders import load
|
|
|
|
|
6 |
|
7 |
base_dir = Path(__file__).parent
|
8 |
|
@@ -17,6 +19,9 @@ class FHEAnonymizer:
|
|
17 |
with open(base_dir / "cml_xgboost.model", "r") as model_file:
|
18 |
self.fhe_ner_detection = load(file=model_file)
|
19 |
|
|
|
|
|
|
|
20 |
path_to_model = (base_dir / "deployment").resolve()
|
21 |
self.client = FHEModelClient(path_to_model)
|
22 |
self.server = FHEModelServer(path_to_model)
|
@@ -30,57 +35,33 @@ class FHEAnonymizer:
|
|
30 |
return y
|
31 |
|
32 |
def __call__(self, text: str):
|
33 |
-
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
-
for word in text.split():
|
38 |
# Prediction for each word
|
39 |
-
x = self.embeddings_model.wv[
|
40 |
prediction_proba = self.fhe_ner_detection.predict_proba(x)
|
41 |
-
# prediction = self.fhe_inference(x).argmax(1)[0]
|
42 |
-
# print(word, prediction)
|
43 |
probability = prediction_proba[0][1]
|
44 |
-
prediction = probability >= 0.5
|
45 |
-
if prediction == 1:
|
46 |
-
identified_words_with_prob.append((word, probability))
|
47 |
-
new_text.append("<REMOVED>")
|
48 |
-
else:
|
49 |
-
new_text.append(word)
|
50 |
-
|
51 |
-
# Joining the modified text
|
52 |
-
modified_text = " ".join(new_text)
|
53 |
-
|
54 |
-
return modified_text, identified_words_with_prob
|
55 |
-
|
56 |
-
def preprocess_sentences(self, sentence, verbose=False):
|
57 |
-
"""Preprocess the sentence."""
|
58 |
-
|
59 |
-
sentence = re.sub(r"\n+", " ", sentence)
|
60 |
-
if verbose:
|
61 |
-
print(sentence)
|
62 |
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
sentence = re.sub(r"\s([,.!?;:])", r"\1", sentence)
|
72 |
-
if verbose:
|
73 |
-
print(sentence)
|
74 |
-
|
75 |
-
pattern = r"(?<!\w)[{}]|[{}](?!\w)".format(
|
76 |
-
re.escape(self.punctuation_list), re.escape(self.punctuation_list)
|
77 |
-
)
|
78 |
-
sentence = re.sub(pattern, "", sentence)
|
79 |
-
if verbose:
|
80 |
-
print(sentence)
|
81 |
-
|
82 |
-
sentence = re.sub(r"\s([,.!?;:])", r"\1", sentence)
|
83 |
-
if verbose:
|
84 |
-
print(sentence)
|
85 |
|
86 |
-
|
|
|
|
|
|
3 |
from concrete.ml.deployment import FHEModelClient, FHEModelServer
|
4 |
from pathlib import Path
|
5 |
from concrete.ml.common.serialization.loaders import load
|
6 |
+
import uuid
|
7 |
+
import json
|
8 |
|
9 |
base_dir = Path(__file__).parent
|
10 |
|
|
|
19 |
with open(base_dir / "cml_xgboost.model", "r") as model_file:
|
20 |
self.fhe_ner_detection = load(file=model_file)
|
21 |
|
22 |
+
with open(base_dir / "original_document_uuid_mapping.json", 'r') as file:
|
23 |
+
self.uuid_map = json.load(file)
|
24 |
+
|
25 |
path_to_model = (base_dir / "deployment").resolve()
|
26 |
self.client = FHEModelClient(path_to_model)
|
27 |
self.server = FHEModelServer(path_to_model)
|
|
|
35 |
return y
|
36 |
|
37 |
def __call__(self, text: str):
    """Anonymize ``text`` by replacing detected sensitive words with UUIDs.

    Word tokens are embedded and scored by the NER classifier; a token with
    entity probability >= 0.5 is replaced by a short UUID, reused across
    calls through ``self.uuid_map``. Whitespace and punctuation separators
    are preserved verbatim so the output reads like the input.

    Fix over the original: removed the leftover debug ``print(tokens)``.

    Args:
        text: Raw text to anonymize.

    Returns:
        tuple: ``(anonymized_text, [(token, probability), ...])`` where the
        list holds every token that was replaced, with its score.
    """
    # Pattern to identify words and non-words (including punctuation, spaces, etc.)
    token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
    tokens = re.findall(token_pattern, text)
    identified_words_with_prob = []
    processed_tokens = []

    for token in tokens:
        # Directly append non-word tokens or whitespace to processed_tokens
        if not token.strip() or not re.match(r"\w+", token):
            processed_tokens.append(token)
            continue

        # Prediction for each word
        x = self.embeddings_model.wv[token][None]
        prediction_proba = self.fhe_ner_detection.predict_proba(x)
        probability = prediction_proba[0][1]

        if probability >= 0.5:
            identified_words_with_prob.append((token, probability))
            # Use the existing UUID if available, otherwise generate a new one
            tmp_uuid = self.uuid_map.get(token, str(uuid.uuid4())[:8])
            processed_tokens.append(tmp_uuid)
            self.uuid_map[token] = tmp_uuid
        else:
            processed_tokens.append(token)

    # Reconstruct the sentence
    reconstructed_sentence = "".join(processed_tokens)
    return reconstructed_sentence, identified_words_with_prob
|
files/anonymized_document.txt
CHANGED
@@ -1,54 +1,10 @@
|
|
1 |
-
|
2 |
-
|
3 |
|
4 |
-
|
5 |
-
b8859a2c 553cc220
|
6 |
|
7 |
-
|
8 |
-
a28dbb08 9f84ce23
|
9 |
-
90112484 5c1705ba
|
10 |
|
11 |
-
|
12 |
|
13 |
-
|
14 |
-
4cd9e789 a1f51933: 019ac660 million
|
15 |
-
3196fc94 Sales: 50311924 million
|
16 |
-
|
17 |
-
496b82c2 bdf223ed: da54a11a million
|
18 |
-
61307cb8 and f4e35ce3
|
19 |
-
|
20 |
-
0a5f5ee5 3a0429d8's 904bce1e are ce79e4ff 8d5303a8 with research and 0b4a7271 RD, sales and 0807fdda, and c1d7820d 4c0101b2. c5894e7a a0e3c060 are the 2b81915c figures for the 2042d5df f2ddfe31 6cee2393:
|
21 |
-
|
22 |
-
RD: 2bfddef5 million
|
23 |
-
Sales and 8e7609fc: 682daa09 million
|
24 |
-
1eb46a02 93908764: c4a6681a million
|
25 |
-
|
26 |
-
9cc85bb0 9b15cd0b: 20c5b8db million
|
27 |
-
|
28 |
-
30d55f53 ef4684e2 for 274f506b 7153d60a: 92063576 million
|
29 |
-
26fa65d5 299f9a34 3e07531f
|
30 |
-
|
31 |
-
To ensure f48418ed growth and 4e52397d, d93931c1 bf8dd379. needs to focus on several key areas. 8b71b2c4 plan b8d6f955 77273f70 eff17325 6e32e65a, using bd178c9b data and 56935895 for 94fe5b9b 5c729e9f.
|
32 |
-
b56f5f8e into f17b897f f0458470
|
33 |
-
|
34 |
-
fbe11266 the global digital c5bc7803 79c37224, emerging markets present f6ea9661 9c3eaa4d for growth. c1d933f6 plans to 7612d3f9 e64531b7 in e3a6050a bd2a43c4 as 375af421, 5dc6aed3, and 60fd1ca9 7675408f, 2bfce010 to 71336309 revenue by b40729e6 over the 807f7c70 16671a96 933e82e7.
|
35 |
-
430afebf e77cc3ff
|
36 |
-
|
37 |
-
cfdd4e98 841ff829 in RD is 206d362e for maintaining a 13bb2340 bbf9e6e6. 9132698f 1c52e182 16d64f13 million 8d9c99ec developing new d1e42c0a 62e93c73 and cloud f6017e35, projecting a 313b7853 8b793da0 in software 8719a9b3 revenue by 1bfef750.
|
38 |
-
820123a4 and 12093506
|
39 |
-
|
40 |
-
To 14785122 1ce8ed84 product d21e9b14 and 3cafa2cb 8b2d9344 market 9a9ea085, 3a1c4899 will 2c8df8f2 7bef6ad5 8def87a9 and 8c2453f7. eb9d3366 02bf0ddf has set 99e4cfd8 a fund of a1dbd949 million for 1168967a 180c6196 in 68b2be65 and f3d42218 aa61a94d, expecting to enhance 5c96deb8 offering and drive a ba5f69cf revenue growth.
|
41 |
-
f7531366 8367b98b
|
42 |
-
|
43 |
-
Identifying and a1c193cd dd578067 is 45ba7c0c for the d9f1babd of 6b2c4206's growth. 734bf200 60ed7498 16a3f35e:
|
44 |
-
|
45 |
-
a860d0af 911f97f6: 9a030bf5 ce10476f and adaptation to 76f34e5f 703994f9 are b1a68f0d.
|
46 |
-
2cb167d2 53ba4fff: edfc1496 ahead in f0426ec0 to meet 3443e779 market 0c1ac39b.
|
47 |
-
6c72c52f e2d3bfae: fc5d928d 71902961 to global 1277dd76, 3374e1d9 in new markets.
|
48 |
-
|
49 |
-
d0c05d81
|
50 |
-
|
51 |
-
20c10c38 document ac328034 a 55d3f414 80a154f5 7b580e92 and 8c5457ab b75d19ad plan for af820f2a 83382bae., using 7dc8aae7 851e65f7 and figures. It is designed to bd7f1439 how the 6f55f229 can fb8a773e the 5f837d8d of the global digital market, 42f6054b ba70b81c for growth while c9a6eef7 potential 2252cf74. cd4c1afa data and scenarios d9d31530 71a5d9f4 are 09d6c906 3bfc963f and 5352f583 6ef4d7c7 be d69e06ae c65365d9 of 828e04dd real 003af3df or b26c90f6.
|
52 |
-
7275fe97
|
53 |
-
|
54 |
-
7ab379b9 3e911aa4 figures, 41df9da7, and scenarios 24ded709 in 765fb783 document are b14afae8 351fbe3c and created for 62b7490d e6f0cc19 only. 7917e19d do 9b4f4c24 50c52f98 real a92adfc3, c25f65bc, or 17e4168d 3224758c data. 0de09cb5 document is 631d727a as a 25114a3c d7470130 in df9c5fed 6ac1a4e7 and 5430f574 planning.
|
|
|
1 |
+
Hello, 0a182475 name is 84e24eb8 0ea0e35c and 9f2963af live in 5d0a593d.
|
2 |
+
6d7a71e0 credit card number is 1155d2a5 and 0a182475 d8e7c7fc c627b841 id is be0eddce.
|
3 |
|
4 |
+
On b926f540 ee1fa38f 9f2963af visited 1e63e774 and sent an email to f70cf334, from the IP 2d66aefe.
|
|
|
5 |
|
6 |
+
6d7a71e0 passport: 53b52085 and 0a182475 phone number: 81133b16 08aeb9f5.
|
|
|
|
|
7 |
|
8 |
+
3bbcaf02 is a valid 11e27c41 5a60bcfa 3730179e 97008100: ab18e6bb . Can 773b08cb f4953628 check the 8ff83dc3 on bank account e1a23ada?
|
9 |
|
10 |
+
7ab7335c's social security number is 11e90148. 67e5e61e driver license? it is c6652117.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
files/chatgpt_prompt.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Task: Analyze an anonymized document containing sensitive information, where personal details are replaced with unique identifiers (alphanumeric). You're to answer queries based on this document, using these identifiers to reference specific data.
|
2 |
+
|
3 |
+
Details:
|
4 |
+
|
5 |
+
- Sensitive information includes: names, locations, credit card numbers, email addresses, IP addresses, passport details, phone numbers, bank accounts, social security numbers, and driver's licenses.
|
6 |
+
- Each piece of information is represented by a unique identifier, maintaining privacy while discussing document content.
|
7 |
+
- Your role is to interpret the document's anonymized content and accurately respond to queries using the identifiers.
|
8 |
+
- Consistency in identifiers is crucial for connecting the text with the queries correctly.
|
9 |
+
- You must not discuss the anonymized nature of the text and use the identifiers as if they were real words for a smooth chat with users.
|
files/original_document.txt
CHANGED
@@ -1,54 +1,10 @@
|
|
1 |
-
|
2 |
-
|
3 |
|
4 |
-
|
5 |
-
Company Overview
|
6 |
|
7 |
-
|
8 |
-
Financial Analysis
|
9 |
-
Revenue Streams
|
10 |
|
11 |
-
|
12 |
|
13 |
-
|
14 |
-
Consulting Services: $50 million
|
15 |
-
Hardware Sales: $30 million
|
16 |
-
|
17 |
-
Total Revenue: $230 million
|
18 |
-
Costs and Expenses
|
19 |
-
|
20 |
-
The company's costs are primarily associated with research and development (R&D), sales and marketing, and administrative expenses. The following are the fabricated figures for the fiscal year 2023:
|
21 |
-
|
22 |
-
R&D: $70 million
|
23 |
-
Sales and Marketing: $50 million
|
24 |
-
Administrative Expenses: $30 million
|
25 |
-
|
26 |
-
Total Expenses: $150 million
|
27 |
-
|
28 |
-
Net Profit for FY 2023: $80 million
|
29 |
-
Strategic Development Plan
|
30 |
-
|
31 |
-
To ensure continued growth and profitability, TechInnovate Inc. needs to focus on several key areas. The plan below outlines strategic initiatives, using fictitious data and assumptions for illustration purposes.
|
32 |
-
Expansion into Emerging Markets
|
33 |
-
|
34 |
-
With the global digital transformation accelerating, emerging markets present significant opportunities for growth. TechInnovate plans to establish operations in countries such as India, Brazil, and South Africa, aiming to increase revenue by 20% over the next three years.
|
35 |
-
Product Innovation
|
36 |
-
|
37 |
-
Continued investment in R&D is crucial for maintaining a competitive edge. TechInnovate allocates $100 million towards developing new AI algorithms and cloud solutions, projecting a 30% increase in software subscription revenue by 2025.
|
38 |
-
Partnerships and Acquisitions
|
39 |
-
|
40 |
-
To diversify its product portfolio and strengthen its market position, TechInnovate will seek strategic partnerships and acquisitions. The company has set aside a fund of $200 million for acquiring startups in AI and IoT sectors, expecting to enhance its offering and drive a 25% revenue growth.
|
41 |
-
Risk Management
|
42 |
-
|
43 |
-
Identifying and mitigating risks is essential for the sustainability of TechInnovate's growth. Potential risks include:
|
44 |
-
|
45 |
-
Market Competition: Continuous analysis and adaptation to competitive strategies are required.
|
46 |
-
Technological Changes: Staying ahead in innovation to meet evolving market demands.
|
47 |
-
Regulatory Compliance: Ensuring adherence to global regulations, especially in new markets.
|
48 |
-
|
49 |
-
Conclusion
|
50 |
-
|
51 |
-
This document presents a fictional financial analysis and strategic development plan for TechInnovate Inc., using made-up names and figures. It is designed to showcase how the company can navigate the complexities of the global digital market, leveraging opportunities for growth while managing potential risks. All data and scenarios mentioned herein are entirely fabricated and should not be considered reflective of any real individuals or entities.
|
52 |
-
Disclaimer
|
53 |
-
|
54 |
-
The financial figures, names, and scenarios presented in this document are purely fictional and created for illustrative purposes only. They do not represent real individuals, companies, or actual financial data. This document is intended as a hypothetical exercise in financial analysis and strategic planning.
|
|
|
1 |
+
Hello, my name is David Johnson and I live in Maine.
|
2 |
+
My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
|
3 |
|
4 |
+
On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.
|
|
|
5 |
|
6 |
+
My passport: 191280342 and my phone number: (212) 555-1234.
|
|
|
|
|
7 |
|
8 |
+
This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
|
9 |
|
10 |
+
Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
original_document_uuid_mapping.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"078-05-1126": "11e90148",
|
3 |
+
"1234567A": "c6652117",
|
4 |
+
"16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ": "be0eddce",
|
5 |
+
"18": "ee1fa38f",
|
6 |
+
"191280342": "53b52085",
|
7 |
+
"192.168.0.1": "2d66aefe",
|
8 |
+
"212": "81133b16",
|
9 |
+
"4095-2609-9393-4932": "1155d2a5",
|
10 |
+
"555-1234": "08aeb9f5",
|
11 |
+
"954567876544": "e1a23ada",
|
12 |
+
"Account": "3730179e",
|
13 |
+
"Bank": "5a60bcfa",
|
14 |
+
"David": "84e24eb8",
|
15 |
+
"Her": "67e5e61e",
|
16 |
+
"I": "9f2963af",
|
17 |
+
"IL150120690000003111111": "ab18e6bb",
|
18 |
+
"International": "11e27c41",
|
19 |
+
"Johnson": "0ea0e35c",
|
20 |
+
"Kate": "7ab7335c",
|
21 |
+
"Maine": "5d0a593d",
|
22 |
+
"My": "6d7a71e0",
|
23 |
+
"Number": "97008100",
|
24 |
+
"September": "b926f540",
|
25 |
+
"This": "3bbcaf02",
|
26 |
+
"crypto": "d8e7c7fc",
|
27 |
+
"microsoft.com": "1e63e774",
|
28 |
+
"my": "0a182475",
|
29 |
+
"please": "f4953628",
|
30 |
+
"status": "8ff83dc3",
|
31 |
+
"test@presidio.site": "f70cf334",
|
32 |
+
"wallet": "c627b841",
|
33 |
+
"you": "773b08cb"
|
34 |
+
}
|
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
concrete-ml==1.5.0rc0
|
2 |
gensim==4.3.2
|
3 |
-
gradio==3.40.1
|
|
|
|
1 |
concrete-ml==1.5.0rc0
|
2 |
gensim==4.3.2
|
3 |
+
gradio==3.40.1
|
4 |
+
openai==1.13.3
|
utils_demo.py
CHANGED
@@ -1,274 +0,0 @@
|
|
1 |
-
|
2 |
-
import logging
|
3 |
-
import re
|
4 |
-
import string
|
5 |
-
from flair.data import Sentence
|
6 |
-
from flair.models import SequenceTagger
|
7 |
-
from presidio_analyzer import AnalyzerEngine
|
8 |
-
from presidio_anonymizer import AnonymizerEngine
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
entity_label_to_code_map = {'<PERSON>': 0,
|
13 |
-
'<O>': 1,
|
14 |
-
'<MISC>-<NRP>': 2,
|
15 |
-
'<NUMBER>': 3,
|
16 |
-
'<PER>-<LOCATION>': 4,
|
17 |
-
'<LOC>': 5,
|
18 |
-
'<MISC>': 6, # Miscellaneous: doesn't fall into the more common categories of PERSON, LOCATION, ORGANIZATION,
|
19 |
-
'<DATE_TIME>': 7,
|
20 |
-
'<LOCATION>': 8,
|
21 |
-
'<PRONOUNS>': 9,
|
22 |
-
'<IN_PAN>': 10,
|
23 |
-
'<MISC>-<DATE_TIME>': 11,
|
24 |
-
'<ORG>': 12,
|
25 |
-
'<MISC>-<IN_PAN>': 13,
|
26 |
-
'<MISC>-<LOCATION>': 14,
|
27 |
-
'<PER>': 15,
|
28 |
-
'<MISC>-<PERSON>': 16,
|
29 |
-
'<LOC>-<PERSON>': 17,
|
30 |
-
'<PHONE_NUMBER>': 18,
|
31 |
-
'<LOC>-<DATE_TIME>': 19,
|
32 |
-
'<LOC>-<NRP>': 20,
|
33 |
-
'<NRP>': 21,
|
34 |
-
'<ORG>-<PERSON>': 22,
|
35 |
-
'<PER>-<NRP>': 23,
|
36 |
-
'<ORG>-<LOCATION>': 24,
|
37 |
-
'<PER>-<DATE_TIME>': 25,
|
38 |
-
'<PER>-<IN_PAN>': 26,
|
39 |
-
'<ORG>-<IN_PAN>': 27,
|
40 |
-
'<ORG>-<NRP>': 28,
|
41 |
-
'<US_DRIVER_LICENSE>': 29,
|
42 |
-
'<KEY <EMAIL_ADDRESS>': 30,
|
43 |
-
'<US_BANK_NUMBER>': 33,
|
44 |
-
'<IN_AADHAAR>': 34,
|
45 |
-
'<CRYPTO>': 35,
|
46 |
-
'<IP_ADDRESS>': 36,
|
47 |
-
'<EMAIL_ADDRESS>': 35,
|
48 |
-
'<US_PASSPORT>': 36,
|
49 |
-
'<US_SSN>': 37,
|
50 |
-
'<MISC>-<URL>': 38}
|
51 |
-
|
52 |
-
|
53 |
-
pronoun_list = [
|
54 |
-
'I', 'i', 'me', 'my', 'mine', 'myself', 'you', 'your', 'yours', "I'm", "I am",\
|
55 |
-
'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "i'm", \
|
56 |
-
'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', \
|
57 |
-
'their', 'theirs', 'themselves', 'we', 'us', 'our', 'ours', 'ourselves' \
|
58 |
-
'Me', 'My', 'Mine', 'Myself', 'You', 'Your', 'Yours', 'Yourself', 'Yourselves', \
|
59 |
-
'He', 'Him', 'His', 'Himself', 'She', 'Her', 'Hers', 'Herself', 'It', 'Its', 'Itself', \
|
60 |
-
'They', 'Them', 'Their', 'Theirs', 'Themselves', 'We', 'Us', 'Our', 'Ours', 'Ourselves',
|
61 |
-
"Lady", "Madam", "Mr.", "Mister", "Sir", "Miss", "Ms.", "Mrs.", "Mr"
|
62 |
-
]
|
63 |
-
|
64 |
-
|
65 |
-
privacy_category_codes = {'<PRIVATE>': 1, '<NON_PRIVATE>': 2, '<OTHER>': 3}
|
66 |
-
|
67 |
-
punctuation_list = list(string.punctuation)
|
68 |
-
punctuation_list.remove('%')
|
69 |
-
punctuation_list.remove('$')
|
70 |
-
punctuation_list = ''.join(punctuation_list)
|
71 |
-
|
72 |
-
def get_word_boundaries(sentence):
|
73 |
-
""" Find the start and end positions of each word in a sentence."""
|
74 |
-
return [(match.start(), match.end()) for match in re.finditer(r'[^\s]+', sentence)]
|
75 |
-
|
76 |
-
|
77 |
-
def fuse_ner_labels(flair_ner, presidio_ner, text_type="<PRIVATE>"):
|
78 |
-
"""Merges The NER labels from 'Flair' and 'Presidio' for a given text.
|
79 |
-
|
80 |
-
We add take into account custom cases and predefined rules for entity classification.
|
81 |
-
"""
|
82 |
-
merged_ner = []
|
83 |
-
|
84 |
-
# Sanity check
|
85 |
-
assert len(flair_ner) == len(presidio_ner)
|
86 |
-
|
87 |
-
for i, ((w1, n1), (w2, n2)) in enumerate(zip(presidio_ner, flair_ner)):
|
88 |
-
|
89 |
-
assert w1 == w2
|
90 |
-
|
91 |
-
if w1.lower() in pronoun_list:
|
92 |
-
common_ner = "<PRONOUNS>"
|
93 |
-
# elif w1 in ['A+', 'A-', 'B+', 'B-', 'AB+', 'AB-', 'O+', 'O-']:
|
94 |
-
# common_ner = "<PRIVATE>"
|
95 |
-
elif n1 == "<O>" and n2 == "<O>":
|
96 |
-
if w1.lower() in ["am", "'m"] and (i - 1) >= 0 and presidio_ner[i - 1][0].lower() == 'i':
|
97 |
-
common_ner = "<PRONOUNS>"
|
98 |
-
|
99 |
-
elif bool(re.match(r'(?<!\S)[\$β¬]?(?:\d{1,3}(?:[ ,.]\d{3})*|\d+)(?:\.\d+)?%?', w1)):
|
100 |
-
common_ner = "<NUMBER>"
|
101 |
-
else:
|
102 |
-
common_ner = '<O>'
|
103 |
-
elif n1 in n2:
|
104 |
-
common_ner = n2
|
105 |
-
elif n1 == '<O>' and n2 != '<O>':
|
106 |
-
common_ner = n2
|
107 |
-
elif n2 == '<O>' and n1 != '<O>':
|
108 |
-
common_ner = f"<{n1}>"
|
109 |
-
else:
|
110 |
-
common_ner = f"<{n1}>-{n2}"
|
111 |
-
try:
|
112 |
-
common_binary_label = 0 if common_ner =="<O>" else 1
|
113 |
-
|
114 |
-
except:
|
115 |
-
print(f"ERROR: common_binary_label = 0 if common_ner =='<O>' else 1 | {w1=}, {w2=}, {n1=}, {n2=}")
|
116 |
-
|
117 |
-
if common_ner not in entity_label_to_code_map.keys():
|
118 |
-
common_multi_label = len(entity_label_to_code_map)
|
119 |
-
if common_ner not in entity_label_to_code_map.keys():
|
120 |
-
print("NOT in KEY", common_ner)
|
121 |
-
entity_label_to_code_map[common_ner] = common_multi_label
|
122 |
-
else:
|
123 |
-
common_multi_label = entity_label_to_code_map[common_ner]
|
124 |
-
|
125 |
-
is_private = text_type if common_ner != '<O>' else '<OTHER>'
|
126 |
-
|
127 |
-
merged_ner.append([w1, common_ner, is_private, privacy_category_codes[is_private], common_binary_label, common_multi_label])
|
128 |
-
|
129 |
-
return merged_ner
|
130 |
-
|
131 |
-
analyzer = AnalyzerEngine()
|
132 |
-
anonymizer = AnonymizerEngine()
|
133 |
-
|
134 |
-
|
135 |
-
def apply_presidio_model(sentence, verbose=True):
    """Get Presidio predictions.

    Runs the module-level Presidio analyzer/anonymizer over `sentence`, then
    re-aligns the anonymized output (which contains placeholders such as
    "<PERSON>") against the original text to produce per-word labels.

    Returns a list of (word, label) pairs where label is "<O>" for words
    outside any entity, or "<ENTITY_TYPE>" for words covered by a placeholder.
    """

    if verbose: print(f"{sentence=}")
    # anonymized_text looks like: ['<PERSON>', 'went', 'to', 'Pitier', 'Hospital', ...]

    anonymized_text = anonymizer.anonymize(text=sentence, analyzer_results=analyzer.analyze(text=sentence, language='en'))
    # NOTE(review): `.__dict__['text']` reaches into the result object; the
    # public `.text` attribute presumably exposes the same value — confirm.
    anonymized_text = anonymized_text.__dict__['text'].split()
    anonymized_text = ' '.join(anonymized_text)
    # Carries a word fragment over to the next segment when a placeholder is
    # glued to the following word with no space (e.g. "<PERSON>'s").
    next_word_to_concate = None

    if verbose: print(f"{anonymized_text=}")
    if verbose: print(f"{anonymized_text.split('<')=}")

    # `start_index` walks through `sentence`; `label` accumulates the output.
    start_index, label = 0, []
    previous_label = None

    # Splitting on '<' partitions the anonymized text into: the prefix before
    # the first placeholder (i == 0), then one segment per placeholder, each
    # of the form "TYPE> trailing plain text".
    for i, before_split in enumerate(anonymized_text.split('<')):

        if verbose:
            print(f"\nSubseq_{i}: {before_split=}")

        if i == 0:
            # The prefix is plain text: it must match the sentence verbatim.
            assert len(before_split) == len(sentence[start_index: len(before_split)])
            start_index = len(before_split)
            label.extend([(s, '<O>') for s in before_split.split()])
        else:
            # after_split[0] is the entity type, after_split[-1] is the plain
            # text that follows the placeholder.
            after_split = before_split.split(">")
            if verbose:
                print(f" -----> ", after_split)
                print(sentence[start_index:])
                print(sentence[start_index:].find(after_split[-1]))

            # Locate the trailing plain text inside the original sentence:
            # everything between start_index and start2_index was anonymized.
            start2_index = start_index + sentence[start_index:].find(after_split[-1])
            end2_index = start2_index + len(after_split[-1])

            if verbose:
                print(f"Sanity check: '[{sentence[start2_index: end2_index]}]' VS '[{after_split[-1]}]'")
                print(f"Hidden part: sentence[{start2_index}: {end2_index}] = {sentence[start2_index: end2_index]}")

            assert sentence[start2_index: end2_index] == after_split[-1]

            # Empty trailing text means the placeholder runs to the sentence end.
            start2_index = start2_index if start2_index != start_index else len(sentence)

            # Every original word covered by the placeholder gets the entity tag.
            for j, anonimyzed_word in enumerate((sentence[start_index: start2_index]).split()):
                if next_word_to_concate != None and j == 0:
                    # Re-attach the fragment carried over from the previous segment.
                    label.append((f"{next_word_to_concate}{anonimyzed_word}", f"<{after_split[0]}>"))
                    next_word_to_concate = None
                else:
                    label.append((anonimyzed_word, f"<{after_split[0]}>"))

                previous_label = f"<{after_split[0]}>"

            # No space between the trailing text and the NEXT placeholder:
            # hold back the last word so it can be glued to the next segment.
            if len(sentence[start2_index: end2_index]) >= 1 and after_split[-1][-1] != ' ' and i != len(anonymized_text.split('<')) - 1:
                if verbose: print("Is there a space after?", after_split, after_split[-1][-1], i, len(anonymized_text.split('<')))

                for j, anonimyzed_word in enumerate((after_split[-1]).split()[:-1]):
                    label.append((anonimyzed_word, "<O>"))

                next_word_to_concate = (after_split[-1]).split()[-1]

            # No space between the placeholder and its trailing text: glue the
            # first trailing word onto the previously emitted labeled word.
            elif len(sentence[start2_index: end2_index]) >= 1 and after_split[-1][0] != ' ' and i != len(anonymized_text.split('<')) - 1:
                if verbose: print("Is there a space before?", after_split, after_split[-1][0], i, len(anonymized_text.split('<')))

                label[-1] = (f"{label[-1][0]}{after_split[-1].split()[0]}", previous_label)

                for j, anonimyzed_word in enumerate((after_split[-1]).split()[1:]):
                    label.append((anonimyzed_word, "<O>"))

            else:
                # Clean boundaries: all trailing words are plain "<O>" text.
                for j, anonimyzed_word in enumerate((after_split[-1]).split()):
                    label.append((anonimyzed_word, "<O>"))

            start_index = end2_index

    return label
|
211 |
-
|
212 |
-
|
213 |
-
def apply_flair_model(original_sentence):
    """Get Flair predictions aligned to word boundaries.

    Runs the "flair/ner-english-large" tagger over `original_sentence` and
    maps its span-level predictions back onto the words returned by
    `get_word_boundaries`.

    Returns a list of (word, label) pairs, one per word boundary, with "<O>"
    for words that Flair did not tag.
    """

    logging.getLogger('flair').setLevel(logging.WARNING)

    # Fix: the original reloaded the (very large) tagger on EVERY call, which
    # dominated runtime and memory. Load it once and cache it on the function
    # object; subsequent calls reuse the same model instance.
    if getattr(apply_flair_model, "_tagger", None) is None:
        apply_flair_model._tagger = SequenceTagger.load("flair/ner-english-large")
    tagger = apply_flair_model._tagger

    flair_sentence = Sentence(original_sentence)
    tagger.predict(flair_sentence)

    word_boundaries = get_word_boundaries(original_sentence)

    # Flatten span-level predictions into per-token records:
    # [surface form, entity label, confidence score, start offset, end offset].
    ner = [[i_token.form,
            b_token.get_label().value,
            i_token.get_label().score,
            i_token.start_position,
            i_token.end_position] for b_token in flair_sentence.get_spans("ner") for i_token in b_token]

    ner_labels, ner_index = [], 0

    # Walk the word boundaries in order; a word takes the next Flair label
    # only when both its character offsets and its surface form match exactly,
    # otherwise it is tagged "<O>" (outside any entity).
    for start, end in word_boundaries:
        word_from_text = original_sentence[start:end]
        if ner_index < len(ner):
            form, label, _, s, e = ner[ner_index]

            if (s, e) == (start, end) and word_from_text == form:
                ner_labels.append((word_from_text, label))
                ner_index += 1
            else:
                ner_labels.append((word_from_text, "<O>"))
        else:
            ner_labels.append((word_from_text, "<O>"))

    # One output pair per input word, by construction.
    assert len(ner_labels) == len(word_boundaries)

    return ner_labels
|
248 |
-
|
249 |
-
|
250 |
-
def preprocess_sentences(sentence, verbose=False):
    """Normalize a raw sentence before NER labeling.

    Applies, in order:
      1. collapse runs of newlines into single spaces,
      2. squeeze repeated spaces,
      3. split possessive "'s" into a separate "s" token,
      4. drop spaces that precede punctuation,
      5. strip punctuation characters at word edges (from `punctuation_list`).

    Returns the cleaned sentence; prints each intermediate result if `verbose`.
    """

    escaped = re.escape(punctuation_list)
    # Punctuation with no word character immediately before OR after it.
    edge_punctuation = r'(?<!\w)[{}]|[{}](?!\w)'.format(escaped, escaped)

    # (pattern, replacement) pairs applied sequentially.
    passes = (
        (r'\n+', ' '),              # removing extra newlines
        (' +', ' '),                # collapsing multiple spaces
        (r"'s\b", " s"),            # handling apostrophes in possessives
        (r'\s([,.!?;:])', r'\1'),   # removing spaces before punctuation
        (edge_punctuation, ''),     # leading/trailing punctuation
    )

    for pattern, replacement in passes:
        sentence = re.sub(pattern, replacement, sentence)
        if verbose:
            print(sentence)

    return sentence
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|