Spaces:
Runtime error
Runtime error
add
Browse files
data/ner_input_data/ner_dataset.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
source/services/ner/awscomprehend_2_ner_format.py
CHANGED
@@ -1,31 +1,89 @@
|
|
1 |
import json
|
|
|
2 |
import pandas as pd
|
3 |
-
|
4 |
-
json_letter = json.load(file)
|
5 |
|
6 |
|
|
|
|
|
|
|
7 |
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
data_nertags = []
|
16 |
-
|
17 |
-
ner_tag = block['Type']
|
18 |
-
for subref in block['BlockReferences']:
|
19 |
-
counter = 0
|
20 |
-
for child in subref['ChildBlocks']:
|
21 |
-
if counter == 0:
|
22 |
-
data_nertags.append({'blockid': child['ChildBlockId'], 'ner_tag': f"B-{ner_tag}"})
|
23 |
-
counter = counter+1
|
24 |
-
else:
|
25 |
-
data_nertags.append({'blockid': child['ChildBlockId'], 'ner_tag': f"I-{ner_tag}"})
|
26 |
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
print()
|
|
|
1 |
import json
|
2 |
+
import numpy as np
|
3 |
import pandas as pd
|
4 |
+
from pathlib import Path
|
|
|
5 |
|
6 |
|
7 |
+
class Comprehend2NERFormat:
    """Convert one annotation JSON file into parallel token / NER-tag lists.

    The JSON document is expected to contain a ``Blocks`` list (entries with
    ``BlockType`` ``'WORD'`` or ``'LINE'``) and an ``Entities`` list whose
    entries reference word blocks by id.  Calling the instance returns a dict
    with the word tokens, their BIO tags (``'O'`` for untagged words) and the
    name of the source file; a ``'\n'`` token is inserted after every line.
    """

    def __init__(self, letterfilepath):
        """Remember the file to convert.

        Args:
            letterfilepath: ``str`` or ``pathlib.Path`` pointing at the
                annotation JSON, e.g.
                ``data/raw_data/annotations/Letter 0-1-ccf1b225-ann.json``.
        """
        self.letterfilepath = letterfilepath

    def load_data(self):
        """Return the parsed JSON document stored at ``self.letterfilepath``."""
        # Explicit encoding: JSON is UTF-8, the platform default may not be.
        with open(self.letterfilepath, "r", encoding="utf-8") as file:
            return json.load(file)

    @staticmethod
    def get_tokens(jsondata):
        """Return a DataFrame (``blockid``, ``token``) with one row per WORD block."""
        rows = [
            {'blockid': block['Id'], 'token': block['Text']}
            for block in jsondata['Blocks']
            if block["BlockType"] == 'WORD'
        ]
        return pd.DataFrame(rows, columns=['blockid', 'token'])

    @staticmethod
    def get_line_child_ids(jsondata):
        """Return a DataFrame (``lineid``, ``childid``) mapping LINE blocks to children.

        The child ids are taken from the first entry of the block's
        ``Relationships`` list.  LINE blocks without any relationship are
        skipped instead of raising.
        """
        rows = []
        for block in jsondata['Blocks']:
            if block["BlockType"] != 'LINE':
                continue
            relationships = block.get('Relationships') or []
            if not relationships:
                continue
            rows.extend(
                {'lineid': block['Id'], 'childid': child_id}
                for child_id in relationships[0]['Ids']
            )
        # Build the frame once; concatenating inside the loop is quadratic.
        return pd.DataFrame(rows, columns=['lineid', 'childid'])

    @staticmethod
    def get_ner_tags(jsondata):
        """Return a DataFrame (``blockid``, ``ner_tag``) with BIO tags per word.

        Within each block reference the first child block is tagged
        ``B-<Type>`` and every following child ``I-<Type>``.
        """
        rows = []
        for entity in jsondata['Entities']:
            ner_tag = entity['Type']
            for subref in entity['BlockReferences']:
                for position, child in enumerate(subref['ChildBlocks']):
                    prefix = 'B' if position == 0 else 'I'
                    rows.append({'blockid': child['ChildBlockId'],
                                 'ner_tag': f"{prefix}-{ner_tag}"})
        return pd.DataFrame(rows, columns=['blockid', 'ner_tag'])

    @staticmethod
    def insert_newline_char(df_prev):
        """Append a ``'\n'`` token (tag ``'O'``) after the last word of every line.

        Args:
            df_prev: DataFrame with columns ``blockid``, ``token``,
                ``ner_tag``, ``lineid`` and ``linewordrank``; rows of one
                line are expected in ascending ``linewordrank`` order.

        Returns:
            A new DataFrame sorted by ``linewordrank``.  Each newline row is
            ranked 0.1 above the last word of its line.  An input without
            any line group is returned as an unchanged copy.

        Note:
            Rows whose ``lineid`` is missing are not part of any group
            (pandas ``groupby`` drops NaN keys by default) and therefore do
            not appear in the result.
        """
        frames = []
        for _, line_rows in df_prev.groupby('lineid'):
            newline_row = pd.DataFrame(
                {'blockid': 'newline', 'token': '\n', 'ner_tag': 'O',
                 'lineid': 'newline',
                 'linewordrank': line_rows['linewordrank'].iloc[-1] + 0.1},
                index=[0],
            )
            frames.append(line_rows)
            frames.append(newline_row)
        if not frames:
            # Nothing to group (e.g. no WORD blocks): sorting a column-less
            # frame by 'linewordrank' would raise KeyError.
            return df_prev.copy()
        return pd.concat(frames, axis=0).sort_values(by='linewordrank')

    def __call__(self):
        """Run the conversion and return ``tokens``, ``ner_tags`` and ``filename``."""
        json_letter = self.load_data()
        df_token = self.get_tokens(jsondata=json_letter)
        df_line = self.get_line_child_ids(jsondata=json_letter)
        df_nertags = self.get_ner_tags(jsondata=json_letter)

        df1 = pd.merge(df_token, df_nertags, on='blockid', how='left')
        # Plain column assignment: chained indexing (df[col][mask] = ...) is
        # not guaranteed to write back into df1.
        df1['ner_tag'] = df1['ner_tag'].fillna('O')
        df2 = pd.merge(
            df1, df_line, left_on='blockid', right_on='childid', how='left'
        ).drop(columns=['childid'])
        # Document order of the words, used to re-sort after newline insertion.
        df2['linewordrank'] = np.arange(df2.shape[0])

        df3 = self.insert_newline_char(df_prev=df2)
        return {
            "tokens": df3['token'].tolist(),
            "ner_tags": df3['ner_tag'].tolist(),
            # Path(...) so that a plain string path works as well.
            "filename": Path(self.letterfilepath).name,
        }
|
79 |
+
|
80 |
+
|
81 |
+
if __name__ == '__main__':
    # Convert every annotation file below the raw-data folder and write the
    # results as one JSON list.
    #
    # A bare '**/*' pattern also yields directories (and any non-JSON file),
    # which would crash in open()/json.load(); restrict to regular *.json
    # files.  NOTE(review): assumes all annotation files end in '.json', as
    # the '...-ann.json' example path suggests - confirm.
    # Sorting makes the dataset order independent of filesystem order.
    annotation_files = sorted(
        path
        for path in Path(r'data/raw_data/annotations/').glob('**/*.json')
        if path.is_file()
    )
    dataset_lst = [
        Comprehend2NERFormat(letterfilepath=path)() for path in annotation_files
    ]

    output_path = Path('data/ner_input_data/ner_dataset.json')
    # Create the target folder on first run instead of failing in open().
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(dataset_lst, f)
|
89 |
|
|