aimlnerd committed on
Commit
5348cff
1 Parent(s): 0806367
data/ner_input_data/ner_dataset.json ADDED
The diff for this file is too large to render. See raw diff
 
source/services/ner/awscomprehend_2_ner_format.py CHANGED
@@ -1,31 +1,89 @@
1
  import json
 
2
  import pandas as pd
3
# Parse a single annotated letter and line its WORD tokens up with BIO-style
# NER tags taken from the entity annotations.
with open(r"data/raw_data/annotations/Letter 0-1-ccf1b225-ann.json", "r") as file:
    json_letter = json.load(file)

# One row per WORD block: its block id and the text of the token.
data_token = [
    {'blockid': word_block['Id'], 'token': word_block['Text']}
    for word_block in json_letter['Blocks']
    if word_block["BlockType"] == 'WORD'
]
df_token = pd.DataFrame(data_token, columns=['blockid', 'token'])

# One row per annotated child block. Within each block reference the first
# child is tagged "B-<type>" and every following child "I-<type>".
data_nertags = []
for entity in json_letter['Entities']:
    entity_type = entity['Type']
    for reference in entity['BlockReferences']:
        for position, child_block in enumerate(reference['ChildBlocks']):
            if position == 0:
                tag = f"B-{entity_type}"
            else:
                tag = f"I-{entity_type}"
            data_nertags.append({'blockid': child_block['ChildBlockId'], 'ner_tag': tag})
df_nertags = pd.DataFrame(data_nertags, columns=['blockid', 'ner_tag'])

# Left join keeps every token; tokens without an annotation get a missing tag.
df = pd.merge(df_token, df_nertags, on='blockid', how='left')

print()
 
1
  import json
2
+ import numpy as np
3
  import pandas as pd
4
+ from pathlib import Path
 
5
 
6
 
7
class Comprehend2NERFormat:
    """Convert one annotation JSON file into parallel token / NER-tag lists.

    The JSON document is expected to have a ``Blocks`` list (entries with
    ``BlockType``, ``Id`` and, for words, ``Text``) and an ``Entities`` list
    (entries with ``Type`` and ``BlockReferences``). Calling the instance
    returns the tokens in document order with a ``'\\n'`` token inserted after
    the last word of every line, plus a BIO-style tag per token.
    """

    def __init__(self, letterfilepath):
        """Remember which annotation file to convert.

        Args:
            letterfilepath: Path of the annotation JSON file, as a ``str`` or
                a ``pathlib.Path``.
        """
        self.letterfilepath = letterfilepath

    def load_data(self):
        """Read and parse the annotation file.

        Returns:
            The decoded JSON document.
        """
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(self.letterfilepath, "r", encoding="utf-8") as file:
            return json.load(file)

    @staticmethod
    def get_tokens(jsondata):
        """Collect every WORD block as a token.

        Args:
            jsondata: Decoded annotation document.

        Returns:
            DataFrame with columns ``blockid`` and ``token``, one row per WORD
            block, in the order the blocks appear in ``jsondata['Blocks']``.
        """
        records = [
            {'blockid': block['Id'], 'token': block['Text']}
            for block in jsondata['Blocks']
            if block["BlockType"] == 'WORD'
        ]
        return pd.DataFrame(records, columns=['blockid', 'token'])

    @staticmethod
    def get_line_child_ids(jsondata):
        """Map every LINE block to the ids listed in its first relationship.

        Args:
            jsondata: Decoded annotation document.

        Returns:
            DataFrame with columns ``lineid`` and ``childid``, one row per
            (line, child) pair. LINE blocks that carry no relationships
            contribute no rows.
        """
        # Rows are gathered in a list and turned into a frame once; growing a
        # DataFrame with concat inside the loop is quadratic.
        records = []
        for block in jsondata['Blocks']:
            if block["BlockType"] != 'LINE':
                continue
            relationships = block.get('Relationships') or []
            if not relationships:
                # A line without children has nothing to map.
                continue
            for childid in relationships[0].get('Ids') or []:
                records.append({'lineid': block['Id'], 'childid': childid})
        return pd.DataFrame(records, columns=['lineid', 'childid'])

    @staticmethod
    def get_ner_tags(jsondata):
        """Derive a BIO-style tag for every annotated child block.

        Within each block reference of an entity, the first child block gets
        ``B-<Type>`` and every following child block gets ``I-<Type>``.

        Args:
            jsondata: Decoded annotation document.

        Returns:
            DataFrame with columns ``blockid`` and ``ner_tag``.
        """
        records = []
        for entity in jsondata['Entities']:
            ner_tag = entity['Type']
            for subref in entity['BlockReferences']:
                for position, child in enumerate(subref['ChildBlocks']):
                    prefix = 'B' if position == 0 else 'I'
                    records.append({'blockid': child['ChildBlockId'],
                                    'ner_tag': f"{prefix}-{ner_tag}"})
        return pd.DataFrame(records, columns=['blockid', 'ner_tag'])

    @staticmethod
    def insert_newline_char(df_prev):
        """Insert a ``'\\n'`` token after the last word of every line.

        Args:
            df_prev: DataFrame with at least the columns ``lineid`` and
                ``linewordrank``; ``linewordrank`` gives the token order.

        Returns:
            A new DataFrame holding every row of ``df_prev`` plus one newline
            row per distinct non-null ``lineid``, sorted by ``linewordrank``
            with a fresh index. Rows whose ``lineid`` is missing are kept
            as-is and get no newline of their own.
        """
        # Last row of each line, in frame order. Rows with a missing lineid are
        # only excluded from this lookup; they stay in the output.
        last_words = (df_prev.dropna(subset=['lineid'])
                      .drop_duplicates(subset='lineid', keep='last'))
        newline_rows = pd.DataFrame(
            [
                # +0.1 places the newline directly after the line's last word
                # and before the next word when ranks are consecutive integers.
                {'blockid': 'newline', 'token': '\n', 'ner_tag': 'O',
                 'lineid': 'newline', 'linewordrank': rank + 0.1}
                for rank in last_words['linewordrank']
            ],
            columns=['blockid', 'token', 'ner_tag', 'lineid', 'linewordrank'],
        )
        frames = [df_prev] if newline_rows.empty else [df_prev, newline_rows]
        df = pd.concat(frames, axis=0)
        # mergesort is stable, so rows with equal ranks keep their input order.
        return df.sort_values(by='linewordrank', kind='mergesort').reset_index(drop=True)

    def __call__(self):
        """Run the full conversion for ``self.letterfilepath``.

        Returns:
            Dict with ``tokens`` (list of str), ``ner_tags`` (list of str, the
            same length as ``tokens``) and ``filename`` (base name of the
            annotation file).
        """
        json_letter = self.load_data()
        df_token = self.get_tokens(jsondata=json_letter)
        df_line = self.get_line_child_ids(jsondata=json_letter)
        df_nertags = self.get_ner_tags(jsondata=json_letter)

        df1 = pd.merge(df_token, df_nertags, on='blockid', how='left')
        # Tokens that are not part of any entity are tagged 'O'. Assign the
        # column back: chained indexing (df[col][mask] = ...) may write to a
        # temporary copy and leave df1 unchanged.
        df1['ner_tag'] = df1['ner_tag'].fillna('O')
        df2 = pd.merge(df1, df_line, left_on='blockid', right_on='childid',
                       how='left').drop(columns=['childid'])
        # Freeze the current row order so it survives the newline insertion.
        df2['linewordrank'] = np.arange(df2.shape[0])

        df3 = self.insert_newline_char(df_prev=df2)
        return {"tokens": df3['token'].tolist(),
                "ner_tags": df3['ner_tag'].tolist(),
                # Path() accepts both str and Path, so either kind of
                # letterfilepath works here.
                "filename": Path(self.letterfilepath).name
                }
79
+
80
+
81
if __name__ == '__main__':
    # Convert every annotation file below the annotations folder and write the
    # combined dataset as a single JSON list.
    annotation_dir = Path(r'data/raw_data/annotations/')
    dataset_lst = []
    # '**/*' would also yield directories and stray non-JSON files, which make
    # open()/json.load fail. Sorting makes the dataset order reproducible
    # instead of depending on filesystem order.
    for file in sorted(annotation_dir.glob('**/*.json')):
        if not file.is_file():
            continue
        comprehend2NERFormat = Comprehend2NERFormat(letterfilepath=file)
        dataset_lst.append(comprehend2NERFormat())

    output_path = Path('data/ner_input_data/ner_dataset.json')
    # Create the target folder on a fresh checkout rather than failing on open().
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(dataset_lst, f)
89