import json
from pathlib import Path

import numpy as np
import pandas as pd


class Comprehend2NERFormat:
    """Convert one annotation JSON file into parallel token / NER-tag lists.

    The input document must contain a ``'Blocks'`` list (``WORD`` and ``LINE``
    blocks) and an ``'Entities'`` list whose items reference word blocks.
    Presumably this is the Amazon Comprehend / Textract annotation layout,
    judging by the class name -- confirm against the data source.

    Example input path:
    ``data/raw_data/annotations/Letter 0-1-ccf1b225-ann.json``
    """

    def __init__(self, letterfilepath):
        """Store the path of the annotation file.

        Args:
            letterfilepath: Path to the JSON annotation file, as a ``str`` or
                ``pathlib.Path``.
        """
        # Coerce to Path so that ``.name`` in __call__ also works for str input.
        self.letterfilepath = Path(letterfilepath)

    def load_data(self):
        """Read and return the parsed JSON document at ``self.letterfilepath``."""
        with open(self.letterfilepath, "r", encoding="utf-8") as file:
            return json.load(file)

    @staticmethod
    def get_tokens(jsondata):
        """Return a DataFrame (``blockid``, ``token``) with one row per WORD block.

        Rows keep the order in which the WORD blocks appear in
        ``jsondata['Blocks']``.
        """
        data_token = [
            {'blockid': block['Id'], 'token': block['Text']}
            for block in jsondata['Blocks']
            if block["BlockType"] == 'WORD'
        ]
        return pd.DataFrame(data_token, columns=['blockid', 'token'])

    @staticmethod
    def get_line_child_ids(jsondata):
        """Return a DataFrame (``lineid``, ``childid``) mapping lines to children.

        For every LINE block, one row is emitted per id listed in the block's
        first relationship entry. LINE blocks without any relationship entry
        contribute no rows.
        """
        rows = []
        for block in jsondata['Blocks']:
            if block["BlockType"] != 'LINE':
                continue
            relationships = block.get('Relationships') or []
            if not relationships:
                # A line with no children has nothing to map.
                continue
            line_id = block['Id']
            rows.extend(
                {'lineid': line_id, 'childid': child_id}
                for child_id in relationships[0]['Ids']
            )
        # Build the frame once instead of concatenating inside the loop.
        return pd.DataFrame(rows, columns=['lineid', 'childid'])

    @staticmethod
    def get_ner_tags(jsondata):
        """Return a DataFrame (``blockid``, ``ner_tag``) with BIO-style tags.

        Within each block reference of an entity, the first child block is
        tagged ``B-<Type>`` and every following child ``I-<Type>``. The ``B-``
        prefix restarts for every block reference of the same entity.
        """
        data_nertags = []
        for entity in jsondata['Entities']:
            ner_tag = entity['Type']
            for reference in entity['BlockReferences']:
                for position, child in enumerate(reference['ChildBlocks']):
                    prefix = 'B' if position == 0 else 'I'
                    data_nertags.append({
                        'blockid': child['ChildBlockId'],
                        'ner_tag': f"{prefix}-{ner_tag}",
                    })
        return pd.DataFrame(data_nertags, columns=['blockid', 'ner_tag'])

    @staticmethod
    def insert_newline_char(df_prev):
        """Append a ``'\\n'`` token row after the last word of every line.

        Args:
            df_prev: DataFrame with at least ``lineid`` and ``linewordrank``
                columns, where ``linewordrank`` gives the global word order.

        Returns:
            A DataFrame sorted by ``linewordrank`` in which every ``lineid``
            group is followed by a synthetic row (``token='\\n'``,
            ``ner_tag='O'``, ``blockid``/``lineid`` set to ``'newline'``).
            Rows whose ``lineid`` is missing are not part of any group and are
            therefore not returned. If there are no groups at all, an empty
            frame with the columns of ``df_prev`` is returned.
        """
        pieces = []
        for _, line_df in df_prev.groupby('lineid'):
            newline_row = pd.DataFrame(
                {
                    'blockid': 'newline',
                    'token': '\n',
                    'ner_tag': 'O',
                    'lineid': 'newline',
                    # Slightly above the last word's rank so that the final
                    # sort places the newline directly after that word.
                    'linewordrank': line_df['linewordrank'].iloc[-1] + 0.1,
                },
                index=[0],
            )
            pieces.extend([line_df, newline_row])

        if not pieces:
            return df_prev.iloc[0:0]

        # groupby orders groups by line id, not by document position; sorting
        # by rank restores the reading order.
        return pd.concat(pieces, axis=0).sort_values(by='linewordrank')

    def __call__(self):
        """Run the conversion for the configured file.

        Returns:
            dict with keys ``"tokens"`` (list of token strings, including the
            inserted ``'\\n'`` tokens), ``"ner_tags"`` (list of tags aligned
            with ``"tokens"``) and ``"filename"`` (base name of the input).
        """
        json_letter = self.load_data()
        df_token = self.get_tokens(jsondata=json_letter)
        df_line = self.get_line_child_ids(jsondata=json_letter)
        df_nertags = self.get_ner_tags(jsondata=json_letter)

        df1 = pd.merge(df_token, df_nertags, on='blockid', how='left')
        # Words that are not part of any entity get the "outside" tag.
        # Assign the column as a whole: chained indexing assignment is not
        # guaranteed to write back into df1.
        df1['ner_tag'] = df1['ner_tag'].fillna('O')

        df2 = pd.merge(
            df1, df_line, left_on='blockid', right_on='childid', how='left'
        ).drop(columns=['childid'])
        df2['linewordrank'] = np.arange(df2.shape[0])

        df3 = self.insert_newline_char(df_prev=df2)
        return {
            "tokens": df3['token'].tolist(),
            "ner_tags": df3['ner_tag'].tolist(),
            "filename": self.letterfilepath.name,
        }


def main():
    """Convert every annotation file and write the combined dataset as JSON."""
    annotation_dir = Path(r'data/raw_data/annotations/')
    dataset_lst = [
        Comprehend2NERFormat(letterfilepath=file)()
        # '**/*' also yields directories, which cannot be opened as files.
        for file in sorted(annotation_dir.glob('**/*'))
        if file.is_file()
    ]

    if dataset_lst:
        print(" ".join(dataset_lst[0]['tokens']))

    output_path = Path('data/ner_input_data/ner_dataset.json')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(dataset_lst, f)


if __name__ == '__main__':
    main()