from constants import DOCUMENT_COLLECTION
from openai_constants import ENTITY_EXTRACTION_PROMPT, ENTITY_EXTRACTION_FUNCTION, GPT35_PRICING


def extract_all_documents(openai_instance, chunks):
    """Run entity extraction on every chunk and merge the results.

    Each chunk is sent to ``openai_instance.generate_response`` together with
    ``ENTITY_EXTRACTION_PROMPT`` and ``ENTITY_EXTRACTION_FUNCTION``. The
    response is read as a mapping with a ``'function_output'`` mapping
    (entity name -> value or list of values) and a ``'usage'`` mapping that
    may contain ``'prompt_tokens'`` and ``'completion_tokens'``.

    Args:
        openai_instance: Object exposing ``generate_response(prompt, chunk,
            function)``.
        chunks: Sized iterable of text chunks to process.

    Returns:
        A tuple ``(all_entities, all_usage)``:

        * ``all_entities`` maps every entity key seen to a flat list of all
          values collected for it across chunks.
        * ``all_usage`` holds ``'prompt_tokens'``, ``'completion_tokens'``,
          ``'output_pricing'`` and ``'input_pricing'``. It is always fully
          populated (zeros when ``chunks`` is empty).
    """
    all_entities = {}
    total_prompt_tokens = 0
    total_completion_tokens = 0

    print(f"Number of chunks to process :: {len(chunks)}")
    for chunk_idx, chunk in enumerate(chunks):
        print(f"Sending request to OpenAI for {chunk_idx}")
        openai_entities_out = openai_instance.generate_response(
            ENTITY_EXTRACTION_PROMPT, chunk, ENTITY_EXTRACTION_FUNCTION
        )
        print("OpenAI out received")
        print(openai_entities_out['function_output'])

        for key, val in openai_entities_out['function_output'].items():
            print(key, val)
            # Always accumulate into a list owned by this function, so the
            # lists inside the response payload are never mutated.
            merged = all_entities.setdefault(key, [])
            if isinstance(val, list):
                merged.extend(val)
            else:
                merged.append(val)

        usage = openai_entities_out['usage']
        total_prompt_tokens += usage.get('prompt_tokens', 0)
        total_completion_tokens += usage.get('completion_tokens', 0)

    # Prompt tokens are charged at the input rate and completion tokens at the
    # output rate. The division by 1000 assumes GPT35_PRICING holds
    # per-1K-token rates -- TODO confirm against openai_constants.
    all_usage = {
        'prompt_tokens': total_prompt_tokens,
        'completion_tokens': total_completion_tokens,
        'output_pricing': total_completion_tokens / 1000 * GPT35_PRICING['output'],
        'input_pricing': total_prompt_tokens / 1000 * GPT35_PRICING['input'],
    }
    return all_entities, all_usage


def process_insurance_document(pii_instance, mongo_instance, openai_instance, ocr_instance, document_path, document_id):
    """OCR a document, mask its PII and extract entities from the masked text.

    Steps, in order: OCR via ``ocr_instance.llama_parse_ocr``; PII detection,
    masking and anonymisation via ``pii_instance``; chunking via
    ``ocr_instance.chunk_document``; entity extraction via
    ``extract_all_documents``.

    An OCR failure is caught and logged, and processing continues with an
    empty document text. Exceptions from the PII and OpenAI steps propagate
    to the caller.

    Args:
        pii_instance: Object exposing ``identify``, ``add_mask`` and
            ``anonymize``.
        mongo_instance: Currently unused; the DB updates are disabled.
        openai_instance: Passed through to ``extract_all_documents``.
        ocr_instance: Object exposing ``llama_parse_ocr`` and
            ``chunk_document``.
        document_path: Location of the document handed to the OCR step.
        document_id: Currently unused; only referenced by the disabled DB
            updates.

    Returns:
        dict with keys ``"entities"`` (merged extraction output),
        ``"masked_text"`` (anonymised text) and ``"masked_entities"``
        (PII entities after masking).
    """
    print("---- \nInside Process insurance document function")

    # TODO: save file to S3
    document_s3_url = ""

    # OCR
    try:
        # document_text = ocr_instance.extract_text_from_document(document_path)
        document_text = ocr_instance.llama_parse_ocr(document_path)
    except Exception as ex:
        # Best effort: keep going with empty text so the caller still gets a
        # result. The status is stored as text rather than the exception
        # object so it stays serialisable once the DB update is re-enabled.
        document_text = ""
        ocr_status = str(ex)
        process_status = f"OCR Failed. {ex}"
        print(process_status)
    else:
        ocr_status = "Completed"
        process_status = "OCR Completed"
        print("OCR complete")

    # TODO: save OCR file to S3, add document S3 url
    ocr_document_s3_url = ""

    # Update ocr_status in db (currently disabled, so the message below is
    # printed even though nothing is written).
    # mongo_instance.update(DOCUMENT_COLLECTION,
    #                       {'document_id': document_id},
    #                       {'set': {'ocr_status': ocr_status, 'document_s3_url': document_s3_url,
    #                                'ocr_document_s3_url': ocr_document_s3_url, 'process_status': process_status}})
    print("OCR status updated in db")

    # PII entity extraction and masking.
    # NOTE(review): the prints below write raw PII to stdout -- confirm this
    # is acceptable outside local debugging.
    pii_entities = pii_instance.identify(document_text)
    print(f"pii entiites are :: {pii_entities}")
    pii_entities = pii_instance.add_mask(pii_entities)
    print(f"\npii_entities after adding mask :: {pii_entities}")
    masked_text = pii_instance.anonymize(pii_entities, document_text)
    print(f"\nPII anonumized text is :: {masked_text}")
    print("\nPII complete")

    # OpenAI extraction. Failures propagate; the guarded variant is kept here
    # for reference:
    # try:
    #     openai_entities, all_usage = extract_all_documents(openai_instance, chunks)
    #     entity_extraction_status = 'Completed'
    #     process_status = 'Document term extraction completed'
    # except Exception as ex:
    #     openai_entities = {}
    #     all_usage = {}
    #     entity_extraction_status = ex
    #     process_status = f"Document term extraction failed. {ex}"
    chunks = ocr_instance.chunk_document(masked_text)
    openai_entities, all_usage = extract_all_documents(openai_instance, chunks)
    entity_extraction_status = 'Completed'
    process_status = 'Document term extraction completed'

    print(f"openai_entities are :: {openai_entities}")
    print("Request to OpenAI complete")
    print("----------- \nProcessing complete\n ")

    # TODO: unmask PII entities in openai entities

    # Update entity extraction status in db (currently disabled).
    # mongo_instance.update(DOCUMENT_COLLECTION,
    #                       {'document_id': document_id},
    #                       {'set': {'entity_extraction_status': entity_extraction_status,
    #                                'entities': openai_entities, 'process_status': process_status}})
    # print(f"Entities updated in DB")

    return {
        "entities": openai_entities,
        "masked_text": masked_text,
        "masked_entities": pii_entities,
    }