""" OCR @author : Sakshi Tantak """ # Imports import json from azure.core.credentials import AzureKeyCredential from azure.ai.formrecognizer import DocumentAnalysisClient import pymupdf4llm, pymupdf from invoice_extractor import CREDENTIALS def convert_nested_complex_obj_to_json(result): result = json.loads(json.dumps(result, default = lambda o : o.__dict__)) return result class AzureLayoutOCR: def __init__(self): self.client = self._authenticate() self.engine = 'azure/layout' def _authenticate(self): client = DocumentAnalysisClient( endpoint=CREDENTIALS['azure']['layout']['endpoint'], credential=AzureKeyCredential(CREDENTIALS['azure']['layout']['key']), connection_verify=False ) return client def _table2md(self, table, **kwargs): row_count, column_count = table['row_count'], table['column_count'] cells = table['cells'] markdown_table = [] table_offsets = (table['spans'][0]['offset'], table['spans'][-1]['offset'] + table['spans'][-1]['length']) for _ in range(row_count + 1): row = [''] * column_count markdown_table.append(row) header_row_idx = [0] for cell in cells: row_index = cell['row_index'] if cell['kind'] == 'columnHeader': # Headers are in the first row of markdown_table, which is row_index 0 markdown_table[row_index + 1][cell['column_index']] = '**' + cell['content'].replace('|', '') + '**' header_row_idx.append(row_index + 1) else: # Content cells are offset by 1 due to headers markdown_table[row_index + 1][cell['column_index']] = cell['content'].replace('|', '') markdown_output = '' for row in markdown_table: markdown_output += '| ' + ' | '.join(row) + ' |\n' if markdown_table.index(row) in header_row_idx: # if markdown_table.index(row) == 0: # Add a separator after the header markdown_output += '| ' + ' | '.join(['---'] * column_count) + ' |\n' return markdown_output, table_offsets def _paragraphs2md(self, paragraph, element_offsets, **kwargs): paragraph_offsets = ( paragraph['spans'][0]['offset'], paragraph['spans'][-1]['offset'] + paragraph['spans'][-1]['length']) for offset in element_offsets: if paragraph_offsets[0] >= offset[0] and paragraph['spans'][0]['offset'] <= offset[1]: return None, None markdown_text = '' if paragraph['role'] == 'title': markdown_text += f'# {paragraph["content"]}' elif paragraph == "sectionHeading": markdown_text += f'## {paragraph["content"]}' else: markdown_text += f'{paragraph["content"]}' return markdown_text, paragraph_offsets def _stitch_paragraphs_elements(self, paragraphs, elements, **kwargs): new_list = paragraphs + elements sorted_new_list = sorted(new_list, key=lambda x: x['offset'][0]) return sorted_new_list def _convert2md(self, result, **kwargs): paragraphs, tables = result['paragraphs'], result['tables'] md_tables = [] for table in tables: md, offset = self._table2md(table, requestId=kwargs.get('requestId')) md_tables.append({'content': md, 'offset': offset}) table_offsets = [element['offset'] for element in md_tables] md_paragraphs = [] for para in paragraphs: md, offset = self._paragraphs2md(para, table_offsets, requestId=kwargs.get('requestId')) if md is not None: md_paragraphs.append({'content': md, 'offset': offset}) all_md_elements = self._stitch_paragraphs_elements(md_paragraphs, md_tables, requestId=kwargs.get('requestId')) full_md = '\n\n'.join([record['content'] for record in all_md_elements]) return full_md def _call_engine(self, image_reader, **kwargs): poller = self.client.begin_analyze_document( CREDENTIALS['azure']['layout']['model'], image_reader ) result = poller.result() result = convert_nested_complex_obj_to_json(result) md_text = self._convert2md(result, requestId=kwargs.get('requestId')) return md_text, result def __call__(self, file_bytes): text, raw_response = self._call_engine(file_bytes) return text, raw_response class PyMuPDF4LLMOCR: def __init__(self): self.engine = 'open-source/pymupdf4llm' self.file_type = 'pdf' def _create_document(self, file_bytes, file_type = None): return pymupdf.open(stream = file_bytes, filetype = self.file_type if file_type is None else file_type) def __call__(self, file_bytes, file_type = None): document = self._create_document(file_bytes, file_type) response = pymupdf4llm.to_markdown(document) return response, None if __name__ == '__main__': import sys filepath = sys.argv[1] file_bytes = open(filepath, 'rb').read() ocr = AzureLayoutOCR() text, raw_response = ocr(file_bytes) print(text)