Spaces:
Running
Running
from typing import List | |
from langchain.document_loaders.unstructured import UnstructuredFileLoader | |
from document_loaders.ocr import get_ocr | |
class RapidOCRLoader(UnstructuredFileLoader): | |
def _get_elements(self) -> List: | |
def img2text(filepath): | |
resp = "" | |
ocr = get_ocr() | |
result, _ = ocr(filepath) | |
if result: | |
ocr_result = [line[1] for line in result] | |
resp += "\n".join(ocr_result) | |
return resp | |
text = img2text(self.file_path) | |
from unstructured.partition.text import partition_text | |
return partition_text(text=text, **self.unstructured_kwargs) | |
if __name__ == "__main__": | |
loader = RapidOCRLoader(file_path="../tests/samples/ocr_test.jpg") | |
docs = loader.load() | |
print(docs) | |