adit94 committed on
Commit
b07041b
1 Parent(s): ac9589b

Update services/ocr_service.py

Browse files
Files changed (1) hide show
  1. services/ocr_service.py +28 -1
services/ocr_service.py CHANGED
@@ -8,7 +8,8 @@ from pdf2image import convert_from_path
8
 
9
 
10
  class OCRService:
11
- def __init__(self):
 
12
  return
13
 
14
  def extract_ocrless_pdf(self, filepath):
@@ -85,3 +86,29 @@ class OCRService:
85
  del chunks[id]
86
 
87
  return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  class OCRService:
11
+ def __init__(self, LLAMAPARSE_API_KEY):
12
+ self.llama_parse_key = LLAMAPARSE_API_KEY
13
  return
14
 
15
  def extract_ocrless_pdf(self, filepath):
 
86
  del chunks[id]
87
 
88
  return chunks
89
+
90
def llama_parse_ocr(self, file_path, poll_interval=2.0, timeout=None):
    """Upload a PDF to the LlamaParse API and return the parsed markdown.

    The file is posted to the upload endpoint, then the job's result URL is
    polled until it answers with HTTP 200.

    Args:
        file_path: Path of the PDF to upload. It is also sent as the
            multipart filename.
        poll_interval: Seconds to sleep between result polls.
        timeout: Maximum seconds to wait for the result. ``None`` (the
            default) waits indefinitely.

    Returns:
        The value stored under the ``'markdown'`` key of the result JSON.

    Raises:
        requests.HTTPError: If the upload request returns an error status.
        TimeoutError: If ``timeout`` is set and elapses before a 200 result.
    """
    import time  # local import: the file's import block is not visible here

    llamaparse_url = 'https://api.cloud.llamaindex.ai/api/parsing/upload'
    headers = {
        'accept': 'application/json',
        'Authorization': f'Bearer {self.llama_parse_key}',
    }

    # Close the file handle as soon as the upload completes.
    with open(file_path, 'rb') as pdf_file:
        files = {
            'file': (file_path, pdf_file, 'application/pdf'),
        }
        response = requests.post(llamaparse_url, headers=headers, files=files)
    # Fail with the HTTP error instead of a KeyError on the missing "id".
    response.raise_for_status()

    upload_payload = response.json()
    print(upload_payload)  # If you want to print the JSON response

    job_id = upload_payload["id"]
    result_type = "markdown"

    llamaparse_result_url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/{result_type}"

    deadline = None if timeout is None else time.monotonic() + timeout

    # Check for the result until it's ready. Any non-200 status is treated
    # as "not ready yet" -- NOTE(review): confirm which status codes the API
    # uses for failed jobs so they can be surfaced instead of retried.
    while True:
        response = requests.get(llamaparse_result_url, headers=headers)
        if response.status_code == 200:
            break
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"LlamaParse job {job_id} did not finish within {timeout} seconds"
            )
        # Sleep between polls so the loop does not hammer the API.
        time.sleep(poll_interval)

    return response.json()['markdown']