Rauhan committed on
Commit
47650a0
·
1 Parent(s): b92a943

UPDATE: New Endpoints

Browse files
Files changed (2) hide show
  1. functions.py +5 -12
  2. requirements.txt +1 -2
functions.py CHANGED
@@ -20,7 +20,7 @@ from qdrant_client import QdrantClient
20
  from langchain_groq import ChatGroq
21
  from pdf2image import convert_from_bytes
22
  import numpy as np
23
- from paddleocr import PaddleOCR
24
  from bs4 import BeautifulSoup
25
  from urllib.parse import urlparse, urljoin
26
  from supabase import create_client
@@ -40,7 +40,7 @@ vectorEmbeddings = HuggingFaceEmbeddings(
40
  model_kwargs = model_kwargs,
41
  encode_kwargs = encode_kwargs
42
  )
43
- ocr = PaddleOCR(use_angle_cls=True, lang='en')
44
  sparseEmbeddings = FastEmbedSparse(model = "Qdrant/BM25")
45
  prompt = """
46
  INSTRUCTIONS:
@@ -290,15 +290,8 @@ def getLinks(url: str, timeout = 30):
290
 
291
 
292
  def getTextFromImagePDF(pdfBytes):
293
- global ocr
294
  allImages = convert_from_bytes(pdfBytes)
295
  allImages = [np.array(image) for image in allImages]
296
- pageWiseText = []
297
- for page in allImages:
298
- result = ocr.ocr(page)
299
- if result[0]:
300
- retrievedText = "\n".join([result[0][x][1][0] for x in range(len(result[0]))])
301
- else:
302
- retrievedText = ""
303
- pageWiseText.append(retrievedText)
304
- return "\n\n\n".join(pageWiseText)
 
20
  from langchain_groq import ChatGroq
21
  from pdf2image import convert_from_bytes
22
  import numpy as np
23
+ import easyocr
24
  from bs4 import BeautifulSoup
25
  from urllib.parse import urlparse, urljoin
26
  from supabase import create_client
 
40
  model_kwargs = model_kwargs,
41
  encode_kwargs = encode_kwargs
42
  )
43
+ reader = easyocr.Reader(['en'], gpu = True)
44
  sparseEmbeddings = FastEmbedSparse(model = "Qdrant/BM25")
45
  prompt = """
46
  INSTRUCTIONS:
 
290
 
291
 
292
  def getTextFromImagePDF(pdfBytes):
293
+ global reader
294
  allImages = convert_from_bytes(pdfBytes)
295
  allImages = [np.array(image) for image in allImages]
296
+ text = "\n\n\n".join(["\n".join([text[1] for text in reader.readtext(image, paragraph=True)]) for image in allImages])
297
+ return text
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -17,8 +17,7 @@ PyPDF2
17
  python-dotenv
18
  pydantic
19
  pandas
20
- paddlepaddle-gpu
21
- paddleocr
22
  pdf2image
23
  sentence-transformers
24
  supabase
 
17
  python-dotenv
18
  pydantic
19
  pandas
20
+ easyocr
 
21
  pdf2image
22
  sentence-transformers
23
  supabase