Spaces:
Sleeping
Sleeping
UPDATE: New Endpoints
Browse files
- functions.py +5 -12
- requirements.txt +1 -2
functions.py
CHANGED
@@ -20,7 +20,7 @@ from qdrant_client import QdrantClient
|
|
20 |
from langchain_groq import ChatGroq
|
21 |
from pdf2image import convert_from_bytes
|
22 |
import numpy as np
|
23 |
-
|
24 |
from bs4 import BeautifulSoup
|
25 |
from urllib.parse import urlparse, urljoin
|
26 |
from supabase import create_client
|
@@ -40,7 +40,7 @@ vectorEmbeddings = HuggingFaceEmbeddings(
|
|
40 |
model_kwargs = model_kwargs,
|
41 |
encode_kwargs = encode_kwargs
|
42 |
)
|
43 |
-
|
44 |
sparseEmbeddings = FastEmbedSparse(model = "Qdrant/BM25")
|
45 |
prompt = """
|
46 |
INSTRUCTIONS:
|
@@ -290,15 +290,8 @@ def getLinks(url: str, timeout = 30):
|
|
290 |
|
291 |
|
292 |
def getTextFromImagePDF(pdfBytes):
|
293 |
-
global
|
294 |
allImages = convert_from_bytes(pdfBytes)
|
295 |
allImages = [np.array(image) for image in allImages]
|
296 |
-
|
297 |
-
|
298 |
-
result = ocr.ocr(page)
|
299 |
-
if result[0]:
|
300 |
-
retrievedText = "\n".join([result[0][x][1][0] for x in range(len(result[0]))])
|
301 |
-
else:
|
302 |
-
retrievedText = ""
|
303 |
-
pageWiseText.append(retrievedText)
|
304 |
-
return "\n\n\n".join(pageWiseText)
|
|
|
20 |
from langchain_groq import ChatGroq
|
21 |
from pdf2image import convert_from_bytes
|
22 |
import numpy as np
|
23 |
+
import easyocr
|
24 |
from bs4 import BeautifulSoup
|
25 |
from urllib.parse import urlparse, urljoin
|
26 |
from supabase import create_client
|
|
|
40 |
model_kwargs = model_kwargs,
|
41 |
encode_kwargs = encode_kwargs
|
42 |
)
|
43 |
+
reader = easyocr.Reader(['en'], gpu = True)
|
44 |
sparseEmbeddings = FastEmbedSparse(model = "Qdrant/BM25")
|
45 |
prompt = """
|
46 |
INSTRUCTIONS:
|
|
|
290 |
|
291 |
|
292 |
def getTextFromImagePDF(pdfBytes):
|
293 |
+
global reader
|
294 |
allImages = convert_from_bytes(pdfBytes)
|
295 |
allImages = [np.array(image) for image in allImages]
|
296 |
+
text = "\n\n\n".join(["\n".join([text[1] for text in reader.readtext(image, paragraph=True)]) for image in allImages])
|
297 |
+
return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -17,8 +17,7 @@ PyPDF2
|
|
17 |
python-dotenv
|
18 |
pydantic
|
19 |
pandas
|
20 |
-
|
21 |
-
paddleocr
|
22 |
pdf2image
|
23 |
sentence-transformers
|
24 |
supabase
|
|
|
17 |
python-dotenv
|
18 |
pydantic
|
19 |
pandas
|
20 |
+
easyocr
|
|
|
21 |
pdf2image
|
22 |
sentence-transformers
|
23 |
supabase
|