Spaces:
Sleeping
Sleeping
UPDATE: supabase
Browse files
- app.py +26 -2
- functions.py +0 -1
app.py
CHANGED
@@ -233,7 +233,9 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
|
233 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
234 |
temp_file.write(pdf)
|
235 |
temp_file_path = temp_file.name
|
|
|
236 |
text = extractTextFromPdf(temp_file_path)
|
|
|
237 |
os.remove(temp_file_path)
|
238 |
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
239 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
@@ -244,7 +246,23 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
|
244 |
if newCount < int(limit):
|
245 |
client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
|
246 |
"chatbotname", chatbotname).execute()
|
247 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
248 |
else:
|
249 |
return {
|
250 |
"output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
@@ -254,8 +272,14 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
|
254 |
@app.post("/scanAndReturnText")
|
255 |
async def returnText(pdf: UploadFile = File(...)):
|
256 |
pdf = await pdf.read()
|
|
|
257 |
text = getTextFromImagePDF(pdfBytes=pdf)
|
258 |
-
|
|
|
|
|
|
|
|
|
|
|
259 |
|
260 |
|
261 |
@app.post("/addText")
|
|
|
233 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
234 |
temp_file.write(pdf)
|
235 |
temp_file_path = temp_file.name
|
236 |
+
start = time.time()
|
237 |
text = extractTextFromPdf(temp_file_path)
|
238 |
+
textExtraction = time.time()
|
239 |
os.remove(temp_file_path)
|
240 |
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
241 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
|
|
246 |
if newCount < int(limit):
|
247 |
client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
|
248 |
"chatbotname", chatbotname).execute()
|
249 |
+
uploadStart = time.time()
|
250 |
+
output = addDocuments(text=text, source=source, vectorstore=vectorstore)
|
251 |
+
uploadEnd = time.time()
|
252 |
+
uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
|
253 |
+
timeTaken = f"TEXT EXTRACTION TIME: {textExtraction - start}s" + "\n"
|
254 |
+
tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
|
255 |
+
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
|
256 |
+
wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
|
257 |
+
newText = ("=" * 75 + "\n").join([timeTaken, uploadTime, wordCount, tokenCount, "TEXT: \n" + text + "\n"])
|
258 |
+
fileId = str(uuid.uuid4())
|
259 |
+
with open(f"{fileId}.txt", "w") as file:
|
260 |
+
file.write(newText)
|
261 |
+
with open(f"{fileId}.txt", "rb") as f:
|
262 |
+
supabase.storage.from_("ConversAI").upload(file = f, path = os.path.join("/", f.name), file_options={"content-type": "text/plain"})
|
263 |
+
os.remove(f"{fileId}.txt")
|
264 |
+
output["supabaseFileName"] = f"{fileId}.txt"
|
265 |
+
return output
|
266 |
else:
|
267 |
return {
|
268 |
"output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
|
|
272 |
@app.post("/scanAndReturnText")
|
273 |
async def returnText(pdf: UploadFile = File(...)):
|
274 |
pdf = await pdf.read()
|
275 |
+
start = time.time()
|
276 |
text = getTextFromImagePDF(pdfBytes=pdf)
|
277 |
+
end = time.time()
|
278 |
+
timeTaken = f"{end - start}s"
|
279 |
+
return {
|
280 |
+
"extractionTime": timeTaken,
|
281 |
+
"output": text
|
282 |
+
}
|
283 |
|
284 |
|
285 |
@app.post("/addText")
|
functions.py
CHANGED
@@ -293,7 +293,6 @@ def getTextFromImagePDF(pdfBytes):
|
|
293 |
def getText(image):
|
294 |
global reader
|
295 |
return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
|
296 |
-
|
297 |
allImages = convert_from_bytes(pdfBytes)
|
298 |
texts = [getText(image) for image in allImages]
|
299 |
return "\n\n\n".join(texts)
|
|
|
293 |
def getText(image):
|
294 |
global reader
|
295 |
return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
|
|
|
296 |
allImages = convert_from_bytes(pdfBytes)
|
297 |
texts = [getText(image) for image in allImages]
|
298 |
return "\n\n\n".join(texts)
|