Spaces:

techconspartners
/

ConversAI

Sleeping

App Files Community

Rauhan committed on Aug 19, 2024

Commit

f04fc73

1 Parent(s): 2145a2a

UPDATE: supabase

Browse files

Files changed (2) hide show

app.py +26 -2
functions.py +0 -1

app.py CHANGED Viewed

@@ -233,7 +233,9 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
         temp_file.write(pdf)
         temp_file_path = temp_file.name
     text = extractTextFromPdf(temp_file_path)
     os.remove(temp_file_path)
     username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
     df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
@@ -244,7 +246,23 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
     if newCount < int(limit):
         client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
             "chatbotname", chatbotname).execute()
-        return addDocuments(text=text, source=source, vectorstore=vectorstore)
     else:
         return {
             "output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
@@ -254,8 +272,14 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
 @app.post("/scanAndReturnText")
 async def returnText(pdf: UploadFile = File(...)):
     pdf = await pdf.read()
     text = getTextFromImagePDF(pdfBytes=pdf)
-    return text
 @app.post("/addText")

     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
         temp_file.write(pdf)
         temp_file_path = temp_file.name
+    start = time.time()
     text = extractTextFromPdf(temp_file_path)
+    textExtraction = time.time()
     os.remove(temp_file_path)
     username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
     df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
     if newCount < int(limit):
         client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
             "chatbotname", chatbotname).execute()
+        uploadStart = time.time()
+        output = addDocuments(text=text, source=source, vectorstore=vectorstore)
+        uploadEnd = time.time()
+        uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
+        timeTaken = f"TEXT EXTRACTION TIME: {textExtraction - start}s" + "\n"
+        tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
+        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
+        wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
+        newText = ("=" * 75 + "\n").join([timeTaken, uploadTime, wordCount, tokenCount, "TEXT: \n" + text + "\n"])
+        fileId = str(uuid.uuid4())
+        with open(f"{fileId}.txt", "w") as file:
+            file.write(newText)
+        with open(f"{fileId}.txt", "rb") as f:
+            supabase.storage.from_("ConversAI").upload(file = f, path = os.path.join("/", f.name), file_options={"content-type": "text/plain"})
+        os.remove(f"{fileId}.txt")
+        output["supabaseFileName"] = f"{fileId}.txt"
+        return output
     else:
         return {
             "output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
 @app.post("/scanAndReturnText")
 async def returnText(pdf: UploadFile = File(...)):
     pdf = await pdf.read()
+    start = time.time()
     text = getTextFromImagePDF(pdfBytes=pdf)
+    end = time.time()
+    timeTaken = f"{end - start}s"
+    return {
+        "extractionTime": timeTaken,
+        "output": text
+    }
 @app.post("/addText")

functions.py CHANGED Viewed

@@ -293,7 +293,6 @@ def getTextFromImagePDF(pdfBytes):
     def getText(image):
         global reader
         return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
     allImages = convert_from_bytes(pdfBytes)
     texts = [getText(image) for image in allImages]
     return "\n\n\n".join(texts)

     def getText(image):
         global reader
         return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
     allImages = convert_from_bytes(pdfBytes)
     texts = [getText(image) for image in allImages]
     return "\n\n\n".join(texts)