Rauhan committed on
Commit
f04fc73
·
1 Parent(s): 2145a2a

UPDATE: supabase

Browse files
Files changed (2) hide show
  1. app.py +26 -2
  2. functions.py +0 -1
app.py CHANGED
@@ -233,7 +233,9 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
233
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
234
  temp_file.write(pdf)
235
  temp_file_path = temp_file.name
 
236
  text = extractTextFromPdf(temp_file_path)
 
237
  os.remove(temp_file_path)
238
  username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
239
  df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
@@ -244,7 +246,23 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
244
  if newCount < int(limit):
245
  client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
246
  "chatbotname", chatbotname).execute()
247
- return addDocuments(text=text, source=source, vectorstore=vectorstore)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  else:
249
  return {
250
  "output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
@@ -254,8 +272,14 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
254
  @app.post("/scanAndReturnText")
255
  async def returnText(pdf: UploadFile = File(...)):
256
  pdf = await pdf.read()
 
257
  text = getTextFromImagePDF(pdfBytes=pdf)
258
- return text
 
 
 
 
 
259
 
260
 
261
  @app.post("/addText")
 
233
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
234
  temp_file.write(pdf)
235
  temp_file_path = temp_file.name
236
+ start = time.time()
237
  text = extractTextFromPdf(temp_file_path)
238
+ textExtraction = time.time()
239
  os.remove(temp_file_path)
240
  username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
241
  df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
 
246
  if newCount < int(limit):
247
  client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
248
  "chatbotname", chatbotname).execute()
249
+ uploadStart = time.time()
250
+ output = addDocuments(text=text, source=source, vectorstore=vectorstore)
251
+ uploadEnd = time.time()
252
+ uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
253
+ timeTaken = f"TEXT EXTRACTION TIME: {textExtraction - start}s" + "\n"
254
+ tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
255
+ tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
256
+ wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
257
+ newText = ("=" * 75 + "\n").join([timeTaken, uploadTime, wordCount, tokenCount, "TEXT: \n" + text + "\n"])
258
+ fileId = str(uuid.uuid4())
259
+ with open(f"{fileId}.txt", "w") as file:
260
+ file.write(newText)
261
+ with open(f"{fileId}.txt", "rb") as f:
262
+ supabase.storage.from_("ConversAI").upload(file = f, path = os.path.join("/", f.name), file_options={"content-type": "text/plain"})
263
+ os.remove(f"{fileId}.txt")
264
+ output["supabaseFileName"] = f"{fileId}.txt"
265
+ return output
266
  else:
267
  return {
268
  "output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
 
272
  @app.post("/scanAndReturnText")
273
  async def returnText(pdf: UploadFile = File(...)):
274
  pdf = await pdf.read()
275
+ start = time.time()
276
  text = getTextFromImagePDF(pdfBytes=pdf)
277
+ end = time.time()
278
+ timeTaken = f"{end - start}s"
279
+ return {
280
+ "extractionTime": timeTaken,
281
+ "output": text
282
+ }
283
 
284
 
285
  @app.post("/addText")
functions.py CHANGED
@@ -293,7 +293,6 @@ def getTextFromImagePDF(pdfBytes):
293
  def getText(image):
294
  global reader
295
  return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
296
-
297
  allImages = convert_from_bytes(pdfBytes)
298
  texts = [getText(image) for image in allImages]
299
  return "\n\n\n".join(texts)
 
293
  def getText(image):
294
  global reader
295
  return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
 
296
  allImages = convert_from_bytes(pdfBytes)
297
  texts = [getText(image) for image in allImages]
298
  return "\n\n\n".join(texts)