AamirAli123 committed on
Commit
802e608
·
verified ·
1 Parent(s): 41c05e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -0
app.py CHANGED
@@ -10,6 +10,8 @@ from langchain_community.llms import HuggingFacePipeline
10
  from langchain.chains import ConversationChain
11
  from langchain.memory import ConversationBufferMemory
12
  from langchain.llms import HuggingFaceHub
 
 
13
  from pathlib import Path
14
  import chromadb
15
  # Later Packages
@@ -19,6 +21,7 @@ import weasyprint
19
  import matplotlib.pyplot as plt
20
  from langchain.document_loaders import PyPDFDirectoryLoader
21
  load_dotenv()
 
22
  huggingfacehub_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
23
  openai_key = os.getenv("OPEN_API_KEY")
24
  # default_persist_directory = './chroma_HF/'
@@ -29,6 +32,16 @@ list_llm = ["mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instru
29
  "google/flan-t5-xxl"
30
  ]
31
  list_llm_simple = [os.path.basename(llm) for llm in list_llm]
 
 
 
 
 
 
 
 
 
 
32
  # Create PDF from URL
33
  def create_pdf_from_url(url):
34
  pdf = weasyprint.HTML(url).write_pdf()
@@ -50,6 +63,11 @@ def load_doc(list_file_path, chunk_size, chunk_overlap):
50
  chunk_size = chunk_size,
51
  chunk_overlap = chunk_overlap)
52
  doc_splits = text_splitter.split_documents(pages)
 
 
 
 
 
53
  return doc_splits
54
 
55
  # Create vector database
 
10
  from langchain.chains import ConversationChain
11
  from langchain.memory import ConversationBufferMemory
12
  from langchain.llms import HuggingFaceHub
13
+ from doctr.models import ocr_predictor
14
+ from doctr.io import DocumentFile
15
  from pathlib import Path
16
  import chromadb
17
  # Later Packages
 
21
  import matplotlib.pyplot as plt
22
  from langchain.document_loaders import PyPDFDirectoryLoader
23
  load_dotenv()
24
+ model = ocr_predictor(pretrained = True)
25
  huggingfacehub_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
26
  openai_key = os.getenv("OPEN_API_KEY")
27
  # default_persist_directory = './chroma_HF/'
 
32
  "google/flan-t5-xxl"
33
  ]
34
  list_llm_simple = [os.path.basename(llm) for llm in list_llm]
35
+ # Extract text data from doctr response
36
def extract_value_from_response(response):
    """Flatten the words of an OCR response into one space-separated string.

    Walks ``response.pages`` -> ``page.blocks`` -> ``block.lines`` ->
    ``line.words`` in order and concatenates every ``word.value``.

    Args:
        response: Object exposing the nested ``pages`` / ``blocks`` /
            ``lines`` / ``words`` attributes (presumably a doctr OCR
            result -- confirm against the caller).

    Returns:
        A string in which every word is preceded by a single space, so a
        non-empty result starts with a space. Returns ``''`` when the
        response contains no words.
    """
    # Build the text in a single join instead of repeated ``+=`` inside four
    # nested loops; the leading space before each word is kept on purpose so
    # the output is identical to the previous implementation.
    return "".join(
        " " + word.value
        for page in response.pages
        for block in page.blocks
        for line in block.lines
        for word in line.words
    )
44
+
45
  # Craete PDf from URL
46
  def create_pdf_from_url(url):
47
  pdf = weasyprint.HTML(url).write_pdf()
 
63
  chunk_size = chunk_size,
64
  chunk_overlap = chunk_overlap)
65
  doc_splits = text_splitter.split_documents(pages)
66
+ # if len(doc_splits) == 0:
67
+ # doc = DocumentFile.from_pdf(list_file_path[0])
68
+ # result = model(doc)
69
+ # response = extract_value_from_response(result)
70
+ # doc_splits = text_splitter.split_documents(response)
71
  return doc_splits
72
 
73
  # Create vector database