Spaces:
Running
Running
AamirAli123
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -10,6 +10,8 @@ from langchain_community.llms import HuggingFacePipeline
|
|
10 |
from langchain.chains import ConversationChain
|
11 |
from langchain.memory import ConversationBufferMemory
|
12 |
from langchain.llms import HuggingFaceHub
|
|
|
|
|
13 |
from pathlib import Path
|
14 |
import chromadb
|
15 |
# Later Packages
|
@@ -19,6 +21,7 @@ import weasyprint
|
|
19 |
import matplotlib.pyplot as plt
|
20 |
from langchain.document_loaders import PyPDFDirectoryLoader
|
21 |
load_dotenv()
|
|
|
22 |
huggingfacehub_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
23 |
openai_key = os.getenv("OPEN_API_KEY")
|
24 |
# default_persist_directory = './chroma_HF/'
|
@@ -29,6 +32,16 @@ list_llm = ["mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instru
|
|
29 |
"google/flan-t5-xxl"
|
30 |
]
|
31 |
list_llm_simple = [os.path.basename(llm) for llm in list_llm]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
# Create PDF from URL
|
33 |
def create_pdf_from_url(url):
|
34 |
pdf = weasyprint.HTML(url).write_pdf()
|
@@ -50,6 +63,11 @@ def load_doc(list_file_path, chunk_size, chunk_overlap):
|
|
50 |
chunk_size = chunk_size,
|
51 |
chunk_overlap = chunk_overlap)
|
52 |
doc_splits = text_splitter.split_documents(pages)
|
|
|
|
|
|
|
|
|
|
|
53 |
return doc_splits
|
54 |
|
55 |
# Create vector database
|
|
|
10 |
from langchain.chains import ConversationChain
|
11 |
from langchain.memory import ConversationBufferMemory
|
12 |
from langchain.llms import HuggingFaceHub
|
13 |
+
from doctr.models import ocr_predictor
|
14 |
+
from doctr.io import DocumentFile
|
15 |
from pathlib import Path
|
16 |
import chromadb
|
17 |
# Later Packages
|
|
|
21 |
import matplotlib.pyplot as plt
|
22 |
from langchain.document_loaders import PyPDFDirectoryLoader
|
23 |
load_dotenv()
|
24 |
+
model = ocr_predictor(pretrained = True)
|
25 |
huggingfacehub_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
26 |
openai_key = os.getenv("OPEN_API_KEY")
|
27 |
# default_persist_directory = './chroma_HF/'
|
|
|
32 |
"google/flan-t5-xxl"
|
33 |
]
|
34 |
list_llm_simple = [os.path.basename(llm) for llm in list_llm]
|
35 |
+
# Extract text data from doctr response
|
36 |
+
def extract_value_from_response(response):
    """Flatten a nested OCR result into one string of words.

    Walks ``response.pages`` -> ``page.blocks`` -> ``block.lines`` ->
    ``line.words`` in order and concatenates every ``word.value``.

    Args:
        response: Object exposing the nested ``pages`` / ``blocks`` /
            ``lines`` / ``words`` attributes (presumably the result of
            calling the docTR OCR predictor -- confirm against the caller).

    Returns:
        str: Each word value prefixed by a single space, in traversal
        order, so a non-empty result starts with a space. Returns ``''``
        when the response contains no words.
    """
    # A single join avoids the quadratic cost of growing a string with
    # ``+=`` inside nested loops. The leading space before every word
    # (including the first) is kept so the output stays identical for
    # existing callers.
    return "".join(
        " " + word.value
        for page in response.pages
        for block in page.blocks
        for line in block.lines
        for word in line.words
    )
|
44 |
+
|
45 |
# Create PDF from URL
|
46 |
def create_pdf_from_url(url):
|
47 |
pdf = weasyprint.HTML(url).write_pdf()
|
|
|
63 |
chunk_size = chunk_size,
|
64 |
chunk_overlap = chunk_overlap)
|
65 |
doc_splits = text_splitter.split_documents(pages)
|
66 |
+
# if len(doc_splits) == 0:
|
67 |
+
# doc = DocumentFile.from_pdf(list_file_path[0])
|
68 |
+
# result = model(doc)
|
69 |
+
# response = extract_value_from_response(result)
|
70 |
+
# doc_splits = text_splitter.split_documents(response)
|
71 |
return doc_splits
|
72 |
|
73 |
# Create vector database
|