Spaces:
Build error
Build error
First attempt
Browse files
- app.py +32 -0
- requirements.txt +1 -0
app.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from haystack.document_stores import InMemoryDocumentStore
|
3 |
+
from haystack.nodes import FARMReader, PreProcessor, PDFToTextConverter, TfidfRetriever
|
4 |
+
import logging
|
5 |
+
|
6 |
+
# Store that receives the processed chunks of the uploaded PDFs.
document_store = InMemoryDocumentStore()

# Cleaning and splitting options: word-based splits of length 100 with an
# overlap of 3, keeping sentence boundaries intact.
_PREPROCESSOR_OPTIONS = {
    "clean_empty_lines": True,
    "clean_whitespace": True,
    "clean_header_footer": True,
    "split_by": "word",
    "split_length": 100,
    "split_respect_sentence_boundary": True,
    "split_overlap": 3,
}
preprocessor = PreProcessor(**_PREPROCESSOR_OPTIONS)
|
16 |
+
|
17 |
+
# Upload widget; several files may be selected at once.
uploaded_files = st.file_uploader(
    accept_multiple_files=True,
    label='Upload a PDF Document',
)
# Record what the widget returned on this script run.
logging.info(uploaded_files)
|
19 |
+
|
20 |
+
def pdf_to_document_store(pdf_files):
    """Convert uploaded PDFs to text, preprocess them and index the result.

    The module-level ``document_store`` is emptied first, so after the call
    it holds only content derived from ``pdf_files``.

    Args:
        pdf_files: Iterable of uploaded file objects exposing ``name`` and
            ``getvalue()`` (assumed to be Streamlit ``UploadedFile``
            instances -- confirm against the caller).

    Returns:
        None. The outcome is the updated content of ``document_store``.
    """
    import os
    import tempfile

    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    documents = []
    for pdf in pdf_files:
        # An upload lives in memory and ``pdf.name`` is only its file name,
        # not a path on disk, so the bytes are spilled to a temporary file
        # that the converter can open.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp.write(pdf.getvalue())
            tmp_path = tmp.name
        try:
            converted = converter.convert(file_path=tmp_path, meta={"name": pdf.name})
        finally:
            # The temporary copy is only needed while converting.
            os.remove(tmp_path)
        # Flatten the result so ``documents`` never becomes a list of lists;
        # a single (non-list) result is still accepted.
        if isinstance(converted, list):
            documents.extend(converted)
        else:
            documents.append(converted)
    if not documents:
        # Nothing was converted: skip preprocessing and writing.
        return None
    preprocessed_docs = preprocessor.process(documents)
    document_store.write_documents(preprocessed_docs)
    return None
|
29 |
+
|
30 |
+
# With accept_multiple_files=True the uploader yields an empty list (not None)
# when nothing has been uploaded, so test for truthiness rather than None.
# pdf_to_document_store() clears the store itself before indexing, so no
# separate delete call is needed here.
if uploaded_files:
    pdf_to_document_store(uploaded_files)
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
farm-haystack==1.4.0
|