Spaces:
Sleeping
Sleeping
sourabhzanwar
committed on
Commit
•
1400e5d
1
Parent(s):
684322b
trying pdf to txt converter
Browse files
- app.py +10 -1
- requirements.txt +1 -1
app.py
CHANGED
@@ -41,11 +41,20 @@ from utils.ui import reset_results, set_initial_state
|
|
41 |
import pandas as pd
|
42 |
import haystack
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
# Whether the file upload should be enabled or not
|
46 |
DISABLE_FILE_UPLOAD = bool(os.getenv("DISABLE_FILE_UPLOAD"))
|
47 |
# Define a function to handle file uploads
|
48 |
def upload_files():
|
|
|
49 |
uploaded_files = upload_container.file_uploader(
|
50 |
"upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="collapsed"
|
51 |
)
|
@@ -78,7 +87,7 @@ def process_file(data_file, preprocesor, document_store):
|
|
78 |
print(e)
|
79 |
|
80 |
def reset_documents():
|
81 |
-
print('
|
82 |
document_store.delete_documents()
|
83 |
|
84 |
def upload_document():
|
|
|
41 |
import pandas as pd
|
42 |
import haystack
|
43 |
|
44 |
+
from datetime import datetime
|
45 |
+
|
46 |
+
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
|
47 |
+
|
48 |
+
pdf_converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en","de"])
|
49 |
+
docx_converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en","de"])
|
50 |
+
txt_converter = TextConverter(remove_numeric_tables=True, valid_languages=["en","de"])
|
51 |
+
|
52 |
|
53 |
# Whether the file upload should be enabled or not
|
54 |
DISABLE_FILE_UPLOAD = bool(os.getenv("DISABLE_FILE_UPLOAD"))
|
55 |
# Define a function to handle file uploads
|
56 |
def upload_files():
|
57 |
+
print(f'Uploading files at {datetime.now()}')
|
58 |
uploaded_files = upload_container.file_uploader(
|
59 |
"upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="collapsed"
|
60 |
)
|
|
|
87 |
print(e)
|
88 |
|
89 |
def reset_documents():
|
90 |
+
print('\nReseting documents list at ' + str(datetime.now()) + '\n')
|
91 |
document_store.delete_documents()
|
92 |
|
93 |
def upload_document():
|
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
safetensors==0.3.3.post1
|
2 |
-
farm-haystack[inference,weaviate,opensearch]==1.20.0
|
3 |
milvus-haystack
|
4 |
streamlit==1.23.0
|
5 |
markdown
|
|
|
1 |
safetensors==0.3.3.post1
|
2 |
+
farm-haystack[inference,weaviate,opensearch,file-conversion,pdf]==1.20.0
|
3 |
milvus-haystack
|
4 |
streamlit==1.23.0
|
5 |
markdown
|