sourabhzanwar committed on
Commit
1400e5d
1 Parent(s): 684322b

trying pdf to txt converter

Browse files
Files changed (2) hide show
  1. app.py +10 -1
  2. requirements.txt +1 -1
app.py CHANGED
@@ -41,11 +41,20 @@ from utils.ui import reset_results, set_initial_state
41
  import pandas as pd
42
  import haystack
43
 
 
 
 
 
 
 
 
 
44
 
45
  # Whether the file upload should be enabled or not
46
  DISABLE_FILE_UPLOAD = bool(os.getenv("DISABLE_FILE_UPLOAD"))
47
  # Define a function to handle file uploads
48
  def upload_files():
 
49
  uploaded_files = upload_container.file_uploader(
50
  "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="collapsed"
51
  )
@@ -78,7 +87,7 @@ def process_file(data_file, preprocesor, document_store):
78
  print(e)
79
 
80
  def reset_documents():
81
- print('Reseting documents list')
82
  document_store.delete_documents()
83
 
84
  def upload_document():
 
41
  import pandas as pd
42
  import haystack
43
 
44
+ from datetime import datetime
45
+
46
+ from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
47
+
48
+ pdf_converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en","de"])
49
+ docx_converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en","de"])
50
+ txt_converter = TextConverter(remove_numeric_tables=True, valid_languages=["en","de"])
51
+
52
 
53
  # Whether the file upload should be enabled or not
54
  DISABLE_FILE_UPLOAD = bool(os.getenv("DISABLE_FILE_UPLOAD"))
55
  # Define a function to handle file uploads
56
  def upload_files():
57
+ print(f'Uploading files at {datetime.now()}')
58
  uploaded_files = upload_container.file_uploader(
59
  "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="collapsed"
60
  )
 
87
  print(e)
88
 
89
  def reset_documents():
90
+ print('\nReseting documents list at ' + str(datetime.now()) + '\n')
91
  document_store.delete_documents()
92
 
93
  def upload_document():
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  safetensors==0.3.3.post1
2
- farm-haystack[inference,weaviate,opensearch]==1.20.0
3
  milvus-haystack
4
  streamlit==1.23.0
5
  markdown
 
1
  safetensors==0.3.3.post1
2
+ farm-haystack[inference,weaviate,opensearch,file-conversion,pdf]==1.20.0
3
  milvus-haystack
4
  streamlit==1.23.0
5
  markdown