ugaray96 committed on
Commit
5124fda
·
unverified ·
2 Parent(s): d36f6ee dbcf2e8

Merge pull request #3 from ugm2/feature/add_file

Browse files
interface/components.py CHANGED
@@ -1,5 +1,5 @@
1
  import streamlit as st
2
- from interface.utils import get_pipelines, extract_text_from_url
3
  from interface.draw_pipelines import get_pipeline_graph
4
 
5
 
@@ -80,7 +80,43 @@ def component_article_url(container):
80
  st.markdown("---")
81
  else:
82
  break
 
 
 
 
 
83
  corpus = [
84
  {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
85
  ]
86
  return corpus
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from interface.utils import get_pipelines, extract_text_from_url, extract_text_from_file
3
  from interface.draw_pipelines import get_pipeline_graph
4
 
5
 
 
80
  st.markdown("---")
81
  else:
82
  break
83
+
84
+ for idx, doc in enumerate(urls):
85
+ with st.expander(f"Preview URL {idx}"):
86
+ st.write(doc)
87
+
88
  corpus = [
89
  {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
90
  ]
91
  return corpus
92
+
93
+
94
def component_file_input(container):
    """Draw the file-upload widget and return the extracted documents.

    Inside ``container`` an "Enter Files" expander shows one uploader at a
    time. Each successful upload whose text can be extracted adds a document
    and reveals a further uploader; the chain stops at the first empty
    uploader or the first file whose extraction returns ``None``.

    Args:
        container: Streamlit container (anything usable in a ``with``
            statement) in which the widgets are drawn.

    Returns:
        A list of ``{"text": ..., "id": ...}`` dicts, one per accepted file,
        with ids numbered from 0 in upload order.
    """
    with container:
        files = []
        # Widget keys start at 1 and grow with each accepted file so every
        # uploader in the chain gets a distinct key.
        doc_id = 1
        with st.expander("Enter Files"):
            while True:
                file = st.file_uploader(
                    "Upload a .txt, .pdf, .csv, image file", key=doc_id
                )
                if file is None:
                    # Nothing uploaded in the latest slot: stop offering more.
                    break
                extracted_text = extract_text_from_file(file)
                if extracted_text is None:
                    # Extraction yielded nothing usable for this file.
                    break
                files.append({"text": extracted_text})
                doc_id += 1
                st.markdown("---")

        # Previews are drawn after the upload expander, one expander per file.
        for idx, doc in enumerate(files):
            with st.expander(f"Preview File {idx}"):
                st.write(doc)

        corpus = [
            {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
        ]
        return corpus
interface/pages.py CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
2
  from streamlit_option_menu import option_menu
3
  from core.search_index import index, search
4
  from interface.components import (
 
5
  component_show_pipeline,
6
  component_show_search_result,
7
  component_text_input,
@@ -25,7 +26,6 @@ def page_landing_page(container):
25
  st.markdown(
26
  "TODO list:"
27
  "\n - Build other pipelines"
28
- "\n - Include file/url indexing"
29
  "\n - [Optional] Include text to audio to read responses"
30
  )
31
 
@@ -59,6 +59,7 @@ def page_index(container):
59
  input_funcs = {
60
  "Raw Text": (component_text_input, "card-text"),
61
  "URL": (component_article_url, "card-link"),
 
62
  }
63
  selected_input = option_menu(
64
  "Input Text",
 
2
  from streamlit_option_menu import option_menu
3
  from core.search_index import index, search
4
  from interface.components import (
5
+ component_file_input,
6
  component_show_pipeline,
7
  component_show_search_result,
8
  component_text_input,
 
26
  st.markdown(
27
  "TODO list:"
28
  "\n - Build other pipelines"
 
29
  "\n - [Optional] Include text to audio to read responses"
30
  )
31
 
 
59
  input_funcs = {
60
  "Raw Text": (component_text_input, "card-text"),
61
  "URL": (component_article_url, "card-link"),
62
+ "File": (component_file_input, "card-file"),
63
  }
64
  selected_input = option_menu(
65
  "Input Text",
interface/utils.py CHANGED
@@ -1,7 +1,12 @@
 
1
  import core.pipelines as pipelines_functions
2
  from inspect import getmembers, isfunction
3
  from newspaper import Article
 
4
  import streamlit as st
 
 
 
5
 
6
 
7
  def get_pipelines():
@@ -21,3 +26,64 @@ def extract_text_from_url(url: str):
21
  article.parse()
22
 
23
  return article.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import StringIO
2
  import core.pipelines as pipelines_functions
3
  from inspect import getmembers, isfunction
4
  from newspaper import Article
5
+ from PyPDF2 import PdfFileReader
6
  import streamlit as st
7
+ import pandas as pd
8
+ import pytesseract
9
+ from PIL import Image
10
 
11
 
12
  def get_pipelines():
 
26
  article.parse()
27
 
28
  return article.text
29
+
30
+
31
+ @st.experimental_memo
32
def extract_text_from_file(file):
    """Extract plain text from an uploaded file, dispatching on its MIME type.

    Supported types:
      * ``text/plain``      - the bytes are decoded as UTF-8.
      * ``application/pdf`` - text of every page, concatenated.
      * ``text/csv``        - every string cell, row by row, each prefixed by
        a single space.
      * ``image/jpeg`` / ``image/png`` - OCR through Tesseract.

    Args:
        file: Uploaded file object exposing a ``type`` attribute (MIME type)
            and file-like reading; ``getvalue()`` is used for plain text.

    Returns:
        The extracted text, or ``None`` (after showing a Streamlit warning)
        when the MIME type is not supported.

    Raises:
        UnicodeDecodeError: if a ``text/plain`` file is not valid UTF-8.
    """
    mime_type = file.type

    # Plain text: decode the raw bytes directly.
    if mime_type == "text/plain":
        return file.getvalue().decode("utf-8")

    # PDF: gather page texts, skipping pages that cannot be read.
    if mime_type == "application/pdf":
        reader = PdfFileReader(file)
        page_texts = []
        for page_number in range(reader.numPages):
            try:
                page_texts.append(reader.getPage(page_number).extractText())
            except Exception:
                # Best effort: one unreadable page must not discard the rest.
                # (A bare ``except`` would also trap KeyboardInterrupt.)
                continue
        return "".join(page_texts)

    # CSV: keep only string cells. Missing values come back from pandas as
    # float NaN and numeric cells as numbers, so the isinstance filter drops
    # both regardless of which string dtype pandas picked for the column.
    if mime_type == "text/csv":
        frame = pd.read_csv(file)
        string_cells = (
            cell
            for row in frame.itertuples(index=False, name=None)
            for cell in row
            if isinstance(cell, str)
        )
        # Each cell is prefixed with a space (so non-empty output starts with
        # one), matching the format this function has always returned.
        return "".join(f" {cell}" for cell in string_cells)

    # Raster images: OCR. PNG is accepted alongside JPEG because the upload
    # widget advertises generic "image file" support.
    if mime_type in ("image/jpeg", "image/png"):
        return pytesseract.image_to_string(Image.open(file))

    st.warning(f"File type {file.type} not supported")
    return None
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tesseract-ocr-all
requirements.txt CHANGED
@@ -3,4 +3,6 @@ streamlit_option_menu==0.3.2
3
  farm-haystack==1.8.0
4
  black==22.8.0
5
  plotly==5.10.0
6
- newspaper3k==0.2.8
 
 
 
3
  farm-haystack==1.8.0
4
  black==22.8.0
5
  plotly==5.10.0
6
+ newspaper3k==0.2.8
7
+ PyPDF2==2.10.7
8
+ pytesseract==0.3.10