Spaces:
Runtime error
Runtime error
Merge pull request #3 from ugm2/feature/add_file
Browse files
- interface/components.py +37 -1
- interface/pages.py +2 -1
- interface/utils.py +66 -0
- packages.txt +1 -0
- requirements.txt +3 -1
interface/components.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import streamlit as st
|
2 |
-
from interface.utils import get_pipelines, extract_text_from_url
|
3 |
from interface.draw_pipelines import get_pipeline_graph
|
4 |
|
5 |
|
@@ -80,7 +80,43 @@ def component_article_url(container):
|
|
80 |
st.markdown("---")
|
81 |
else:
|
82 |
break
|
|
|
|
|
|
|
|
|
|
|
83 |
corpus = [
|
84 |
{"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
|
85 |
]
|
86 |
return corpus
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
from interface.utils import get_pipelines, extract_text_from_url, extract_text_from_file
|
3 |
from interface.draw_pipelines import get_pipeline_graph
|
4 |
|
5 |
|
|
|
80 |
st.markdown("---")
|
81 |
else:
|
82 |
break
|
83 |
+
|
84 |
+
for idx, doc in enumerate(urls):
|
85 |
+
with st.expander(f"Preview URL {idx}"):
|
86 |
+
st.write(doc)
|
87 |
+
|
88 |
corpus = [
|
89 |
{"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
|
90 |
]
|
91 |
return corpus
|
92 |
+
|
93 |
+
|
94 |
+
def component_file_input(container):
    """Draw the file-upload widget and return the extracted documents.

    File uploaders are rendered one below another inside an "Enter Files"
    expander: every time an upload yields text, another uploader is drawn.
    The loop stops at the first uploader that holds no file, or whose file
    made ``extract_text_from_file`` return ``None``. Each extracted document
    is then shown in its own "Preview File" expander.

    Args:
        container: Object used as a context manager (``with container:``) in
            which all widgets are drawn; presumably a Streamlit container.

    Returns:
        A list of ``{"text": ..., "id": ...}`` dicts, one per extracted file,
        with ids numbered from 0 in upload order.
    """
    with container:
        files = []
        # The widget key starts at 1 and grows with every accepted file, so
        # each uploader drawn during this run gets a distinct key.
        uploader_key = 1
        with st.expander("Enter Files"):
            while True:
                file = st.file_uploader(
                    "Upload a .txt, .pdf, .csv, image file", key=uploader_key
                )
                if file is None:
                    # Nothing uploaded in the last slot: stop drawing more.
                    break
                extracted_text = extract_text_from_file(file)
                if extracted_text is None:
                    # No text could be produced for this file: stop here.
                    break
                files.append({"text": extracted_text})
                uploader_key += 1
                st.markdown("---")

        for idx, doc in enumerate(files):
            with st.expander(f"Preview File {idx}"):
                st.write(doc)

        return [
            {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
        ]
|
interface/pages.py
CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
|
|
2 |
from streamlit_option_menu import option_menu
|
3 |
from core.search_index import index, search
|
4 |
from interface.components import (
|
|
|
5 |
component_show_pipeline,
|
6 |
component_show_search_result,
|
7 |
component_text_input,
|
@@ -25,7 +26,6 @@ def page_landing_page(container):
|
|
25 |
st.markdown(
|
26 |
"TODO list:"
|
27 |
"\n - Build other pipelines"
|
28 |
-
"\n - Include file/url indexing"
|
29 |
"\n - [Optional] Include text to audio to read responses"
|
30 |
)
|
31 |
|
@@ -59,6 +59,7 @@ def page_index(container):
|
|
59 |
input_funcs = {
|
60 |
"Raw Text": (component_text_input, "card-text"),
|
61 |
"URL": (component_article_url, "card-link"),
|
|
|
62 |
}
|
63 |
selected_input = option_menu(
|
64 |
"Input Text",
|
|
|
2 |
from streamlit_option_menu import option_menu
|
3 |
from core.search_index import index, search
|
4 |
from interface.components import (
|
5 |
+
component_file_input,
|
6 |
component_show_pipeline,
|
7 |
component_show_search_result,
|
8 |
component_text_input,
|
|
|
26 |
st.markdown(
|
27 |
"TODO list:"
|
28 |
"\n - Build other pipelines"
|
|
|
29 |
"\n - [Optional] Include text to audio to read responses"
|
30 |
)
|
31 |
|
|
|
59 |
input_funcs = {
|
60 |
"Raw Text": (component_text_input, "card-text"),
|
61 |
"URL": (component_article_url, "card-link"),
|
62 |
+
"File": (component_file_input, "card-file"),
|
63 |
}
|
64 |
selected_input = option_menu(
|
65 |
"Input Text",
|
interface/utils.py
CHANGED
@@ -1,7 +1,12 @@
|
|
|
|
1 |
import core.pipelines as pipelines_functions
|
2 |
from inspect import getmembers, isfunction
|
3 |
from newspaper import Article
|
|
|
4 |
import streamlit as st
|
|
|
|
|
|
|
5 |
|
6 |
|
7 |
def get_pipelines():
|
@@ -21,3 +26,64 @@ def extract_text_from_url(url: str):
|
|
21 |
article.parse()
|
22 |
|
23 |
return article.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from io import StringIO
|
2 |
import core.pipelines as pipelines_functions
|
3 |
from inspect import getmembers, isfunction
|
4 |
from newspaper import Article
|
5 |
+
from PyPDF2 import PdfFileReader
|
6 |
import streamlit as st
|
7 |
+
import pandas as pd
|
8 |
+
import pytesseract
|
9 |
+
from PIL import Image
|
10 |
|
11 |
|
12 |
def get_pipelines():
|
|
|
26 |
article.parse()
|
27 |
|
28 |
return article.text
|
29 |
+
|
30 |
+
|
31 |
+
def _extract_pdf_text(file):
    """Concatenate the text of every readable page of a PDF file object."""
    # NOTE(review): PdfFileReader/numPages/getPage/extractText are the legacy
    # PyPDF2 names imported by this file; newer releases rename them - confirm
    # against the pinned PyPDF2 version before upgrading.
    reader = PdfFileReader(file)
    page_texts = []
    for page_number in range(reader.numPages):
        try:
            page_texts.append(reader.getPage(page_number).extractText())
        except Exception:
            # Best effort: a page that cannot be parsed is skipped so the
            # remaining pages still contribute text. Catching Exception
            # (not a bare except) lets KeyboardInterrupt/SystemExit through.
            continue
    return "".join(page_texts)


def _extract_csv_text(file):
    """Join the string cells of a CSV file object into a single text."""
    frame = pd.read_csv(file)
    # Only object-dtype columns can hold text; numeric columns are ignored.
    string_columns = frame.select_dtypes(include=["object"]).columns
    pieces = []
    for row in frame[string_columns].values.tolist():
        for cell in row:
            # Missing values show up as NaN, whose string form is "nan".
            if str(cell) == "nan":
                continue
            if isinstance(cell, list):
                try:
                    cell = " ".join(cell)
                except TypeError:
                    # The list holds non-string items; skip the cell.
                    continue
            elif not isinstance(cell, str):
                continue
            pieces.append(cell)
    # Every piece is preceded by one space, so a non-empty result starts
    # with a space and an empty CSV gives "".
    return "".join(" " + piece for piece in pieces)


@st.experimental_memo
def extract_text_from_file(file):
    """Extract plain text from an uploaded file based on its MIME type.

    Supported types are ``text/plain`` (decoded as UTF-8),
    ``application/pdf`` (text of all readable pages), ``text/csv`` (string
    cells joined with spaces) and ``image/jpeg`` (OCR through pytesseract).

    Args:
        file: Uploaded file object exposing ``type`` (MIME type string) and
            ``getvalue()``, and readable as a binary file object; presumably
            a Streamlit ``UploadedFile`` - verify against the caller.

    Returns:
        The extracted text, or ``None`` when the file type is not supported
        (a Streamlit warning is shown in that case).

    Raises:
        UnicodeDecodeError: If a ``text/plain`` file is not valid UTF-8.
    """
    file_type = file.type

    if file_type == "text/plain":
        return file.getvalue().decode("utf-8")

    if file_type == "application/pdf":
        return _extract_pdf_text(file)

    if file_type == "text/csv":
        return _extract_csv_text(file)

    # NOTE(review): the uploader label says "image file" but only JPEG is
    # accepted here; other image types fall through to the warning below.
    if file_type == "image/jpeg":
        return pytesseract.image_to_string(Image.open(file))

    st.warning(f"File type {file.type} not supported")
    return None
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
tesseract-ocr-all
|
requirements.txt
CHANGED
@@ -3,4 +3,6 @@ streamlit_option_menu==0.3.2
|
|
3 |
farm-haystack==1.8.0
|
4 |
black==22.8.0
|
5 |
plotly==5.10.0
|
6 |
-
newspaper3k==0.2.8
|
|
|
|
|
|
3 |
farm-haystack==1.8.0
|
4 |
black==22.8.0
|
5 |
plotly==5.10.0
|
6 |
+
newspaper3k==0.2.8
|
7 |
+
PyPDF2==2.10.7
|
8 |
+
pytesseract==0.3.10
|