Spaces:

ugaray96
/

neural-search

Runtime error

App Files Files Community

ugaray96 committed on Sep 14, 2022

Commit

5124fda

unverified ·

2 Parent(s): d36f6ee dbcf2e8

Merge pull request #3 from ugm2/feature/add_file

Browse files

Files changed (5) hide show

interface/components.py +37 -1
interface/pages.py +2 -1
interface/utils.py +66 -0
packages.txt +1 -0
requirements.txt +3 -1

interface/components.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import streamlit as st
-from interface.utils import get_pipelines, extract_text_from_url
 from interface.draw_pipelines import get_pipeline_graph
@@ -80,7 +80,43 @@ def component_article_url(container):
                     st.markdown("---")
                 else:
                     break
         corpus = [
             {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
         ]
         return corpus

 import streamlit as st
+from interface.utils import get_pipelines, extract_text_from_url, extract_text_from_file
 from interface.draw_pipelines import get_pipeline_graph
                     st.markdown("---")
                 else:
                     break
+        for idx, doc in enumerate(urls):
+            with st.expander(f"Preview URL {idx}"):
+                st.write(doc)
         corpus = [
             {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
         ]
         return corpus
+def component_file_input(container):
+    """Draw the extract text from file widget"""
+    with container:
+        files = []
+        doc_id = 1
+        with st.expander("Enter Files"):
+            while True:
+                file = st.file_uploader(
+                    "Upload a .txt, .pdf, .csv, image file", key=doc_id
+                )
+                if file != None:
+                    extracted_text = extract_text_from_file(file)
+                    if extracted_text != None:
+                        files.append({"text": extracted_text})
+                        doc_id += 1
+                        st.markdown("---")
+                    else:
+                        break
+                else:
+                    break
+        for idx, doc in enumerate(files):
+            with st.expander(f"Preview File {idx}"):
+                st.write(doc)
+        corpus = [
+            {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
+        ]
+        return corpus

interface/pages.py CHANGED Viewed

@@ -2,6 +2,7 @@ import streamlit as st
 from streamlit_option_menu import option_menu
 from core.search_index import index, search
 from interface.components import (
     component_show_pipeline,
     component_show_search_result,
     component_text_input,
@@ -25,7 +26,6 @@ def page_landing_page(container):
         st.markdown(
             "TODO list:"
             "\n  - Build other pipelines"
-            "\n  - Include file/url indexing"
             "\n  - [Optional] Include text to audio to read responses"
         )
@@ -59,6 +59,7 @@ def page_index(container):
         input_funcs = {
             "Raw Text": (component_text_input, "card-text"),
             "URL": (component_article_url, "card-link"),
         }
         selected_input = option_menu(
             "Input Text",

 from streamlit_option_menu import option_menu
 from core.search_index import index, search
 from interface.components import (
+    component_file_input,
     component_show_pipeline,
     component_show_search_result,
     component_text_input,
         st.markdown(
             "TODO list:"
             "\n  - Build other pipelines"
             "\n  - [Optional] Include text to audio to read responses"
         )
         input_funcs = {
             "Raw Text": (component_text_input, "card-text"),
             "URL": (component_article_url, "card-link"),
+            "File": (component_file_input, "card-file"),
         }
         selected_input = option_menu(
             "Input Text",

interface/utils.py CHANGED Viewed

@@ -1,7 +1,12 @@
 import core.pipelines as pipelines_functions
 from inspect import getmembers, isfunction
 from newspaper import Article
 import streamlit as st
 def get_pipelines():
@@ -21,3 +26,64 @@ def extract_text_from_url(url: str):
     article.parse()
     return article.text

+from io import StringIO
 import core.pipelines as pipelines_functions
 from inspect import getmembers, isfunction
 from newspaper import Article
+from PyPDF2 import PdfFileReader
 import streamlit as st
+import pandas as pd
+import pytesseract
+from PIL import Image
 def get_pipelines():
     article.parse()
     return article.text
+@st.experimental_memo
+def extract_text_from_file(file):
+    # read text file
+    if file.type == "text/plain":
+        # To convert to a string based IO:
+        stringio = StringIO(file.getvalue().decode("utf-8"))
+        # To read file as string:
+        file_text = stringio.read()
+        return file_text
+    # read pdf file
+    elif file.type == "application/pdf":
+        pdfReader = PdfFileReader(file)
+        count = pdfReader.numPages
+        all_text = ""
+        for i in range(count):
+            try:
+                page = pdfReader.getPage(i)
+                all_text += page.extractText()
+            except:
+                continue
+        file_text = all_text
+        return file_text
+    # read csv file
+    elif file.type == "text/csv":
+        csv = pd.read_csv(file)
+        # get columns of type string
+        string_columns = csv.select_dtypes(include=["object"]).columns
+        # get data from columns and join it together
+        file_text = ""
+        for row in csv[string_columns].values.tolist():
+            # remove NaNs
+            row = [x for x in row if str(x) != "nan"]
+            for column in row:
+                txt = ""
+                if isinstance(column, list):
+                    try:
+                        txt = " ".join(column)
+                    except:
+                        continue
+                elif isinstance(column, str):
+                    txt = column
+                else:
+                    continue
+                file_text += " " + txt
+        return file_text
+    # read image file (OCR)
+    elif file.type == "image/jpeg":
+        return pytesseract.image_to_string(Image.open(file))
+    else:
+        st.warning(f"File type {file.type} not supported")
+        return None

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ tesseract-ocr-all

requirements.txt CHANGED Viewed

@@ -3,4 +3,6 @@ streamlit_option_menu==0.3.2
 farm-haystack==1.8.0
 black==22.8.0
 plotly==5.10.0
-newspaper3k==0.2.8

 farm-haystack==1.8.0
 black==22.8.0
 plotly==5.10.0
+newspaper3k==0.2.8
+PyPDF2==2.10.7
+pytesseract==0.3.10