ugmSorcero
Linter
dbcf2e8
raw
history blame
2.47 kB
from io import StringIO
import core.pipelines as pipelines_functions
from inspect import getmembers, isfunction
from newspaper import Article
from PyPDF2 import PdfFileReader
import streamlit as st
import pandas as pd
import pytesseract
from PIL import Image
def get_pipelines(module=None):
    """Collect the functions exposed by a pipelines module.

    Every attribute of the module for which ``inspect.isfunction`` is true
    is reported, in the (name-sorted) order ``inspect.getmembers`` yields.

    Args:
        module: Object to inspect. Defaults to the ``pipelines_functions``
            module imported at the top of this file.

    Returns:
        A ``(pipeline_names, pipeline_funcs)`` pair: a list of display
        names (``snake_case`` turned into ``Title Words``) and a tuple of
        the matching callables. Both are empty when no function is found.
    """
    if module is None:
        module = pipelines_functions

    members = getmembers(module, isfunction)

    # Build both sequences straight from the member list instead of
    # unpacking ``zip(*members)``, which raises ValueError when the module
    # contains no functions at all.
    pipeline_names = [
        " ".join(word.capitalize() for word in name.split("_"))
        for name, _ in members
    ]
    pipeline_funcs = tuple(func for _, func in members)
    return pipeline_names, pipeline_funcs
@st.experimental_memo
def extract_text_from_url(url: str):
    """Return the text newspaper's ``Article`` extracts from the page at *url*.

    The article is downloaded and parsed on each uncached call; the
    decorator memoizes the result through Streamlit.
    """
    page = Article(url)
    # download() must run before parse(): parse() works on the fetched HTML.
    page.download()
    page.parse()
    return page.text
def _pdf_to_text(file):
    """Concatenate the extractable text of every page of a PDF file object."""
    reader = PdfFileReader(file)
    page_texts = []
    for page_index in range(reader.numPages):
        try:
            page_text = reader.getPage(page_index).extractText()
        except Exception:
            # Best effort: a page that cannot be read or parsed is skipped
            # so the remaining pages still contribute. Only ``Exception``
            # is caught so Ctrl-C / interpreter exit are not swallowed.
            continue
        if isinstance(page_text, str):
            page_texts.append(page_text)
    return "".join(page_texts)


def _csv_to_text(file):
    """Join the textual cells of a CSV's object-dtype columns into one string."""
    frame = pd.read_csv(file)
    # Only object-dtype columns can hold strings; numeric columns are ignored.
    text_columns = frame.select_dtypes(include=["object"]).columns

    pieces = []
    for row in frame[text_columns].values.tolist():
        for cell in row:
            if isinstance(cell, str):
                # Preserve the original filter that drops the literal "nan".
                if cell != "nan":
                    pieces.append(cell)
            elif isinstance(cell, list):
                try:
                    pieces.append(" ".join(cell))
                except TypeError:
                    # The list holds non-string items; skip this cell.
                    continue
            # Anything else (e.g. float NaN for a missing value) is skipped.

    # Each piece is preceded by a single space, so a non-empty result
    # starts with a space, exactly as the incremental concatenation did.
    return "".join(f" {piece}" for piece in pieces)


@st.experimental_memo
def extract_text_from_file(file):
    """Extract plain text from an uploaded file based on its MIME type.

    Args:
        file: File-like upload exposing ``type`` (MIME string) and
            ``getvalue()`` (raw bytes) — presumably a Streamlit
            ``UploadedFile``; confirm against the caller.

    Returns:
        The extracted text for ``text/plain`` (decoded as UTF-8),
        ``application/pdf``, ``text/csv`` and ``image/jpeg`` (OCR via
        pytesseract). For any other type a Streamlit warning is shown
        and ``None`` is returned.
    """
    mime_type = file.type

    if mime_type == "text/plain":
        return file.getvalue().decode("utf-8")

    if mime_type == "application/pdf":
        return _pdf_to_text(file)

    if mime_type == "text/csv":
        return _csv_to_text(file)

    if mime_type == "image/jpeg":
        # OCR: let Tesseract read the text out of the image.
        return pytesseract.image_to_string(Image.open(file))

    st.warning(f"File type {file.type} not supported")
    return None