Spaces:
Running
Running
File size: 2,467 Bytes
4107940 dd7488f 1b47089 4107940 1b47089 4107940 46323da 8de7c36 dbcf2e8 dd7488f 8de7c36 1b47089 4107940 dbcf2e8 46323da 4107940 753ae25 4107940 753ae25 4107940 753ae25 4107940 753ae25 4107940 dbcf2e8 46323da dbcf2e8 46323da 4107940 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
from io import StringIO
import core.pipelines as pipelines_functions
from inspect import getmembers, isfunction
from newspaper import Article
from PyPDF2 import PdfFileReader
import streamlit as st
import pandas as pd
import pytesseract
from PIL import Image
def get_pipelines():
    """Collect the functions defined on the pipelines module.

    Returns:
        A ``(pipeline_names, pipeline_funcs)`` pair. ``pipeline_names`` is a
        list of display names built from each function name by splitting on
        underscores and capitalizing every part (``"my_pipeline"`` becomes
        ``"My Pipeline"``). ``pipeline_funcs`` is a tuple of the matching
        function objects, in the order ``inspect.getmembers`` reports them.
        Both are empty when the module exposes no functions.
    """
    members = getmembers(pipelines_functions, isfunction)
    if not members:
        # zip(*[]) yields nothing, so unpacking it into two names would
        # raise ValueError; report "no pipelines" instead.
        return [], ()

    raw_names, pipeline_funcs = zip(*members)
    pipeline_names = [
        " ".join(part.capitalize() for part in raw_name.split("_"))
        for raw_name in raw_names
    ]
    return pipeline_names, pipeline_funcs
@st.experimental_memo
def extract_text_from_url(url: str):
    """Return the article text extracted from the page at ``url``.

    Builds an ``Article`` for the URL, downloads and parses it, and returns
    its ``text`` attribute. The function is wrapped in
    ``st.experimental_memo``.
    """
    fetched = Article(url)
    # Fetch the page first, then parse what was fetched.
    fetched.download()
    fetched.parse()
    return fetched.text
def _read_pdf_text(file):
    """Return the concatenated text of every readable page of a PDF."""
    reader = PdfFileReader(file)
    page_texts = []
    for page_index in range(reader.numPages):
        try:
            page_texts.append(reader.getPage(page_index).extractText())
        except Exception:
            # Best effort: a page that cannot be read or extracted is
            # skipped so the remaining pages still contribute text.
            continue
    return "".join(page_texts)


def _read_csv_text(file):
    """Return the string cells of a CSV's object-dtype columns as one string.

    Every kept cell is prefixed with a single space, so a non-empty result
    starts with a space.
    """
    frame = pd.read_csv(file)
    # Only object-dtype columns are considered.
    string_columns = frame.select_dtypes(include=["object"]).columns
    pieces = []
    for row in frame[string_columns].values.tolist():
        for cell in row:
            if str(cell) == "nan":
                # Missing values (NaN) carry no text.
                continue
            if isinstance(cell, list):
                try:
                    cell = " ".join(cell)
                except TypeError:
                    # The list holds non-string items; skip the cell.
                    continue
            elif not isinstance(cell, str):
                continue
            pieces.append(" " + cell)
    return "".join(pieces)


@st.experimental_memo
def extract_text_from_file(file):
    """Extract plain text from an uploaded file based on its MIME type.

    Args:
        file: A file-like object exposing a ``type`` attribute (MIME type
            string) and, for plain text, ``getvalue()`` returning bytes.
            Presumably a Streamlit uploaded file; confirm against the caller.

    Returns:
        The extracted text for ``text/plain`` (decoded as UTF-8),
        ``application/pdf`` (text of all readable pages), ``text/csv``
        (string cells joined with spaces) and ``image/jpeg`` (OCR via
        pytesseract). For any other type a Streamlit warning is shown and
        ``None`` is returned.
    """
    if file.type == "text/plain":
        return file.getvalue().decode("utf-8")
    if file.type == "application/pdf":
        return _read_pdf_text(file)
    if file.type == "text/csv":
        return _read_csv_text(file)
    if file.type == "image/jpeg":
        # OCR the image.
        return pytesseract.image_to_string(Image.open(file))

    st.warning(f"File type {file.type} not supported")
    return None
|