Spaces:
Runtime error
Runtime error
import re | |
import spacy | |
import json | |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel | |
import streamlit as st | |
from urllib.request import Request, urlopen, HTTPError | |
from bs4 import BeautifulSoup | |
def hide_footer():
    """Hide the page's ``footer`` element by injecting a small stylesheet.

    The CSS is sent through ``st.markdown`` with ``unsafe_allow_html=True``
    so that the ``<style>`` tag is passed along as HTML.
    """
    footer_css = """
<style>
footer {visibility: hidden;}
</style>
"""
    st.markdown(footer_css, unsafe_allow_html=True)
def get_seq2seq_model(model_id):
    """Load a sequence-to-sequence model.

    Args:
        model_id: Identifier forwarded unchanged to
            ``AutoModelForSeq2SeqLM.from_pretrained``.

    Returns:
        The object returned by ``from_pretrained`` for ``model_id``.
    """
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    return seq2seq_model
def get_causal_model(model_id):
    """Load a causal language model.

    Args:
        model_id: Identifier forwarded unchanged to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        The object returned by ``from_pretrained`` for ``model_id``.
    """
    # NOTE(review): trust_remote_code=True allows the model repository to
    # supply Python code that runs locally during loading -- only pass
    # model ids from sources you trust.
    causal_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
    )
    return causal_model
def get_auto_model(model_id):
    """Load a model through the generic ``AutoModel`` class.

    Args:
        model_id: Identifier forwarded unchanged to
            ``AutoModel.from_pretrained``.

    Returns:
        The object returned by ``from_pretrained`` for ``model_id``.
    """
    base_model = AutoModel.from_pretrained(model_id)
    return base_model
def get_tokenizer(model_id):
    """Load the tokenizer that belongs to ``model_id``.

    Args:
        model_id: Identifier forwarded unchanged to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The object returned by ``from_pretrained`` for ``model_id``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    return tokenizer
def get_celeb_data(fpath):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        fpath: Path of the JSON file to read.

    Returns:
        The Python object produced by decoding the file's JSON text.
    """
    with open(fpath, encoding='UTF-8') as source:
        raw_text = source.read()
    # Decode after the file has been read completely and closed.
    return json.loads(raw_text)
def get_article(url):
    """Download a page and return the text of its ``topic-paragraph`` blocks.

    The page is fetched with a browser-like ``User-Agent`` header and parsed
    with BeautifulSoup's ``html.parser``. Only ``<p class="topic-paragraph">``
    elements contribute text; runs of spaces inside them are collapsed so the
    result is a single space-separated string.

    Args:
        url: Address of the page to download.

    Returns:
        The joined paragraph text, or ``""`` when fetching or parsing fails
        (in which case a notice is also written to the Streamlit page).

    Note:
        The ``Request`` object is built outside the ``try`` block, so any
        exception it raises (e.g. for a malformed URL) propagates to the
        caller instead of being converted into the empty-string fallback.
    """
    req = Request(
        url=url,
        # Send a browser-like User-Agent instead of urllib's default one.
        headers={'User-Agent': 'Mozilla/5.0'}
    )
    try:
        # The context manager closes the HTTP response even if read() fails;
        # the previous ``urlopen(req).read()`` left the connection open.
        with urlopen(req) as response:
            html = response.read()
        soup = BeautifulSoup(html, features="html.parser")
        # Remove all script and style elements from the tree.
        for tag in soup(["script", "style"]):
            tag.extract()
        # Collect the stripped text of every topic paragraph.
        lines = [
            para.get_text().strip()
            for para in soup.find_all("p", class_='topic-paragraph')
        ]
        # Split on single spaces and drop the empty pieces, which collapses
        # runs of spaces and removes pieces that are only whitespace.
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        return ' '.join(chunk for chunk in chunks if chunk)
    except Exception:
        # Best-effort by design: any failure yields the notice and "".
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate instead of being silently swallowed.
        st.markdown("The internet is not stable.")
        return ""
def get_spacy_model(model_id):
    """Load a spaCy pipeline.

    Args:
        model_id: Value forwarded unchanged to ``spacy.load``.

    Returns:
        The object returned by ``spacy.load`` for ``model_id``.
    """
    pipeline = spacy.load(model_id)
    return pipeline
def preprocess_text(name, text:str, model_id):
    """Split ``text`` into sentences with a spaCy pipeline.

    Args:
        name: Not used by this function.
        text: The text to segment.
        model_id: Passed to ``get_spacy_model`` to obtain the pipeline.

    Returns:
        A ``(pipeline, sentences)`` tuple: the loaded pipeline and a list with
        the whitespace-stripped text of each sentence it found in ``text``.
    """
    nlp = get_spacy_model(model_id)
    parsed = nlp(text)
    sentences = [sentence.text.strip() for sentence in parsed.sents]
    return nlp, sentences