import json
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

import spacy
import streamlit as st
from bs4 import BeautifulSoup
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)


def hide_footer():
    # Inject CSS to hide Streamlit's default footer. The original style string
    # was empty; this minimal rule is assumed from the function's name and
    # intent, so adjust the selector if the app hides other chrome as well.
    hide_st_style = """
        <style>
        footer {visibility: hidden;}
        </style>
    """
    st.markdown(hide_st_style, unsafe_allow_html=True)


@st.cache_resource
def get_seq2seq_model(model_id):
    return AutoModelForSeq2SeqLM.from_pretrained(model_id)


@st.cache_resource
def get_causal_model(model_id):
    return AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)


@st.cache_resource
def get_auto_model(model_id):
    return AutoModel.from_pretrained(model_id)


@st.cache_resource
def get_tokenizer(model_id):
    return AutoTokenizer.from_pretrained(model_id)


@st.cache_data
def get_celeb_data(fpath):
    with open(fpath, encoding="UTF-8") as json_file:
        return json.load(json_file)


def get_article(url):
    req = Request(url=url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        html = urlopen(req).read()
    except (HTTPError, URLError):
        st.markdown("Could not fetch the article. Please check your internet connection.")
        return ""

    soup = BeautifulSoup(html, features="html.parser")

    # Kill all script and style elements.
    for script in soup(["script", "style"]):
        script.extract()  # rip it out

    # Collect the text of every topic paragraph.
    lines = [para.get_text().strip() for para in soup.find_all("p", class_="topic-paragraph")]

    # Split each line into words, drop blank fragments, and rejoin so that
    # runs of whitespace collapse into single spaces.
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    return " ".join(chunk for chunk in chunks if chunk)


@st.cache_resource
def get_spacy_model(model_id):
    return spacy.load(model_id)


def preprocess_text(name, text: str, model_id):
    # Split the article into sentences with spaCy. `name` is kept in the
    # signature for callers that pass it, but it is not used here.
    spacy_model = get_spacy_model(model_id)
    texts = [i.text.strip() for i in spacy_model(text).sents]
    return spacy_model, texts
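

# A minimal sketch of how these helpers can be chained together when the module
# is run directly. The URL, person name, and spaCy model name below are
# illustrative assumptions, not values taken from this repo.
if __name__ == "__main__":
    article = get_article("https://www.britannica.com/biography/Ada-Lovelace")
    if article:
        nlp, sentences = preprocess_text("Ada Lovelace", article, "en_core_web_sm")
        print(f"Extracted {len(sentences)} sentences from the article.")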