import re
import json
from urllib.request import Request, urlopen

import spacy
import streamlit as st
from bs4 import BeautifulSoup
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoModel,
)

# Pronoun patterns used to rewrite third-person biography text into first person.
he_regex = re.compile(r'\b(he|him|himself)\b', flags=re.IGNORECASE)
his_regex = re.compile(r'\b(his)\b', flags=re.IGNORECASE)
she_regex = re.compile(r'\b(she|herself)\b', flags=re.IGNORECASE)
her_regex = re.compile(r'\b(her)\b', flags=re.IGNORECASE)


def hide_footer():
    # CSS override that hides Streamlit's default footer.
    hide_st_style = """
        <style>
        footer {visibility: hidden;}
        </style>
    """
    st.markdown(hide_st_style, unsafe_allow_html=True)


@st.cache_resource
def get_seq2seq_model(model_id):
    return AutoModelForSeq2SeqLM.from_pretrained(model_id)


@st.cache_resource
def get_causal_model(model_id):
    return AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)


@st.cache_resource
def get_auto_model(model_id):
    return AutoModel.from_pretrained(model_id)


@st.cache_resource
def get_tokenizer(model_id):
    return AutoTokenizer.from_pretrained(model_id)


@st.cache_data
def get_celeb_data(fpath):
    with open(fpath, encoding='UTF-8') as json_file:
        return json.load(json_file)


@st.cache_data
def get_article(url):
    req = Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(req).read()
    soup = BeautifulSoup(html, features="html.parser")

    # Remove all script and style elements so only visible text remains.
    for script in soup(["script", "style"]):
        script.extract()

    # Collect the text of the article's body paragraphs.
    lines = [para.get_text().strip() for para in soup.find_all("p", class_='topic-paragraph')]

    # Normalize whitespace and drop empty fragments.
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    text = ' '.join(chunk for chunk in chunks if chunk)
    return text


@st.cache_resource
def preprocess_text(name, gender, text, model_id):
    # Possessive forms of the full name and the last name, e.g. "Barack Obama’s"
    # and "Obama’s"; re.escape guards against regex metacharacters in names.
    lname = name.split(" ")[-1]
    lnames = lname + "’s"
    lnames_regex = re.compile(rf'\b({re.escape(lnames)})\b')
    names = name + "’s"
    names_regex = re.compile(rf'\b({re.escape(names)})\b')

    # Rewrite third-person pronouns into first person. Note that "her" is
    # always mapped to "my" (the possessive reading), even where it occurs
    # as an object pronoun.
    if gender == "M":
        text = he_regex.sub("I", text)
        text = his_regex.sub("my", text)
    elif gender == "F":
        text = she_regex.sub("I", text)
        text = her_regex.sub("my", text)

    # Replace the full-name possessive before the last-name possessive, since
    # the latter is a substring of the former.
    text = names_regex.sub("my", text)
    text = lnames_regex.sub("my", text)

    # Split the rewritten text into sentences with spaCy.
    spacy_model = spacy.load(model_id)
    texts = [sent.text.strip() for sent in spacy_model(text).sents]
    return spacy_model, texts
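

# ---------------------------------------------------------------------------
# Minimal usage sketch. The URL, celebrity name, spaCy pipeline, and model id
# below are illustrative assumptions, not values shipped with this module;
# get_article() expects a Britannica-style page whose body paragraphs carry
# the 'topic-paragraph' class.
if __name__ == "__main__":
    article = get_article("https://www.britannica.com/biography/Barack-Obama")
    nlp, sentences = preprocess_text("Barack Obama", "M", article, "en_core_web_sm")
    tokenizer = get_tokenizer("google/flan-t5-base")  # assumed seq2seq checkpoint
    model = get_seq2seq_model("google/flan-t5-base")
    print(sentences[:3])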