Spaces:

liuhaozhe6788
/

CelebChat

Runtime error

App Files Files Community

CelebChat / utils.py

lhzstar

new commits

aafa95b 10 months ago

raw

history blame

No virus

2.64 kB

	import re
	import spacy
	import json
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel
	import streamlit as st
	from urllib.request import Request, urlopen
	from bs4 import BeautifulSoup

	# Third-person pronoun patterns, matched as whole words and case-insensitively.
	# The alternatives are separated by an unescaped `|`; an escaped `\|` would only
	# match a literal pipe character and therefore never match a real pronoun.
	# "his" and "her" get their own patterns so callers can substitute a different
	# replacement for them than for the he/she group.
	he_regex = re.compile(r'\b(he|him|himself)\b', flags=re.IGNORECASE)
	his_regex = re.compile(r'\b(his)\b', flags=re.IGNORECASE)
	she_regex = re.compile(r'\b(she|herself)\b', flags=re.IGNORECASE)
	her_regex = re.compile(r'\b(her)\b', flags=re.IGNORECASE)


	def hide_footer():
	    """Inject a CSS rule into the Streamlit page that hides the ``footer`` element."""
	    footer_css = """
	<style>
	footer {visibility: hidden;}
	</style>
	"""
	    # Raw HTML has to be explicitly allowed for the <style> tag to be emitted as markup.
	    st.markdown(footer_css, unsafe_allow_html=True)

	@st.cache_resource
	def get_seq2seq_model(model_id):
	    """Load the pretrained sequence-to-sequence model identified by ``model_id``.

	    The function is wrapped in ``st.cache_resource``; the loaded model is
	    returned as produced by ``AutoModelForSeq2SeqLM.from_pretrained``.
	    """
	    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
	    return seq2seq_model

	@st.cache_resource
	def get_causal_model(model_id):
	    """Load the pretrained causal language model identified by ``model_id``.

	    The function is wrapped in ``st.cache_resource``; the loaded model is
	    returned as produced by ``AutoModelForCausalLM.from_pretrained``.
	    """
	    # NOTE(review): trust_remote_code=True lets the model repository supply
	    # custom Python code that is executed locally; only use trusted model ids.
	    causal_model = AutoModelForCausalLM.from_pretrained(
	        model_id,
	        trust_remote_code=True,
	    )
	    return causal_model

	@st.cache_resource
	def get_auto_model(model_id):
	    """Load the pretrained base model identified by ``model_id``.

	    The function is wrapped in ``st.cache_resource``; the loaded model is
	    returned as produced by ``AutoModel.from_pretrained``.
	    """
	    base_model = AutoModel.from_pretrained(model_id)
	    return base_model

	@st.cache_resource
	def get_tokenizer(model_id):
	    """Load the pretrained tokenizer identified by ``model_id``.

	    The function is wrapped in ``st.cache_resource``; the tokenizer is
	    returned as produced by ``AutoTokenizer.from_pretrained``.
	    """
	    tokenizer = AutoTokenizer.from_pretrained(model_id)
	    return tokenizer

	@st.cache_data
	def get_celeb_data(fpath):
	    """Parse the UTF-8 encoded JSON file at ``fpath`` and return its content.

	    The function is wrapped in ``st.cache_data``.
	    """
	    with open(fpath, encoding='UTF-8') as handle:
	        parsed = json.load(handle)
	    return parsed

	@st.cache_data
	def get_article(url):
	    """Download ``url`` and return the text of its ``topic-paragraph`` paragraphs.

	    Only ``<p class="topic-paragraph">`` elements contribute to the result.
	    Their texts are stripped, split on single spaces, and the non-empty
	    pieces are re-joined with one space, so runs of spaces collapse and all
	    paragraphs end up in one string.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        The concatenated paragraph text; an empty string when the page has
	        no matching paragraphs.

	    Raises:
	        urllib.error.URLError: Propagated from ``urlopen`` when the request
	            fails (``HTTPError`` for HTTP error statuses).
	    """
	    # Send a browser-like User-Agent (presumably because the target site
	    # rejects urllib's default one -- confirm against the scraped site).
	    req = Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})

	    # Close the HTTP response deterministically instead of leaking the
	    # underlying connection until garbage collection.
	    with urlopen(req) as response:
	        html = response.read()
	    soup = BeautifulSoup(html, features="html.parser")

	    # Remove script and style elements so their content cannot leak into the
	    # extracted paragraph text.
	    for tag in soup(["script", "style"]):
	        tag.extract()

	    paragraphs = [
	        para.get_text().strip()
	        for para in soup.find_all("p", class_='topic-paragraph')
	    ]

	    # Splitting on a single space and dropping the empty pieces collapses
	    # consecutive spaces and merges all paragraphs into one line of text.
	    pieces = (
	        piece.strip()
	        for paragraph in paragraphs
	        for piece in paragraph.split(" ")
	    )
	    return ' '.join(piece for piece in pieces if piece)


	@st.cache_resource
	def preprocess_text(name, gender, text, model_id):
	    """Rewrite third-person text about ``name`` in first person and split it into sentences.

	    Pronouns are replaced using the module-level ``he_regex``/``his_regex``
	    (for gender ``"M"``) or ``she_regex``/``her_regex`` (for gender ``"F"``):
	    the first pattern of each pair becomes ``"I"``, the second ``"my"``. Any
	    other ``gender`` value leaves pronouns untouched. Afterwards the
	    possessive forms ``<name>’s`` and ``<last name>’s`` (typographic
	    apostrophe) are replaced by ``"my"``.

	    Args:
	        name: Full name; its last space-separated token is used as last name.
	        gender: ``"M"`` or ``"F"`` to select the pronoun set.
	        text: The text to rewrite.
	        model_id: Name or path passed to ``spacy.load``.

	    Returns:
	        A tuple ``(spacy_model, texts)`` where ``texts`` is the list of
	        stripped sentence strings found by the spaCy pipeline.
	    """
	    lname = name.split(" ")[-1]

	    # Escape the names before embedding them in a pattern: a name containing
	    # regex metacharacters (".", "(", "+", ...) would otherwise match the
	    # wrong text or make re.compile raise.
	    full_possessive = re.escape(name + "’s")
	    last_possessive = re.escape(lname + "’s")
	    names_regex = re.compile(rf'\b({full_possessive})\b')
	    lnames_regex = re.compile(rf'\b({last_possessive})\b')

	    if gender == "M":
	        text = he_regex.sub("I", text)
	        text = his_regex.sub("my", text)
	    elif gender == "F":
	        text = she_regex.sub("I", text)
	        text = her_regex.sub("my", text)

	    # Replace the full-name possessive first; doing the last name first would
	    # leave the first name dangling in front of "my".
	    text = names_regex.sub("my", text)
	    text = lnames_regex.sub("my", text)

	    spacy_model = spacy.load(model_id)
	    texts = [sentence.text.strip() for sentence in spacy_model(text).sents]
	    return spacy_model, texts