lhzstar committed on
Commit
5beab45
·
1 Parent(s): a2127f4

new commits

Browse files
celebbot.py CHANGED
@@ -35,7 +35,7 @@ class CelebBot():
35
  with sr.Microphone() as mic:
36
  recognizer.adjust_for_ambient_noise(mic, duration=1)
37
  # flag = input("Are you ready to record?\nProceed (Y/n)")
38
-
39
  # try:
40
  # assert flag=='Y'
41
  # except:
@@ -62,8 +62,8 @@ class CelebBot():
62
  return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
63
  # Tokenize sentences
64
  encoded_input = self.sentTr_tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
65
- encoded_input["input_ids"] = encoded_input["input_ids"]
66
- encoded_input["attention_mask"] = encoded_input["attention_mask"]
67
 
68
  # Compute token embeddings
69
  with torch.no_grad():
@@ -76,9 +76,9 @@ class CelebBot():
76
  sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
77
 
78
  return sentence_embeddings
79
-
80
  def retrieve_knowledge_assertions(self, change_person=True):
81
- question_embeddings = self.sentence_embeds_inference([self.name + ', ' + self.text])
82
 
83
  all_knowledge_embeddings = self.sentence_embeds_inference(self.all_knowledge)
84
  similarity = cosine_similarity(all_knowledge_embeddings.cpu(), question_embeddings.cpu())
@@ -89,23 +89,19 @@ class CelebBot():
89
 
90
  # similarities = np.array(similarity)[top_K]
91
 
92
- print(*all_knowledge_assertions, sep='\n')
93
-
94
- knowledge = ' '.join(all_knowledge_assertions)
95
 
96
  if change_person:
97
- return self.third_to_first_person(knowledge)
98
- else:
99
- return knowledge
100
-
101
  def third_to_first_person(self, text):
 
102
  name = self.name.split(" ")[-1].lower()
103
  doc = self.spacy_model(text)
104
  transformed_text = []
105
 
106
- for i, token in enumerate(doc):
107
- if token.text == "pen":
108
- print(token.text, token.dep_)
109
  if self.gender == "M":
110
  if token.text.lower() == "he":
111
  transformed_text.append("I")
@@ -114,54 +110,56 @@ class CelebBot():
114
  elif token.text.lower() == "his":
115
  transformed_text.append("my")
116
  elif token.text.lower() == name and token.dep_ in ["nsubj", "nsubjpass"]:
117
- transformed_text.append("I")
118
-
119
- elif token.text == "'s" and doc[i-1].text.lower() == name:
120
  transformed_text[-1] = "my"
121
-
 
 
 
122
  else:
123
- transformed_text.append(token.text)
124
  elif self.gender == "F":
125
  if token.text.lower() == "she":
126
  transformed_text.append("I")
127
  elif token.text.lower() == "her":
128
- if i != len(doc)-2 and doc[i+1].dep_ in ["nsubj", "nsubjpass", "dobj", "appos", "dative", "attr", "amod"]:
129
- transformed_text.append("my")
130
  else:
131
- transformed_text.append("me")
132
  elif token.text.lower() == name and token.dep_ in ["nsubj", "nsubjpass"]:
133
- transformed_text.append("I")
134
-
135
- elif token.text == "'s" and doc[i-1].text.lower() == name:
136
  transformed_text[-1] = "my"
137
-
 
 
 
138
  else:
139
- transformed_text.append(token.text)
140
 
141
- return " ".join(transformed_text)
142
 
143
  def question_answer(self, instruction='', knowledge=''):
 
144
  if self.text != "":
145
- if re.search(re.compile(rf'\b(you|your|ypurs|{self.name})\b', flags=re.IGNORECASE), self.text) != None:
146
- instruction = f"You are a celebrity named {self.name}. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
147
- if re.search(re.compile(rf'\b(you|your|yours)\b', flags=re.IGNORECASE), self.text) != None:
148
- knowledge = self.retrieve_knowledge_assertions()
149
- else:
150
- knowledge = self.retrieve_knowledge_assertions(change_person=False)
151
  else:
152
- instruction = f"Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
 
153
  query = f"Context: {instruction} {knowledge}\n\nQuestion: {self.text}\n\nAnswer:"
154
- input_ids = self.QA_tokenizer(f"{query}", return_tensors="pt").input_ids
155
- outputs = self.QA_model.generate(input_ids, max_length=1024, min_length=8,do_sample=True, temperature=0.2, repetition_penalty=1.1)
156
  self.text = self.QA_tokenizer.decode(outputs[0], skip_special_tokens=True)
157
-
158
  return self.text
159
 
160
  @staticmethod
161
  def action_time():
162
  return f"it's {datetime.datetime.now().time().strftime('%H:%M')}"
163
 
164
- @staticmethod
165
  def save_kb(kb, filename):
166
  with open(filename, "wb") as f:
167
  pickle.dump(kb, f)
 
35
  with sr.Microphone() as mic:
36
  recognizer.adjust_for_ambient_noise(mic, duration=1)
37
  # flag = input("Are you ready to record?\nProceed (Y/n)")
38
+
39
  # try:
40
  # assert flag=='Y'
41
  # except:
 
62
  return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
63
  # Tokenize sentences
64
  encoded_input = self.sentTr_tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
65
+ encoded_input["input_ids"] = encoded_input["input_ids"].to(self.sentTr_model.device)
66
+ encoded_input["attention_mask"] = encoded_input["attention_mask"].to(self.sentTr_model.device)
67
 
68
  # Compute token embeddings
69
  with torch.no_grad():
 
76
  sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
77
 
78
  return sentence_embeddings
79
+
80
  def retrieve_knowledge_assertions(self, change_person=True):
81
+ question_embeddings = self.sentence_embeds_inference([self.text])
82
 
83
  all_knowledge_embeddings = self.sentence_embeds_inference(self.all_knowledge)
84
  similarity = cosine_similarity(all_knowledge_embeddings.cpu(), question_embeddings.cpu())
 
89
 
90
  # similarities = np.array(similarity)[top_K]
91
 
92
+ # print(*all_knowledge_assertions, sep='\n')
 
 
93
 
94
  if change_person:
95
+ all_knowledge_assertions = [self.third_to_first_person(sent) for sent in all_knowledge_assertions]
96
+ return " ".join(all_knowledge_assertions)
97
+
 
98
  def third_to_first_person(self, text):
99
+ text = text.replace(" ", " ")
100
  name = self.name.split(" ")[-1].lower()
101
  doc = self.spacy_model(text)
102
  transformed_text = []
103
 
104
+ for i, token in enumerate(doc):
 
 
105
  if self.gender == "M":
106
  if token.text.lower() == "he":
107
  transformed_text.append("I")
 
110
  elif token.text.lower() == "his":
111
  transformed_text.append("my")
112
  elif token.text.lower() == name and token.dep_ in ["nsubj", "nsubjpass"]:
113
+ transformed_text.append("I")
114
+ elif token.text == "’s" and doc[i-1].text.lower() == name:
 
115
  transformed_text[-1] = "my"
116
+ elif token.text.lower() == "their":
117
+ transformed_text.append("our")
118
+ elif token.text.lower() == "they":
119
+ transformed_text.append("we")
120
  else:
121
+ transformed_text.append(token.text)
122
  elif self.gender == "F":
123
  if token.text.lower() == "she":
124
  transformed_text.append("I")
125
  elif token.text.lower() == "her":
126
+ if i != len(doc)-1 and doc[i+2].dep_ in ["nsubj", "nsubjpass", "dobj", "appos", "dative", "attr", "amod", "nummod", "compound", "pobj", "pcomp"]:
127
+ transformed_text.append("my")
128
  else:
129
+ transformed_text.append("me")
130
  elif token.text.lower() == name and token.dep_ in ["nsubj", "nsubjpass"]:
131
+ transformed_text.append("I")
132
+ elif token.text == "’s" and doc[i-1].text.lower() == name:
 
133
  transformed_text[-1] = "my"
134
+ elif token.text.lower() == "their":
135
+ transformed_text.append("our")
136
+ elif token.text.lower() == "they":
137
+ transformed_text.append("we")
138
  else:
139
+ transformed_text.append(token.text)
140
 
141
+ return "".join(transformed_text)
142
 
143
  def question_answer(self, instruction='', knowledge=''):
144
+ instruction = f"Your name is {self.name}. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
145
  if self.text != "":
146
+ if re.search(re.compile(rf'\b(you|your|yours)\b', flags=re.IGNORECASE), self.text) != None:
147
+ knowledge = self.retrieve_knowledge_assertions()
 
 
 
 
148
  else:
149
+ knowledge = self.retrieve_knowledge_assertions(change_person=False)
150
+
151
  query = f"Context: {instruction} {knowledge}\n\nQuestion: {self.text}\n\nAnswer:"
152
+ input_ids = self.QA_tokenizer(f"{query}", return_tensors="pt").input_ids.to(self.QA_model.device)
153
+ outputs = self.QA_model.generate(input_ids, max_length=1024, min_length=8, do_sample=True, temperature=0.2, repetition_penalty=2.5)
154
  self.text = self.QA_tokenizer.decode(outputs[0], skip_special_tokens=True)
155
+ # self.text = " ".join([i.text.strip().capitalize() for i in self.spacy_model(self.text).sents])
156
  return self.text
157
 
158
  @staticmethod
159
  def action_time():
160
  return f"it's {datetime.datetime.now().time().strftime('%H:%M')}"
161
 
162
+ @staticmethod
163
  def save_kb(kb, filename):
164
  with open(filename, "wb") as f:
165
  pickle.dump(kb, f)
embeds/Helen_Mirren.npy CHANGED
Binary files a/embeds/Helen_Mirren.npy and b/embeds/Helen_Mirren.npy differ
 
rtvc/synthesizer/utils/cleaners.py CHANGED
@@ -30,8 +30,8 @@ _alphabet2pronunciation = {
30
  'g': 'jee',
31
  'H': 'eiich',
32
  'h': 'eiich',
33
- 'I': 'eye',
34
- 'i': 'eye',
35
  'J': 'jay',
36
  'j': 'jay',
37
  'K': 'kay',
 
30
  'g': 'jee',
31
  'H': 'eiich',
32
  'h': 'eiich',
33
+ 'I': 'I',
34
+ 'i': 'I',
35
  'J': 'jay',
36
  'j': 'jay',
37
  'K': 'kay',
run_cli.py CHANGED
@@ -10,8 +10,7 @@ QA_MODEL_ID = "google/flan-t5-large"
10
  SENTTR_MODEL_ID = "sentence-transformers/all-mpnet-base-v2"
11
 
12
  def main():
13
- with open("data.json") as json_file:
14
- celeb_data = json.load(json_file)
15
  message = "Please choose your favorite celebrity from\n"\
16
  "1. Cate Blanchett\n"\
17
  "2. David Beckham\n"\
@@ -25,37 +24,23 @@ def main():
25
  sentTr_tokenizer = AutoTokenizer.from_pretrained(SENTTR_MODEL_ID)
26
  sentTr_model = AutoModel.from_pretrained(SENTTR_MODEL_ID)
27
 
28
- name = input(message)
29
- gender = celeb_data[name]["gender"]
30
- knowledge = celeb_data[name]["knowledge"]
31
-
32
- lname = name.split(" ")[-1]
33
- lname_regex = re.compile(rf'\b({lname})\b')
34
- name_regex = re.compile(rf'\b({name})\b')
35
- lnames = lname+"’s" if not lname.endswith("s") else lname+"’"
36
- lnames_regex = re.compile(rf'\b({lnames})\b')
37
- names = name+"’s" if not name.endswith("s") else name+"’"
38
- names_regex = re.compile(rf'\b({names})\b')
39
- if gender == "M":
40
- knowledge = re.sub(he_regex, "I", knowledge)
41
- knowledge = re.sub(his_regex, "my", knowledge)
42
- elif gender == "F":
43
- knowledge = re.sub(she_regex, "I", knowledge)
44
- knowledge = re.sub(her_regex, "my", knowledge)
45
- knowledge = re.sub(names_regex, "my", knowledge)
46
- knowledge = re.sub(lnames_regex, "my", knowledge)
47
- knowledge = re.sub(name_regex, "I", knowledge)
48
- knowledge = re.sub(lname_regex, "I", knowledge)
49
-
50
- spacy_model = spacy.load("en_core_web_sm")
51
  knowledge_sents = [i.text.strip() for i in spacy_model(knowledge).sents]
52
 
53
- ai = CelebBot(name, QA_tokenizer, QA_model, sentTr_tokenizer, sentTr_model, spacy_model, knowledge_sents)
54
 
55
  answers = []
56
 
57
  while True:
58
- # for q in celeb_data[ai.name_id][ai.name]["questions"]:
59
  if not DEBUG:
60
  ai.speech_to_text()
61
  else:
 
10
  SENTTR_MODEL_ID = "sentence-transformers/all-mpnet-base-v2"
11
 
12
  def main():
13
+ celeb_data = get_celeb_data("data.json")
 
14
  message = "Please choose your favorite celebrity from\n"\
15
  "1. Cate Blanchett\n"\
16
  "2. David Beckham\n"\
 
24
  sentTr_tokenizer = AutoTokenizer.from_pretrained(SENTTR_MODEL_ID)
25
  sentTr_model = AutoModel.from_pretrained(SENTTR_MODEL_ID)
26
 
27
+ celeb_name = input(message)
28
+ gender = celeb_data[celeb_name]["gender"]
29
+ if celeb_name == "Madonna":
30
+ name = "Madonna-American-singer-and-actress"
31
+ elif celeb_name == "Anne Hathaway":
32
+ name = "Anne-Hathaway-American-actress"
33
+ else:
34
+ name="-".join(celeb_name.split(" "))
35
+ knowledge = get_article(f"https://www.britannica.com/biography/{name}")
36
+ spacy_model = spacy.load("en_core_web_lg")
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  knowledge_sents = [i.text.strip() for i in spacy_model(knowledge).sents]
38
 
39
+ ai = CelebBot(celeb_name, gender, QA_tokenizer, QA_model, sentTr_tokenizer, sentTr_model, spacy_model, knowledge_sents)
40
 
41
  answers = []
42
 
43
  while True:
 
44
  if not DEBUG:
45
  ai.speech_to_text()
46
  else:
run_eval.py CHANGED
@@ -9,82 +9,58 @@ import torch
9
  from utils import *
10
  from celebbot import CelebBot
11
 
12
- QA_MODEL_ID = "google/flan-t5-xl"
13
  SENTTR_MODEL_ID = "sentence-transformers/all-mpnet-base-v2"
14
  celeb_names = ["Cate Blanchett", "David Beckham", "Emma Watson", "Lady Gaga", "Madonna", "Mark Zuckerberg"]
15
 
16
- def evaluate_system():
 
 
 
17
 
18
- device = 'cpu'
19
- with open("data.json", encoding='utf-8') as json_file:
20
- celeb_data = json.load(json_file)
21
- references = [val['answers'] for key, val in list(celeb_data.items()) if key in celeb_names]
22
- references = list(itertools.chain.from_iterable(references))
23
- predictions = []
24
 
25
- QA_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_ID)
26
- QA_model = AutoModelForSeq2SeqLM.from_pretrained(QA_MODEL_ID).to(device)
27
- sentTr_tokenizer = AutoTokenizer.from_pretrained(SENTTR_MODEL_ID)
28
- sentTr_model = AutoModel.from_pretrained(SENTTR_MODEL_ID).to(device)
29
-
30
- for name in celeb_names:
31
- gender = celeb_data[name]["gender"]
32
- knowledge = celeb_data[name]["knowledge"]
33
-
34
- lname = name.split(" ")[-1]
35
- lname_regex = re.compile(rf'\b({lname})\b')
36
- name_regex = re.compile(rf'\b({name})\b')
37
- lnames = lname+"’s" if not lname.endswith("s") else lname+"’"
38
- lnames_regex = re.compile(rf'\b({lnames})\b')
39
- names = name+"’s" if not name.endswith("s") else name+"’"
40
- names_regex = re.compile(rf'\b({names})\b')
41
- if gender == "M":
42
- knowledge = re.sub(he_regex, "I", knowledge)
43
- knowledge = re.sub(his_regex, "my", knowledge)
44
- elif gender == "F":
45
- knowledge = re.sub(she_regex, "I", knowledge)
46
- knowledge = re.sub(her_regex, "my", knowledge)
47
- knowledge = re.sub(names_regex, "my", knowledge)
48
- knowledge = re.sub(lnames_regex, "my", knowledge)
49
- knowledge = re.sub(name_regex, "I", knowledge)
50
- knowledge = re.sub(lname_regex, "I", knowledge)
51
-
52
- spacy_model = spacy.load("en_core_web_sm")
53
- knowledge_sents = [i.text.strip() for i in spacy_model(knowledge).sents]
54
 
55
- ai = CelebBot(name, QA_tokenizer, QA_model, sentTr_tokenizer, sentTr_model, spacy_model, knowledge_sents)
56
- if re.search(re.compile(rf'\b(you|your|{ai.name})\b', flags=re.IGNORECASE), ai.text) != None:
57
- instruction1 = f"You are a celebrity named {ai.name}. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
58
 
59
- knowledge = ai.retrieve_knowledge_assertions()
60
- else:
61
- instruction1 = f"Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
62
- queries = [f"Context: {instruction1} {knowledge}\n\nQuestion: {q}\n\nAnswer:" for q in celeb_data[name]["questions"]]
63
- input_ids = ai.QA_tokenizer(f"{queries}", return_tensors="pt").input_ids.to(device)
64
- outputs = ai.QA_model.generate(input_ids, max_length=1024)
65
- predictions+= ai.QA_tokenizer.batch_decode(outputs, skip_special_tokens=True)
66
 
67
- file = open('predictions.txt','w')
68
- for prediction in predictions:
69
- file.write(prediction+"\n")
70
- file.close()
71
 
72
- bleu = evaluate.load("bleu")
73
- results = bleu.compute(predictions=predictions, references=references, max_order=4)
74
- print(f"BLEU: {round(results['bleu'], 2)}")
75
 
76
- meteor = evaluate.load("meteor")
77
- results = meteor.compute(predictions=predictions, references=references)
78
- print(f"METEOR: {round(results['meteor'], 2)}")
79
 
80
- rouge = evaluate.load("rouge")
81
- results = rouge.compute(predictions=predictions, references=references)
82
- print(f"ROUGE: {round(results['rougeL'], 2)}")
83
 
84
- bertscore = evaluate.load("bertscore")
85
- results = bertscore.compute(predictions=predictions, references=references, rescale_with_baseline=True, lang="en")
86
- print(f"F1: {round(sum(results['f1'])/len(results['f1']), 2)}")
87
-
88
- if __name__ == "__main__":
89
- evaluate_system()
90
-
 
9
  from utils import *
10
  from celebbot import CelebBot
11
 
12
QA_MODEL_ID = "google/flan-t5-large"
SENTTR_MODEL_ID = "sentence-transformers/all-mpnet-base-v2"
celeb_names = ["Cate Blanchett", "David Beckham", "Emma Watson", "Lady Gaga", "Madonna", "Mark Zuckerberg"]

# Evaluation script: for every celebrity in celeb_names, answer the stored
# questions with CelebBot, then score the answers against the stored
# reference answers with BLEU, METEOR, ROUGE-L and BERTScore.
celeb_data = get_celeb_data("data.json")

# References are gathered in celeb_names order, the same order in which the
# predictions are produced below. Iterating celeb_data's own key order would
# silently misalign prediction/reference pairs whenever data.json lists the
# celebrities in a different order.
references = list(itertools.chain.from_iterable(
    celeb_data[celeb_name]['answers'] for celeb_name in celeb_names
))
predictions = []

device = 'cpu'
QA_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_ID)
QA_model = AutoModelForSeq2SeqLM.from_pretrained(QA_MODEL_ID).to(device)
sentTr_tokenizer = AutoTokenizer.from_pretrained(SENTTR_MODEL_ID)
sentTr_model = AutoModel.from_pretrained(SENTTR_MODEL_ID).to(device)

# Load the spaCy pipeline once instead of once per celebrity.
spacy_model = spacy.load("en_core_web_lg")

for celeb_name in celeb_names:
    gender = celeb_data[celeb_name]["gender"]
    # Most biography slugs are the name with spaces replaced by hyphens;
    # these two names are special-cased with longer slugs.
    if celeb_name == "Madonna":
        name = "Madonna-American-singer-and-actress"
    elif celeb_name == "Anne Hathaway":
        name = "Anne-Hathaway-American-actress"
    else:
        name = "-".join(celeb_name.split(" "))
    knowledge = get_article(f"https://www.britannica.com/biography/{name}")

    knowledge_sents = [i.text.strip() for i in spacy_model(knowledge).sents]

    ai = CelebBot(celeb_name, gender, QA_tokenizer, QA_model, sentTr_tokenizer, sentTr_model, spacy_model, knowledge_sents)
    for q in celeb_data[celeb_name]["questions"]:
        ai.text = q
        response = ai.question_answer()
        print("response:", response)
        predictions.append(response)

# The context manager closes the file even if a write fails; UTF-8 is set
# explicitly so non-ASCII answers do not depend on the platform default.
with open('predictions.txt', 'w', encoding='utf-8') as file:
    file.writelines(prediction + "\n" for prediction in predictions)

bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references, max_order=4)
print(f"BLEU: {round(results['bleu'], 2)}")

meteor = evaluate.load("meteor")
results = meteor.compute(predictions=predictions, references=references)
print(f"METEOR: {round(results['meteor'], 2)}")

rouge = evaluate.load("rouge")
results = rouge.compute(predictions=predictions, references=references)
print(f"ROUGE: {round(results['rougeL'], 2)}")

bertscore = evaluate.load("bertscore")
results = bertscore.compute(predictions=predictions, references=references, rescale_with_baseline=True, lang="en")
# BERTScore returns one F1 per prediction; report the mean.
print(f"F1: {round(sum(results['f1'])/len(results['f1']), 2)}")
 
 
 
 
run_tts.py CHANGED
@@ -119,8 +119,8 @@ def tts(text, embed_name, nlp, autoplay=True):
119
 
120
  if __name__ == "__main__":
121
  text = "Adkins was raised by a young single mother in various working-class neighbourhoods of London."
122
- embed_name = "Beyonce"
123
- nlp = spacy.load('en_core_web_sm')
124
  b64 = tts(text, embed_name, nlp, autoplay=False)
125
 
126
  md = f"""
 
119
 
120
  if __name__ == "__main__":
121
  text = "Adkins was raised by a young single mother in various working-class neighbourhoods of London."
122
+ embed_name = "Helen_Mirren"
123
+ nlp = spacy.load('en_core_web_lg')
124
  b64 = tts(text, embed_name, nlp, autoplay=False)
125
 
126
  md = f"""
utils.py CHANGED
@@ -36,7 +36,6 @@ def get_celeb_data(fpath):
36
  with open(fpath, encoding='UTF-8') as json_file:
37
  return json.load(json_file)
38
 
39
-
40
  def get_article(url):
41
  req = Request(
42
  url=url,
@@ -61,7 +60,7 @@ def get_article(url):
61
  # drop blank lines
62
  text = ' '.join(chunk for chunk in chunks if chunk)
63
  return text
64
-
65
  except:
66
  st.markdown("The internet is not stable.")
67
  return ""
@@ -72,10 +71,5 @@ def get_spacy_model(model_id):
72
 
73
  def preprocess_text(name, text:str, model_id):
74
  spacy_model = get_spacy_model(model_id)
75
-
76
- text = text.replace("’", "'")
77
- text = text.replace("β€˜", "'")
78
- text = text.replace("”", "\"")
79
- text = text.replace("β€œ", "\"")
80
  texts = [i.text.strip() for i in spacy_model(text).sents]
81
  return spacy_model, texts
 
36
  with open(fpath, encoding='UTF-8') as json_file:
37
  return json.load(json_file)
38
 
 
39
  def get_article(url):
40
  req = Request(
41
  url=url,
 
60
  # drop blank lines
61
  text = ' '.join(chunk for chunk in chunks if chunk)
62
  return text
63
+
64
  except:
65
  st.markdown("The internet is not stable.")
66
  return ""
 
71
 
72
  def preprocess_text(name, text:str, model_id):
73
  spacy_model = get_spacy_model(model_id)
 
 
 
 
 
74
  texts = [i.text.strip() for i in spacy_model(text).sents]
75
  return spacy_model, texts