Spaces:
Runtime error
Runtime error
File size: 5,811 Bytes
be4e89b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import gradio as gr
import json
from nltk.tokenize import sent_tokenize
import torch
import ujson as json
from transformers import AutoModelForCausalLM,LlamaTokenizer
from peft import PeftModel
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import nltk
# Fetch the 'punkt' resource from nltk before any sentence tokenizing happens.
nltk.download('punkt')

# Loading the Guanaco 7B model takes around 2-3 minutes; this step could be
# done separately from the rest of the script.
model_name = "llama-7b-hf"
adapters_name = 'guanaco-7b'

# Base model -> wrap with the PEFT adapter -> merge the adapter and unload it,
# leaving the final model bound to `m`.
m = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        # load_in_4bit=True,  (currently disabled)
        torch_dtype=torch.bfloat16,
        device_map='auto',
    ),
    adapters_name,
).merge_and_unload()

# The tokenizer comes from the base checkpoint; its BOS id is forced to 1.
tok = LlamaTokenizer.from_pretrained(model_name)
tok.bos_token_id = 1

stop_token_ids = [0]

print('Guanaco model loaded into memory.')
# Marker that precedes the model's answer in the decoded generation.
_ASSISTANT_MARKER = "### Assistant:"


def _format_guanaco_prompt(question):
    """Wrap *question* in the chat template shared by both Guanaco requests."""
    return (
        "A chat between a curious human and an artificial intelligence assistant."
        "The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
        f"### Human: {question} \n"
        "### Assistant:"
    )


def _extract_answer(output, strip_certainly=False):
    """Cut the assistant's answer out of the decoded generation *output*.

    The answer starts after the first "### Assistant:" marker (plus one
    following space, if present) and ends at the last '.' in *output*.
    When *strip_certainly* is true, a leading "Certainly!" is skipped too.
    If the marker is absent the search starts at the beginning of *output*
    instead of at an arbitrary offset.
    """
    marker_at = output.find(_ASSISTANT_MARKER)
    start = 0 if marker_at < 0 else marker_at + len(_ASSISTANT_MARKER)
    if output.startswith(" ", start):
        start += 1
    if strip_certainly and output.startswith("Certainly!", start):
        start += len("Certainly!")
    end = output.rfind(".") + 1
    # An empty string results when the last '.' lies before the answer start.
    return output[start:end]


def _ask_guanaco(question, strip_certainly=False):
    """Generate up to 300 new tokens for *question* and return the answer text.

    Uses the module-level tokenizer ``tok`` and model ``m``.
    """
    # Follow the model's own device instead of a hard-coded "cuda:N", which
    # fails on machines that do not have that many GPUs.
    inputs = tok(_format_guanaco_prompt(question), return_tensors="pt").to(m.device)
    outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
    output = tok.decode(outputs[0], skip_special_tokens=True)
    return _extract_answer(output, strip_certainly)


def _drop_contained_keyphrases(phrases):
    """Return *phrases* without duplicates and without any phrase that is a
    substring of another phrase in the list; original order is kept."""
    unique = list(dict.fromkeys(phrases))
    return [
        phrase
        for phrase in unique
        if not any(phrase in other for other in unique if other != phrase)
    ]


def generate(title, abstract):
    """Produce keyphrases, a keyphrase elaboration and a plain-language summary.

    Side effects: writes ``data/sample-data.jsonl`` (title and tokenized
    abstract, in readable form for classifier.py to run),
    ``data/guanacoSummaryOutput.txt`` and, when keyphrases were extracted,
    ``data/guanacoElaborationOutput.txt``.

    Args:
        title: Paper title taken from the Gradio "Title" textbox.
        abstract: Abstract text taken from the Gradio "Abstract" textbox.

    Returns:
        A ``(keyword_string, elaboration, summary)`` tuple of strings.
        ``keyword_string`` and ``elaboration`` are empty when the abstract has
        fewer than two sentences or no keyphrase could be extracted.
    """
    import os

    print("Started running.")

    # Eliminate a leading "abstract" / "abstract." (any letter case).
    text = abstract
    lowered = text.lower()
    if lowered.startswith("abstract."):
        text = text[9:]
    elif lowered.startswith("abstract"):
        text = text[8:]

    sentences = sent_tokenize(text)
    newline = {"target": sentences, "title": title}

    # All output files live under data/; create it so the writes cannot fail
    # merely because the directory is missing.
    os.makedirs("data", exist_ok=True)
    with open("data/sample-data.jsonl", "w") as sample_file:
        sample_file.write(json.dumps(newline))
    print(newline)
    print("Tokenized abstract to sentences.")

    # --- Summarization -----------------------------------------------------
    # The document is the title plus (at most) the first two sentences.
    lead_sentences = sentences[:2]
    too_short_for_keyword = len(sentences) < 2
    if lead_sentences:
        doc = title + ". " + " ".join(lead_sentences)
    else:
        doc = title

    prompt = (
        "\nCan you explain the main idea of what is being studied in the following "
        "paragraph for someone who is not familiar with the topic. "
        "Comment on areas of application.:\n"
    )
    response = _ask_guanaco(prompt + doc, strip_certainly=True)
    with open("data/guanacoSummaryOutput.txt", "w") as summary_file:
        summary_file.write(response)
    print('Plain Language Summary Created.')

    # --- Keyphrase extraction ------------------------------------------------
    # Only attempted with at least two sentences; extraction may also return
    # fewer than top_n phrases, so iterate over what was actually returned.
    my_keywords = []
    if not too_short_for_keyword:
        kw_model = KeyBERT(model="all-MiniLM-L6-v2")
        vectorizer = KeyphraseCountVectorizer()
        keywords = kw_model.extract_keywords(
            doc,
            stop_words="english",
            top_n=2,
            vectorizer=vectorizer,
            use_mmr=True,
        )
        my_keywords = _drop_contained_keyphrases([entry[0] for entry in keywords])
    for entry in my_keywords:
        print(entry)

    # --- Keyphrase elaboration -------------------------------------------------
    elaboration = ""
    keyword_string = ""
    if my_keywords:
        keyword_string = ', '.join(my_keywords)
        prompt = (
            "What is the purpose of studying " + keyword_string
            + "? Comment on areas of application."
        )
        elaboration = _ask_guanaco(prompt)
        with open("data/guanacoElaborationOutput.txt", "w") as elaboration_file:
            elaboration_file.write(elaboration)
        print('Keyphrase elaboration ran.')

    return keyword_string, elaboration, response
# Web UI: two text inputs (title, abstract) feed generate(); its three return
# values fill the three output boxes in the same order.
demo = gr.Interface(
    fn=generate,
    inputs=[gr.Textbox(label="Title"), gr.Textbox(label="Abstract")],
    outputs=[
        gr.Textbox(label="Keyphrases"),
        gr.Textbox(label="Keyphrase Elaboration"),
        gr.Textbox(label="Plain Language Summary"),
    ],
)

# Launch separately so `demo` stays bound to the Interface object rather than
# to whatever launch() returns.
demo.launch(share=True)
print('after launch')  # reached once launch() returns
|