# mdj1412
# commit first
# c109682
import os
import torch
import numpy as np
import spacy
from spacy.tokens import Span
from spacy.attrs import ENT_IOB, ENT_TYPE
from spacy import displacy
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
# Pick the compute backend once at import time: CUDA first, then Apple MPS,
# and CPU as the fallback. The MPS check is only evaluated when CUDA is absent.
device = (
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)
print(f"inference.py -> DEVICE : {device}")
# Placeholder input for the commented-out practice snippet below.
long_text = "Here is a lot of text I don't want to read. Replace me"

# Summarization pipeline used to shorten long prompts.
# It runs on the first CUDA GPU when one is available and on the CPU (-1)
# otherwise.
summarizer = pipeline(
    task="summarization",
    model="pszemraj/long-t5-tglobal-base-16384-book-summary",
    device=-1 if not torch.cuda.is_available() else 0,
)

# [ Practice ]
# result = summarizer(long_text)
# print(result[0]["summary_text"])
# Tokenizer and seq2seq model are loaded from the same Tk-Instruct checkpoint.
_TK_INSTRUCT_CHECKPOINT = "allenai/tk-instruct-base-def-pos"
tokenizer = AutoTokenizer.from_pretrained(_TK_INSTRUCT_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_TK_INSTRUCT_CHECKPOINT)
# k = pipeline("text2text-generation", model="allenai/tk-instruct-3b-def")

# [ Practice ]
# Example 1: instruction with a definition and an input.
# input_ids = tokenizer.encode("Definition: return the currency of the given country. Now complete the following example - Input: India. Output:",
#                              return_tensors="pt")
# output = model.generate(input_ids, max_length=10)
# output = tokenizer.decode(output[0], skip_special_tokens=True)  # model should output 'Indian Rupee'
# print(output)
#
# Example 2: sentence negation.
# input_ids = tokenizer.encode("Definition: negate the following sentence. Input: John went to school. Output:",
#                              return_tensors="pt")
# output = model.generate(input_ids, max_length=10)
# output = tokenizer.decode(output[0], skip_special_tokens=True)  # model should output 'John did not go to school.'
# print(output)
#
# Example 3: free-form question about a news paragraph.
# text = "Alphabet's results also missed forecasts on revenue and earnings per share, as advertising declined year-over-year. The numbers come after the company laid off about 12,000 employees in January, a move CEO Sundar Pichai blamed on Alphabet overhiring during the pandemic boom. \
# Q: Why did Alphabet's stock go down?"
# input_ids = tokenizer.encode(text, return_tensors="pt")
# output = model.generate(input_ids, max_length=10)
# output = tokenizer.decode(output[0], skip_special_tokens=True)  # expected: a short answer to the question
# print(output)
def Tk_instruct(text, questions, max_length=10):
    """Answer a question about ``text`` with the module-level Tk-Instruct model.

    The prompt is ``text``, a blank line, ``"Q: "`` and ``questions``. When
    that prompt is 512 characters or longer, ``text`` is first shortened with
    the module-level ``summarizer`` and the question is appended to the
    summary, so the model always sees the question.

    Args:
        text: Context passage (for example a news article) as a string.
        questions: The question to ask about ``text``, as a single string.
        max_length: ``max_length`` forwarded to ``model.generate``. The
            default of 10 matches the previously hard-coded value.

    Returns:
        The decoded model output. If summarization was applied, the result is
        ``"Summary News : <summary>"``, a blank line, then
        ``"Answer : <output>"``.
    """
    question_suffix = "\n\nQ: " + questions
    prompt = text + question_suffix
    print("Model's input : ", prompt)

    # Tracks whether summarization was applied. A flag is used rather than
    # testing the summary string, because the summary could be empty.
    summarized = False
    summarized_data = ""

    # NOTE: the threshold is measured in characters, not tokens.
    if len(prompt) >= 512:
        print(f"===================== Apply Summarization : length = {len(prompt)} =====================")
        # Summarize only the context. Feeding the whole prompt to the
        # summarizer (as was done before) replaced the question with a
        # summary, so the model could be asked to answer nothing.
        summarized_data = summarizer(text)[0]["summary_text"]
        print(f"===================== Summary text : {summarized_data} =====================")
        summarized = True
        prompt = summarized_data + question_suffix

    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    generated = model.generate(input_ids, max_length=max_length)
    answer = tokenizer.decode(generated[0], skip_special_tokens=True)

    if summarized:
        return "Summary News : " + summarized_data + "\n\n" + "Answer : " + answer
    return answer
# NER practice
def practice1():
    """Run five spaCy named-entity examples and print their results.

    Sections:
        1. Run the pipeline on a sentence and print its entities.
        2. Read entity annotations at document and token level.
        3. Add an entity the model missed ("fb") to an existing doc.
        4. Set entity annotations from a NumPy attribute array.
        5. Serve a displaCy entity visualization over HTTP.

    The final ``displacy.serve`` call starts a web server, so this function
    does not return until that server is stopped.
    """
    def banner(title):
        # Section separator printed before every example.
        print(f"======================={ title }=======================")

    # The first banner used to interpolate the float literal ``1.`` and
    # therefore printed "1.0"; pass the section number as text instead.
    banner("1.")

    # Load the English pipeline once and reuse it for every section instead
    # of reloading it from disk five times.
    nlp = spacy.load("en_core_web_sm")

    doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
    print(doc)
    print(doc.ents)
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)

    banner("2. Accessing entity annotations and labels")
    doc = nlp("San Francisco considers banning sidewalk delivery robots")
    # Document level
    ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
    print(ents)
    # Token level IOB codes:
    #   I - Token is inside an entity.
    #   O - Token is outside an entity.
    #   B - Token is the beginning of an entity.
    ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
    ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
    print(ent_san)
    print(ent_francisco)

    banner("3. Setting entity annotations")
    doc = nlp("fb is hiring a new vice president of global policy")
    ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
    print('Before', ents)
    # The model didn't recognize "fb" as an entity, so create a span for it.
    fb_ent = Span(doc, 0, 1, label="ORG")
    print(fb_ent)
    # Snapshot the entities before modifying the doc; option 2 builds on it.
    orig_ents = list(doc.ents)
    # Option 1: Modify the provided entity spans, leaving the rest unmodified
    doc.set_ents([fb_ent], default="unmodified")
    # Option 2: Assign a complete list of ents to doc.ents
    doc.ents = orig_ents + [fb_ent]
    ents = [(e.text, e.start, e.end, e.label_) for e in doc.ents]
    print('After', ents)
    # [('fb', 0, 1, 'ORG')]

    banner("4. Setting entity annotations from array")
    # make_doc only tokenizes, so the doc starts without entities.
    doc = nlp.make_doc("London is a big city in the United Kingdom.")
    print("Before", doc.ents)  # []
    header = [ENT_IOB, ENT_TYPE]
    print(header)
    # One row per token, one column per attribute in ``header``.
    attr_array = np.zeros((len(doc), len(header)), dtype="uint64")
    print(attr_array)
    attr_array[0, 0] = 3  # B
    attr_array[0, 1] = doc.vocab.strings["GPE"]
    doc.from_array(header, attr_array)
    print(attr_array)
    print("After", doc.ents)  # [London]

    banner("5. Visualizing named entities")
    text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
    doc = nlp(text)
    # displacy.serve(doc, style="ent")
    # NOTE(review): port 3 is in the privileged range on many systems and may
    # need elevated permissions to bind; confirm this port is intended.
    displacy.serve(doc, port=3, style="ent")
############################################################################
# Practice: build news.html from news_analysis.html + ner.html
from flask import Flask, jsonify, request, render_template
from bs4 import BeautifulSoup
# Flask application object; the route defined below registers itself on it.
app = Flask(__name__)
@app.route('/')
def practice2():
    """Render two sentences with displaCy and embed them in a page template.

    The displaCy page is parsed with BeautifulSoup and its ``<figure>``
    elements are concatenated. That fragment is inserted into
    ``./templates/news_analysis.html`` shortly after the ``"ner-box"``
    marker. The result is written to ``./modules/templates/example.html`` and
    served through ``render_template("example.html")``.

    Returns:
        The rendered ``example.html`` template.

    Raises:
        ValueError: If ``news_analysis.html`` does not contain ``"ner-box"``.
    """
    title = "1. Rendering HTML"
    print(f"======================={ title }=======================")

    nlp = spacy.load("en_core_web_sm")
    doc1 = nlp("This is a sentence.")
    doc2 = nlp("This is another sentence.")
    page_html = displacy.render([doc1, doc2], style="dep", page=True)
    print("ner_html : ", page_html)

    # Keep only the <figure> elements of the generated page.
    soup = BeautifulSoup(page_html, 'html.parser')
    figures_html = "".join(str(figure) for figure in soup.select('figure'))

    # Read the base template. UTF-8 is explicit because Jinja reads templates
    # as UTF-8, whereas the locale default may differ.
    with open("./templates/news_analysis.html", 'r', encoding="utf-8") as template_file:
        html = template_file.read()

    marker_pos = html.find("ner-box")
    if marker_pos == -1:
        # Without this check, find()'s -1 would silently insert the fragment
        # at index 8 and corrupt the page.
        raise ValueError('marker "ner-box" not found in ./templates/news_analysis.html')
    # Insertion point for the NER HTML. The +9 presumably skips the marker
    # text plus the closing quote and '>' of its tag; verify against the
    # template.
    insert_at = marker_pos + 9
    html = html[:insert_at] + figures_html + html[insert_at:]

    # The with-block closes (and thus flushes) the file before Flask reads it;
    # previously the handle was left open and the write could still be
    # buffered when the template was rendered.
    with open("./modules/templates/example.html", 'w', encoding="utf-8") as output_file:
        output_file.write(html)

    return render_template("example.html")
# Script entry point: only the spaCy NER walkthrough runs.
# Starting the Flask server is left disabled.
if __name__ == "__main__":
    # app.run(host='0.0.0.0', port='777')
    practice1()