|
from transformers import MarkupLMForTokenClassification |
|
from transformers import MarkupLMProcessor |
|
from code import utils, labels |
|
import torch |
|
import os |
|
import numpy as np |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
|
|
def rank_titles(titles, content):
    """Order candidate titles by TF-IDF cosine similarity to the content.

    Args:
        titles: Sequence of candidate title strings.
        content: The document text the candidates are compared against.

    Returns:
        A new list holding the same titles, most similar to ``content``
        first. With zero or one candidate, or when no TF-IDF vocabulary can
        be built from the texts, the titles are returned in their original
        order.
    """
    candidates = list(titles)

    # Nothing to rank. This also avoids passing scikit-learn a matrix with
    # zero title rows, which its input validation rejects with ValueError.
    if len(candidates) <= 1:
        return candidates

    vectorizer = TfidfVectorizer()
    try:
        # The content is appended last so it ends up as the final matrix row.
        tfidf_matrix = vectorizer.fit_transform(candidates + [content])
    except ValueError:
        # Raised when the vocabulary is empty (e.g. every text is blank or
        # consists only of stop words); there is no basis for ranking.
        return candidates

    # Similarity of the content row (last) against every title row.
    similarities = cosine_similarity(
        tfidf_matrix[-1:], tfidf_matrix[:-1]
    ).flatten()

    # argsort is ascending; reverse it to put the best match first.
    ranked_indices = np.argsort(similarities)[::-1]
    return [candidates[idx] for idx in ranked_indices]
|
|
|
|
|
def eval(url):
    """Extract title, author, date and content nodes from the page at ``url``.

    The HTML is fetched and cleaned through ``utils``, converted to nodes and
    xpaths, split into windows, and each window is classified with the
    MarkupLM token-classification model loaded from ``models/OxMarkupLM.pt``
    next to this file. Predicted label ids select the output list:
    1 -> title, 2 -> content, 3 -> author, 4 -> date
    (assumed to agree with ``labels.id2label`` -- TODO confirm).

    NOTE: this function shadows the built-in ``eval``; the name is kept
    because callers rely on it.
    NOTE(review): the processor and model are loaded on every call.

    Args:
        url: Address of the page to analyse.

    Returns:
        dict with keys ``model_name``, ``url``, ``title`` (candidates ranked
        by similarity to the extracted content, empty if none were found),
        ``author``, ``date`` and ``content`` (lists of node texts).
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    model_name = 'OxMarkupLM.pt'
    model_path = os.path.join(base_dir, 'models', model_name)

    processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
    # Nodes and xpaths are supplied explicitly below, so the processor must
    # not try to parse raw HTML itself.
    processor.parse_html = False

    model = MarkupLMForTokenClassification.from_pretrained(
        model_path, id2label=labels.id2label, label2id=labels.label2id
    )

    html = utils.clean_html(utils.get_html_content(url))
    data = [utils.extract_nodes_and_feautures(html)]
    # NOTE(review): the meaning of the arguments 10 and 0 is defined by
    # utils.split_sliding_data (presumably window size and overlap) -- confirm.
    example = utils.split_sliding_data(data, 10, 0)

    title, author, date, content = [], [], [], []
    for window in example:
        nodes, xpaths = window['nodes'], window['xpaths']
        encoding = processor(
            nodes=nodes, xpaths=xpaths, return_offsets_mapping=True,
            padding="max_length", truncation=True, max_length=512, return_tensors="pt"
        )
        # The offsets are only needed for filtering below and must be
        # removed before the encoding is unpacked into the model call.
        offset_mapping = encoding.pop("offset_mapping")
        with torch.no_grad():
            logits = model(**encoding).logits

        predictions = logits.argmax(-1)
        # Node indices already emitted as content within this window.
        seen_content_words = set()

        for pred_id, word_id, offset in zip(
            predictions[0].tolist(), encoding.word_ids(0), offset_mapping[0].tolist()
        ):
            # Skip tokens that map to no node (word_id is None) and tokens
            # whose offset does not start at 0 (presumably continuation
            # pieces of a node -- confirm).
            if word_id is None or offset[0] != 0:
                continue

            if pred_id == 1:
                title.append(nodes[word_id])
            elif pred_id == 2 and word_id not in seen_content_words:
                seen_content_words.add(word_id)
                content.append(nodes[word_id])
            elif pred_id == 3:
                author.append(nodes[word_id])
            elif pred_id == 4:
                date.append(nodes[word_id])

    # Only rank when there is something to rank: with no title candidates
    # the similarity computation inside rank_titles has nothing to compare.
    if title:
        title = rank_titles(title, '\n'.join(content))

    return {
        "model_name": model_name,
        "url": url,
        "title": title,
        "author": author,
        "date": date,
        "content": content,
    }
|
|
|
|