from transformers import MarkupLMForTokenClassification
from transformers import MarkupLMProcessor
from code import utils, labels
import torch
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def rank_titles(titles, content):
    """Rank candidate titles by TF-IDF cosine similarity to the extracted content."""
    if not titles:
        # Nothing to rank; avoids an sklearn error on an empty candidate list.
        return titles
    vectorizer = TfidfVectorizer()
    texts = titles + [content]
    tfidf_matrix = vectorizer.fit_transform(texts)
    # Compare the content vector (last row) against every candidate title.
    cosine_similarities = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1]).flatten()
    ranked_titles_indices = np.argsort(cosine_similarities)[::-1]
    ranked_titles = [titles[idx] for idx in ranked_titles_indices]
    return ranked_titles


def eval(url):
    """Extract title, author, date, and content from the page at `url` using the fine-tuned MarkupLM model."""
    current_dir = os.path.dirname(os.path.abspath(__file__))
    model_folder = os.path.join(current_dir, 'models')  # models folder is in the repository root
    model_name = 'OxMarkupLM.pt'

    # The HTML is already parsed into nodes/xpaths by our utilities, so disable the processor's own parser.
    processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
    processor.parse_html = False

    model_path = os.path.join(model_folder, model_name)
    model = MarkupLMForTokenClassification.from_pretrained(
        model_path, id2label=labels.id2label, label2id=labels.label2id
    )

    # Fetch and clean the page, then split its nodes into overlapping windows
    # so long pages fit within the 512-token limit.
    html = utils.clean_html(utils.get_html_content(url))
    data = [utils.extract_nodes_and_feautures(html)]
    example = utils.split_sliding_data(data, 10, 0)

    title, author, date, content = [], [], [], []
    for splited in example:
        nodes, xpaths = splited['nodes'], splited['xpaths']
        encoding = processor(
            nodes=nodes,
            xpaths=xpaths,
            return_offsets_mapping=True,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        # The model does not accept offset_mapping as an input; keep it aside to filter subword tokens below.
        offset_mapping = encoding.pop("offset_mapping")

        with torch.no_grad():
            logits = model(**encoding).logits
        predictions = logits.argmax(-1)

        processed_words = []
        for pred_id, word_id, offset in zip(
            predictions[0].tolist(), encoding.word_ids(0), offset_mapping[0].tolist()
        ):
            # Only use the first subword of each node (offset[0] == 0) and skip special tokens (word_id is None).
            if word_id is not None and offset[0] == 0:
                if pred_id == 1:
                    title.append(nodes[word_id])
                elif pred_id == 2 and word_id not in processed_words:
                    # De-duplicate content nodes within this window.
                    processed_words.append(word_id)
                    content.append(nodes[word_id])
                elif pred_id == 3:
                    author.append(nodes[word_id])
                elif pred_id == 4:
                    date.append(nodes[word_id])

    # Order title candidates by their similarity to the extracted content.
    title = rank_titles(title, '\n'.join(content))
    return {
        "model_name": model_name,
        "url": url,
        "title": title,
        "author": author,
        "date": date,
        "content": content,
    }
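

# --- Usage sketch ------------------------------------------------------------
# A minimal way to run the extractor from the command line; not part of the
# module's original API. The example URL and the JSON pretty-printing are
# illustrative assumptions only.
if __name__ == "__main__":
    import json
    import sys

    # e.g.  python path/to/this_file.py https://example.com/some-article
    page_url = sys.argv[1] if len(sys.argv) > 1 else "https://example.com/some-article"
    result = eval(page_url)
    print(json.dumps(result, indent=2, ensure_ascii=False))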