metadata
base_model:
- microsoft/markuplm-base
library_name: transformers
pipeline_tag: token-classification
tags:
- html
- parser
- web-crawler
- news
- crawler
This is a fine-tuned proof of concept (PoC) of the markuplm-base model for parsing news attributes from web pages: author, publication date, content, etc.
Inference example
The code accepts a URL as input, loads the web page, and returns JSON with the extracted data (author, publication date, title and content).
# Mapping from predicted class id to the label name of a page node.
id2label = {
    0: "none",
    1: "title",
    2: "content",
    3: "author",
    4: "date",
    5: "header",
    6: "footer",
    7: "rail",
    8: "advertisement",
    9: "navigation",
}
def eval(url):
    """Extract news attributes from the web page at ``url``.

    Loads the MarkupLM processor and the fine-tuned token-classification
    checkpoint, fetches and cleans the page HTML through ``utils``, runs the
    model over every window produced by ``utils.split_sliding_data`` and
    collects the text of the nodes predicted as title, content, author or
    date.

    Note: the function name shadows the builtin ``eval``; it is kept so that
    existing callers keep working.

    Args:
        url: Address of the page to process; passed to
            ``utils.get_html_content``.

    Returns:
        A dict with the keys ``"model_name"``, ``"url"``, ``"title"`` (the
        result of ``rank_titles`` applied to the title candidates and the
        newline-joined content), ``"author"``, ``"date"`` and ``"content"``
        (lists of node texts).
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    model_folder = os.path.join(current_dir, 'models')  # models folder is in the repository root
    model_name = 'OxMarkupLM.pt'
    model_path = os.path.join(model_folder, model_name)

    processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
    # Nodes and xpaths are passed to the processor explicitly below, so its
    # own HTML parsing is switched off.
    processor.parse_html = False

    model = MarkupLMForTokenClassification.from_pretrained(
        model_path, id2label=labels.id2label, label2id=labels.label2id
    )

    html = utils.clean_html(utils.get_html_content(url))
    data = [utils.extract_nodes_and_feautures(html)]
    example = utils.split_sliding_data(data, 10, 0)

    # Class ids of the labels that are collected; every other prediction
    # (none, header, footer, ...) is ignored.
    title_id, content_id, author_id, date_id = 1, 2, 3, 4

    title, author, date, content = [], [], [], []
    # Labels that are collected without de-duplication.
    plain_buckets = {title_id: title, author_id: author, date_id: date}

    for splited in example:
        nodes, xpaths = splited['nodes'], splited['xpaths']
        encoding = processor(
            nodes=nodes, xpaths=xpaths, return_offsets_mapping=True,
            padding="max_length", truncation=True, max_length=512, return_tensors="pt"
        )
        # The offsets are only needed for filtering tokens below and must not
        # be forwarded to the model.
        offset_mapping = encoding.pop("offset_mapping")

        with torch.no_grad():
            logits = model(**encoding).logits
        predictions = logits.argmax(-1)

        # Node indices already stored as content within this window; a set
        # keeps the membership test O(1).
        seen_content = set()

        token_info = zip(
            predictions[0].tolist(), encoding.word_ids(0), offset_mapping[0].tolist()
        )
        for pred_id, word_id, offset in token_info:
            # Skip tokens that belong to no node and tokens whose offset does
            # not start at 0 (presumably continuation tokens of a node).
            if word_id is None or offset[0] != 0:
                continue

            if pred_id == content_id:
                if word_id not in seen_content:
                    seen_content.add(word_id)
                    content.append(nodes[word_id])
            elif pred_id in plain_buckets:
                plain_buckets[pred_id].append(nodes[word_id])

    title = rank_titles(title, '\n'.join(content))

    return {
        "model_name": model_name,
        "url": url,
        "title": title,
        "author": author,
        "date": date,
        "content": content,
    }