rondaravaol committed
Commit · 76f22a1 · Parent(s): ac8e459

Adding inference example

Browse files:
- README.md +67 -1
- code/labels.py +0 -22
- code/utils.py +0 -19
README.md
CHANGED

@@ -3,4 +3,70 @@ base_model:

The YAML front matter is unchanged:

```yaml
base_model:
- microsoft/markuplm-base
library_name: transformers
pipeline_tag: token-classification
---
```

Everything below is added by this commit.

This is a fine-tuned PoC of the [markuplm-base](https://huggingface.co/microsoft/markuplm-base) model for parsing news attributes from web pages: author, publication date, title, content, etc.

**Inference example**

The code accepts a URL as input, loads the web page, and returns JSON with the extracted data (author, publication date, title and content).

```python
import os

import torch
from transformers import MarkupLMForTokenClassification, MarkupLMProcessor

import labels  # repo module code/labels.py (defines id2label / label2id)
import utils   # repo module code/utils.py (HTML loading and node extraction)

id2label = {0: "none", 1: "title", 2: "content", 3: "author", 4: "date", 5: "header", 6: "footer", 7: "rail", 8: "advertisement", 9: "navigation"}


def eval(url):
    current_dir = os.path.dirname(os.path.abspath(__file__))

    model_folder = os.path.join(current_dir, 'models')  # models folder is in the repository root
    model_name = 'OxMarkupLM.pt'

    processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
    processor.parse_html = False  # nodes and xpaths are supplied already parsed

    model_path = os.path.join(model_folder, model_name)

    model = MarkupLMForTokenClassification.from_pretrained(
        model_path, id2label=labels.id2label, label2id=labels.label2id
    )

    # Download the page, clean it, and extract text nodes with their xpaths
    html = utils.clean_html(utils.get_html_content(url))
    data = [utils.extract_nodes_and_feautures(html)]
    example = utils.split_sliding_data(data, 10, 0)

    title, author, date, content = [], [], [], []
    for chunk in example:
        nodes, xpaths = chunk['nodes'], chunk['xpaths']
        encoding = processor(
            nodes=nodes, xpaths=xpaths, return_offsets_mapping=True,
            padding="max_length", truncation=True, max_length=512, return_tensors="pt"
        )
        offset_mapping = encoding.pop("offset_mapping")
        with torch.no_grad():
            logits = model(**encoding).logits

        predictions = logits.argmax(-1)
        processed_words = []

        # Keep only the first token of each word (offset[0] == 0) and route
        # the node text by its predicted class
        for pred_id, word_id, offset in zip(predictions[0].tolist(), encoding.word_ids(0), offset_mapping[0].tolist()):
            if word_id is not None and offset[0] == 0:
                if pred_id == 1:
                    title.append(nodes[word_id])
                elif pred_id == 2 and word_id not in processed_words:
                    processed_words.append(word_id)
                    content.append(nodes[word_id])
                elif pred_id == 3:
                    author.append(nodes[word_id])
                elif pred_id == 4:
                    date.append(nodes[word_id])

    title = rank_titles(title, '\n'.join(content))  # rank_titles is a helper from this repository
    return {
        "model_name": model_name,
        "url": url,
        "title": title,
        "author": author,
        "date": date,
        "content": content,
    }
```
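A minimal usage sketch, assuming the snippet above has been run in a context where the repo helpers are importable and `models/OxMarkupLM.pt` is present; the article URL is a hypothetical placeholder:

```python
import json

# Hypothetical article URL; any news page should work in principle.
result = eval("https://example.com/news/some-article")
print(json.dumps(result, indent=2, ensure_ascii=False))
```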
code/labels.py
CHANGED

```diff
@@ -7,28 +7,6 @@ id2label = {0: "none", 1:"title", 2:"content", 3:"author", 4: "date", 5: "header
 label2id = {label:id for id, label in id2label.items()}
 
 label_list = ["B-" + x for x in list(id2label.values())]
-print(label_list)
-
-def get_class_weights_tensor(dataset):
-    all_labels = [label for data_item in dataset for label in data_item['labels']]
-
-    # Count the frequency of each label
-    label_counter = Counter(all_labels)
-
-    # Calculate the class weights
-    total_count = sum(label_counter.values())
-    class_weights = {label: total_count / count for label, count in label_counter.items()}
-
-    # Normalize the weights
-    sum_weights = sum(class_weights.values())
-    normalized_class_weights = {label: weight / sum_weights for label, weight in class_weights.items()}
-
-    # Convert class weights to a tensor
-    class_weights_list = [normalized_class_weights[label] for label in sorted(normalized_class_weights.keys())]
-    class_weights_tensor = torch.tensor(class_weights_list, dtype=torch.float).to(device)
-
-    return class_weights_tensor
-
 
 def get_labels(predictions, references):
     # Transform predictions and references tensos to numpy arrays
```
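For context, the deleted `get_class_weights_tensor` computed normalized inverse-frequency class weights. A small self-contained sketch of the same arithmetic on toy data (the `Counter`/`torch` imports and the CPU `device` are assumptions here, since the original relied on module-level state):

```python
from collections import Counter

import torch

device = torch.device("cpu")  # assumption; the original used a module-level `device`

dataset = [{'labels': [0, 0, 0, 2]}, {'labels': [2, 1]}]        # toy data
counts = Counter(l for item in dataset for l in item['labels'])  # {0: 3, 2: 2, 1: 1}
total = sum(counts.values())
weights = {l: total / c for l, c in counts.items()}              # rarer label -> larger weight
norm = sum(weights.values())
weights = {l: w / norm for l, w in weights.items()}              # normalize to sum to 1
weights_tensor = torch.tensor([weights[l] for l in sorted(weights)], dtype=torch.float).to(device)
print(weights_tensor)  # tensor([0.1818, 0.5455, 0.2727])
```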
code/utils.py
CHANGED

```diff
@@ -2,29 +2,13 @@ import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 import re
-import json
-import os
 from common import custom_feature_extraction_markuplm
-import glob
-
-def get_latest_file(directory):
-    # Get list of all files in the directory
-    list_of_files = glob.glob(os.path.join(directory, '*'))
-    print("Files in model folder\n" + str(list_of_files))
-
-    # Sort files based on creation time
-    latest_file = max(list_of_files, key=os.path.getctime)
-
-    return latest_file
 
 def split_sliding_data(items, window_size, overlap):
     new_data = []
     for obj in items:
-        #print (obj.keys(), '\n')
-        #print (obj, '\n')
         nodes = obj['nodes']
         num_elements = len(nodes)
-        #print(num_elements, '\n')
         counter = 0
         for i in range(0, num_elements, window_size - overlap):
             start = i
@@ -32,12 +16,10 @@ def split_sliding_data(items, window_size, overlap):
             #print (start, end)
             new_obj = {
                 'Index': obj['Index'] if 'Index' in obj else 0,
-                'Index2': counter,
                 'Url': obj['Url'] if 'Url' in obj else None,
                 'Path': obj['Path'] if 'Path' in obj else None,
                 'nodes': obj['nodes'][start:end],
                 'xpaths': obj['xpaths'][start:end],
-                'xpaths_simple': obj['xpaths_simple'][start:end],
                 'labels': obj['labels'][start:end] if 'labels' in obj else None,
             }
             counter = counter + 1
@@ -89,7 +71,6 @@ def extract_nodes_and_feautures(html_content):
         row = {}
         row['nodes'] = encoding['nodes'][0]
         row['xpaths'] = encoding['xpaths'][0]
-        row['xpaths_simple'] = encoding['xpaths_simple'][0]
         row['labels'] = encoding['labels'][0]
         return row
     else:
```
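The surviving `split_sliding_data` chunks each page's node list into windows of `window_size` nodes advancing by `window_size - overlap`. A standalone sketch of that arithmetic on toy values (the hunk does not show the `end` computation, so `end = start + window_size` is an assumption; the inference example above calls it with `window_size=10, overlap=0`):

```python
# Toy illustration of the windowing in split_sliding_data:
# step = window_size - overlap; each window spans nodes[start:start + window_size].
nodes = [f"node{i}" for i in range(10)]
window_size, overlap = 4, 1

for start in range(0, len(nodes), window_size - overlap):
    print(nodes[start:start + window_size])
# ['node0', 'node1', 'node2', 'node3']
# ['node3', 'node4', 'node5', 'node6']
# ['node6', 'node7', 'node8', 'node9']
# ['node9']
```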