rondaravaol committed on
Commit 5a69a9a · 1 Parent(s): 6a54372
Files changed (5)
  1. .gitattributes +1 -0
  2. api_inference.py +11 -0
  3. code/labels.py +51 -0
  4. code/utils.py +96 -0
  5. eval.py +77 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.extension filter=lfs diff=lfs merge=lfs -text
api_inference.py ADDED
@@ -0,0 +1,11 @@
+ from eval import eval
+
+ def query(payload):
+     url = payload.get("url", "")
+     if not url:
+         return {"error": "No URL provided"}
+     try:
+         result = eval(url)
+         return result
+     except Exception as e:
+         return {"error": str(e)}
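A minimal sketch of how the new `query` entry point could be exercised, assuming the repository's dependencies and the `models/OxMarkupLM.pt` checkpoint used by `eval.py` are available locally; the payload URL is a placeholder:

    from api_inference import query

    # Placeholder URL; query() forwards it to eval() and returns its dict, or an error dict.
    response = query({"url": "https://example.com/some-article"})
    print(response.get("title"), response.get("author"), response.get("date"))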
code/labels.py ADDED
@@ -0,0 +1,51 @@
+ import torch
+ from collections import Counter
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ id2label = {0: "none", 1: "title", 2: "content", 3: "author", 4: "date", 5: "header", 6: "footer", 7: "rail", 8: "advertisement", 9: "navigation"}
+ label2id = {label: id for id, label in id2label.items()}
+
+ label_list = ["B-" + x for x in list(id2label.values())]
+ print(label_list)
+
+ def get_class_weights_tensor(dataset):
+     all_labels = [label for data_item in dataset for label in data_item['labels']]
+
+     # Count the frequency of each label
+     label_counter = Counter(all_labels)
+
+     # Calculate the class weights
+     total_count = sum(label_counter.values())
+     class_weights = {label: total_count / count for label, count in label_counter.items()}
+
+     # Normalize the weights
+     sum_weights = sum(class_weights.values())
+     normalized_class_weights = {label: weight / sum_weights for label, weight in class_weights.items()}
+
+     # Convert class weights to a tensor
+     class_weights_list = [normalized_class_weights[label] for label in sorted(normalized_class_weights.keys())]
+     class_weights_tensor = torch.tensor(class_weights_list, dtype=torch.float).to(device)
+
+     return class_weights_tensor
+
+
+ def get_labels(predictions, references):
+     # Transform prediction and reference tensors to numpy arrays
+     if device.type == "cpu":
+         y_pred = predictions.detach().clone().numpy()
+         y_true = references.detach().clone().numpy()
+     else:
+         y_pred = predictions.detach().cpu().clone().numpy()
+         y_true = references.detach().cpu().clone().numpy()
+
+     # Remove ignored index (special tokens)
+     true_predictions = [
+         [label_list[p] for (p, l) in zip(pred, gold_label) if l != -100]
+         for pred, gold_label in zip(y_pred, y_true)
+     ]
+     true_labels = [
+         [label_list[l] for (p, l) in zip(pred, gold_label) if l != -100]
+         for pred, gold_label in zip(y_pred, y_true)
+     ]
+     return true_predictions, true_labels
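A short sketch of how these helpers might be used together, assuming a token-classification setup where each dataset item carries integer label ids under 'labels' and ignored positions are marked with -100; the toy tensors below are illustrative only:

    import torch
    from code.labels import get_class_weights_tensor, get_labels

    # Toy dataset: per-item integer label ids for each node.
    dataset = [{"labels": [0, 1, 2, 2]}, {"labels": [0, 2, 3, 4]}]
    weights = get_class_weights_tensor(dataset)  # normalized per-class weights, ordered by label id

    # Toy batch of one sequence; -100 marks positions get_labels() should skip.
    predictions = torch.tensor([[1, 2, 0]])
    references = torch.tensor([[1, 2, -100]])
    preds, golds = get_labels(predictions, references)  # ([['B-title', 'B-content']], [['B-title', 'B-content']])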
code/utils.py ADDED
@@ -0,0 +1,96 @@
+ import pandas as pd
+ import requests
+ from bs4 import BeautifulSoup
+ import re
+ import json
+ import os
+ from common import custom_feature_extraction_markuplm
+ import glob
+
+ def get_latest_file(directory):
+     # Get list of all files in the directory
+     list_of_files = glob.glob(os.path.join(directory, '*'))
+     print("Files in model folder\n" + str(list_of_files))
+
+     # Pick the most recently created file
+     latest_file = max(list_of_files, key=os.path.getctime)
+
+     return latest_file
+
+ def split_sliding_data(items, window_size, overlap):
+     new_data = []
+     for obj in items:
+         #print(obj.keys(), '\n')
+         #print(obj, '\n')
+         nodes = obj['nodes']
+         num_elements = len(nodes)
+         #print(num_elements, '\n')
+         counter = 0
+         for i in range(0, num_elements, window_size - overlap):
+             start = i
+             end = min(i + window_size, num_elements)
+             #print(start, end)
+             new_obj = {
+                 'Index': obj['Index'] if 'Index' in obj else 0,
+                 'Index2': counter,
+                 'Url': obj['Url'] if 'Url' in obj else None,
+                 'Path': obj['Path'] if 'Path' in obj else None,
+                 'nodes': obj['nodes'][start:end],
+                 'xpaths': obj['xpaths'][start:end],
+                 'xpaths_simple': obj['xpaths_simple'][start:end],
+                 'labels': obj['labels'][start:end] if 'labels' in obj else None,
+             }
+             counter = counter + 1
+             #print(new_obj, '\n')
+             new_data.append(new_obj)
+
+     return new_data
+
+
+ # Function to fetch HTML content from a URL
+ def get_html_content(url):
+     try:
+         response = requests.get(url)
+         if response.status_code == 200:
+             return response.text
+         else:
+             return None
+     except Exception as e:
+         print("Error fetching HTML content:", e)
+         return None
+
+ # Function to clean HTML content
+ def clean_html(html):
+     # Remove extra whitespace, newlines, and tabs
+     soup = BeautifulSoup(html, "html.parser")
+
+     for data in soup(['style', 'script']):
+         # Remove tags
+         data.decompose()
+
+     html = str(soup)
+     clean_html = re.sub(r'\s+', ' ', html)
+     # Escape double quotes and wrap content in double quotes
+     #clean_html = clean_html.replace('"', '""')
+     #clean_html = f'"{clean_html}"'
+     return clean_html
+
+ # Function to extract nodes and MarkupLM features from HTML content
+ def extract_nodes_and_feautures(html_content):
+     if html_content:
+         soup = BeautifulSoup(html_content, 'html.parser')
+         cleaned_html = clean_html(str(soup))
+
+         feature_extractor = custom_feature_extraction_markuplm.CustomMarkupLMFeatureExtractor(None)
+
+         encoding = feature_extractor(cleaned_html)
+
+         #print(encoding.keys())
+         row = {}
+         row['nodes'] = encoding['nodes'][0]
+         row['xpaths'] = encoding['xpaths'][0]
+         row['xpaths_simple'] = encoding['xpaths_simple'][0]
+         row['labels'] = encoding['labels'][0]
+         return row
+     else:
+         return pd.Series()
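A rough sketch of how these utilities chain together outside of `eval.py`, assuming `common.custom_feature_extraction_markuplm` from this repository is importable and the target page is reachable; the URL is a placeholder:

    from code import utils

    html = utils.get_html_content("https://example.com/some-article")  # placeholder URL
    if html is not None:
        row = utils.extract_nodes_and_feautures(utils.clean_html(html))
        # Windows of 10 nodes with no overlap, mirroring the call in eval.py.
        windows = utils.split_sliding_data([row], window_size=10, overlap=0)
        print(len(windows), "windows;", len(windows[0]["nodes"]), "nodes in the first")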
eval.py ADDED
@@ -0,0 +1,77 @@
+ from transformers import MarkupLMForTokenClassification
+ from transformers import MarkupLMProcessor
+ from code import utils, labels
+ import torch
+ import os
+ import numpy as np
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+
+ def rank_titles(titles, content):
+     vectorizer = TfidfVectorizer()
+     texts = titles + [content]
+
+     tfidf_matrix = vectorizer.fit_transform(texts)
+
+     cosine_similarities = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1]).flatten()
+     ranked_titles_indices = np.argsort(cosine_similarities)[::-1]
+     ranked_titles = [titles[idx] for idx in ranked_titles_indices]
+     return ranked_titles
+
+
+ def eval(url):
+     current_dir = os.path.dirname(os.path.abspath(__file__))
+
+     model_folder = os.path.join(current_dir, 'models')  # models folder is in the repository root
+     model_name = 'OxMarkupLM.pt'
+
+     processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
+     processor.parse_html = False
+
+     model_path = os.path.join(model_folder, model_name)
+
+     model = MarkupLMForTokenClassification.from_pretrained(
+         model_path, id2label=labels.id2label, label2id=labels.label2id
+     )
+
+     html = utils.clean_html(utils.get_html_content(url))
+     data = [utils.extract_nodes_and_feautures(html)]
+     example = utils.split_sliding_data(data, 10, 0)
+
+     title, author, date, content = [], [], [], []
+     for splited in example:
+         nodes, xpaths = splited['nodes'], splited['xpaths']
+         encoding = processor(
+             nodes=nodes, xpaths=xpaths, return_offsets_mapping=True,
+             padding="max_length", truncation=True, max_length=512, return_tensors="pt"
+         )
+         offset_mapping = encoding.pop("offset_mapping")
+         with torch.no_grad():
+             logits = model(**encoding).logits
+
+         predictions = logits.argmax(-1)
+         processed_words = []
+
+         for pred_id, word_id, offset in zip(predictions[0].tolist(), encoding.word_ids(0), offset_mapping[0].tolist()):
+             if word_id is not None and offset[0] == 0:
+                 if pred_id == 1:
+                     title.append(nodes[word_id])
+                 elif pred_id == 2 and word_id not in processed_words:
+                     processed_words.append(word_id)
+                     content.append(nodes[word_id])
+                 elif pred_id == 3:
+                     author.append(nodes[word_id])
+                 elif pred_id == 4:
+                     date.append(nodes[word_id])
+
+     title = rank_titles(title, '\n'.join(content))
+     return {
+         "model_name": model_name,
+         "url": url,
+         "title": title,
+         "author": author,
+         "date": date,
+         "content": content,
+     }
+
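End to end, the new pipeline can be tried like this, assuming the fine-tuned `models/OxMarkupLM.pt` checkpoint sits next to `eval.py` and transformers, torch, scikit-learn, and beautifulsoup4 are installed; the URL is a placeholder:

    from eval import eval

    result = eval("https://example.com/some-article")  # placeholder URL
    print(result["title"][:1])                         # highest-ranked title candidate, if any
    print(result["author"], result["date"])
    print(" ".join(result["content"])[:500])           # first part of the extracted body text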