OxMarkupLM / code /utils.py
rondaravaol
Adding inference example
76f22a1
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from common import custom_feature_extraction_markuplm
def split_sliding_data(items, window_size, overlap):
    """Split each item's node sequence into overlapping windows.

    For every item, windows start at each multiple of
    ``window_size - overlap`` below the number of nodes and span at most
    ``window_size`` elements, so trailing windows may be shorter than
    ``window_size``.

    Args:
        items: Iterable of mappings holding ``'nodes'`` and ``'xpaths'``
            sequences (sliced with the same bounds), and optionally
            ``'labels'``, ``'Index'``, ``'Url'`` and ``'Path'``.
        window_size: Maximum number of elements per window.
        overlap: Number of elements shared by consecutive windows; must be
            smaller than ``window_size``.

    Returns:
        A flat list with one dict per window, carrying the keys ``'Index'``,
        ``'Url'``, ``'Path'``, ``'nodes'``, ``'xpaths'`` and ``'labels'``.
        A missing ``'Index'`` defaults to 0; a missing ``'Url'``, ``'Path'``
        or ``'labels'`` becomes ``None``.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``window_size``.
    """
    step = window_size - overlap
    if step <= 0:
        # A zero step makes range() fail with an opaque message, and a
        # negative step silently produces no windows, dropping every item.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than window_size ({window_size})"
        )

    new_data = []
    for obj in items:
        nodes = obj['nodes']
        xpaths = obj['xpaths']
        labels = obj.get('labels')
        num_elements = len(nodes)
        for start in range(0, num_elements, step):
            end = min(start + window_size, num_elements)
            new_data.append({
                'Index': obj.get('Index', 0),
                'Url': obj.get('Url'),
                'Path': obj.get('Path'),
                'nodes': nodes[start:end],
                'xpaths': xpaths[start:end],
                'labels': labels[start:end] if labels is not None else None,
            })
    return new_data
# Function to fetch HTML content from URL
def get_html_content(url, timeout=30):
    """Fetch the response body of ``url`` as text.

    Args:
        url: Address to request with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        The response text when the status code is 200; ``None`` when the
        status code differs or the request fails (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print("Error fetching HTML content:", e)
        return None

    if response.status_code == 200:
        return response.text
    return None
# Function to clean HTML content
def clean_html(html):
    """Strip ``<style>``/``<script>`` elements and collapse whitespace.

    Args:
        html: Markup to clean; parsed with BeautifulSoup's ``html.parser``.

    Returns:
        The re-serialised markup with every run of whitespace (spaces,
        newlines, tabs) replaced by a single space.
    """
    document = BeautifulSoup(html, "html.parser")

    # Drop the elements together with everything nested inside them.
    for element in document.find_all(['style', 'script']):
        element.decompose()

    serialised = str(document)
    return re.sub(r'\s+', ' ', serialised)
# Function to extract nodes, xpaths and labels from HTML content
def extract_nodes_and_feautures(html_content):
    """Run the custom MarkupLM feature extractor over an HTML document.

    The markup is parsed, cleaned with ``clean_html`` and passed to
    ``CustomMarkupLMFeatureExtractor``; the first entry of the extractor's
    ``'nodes'``, ``'xpaths'`` and ``'labels'`` outputs is kept.

    Args:
        html_content: HTML markup to process. Falsy values (``None``, ``""``)
            short-circuit.

    Returns:
        A dict with keys ``'nodes'``, ``'xpaths'`` and ``'labels'``, or an
        empty ``pandas.Series`` when ``html_content`` is falsy.
    """
    if not html_content:
        return pd.Series()

    parsed = BeautifulSoup(html_content, 'html.parser')
    cleaned_html = clean_html(str(parsed))
    extractor = custom_feature_extraction_markuplm.CustomMarkupLMFeatureExtractor(None)
    encoding = extractor(cleaned_html)
    # NOTE(review): presumably the extractor returns one entry per input
    # document, hence index 0 — confirm against the extractor.
    return {key: encoding[key][0] for key in ('nodes', 'xpaths', 'labels')}