|
import pandas as pd |
|
import requests |
|
from bs4 import BeautifulSoup |
|
import re |
|
from common import custom_feature_extraction_markuplm |
|
|
|
def split_sliding_data(items, window_size, overlap):
    """Split each item's parallel lists into overlapping windows.

    Each input item is a dict holding parallel lists under 'nodes',
    'xpaths' and (optionally) 'labels'. The lists are sliced into
    windows of at most ``window_size`` elements, each window starting
    ``window_size - overlap`` elements after the previous one, so
    consecutive windows share ``overlap`` elements.

    Args:
        items: iterable of dicts with 'nodes'/'xpaths' lists and optional
            'Index', 'Url', 'Path', 'labels' entries.
        window_size: maximum number of elements per window.
        overlap: number of elements shared between consecutive windows;
            must be strictly smaller than window_size.

    Returns:
        A flat list of window dicts with the same keys ('Index'/'Url'/'Path'
        default to 0/None/None when absent; 'labels' is None when absent).

    Raises:
        ValueError: if window_size <= overlap (the stride would be <= 0).
    """
    step = window_size - overlap
    if step <= 0:
        raise ValueError("window_size must be greater than overlap")

    new_data = []
    for obj in items:
        nodes = obj['nodes']
        labels = obj.get('labels')
        num_elements = len(nodes)
        for start in range(0, num_elements, step):
            end = min(start + window_size, num_elements)
            new_data.append({
                'Index': obj.get('Index', 0),
                'Url': obj.get('Url'),
                'Path': obj.get('Path'),
                'nodes': nodes[start:end],
                'xpaths': obj['xpaths'][start:end],
                # tolerate an explicit None under 'labels' as well as a
                # missing key (the original crashed on the former)
                'labels': labels[start:end] if labels is not None else None,
            })
    return new_data
|
|
|
|
|
|
|
def get_html_content(url, timeout=10):
    """Fetch a URL and return its body text, or None on any failure.

    Args:
        url: address to fetch with an HTTP GET.
        timeout: seconds to wait for connect/read before giving up
            (new, backward-compatible parameter — the original call had
            no timeout and could hang forever on a stalled server).

    Returns:
        The response body as text on HTTP 200, otherwise None
        (non-200 status, network error, or timeout).
    """
    try:
        # timeout prevents an unresponsive host from blocking the caller
        response = requests.get(url, timeout=timeout)
    except Exception as e:
        # deliberate best-effort: report and signal failure with None
        print("Error fetching HTML content:", e)
        return None
    if response.status_code == 200:
        return response.text
    return None
|
|
|
|
|
def clean_html(html):
    """Strip <style> and <script> elements from HTML and collapse whitespace.

    Args:
        html: raw HTML markup as a string.

    Returns:
        The serialized markup with all style/script subtrees removed and
        every run of whitespace replaced by a single space.
    """
    soup = BeautifulSoup(html, "html.parser")

    # remove presentation/behavior subtrees entirely (tag + contents)
    for tag in soup(['style', 'script']):
        tag.decompose()

    # renamed from `clean_html`: the original local shadowed the function name
    cleaned = re.sub(r'\s+', ' ', str(soup))
    return cleaned
|
|
|
|
|
def extract_nodes_and_feautures(html_content):
    """Extract node texts, xpaths and labels from an HTML document.

    Args:
        html_content: raw HTML string, or a falsy value (None/"") when the
            fetch failed upstream.

    Returns:
        A dict with keys 'nodes', 'xpaths', 'labels' holding the first
        example of the extractor's batch output, or an empty pandas Series
        when html_content is falsy (kept for backward compatibility with
        existing callers).
    """
    if not html_content:
        # explicit dtype avoids pandas' "default dtype for empty Series"
        # deprecation warning while keeping the same empty-Series contract
        return pd.Series(dtype=object)

    # clean_html parses with html.parser itself, so the extra
    # BeautifulSoup round-trip the original did here was redundant
    cleaned_html = clean_html(html_content)

    feature_extractor = custom_feature_extraction_markuplm.CustomMarkupLMFeatureExtractor(None)
    encoding = feature_extractor(cleaned_html)

    # NOTE(review): assumes the extractor returns batched lists under
    # 'nodes'/'xpaths'/'labels'; we surface only the first example
    return {
        'nodes': encoding['nodes'][0],
        'xpaths': encoding['xpaths'][0],
        'labels': encoding['labels'][0],
    }
|
|