|
import pandas as pd |
|
import requests |
|
from bs4 import BeautifulSoup |
|
import re |
|
from common import custom_feature_extraction_markuplm |
|
|
|
def split_sliding_data(items, window_size, overlap):
    """Split each item's parallel lists into overlapping windows.

    Each input item is a dict holding parallel lists under 'nodes',
    'xpaths' and (optionally) 'labels'. The lists are sliced into
    windows of at most ``window_size`` elements, each window starting
    ``window_size - overlap`` elements after the previous one, so
    consecutive windows share ``overlap`` elements.

    Args:
        items: iterable of dicts with 'nodes'/'xpaths' lists and optional
            'Index', 'Url', 'Path', 'labels' entries.
        window_size: maximum number of elements per window.
        overlap: number of elements shared between consecutive windows;
            must be strictly smaller than window_size.

    Returns:
        A flat list of window dicts with the same keys ('Index'/'Url'/'Path'
        default to 0/None/None when absent; 'labels' is None when absent).

    Raises:
        ValueError: if window_size <= overlap (the stride would be <= 0).
    """
    step = window_size - overlap
    if step <= 0:
        raise ValueError("window_size must be greater than overlap")

    new_data = []
    for obj in items:
        nodes = obj['nodes']
        labels = obj.get('labels')
        num_elements = len(nodes)
        for start in range(0, num_elements, step):
            end = min(start + window_size, num_elements)
            new_data.append({
                'Index': obj.get('Index', 0),
                'Url': obj.get('Url'),
                'Path': obj.get('Path'),
                'nodes': nodes[start:end],
                'xpaths': obj['xpaths'][start:end],
                # tolerate an explicit None under 'labels' as well as a
                # missing key (the original crashed on the former)
                'labels': labels[start:end] if labels is not None else None,
            })
    return new_data
|
|
|
|
|
|
|
def get_html_content(url, timeout=10):
    """Fetch a URL and return its body text, or None on any failure.

    Args:
        url: address to fetch with an HTTP GET.
        timeout: seconds to wait for connect/read before giving up
            (new, backward-compatible parameter — the original call had
            no timeout and could hang forever on a stalled server).

    Returns:
        The response body as text on HTTP 200, otherwise None
        (non-200 status, network error, or timeout).
    """
    try:
        # timeout prevents an unresponsive host from blocking the caller
        response = requests.get(url, timeout=timeout)
    except Exception as e:
        # deliberate best-effort: report and signal failure with None
        print("Error fetching HTML content:", e)
        return None
    if response.status_code == 200:
        return response.text
    return None
|
|
|
|
|
def clean_html(html):
    """Strip <style> and <script> elements from HTML and collapse whitespace.

    Args:
        html: raw HTML markup as a string.

    Returns:
        The serialized markup with all style/script subtrees removed and
        every run of whitespace replaced by a single space.
    """
    soup = BeautifulSoup(html, "html.parser")

    # remove presentation/behavior subtrees entirely (tag + contents)
    for tag in soup(['style', 'script']):
        tag.decompose()

    # renamed from `clean_html`: the original local shadowed the function name
    cleaned = re.sub(r'\s+', ' ', str(soup))
    return cleaned
|
|
|
|
|
def extract_nodes_and_feautures(html_content):
    """Extract node texts, xpaths and labels from an HTML document.

    Args:
        html_content: raw HTML string, or a falsy value (None/"") when the
            fetch failed upstream.

    Returns:
        A dict with keys 'nodes', 'xpaths', 'labels' holding the first
        example of the extractor's batch output, or an empty pandas Series
        when html_content is falsy (kept for backward compatibility with
        existing callers).
    """
    if not html_content:
        # explicit dtype avoids pandas' "default dtype for empty Series"
        # deprecation warning while keeping the same empty-Series contract
        return pd.Series(dtype=object)

    # clean_html parses with html.parser itself, so the extra
    # BeautifulSoup round-trip the original did here was redundant
    cleaned_html = clean_html(html_content)

    feature_extractor = custom_feature_extraction_markuplm.CustomMarkupLMFeatureExtractor(None)
    encoding = feature_extractor(cleaned_html)

    # NOTE(review): assumes the extractor returns batched lists under
    # 'nodes'/'xpaths'/'labels'; we surface only the first example
    return {
        'nodes': encoding['nodes'][0],
        'xpaths': encoding['xpaths'][0],
        'labels': encoding['labels'][0],
    }
|
|