# NOTE: extraction artifact (file-size/commit banner and line-number gutter) removed.
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from common import custom_feature_extraction_markuplm
def split_sliding_data(items, window_size, overlap):
    """Split each item's parallel node lists into overlapping windows.

    Parameters
    ----------
    items : iterable of dict
        Each dict must contain 'nodes' and 'xpaths' lists (assumed to be the
        same length -- TODO confirm with callers); 'labels', 'Index', 'Url'
        and 'Path' are optional.
    window_size : int
        Maximum number of nodes per window.
    overlap : int
        Number of nodes shared between consecutive windows. Must be strictly
        smaller than window_size.

    Returns
    -------
    list of dict
        One dict per window, carrying the source item's metadata and the
        sliced 'nodes'/'xpaths'/'labels' ('labels' is None when absent).

    Raises
    ------
    ValueError
        If overlap >= window_size: the stride would be zero (range() raises
        an opaque error) or negative (range() silently yields nothing and
        all data would be dropped).
    """
    if window_size <= overlap:
        raise ValueError("window_size must be greater than overlap")
    step = window_size - overlap
    new_data = []
    for obj in items:
        num_elements = len(obj['nodes'])
        for start in range(0, num_elements, step):
            end = min(start + window_size, num_elements)
            new_data.append({
                'Index': obj.get('Index', 0),
                'Url': obj.get('Url'),
                'Path': obj.get('Path'),
                'nodes': obj['nodes'][start:end],
                'xpaths': obj['xpaths'][start:end],
                # labels are optional (e.g. inference-time data has none)
                'labels': obj['labels'][start:end] if 'labels' in obj else None,
            })
    return new_data
# Function to fetch HTML content from URL
def get_html_content(url, timeout=10):
    """Fetch the body of *url* via HTTP GET.

    Parameters
    ----------
    url : str
        The URL to fetch.
    timeout : float, optional
        Seconds to wait for the server (default 10). The original call had
        no timeout and could hang indefinitely.

    Returns
    -------
    str or None
        The response text on HTTP 200; None on any other status code or
        on a network error (best-effort behavior is preserved).
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException as e:
        # Narrowed from bare Exception: only network/HTTP failures are
        # expected here; other exceptions should surface to the caller.
        print("Error fetching HTML content:", e)
        return None
# Function to clean HTML content
def clean_html(html):
    """Strip <style>/<script> tags from *html* and collapse whitespace.

    Parameters
    ----------
    html : str
        Raw HTML markup.

    Returns
    -------
    str
        The serialized markup with style/script subtrees removed and every
        run of whitespace (spaces, newlines, tabs) reduced to one space.
    """
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(['style', 'script']):
        # decompose() removes the tag and its entire subtree
        tag.decompose()
    # NOTE: the local was previously also named `clean_html`, shadowing this
    # function; renamed to avoid accidental rebinding hazards.
    cleaned = re.sub(r'\s+', ' ', str(soup))
    return cleaned
# Function to extract HTML content from URL and save to new dataset
def extract_nodes_and_feautures(html_content):
    """Run the MarkupLM feature extractor over cleaned HTML.

    Parameters
    ----------
    html_content : str or None
        Raw HTML markup; falsy values short-circuit to an empty result.

    Returns
    -------
    dict or pandas.Series
        A dict with 'nodes', 'xpaths' and 'labels' (first batch entry of
        the extractor's encoding) on success; an empty pandas Series when
        *html_content* is falsy.
    """
    # Guard clause: nothing to extract from empty/None input.
    if not html_content:
        return pd.Series()
    soup = BeautifulSoup(html_content, 'html.parser')
    cleaned_markup = clean_html(str(soup))
    extractor = custom_feature_extraction_markuplm.CustomMarkupLMFeatureExtractor(None)
    encoding = extractor(cleaned_markup)
    # The extractor returns batched lists; this file only ever feeds a
    # single document, so take batch index 0.
    return {
        'nodes': encoding['nodes'][0],
        'xpaths': encoding['xpaths'][0],
        'labels': encoding['labels'][0],
    }