# NOTE: extraction artifact (file-size/commit banner and line-number gutter) removed.
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from common import custom_feature_extraction_markuplm
def split_sliding_data(items, window_size, overlap):
    """Split each item's parallel node lists into overlapping windows.

    Parameters
    ----------
    items : iterable of dict
        Each dict must contain 'nodes' and 'xpaths' lists (assumed to be the
        same length -- TODO confirm with callers); 'labels', 'Index', 'Url'
        and 'Path' are optional.
    window_size : int
        Maximum number of nodes per window.
    overlap : int
        Number of nodes shared between consecutive windows. Must be strictly
        smaller than window_size.

    Returns
    -------
    list of dict
        One dict per window, carrying the source item's metadata and the
        sliced 'nodes'/'xpaths'/'labels' ('labels' is None when absent).

    Raises
    ------
    ValueError
        If overlap >= window_size: the stride would be zero (range() raises
        an opaque error) or negative (range() silently yields nothing and
        all data would be dropped).
    """
    if window_size <= overlap:
        raise ValueError("window_size must be greater than overlap")
    step = window_size - overlap
    new_data = []
    for obj in items:
        num_elements = len(obj['nodes'])
        for start in range(0, num_elements, step):
            end = min(start + window_size, num_elements)
            new_data.append({
                'Index': obj.get('Index', 0),
                'Url': obj.get('Url'),
                'Path': obj.get('Path'),
                'nodes': obj['nodes'][start:end],
                'xpaths': obj['xpaths'][start:end],
                # labels are optional (e.g. inference-time data has none)
                'labels': obj['labels'][start:end] if 'labels' in obj else None,
            })
    return new_data
# Function to fetch HTML content from URL
def get_html_content(url, timeout=10):
    """Fetch the body of *url* via HTTP GET.

    Parameters
    ----------
    url : str
        The URL to fetch.
    timeout : float, optional
        Seconds to wait for the server (default 10). The original call had
        no timeout and could hang indefinitely.

    Returns
    -------
    str or None
        The response text on HTTP 200; None on any other status code or
        on a network error (best-effort behavior is preserved).
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException as e:
        # Narrowed from bare Exception: only network/HTTP failures are
        # expected here; other exceptions should surface to the caller.
        print("Error fetching HTML content:", e)
        return None
# Function to clean HTML content
def clean_html(html):
    """Strip <style>/<script> tags from *html* and collapse whitespace.

    Parameters
    ----------
    html : str
        Raw HTML markup.

    Returns
    -------
    str
        The serialized markup with style/script subtrees removed and every
        run of whitespace (spaces, newlines, tabs) reduced to one space.
    """
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(['style', 'script']):
        # decompose() removes the tag and its entire subtree
        tag.decompose()
    # NOTE: the local was previously also named `clean_html`, shadowing this
    # function; renamed to avoid accidental rebinding hazards.
    cleaned = re.sub(r'\s+', ' ', str(soup))
    return cleaned
# Function to extract HTML content from URL and save to new dataset
def extract_nodes_and_feautures(html_content):
    """Run the MarkupLM feature extractor over cleaned HTML.

    Parameters
    ----------
    html_content : str or None
        Raw HTML markup; falsy values short-circuit to an empty result.

    Returns
    -------
    dict or pandas.Series
        A dict with 'nodes', 'xpaths' and 'labels' (first batch entry of
        the extractor's encoding) on success; an empty pandas Series when
        *html_content* is falsy.
    """
    # Guard clause: nothing to extract from empty/None input.
    if not html_content:
        return pd.Series()
    soup = BeautifulSoup(html_content, 'html.parser')
    cleaned_markup = clean_html(str(soup))
    extractor = custom_feature_extraction_markuplm.CustomMarkupLMFeatureExtractor(None)
    encoding = extractor(cleaned_markup)
    # The extractor returns batched lists; this file only ever feeds a
    # single document, so take batch index 0.
    return {
        'nodes': encoding['nodes'][0],
        'xpaths': encoding['xpaths'][0],
        'labels': encoding['labels'][0],
    }