import base64 import copy import json from collections import Counter from urllib.parse import urljoin import streamlit as st from bs4 import BeautifulSoup def remove_svg_elements(element): """ Remove all SVG elements from a BeautifulSoup element. Returns a copy of the element with SVGs removed. """ # Create a copy of the element to avoid modifying the original element_copy = copy.copy(element) # Find and remove all SVG elements if hasattr(element_copy, 'find_all'): svg_elements = element_copy.find_all('svg') for svg in svg_elements: svg.decompose() return element_copy def get_element_signature(element): """ Create a signature for an element based on its structure. """ signature = { 'tag': element.name, 'classes': tuple(sorted(element.get('class', []))), 'child_tags': tuple(sorted(child.name for child in element.find_all(recursive=False) if child.name)), 'has_image': bool(element.find('img')), 'has_price': bool(any(c in element.get_text() for c in '$€£¥')), 'has_link': bool(element.find('a')), } return str(signature) def analyze_children_similarity(element): """ Analyze how similar the direct children of an element are. """ if not element.contents: return 0, 0 child_signatures = [ get_element_signature(child) for child in element.find_all(recursive=False) if child.name ] if not child_signatures: return 0, 0 signature_counts = Counter(child_signatures) most_common_sig, most_common_count = signature_counts.most_common(1)[0] similarity_score = most_common_count / len(child_signatures) return similarity_score, most_common_count def count_images_in_element(element): """ Count all images within an element, including nested ones. """ return len(element.find_all('img', recursive=True)) def get_element_identifier(element): """ Create a unique identifier for an element including tag and classes. """ identifier = element.name if element.get('class'): identifier += f" .{' .'.join(element['class'])}" if element.get('id'): identifier += f" #{element['id']}" return identifier def convert_relative_urls(soup, base_url): """ Convert all relative URLs in the soup object to absolute URLs. """ for tag in soup.find_all(href=True): tag['href'] = urljoin(base_url, tag['href']) for tag in soup.find_all(src=True): tag['src'] = urljoin(base_url, tag['src']) for tag in soup.find_all(attrs={'data-src': True}): tag['data-src'] = urljoin(base_url, tag['data-src']) return soup def find_image_rich_parents(html_content, base_url="", min_children=4, min_similarity=0.7): """ Find elements containing images and return both sorted list and detailed top element info. """ soup = BeautifulSoup(html_content, "html.parser") # Convert relative URLs to absolute if base_url is provided if base_url: soup = convert_relative_urls(soup, base_url) # Collect potential container elements with their scores elements_with_scores = [] for element in soup.find_all(): if element.name in ['div', 'ul', 'section', 'main']: similarity_score, similar_children_count = analyze_children_similarity(element) image_count = count_images_in_element(element) if similar_children_count >= min_children and similarity_score >= min_similarity and image_count > 0: # Count products (direct children with images) products_count = len([child for child in element.find_all(recursive=False) if child.name and child.find('img', recursive=True)]) combined_score = (similarity_score * similar_children_count * image_count) elements_with_scores.append((element, image_count, combined_score, products_count)) if not elements_with_scores: return [], {"error": "No elements with images found"}, "" # Sort by combined score elements_with_scores.sort(key=lambda x: x[2], reverse=True) # Process elements for sorted list output sorted_elements = [] for element, image_count, _, products_count in elements_with_scores: sorted_elements.append((get_element_identifier(element), image_count, products_count)) # Get top element (one with highest combined score) top_element = elements_with_scores[0][0] # Remove SVGs from the top element for HTML output top_element_no_svg = remove_svg_elements(top_element) # Separate child elements with images products = [] for child in top_element_no_svg.find_all(recursive=False): if child.name: # Skip text nodes # Remove SVGs from each product child_no_svg = remove_svg_elements(child) product_info = { "html_content": str(child_no_svg), "images": [] } # Get all images within this product for img in child_no_svg.find_all('img', recursive=True): image_info = { "src": img.get('src', 'No source'), "alt": img.get('alt', 'No alt text') } product_info["images"].append(image_info) products.append(product_info) # Create result dictionary for top element top_element_info = { "parent": { "tag": top_element_no_svg.name, "identifier": get_element_identifier(top_element_no_svg), "classes": top_element_no_svg.get('class', []), "id": top_element_no_svg.get('id', None) }, "products_count": len(products), "products": products } html_output = str(top_element_no_svg) return sorted_elements, top_element_info, html_output def get_download_link(content, filename, content_type="file/json"): """Generate a download link for the given content""" b64 = base64.b64encode(content.encode()).decode() return f'Download {filename}' def main(): st.title("HTML File Analyzer") st.write("Upload HTML files to analyze their structure and find image-rich elements") # File uploader allows multiple files uploaded_files = st.file_uploader("Choose HTML files", accept_multiple_files=True, type=['html']) if uploaded_files: all_results = {} all_html_outputs = {} # Analysis parameters col1, col2 = st.columns(2) with col1: min_children = st.slider("Minimum number of similar children", 1, 10, 4) with col2: min_similarity = st.slider("Minimum similarity score", 0.0, 1.0, 0.7) # Generate button if st.button("Generate Analysis"): # Show processing message with st.spinner('Processing files...'): all_results = {} all_html_outputs = {} # Process each file for uploaded_file in uploaded_files: st.subheader(f"Analysis for {uploaded_file.name}") try: # Read and process the file html_content = uploaded_file.read().decode('utf-8') sorted_elements, top_element_info, html_output = find_image_rich_parents( html_content, min_children=min_children, min_similarity=min_similarity ) # Display results st.write("Elements containing images:") for element, img_count, prod_count in sorted_elements: st.write(f"- {element}: {img_count} images, {prod_count} products") # Store results all_results[uploaded_file.name] = top_element_info all_html_outputs[uploaded_file.name] = html_output except Exception as e: st.error(f"Error processing {uploaded_file.name}: {str(e)}") continue # Create download buttons if we have results if all_results: st.subheader("Download Results") col1, col2 = st.columns(2) # JSON download with col1: json_str = json.dumps(all_results, indent=2) st.markdown(get_download_link(json_str, 'analysis_results.json'), unsafe_allow_html=True) # HTML download with col2: # Combine all HTML outputs with file names as headers combined_html = """
""" for filename, html in all_html_outputs.items(): combined_html += f"""