supunTE committed
Commit 814b935 · 1 Parent(s): f2fe0c9

create streamlit app

Files changed (3)
  1. app.py +286 -0
  2. requirements.txt +132 -0
  3. scrape-content.ipynb +3 -1
app.py ADDED
@@ -0,0 +1,286 @@
+import base64
+import copy
+import json
+from collections import Counter
+from urllib.parse import urljoin
+
+import streamlit as st
+from bs4 import BeautifulSoup
+
+
+def remove_svg_elements(element):
+    """
+    Remove all SVG elements from a BeautifulSoup element.
+    Returns a copy of the element with SVGs removed.
+    """
+    # Create a copy of the element to avoid modifying the original
+    element_copy = copy.copy(element)
+
+    # Find and remove all SVG elements
+    if hasattr(element_copy, 'find_all'):
+        svg_elements = element_copy.find_all('svg')
+        for svg in svg_elements:
+            svg.decompose()
+
+    return element_copy
+
+def get_element_signature(element):
+    """
+    Create a signature for an element based on its structure.
+    """
+    signature = {
+        'tag': element.name,
+        'classes': tuple(sorted(element.get('class', []))),
+        'child_tags': tuple(sorted(child.name for child in element.find_all(recursive=False) if child.name)),
+        'has_image': bool(element.find('img')),
+        'has_price': bool(any(c in element.get_text() for c in '$€£¥')),
+        'has_link': bool(element.find('a')),
+    }
+    return str(signature)
+
+def analyze_children_similarity(element):
+    """
+    Analyze how similar the direct children of an element are.
+    """
+    if not element.contents:
+        return 0, 0
+
+    child_signatures = [
+        get_element_signature(child)
+        for child in element.find_all(recursive=False)
+        if child.name
+    ]
+
+    if not child_signatures:
+        return 0, 0
+
+    signature_counts = Counter(child_signatures)
+    most_common_sig, most_common_count = signature_counts.most_common(1)[0]
+    similarity_score = most_common_count / len(child_signatures)
+
+    return similarity_score, most_common_count
+
+def count_images_in_element(element):
+    """
+    Count all images within an element, including nested ones.
+    """
+    return len(element.find_all('img', recursive=True))
+
+def get_element_identifier(element):
+    """
+    Create a unique identifier for an element including tag and classes.
+    """
+    identifier = element.name
+    if element.get('class'):
+        identifier += f" .{' .'.join(element['class'])}"
+    if element.get('id'):
+        identifier += f" #{element['id']}"
+    return identifier
+
+def convert_relative_urls(soup, base_url):
+    """
+    Convert all relative URLs in the soup object to absolute URLs.
+    """
+    for tag in soup.find_all(href=True):
+        tag['href'] = urljoin(base_url, tag['href'])
+    for tag in soup.find_all(src=True):
+        tag['src'] = urljoin(base_url, tag['src'])
+    for tag in soup.find_all(attrs={'data-src': True}):
+        tag['data-src'] = urljoin(base_url, tag['data-src'])
+    return soup
+
+def find_image_rich_parents(html_content, base_url="", min_children=4, min_similarity=0.7):
+    """
+    Find elements containing images and return both sorted list and detailed top element info.
+    """
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    # Convert relative URLs to absolute if base_url is provided
+    if base_url:
+        soup = convert_relative_urls(soup, base_url)
+
+    # Collect potential container elements with their scores
+    elements_with_scores = []
+    for element in soup.find_all():
+        if element.name in ['div', 'ul', 'section', 'main']:
+            similarity_score, similar_children_count = analyze_children_similarity(element)
+            image_count = count_images_in_element(element)
+
+            if similar_children_count >= min_children and similarity_score >= min_similarity and image_count > 0:
+                # Count products (direct children with images)
+                products_count = len([child for child in element.find_all(recursive=False)
+                                      if child.name and child.find('img', recursive=True)])
+
+                combined_score = (similarity_score * similar_children_count * image_count)
+                elements_with_scores.append((element, image_count, combined_score, products_count))
+
+    if not elements_with_scores:
+        return [], {"error": "No elements with images found"}, ""
+
+    # Sort by combined score
+    elements_with_scores.sort(key=lambda x: x[2], reverse=True)
+
+    # Process elements for sorted list output
+    sorted_elements = []
+    for element, image_count, _, products_count in elements_with_scores:
+        sorted_elements.append((get_element_identifier(element), image_count, products_count))
+
+    # Get top element (the one with the highest combined score)
+    top_element = elements_with_scores[0][0]
+
+    # Remove SVGs from the top element for HTML output
+    top_element_no_svg = remove_svg_elements(top_element)
+
+    # Separate child elements with images
+    products = []
+    for child in top_element_no_svg.find_all(recursive=False):
+        if child.name:  # Skip text nodes
+            # Remove SVGs from each product
+            child_no_svg = remove_svg_elements(child)
+            product_info = {
+                "html_content": str(child_no_svg),
+                "images": []
+            }
+
+            # Get all images within this product
+            for img in child_no_svg.find_all('img', recursive=True):
+                image_info = {
+                    "src": img.get('src', 'No source'),
+                    "alt": img.get('alt', 'No alt text')
+                }
+                product_info["images"].append(image_info)
+
+            products.append(product_info)
+
+    # Create result dictionary for top element
+    top_element_info = {
+        "parent": {
+            "tag": top_element_no_svg.name,
+            "identifier": get_element_identifier(top_element_no_svg),
+            "classes": top_element_no_svg.get('class', []),
+            "id": top_element_no_svg.get('id', None)
+        },
+        "products_count": len(products),
+        "products": products
+    }
+
+    html_output = str(top_element_no_svg)
+
+    return sorted_elements, top_element_info, html_output
+
+def get_download_link(content, filename, content_type="application/json"):
+    """Generate a download link for the given content."""
+    b64 = base64.b64encode(content.encode()).decode()
+    return f'<a href="data:{content_type};base64,{b64}" download="{filename}">Download {filename}</a>'
+
+def main():
+    st.title("HTML File Analyzer")
+    st.write("Upload HTML files to analyze their structure and find image-rich elements")
+
+    # File uploader allows multiple files
+    uploaded_files = st.file_uploader("Choose HTML files", accept_multiple_files=True, type=['html'])
+
+    if uploaded_files:
+        all_results = {}
+        all_html_outputs = {}
+
+        # Analysis parameters
+        col1, col2 = st.columns(2)
+        with col1:
+            min_children = st.slider("Minimum number of similar children", 1, 10, 4)
+        with col2:
+            min_similarity = st.slider("Minimum similarity score", 0.0, 1.0, 0.7)
+
+        # Generate button
+        if st.button("Generate Analysis"):
+            # Show processing message
+            with st.spinner('Processing files...'):
+                all_results = {}
+                all_html_outputs = {}
+
+                # Process each file
+                for uploaded_file in uploaded_files:
+                    st.subheader(f"Analysis for {uploaded_file.name}")
+
+                    try:
+                        # Read and process the file
+                        html_content = uploaded_file.read().decode('utf-8')
+                        sorted_elements, top_element_info, html_output = find_image_rich_parents(
+                            html_content,
+                            min_children=min_children,
+                            min_similarity=min_similarity
+                        )
+
+                        # Display results
+                        st.write("Elements containing images:")
+                        for element, img_count, prod_count in sorted_elements:
+                            st.write(f"- {element}: {img_count} images, {prod_count} products")
+
+                        # Store results
+                        all_results[uploaded_file.name] = top_element_info
+                        all_html_outputs[uploaded_file.name] = html_output
+
+                    except Exception as e:
+                        st.error(f"Error processing {uploaded_file.name}: {str(e)}")
+                        continue
+
+                # Create download buttons if we have results
+                if all_results:
+                    st.subheader("Download Results")
+                    col1, col2 = st.columns(2)
+
+                    # JSON download
+                    with col1:
+                        json_str = json.dumps(all_results, indent=2)
+                        st.markdown(get_download_link(json_str, 'analysis_results.json'),
+                                    unsafe_allow_html=True)
+
+                    # HTML download
+                    with col2:
+                        # Combine all HTML outputs with file names as headers
+                        combined_html = """
+                        <!DOCTYPE html>
+                        <html>
+                        <head>
+                        <meta charset='UTF-8'>
+                        <style>
+                            div {
+                                width: auto !important;
+                                height: auto !important;
+                                padding: 0 !important;
+                                margin: 0 !important;
+                            }
+                            img {
+                                width: 300px;
+                                height: 300px;
+                                object-fit: contain;
+                            }
+                            body { font-family: Arial, sans-serif; }
+                            .file-section { margin: 20px 0; }
+                            .file-header {
+                                background: #f0f0f0;
+                                padding: 10px;
+                                margin: 20px 0;
+                            }
+                        </style>
+                        </head>
+                        <body>
+                        """
+                        for filename, html in all_html_outputs.items():
+                            combined_html += f"""
+                            <div class="file-section">
+                                <h2 class="file-header">{filename}</h2>
+                                {html}
+                            </div>
+                            """
+                        combined_html += "</body></html>"
+
+                        st.markdown(get_download_link(combined_html, 'analysis_results.html', 'text/html'),
+                                    unsafe_allow_html=True)
+
+                # Success message
+                st.success("Analysis completed successfully!")
+
+
+if __name__ == "__main__":
+    main()
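
With the pinned dependencies from requirements.txt below installed, the new app can be launched locally with the standard Streamlit command:

streamlit run app.py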
requirements.txt ADDED
@@ -0,0 +1,132 @@
+altair==5.4.1
+anyio==4.6.2.post1
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+black==24.10.0
+bleach==6.1.0
+blinker==1.8.2
+bs4==0.0.2
+cachetools==5.5.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.4.0
+click==8.1.7
+comm==0.2.2
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fake-headers==1.0.2
+fastjsonschema==2.20.0
+fqdn==1.5.1
+gitdb==4.0.11
+GitPython==3.1.43
+h11==0.14.0
+html5lib==1.1
+httpcore==1.0.6
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.28.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+Jinja2==3.1.4
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jupyter==1.1.1
+jupyter-console==6.6.3
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mistune==3.0.2
+mypy-extensions==1.0.0
+narwhals==1.11.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook==7.2.2
+notebook_shim==0.2.4
+numpy==2.1.2
+outcome==1.3.0.post0
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+pathspec==0.12.1
+pexpect==4.9.0
+pillow==10.4.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+protobuf==5.28.3
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pyarrow==17.0.0
+pycparser==2.22
+pydeck==0.9.1
+Pygments==2.18.0
+PySocks==1.7.1
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+pytz==2024.2
+PyYAML==6.0.2
+pyzmq==26.2.0
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.3
+rpds-py==0.20.0
+selenium==4.25.0
+Send2Trash==1.8.3
+setuptools==75.2.0
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+streamlit==1.39.0
+tenacity==9.0.0
+terminado==0.18.1
+tinycss2==1.4.0
+toml==0.10.2
+tornado==6.4.1
+traitlets==5.14.3
+trio==0.27.0
+trio-websocket==0.11.1
+types-python-dateutil==2.9.0.20241003
+typing_extensions==4.12.2
+tzdata==2024.2
+uri-template==1.3.0
+urllib3==2.2.3
+watchdog==5.0.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+wsproto==1.2.0
scrape-content.ipynb CHANGED
@@ -207,6 +207,7 @@
     "    if not element.contents:\n",
     "        return 0, 0\n",
     "\n",
+    "    # Get signatures for all direct children that are elements (have a tag name)\n",
     "    child_signatures = [\n",
     "        get_element_signature(child)\n",
     "        for child in element.find_all(recursive=False)\n",
@@ -216,9 +217,10 @@
     "    if not child_signatures:\n",
     "        return 0, 0\n",
     "\n",
+    "    # Count how many times each signature appears and get the most common one\n",
     "    signature_counts = Counter(child_signatures)\n",
     "    most_common_sig, most_common_count = signature_counts.most_common(1)[0]\n",
-    "    similarity_score = most_common_count / len(child_signatures) if child_signatures else 0\n",
+    "    similarity_score = most_common_count / len(child_signatures)\n",
     "\n",
     "    return similarity_score, most_common_count\n",
     "\n",