Spaces:

supunTE
/

products-extracter

Sleeping

App Files Files Community

supunTE committed on Oct 28, 2024

Commit

f2fe0c9

1 Parent(s): 112a5ac

Use element signatures to identify similar children

Browse files

Files changed (2) hide show

draft.py +61 -0
scrape-content.ipynb +130 -195

draft.py ADDED Viewed

	@@ -0,0 +1,61 @@

+#
+#
+# def is_likely_product_card(element, min_text_length=10):
+#     """
+#     Determine if an element is likely to be a product card based on various heuristics.
+#     """
+#     # 1. Check for common product card class/id patterns
+#     identifier = element.get('class', []) + [element.get('id', '')]
+#     product_patterns = ['product', 'item', 'card', 'goods', 'listing']
+#     if any(any(pattern in str(attr).lower() for pattern in product_patterns) for attr in identifier):
+#         return True
+#
+#     # 2. Check for price patterns
+#     text_content = element.get_text()
+#     price_patterns = [
+#         r'\$\d+\.?\d*',  # USD
+#         r'£\d+\.?\d*',  # GBP
+#         r'€\d+\.?\d*',  # EUR
+#         r'\d+\.?\d*\s*USD',
+#         r'\d+\.?\d*\s*EUR'
+#     ]
+#     if any(re.search(pattern, text_content) for pattern in price_patterns):
+#         return True
+#
+#     # 3. Check for minimum text content (excluding whitespace)
+#     clean_text = ' '.join(text_content.split())
+#     if len(clean_text) < min_text_length:
+#         return False
+#
+#     # 4. Check for typical product card elements
+#     has_title = bool(element.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
+#
+#     return has_title
+#
+#
+# def should_exclude_element(element):
+#     # """
+#     # Check if an element should be excluded from consideration.
+#     # """
+#
+#     # 1. Exclude common non-product sections
+#     exclude_patterns = [
+#         'filter', 'filters', 'sidebar', 'menu', 'nav', 'header', 'footer', 'cart',
+#         'search', 'pagination', 'sort', 'banner', 'ad', 'slider'
+#     ]
+#
+#     # Check class and id
+#     element_classes = ' '.join(element.get('class', [])).replace("-", " ").replace("_", " ").lower().split()
+#     element_id = str(element.get('id', '')).replace("-", " ").replace("_", " ").lower().split()
+#
+#     print(element_classes)
+#
+#     for pattern in exclude_patterns:
+#         if pattern in element_classes:
+#             print(f"Excluded element due to class containing '{pattern}'")
+#             return True
+#         if pattern in element_id:
+#             print(f"Excluded element due to id containing '{pattern}'")
+#             return True
+#
+#     return False

scrape-content.ipynb CHANGED Viewed

@@ -3,8 +3,8 @@
   {
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-10-25T10:32:47.963356Z",
-     "start_time": "2024-10-25T10:32:47.950533Z"
     }
    },
    "cell_type": "code",
@@ -21,24 +21,24 @@
       "text/plain": [
        "{'Accept': '*/*',\n",
        " 'Connection': 'keep-alive',\n",
-       " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6; rv:61.0) Gecko/20100101 Firefox/61.0',\n",
-       " 'Cache-Control': 'max-age=0',\n",
-       " 'Upgrade-Insecure-Requests': '1',\n",
        " 'Referer': 'https://google.com'}"
       ]
      },
-     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
-   "execution_count": 9
   },
   {
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-10-25T10:32:49.821005Z",
-     "start_time": "2024-10-25T10:32:49.798988Z"
     }
    },
    "cell_type": "code",
@@ -103,22 +103,22 @@
    ],
    "id": "11933d956e20b6b8",
    "outputs": [],
-   "execution_count": 10
   },
   {
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-10-25T10:33:23.469518Z",
-     "start_time": "2024-10-25T10:32:53.382666Z"
     }
    },
    "cell_type": "code",
    "source": [
     "chrome_options = Options()\n",
-    "chrome_options.add_argument(\"--headless\")\n",
-    "chrome_options.add_argument(\"--disable-gpu\")\n",
-    "chrome_options.add_argument(\"--no-sandbox\")\n",
-    "chrome_options.add_argument(\"--disable-dev-shm-usage\")\n",
     "\n",
     "# Add fake headers\n",
     "for key, value in headers.items():\n",
@@ -169,39 +169,58 @@
    ],
    "id": "ac14cff825f0887f",
    "outputs": [],
-   "execution_count": 11
   },
   {
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-10-25T12:57:47.980010Z",
-     "start_time": "2024-10-25T12:57:47.962944Z"
     }
    },
    "cell_type": "code",
    "source": [
     "from urllib.parse import urljoin\n",
     "import json\n",
     "\n",
     "\n",
-    "def convert_relative_urls(soup, base_url):\n",
     "    \"\"\"\n",
-    "    Convert all relative URLs in the soup object to absolute URLs.\n",
-    "    Handles href, src, and data-src attributes.\n",
     "    \"\"\"\n",
-    "    # Convert href attributes (links)\n",
-    "    for tag in soup.find_all(href=True):\n",
-    "        tag['href'] = urljoin(base_url, tag['href'])\n",
     "\n",
-    "    # Convert src attributes (images, scripts, etc.)\n",
-    "    for tag in soup.find_all(src=True):\n",
-    "        tag['src'] = urljoin(base_url, tag['src'])\n",
     "\n",
-    "    # Convert data-src attributes (lazy loaded images)\n",
-    "    for tag in soup.find_all(attrs={'data-src': True}):\n",
-    "        tag['data-src'] = urljoin(base_url, tag['data-src'])\n",
     "\n",
-    "    return soup\n",
     "\n",
     "\n",
     "def count_images_in_element(element):\n",
@@ -223,104 +242,108 @@
     "    return identifier\n",
     "\n",
     "\n",
-    "def has_child_with_same_count(element, image_count, all_elements_with_counts):\n",
     "    \"\"\"\n",
-    "    Check if the element has any child with the same image count.\n",
     "    \"\"\"\n",
-    "    for other_element, other_count in all_elements_with_counts:\n",
-    "        if other_count == image_count and other_element != element:\n",
-    "            if any(parent == element for parent in other_element.parents):\n",
-    "                return True\n",
-    "    return False\n",
     "\n",
     "\n",
-    "def find_image_rich_parents(soup, base_url):\n",
     "    \"\"\"\n",
     "    Find elements containing images and return both sorted list and detailed top element info.\n",
     "    \"\"\"\n",
     "    # Convert relative URLs to absolute\n",
     "    soup = convert_relative_urls(soup, base_url)\n",
     "\n",
-    "    # Collect all elements with their image counts\n",
-    "    elements_with_counts = []\n",
     "    for element in soup.find_all():\n",
-    "        if element.name != 'img':  # Skip img tags themselves\n",
     "            image_count = count_images_in_element(element)\n",
-    "            if image_count > 0:\n",
-    "                elements_with_counts.append((element, image_count))\n",
     "\n",
-    "    # Sort by image count in descending order\n",
-    "    elements_with_counts.sort(key=lambda x: x[1], reverse=True)\n",
     "\n",
-    "    if not elements_with_counts:\n",
     "        return [], {\"error\": \"No elements with images found\"}, \"\"\n",
     "\n",
-    "    # Process elements for sorted list\n",
-    "    sorted_elements = []\n",
-    "    max_count = elements_with_counts[0][1]\n",
-    "    current_count = max_count\n",
-    "\n",
-    "    while current_count > 0 and len(sorted_elements) < 100:\n",
-    "        # Get all elements with current count\n",
-    "        current_elements = [(elem, count) for elem, count in elements_with_counts if count == current_count]\n",
-    "\n",
-    "        # Filter out elements that have children with the same count\n",
-    "        for element, count in current_elements:\n",
-    "            if not has_child_with_same_count(element, count, elements_with_counts):\n",
-    "                sorted_elements.append((get_element_identifier(element), count))\n",
-    "\n",
-    "        # Move to next highest count\n",
-    "        remaining_counts = [count for _, count in elements_with_counts if count < current_count]\n",
-    "        current_count = max(remaining_counts) if remaining_counts else 0\n",
-    "\n",
-    "    # Get detailed info for top element\n",
-    "    top_element = elements_with_counts[0][0]\n",
-    "    max_count = elements_with_counts[0][1]\n",
-    "\n",
-    "    # Find the lowest-level element among those with max count\n",
-    "    for element, count in elements_with_counts:\n",
-    "        if count == max_count and not has_child_with_same_count(element, count, elements_with_counts):\n",
-    "            top_element = element\n",
-    "            break\n",
-    "\n",
-    "    # Collect all images within the top element\n",
-    "    images = []\n",
-    "    for img in top_element.find_all('img', recursive=True):\n",
-    "        image_data = {\n",
-    "            \"src\": img.get('src', 'No source'),\n",
-    "            \"alt\": img.get('alt', 'No alt text')\n",
-    "        }\n",
-    "        for attr in ['title', 'width', 'height', 'class']:\n",
-    "            if img.get(attr):\n",
-    "                image_data[attr] = img[attr]\n",
-    "        images.append(image_data)\n",
     "\n",
-    "    # Create result dictionary for top element\n",
     "    top_element_info = {\n",
-    "        \"element\": {\n",
     "            \"tag\": top_element.name,\n",
     "            \"identifier\": get_element_identifier(top_element),\n",
     "            \"classes\": top_element.get('class', []),\n",
     "            \"id\": top_element.get('id', None)\n",
     "        },\n",
-    "        \"image_count\": max_count,\n",
-    "        \"images\": images,\n",
-    "        \"html_content\": str(top_element)\n",
     "    }\n",
     "\n",
     "    # Create styled HTML output\n",
     "    style_tag = \"\"\"\n",
     "    <style>\n",
     "        img {\n",
     "            width: 300px;\n",
     "            height: 300px;\n",
     "            object-fit: contain;\n",
     "        }\n",
     "    </style>\n",
     "    \"\"\"\n",
     "    html_output = style_tag + str(top_element)\n",
     "\n",
-    "    return sorted_elements, top_element_info, html_output\n",
     "\n",
     "\n",
     "def print_results(element_list):\n",
@@ -341,7 +364,7 @@
    ],
    "id": "3830f2e224e84798",
    "outputs": [],
-   "execution_count": 46
   },
   {
    "metadata": {},
@@ -352,8 +375,8 @@
   {
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-10-25T13:00:02.064761Z",
-     "start_time": "2024-10-25T13:00:01.277970Z"
     }
    },
    "cell_type": "code",
@@ -365,7 +388,7 @@
     "print_results(sorted_elements)\n",
     "\n",
     "with open(\"output1.json\", \"w\") as file:\n",
-    "    file.write(json.dumps(top_element_info, indent=2))\n",
     "\n",
     "with open(\"output1.html\", \"w\") as file:\n",
     "    file.write(html_output)"
@@ -376,112 +399,24 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "\n",
       "Elements Containing Most Images (Lowest Level for Each Count):\n",
       "----------------------------------------------------------------------\n",
       "Rank  Element Tag & Classes                           Image Count\n",
       "----------------------------------------------------------------------\n",
-      "1.    div                                           63\n",
-      "2.    div .sc-5da3fdcc-0 .cqdDWw                    58\n",
-      "3.    div                                           5\n",
-      "4.    div .l-container-fixed .h-padding-h-x6 .h-padding-t-x6 .h-display-flex .h-flex-direction-row .h-flex-justify-space-between 4\n",
-      "5.    div                                           3\n",
-      "6.    a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "7.    a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "8.    a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "9.    a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "10.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "11.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "12.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "13.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "14.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "15.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "16.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "17.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "18.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "19.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "20.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "21.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "22.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "23.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "24.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "25.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "26.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "27.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "28.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "29.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "30.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "31.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "32.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "33.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "34.   a .sc-e851bd29-0 .dmfVmE .h-display-block     2\n",
-      "35.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "36.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "37.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "38.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "39.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "40.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "41.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "42.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "43.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "44.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "45.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "46.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "47.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "48.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "49.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "50.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "51.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "52.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "53.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "54.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "55.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "56.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "57.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "58.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "59.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "60.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "61.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "62.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "63.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "64.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "65.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "66.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "67.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "68.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "69.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "70.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "71.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "72.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "73.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "74.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "75.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "76.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "77.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "78.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "79.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "80.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "81.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "82.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "83.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "84.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "85.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "86.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "87.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "88.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "89.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "90.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "91.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "92.   picture .sc-68a8cd0e-0 .ldZWSf                1\n",
-      "93.   span                                          1\n",
-      "94.   span                                          1\n",
-      "95.   span                                          1\n",
-      "96.   span                                          1\n",
-      "97.   div .Illustration_IllustrationWrapper__xJP5g  1\n"
      ]
     }
    ],
-   "execution_count": 49
   }
  ],
  "metadata": {

   {
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2024-10-26T16:28:29.519384Z",
+     "start_time": "2024-10-26T16:28:29.506673Z"
     }
    },
    "cell_type": "code",
       "text/plain": [
        "{'Accept': '*/*',\n",
        " 'Connection': 'keep-alive',\n",
+       " 'User-Agent': 'Mozilla/5.0 (X11; Linux i686 on x86_64; rv:60.3.0) Gecko/20100101 Firefox/60.3.0',\n",
+       " 'Accept-Language': 'en-US;q=0.5,en;q=0.3',\n",
+       " 'DNT': '1',\n",
        " 'Referer': 'https://google.com'}"
       ]
      },
+     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
+   "execution_count": 1
   },
   {
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2024-10-26T16:28:29.719882Z",
+     "start_time": "2024-10-26T16:28:29.531148Z"
     }
    },
    "cell_type": "code",
    ],
    "id": "11933d956e20b6b8",
    "outputs": [],
+   "execution_count": 2
   },
   {
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2024-10-26T16:29:00.452959Z",
+     "start_time": "2024-10-26T16:28:29.721884Z"
     }
    },
    "cell_type": "code",
    "source": [
     "chrome_options = Options()\n",
+    "# chrome_options.add_argument(\"--headless\")\n",
+    "# chrome_options.add_argument(\"--disable-gpu\")\n",
+    "# chrome_options.add_argument(\"--no-sandbox\")\n",
+    "# chrome_options.add_argument(\"--disable-dev-shm-usage\")\n",
     "\n",
     "# Add fake headers\n",
     "for key, value in headers.items():\n",
    ],
    "id": "ac14cff825f0887f",
    "outputs": [],
+   "execution_count": 3
   },
   {
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2024-10-26T16:54:58.190620Z",
+     "start_time": "2024-10-26T16:54:58.165031Z"
     }
    },
    "cell_type": "code",
    "source": [
     "from urllib.parse import urljoin\n",
     "import json\n",
+    "from collections import Counter\n",
     "\n",
     "\n",
+    "def get_element_signature(element):\n",
     "    \"\"\"\n",
+    "    Create a signature for an element based on its structure.\n",
     "    \"\"\"\n",
+    "    signature = {\n",
+    "        'tag': element.name,\n",
+    "        'classes': tuple(sorted(element.get('class', []))),\n",
+    "        'child_tags': tuple(sorted(child.name for child in element.find_all(recursive=False) if child.name)),\n",
+    "        'has_image': bool(element.find('img')),\n",
+    "        'has_price': bool(any(c in element.get_text() for c in '$€£¥')),\n",
+    "        'has_link': bool(element.find('a')),\n",
+    "    }\n",
+    "    return str(signature)\n",
     "\n",
     "\n",
+    "def analyze_children_similarity(element):\n",
+    "    \"\"\"\n",
+    "    Analyze how similar the direct children of an element are.\n",
+    "    \"\"\"\n",
+    "    if not element.contents:\n",
+    "        return 0, 0\n",
     "\n",
+    "    child_signatures = [\n",
+    "        get_element_signature(child)\n",
+    "        for child in element.find_all(recursive=False)\n",
+    "        if child.name\n",
+    "    ]\n",
+    "\n",
+    "    if not child_signatures:\n",
+    "        return 0, 0\n",
+    "\n",
+    "    signature_counts = Counter(child_signatures)\n",
+    "    most_common_sig, most_common_count = signature_counts.most_common(1)[0]\n",
+    "    similarity_score = most_common_count / len(child_signatures) if child_signatures else 0\n",
+    "\n",
+    "    return similarity_score, most_common_count\n",
     "\n",
     "\n",
     "def count_images_in_element(element):\n",
     "    return identifier\n",
     "\n",
     "\n",
+    "def convert_relative_urls(soup, base_url):\n",
     "    \"\"\"\n",
+    "    Convert all relative URLs in the soup object to absolute URLs.\n",
     "    \"\"\"\n",
+    "    for tag in soup.find_all(href=True):\n",
+    "        tag['href'] = urljoin(base_url, tag['href'])\n",
+    "    for tag in soup.find_all(src=True):\n",
+    "        tag['src'] = urljoin(base_url, tag['src'])\n",
+    "    for tag in soup.find_all(attrs={'data-src': True}):\n",
+    "        tag['data-src'] = urljoin(base_url, tag['data-src'])\n",
+    "    return soup\n",
     "\n",
     "\n",
+    "def find_image_rich_parents(soup, base_url, min_children=4, min_similarity=0.7):\n",
     "    \"\"\"\n",
     "    Find elements containing images and return both sorted list and detailed top element info.\n",
     "    \"\"\"\n",
     "    # Convert relative URLs to absolute\n",
     "    soup = convert_relative_urls(soup, base_url)\n",
     "\n",
+    "    # Collect potential container elements with their scores\n",
+    "    elements_with_scores = []\n",
     "    for element in soup.find_all():\n",
+    "        if element.name in ['div', 'ul', 'section', 'main']:\n",
+    "            similarity_score, similar_children_count = analyze_children_similarity(element)\n",
     "            image_count = count_images_in_element(element)\n",
     "\n",
+    "            if similar_children_count >= min_children and similarity_score >= min_similarity and image_count > 0:\n",
+    "                # Calculate combined score based on similarity and image count\n",
+    "                combined_score = (similarity_score * similar_children_count * image_count)\n",
+    "                elements_with_scores.append((element, image_count, combined_score))\n",
     "\n",
+    "    if not elements_with_scores:\n",
     "        return [], {\"error\": \"No elements with images found\"}, \"\"\n",
     "\n",
+    "    # Sort by combined score\n",
+    "    elements_with_scores.sort(key=lambda x: x[2], reverse=True)\n",
     "\n",
+    "    # Process elements for sorted list output\n",
+    "    sorted_elements = []\n",
+    "    for element, image_count, _ in elements_with_scores:\n",
+    "        sorted_elements.append((get_element_identifier(element), image_count))\n",
+    "\n",
+    "    # Get top element (one with highest combined score)\n",
+    "    top_element = elements_with_scores[0][0]\n",
+    "\n",
+    "    # Separate child elements with images\n",
+    "    products = []\n",
+    "    for child in top_element.find_all(recursive=False):\n",
+    "        if child.name:  # Skip text nodes\n",
+    "            product_info = {\n",
+    "                \"html_content\": str(child),\n",
+    "                \"images\": []\n",
+    "            }\n",
+    "\n",
+    "            # Get all images within this product\n",
+    "            for img in child.find_all('img', recursive=True):\n",
+    "                image_info = {\n",
+    "                    \"src\": img.get('src', 'No source'),\n",
+    "                    \"alt\": img.get('alt', 'No alt text')\n",
+    "                }\n",
+    "                product_info[\"images\"].append(image_info)\n",
+    "\n",
+    "            products.append(product_info)\n",
+    "\n",
+    "    print(len(products))\n",
+    "\n",
+    "    # Create result dictionary for top element   \n",
     "    top_element_info = {\n",
+    "        \"parent\": {\n",
     "            \"tag\": top_element.name,\n",
     "            \"identifier\": get_element_identifier(top_element),\n",
     "            \"classes\": top_element.get('class', []),\n",
     "            \"id\": top_element.get('id', None)\n",
     "        },\n",
+    "        \"products_count\": len(products),\n",
+    "        \"products\": products\n",
     "    }\n",
     "\n",
     "    # Create styled HTML output\n",
     "    style_tag = \"\"\"\n",
     "    <style>\n",
+    "        div {\n",
+    "            width: auto !important;\n",
+    "            height: auto !important;\n",
+    "        }\n",
+    "    \n",
     "        img {\n",
     "            width: 300px;\n",
     "            height: 300px;\n",
     "            object-fit: contain;\n",
     "        }\n",
+    "    \n",
+    "        svg {\n",
+    "            max-height: 10px;\n",
+    "            max-width: 10px;\n",
+    "        }\n",
     "    </style>\n",
     "    \"\"\"\n",
     "    html_output = style_tag + str(top_element)\n",
     "\n",
+    "    return sorted_elements, json.dumps(top_element_info, indent=2), html_output\n",
     "\n",
     "\n",
     "def print_results(element_list):\n",
    ],
    "id": "3830f2e224e84798",
    "outputs": [],
+   "execution_count": 11
   },
   {
    "metadata": {},
   {
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2024-10-26T16:55:03.174631Z",
+     "start_time": "2024-10-26T16:55:02.976453Z"
     }
    },
    "cell_type": "code",
     "print_results(sorted_elements)\n",
     "\n",
     "with open(\"output1.json\", \"w\") as file:\n",
+    "    file.write(top_element_info)\n",
     "\n",
     "with open(\"output1.html\", \"w\") as file:\n",
     "    file.write(html_output)"
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "28\n",
       "\n",
       "Elements Containing Most Images (Lowest Level for Each Count):\n",
       "----------------------------------------------------------------------\n",
       "Rank  Element Tag & Classes                           Image Count\n",
       "----------------------------------------------------------------------\n",
+      "1.    div .sc-5da3fdcc-0 .cqdDWw                    51\n",
+      "2.    div                                           1\n"
      ]
     }
    ],
+   "execution_count": 12
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "",
+   "id": "1465ddb6bce2981c"
   }
  ],
  "metadata": {