supunTE committed on
Commit
f2fe0c9
·
1 Parent(s): 112a5ac

use element signature to identify similar children

Browse files
Files changed (2) hide show
  1. draft.py +61 -0
  2. scrape-content.ipynb +130 -195
draft.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ #
3
+ # def is_likely_product_card(element, min_text_length=10):
4
+ # """
5
+ # Determine if an element is likely to be a product card based on various heuristics.
6
+ # """
7
+ # # 1. Check for common product card class/id patterns
8
+ # identifier = element.get('class', []) + [element.get('id', '')]
9
+ # product_patterns = ['product', 'item', 'card', 'goods', 'listing']
10
+ # if any(any(pattern in str(attr).lower() for pattern in product_patterns) for attr in identifier):
11
+ # return True
12
+ #
13
+ # # 2. Check for price patterns
14
+ # text_content = element.get_text()
15
+ # price_patterns = [
16
+ # r'\$\d+\.?\d*', # USD
17
+ # r'£\d+\.?\d*', # GBP
18
+ # r'€\d+\.?\d*', # EUR
19
+ # r'\d+\.?\d*\s*USD',
20
+ # r'\d+\.?\d*\s*EUR'
21
+ # ]
22
+ # if any(re.search(pattern, text_content) for pattern in price_patterns):
23
+ # return True
24
+ #
25
+ # # 3. Check for minimum text content (excluding whitespace)
26
+ # clean_text = ' '.join(text_content.split())
27
+ # if len(clean_text) < min_text_length:
28
+ # return False
29
+ #
30
+ # # 4. Check for typical product card elements
31
+ # has_title = bool(element.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
32
+ #
33
+ # return has_title
34
+ #
35
+ #
36
+ # def should_exclude_element(element):
37
+ # # """
38
+ # # Check if an element should be excluded from consideration.
39
+ # # """
40
+ #
41
+ # # 1. Exclude common non-product sections
42
+ # exclude_patterns = [
43
+ # 'filter', 'filters', 'sidebar', 'menu', 'nav', 'header', 'footer', 'cart',
44
+ # 'search', 'pagination', 'sort', 'banner', 'ad', 'slider'
45
+ # ]
46
+ #
47
+ # # Check class and id
48
+ # element_classes = ' '.join(element.get('class', [])).replace("-", " ").replace("_", " ").lower().split()
49
+ # element_id = str(element.get('id', '')).replace("-", " ").replace("_", " ").lower().split()
50
+ #
51
+ # print(element_classes)
52
+ #
53
+ # for pattern in exclude_patterns:
54
+ # if pattern in element_classes:
55
+ # print(f"Excluded element due to class containing '{pattern}'")
56
+ # return True
57
+ # if pattern in element_id:
58
+ # print(f"Excluded element due to id containing '{pattern}'")
59
+ # return True
60
+ #
61
+ # return False
scrape-content.ipynb CHANGED
@@ -3,8 +3,8 @@
3
  {
4
  "metadata": {
5
  "ExecuteTime": {
6
- "end_time": "2024-10-25T10:32:47.963356Z",
7
- "start_time": "2024-10-25T10:32:47.950533Z"
8
  }
9
  },
10
  "cell_type": "code",
@@ -21,24 +21,24 @@
21
  "text/plain": [
22
  "{'Accept': '*/*',\n",
23
  " 'Connection': 'keep-alive',\n",
24
- " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6; rv:61.0) Gecko/20100101 Firefox/61.0',\n",
25
- " 'Cache-Control': 'max-age=0',\n",
26
- " 'Upgrade-Insecure-Requests': '1',\n",
27
  " 'Referer': 'https://google.com'}"
28
  ]
29
  },
30
- "execution_count": 9,
31
  "metadata": {},
32
  "output_type": "execute_result"
33
  }
34
  ],
35
- "execution_count": 9
36
  },
37
  {
38
  "metadata": {
39
  "ExecuteTime": {
40
- "end_time": "2024-10-25T10:32:49.821005Z",
41
- "start_time": "2024-10-25T10:32:49.798988Z"
42
  }
43
  },
44
  "cell_type": "code",
@@ -103,22 +103,22 @@
103
  ],
104
  "id": "11933d956e20b6b8",
105
  "outputs": [],
106
- "execution_count": 10
107
  },
108
  {
109
  "metadata": {
110
  "ExecuteTime": {
111
- "end_time": "2024-10-25T10:33:23.469518Z",
112
- "start_time": "2024-10-25T10:32:53.382666Z"
113
  }
114
  },
115
  "cell_type": "code",
116
  "source": [
117
  "chrome_options = Options()\n",
118
- "chrome_options.add_argument(\"--headless\")\n",
119
- "chrome_options.add_argument(\"--disable-gpu\")\n",
120
- "chrome_options.add_argument(\"--no-sandbox\")\n",
121
- "chrome_options.add_argument(\"--disable-dev-shm-usage\")\n",
122
  "\n",
123
  "# Add fake headers\n",
124
  "for key, value in headers.items():\n",
@@ -169,39 +169,58 @@
169
  ],
170
  "id": "ac14cff825f0887f",
171
  "outputs": [],
172
- "execution_count": 11
173
  },
174
  {
175
  "metadata": {
176
  "ExecuteTime": {
177
- "end_time": "2024-10-25T12:57:47.980010Z",
178
- "start_time": "2024-10-25T12:57:47.962944Z"
179
  }
180
  },
181
  "cell_type": "code",
182
  "source": [
183
  "from urllib.parse import urljoin\n",
184
  "import json\n",
 
185
  "\n",
186
  "\n",
187
- "def convert_relative_urls(soup, base_url):\n",
188
  " \"\"\"\n",
189
- " Convert all relative URLs in the soup object to absolute URLs.\n",
190
- " Handles href, src, and data-src attributes.\n",
191
  " \"\"\"\n",
192
- " # Convert href attributes (links)\n",
193
- " for tag in soup.find_all(href=True):\n",
194
- " tag['href'] = urljoin(base_url, tag['href'])\n",
 
 
 
 
 
 
195
  "\n",
196
- " # Convert src attributes (images, scripts, etc.)\n",
197
- " for tag in soup.find_all(src=True):\n",
198
- " tag['src'] = urljoin(base_url, tag['src'])\n",
199
  "\n",
200
- " # Convert data-src attributes (lazy loaded images)\n",
201
- " for tag in soup.find_all(attrs={'data-src': True}):\n",
202
- " tag['data-src'] = urljoin(base_url, tag['data-src'])\n",
 
 
 
203
  "\n",
204
- " return soup\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  "\n",
206
  "\n",
207
  "def count_images_in_element(element):\n",
@@ -223,104 +242,108 @@
223
  " return identifier\n",
224
  "\n",
225
  "\n",
226
- "def has_child_with_same_count(element, image_count, all_elements_with_counts):\n",
227
  " \"\"\"\n",
228
- " Check if the element has any child with the same image count.\n",
229
  " \"\"\"\n",
230
- " for other_element, other_count in all_elements_with_counts:\n",
231
- " if other_count == image_count and other_element != element:\n",
232
- " if any(parent == element for parent in other_element.parents):\n",
233
- " return True\n",
234
- " return False\n",
 
 
235
  "\n",
236
  "\n",
237
- "def find_image_rich_parents(soup, base_url):\n",
238
  " \"\"\"\n",
239
  " Find elements containing images and return both sorted list and detailed top element info.\n",
240
  " \"\"\"\n",
241
  " # Convert relative URLs to absolute\n",
242
  " soup = convert_relative_urls(soup, base_url)\n",
243
  "\n",
244
- " # Collect all elements with their image counts\n",
245
- " elements_with_counts = []\n",
246
  " for element in soup.find_all():\n",
247
- " if element.name != 'img': # Skip img tags themselves\n",
 
248
  " image_count = count_images_in_element(element)\n",
249
- " if image_count > 0:\n",
250
- " elements_with_counts.append((element, image_count))\n",
251
  "\n",
252
- " # Sort by image count in descending order\n",
253
- " elements_with_counts.sort(key=lambda x: x[1], reverse=True)\n",
 
 
254
  "\n",
255
- " if not elements_with_counts:\n",
256
  " return [], {\"error\": \"No elements with images found\"}, \"\"\n",
257
  "\n",
258
- " # Process elements for sorted list\n",
259
- " sorted_elements = []\n",
260
- " max_count = elements_with_counts[0][1]\n",
261
- " current_count = max_count\n",
262
- "\n",
263
- " while current_count > 0 and len(sorted_elements) < 100:\n",
264
- " # Get all elements with current count\n",
265
- " current_elements = [(elem, count) for elem, count in elements_with_counts if count == current_count]\n",
266
- "\n",
267
- " # Filter out elements that have children with the same count\n",
268
- " for element, count in current_elements:\n",
269
- " if not has_child_with_same_count(element, count, elements_with_counts):\n",
270
- " sorted_elements.append((get_element_identifier(element), count))\n",
271
- "\n",
272
- " # Move to next highest count\n",
273
- " remaining_counts = [count for _, count in elements_with_counts if count < current_count]\n",
274
- " current_count = max(remaining_counts) if remaining_counts else 0\n",
275
- "\n",
276
- " # Get detailed info for top element\n",
277
- " top_element = elements_with_counts[0][0]\n",
278
- " max_count = elements_with_counts[0][1]\n",
279
- "\n",
280
- " # Find the lowest-level element among those with max count\n",
281
- " for element, count in elements_with_counts:\n",
282
- " if count == max_count and not has_child_with_same_count(element, count, elements_with_counts):\n",
283
- " top_element = element\n",
284
- " break\n",
285
- "\n",
286
- " # Collect all images within the top element\n",
287
- " images = []\n",
288
- " for img in top_element.find_all('img', recursive=True):\n",
289
- " image_data = {\n",
290
- " \"src\": img.get('src', 'No source'),\n",
291
- " \"alt\": img.get('alt', 'No alt text')\n",
292
- " }\n",
293
- " for attr in ['title', 'width', 'height', 'class']:\n",
294
- " if img.get(attr):\n",
295
- " image_data[attr] = img[attr]\n",
296
- " images.append(image_data)\n",
297
  "\n",
298
- " # Create result dictionary for top element\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  " top_element_info = {\n",
300
- " \"element\": {\n",
301
  " \"tag\": top_element.name,\n",
302
  " \"identifier\": get_element_identifier(top_element),\n",
303
  " \"classes\": top_element.get('class', []),\n",
304
  " \"id\": top_element.get('id', None)\n",
305
  " },\n",
306
- " \"image_count\": max_count,\n",
307
- " \"images\": images,\n",
308
- " \"html_content\": str(top_element)\n",
309
  " }\n",
310
  "\n",
311
  " # Create styled HTML output\n",
312
  " style_tag = \"\"\"\n",
313
  " <style>\n",
 
 
 
 
 
314
  " img {\n",
315
  " width: 300px;\n",
316
  " height: 300px;\n",
317
  " object-fit: contain;\n",
318
  " }\n",
 
 
 
 
 
319
  " </style>\n",
320
  " \"\"\"\n",
321
  " html_output = style_tag + str(top_element)\n",
322
  "\n",
323
- " return sorted_elements, top_element_info, html_output\n",
324
  "\n",
325
  "\n",
326
  "def print_results(element_list):\n",
@@ -341,7 +364,7 @@
341
  ],
342
  "id": "3830f2e224e84798",
343
  "outputs": [],
344
- "execution_count": 46
345
  },
346
  {
347
  "metadata": {},
@@ -352,8 +375,8 @@
352
  {
353
  "metadata": {
354
  "ExecuteTime": {
355
- "end_time": "2024-10-25T13:00:02.064761Z",
356
- "start_time": "2024-10-25T13:00:01.277970Z"
357
  }
358
  },
359
  "cell_type": "code",
@@ -365,7 +388,7 @@
365
  "print_results(sorted_elements)\n",
366
  "\n",
367
  "with open(\"output1.json\", \"w\") as file:\n",
368
- " file.write(json.dumps(top_element_info, indent=2))\n",
369
  "\n",
370
  "with open(\"output1.html\", \"w\") as file:\n",
371
  " file.write(html_output)"
@@ -376,112 +399,24 @@
376
  "name": "stdout",
377
  "output_type": "stream",
378
  "text": [
 
379
  "\n",
380
  "Elements Containing Most Images (Lowest Level for Each Count):\n",
381
  "----------------------------------------------------------------------\n",
382
  "Rank Element Tag & Classes Image Count\n",
383
  "----------------------------------------------------------------------\n",
384
- "1. div 63\n",
385
- "2. div .sc-5da3fdcc-0 .cqdDWw 58\n",
386
- "3. div 5\n",
387
- "4. div .l-container-fixed .h-padding-h-x6 .h-padding-t-x6 .h-display-flex .h-flex-direction-row .h-flex-justify-space-between 4\n",
388
- "5. div 3\n",
389
- "6. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
390
- "7. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
391
- "8. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
392
- "9. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
393
- "10. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
394
- "11. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
395
- "12. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
396
- "13. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
397
- "14. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
398
- "15. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
399
- "16. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
400
- "17. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
401
- "18. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
402
- "19. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
403
- "20. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
404
- "21. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
405
- "22. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
406
- "23. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
407
- "24. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
408
- "25. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
409
- "26. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
410
- "27. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
411
- "28. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
412
- "29. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
413
- "30. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
414
- "31. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
415
- "32. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
416
- "33. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
417
- "34. a .sc-e851bd29-0 .dmfVmE .h-display-block 2\n",
418
- "35. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
419
- "36. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
420
- "37. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
421
- "38. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
422
- "39. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
423
- "40. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
424
- "41. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
425
- "42. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
426
- "43. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
427
- "44. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
428
- "45. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
429
- "46. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
430
- "47. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
431
- "48. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
432
- "49. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
433
- "50. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
434
- "51. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
435
- "52. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
436
- "53. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
437
- "54. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
438
- "55. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
439
- "56. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
440
- "57. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
441
- "58. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
442
- "59. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
443
- "60. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
444
- "61. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
445
- "62. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
446
- "63. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
447
- "64. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
448
- "65. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
449
- "66. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
450
- "67. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
451
- "68. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
452
- "69. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
453
- "70. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
454
- "71. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
455
- "72. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
456
- "73. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
457
- "74. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
458
- "75. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
459
- "76. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
460
- "77. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
461
- "78. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
462
- "79. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
463
- "80. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
464
- "81. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
465
- "82. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
466
- "83. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
467
- "84. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
468
- "85. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
469
- "86. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
470
- "87. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
471
- "88. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
472
- "89. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
473
- "90. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
474
- "91. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
475
- "92. picture .sc-68a8cd0e-0 .ldZWSf 1\n",
476
- "93. span 1\n",
477
- "94. span 1\n",
478
- "95. span 1\n",
479
- "96. span 1\n",
480
- "97. div .Illustration_IllustrationWrapper__xJP5g 1\n"
481
  ]
482
  }
483
  ],
484
- "execution_count": 49
 
 
 
 
 
 
485
  }
486
  ],
487
  "metadata": {
 
3
  {
4
  "metadata": {
5
  "ExecuteTime": {
6
+ "end_time": "2024-10-26T16:28:29.519384Z",
7
+ "start_time": "2024-10-26T16:28:29.506673Z"
8
  }
9
  },
10
  "cell_type": "code",
 
21
  "text/plain": [
22
  "{'Accept': '*/*',\n",
23
  " 'Connection': 'keep-alive',\n",
24
+ " 'User-Agent': 'Mozilla/5.0 (X11; Linux i686 on x86_64; rv:60.3.0) Gecko/20100101 Firefox/60.3.0',\n",
25
+ " 'Accept-Language': 'en-US;q=0.5,en;q=0.3',\n",
26
+ " 'DNT': '1',\n",
27
  " 'Referer': 'https://google.com'}"
28
  ]
29
  },
30
+ "execution_count": 1,
31
  "metadata": {},
32
  "output_type": "execute_result"
33
  }
34
  ],
35
+ "execution_count": 1
36
  },
37
  {
38
  "metadata": {
39
  "ExecuteTime": {
40
+ "end_time": "2024-10-26T16:28:29.719882Z",
41
+ "start_time": "2024-10-26T16:28:29.531148Z"
42
  }
43
  },
44
  "cell_type": "code",
 
103
  ],
104
  "id": "11933d956e20b6b8",
105
  "outputs": [],
106
+ "execution_count": 2
107
  },
108
  {
109
  "metadata": {
110
  "ExecuteTime": {
111
+ "end_time": "2024-10-26T16:29:00.452959Z",
112
+ "start_time": "2024-10-26T16:28:29.721884Z"
113
  }
114
  },
115
  "cell_type": "code",
116
  "source": [
117
  "chrome_options = Options()\n",
118
+ "# chrome_options.add_argument(\"--headless\")\n",
119
+ "# chrome_options.add_argument(\"--disable-gpu\")\n",
120
+ "# chrome_options.add_argument(\"--no-sandbox\")\n",
121
+ "# chrome_options.add_argument(\"--disable-dev-shm-usage\")\n",
122
  "\n",
123
  "# Add fake headers\n",
124
  "for key, value in headers.items():\n",
 
169
  ],
170
  "id": "ac14cff825f0887f",
171
  "outputs": [],
172
+ "execution_count": 3
173
  },
174
  {
175
  "metadata": {
176
  "ExecuteTime": {
177
+ "end_time": "2024-10-26T16:54:58.190620Z",
178
+ "start_time": "2024-10-26T16:54:58.165031Z"
179
  }
180
  },
181
  "cell_type": "code",
182
  "source": [
183
  "from urllib.parse import urljoin\n",
184
  "import json\n",
185
+ "from collections import Counter\n",
186
  "\n",
187
  "\n",
188
+ "def get_element_signature(element):\n",
189
  " \"\"\"\n",
190
+ " Create a signature for an element based on its structure.\n",
 
191
  " \"\"\"\n",
192
+ " signature = {\n",
193
+ " 'tag': element.name,\n",
194
+ " 'classes': tuple(sorted(element.get('class', []))),\n",
195
+ " 'child_tags': tuple(sorted(child.name for child in element.find_all(recursive=False) if child.name)),\n",
196
+ " 'has_image': bool(element.find('img')),\n",
197
+ " 'has_price': bool(any(c in element.get_text() for c in '$€£¥')),\n",
198
+ " 'has_link': bool(element.find('a')),\n",
199
+ " }\n",
200
+ " return str(signature)\n",
201
  "\n",
 
 
 
202
  "\n",
203
+ "def analyze_children_similarity(element):\n",
204
+ " \"\"\"\n",
205
+ " Analyze how similar the direct children of an element are.\n",
206
+ " \"\"\"\n",
207
+ " if not element.contents:\n",
208
+ " return 0, 0\n",
209
  "\n",
210
+ " child_signatures = [\n",
211
+ " get_element_signature(child)\n",
212
+ " for child in element.find_all(recursive=False)\n",
213
+ " if child.name\n",
214
+ " ]\n",
215
+ "\n",
216
+ " if not child_signatures:\n",
217
+ " return 0, 0\n",
218
+ "\n",
219
+ " signature_counts = Counter(child_signatures)\n",
220
+ " most_common_sig, most_common_count = signature_counts.most_common(1)[0]\n",
221
+ " similarity_score = most_common_count / len(child_signatures) if child_signatures else 0\n",
222
+ "\n",
223
+ " return similarity_score, most_common_count\n",
224
  "\n",
225
  "\n",
226
  "def count_images_in_element(element):\n",
 
242
  " return identifier\n",
243
  "\n",
244
  "\n",
245
+ "def convert_relative_urls(soup, base_url):\n",
246
  " \"\"\"\n",
247
+ " Convert all relative URLs in the soup object to absolute URLs.\n",
248
  " \"\"\"\n",
249
+ " for tag in soup.find_all(href=True):\n",
250
+ " tag['href'] = urljoin(base_url, tag['href'])\n",
251
+ " for tag in soup.find_all(src=True):\n",
252
+ " tag['src'] = urljoin(base_url, tag['src'])\n",
253
+ " for tag in soup.find_all(attrs={'data-src': True}):\n",
254
+ " tag['data-src'] = urljoin(base_url, tag['data-src'])\n",
255
+ " return soup\n",
256
  "\n",
257
  "\n",
258
+ "def find_image_rich_parents(soup, base_url, min_children=4, min_similarity=0.7):\n",
259
  " \"\"\"\n",
260
  " Find elements containing images and return both sorted list and detailed top element info.\n",
261
  " \"\"\"\n",
262
  " # Convert relative URLs to absolute\n",
263
  " soup = convert_relative_urls(soup, base_url)\n",
264
  "\n",
265
+ " # Collect potential container elements with their scores\n",
266
+ " elements_with_scores = []\n",
267
  " for element in soup.find_all():\n",
268
+ " if element.name in ['div', 'ul', 'section', 'main']:\n",
269
+ " similarity_score, similar_children_count = analyze_children_similarity(element)\n",
270
  " image_count = count_images_in_element(element)\n",
 
 
271
  "\n",
272
+ " if similar_children_count >= min_children and similarity_score >= min_similarity and image_count > 0:\n",
273
+ " # Calculate combined score based on similarity and image count\n",
274
+ " combined_score = (similarity_score * similar_children_count * image_count)\n",
275
+ " elements_with_scores.append((element, image_count, combined_score))\n",
276
  "\n",
277
+ " if not elements_with_scores:\n",
278
  " return [], {\"error\": \"No elements with images found\"}, \"\"\n",
279
  "\n",
280
+ " # Sort by combined score\n",
281
+ " elements_with_scores.sort(key=lambda x: x[2], reverse=True)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  "\n",
283
+ " # Process elements for sorted list output\n",
284
+ " sorted_elements = []\n",
285
+ " for element, image_count, _ in elements_with_scores:\n",
286
+ " sorted_elements.append((get_element_identifier(element), image_count))\n",
287
+ "\n",
288
+ " # Get top element (one with highest combined score)\n",
289
+ " top_element = elements_with_scores[0][0]\n",
290
+ "\n",
291
+ " # Separate child elements with images\n",
292
+ " products = []\n",
293
+ " for child in top_element.find_all(recursive=False):\n",
294
+ " if child.name: # Skip text nodes\n",
295
+ " product_info = {\n",
296
+ " \"html_content\": str(child),\n",
297
+ " \"images\": []\n",
298
+ " }\n",
299
+ "\n",
300
+ " # Get all images within this product\n",
301
+ " for img in child.find_all('img', recursive=True):\n",
302
+ " image_info = {\n",
303
+ " \"src\": img.get('src', 'No source'),\n",
304
+ " \"alt\": img.get('alt', 'No alt text')\n",
305
+ " }\n",
306
+ " product_info[\"images\"].append(image_info)\n",
307
+ "\n",
308
+ " products.append(product_info)\n",
309
+ "\n",
310
+ " print(len(products))\n",
311
+ "\n",
312
+ " # Create result dictionary for top element \n",
313
  " top_element_info = {\n",
314
+ " \"parent\": {\n",
315
  " \"tag\": top_element.name,\n",
316
  " \"identifier\": get_element_identifier(top_element),\n",
317
  " \"classes\": top_element.get('class', []),\n",
318
  " \"id\": top_element.get('id', None)\n",
319
  " },\n",
320
+ " \"products_count\": len(products),\n",
321
+ " \"products\": products\n",
 
322
  " }\n",
323
  "\n",
324
  " # Create styled HTML output\n",
325
  " style_tag = \"\"\"\n",
326
  " <style>\n",
327
+ " div {\n",
328
+ " width: auto !important;\n",
329
+ " height: auto !important;\n",
330
+ " }\n",
331
+ " \n",
332
  " img {\n",
333
  " width: 300px;\n",
334
  " height: 300px;\n",
335
  " object-fit: contain;\n",
336
  " }\n",
337
+ " \n",
338
+ " svg {\n",
339
+ " max-height: 10px;\n",
340
+ " max-width: 10px;\n",
341
+ " }\n",
342
  " </style>\n",
343
  " \"\"\"\n",
344
  " html_output = style_tag + str(top_element)\n",
345
  "\n",
346
+ " return sorted_elements, json.dumps(top_element_info, indent=2), html_output\n",
347
  "\n",
348
  "\n",
349
  "def print_results(element_list):\n",
 
364
  ],
365
  "id": "3830f2e224e84798",
366
  "outputs": [],
367
+ "execution_count": 11
368
  },
369
  {
370
  "metadata": {},
 
375
  {
376
  "metadata": {
377
  "ExecuteTime": {
378
+ "end_time": "2024-10-26T16:55:03.174631Z",
379
+ "start_time": "2024-10-26T16:55:02.976453Z"
380
  }
381
  },
382
  "cell_type": "code",
 
388
  "print_results(sorted_elements)\n",
389
  "\n",
390
  "with open(\"output1.json\", \"w\") as file:\n",
391
+ " file.write(top_element_info)\n",
392
  "\n",
393
  "with open(\"output1.html\", \"w\") as file:\n",
394
  " file.write(html_output)"
 
399
  "name": "stdout",
400
  "output_type": "stream",
401
  "text": [
402
+ "28\n",
403
  "\n",
404
  "Elements Containing Most Images (Lowest Level for Each Count):\n",
405
  "----------------------------------------------------------------------\n",
406
  "Rank Element Tag & Classes Image Count\n",
407
  "----------------------------------------------------------------------\n",
408
+ "1. div .sc-5da3fdcc-0 .cqdDWw 51\n",
409
+ "2. div 1\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  ]
411
  }
412
  ],
413
+ "execution_count": 12
414
+ },
415
+ {
416
+ "metadata": {},
417
+ "cell_type": "markdown",
418
+ "source": "",
419
+ "id": "1465ddb6bce2981c"
420
  }
421
  ],
422
  "metadata": {