In [1]:
from fake_headers import Headers

headers = Headers(headers=True).generate()
headers

{'Accept': '*/*',
 'Connection': 'keep-alive',
 'User-Agent': 'Mozilla/5.0 (X11; Linux i686 on x86_64; rv:60.3.0) Gecko/20100101 Firefox/60.3.0',
 'Accept-Language': 'en-US;q=0.5,en;q=0.3',
 'DNT': '1',
 'Referer': 'https://google.com'}

In [2]:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time


def scroll_and_wait(driver, scroll_pause_time=2):
 """
 Scroll the page gradually and wait for images to load
 """
 # Get scroll height
 last_height = driver.execute_script("return document.body.scrollHeight")

 while True:
 # Scroll down gradually
 for i in range(10):
 driver.execute_script(f"window.scrollTo(0, {(i + 1) * (last_height / 10)});")
 time.sleep(0.5) # Short pause between each scroll step

 # Wait for new images to load
 time.sleep(scroll_pause_time)

 # Calculate new scroll height and compare with last scroll height
 new_height = driver.execute_script("return document.body.scrollHeight")
 if new_height == last_height:
 break
 last_height = new_height


def wait_for_images(driver, timeout=10):
 """
 Wait for images to load and become visible
 """
 try:
 # Wait for all image elements to be present
 WebDriverWait(driver, timeout).until(
 EC.presence_of_all_elements_located((By.TAG_NAME, "img"))
 )

 # Get all image elements
 images = driver.find_elements(By.TAG_NAME, "img")

 # Wait for images to load
 for img in images:
 try:
 WebDriverWait(driver, 2).until(
 lambda d: img.get_attribute('complete') == 'true' and
 img.get_attribute('naturalHeight') != '0'
 )
 except:
 continue # Skip images that don't load within timeout

 except Exception as e:
 print(f"Warning: Not all images could be loaded: {e}")

In [3]:
chrome_options = Options()
# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--no-sandbox")
# chrome_options.add_argument("--disable-dev-shm-usage")

# Add fake headers
for key, value in headers.items():
 chrome_options.add_argument(f'--{key.lower()}={value}')

# Additional configurations to appear more human-like
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--window-size=1920,1080")

# Enable images in headless mode
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--high-dpi-support=1")

# Privacy and fingerprinting prevention
chrome_options.add_argument("--disable-blink-features")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

# Enable JavaScript
chrome_options.add_argument("--enable-javascript")

driver = webdriver.Chrome(options=chrome_options)

driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
 "source": """
 Object.defineProperty(navigator, 'webdriver', {
 get: () => undefined
 })
 """
})

products_url = "https://www.target.com/s?searchTerm=Peach&tref=typeahead%7Cterm%7CPeach%7C%7C%7Chistory"
driver.get(products_url)

time.sleep(3)

# Scroll and wait for content
scroll_and_wait(driver)

# Wait for images to load
wait_for_images(driver)

time.sleep(2)

soup = BeautifulSoup(driver.page_source, "html.parser")
driver.quit()

In [11]:
from urllib.parse import urljoin
import json
from collections import Counter


def get_element_signature(element):
 """
 Create a signature for an element based on its structure.
 """
 signature = {
 'tag': element.name,
 'classes': tuple(sorted(element.get('class', []))),
 'child_tags': tuple(sorted(child.name for child in element.find_all(recursive=False) if child.name)),
 'has_image': bool(element.find('img')),
 'has_price': bool(any(c in element.get_text() for c in '$€£¥')),
 'has_link': bool(element.find('a')),
 }
 return str(signature)


def analyze_children_similarity(element):
 """
 Analyze how similar the direct children of an element are.
 """
 if not element.contents:
 return 0, 0

 # Get signatures for all direct children that are elements (have a tag name)
 child_signatures = [
 get_element_signature(child)
 for child in element.find_all(recursive=False)
 if child.name
 ]

 if not child_signatures:
 return 0, 0

 # Count how many times each signature appears and get the most common one
 signature_counts = Counter(child_signatures)
 most_common_sig, most_common_count = signature_counts.most_common(1)[0]
 similarity_score = most_common_count / len(child_signatures)

 return similarity_score, most_common_count


def count_images_in_element(element):
 """
 Count all images within an element, including nested ones.
 """
 return len(element.find_all('img', recursive=True))


def get_element_identifier(element):
 """
 Create a unique identifier for an element including tag and classes.
 """
 identifier = element.name
 if element.get('class'):
 identifier += f" .{' .'.join(element['class'])}"
 if element.get('id'):
 identifier += f" #{element['id']}"
 return identifier


def convert_relative_urls(soup, base_url):
 """
 Convert all relative URLs in the soup object to absolute URLs.
 """
 for tag in soup.find_all(href=True):
 tag['href'] = urljoin(base_url, tag['href'])
 for tag in soup.find_all(src=True):
 tag['src'] = urljoin(base_url, tag['src'])
 for tag in soup.find_all(attrs={'data-src': True}):
 tag['data-src'] = urljoin(base_url, tag['data-src'])
 return soup


def find_image_rich_parents(soup, base_url, min_children=4, min_similarity=0.7):
 """
 Find elements containing images and return both sorted list and detailed top element info.
 """
 # Convert relative URLs to absolute
 soup = convert_relative_urls(soup, base_url)

 # Collect potential container elements with their scores
 elements_with_scores = []
 for element in soup.find_all():
 if element.name in ['div', 'ul', 'section', 'main']:
 similarity_score, similar_children_count = analyze_children_similarity(element)
 image_count = count_images_in_element(element)

 if similar_children_count >= min_children and similarity_score >= min_similarity and image_count > 0:
 # Calculate combined score based on similarity and image count
 combined_score = (similarity_score * similar_children_count * image_count)
 elements_with_scores.append((element, image_count, combined_score))

 if not elements_with_scores:
 return [], {"error": "No elements with images found"}, ""

 # Sort by combined score
 elements_with_scores.sort(key=lambda x: x[2], reverse=True)

 # Process elements for sorted list output
 sorted_elements = []
 for element, image_count, _ in elements_with_scores:
 sorted_elements.append((get_element_identifier(element), image_count))

 # Get top element (one with highest combined score)
 top_element = elements_with_scores[0][0]

 # Separate child elements with images
 products = []
 for child in top_element.find_all(recursive=False):
 if child.name: # Skip text nodes
 product_info = {
 "html_content": str(child),
 "images": []
 }

 # Get all images within this product
 for img in child.find_all('img', recursive=True):
 image_info = {
 "src": img.get('src', 'No source'),
 "alt": img.get('alt', 'No alt text')
 }
 product_info["images"].append(image_info)

 products.append(product_info)

 print(len(products))

 # Create result dictionary for top element 
 top_element_info = {
 "parent": {
 "tag": top_element.name,
 "identifier": get_element_identifier(top_element),
 "classes": top_element.get('class', []),
 "id": top_element.get('id', None)
 },
 "products_count": len(products),
 "products": products
 }

 # Create styled HTML output
 style_tag = """
 
 """
 html_output = style_tag + str(top_element)

 return sorted_elements, json.dumps(top_element_info, indent=2), html_output


def print_results(element_list):
 """
 Print formatted results.
 """
 print("\nElements Containing Most Images (Lowest Level for Each Count):")
 print("-" * 70)
 print("Rank Element Tag & Classes Image Count")
 print("-" * 70)

 for rank, element in enumerate(element_list, 1):
 tag_info, count = element
 rank_str = f"{rank}."
 rank_str = rank_str.ljust(5)
 tag_info_padded = tag_info.ljust(45)
 print(f"{rank_str} {tag_info_padded} {count}")

In [12]:
base_url = products_url.rsplit('/', 1)[0]
sorted_elements, top_element_info, html_output = find_image_rich_parents(soup, base_url)

# Print sorted list
print_results(sorted_elements)

with open("output1.json", "w") as file:
 file.write(top_element_info)

with open("output1.html", "w") as file:
 file.write(html_output)

28

Elements Containing Most Images (Lowest Level for Each Count):
----------------------------------------------------------------------
Rank Element Tag & Classes Image Count
----------------------------------------------------------------------
1. div .sc-5da3fdcc-0 .cqdDWw 51
2. div 1
