products-extracter / draft.py
supunTE's picture
use element signature to identify similar children
f2fe0c9
#
#
# def is_likely_product_card(element, min_text_length=10):
# """
# Determine if an element is likely to be a product card based on various heuristics.
# """
# # 1. Check for common product card class/id patterns
# identifier = element.get('class', []) + [element.get('id', '')]
# product_patterns = ['product', 'item', 'card', 'goods', 'listing']
# if any(any(pattern in str(attr).lower() for pattern in product_patterns) for attr in identifier):
# return True
#
# # 2. Check for price patterns
# text_content = element.get_text()
# price_patterns = [
# r'\$\d+\.?\d*', # USD
# r'£\d+\.?\d*', # GBP
# r'€\d+\.?\d*', # EUR
# r'\d+\.?\d*\s*USD',
# r'\d+\.?\d*\s*EUR'
# ]
# if any(re.search(pattern, text_content) for pattern in price_patterns):
# return True
#
# # 3. Check for minimum text content (excluding whitespace)
# clean_text = ' '.join(text_content.split())
# if len(clean_text) < min_text_length:
# return False
#
# # 4. Check for typical product card elements
# has_title = bool(element.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
#
# return has_title
#
#
# def should_exclude_element(element):
# # """
# # Check if an element should be excluded from consideration.
# # """
#
# # 1. Exclude common non-product sections
# exclude_patterns = [
# 'filter', 'filters', 'sidebar', 'menu', 'nav', 'header', 'footer', 'cart',
# 'search', 'pagination', 'sort', 'banner', 'ad', 'slider'
# ]
#
# # Check class and id
# element_classes = ' '.join(element.get('class', [])).replace("-", " ").replace("_", " ").lower().split()
# element_id = str(element.get('id', '')).replace("-", " ").replace("_", " ").lower().split()
#
# print(element_classes)
#
# for pattern in exclude_patterns:
# if pattern in element_classes:
# print(f"Excluded element due to class containing '{pattern}'")
# return True
# if pattern in element_id:
# print(f"Excluded element due to id containing '{pattern}'")
# return True
#
# return False