Spaces:
Sleeping
Sleeping
#
#
# def is_likely_product_card(element, min_text_length=10):
#     """
#     Determine if an element is likely to be a product card based on various heuristics.
#     """
#     # 1. Check for common product card class/id patterns
#     identifier = element.get('class', []) + [element.get('id', '')]
#     product_patterns = ['product', 'item', 'card', 'goods', 'listing']
#     if any(any(pattern in str(attr).lower() for pattern in product_patterns) for attr in identifier):
#         return True
#
#     # 2. Check for price patterns
#     text_content = element.get_text()
#     price_patterns = [
#         r'\$\d+\.?\d*',  # USD
#         r'£\d+\.?\d*',  # GBP
#         r'€\d+\.?\d*',  # EUR
#         r'\d+\.?\d*\s*USD',
#         r'\d+\.?\d*\s*EUR'
#     ]
#     if any(re.search(pattern, text_content) for pattern in price_patterns):
#         return True
#
#     # 3. Check for minimum text content (excluding whitespace)
#     clean_text = ' '.join(text_content.split())
#     if len(clean_text) < min_text_length:
#         return False
#
#     # 4. Check for typical product card elements
#     has_title = bool(element.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']))
#
#     return has_title
#
#
# def should_exclude_element(element):
#     # """
#     # Check if an element should be excluded from consideration.
#     # """
#
#     # 1. Exclude common non-product sections
#     exclude_patterns = [
#         'filter', 'filters', 'sidebar', 'menu', 'nav', 'header', 'footer', 'cart',
#         'search', 'pagination', 'sort', 'banner', 'ad', 'slider'
#     ]
#
#     # Check class and id
#     element_classes = ' '.join(element.get('class', [])).replace("-", " ").replace("_", " ").lower().split()
#     element_id = str(element.get('id', '')).replace("-", " ").replace("_", " ").lower().split()
#
#     print(element_classes)
#
#     for pattern in exclude_patterns:
#         if pattern in element_classes:
#             print(f"Excluded element due to class containing '{pattern}'")
#             return True
#         if pattern in element_id:
#             print(f"Excluded element due to id containing '{pattern}'")
#             return True
#
#     return False