# html_to_markdown/dom_utils.py
"""Heuristics for locating the main-content element of a parsed HTML document."""

import logging
from typing import Optional

from bs4 import BeautifulSoup, Tag

from conversion_options import ConversionOptions

# NOTE(review): configuring the root logger at import time affects every
# importer of this module; kept because existing callers may rely on it.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Minimum score an element needs to be considered a main-content candidate.
_MIN_CANDIDATE_SCORE = 20

# Class / id tokens and tag names that hint at main content.
_HIGH_IMPACT_ATTRIBUTES = ('article', 'content', 'main-container', 'main', 'main-content')
_HIGH_IMPACT_TAGS = ('article', 'main', 'section')


def find_main_content(soup: BeautifulSoup, options: ConversionOptions) -> Tag:
    """Return the element holding the document's main content.

    An existing ``<main>`` element is returned as-is.  Otherwise the
    ``<body>`` is analysed with :func:`detect_main_content`.  When the
    document has no ``<body>``, ``soup`` itself is returned.

    Args:
        soup: Parsed document to search.
        options: Conversion options, forwarded to the scoring functions.

    Returns:
        The existing ``<main>`` element, the detected candidate, or ``soup``.
    """
    logger.debug("Entering find_main_content function")

    main_element = soup.find('main')
    if main_element is not None:
        logger.debug("Existing <main> element found")
        return main_element

    logger.debug("No <main> element found. Detecting main content.")
    if soup.body is None:
        logger.debug("No body element found, returning the entire document")
        return soup

    return detect_main_content(soup.body, options)


def wrap_main_content(main_content: Tag, soup: BeautifulSoup) -> None:
    """Wrap ``main_content`` in a new ``<main id="detected-main-content">``.

    Nothing happens when ``main_content`` is already a ``<main>`` element.

    Args:
        main_content: Element to wrap (modified in place within its tree).
        soup: Document used to create the new ``<main>`` tag.
    """
    if main_content.name.lower() == 'main':
        logger.debug("Main content already wrapped")
        return

    logger.debug("Wrapping main content in <main> element")
    main_element = soup.new_tag('main')
    main_content.wrap(main_element)
    main_element['id'] = 'detected-main-content'
    logger.debug("Main content wrapped successfully")


def detect_main_content(element: Tag, options: ConversionOptions) -> Tag:
    """Pick the highest-scoring main-content candidate at or below ``element``.

    Candidates are ``element`` and its descendants whose score (see
    :func:`calculate_score`) reaches the minimum threshold.  The candidate
    with the highest score wins; ties go to the one that comes first in
    document order.  If nothing qualifies, ``element`` is returned.

    Args:
        element: Root of the subtree to analyse.
        options: Conversion options, forwarded to the scoring functions.

    Returns:
        The best candidate, or ``element`` when there are no candidates.
    """
    candidates: list = []
    logger.debug("Collecting candidates with minimum score: %s", _MIN_CANDIDATE_SCORE)
    collect_candidates(element, candidates, _MIN_CANDIDATE_SCORE, options)
    logger.debug("Total candidates found: %s", len(candidates))

    if not candidates:
        logger.debug("No suitable candidates found, returning root element")
        return element

    # Sort candidates by score descending.  The sort is stable, so equal
    # scores keep the document order produced by collect_candidates.
    candidates.sort(key=lambda candidate: calculate_score(candidate, options), reverse=True)
    logger.debug("Candidates sorted by score")

    # The top-ranked candidate is the answer: after a descending sort no
    # later candidate can score strictly higher.  A former post-sort pass
    # called ``Tag.contains()``, which bs4 does not provide (attribute access
    # resolves to ``find('contains')`` and yields None), so it raised
    # TypeError whenever two or more candidates existed.
    best_candidate = candidates[0]
    logger.debug("Final main content candidate: %s", element_to_string(best_candidate))
    return best_candidate


def element_to_string(element: Optional[Tag]) -> str:
    """Format ``element`` as ``name#id.class1.class2`` for log output.

    Returns ``'No element'`` for a falsy argument; a missing id is shown as
    ``no-id``.
    """
    if not element:
        return 'No element'
    classes = '.'.join(element.get('class', []))
    return f"{element.name}#{element.get('id', 'no-id')}.{classes}"


def collect_candidates(element: Tag, candidates: list, min_score: int, options: ConversionOptions) -> None:
    """Append every element scoring at least ``min_score`` to ``candidates``.

    Walks ``element`` and its descendant tags depth-first (parent before
    children), so ``candidates`` ends up in document order.

    Args:
        element: Root of the subtree to walk.
        candidates: Output list, extended in place.
        min_score: Inclusive score threshold.
        options: Conversion options, forwarded to :func:`calculate_score`.
    """
    score = calculate_score(element, options)
    if score >= min_score:
        candidates.append(element)
        logger.debug("Candidate found: %s, score: %s", element_to_string(element), score)

    # find_all(recursive=False) yields only the direct child tags.
    for child in element.find_all(recursive=False):
        collect_candidates(child, candidates, min_score, options)


def calculate_score(element: Tag, options: ConversionOptions) -> int:
    """Score how likely ``element`` is to be the page's main content.

    Points awarded:
        * +10 per high-impact token present in the ``class`` attribute.
        * +10 per high-impact token found in the ``id`` attribute.
        * +5 for an ``article`` / ``main`` / ``section`` tag.
        * +1 per ``<p>`` descendant, capped at 5.
        * +1 per 200 characters of text (above 200), capped at 5.
        * +5 when link density is below 0.3.
        * +10 for a ``data-main`` or ``data-content`` attribute.
        * +10 when ``role`` contains ``main``.

    When ``options.debug`` is set, the individual contributions are logged
    at DEBUG level.

    Returns:
        The accumulated integer score.
    """
    score = 0
    score_log = []

    # High impact attributes
    element_classes = element['class'] if 'class' in element.attrs else None
    element_id = element['id'] if 'id' in element.attrs else None
    for attr in _HIGH_IMPACT_ATTRIBUTES:
        if element_classes is not None and attr in element_classes:
            score += 10
            score_log.append(f"High impact attribute found: {attr}, score increased by 10")
        if element_id is not None and attr in element_id:
            score += 10
            score_log.append(f"High impact ID found: {attr}, score increased by 10")

    # High impact tags
    if element.name.lower() in _HIGH_IMPACT_TAGS:
        score += 5
        score_log.append(f"High impact tag found: {element.name}, score increased by 5")

    # Paragraph count
    paragraph_count = len(element.find_all('p'))
    paragraph_score = min(paragraph_count, 5)
    if paragraph_score > 0:
        score += paragraph_score
        score_log.append(f"Paragraph count: {paragraph_count}, score increased by {paragraph_score}")

    # Text content length
    text_content_length = len(element.get_text(strip=True))
    if text_content_length > 200:
        text_score = min(text_content_length // 200, 5)
        score += text_score
        score_log.append(f"Text content length: {text_content_length}, score increased by {text_score}")

    # Link density
    link_density = calculate_link_density(element)
    if link_density < 0.3:
        score += 5
        score_log.append(f"Link density: {link_density:.2f}, score increased by 5")

    # Data attributes
    if element.has_attr('data-main') or element.has_attr('data-content'):
        score += 10
        score_log.append("Data attribute for main content found, score increased by 10")

    # Role attribute
    role = element.get('role')
    if role and 'main' in role:
        score += 10
        score_log.append("Role attribute indicating main content found, score increased by 10")

    if options.debug and score_log:
        logger.debug(f"Scoring for {element_to_string(element)}:")
        for log in score_log:
            logger.debug(f" {log}")
        logger.debug(f" Final score: {score}")

    return score


def calculate_link_density(element: Tag) -> float:
    """Return the share of ``element``'s text that sits inside ``<a>`` tags.

    Computed as total stripped link-text length divided by total stripped
    text length; an element with no text uses a denominator of 1.
    """
    links = element.find_all('a')
    link_length = sum(len(link.get_text(strip=True)) for link in links)
    text_length = len(element.get_text(strip=True)) or 1  # Avoid division by zero
    return link_length / text_length