# html_to_markdown/dom_utils.py
import logging
from typing import Optional

from bs4 import BeautifulSoup, Tag

from conversion_options import ConversionOptions

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def find_main_content(soup: BeautifulSoup, options: ConversionOptions) -> Tag:
    logger.debug("Entering find_main_content function")
    main_element = soup.find('main')
    if main_element:
        logger.debug("Existing <main> element found")
        return main_element
    logger.debug("No <main> element found. Detecting main content.")
    if not soup.body:
        logger.debug("No body element found, returning the entire document")
        return soup
    return detect_main_content(soup.body, options)


def wrap_main_content(main_content: Tag, soup: BeautifulSoup):
    if main_content.name.lower() != 'main':
        logger.debug("Wrapping main content in <main> element")
        main_element = soup.new_tag('main')
        main_content.wrap(main_element)
        main_element['id'] = 'detected-main-content'
        logger.debug("Main content wrapped successfully")
    else:
        logger.debug("Main content already wrapped")


def detect_main_content(element: Tag, options: ConversionOptions) -> Tag:
    candidates = []
    min_score = 20
    logger.debug(f"Collecting candidates with minimum score: {min_score}")
    collect_candidates(element, candidates, min_score, options)
    logger.debug(f"Total candidates found: {len(candidates)}")
    if not candidates:
        logger.debug("No suitable candidates found, returning root element")
        return element
    # Sort candidates by score, descending
    candidates.sort(key=lambda x: calculate_score(x, options), reverse=True)
    logger.debug("Candidates sorted by score")
    best_candidate = candidates[0]
    for candidate in candidates[1:]:
        # Only consider candidates that are not nested inside another candidate.
        # bs4 tags have no .contains() method, so check containment via descendants.
        is_nested = any(
            candidate in other.descendants
            for other in candidates
            if other is not candidate
        )
        if not is_nested:
            if calculate_score(candidate, options) > calculate_score(best_candidate, options):
                best_candidate = candidate
                logger.debug(f"New best independent candidate found: {element_to_string(best_candidate)}")
    logger.debug(f"Final main content candidate: {element_to_string(best_candidate)}")
    return best_candidate


def element_to_string(element: Optional[Tag]) -> str:
    if not element:
        return 'No element'
    classes = '.'.join(element.get('class', []))
    return f"{element.name}#{element.get('id', 'no-id')}.{classes}"


def collect_candidates(element: Tag, candidates: list, min_score: int, options: ConversionOptions):
    score = calculate_score(element, options)
    if score >= min_score:
        candidates.append(element)
        logger.debug(f"Candidate found: {element_to_string(element)}, score: {score}")
    for child in element.find_all(recursive=False):
        collect_candidates(child, candidates, min_score, options)


def calculate_score(element: Tag, options: ConversionOptions) -> int:
    score = 0
    score_log = []
    # High impact attributes
    high_impact_attributes = ['article', 'content', 'main-container', 'main', 'main-content']
    for attr in high_impact_attributes:
        if 'class' in element.attrs and attr in element['class']:
            score += 10
            score_log.append(f"High impact attribute found: {attr}, score increased by 10")
        if 'id' in element.attrs and attr in element['id']:
            score += 10
            score_log.append(f"High impact ID found: {attr}, score increased by 10")
    # High impact tags
    high_impact_tags = ['article', 'main', 'section']
    if element.name.lower() in high_impact_tags:
        score += 5
        score_log.append(f"High impact tag found: {element.name}, score increased by 5")
    # Paragraph count
    paragraph_count = len(element.find_all('p'))
    paragraph_score = min(paragraph_count, 5)
    if paragraph_score > 0:
        score += paragraph_score
        score_log.append(f"Paragraph count: {paragraph_count}, score increased by {paragraph_score}")
    # Text content length
    text_content_length = len(element.get_text(strip=True))
    if text_content_length > 200:
        text_score = min(text_content_length // 200, 5)
        score += text_score
        score_log.append(f"Text content length: {text_content_length}, score increased by {text_score}")
    # Link density
    link_density = calculate_link_density(element)
    if link_density < 0.3:
        score += 5
        score_log.append(f"Link density: {link_density:.2f}, score increased by 5")
    # Data attributes
    if element.has_attr('data-main') or element.has_attr('data-content'):
        score += 10
        score_log.append("Data attribute for main content found, score increased by 10")
    # Role attribute
    if element.get('role') and 'main' in element.get('role'):
        score += 10
        score_log.append("Role attribute indicating main content found, score increased by 10")
    if options.debug and score_log:
        logger.debug(f"Scoring for {element_to_string(element)}:")
        for log in score_log:
            logger.debug(f"  {log}")
        logger.debug(f"  Final score: {score}")
    return score


def calculate_link_density(element: Tag) -> float:
    links = element.find_all('a')
    link_length = sum(len(link.get_text(strip=True)) for link in links)
    text_length = len(element.get_text(strip=True)) or 1  # Avoid division by zero
    return link_length / text_length
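

# Minimal usage sketch (illustrative only): the sample HTML below and the
# ConversionOptions(debug=True) constructor call are assumptions about how the
# surrounding package is used, not part of this module's contract.
if __name__ == "__main__":
    sample_html = """
    <html><body>
      <nav><a href="/">Home</a></nav>
      <div id="main-content" class="article">
        <p>First paragraph of the article body.</p>
        <p>Second paragraph with enough text to look like real content.</p>
      </div>
    </body></html>
    """
    soup = BeautifulSoup(sample_html, "html.parser")
    options = ConversionOptions(debug=True)  # assumed constructor signature
    main = find_main_content(soup, options)
    wrap_main_content(main, soup)
    # Expected to report the detected wrapper, e.g. "main#detected-main-content."
    print(element_to_string(soup.find("main")))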