# html_to_markdown/html_to_markdown.py
import logging
from typing import Optional

from bs4 import BeautifulSoup

from conversion_options import ConversionOptions
from dom_utils import find_main_content, wrap_main_content
from html_to_markdown_ast import html_to_markdown_ast
from markdown_ast_to_string import markdown_ast_to_string
from url_utils import refify_urls

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def convert_html_to_markdown(html: str, options: Optional[ConversionOptions] = None) -> str:
    """Convert an HTML string to a Markdown string.

    The HTML is parsed with BeautifulSoup's ``html.parser``, a root node is
    chosen according to ``options``, that node is turned into a Markdown AST
    by ``html_to_markdown_ast`` and the AST is rendered with
    ``markdown_ast_to_string``.

    Args:
        html: The HTML source to convert.
        options: Conversion settings. When ``None`` a default
            ``ConversionOptions()`` is created. The attributes read here are
            ``debug``, ``extract_main_content``, ``include_meta_data``,
            ``refify_urls`` and ``url_map``.

    Returns:
        The string produced by ``markdown_ast_to_string``.

    Side effects:
        * When ``options.debug`` is set, this module's logger is switched to
          DEBUG; the level is not restored afterwards.
        * When ``options.refify_urls`` is set, ``options.url_map`` is
          overwritten with the value returned by ``refify_urls``.
    """
    if options is None:
        options = ConversionOptions()

    if options.debug:
        logger.setLevel(logging.DEBUG)

    soup = BeautifulSoup(html, 'html.parser')

    if options.extract_main_content:
        main_content = find_main_content(soup, options)
        if options.include_meta_data and soup.head and not main_content.find('head'):
            # Reattach head for metadata extraction
            new_html = f"{soup.head}{main_content}"
            soup = BeautifulSoup(new_html, 'html.parser')
            # html.parser does not synthesise a missing <html> wrapper, so
            # soup.html is None for a "<head>...</head><main>...</main>"
            # fragment. Fall back to the whole re-parsed document in that
            # case instead of handing None to the AST builder.
            main_content = soup.html if soup.html is not None else soup
    elif options.include_meta_data and soup.head:
        # Keep the entire document so <head> stays reachable.
        main_content = soup
    else:
        main_content = soup.body if soup.body else soup

    markdown_ast = html_to_markdown_ast(main_content, options)

    if options.refify_urls:
        options.url_map = refify_urls(markdown_ast, options.url_map)

    return markdown_ast_to_string(markdown_ast, options)