|
"""HTML parser. |
|
|
|
Contains a parser for HTML files.
|
|
|
""" |
|
import re |
|
from pathlib import Path |
|
from typing import Dict, Union |
|
|
|
from application.parser.file.base_parser import BaseParser |
|
|
|
|
|
class HTMLParser(BaseParser):
    """Parser that splits an HTML file into title-delimited text chunks."""

    # Minimum length of ``str(chunk)`` (the list repr, brackets and quotes
    # included) for a chunk to be kept. Measured on the repr to keep the
    # historical filtering threshold unchanged.
    _MIN_CHUNK_REPR_LEN = 25

    def _init_parser(self) -> Dict:
        """Init parser.

        Returns:
            Dict: an empty configuration dict.
        """
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
        """Parse an HTML file into a list of text chunks.

        The file is partitioned with ``unstructured``; each element's text is
        reduced to ASCII, whitespace-normalised and cleaned. Elements are then
        grouped into chunks, a new chunk starting at every element whose type
        is ``'Title'``. Chunks that are too short are dropped.

        Args:
            file: Path of the HTML file to read (decoded as UTF-8).
            errors: Decoding error handler forwarded to ``open``.

        Returns:
            Union[str, List[str]]: a list with one joined string per kept chunk.

        Raises:
            ValueError: if the ``unstructured`` package is not installed.
        """
        try:
            from unstructured.cleaners.core import clean
            from unstructured.partition.html import partition_html
            from unstructured.staging.base import convert_to_isd
        except ImportError as err:
            raise ValueError("unstructured package is required to parse HTML files.") from err

        # Honour the caller-supplied decoding policy instead of ignoring it.
        with open(file, "r", encoding="utf-8", errors=errors) as fp:
            elements = partition_html(file=fp)
            isd = convert_to_isd(elements)

        # Start with an empty chunk so text preceding the first title is kept.
        chunks: list[list[str]] = [[]]
        for isd_el in isd:
            # Drop every non-ASCII character.
            text = isd_el['text'].encode("ascii", "ignore").decode()
            # Newlines become spaces, then runs of whitespace collapse.
            text = re.sub(r'\n', ' ', text)
            text = re.sub(r"\s{2,}", " ", text)
            # clean() returns a new string; the result must be stored.
            text = clean(text, extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
            isd_el['text'] = text

            if isd_el['type'] == 'Title':
                chunks.append([])
            chunks[-1].append(text)

        # Filter into a new list rather than removing from ``chunks`` while
        # iterating it, which would skip the chunk following each removal.
        return [
            " ".join(chunk)
            for chunk in chunks
            if len(str(chunk)) >= self._MIN_CHUNK_REPR_LEN
        ]
|
|