# Source: Hugging Face Space "jonas/sdg-policy-tracing", revision f51b958.
# Page metadata at capture time: Space status "Sleeping", file size 2,134 bytes.

from typing import Callable, Dict, List, Optional

from pathlib import Path
import re
import logging
import string 
import streamlit as st
logger = logging.getLogger(__name__)

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from haystack.utils import convert_files_to_docs, fetch_archive_from_http
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
from haystack.schema import Document
import pdfplumber

import pandas as pd

def load_document(
    file: str,
    file_name,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> "List[Document]":
    """Extract the text of a pdf, txt or docx file into a Haystack Document.

    The file is converted with the matching Haystack converter. Haystack
    cannot extract text from every pdf, so when the converter yields no
    usable text for a pdf, the text is extracted again with pdfplumber.

    Args:
        file: Path of the file to read; handed to the Haystack converter and,
            for the fallback, to ``pdfplumber.open``.
        file_name: Object whose ``name`` attribute holds the original file
            name (presumably a Streamlit ``UploadedFile`` -- confirm against
            the caller). The extension of ``file_name.name`` selects the
            converter and the object itself is stored under ``meta["name"]``.
        encoding: Optional encoding forwarded to the converter.
        id_hash_keys: Optional attribute names used to build the document id,
            forwarded to both the converter and the returned Document.

    Returns:
        A list containing a single ``haystack.schema.Document``.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.txt`` or ``.docx``.
    """
    # Compare case-insensitively so that e.g. "REPORT.PDF" is accepted too.
    lowered_name = file_name.name.lower()
    is_pdf = lowered_name.endswith(".pdf")

    if is_pdf:
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif lowered_name.endswith(".txt"):
        converter = TextConverter()
    elif lowered_name.endswith(".docx"):
        converter = DocxToTextConverter()
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unsupported file type for {file_name.name!r}; "
            "expected a .pdf, .txt or .docx file"
        )

    logger.info("Converting %s", file_name)
    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a
    # list containing a single Document.
    converted = converter.convert(
        file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = converted.content

    # Certain pdf types come back without any usable text (empty, or only
    # whitespace / page-break characters). The fallback is limited to pdfs
    # because pdfplumber cannot open txt or docx files.
    if is_pdf and not (text and text.strip()):
        st.write("using pdfplumber")
        with pdfplumber.open(file) as pdf:
            # extract_text() may return None for pages without a text layer.
            text = " ".join(page.extract_text() or "" for page in pdf.pages)

    # Build the Document only once the final text is known: its id is derived
    # from the content at construction time, so filling in the content
    # afterwards would leave every recovered pdf with the id of the empty
    # string.
    return [
        Document(content=text, meta={"name": file_name}, id_hash_keys=id_hash_keys)
    ]