File size: 5,163 Bytes
0b16387
 
 
a2f836f
 
0b16387
e7d71ae
 
 
 
 
 
 
 
 
a2f836f
 
220da13
 
9bcbb8b
220da13
a2f836f
220da13
 
9bcbb8b
220da13
 
a2f836f
220da13
a2f836f
220da13
9bcbb8b
220da13
 
a2f836f
220da13
 
 
a2f836f
220da13
0b16387
 
a2f836f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9bcbb8b
a2f836f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220da13
a2f836f
 
 
 
 
 
 
 
 
 
 
 
 
 
2806b32
a2f836f
 
 
 
87af43d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import os
import re
from io import BytesIO

import gradio as gr
import pdfplumber
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer

"""
Documentation for get_section.

Extract the text from a section of a PDF file between 'wanted_section' and 'next_section'.
Parameters:
- path (bytes): The raw bytes of the PDF file (wrapped in BytesIO before opening).
- wanted_section (str): The section to start extracting text from.
- next_section (str): The section to stop extracting text at.
Returns:
- text (str): The extracted text from the specified section range.
"""


def get_section(path, wanted_section, next_section):
    """Extract the text of a PDF between two section headings.

    Every page is searched (case-insensitively) for ``wanted_section`` and
    ``next_section``. The text of all pages from the last page that mentions
    ``wanted_section`` through the last page that mentions ``next_section``
    is concatenated and flattened onto a single line.

    Parameters:
    - path (bytes | str | file-like): The PDF to read. Raw bytes are wrapped
      in an in-memory buffer; any other value (a filesystem path or an open
      binary file object) is handed to ``pdfplumber.open`` unchanged.
    - wanted_section (str): The heading to start extracting text from.
    - next_section (str): The heading to stop extracting text at.

    Returns:
    - text (str): The page texts joined with spaces, with newlines replaced
      by spaces. Empty if the start page lies after the end page.

    Raises:
    - ValueError: If either heading is not found on any page.
    """
    print(wanted_section)

    # Uploaded files arrive as raw bytes; pdfplumber needs a path or a stream.
    if isinstance(path, (bytes, bytearray, memoryview)):
        source = BytesIO(path)
    else:
        source = path

    # The context manager guarantees the PDF is closed even if a search or
    # text extraction raises.
    with pdfplumber.open(source) as doc:
        pages = doc.pages

        # NOTE(review): pdfplumber's Page.search interprets the pattern as a
        # regular expression by default, so the "." in a heading such as
        # "1. Kurzbeschreibung" matches any character - confirm this is fine.
        start_pages = [
            index for index, page in enumerate(pages)
            if page.search(wanted_section, return_chars=False, case=False)
        ]
        end_pages = [
            index for index, page in enumerate(pages)
            if page.search(next_section, return_chars=False, case=False)
        ]

        if not start_pages:
            raise ValueError(
                f"Section {wanted_section!r} was not found in the PDF"
            )
        if not end_pages:
            raise ValueError(
                f"Section {next_section!r} was not found in the PDF"
            )

        # The LAST matching page is used for both headings - presumably so
        # that earlier mentions (e.g. a table of contents) are skipped.
        first_page = max(start_pages)
        last_page = max(end_pages)

        # extract_text() may yield None for a page without text in some
        # pdfplumber versions; treat that as an empty page.
        page_texts = [
            pages[index].extract_text() or ""
            for index in range(first_page, last_page + 1)
        ]

    return " ".join(page_texts).replace("\n", " ")


def extract_between(big_string, start_string, end_string):
    """Return the text enclosed by two literal markers, or None.

    Both markers are matched literally (regex metacharacters are escaped).
    The search stops at the first ``end_string`` that follows the first
    ``start_string``, and the enclosed text may span several lines.

    Parameters:
    - big_string (str): The text to search in.
    - start_string (str): The marker that precedes the wanted text.
    - end_string (str): The marker that follows the wanted text.
    Returns:
    - The text between the markers (markers excluded), or None when the
      marker pair does not occur.
    """
    # Lazy capture group so the shortest enclosed span wins; DOTALL lets the
    # span cross line breaks.
    lazy_span = re.compile(
        "".join((re.escape(start_string), '(.*?)', re.escape(end_string))),
        flags=re.DOTALL,
    )
    found = lazy_span.search(big_string)
    return None if found is None else found.group(1)

def format_section1(section1_text):
    """Split the text of section 1 into named fields.

    Each field is the text found between a pair of literal German labels,
    as returned by ``extract_between``; a field is None when its label pair
    does not occur in ``section1_text``.

    Parameters:
    - section1_text (str): The flattened text of the section.
    Returns:
    - dict: Maps 'TOPIC', 'PROGRAM', 'PROJECT DESCRIPTION', 'PROJECT NAME',
      'OBJECTIVE', 'PROGRESS', 'STATUS' and 'RECOMMENDATIONS' to the
      extracted text (or None).
    """
    # field name -> (label that precedes the value, label that follows it)
    field_markers = {
        'TOPIC': ("Sektor", "EZ-Programm"),
        # Previously this reused the TOPIC markers, so PROGRAM was always a
        # copy of TOPIC; the program name sits between the "EZ-Programm"
        # label and the "EZ-Programmziel" label.
        'PROGRAM': ("EZ-Programm", "EZ-Programmziel"),
        'PROJECT DESCRIPTION': ("EZ-Programmziel", "Datum der letzten BE"),
        'PROJECT NAME': ("Modul", "Modulziel"),
        'OBJECTIVE': ("Modulziel", "Berichtszeitraum"),
        'PROGRESS': ("Zielerreichung des Moduls", "Massnahme im Zeitplan"),
        'STATUS': ("Massnahme im Zeitplan", "Risikoeinschätzung"),
        # The trailing hyphen matches the label as written here, apparently
        # hyphenated across a line break in the source document.
        'RECOMMENDATIONS': ("Vorschläge zur Modulanpas-", "Voraussichtliche"),
    }

    return {
        field: extract_between(section1_text, start_label, end_label)
        for field, (start_label, end_label) in field_markers.items()
    }

def answer_questions(text, language="de"):
    """Answer a fixed list of German questions about ``text``.

    Loads the "deepset/gelectra-large-germanquad" question-answering model
    and runs every question against ``text`` as context.

    Parameters:
    - text (str): The context the answers are extracted from.
    - language (str): Currently unused; kept so existing callers keep
      working. The model and the questions are fixed.
    Returns:
    - dict: Maps each question string to the answer text the pipeline
      returned for it.
    """
    # Load the extractive question-answering model and its tokenizer.
    # NOTE: this happens on every call.
    model_name = "deepset/gelectra-large-germanquad"
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Initialize the QA pipeline
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
    questions = [
        "Welches ist das Titel des Moduls?",
        "Welches ist das Sektor oder das Kernthema?",
        "Welches ist das Land?",
        # Trailing comma matters: without it, re-enabling a question below
        # would silently concatenate it onto this string.
        "Zu welchem Program oder EZ-Programm gehort das Projekt?",
        # Disabled candidate questions:
        #"Welche Durchführungsorganisation aus den 4 Varianten 'giz', 'kfw', 'ptb' und 'bgr' implementiert das Projekt?"
        # "In dem Dokument was steht bei Sektor?",
        # "In dem Dokument was steht von 'EZ-Programm' bis 'EZ-Programmziel'?",
        # "In dem Dokument was steht bei EZ-Programmziel?",
        # "In dem Dokument in dem Abschnitt '1. Kurzbeschreibung' was steht bei Modul?",
        # "In dem Dokument was steht bei Zielerreichung des Moduls?",
        # "In dem Dokument in dem Abschnitt '1. Kurzbeschreibung' was steht bei Maßnahme im Zeitplan?",
        # "In dem Dokument was steht bei Vorschläge zur Modulanpassung?",
        # "In dem Dokument in dem Abschnitt 'Anlage 1: Wirkungsmatrix des Moduls' was steht unter Laufzeit als erstes Datum?",
        # "In dem Dokument in dem Abschnitt 'Anlage 1: Wirkungsmatrix des Moduls' was steht unter Laufzeit als zweites Datum?"
    ]

    # The result dict was never created before, so the first assignment
    # raised NameError; build it here, one entry per question.
    answers_dict = {}
    for question in questions:
        result = qa_pipeline(question=question, context=text)
        answers_dict[question] = result['answer']
    return answers_dict


def process_pdf(path):
    """Return the 'TOPIC' field of section "1. Kurzbeschreibung" of a PDF.

    The text between the headings "1. Kurzbeschreibung" and
    "2. Einordnung des Moduls" is extracted with ``get_section`` and split
    into fields with ``format_section1``.

    Parameters:
    - path: The PDF, passed through to ``get_section`` unchanged.
    Returns:
    - The 'TOPIC' value produced by ``format_section1``; None when its
      labels are not found in the section text.
    """
    section1_text = get_section(
        path, "1. Kurzbeschreibung", "2. Einordnung des Moduls"
    )

    # The previous version returned an undefined name (NameError) and never
    # called format_section1. It also ran the QA model and discarded the
    # answers; that unused call is dropped.
    section1_fields = format_section1(section1_text)
    return section1_fields['TOPIC']

def get_first_page_text(file_data):
    """Return the text of the first page of a PDF held in memory.

    Parameters:
    - file_data (bytes): The raw bytes of the PDF file.
    Returns:
    - The result of ``extract_text()`` on the first page, or None when the
      document has no pages.
    """
    # The context manager closes the PDF once the text has been read.
    with pdfplumber.open(BytesIO(file_data)) as doc:
        if not doc.pages:
            return None
        return doc.pages[0].extract_text()

if __name__ == "__main__":
    
    # Define the Gradio interface
    # iface = gr.Interface(fn=process_pdf,