File size: 11,738 Bytes
08b59ae
 
 
 
 
22db5e3
 
 
 
 
08b59ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf1f436
08b59ae
 
 
 
 
 
 
 
 
 
 
 
bf1f436
 
 
08b59ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50d3706
 
22db5e3
 
 
 
 
862aea5
6d27c94
22db5e3
 
 
 
 
 
 
3d407d8
22db5e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50d3706
22db5e3
 
 
 
 
8ccce28
1413861
8ccce28
 
 
 
22db5e3
 
08b59ae
 
 
 
22db5e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d407d8
 
22db5e3
 
 
 
 
 
 
 
 
 
 
 
08b59ae
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import os
import gradio as gr
from transformers import pipeline
import spacy
import lib.read_pdf
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import io
# Initialize spaCy model
# The small English pipeline is loaded once at import time; `split_in_sentences`
# reuses this shared `nlp` object for every call.
nlp = spacy.load('en_core_web_sm')
# NOTE(review): a rule-based 'sentencizer' is appended to the loaded pipeline.
# Presumably the packaged model already sets sentence boundaries, which would
# make this component redundant — confirm before removing.
nlp.add_pipe('sentencizer')

def split_in_sentences(text):
    """Segment *text* into sentences using the module-level spaCy pipeline.

    Returns a list with one string per detected sentence, each stripped of
    leading and trailing whitespace.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(str(span).strip())
    return sentences

def make_spans(text, results):
    """Pair every sentence of *text* with the label of its classifier result.

    Args:
        text: The passage that was classified sentence by sentence.
        results: Sequence of dicts, each carrying a ``'label'`` key, in the
            same order as the sentences of *text*.

    Returns:
        A list of ``(sentence, label)`` tuples; pairing stops at the shorter
        of the two sequences.
    """
    labels = []
    for entry in results:
        labels.append(entry['label'])
    return list(zip(split_in_sentences(text), labels))

# Initialize pipelines
# The three Hugging Face pipelines are built once at import time and shared by
# the helper functions below.
# Summarization pipeline used by `summarize_text`.
summarizer = pipeline("summarization", model="human-centered-summarization/financial-summarization-pegasus")
# Sentiment pipeline used by `text_to_sentiment` and `fin_ext`.
fin_model = pipeline("sentiment-analysis", model='yiyanghkust/finbert-tone', tokenizer='yiyanghkust/finbert-tone')
# Second sentiment pipeline (different checkpoint) used by `fin_ext_bis`.
fin_model_bis = pipeline("sentiment-analysis", model='ProsusAI/finbert', tokenizer='ProsusAI/finbert')

def summarize_text(text):
    """Summarise *text* with the module-level ``summarizer`` pipeline.

    Args:
        text: The passage to summarise.

    Returns:
        The ``summary_text`` of the first pipeline result, or an empty string
        when *text* is empty or contains only whitespace.
    """
    if not text or not text.strip():
        # Nothing to summarise; avoid sending an empty prompt to the model.
        return ""
    # Report paragraphs can be longer than the model accepts; let the
    # tokenizer truncate them instead of failing on over-long input.
    resp = summarizer(text, truncation=True)
    return resp[0]['summary_text']

def text_to_sentiment(text):
    """Classify the overall tone of *text* with the ``fin_model`` pipeline.

    Args:
        text: The passage to classify as a whole.

    Returns:
        The ``label`` string of the first pipeline result.
    """
    # Whole paragraphs can exceed the classifier's maximum input length;
    # truncate rather than fail on over-long input.
    sentiment = fin_model(text, truncation=True)[0]["label"]
    return sentiment

def fin_ext(text):
    """Classify every sentence of *text* with the ``fin_model`` pipeline.

    Args:
        text: The passage to analyse sentence by sentence.

    Returns:
        A list of ``(sentence, label)`` tuples, one per sentence; an empty
        list when no sentence is found.
    """
    # Split once and reuse the sentences for both classification and pairing,
    # instead of running the spaCy pipeline a second time for the pairing.
    sentences = split_in_sentences(text)
    if not sentences:
        return []
    # Truncate over-long "sentences" (e.g. badly segmented text) instead of
    # failing on the model's input length limit.
    results = fin_model(sentences, truncation=True)
    return [(sentence, res['label']) for sentence, res in zip(sentences, results)]
def fin_ext_bis(text):
    """Classify every sentence of *text* with the ``fin_model_bis`` pipeline.

    Args:
        text: The passage to analyse sentence by sentence.

    Returns:
        A list of ``(sentence, label)`` tuples, one per sentence; an empty
        list when no sentence is found.
    """
    # Split once and reuse the sentences for both classification and pairing,
    # instead of running the spaCy pipeline a second time for the pairing.
    sentences = split_in_sentences(text)
    if not sentences:
        return []
    # Truncate over-long "sentences" (e.g. badly segmented text) instead of
    # failing on the model's input length limit.
    results = fin_model_bis(sentences, truncation=True)
    return [(sentence, res['label']) for sentence, res in zip(sentences, results)]

def extract_and_summarize(pdf1, pdf2):
    """Extract the paragraphs of the risk section from two PDF reports.

    Despite its name this function performs no summarisation: for each file it
    reads the paragraphs, locates the span that starts at "Main risks to" and
    ends at the first annex/appendix style heading, and splits that span into
    paragraphs via the ``lib.read_pdf`` helpers.

    Args:
        pdf1: File name of the first PDF inside ``PDF_FOLDER``.
        pdf2: File name of the second PDF inside ``PDF_FOLDER``.

    Returns:
        A pair ``(paragraphs_1, paragraphs_2)``; both are empty lists when
        either file name is missing.
    """
    if not (pdf1 and pdf2):
        return [], []

    start_keyword = "Main risks to"
    end_keywords = ["4. Appendix", "Annex:", "4. Annex", "Detailed tables", "ACKNOWLEDGEMENTS", "STATISTICAL ANNEX", "PROSPECTS BY MEMBER STATES"]

    sections = []
    for pdf_name in (pdf1, pdf2):
        pdf_path = os.path.join(PDF_FOLDER, pdf_name)

        # Extract and format paragraphs
        paragraphs = lib.read_pdf.extract_and_format_paragraphs(pdf_path)

        # Keep only the text between the start keyword and the first end keyword
        start_index, end_index = lib.read_pdf.find_text_range(paragraphs, start_keyword, end_keywords)
        relevant_text = lib.read_pdf.extract_relevant_text(paragraphs, start_index, end_index)

        sections.append(lib.read_pdf.split_text_into_paragraphs(relevant_text, 0))

    paragraphs_1, paragraphs_2 = sections
    return paragraphs_1, paragraphs_2

# Gradio interface setup
# Relative folder holding the input files: it is listed to fill the PDF and
# Excel dropdowns and joined with the selected file names before opening them.
PDF_FOLDER = "data"

def get_pdf_files(folder):
    """Return the names of the PDF files located directly inside *folder*.

    The extension check is case-insensitive (``.pdf`` and ``.PDF`` both match)
    and the names are returned sorted so the dropdown order is stable across
    runs and platforms.

    Args:
        folder: Directory to list.

    Returns:
        A sorted list of file names (not full paths).

    Raises:
        OSError: Propagated from ``os.listdir`` when *folder* cannot be read.
    """
    return sorted(name for name in os.listdir(folder) if name.lower().endswith('.pdf'))

def show(name):
    """Return *name* rendered as a string (identity for strings)."""
    return "{}".format(name)

def get_excel_files(folder):
    """Return the names of the Excel workbooks located directly inside *folder*.

    The ``.xlsx`` extension check is case-insensitive, the temporary ``~$``
    lock files Excel creates next to an open workbook are skipped (they are
    not readable workbooks), and the names are returned sorted so the dropdown
    order is stable.

    Args:
        folder: Directory to list.

    Returns:
        A sorted list of file names (not full paths).

    Raises:
        OSError: Propagated from ``os.listdir`` when *folder* cannot be read.
    """
    return sorted(
        name
        for name in os.listdir(folder)
        if name.lower().endswith('.xlsx') and not name.startswith('~$')
    )

def get_sheet_names(file):
    """Build a Gradio update listing the sheets of a workbook in ``PDF_FOLDER``.

    Args:
        file: File name of the workbook, or a falsy value when the dropdown
            selection has been cleared.

    Returns:
        ``gr.update(choices=...)`` with the workbook's sheet names, or with an
        empty list when no file is selected.
    """
    if not file:
        # A cleared dropdown passes None; there is nothing to open.
        return gr.update(choices=[])
    # Use the workbook as a context manager so its file handle is closed.
    with pd.ExcelFile(os.path.join(PDF_FOLDER, file)) as xls:
        sheet_names = list(xls.sheet_names)
    return gr.update(choices=sheet_names)

def _year_from_filename(file_name):
    """Return the first four-digit number in *file_name* as an int."""
    match = re.search(r'(\d{4})', file_name or '')
    if match is None:
        raise ValueError(f"Could not find a four-digit year in file name {file_name!r}.")
    return int(match.group(1))


def _load_scenario_table(file_name, sheet_name, year):
    """Load one scenario sheet and name its columns relative to *year*."""
    # Load the Excel file
    df = pd.read_excel(os.path.join(PDF_FOLDER, file_name), sheet_name=sheet_name, index_col=0)

    # Define expected columns based on the report year
    historical_col = f'Historical {year - 1}'
    baseline_cols = [f'Baseline {year}', f'Baseline {year + 1}', f'Baseline {year + 2}']
    adverse_cols = [f'Adverse {year}', f'Adverse {year + 1}', f'Adverse {year + 2}']
    level_deviation_col = f'Level Deviation {year + 2}'

    # Drop the first four rows and reset the index
    df = df.iloc[4:].reset_index(drop=True)

    new_columns = ['Country', 'Code', historical_col] + baseline_cols + adverse_cols + ['Adverse Cumulative', 'Adverse Minimum', level_deviation_col]

    # Ensure the number of columns matches
    if len(df.columns) != len(new_columns):
        raise ValueError(f"Expected {len(new_columns)} columns, but found {len(df.columns)} columns in the data.")
    df.columns = new_columns
    return df


def process_and_compare(file1, sheet1, file2, sheet2):
    """Plot the per-country change in adverse cumulative growth between two reports.

    Both workbooks are read from ``PDF_FOLDER``; the report year is taken from
    the first four-digit number in each file name. The tables are joined on
    ``Country`` and a bar chart of ``file2 - file1`` for the
    ``Adverse Cumulative`` column is written to ``output/plot.png``.

    Args:
        file1: File name of the first workbook.
        sheet1: Sheet to read from *file1*.
        file2: File name of the second workbook.
        sheet2: Sheet to read from *file2*.

    Returns:
        The path of the saved PNG file.

    Raises:
        ValueError: If a file name contains no four-digit year, or a sheet
            does not have the expected number of columns.
    """
    year1 = _year_from_filename(file1)
    year2 = _year_from_filename(file2)

    # Process both files
    df1 = _load_scenario_table(file1, sheet1, year1)
    df2 = _load_scenario_table(file2, sheet2, year2)

    # Merge dataframes on 'Country'. The suffixes identify the source file
    # rather than the year: each suffix must be attached to the frame it
    # describes (left = file2, right = file1), and file-based suffixes stay
    # distinct even when both reports carry the same year.
    merged_df = pd.merge(df2, df1, on='Country', suffixes=('_file2', '_file1'))

    # Coerce each side to numbers before subtracting so stray text cells become
    # NaN instead of breaking the subtraction.
    adverse_file2 = pd.to_numeric(merged_df['Adverse Cumulative_file2'], errors='coerce')
    adverse_file1 = pd.to_numeric(merged_df['Adverse Cumulative_file1'], errors='coerce')
    merged_df['Difference adverse cumulative growth'] = adverse_file2 - adverse_file1
    merged_df['Country'] = merged_df['Country'].astype(str)

    # Create bar plot with one colour per country; the palette is cycled so
    # that bars and legend entries stay aligned beyond its 20 colours.
    fig, ax = plt.subplots(figsize=(12, 8))
    palette = plt.get_cmap('tab20').colors
    countries = list(merged_df['Country'])
    bar_colors = [palette[i % len(palette)] for i in range(len(countries))]

    ax.bar(merged_df['Country'], merged_df['Difference adverse cumulative growth'], color=bar_colors)

    # Add a legend
    handles = [patches.Patch(color=color, label=country) for color, country in zip(bar_colors, countries)]
    ax.legend(handles=handles, title='Countries', bbox_to_anchor=(1.05, 1), loc='upper left')

    ax.set_title(f'Histogram of Difference between Adverse cumulative growth of {year2} and {year1} for {sheet1}')
    ax.set_xlabel('Country')
    ax.set_ylabel('Difference')
    plt.xticks(rotation=90)

    # Save plot to a file, creating the output folder on first use
    file_path = 'output/plot.png'
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    fig.savefig(file_path, format='png', bbox_inches='tight')
    plt.close(fig)

    return file_path


# Full paragraph texts from the most recent extraction. The dropdown labels
# only carry a truncated preview, so the analysis callbacks look the complete
# text up here by its position.
stored_paragraphs_1 = []
stored_paragraphs_2 = []


def _selected_paragraph(choice, paragraphs):
    """Return the full paragraph behind a "Paragraph N: ..." label, or None."""
    if not choice:
        return None
    match = re.match(r'Paragraph (\d+):', choice)
    if match is None:
        return None
    index = int(match.group(1)) - 1
    if 0 <= index < len(paragraphs):
        return paragraphs[index]
    return None


def _paragraph_handler(get_paragraphs, analyse):
    """Build a callback that runs *analyse* on the paragraph a label refers to."""
    def handler(choice):
        # The paragraph list is fetched at call time because the module-level
        # lists are rebound on every extraction.
        paragraph = _selected_paragraph(choice, get_paragraphs())
        if paragraph is None:
            # Nothing (valid) selected: clear the output component.
            return None
        return analyse(paragraph)
    return handler


# Analysis callbacks wired to the buttons below, one set per PDF.
process_paragraph_1_sum = _paragraph_handler(lambda: stored_paragraphs_1, summarize_text)
process_paragraph_1_sent = _paragraph_handler(lambda: stored_paragraphs_1, text_to_sentiment)
process_paragraph_1_sent_tone = _paragraph_handler(lambda: stored_paragraphs_1, fin_ext)
process_paragraph_1_sent_tone_bis = _paragraph_handler(lambda: stored_paragraphs_1, fin_ext_bis)
process_paragraph_2_sum = _paragraph_handler(lambda: stored_paragraphs_2, summarize_text)
process_paragraph_2_sent = _paragraph_handler(lambda: stored_paragraphs_2, text_to_sentiment)
process_paragraph_2_sent_tone = _paragraph_handler(lambda: stored_paragraphs_2, fin_ext)
process_paragraph_2_sent_tone_bis = _paragraph_handler(lambda: stored_paragraphs_2, fin_ext_bis)


def update_paragraphs(pdf1_name, pdf2_name):
    """Extract both PDFs and refresh the two paragraph dropdowns."""
    global stored_paragraphs_1, stored_paragraphs_2
    stored_paragraphs_1, stored_paragraphs_2 = extract_and_summarize(pdf1_name, pdf2_name)
    updated_dropdown_1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
    updated_dropdown_2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
    # Reset the selection as well: a label kept from a previous extraction
    # would point at a paragraph of the old document.
    return gr.update(choices=updated_dropdown_1, value=None), gr.update(choices=updated_dropdown_2, value=None)


def update_sheets(file):
    """Refresh the sheet dropdown for the selected workbook."""
    if not file:
        # The selection was cleared; there is no workbook to inspect.
        return gr.update(choices=[])
    return get_sheet_names(file)


with gr.Blocks() as demo:
    with gr.Tab("Financial Report Text Analysis"):
        gr.Markdown("## Financial Report Paragraph Selection and Analysis on adverse macro-economy scenario")

        with gr.Row():
            # Choose the two PDFs among the files found in PDF_FOLDER
            with gr.Column():
                pdf1 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 1")
                pdf2 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 2")

            with gr.Column():
                extract_btn = gr.Button("Extract and Display Paragraphs")
                paragraph_1_dropdown = gr.Dropdown(label="Select Paragraph from PDF 1")
                paragraph_2_dropdown = gr.Dropdown(label="Select Paragraph from PDF 2")

                extract_btn.click(fn=update_paragraphs, inputs=[pdf1, pdf2], outputs=[paragraph_1_dropdown, paragraph_2_dropdown])

        with gr.Row():
            # Process the selected paragraph from PDF 1
            with gr.Column():
                gr.Markdown("### PDF 1 Analysis")
                selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content", lines=4)
                # Listen on the dropdown: the textbox must follow the selection,
                # not react to its own edits.
                paragraph_1_dropdown.change(show, paragraph_1_dropdown, selected_paragraph_1)
                summarize_btn1 = gr.Button("Summarize Text from PDF 1")
                summary_textbox_1 = gr.Textbox(label="Summary for PDF 1", lines=2)
                summarize_btn1.click(fn=process_paragraph_1_sum, inputs=paragraph_1_dropdown, outputs=summary_textbox_1)
                sentiment_btn1 = gr.Button("Classify Financial Tone from PDF 1")
                sentiment_textbox_1 = gr.Textbox(label="Classification for PDF 1", lines=1)
                sentiment_btn1.click(fn=process_paragraph_1_sent, inputs=paragraph_1_dropdown, outputs=sentiment_textbox_1)
                analyze_btn1 = gr.Button("Analyze Financial Tone on each sentence with yiyanghkust/finbert-tone")
                fin_spans_1 = gr.HighlightedText(label="Financial Tone Analysis for PDF 1")
                analyze_btn1.click(fn=process_paragraph_1_sent_tone, inputs=paragraph_1_dropdown, outputs=fin_spans_1)
                analyze_btn1_ = gr.Button("Analyze Financial Tone on each sentence with ProsusAI/finbert")
                fin_spans_1_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 1 bis")
                analyze_btn1_.click(fn=process_paragraph_1_sent_tone_bis, inputs=paragraph_1_dropdown, outputs=fin_spans_1_)

            # Process the selected paragraph from PDF 2
            with gr.Column():
                gr.Markdown("### PDF 2 Analysis")
                selected_paragraph_2 = gr.Textbox(label="Selected Paragraph 2 Content", lines=4)
                paragraph_2_dropdown.change(show, paragraph_2_dropdown, selected_paragraph_2)
                summarize_btn2 = gr.Button("Summarize Text from PDF 2")
                summary_textbox_2 = gr.Textbox(label="Summary for PDF 2", lines=2)
                summarize_btn2.click(fn=process_paragraph_2_sum, inputs=paragraph_2_dropdown, outputs=summary_textbox_2)
                sentiment_btn2 = gr.Button("Classify Financial Tone from PDF 2")
                sentiment_textbox_2 = gr.Textbox(label="Classification for PDF 2", lines=1)
                sentiment_btn2.click(fn=process_paragraph_2_sent, inputs=paragraph_2_dropdown, outputs=sentiment_textbox_2)
                analyze_btn2 = gr.Button("Analyze Financial Tone on each sentence with yiyanghkust/finbert-tone")
                fin_spans_2 = gr.HighlightedText(label="Financial Tone Analysis for PDF 2")
                analyze_btn2.click(fn=process_paragraph_2_sent_tone, inputs=paragraph_2_dropdown, outputs=fin_spans_2)
                analyze_btn2_ = gr.Button("Analyze Financial Tone on each sentence with ProsusAI/finbert")
                fin_spans_2_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 2 bis")
                analyze_btn2_.click(fn=process_paragraph_2_sent_tone_bis, inputs=paragraph_2_dropdown, outputs=fin_spans_2_)

    with gr.Tab("Financial Report Table Analysis"):
        gr.Markdown("## Excel Data Comparison")

        with gr.Row():
            with gr.Column():
                file1 = gr.Dropdown(choices=get_excel_files(PDF_FOLDER), label="Select Excel File 1")
                file2 = gr.Dropdown(choices=get_excel_files(PDF_FOLDER), label="Select Excel File 2")
                sheet = gr.Dropdown(choices=[], label="Select Sheet for File 1 and 2")

            with gr.Column():
                result = gr.Image(label="Comparison pLot")

        # The sheet list is taken from file 1; the same sheet name is then
        # used for both workbooks.
        file1.change(fn=update_sheets, inputs=file1, outputs=sheet)

        compare_btn = gr.Button("Compare Data")
        compare_btn.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=result)

demo.launch()