Spaces:
Running
Running
File size: 11,738 Bytes
08b59ae 22db5e3 08b59ae bf1f436 08b59ae bf1f436 08b59ae 50d3706 22db5e3 862aea5 6d27c94 22db5e3 3d407d8 22db5e3 50d3706 22db5e3 8ccce28 1413861 8ccce28 22db5e3 08b59ae 22db5e3 3d407d8 22db5e3 08b59ae |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 |
import os
import gradio as gr
from transformers import pipeline
import spacy
import lib.read_pdf
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import io
# Initialize spaCy model
# Loaded once at import time; split_in_sentences() reuses this module-level pipeline.
nlp = spacy.load('en_core_web_sm')
# Add a 'sentencizer' component so that documents expose sentences through doc.sents.
# NOTE(review): 'en_core_web_sm' may already assign sentence boundaries through its
# own components — confirm this extra pipe is actually needed.
nlp.add_pipe('sentencizer')
def split_in_sentences(text):
    """Split *text* into sentences with the module-level spaCy pipeline.

    Returns a list of sentence strings, each with surrounding whitespace
    stripped.
    """
    sentences = []
    for sentence in nlp(text).sents:
        sentences.append(str(sentence).strip())
    return sentences
def make_spans(text, results):
    """Pair every sentence of *text* with the label of its classifier result.

    *results* is an iterable of mappings that each carry a 'label' key, in
    sentence order. Returns a list of ``(sentence, label)`` tuples; pairing
    stops at the shorter of the two sequences.
    """
    labels = []
    for result in results:
        labels.append(result['label'])
    sentences = split_in_sentences(text)
    return list(zip(sentences, labels))
# Initialize pipelines
# All three Hugging Face pipelines are built once at import time and shared
# by the helper functions below.

# Summarization model used by summarize_text().
summarizer = pipeline(
    task="summarization",
    model="human-centered-summarization/financial-summarization-pegasus",
)

# Tone classifier used by text_to_sentiment() and fin_ext().
fin_model = pipeline(
    task="sentiment-analysis",
    model="yiyanghkust/finbert-tone",
    tokenizer="yiyanghkust/finbert-tone",
)

# Second tone classifier used by fin_ext_bis().
fin_model_bis = pipeline(
    task="sentiment-analysis",
    model="ProsusAI/finbert",
    tokenizer="ProsusAI/finbert",
)
def summarize_text(text):
    """Summarize *text* with the module-level ``summarizer`` pipeline.

    Returns the 'summary_text' of the first pipeline result. When *text* is
    ``None`` or a blank string (e.g. nothing is selected in the UI) the model
    is not invoked and an empty string is returned instead.
    """
    # Nothing to summarize: avoid feeding None / "" to the model.
    if text is None or (isinstance(text, str) and not text.strip()):
        return ""
    resp = summarizer(text)
    return resp[0]['summary_text']
def text_to_sentiment(text):
    """Classify *text* as a whole with the module-level ``fin_model`` pipeline.

    Returns the 'label' of the first pipeline result. When *text* is ``None``
    or a blank string (e.g. nothing is selected in the UI) the model is not
    invoked and an empty string is returned instead.
    """
    # Nothing to classify: avoid feeding None / "" to the model.
    if text is None or (isinstance(text, str) and not text.strip()):
        return ""
    return fin_model(text)[0]["label"]
def fin_ext(text):
    """Label every sentence of *text* with the ``fin_model`` classifier.

    Returns a list of ``(sentence, label)`` tuples, one per sentence, in
    sentence order. Returns an empty list when *text* is ``None``, blank, or
    yields no sentences, without invoking the model.
    """
    if not text or not text.strip():
        return []
    # Split once and reuse the sentences for both classification and pairing,
    # instead of running the sentence splitter a second time.
    sentences = split_in_sentences(text)
    if not sentences:
        return []
    results = fin_model(sentences)
    return list(zip(sentences, [result['label'] for result in results]))
def fin_ext_bis(text):
    """Label every sentence of *text* with the ``fin_model_bis`` classifier.

    Returns a list of ``(sentence, label)`` tuples, one per sentence, in
    sentence order. Returns an empty list when *text* is ``None``, blank, or
    yields no sentences, without invoking the model.
    """
    if not text or not text.strip():
        return []
    # Split once and reuse the sentences for both classification and pairing,
    # instead of running the sentence splitter a second time.
    sentences = split_in_sentences(text)
    if not sentences:
        return []
    results = fin_model_bis(sentences)
    return list(zip(sentences, [result['label'] for result in results]))
def extract_and_summarize(pdf1, pdf2):
    """Extract the paragraphs of the risk section from two PDF reports.

    *pdf1* and *pdf2* are file names inside ``PDF_FOLDER``. For each file the
    text is extracted, cut down to the range found between the start keyword
    and the first matching end keyword, and split into paragraphs via the
    helpers in ``lib.read_pdf``.

    Returns a pair ``(paragraphs_1, paragraphs_2)``; both are empty lists when
    either file name is missing.
    """
    if not (pdf1 and pdf2):
        return [], []

    start_keyword = "Main risks to"
    end_keywords = ["4. Appendix", "Annex:", "4. Annex", "Detailed tables", "ACKNOWLEDGEMENTS", "STATISTICAL ANNEX", "PROSPECTS BY MEMBER STATES"]

    # Each stage is applied to both files before moving on to the next stage.
    paths = [os.path.join(PDF_FOLDER, name) for name in (pdf1, pdf2)]

    # Extract and format paragraphs
    texts = [lib.read_pdf.extract_and_format_paragraphs(path) for path in paths]

    # Locate the section of interest in each document.
    ranges = [
        lib.read_pdf.find_text_range(content, start_keyword, end_keywords)
        for content in texts
    ]

    # Keep only the located range.
    relevant = [
        lib.read_pdf.extract_relevant_text(content, start_index, end_index)
        for content, (start_index, end_index) in zip(texts, ranges)
    ]

    # Split each remaining text into paragraphs.
    first, second = [
        lib.read_pdf.split_text_into_paragraphs(section, 0)
        for section in relevant
    ]
    return first, second
# Gradio interface setup
# Folder that holds the input files offered in the UI. Despite the name it is
# used for both the PDF reports and the Excel workbooks (see get_sheet_names
# and process_and_compare).
PDF_FOLDER = "data"
def get_pdf_files(folder):
    """Return the names of the PDF files in *folder*, sorted alphabetically.

    The extension check is case-insensitive, so ``report.PDF`` is listed too.
    Sorting keeps the dropdown order stable, since ``os.listdir`` returns
    entries in arbitrary order.
    """
    return sorted(f for f in os.listdir(folder) if f.lower().endswith('.pdf'))
def show(name):
    """Return *name* rendered as a string (used to echo a value into a textbox)."""
    return "{}".format(name)
def get_excel_files(folder):
    """Return the names of the ``.xlsx`` workbooks in *folder*, sorted alphabetically.

    The extension check is case-insensitive. Names starting with ``~$`` are
    skipped: those are the temporary lock files Excel creates next to an open
    workbook and they cannot be read as workbooks. Sorting keeps the dropdown
    order stable, since ``os.listdir`` returns entries in arbitrary order.
    """
    return sorted(
        f for f in os.listdir(folder)
        if f.lower().endswith('.xlsx') and not f.startswith('~$')
    )
def get_sheet_names(file):
    """Return a Gradio update that sets the dropdown choices to the sheets of *file*.

    *file* is a workbook name inside ``PDF_FOLDER``. When no file is selected
    (``None`` or empty string) the choices are cleared instead of raising.
    """
    if not file:
        return gr.update(choices=[])
    # Use the workbook as a context manager so its file handle is closed as
    # soon as the sheet names have been read.
    with pd.ExcelFile(os.path.join(PDF_FOLDER, file)) as xls:
        sheet_names = list(xls.sheet_names)
    return gr.update(choices=sheet_names)
def process_and_compare(file1, sheet1, file2, sheet2):
    """Plot, per country, the change in 'Adverse Cumulative' between two workbooks.

    Both workbooks are read from ``PDF_FOLDER``. The scenario year is taken
    from the first four-digit number in each file name and is used to name the
    columns of the loaded sheet. The plotted value is
    ``Adverse Cumulative(file2) - Adverse Cumulative(file1)`` for every country
    present in both sheets.

    Args:
        file1: Name of the first workbook (must contain a four-digit year).
        sheet1: Sheet to read from *file1*; also used in the plot title.
        file2: Name of the second workbook (must contain a four-digit year).
        sheet2: Sheet to read from *file2*.

    Returns:
        The path of the saved PNG ('output/plot.png').

    Raises:
        ValueError: if a file name has no four-digit year, a sheet is not
            selected, or a sheet does not have the expected number of columns.
    """

    def extract_year(file_name):
        """Return the first four-digit number in *file_name* as an int."""
        match = re.search(r'(\d{4})', file_name or '')
        if match is None:
            raise ValueError(f"Could not find a four-digit year in file name {file_name!r}.")
        return int(match.group(1))

    def process_file(file_path, sheet_name):
        """Load one sheet and rename its columns according to the file's year."""
        if not sheet_name:
            # sheet_name=None would make pandas return a dict of every sheet.
            raise ValueError(f"No sheet selected for {file_path!r}.")

        # Extract year from file name
        year = extract_year(file_path)

        # Load the Excel file
        df = pd.read_excel(os.path.join(PDF_FOLDER, file_path), sheet_name=sheet_name, index_col=0)

        # Define expected columns based on extracted year
        historical_col = f'Historical {year - 1}'
        baseline_cols = [f'Baseline {year}', f'Baseline {year + 1}', f'Baseline {year + 2}']
        adverse_cols = [f'Adverse {year}', f'Adverse {year + 1}', f'Adverse {year + 2}']
        level_deviation_col = f'Level Deviation {year + 2}'

        # Drop the first four rows and reset index
        df = df.iloc[4:].reset_index(drop=True)

        # Define the new column names
        new_columns = ['Country', 'Code', historical_col] + baseline_cols + adverse_cols + ['Adverse Cumulative', 'Adverse Minimum', level_deviation_col]

        # Ensure the number of columns matches
        if len(df.columns) != len(new_columns):
            raise ValueError(f"Expected {len(new_columns)} columns, but found {len(df.columns)} columns in the data.")
        df.columns = new_columns
        return df

    # Validate the file names before doing any expensive work.
    year1 = extract_year(file1)
    year2 = extract_year(file2)

    # Process both files
    df1 = process_file(file1, sheet1)
    df2 = process_file(file2, sheet2)

    # Merge on 'Country' using only the column that is compared. The columns
    # are renamed per file (not per year) so that each value is unambiguously
    # tied to its source file, even when both files carry the same year.
    # df2 stays the left frame so the country order follows file2.
    second = df2[['Country', 'Adverse Cumulative']].rename(columns={'Adverse Cumulative': 'cumulative_file2'})
    first = df1[['Country', 'Adverse Cumulative']].rename(columns={'Adverse Cumulative': 'cumulative_file1'})
    merged_df = pd.merge(second, first, on='Country')

    # Coerce to numbers before subtracting so stray text cells become NaN
    # instead of breaking the subtraction.
    merged_df['Difference adverse cumulative growth'] = (
        pd.to_numeric(merged_df['cumulative_file2'], errors='coerce')
        - pd.to_numeric(merged_df['cumulative_file1'], errors='coerce')
    )
    merged_df['Country'] = merged_df['Country'].astype(str)

    # Create bar plot with one colour per country
    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        palette = plt.get_cmap('tab20').colors
        countries = merged_df['Country'].tolist()
        # Cycle through the palette explicitly so bars and legend entries stay
        # aligned when there are more countries than palette colours.
        bar_colors = [palette[i % len(palette)] for i in range(len(countries))]
        ax.bar(countries, merged_df['Difference adverse cumulative growth'], color=bar_colors)

        # Add a legend
        handles = [patches.Patch(color=color, label=country) for color, country in zip(bar_colors, countries)]
        ax.legend(handles=handles, title='Countries', bbox_to_anchor=(1.05, 1), loc='upper left')
        ax.set_title(f'Histogram of Difference between Adverse cumulative growth of {year2} and {year1} for {sheet1}')
        ax.set_xlabel('Country')
        ax.set_ylabel('Difference')
        plt.xticks(rotation=90)

        # Save plot to a file, creating the output folder on first use
        file_path = 'output/plot.png'
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        fig.savefig(file_path, format='png', bbox_inches='tight')
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    return file_path
# Full paragraph texts produced by the last "Extract and Display Paragraphs"
# click. The paragraph dropdowns only carry truncated labels, so the analysis
# callbacks look the full text up in these lists.
stored_paragraphs_1 = []
stored_paragraphs_2 = []


def _resolve_paragraph(choice, paragraphs):
    """Map a dropdown label ("Paragraph N: ...") back to the full paragraph text.

    Raises ``gr.Error`` (shown to the user by Gradio) when nothing is selected
    or the label does not point at an entry of *paragraphs*.
    """
    if not choice:
        raise gr.Error("Please select a paragraph first.")
    match = re.match(r"Paragraph (\d+):", choice)
    index = int(match.group(1)) - 1 if match else -1
    if not 0 <= index < len(paragraphs):
        raise gr.Error("The selected paragraph is no longer available; extract the paragraphs again.")
    return paragraphs[index]


# NOTE(review): the source this block was recovered from had lost its
# indentation; the nesting of rows/columns below is a reconstruction — confirm
# the layout against the running app.
with gr.Blocks() as demo:
    with gr.Tab("Financial Report Text Analysis"):
        gr.Markdown("## Financial Report Paragraph Selection and Analysis on adverse macro-economy scenario")
        with gr.Row():
            # Upload PDFs
            with gr.Column():
                pdf1 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 1")
                pdf2 = gr.Dropdown(choices=get_pdf_files(PDF_FOLDER), label="Select PDF 2")
            with gr.Column():
                b1 = gr.Button("Extract and Display Paragraphs")
                paragraph_1_dropdown = gr.Dropdown(label="Select Paragraph from PDF 1")
                paragraph_2_dropdown = gr.Dropdown(label="Select Paragraph from PDF 2")

                def update_paragraphs(pdf1, pdf2):
                    """Extract paragraphs from both PDFs, store them, and refresh both dropdowns."""
                    global stored_paragraphs_1, stored_paragraphs_2
                    stored_paragraphs_1, stored_paragraphs_2 = extract_and_summarize(pdf1, pdf2)
                    updated_dropdown_1 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_1)]
                    updated_dropdown_2 = [f"Paragraph {i+1}: {p[:100]}..." for i, p in enumerate(stored_paragraphs_2)]
                    return gr.update(choices=updated_dropdown_1), gr.update(choices=updated_dropdown_2)

                b1.click(fn=update_paragraphs, inputs=[pdf1, pdf2], outputs=[paragraph_1_dropdown, paragraph_2_dropdown])

        with gr.Row():
            # Process the selected paragraph from PDF 1
            with gr.Column():
                gr.Markdown("### PDF 1 Analysis")
                selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content", lines=4)
                # Listen on the dropdown (the input), not on the textbox it fills.
                paragraph_1_dropdown.change(show, paragraph_1_dropdown, selected_paragraph_1)

                summarize_btn1 = gr.Button("Summarize Text from PDF 1")
                summary_textbox_1 = gr.Textbox(label="Summary for PDF 1", lines=2)
                summarize_btn1.click(
                    fn=lambda p: summarize_text(_resolve_paragraph(p, stored_paragraphs_1)),
                    inputs=paragraph_1_dropdown,
                    outputs=summary_textbox_1,
                )

                sentiment_btn1 = gr.Button("Classify Financial Tone from PDF 1")
                sentiment_textbox_1 = gr.Textbox(label="Classification for PDF 1", lines=1)
                sentiment_btn1.click(
                    fn=lambda p: text_to_sentiment(_resolve_paragraph(p, stored_paragraphs_1)),
                    inputs=paragraph_1_dropdown,
                    outputs=sentiment_textbox_1,
                )

                analyze_btn1 = gr.Button("Analyze Financial Tone on each sentence with yiyanghkust/finbert-tone")
                fin_spans_1 = gr.HighlightedText(label="Financial Tone Analysis for PDF 1")
                analyze_btn1.click(
                    fn=lambda p: fin_ext(_resolve_paragraph(p, stored_paragraphs_1)),
                    inputs=paragraph_1_dropdown,
                    outputs=fin_spans_1,
                )

                analyze_btn1_ = gr.Button("Analyze Financial Tone on each sentence with ProsusAI/finbert")
                fin_spans_1_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 1 bis")
                analyze_btn1_.click(
                    fn=lambda p: fin_ext_bis(_resolve_paragraph(p, stored_paragraphs_1)),
                    inputs=paragraph_1_dropdown,
                    outputs=fin_spans_1_,
                )

            # Process the selected paragraph from PDF 2
            with gr.Column():
                gr.Markdown("### PDF 2 Analysis")
                selected_paragraph_2 = gr.Textbox(label="Selected Paragraph 2 Content", lines=4)
                # Listen on the dropdown (the input), not on the textbox it fills.
                paragraph_2_dropdown.change(show, paragraph_2_dropdown, selected_paragraph_2)

                summarize_btn2 = gr.Button("Summarize Text from PDF 2")
                summary_textbox_2 = gr.Textbox(label="Summary for PDF 2", lines=2)
                summarize_btn2.click(
                    fn=lambda p: summarize_text(_resolve_paragraph(p, stored_paragraphs_2)),
                    inputs=paragraph_2_dropdown,
                    outputs=summary_textbox_2,
                )

                sentiment_btn2 = gr.Button("Classify Financial Tone from PDF 2")
                sentiment_textbox_2 = gr.Textbox(label="Classification for PDF 2", lines=1)
                sentiment_btn2.click(
                    fn=lambda p: text_to_sentiment(_resolve_paragraph(p, stored_paragraphs_2)),
                    inputs=paragraph_2_dropdown,
                    outputs=sentiment_textbox_2,
                )

                analyze_btn2 = gr.Button("Analyze Financial Tone on each sentence with yiyanghkust/finbert-tone")
                fin_spans_2 = gr.HighlightedText(label="Financial Tone Analysis for PDF 2")
                analyze_btn2.click(
                    fn=lambda p: fin_ext(_resolve_paragraph(p, stored_paragraphs_2)),
                    inputs=paragraph_2_dropdown,
                    outputs=fin_spans_2,
                )

                analyze_btn2_ = gr.Button("Analyze Financial Tone on each sentence with ProsusAI/finbert")
                fin_spans_2_ = gr.HighlightedText(label="Financial Tone Analysis for PDF 2 bis")
                analyze_btn2_.click(
                    fn=lambda p: fin_ext_bis(_resolve_paragraph(p, stored_paragraphs_2)),
                    inputs=paragraph_2_dropdown,
                    outputs=fin_spans_2_,
                )

    with gr.Tab("Financial Report Table Analysis"):
        # New tab content goes here
        gr.Markdown("## Excel Data Comparison")
        with gr.Row():
            with gr.Column():
                file1 = gr.Dropdown(choices=get_excel_files(PDF_FOLDER), label="Select Excel File 1")
                file2 = gr.Dropdown(choices=get_excel_files(PDF_FOLDER), label="Select Excel File 2")
                sheet = gr.Dropdown(choices=[], label="Select Sheet for File 1 and 2")
            with gr.Column():
                result = gr.Image(label="Comparison pLot")

        def update_sheets(file):
            """Refresh the sheet dropdown with the sheets of the selected workbook."""
            return get_sheet_names(file)

        # Only file 1 drives the sheet list; the same sheet name is then used
        # for both workbooks when comparing.
        file1.change(fn=update_sheets, inputs=file1, outputs=sheet)

        # Distinct name so the "extract" button above is not shadowed.
        compare_btn = gr.Button("Compare Data")
        compare_btn.click(fn=process_and_compare, inputs=[file1, sheet, file2, sheet], outputs=result)

demo.launch()
|