sohomghosh committed on
Commit
9901d60
•
1 Parent(s): 3794ca7

Create new file

Browse files
Files changed (1) hide show
  1. app.py +252 -0
app.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ nltk.download('stopwords')
3
+ nltk.download('punkt')
4
+ import pickle
5
+ from keybert import KeyBERT
6
+ from nltk.util import everygrams
7
+ from nltk.corpus import stopwords
8
+ from nltk.tokenize import sent_tokenize
9
+ from fincat_utils import extract_context_words
10
+ from fincat_utils import bert_embedding_extract
11
+ from sentence_transformers import SentenceTransformer, util
12
+ import torch
13
+ from transformers import BertTokenizer, BertForSequenceClassification, pipeline, AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoModelForSeq2SeqLM, AutoModel, RobertaModel, RobertaTokenizer
14
+ import gradio as gr
15
+ import pandas as pd
16
+ from fin_readability_sustainability import BERTClass, do_predict
17
+
18
# Device used for the two custom BERTClass models below (GPU when available).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Keyword extractor used by get_keywords().
kw_model = KeyBERT(model='all-mpnet-base-v2')

# ESG: 4-label sentence classifier wrapped in a text-classification pipeline.
finbert_esg = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg', num_labels=4)
tokenizer_esg = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
nlp_esg = pipeline("text-classification", model=finbert_esg, tokenizer=tokenizer_esg)

# FLS: 3-label sentence classifier wrapped in a text-classification pipeline.
finbert_fls = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls', num_labels=3)
tokenizer_fls = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')
nlp_fls = pipeline("text-classification", model=finbert_fls, tokenizer=tokenizer_fls)

# FinCAT - Claim Detection
# NOTE(review): unpickling can execute arbitrary code; this file must come
# from a trusted source. The context manager closes the handle after loading.
with open("lr_clf_FiNCAT.pickle", 'rb') as clf_file:
    lr_clf_claim = pickle.load(clf_file)

# Sustainability: custom BERTClass whose weights come from a local checkpoint.
tokenizer_sus = RobertaTokenizer.from_pretrained('roberta-base')
model_sustain = BERTClass(2, "sustanability")
model_sustain.to(device)
model_sustain.load_state_dict(torch.load('sustainability_model.bin', map_location=device)['model_state_dict'])

# Readability: custom BERTClass whose weights come from a local checkpoint.
tokenizer_read = BertTokenizer.from_pretrained('ProsusAI/finbert')
model_read = BERTClass(2, "readability")
model_read.to(device)
model_read.load_state_dict(torch.load('readability_model.bin', map_location=device)['model_state_dict'])

# Sentiment: two independent classifiers; getfinsenti() keeps the more confident one.
model_senti1 = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer_senti1 = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
senti1 = pipeline("sentiment-analysis", model=model_senti1, tokenizer=tokenizer_senti1)
model_senti2 = AutoModelForSequenceClassification.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
tokenizer_senti2 = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
senti2 = TextClassificationPipeline(model=model_senti2, tokenizer=tokenizer_senti2)

# Summarization
model_finsum = AutoModelForSeq2SeqLM.from_pretrained("human-centered-summarization/financial-summarization-pegasus")
tokenizer_finsum = AutoTokenizer.from_pretrained("human-centered-summarization/financial-summarization-pegasus")


# Hypernym Detection
model_finlipi = SentenceTransformer('sohomghosh/LIPI_FinSim3_Hypernym')
hypernyms = ['Bonds','Forward','Funds', 'Future', 'MMIs','Option', 'Stocks', 'Swap', 'Equity Index', 'Credit Index', 'Securities restrictions', 'Parametric schedules', 'Debt pricing and yields', 'Credit Events','Stock Corporation', 'Central Securities Depository', 'Regulatory Agency']
# Maps a row index of `hypernyms_embeddings` back to its hypernym string.
hyp_di = dict(enumerate(hypernyms))
# Encoded once at start-up so each query only has to embed the user's terms.
hypernyms_embeddings = model_finlipi.encode(hypernyms)
65
+
66
+
67
+ #ESG
68
def esg(text):
    '''
    Tags every sentence of a text with the label returned by the ESG pipeline.

    Parameters:
    text (str): Financial text. It is split into sentences with sent_tokenize.

    Returns:
    list: One (sentence, label) tuple per sentence, in order. The label is the 'label' field of the matching nlp_esg result.
    '''
    sentences = sent_tokenize(text)
    predictions = nlp_esg(sentences)
    labelled = []
    for sentence, prediction in zip(sentences, predictions):
        labelled.append((sentence, prediction['label']))
    return labelled
73
+
74
+ #FLS
75
def fls(text):
    '''
    Tags every sentence of a text with the label returned by the forward-looking-statement pipeline.

    Parameters:
    text (str): Financial text. It is split into sentences with sent_tokenize.

    Returns:
    list: One (sentence, label) tuple per sentence, in order. The label is the 'label' field of the matching nlp_fls result.
    '''
    sentences = sent_tokenize(text)
    # The pipeline is called once with the whole list of sentences.
    predictions = nlp_fls(sentences)
    pairs = zip(sentences, predictions)
    return [(sentence, prediction['label']) for sentence, prediction in pairs]
80
+
81
+ #Sentiment
82
def getfinsenti(text):
    '''
    Assigns a sentiment label to every sentence of a text.

    Both sentiment pipelines (senti1 and senti2) are run on each sentence and
    the label of the result with the higher 'score' is kept. On a tie the
    senti1 result wins.

    Parameters:
    text (str): Financial text. It is split into sentences with sent_tokenize.

    Returns:
    highlight (list): One (sentence, label) tuple per sentence. The label is stripped of surrounding whitespace and lower-cased.
    '''
    highlight = []
    # A dedicated loop variable avoids rebinding the `text` parameter.
    for sentence in sent_tokenize(text):
        senti1_output = senti1(sentence)[0]
        senti2_output = senti2(sentence)[0]
        if senti1_output['score'] >= senti2_output['score']:
            chosen = senti1_output
        else:
            chosen = senti2_output
        # Lower-casing gives both pipelines' labels the same spelling.
        highlight.append((sentence, chosen['label'].strip().lower()))
    return highlight
95
+
96
+ #Summarization
97
def summarize_pega(text):
    '''
    Generates a short summary of a financial text with the summarization model.

    Parameters:
    text (str): Text to summarize.

    Returns:
    summary (str): Decoded summary (generation is capped at max_length=32 tokens). An empty string is returned when the input is empty or only whitespace.
    '''
    # Nothing to summarize: skip the model call instead of generating from an empty prompt.
    if not text or not text.strip():
        return ""
    # truncation=True asks the tokenizer to cut over-long input down to its
    # configured maximum length rather than passing it through unbounded.
    input_ids = tokenizer_finsum(text, return_tensors="pt", truncation=True).input_ids
    output = model_finsum.generate(
        input_ids,
        max_length=32,
        num_beams=5,
        early_stopping=True
    )
    summary = tokenizer_finsum.decode(output[0], skip_special_tokens=True)
    return summary
107
+
108
+ #Hypernym Detection
109
def get_hyp(words, th=0.85):
    '''
    Finds the closest known hypernym for each comma-separated term.

    Parameters:
    words (str): Terms separated by commas. Blank entries are ignored.
    th (float): A hypernym is reported only when the best cosine similarity is strictly greater than this threshold.

    Returns:
    highlight (list): One (term, hypernym) tuple per non-blank term, in input order. The second element is 'no hypernym found' when no similarity exceeds th. An empty list is returned when there are no non-blank terms.
    '''
    stripped = (wd.strip() for wd in (words or "").split(","))
    # Drop blanks before encoding so the model is never run on empty strings.
    queries = [wd for wd in stripped if wd]
    if not queries:
        return []

    query_embeddings = model_finlipi.encode(queries)
    cos_scores = util.pytorch_cos_sim(query_embeddings, hypernyms_embeddings)
    # For each query row: the highest similarity and the index of the hypernym that produced it.
    best = torch.max(cos_scores, dim=1)

    highlight = []
    for sim, ind, query in zip(best.values, best.indices, queries):
        if sim.item() > th:
            highlight.append((query, hyp_di[ind.item()]))
        else:
            highlight.append((query, 'no hypernym found'))
    return highlight
125
+
126
+ #FinCAT - Claim Detection
127
def score_fincat(txt):
    '''
    Extracts numerals from financial texts and checks if they are in-claim or out-of claim

    Parameters:
    txt (str): Financial Text. This is to be given as input. Numerals present in this text will be evaluated.

    Returns:
    highlight (list): A list each element of which is a tuple, one per whitespace-separated token, in order. Tokens containing a digit give (token without one trailing punctuation mark, 'In-claim' or 'Out-of-Claim'); all other tokens give (token, '').
    '''
    trailing_punct = ('.', ',', ';', ":", "-", "!", "?", ")", '"', "'")
    highlight = []
    # The paragraph handed to the feature extractor is padded with one space on each side.
    txt = " " + txt + " "
    # Position just past the previous token. Searching from here finds *this*
    # occurrence of a token even when the same numeral appears several times,
    # and works whatever whitespace (space, tab, newline) separates tokens.
    cursor = 0
    for token in txt.split():
        st = txt.find(token, cursor)
        cursor = st + len(token)
        if not any(char.isdigit() for char in token):
            highlight.append((token, ''))
            continue
        # Drop a single trailing punctuation mark so "80%." is scored as "80%".
        word = token[:-1] if token.endswith(trailing_punct) else token
        ed = st + len(word)
        x = {'paragraph': txt, 'offset_start': st, 'offset_end': ed}
        context_text = extract_context_words(x)
        features = bert_embedding_extract(context_text, word)
        # The classifier is given a single row of 768 features.
        prediction = lr_clf_claim.predict(features.reshape(1, 768))
        highlight.append((word, 'In-claim' if prediction == 1 else 'Out-of-Claim'))
    return highlight
158
+
159
+ #Readability
160
def get_readability(text):
    '''
    Labels every sentence of a text as readable or non-readable.

    Parameters:
    text (str): Financial text. It is split into sentences with sent_tokenize.

    Returns:
    list: One (sentence, label) tuple per sentence. The label is 'readable' when the matching entry of do_predict(...)[0] equals 1, otherwise 'non-readable'.
    '''
    frame = pd.DataFrame({'sentence': sent_tokenize(text)})
    outputs = do_predict(model_read, tokenizer_read, frame)
    labelled = []
    # outputs[0] is read as one prediction per sentence (presumably class ids - confirm in do_predict).
    for sent, flag in zip(frame['sentence'].values, outputs[0]):
        if flag == 1:
            labelled.append((sent, 'readable'))
        else:
            labelled.append((sent, 'non-readable'))
    return labelled
165
+
166
+
167
+ #Sustainability
168
def get_sustainability(text):
    '''
    Labels every sentence of a text as sustainable, non-sustainable or undecided.

    Parameters:
    text (str): Financial text. It is split into sentences with sent_tokenize.

    Returns:
    list: One (sentence, label) tuple per sentence. Using the matching entry of do_predict(...)[1]: values >= 2.8 give 'non-sustainable', values <= 1.423736 give 'sustainable', anything in between gives '-'.
    '''
    def _label(value):
        # Upper cut-off; the original source kept 4.384316 as a commented-out alternative.
        if value >= 2.8:
            return 'non-sustainable'
        if value <= 1.423736:
            return 'sustainable'
        return '-'

    frame = pd.DataFrame({'sentence': sent_tokenize(text)})
    outputs = do_predict(model_sustain, tokenizer_sus, frame)
    # outputs[1] is read as one score per sentence (NOTE(review): scale not visible here - confirm in do_predict).
    return [(sent, _label(value)) for sent, value in zip(frame['sentence'].values, outputs[1])]
180
+
181
+ #keywords
182
def get_keywords(text):
    '''
    Extracts up to three key phrases (1 to 3 words each) from a text.

    Parameters:
    text (str): Text to extract key phrases from.

    Returns:
    str: The extracted phrases, each stripped of surrounding whitespace and joined with commas.
    '''
    scored_phrases = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),
        stop_words='english',
        highlight=False,
        top_n=3,
    )
    # Building a dict keeps one entry per distinct phrase, in first-seen order.
    phrases = dict(scored_phrases)
    return ",".join(phrase.strip() for phrase in phrases)
186
+
187
+ #examples
188
def set_example_text(example_text):
    '''
    Builds the textbox update that loads a clicked example.

    Parameters:
    example_text (list): The clicked sample; its first element is used as the new textbox value.

    Returns:
    The result of gr.Textbox.update with value set to that first element.
    '''
    chosen_sample = example_text[0]
    return gr.Textbox.update(value=chosen_sample)
190
+
191
+
192
# Gradio UI: inputs in the left column, one tab per pair of analyses in the right column.
# NOTE(review): the source this was recovered from had lost its indentation;
# the nesting below is reconstructed from the statements - confirm the layout.
demo = gr.Blocks()

with demo:
    gr.Markdown("# **Financial Language Understandability Enhancement Toolkit (FLUEnT)**")
    with gr.Row():
        with gr.Column():
            text = gr.inputs.Textbox(label="Enter financial text here", lines=6, placeholder="Enter Financial Text here...")

            # Passed to get_hyp as its `th` argument.
            b_hyp_th = gr.inputs.Slider(minimum=0, maximum=1, step=0.01, label="Detect hypernyms with confidence of")
            with gr.Row():
                b1 = gr.Button("Get Keywords For Hypernym Detection")

            with gr.Row():
                jargons = gr.Textbox(label="Enter words for Hypernyms Detection separated by comma")
                # Keywords extracted from the main text pre-fill the hypernym input.
                b1.click(get_keywords, inputs = text, outputs=jargons)

            example_text = gr.Dataset(
                components=[text],
                samples=[
                    ["Markets are falling."],
                    ["Exchanges the coupon on a bond for LIBOR plus a spread."],
                    ["We follow a carbon neutrality strategy, seek to use resources efficiently and work to deliver sustainable value for society"],
                    ["NGOs have been instrumental in shaping the economy"],
                    ["We expect to boost our sales by 80% this year by using eco-friendly products."],
                    ["We will continue to evaluate the need for an employee allowance as it hinders growth."],
                    ["As an example, in the calculation as of the end of 2020, carbon emissions of an issuer relate to 2019, whereas market capitalization is shown as of the end of 2020."],
                    ["In addition to the impacts from the merger, insurance income increased $121 million due to strong production and acquisitions."],
                    ["In the year 2021, the markets were bullish. We expect to boost our sales by 80% this year by using eco-friendly products."],
                    ["Noninterest income increased $1.7 billion due primarily to the Merger and higher residential mortgage income as a result of the lower rate environment driving mortgage production through refinance activity, partially offset by lower residential mortgage servicing income driven by higher prepayment and an MSR fair value adjustment in 2020. This year it will increase to $3M."],
                ],
            )
            # Clicking a sample copies it into the main textbox.
            example_text.click(fn=set_example_text,
                               inputs=example_text,
                               outputs=example_text.components)

        with gr.Column():
            with gr.Tabs():
                with gr.TabItem("Hypernyms & Claims"):
                    with gr.Row():
                        b_hyp = gr.Button("Get Hypernyms")
                        b_hyp.click(get_hyp, inputs = [jargons, b_hyp_th], outputs = gr.HighlightedText())
                    with gr.Row():
                        b3 = gr.Button("Get Claims")
                        b3.click(score_fincat, inputs = text, outputs = gr.HighlightedText().style(color_map={"In-claim": "red", "Out-of-Claim": "green"}))

                with gr.TabItem("Summary & Sentiment"):
                    with gr.Row():
                        b2 = gr.Button("Get Summary")
                        b2.click(summarize_pega, inputs = text, outputs = gr.Textbox(label="Summary"))

                    with gr.Row():
                        b4 = gr.Button("Get Sentiment")
                        b4.click(getfinsenti, inputs = text, outputs = gr.HighlightedText().style(color_map={"negative": "red", "neutral":"blue", "positive": "green"}))

                with gr.TabItem("Readability & Sustainability"):

                    with gr.Row():
                        b5 = gr.Button("Get Readability")
                        b5.click(get_readability, inputs = text, outputs = gr.HighlightedText().style(color_map={"non-readable": "red", "readable": "green"}))

                    with gr.Row():
                        b6 = gr.Button("Get Sustainability")
                        b6.click(get_sustainability, inputs = text, outputs = gr.HighlightedText().style(color_map={"non-sustainable": "red", "-":"blue", "sustainable": "green"}))

                with gr.TabItem("ESG & FLS"):

                    # Each button gets its own name; the original rebound `b6` for all three.
                    with gr.Row():
                        b7 = gr.Button("Get Environmental, Social & Gov.(ESG)")
                        b7.click(esg, inputs = text, outputs = gr.HighlightedText().style(color_map={"Governance": "red", "Social":"blue", "Environmental": "green", "None":"yellow"}))

                    with gr.Row():
                        b8 = gr.Button("Get Forward Looking Statements(FLS)")
                        b8.click(fls, inputs = text, outputs = gr.HighlightedText().style(color_map={"Non-specific FLS": "red", "Not-FLS":"blue", "Specific-FLS": "green"}))
    gr.Markdown("How to use? [link](https://youtu.be/Sk3PiQdr9Og), Warning: User discretion is advised.")
demo.launch()