Jan Maciejowski committed on
Commit
f98185e
1 Parent(s): 59bef16

Committed app.py

Browse files
Files changed (1) hide show
  1. app.py +136 -0
app.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Gradio Application Interface
2
+
3
+ import gradio as gr
4
+ from transformers import pipeline
5
+ from bs4 import BeautifulSoup
6
+ import requests
7
+ import pandas as pd
8
+ import gensim
9
+ import re
10
+ import nltk
11
+ from nltk.corpus import stopwords, wordnet
12
+ from nltk.stem import WordNetLemmatizer
13
+ import os
14
+
15
def summarizer_func():
    """Build the Hugging Face pipeline used to summarise article text.

    No task name is passed to ``pipeline``; only the model checkpoint, the
    tokenizer checkpoint and the length/truncation options below are supplied.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    # Length bounds and truncation are forwarded to the pipeline as keyword options.
    generation_options = {
        "min_length": 100,
        "max_length": 200,
        "truncation": True,
    }
    return pipeline(
        model="Majon911/pegasus_multi_news_ep1",
        tokenizer="google/pegasus-xsum",
        **generation_options,
    )
22
+
23
def sentiment_func():
    """Build the text-classification pipeline used for article sentiment.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        ``"text-classification"`` task.
    """
    classifier_checkpoint = "kbaumgartner/DeBERTa_Finetuned_Financial_News"
    tokenizer_checkpoint = "microsoft/deberta-v3-base"
    return pipeline(
        "text-classification",
        model=classifier_checkpoint,
        tokenizer=tokenizer_checkpoint,
    )
27
+
28
def source_outlet(choise):
    """Scrape the latest headlines for a news outlet and return the first five.

    Args:
        choise: Name of the outlet. Only ``'CNBC'`` is scraped; any other value
            (including ``'Reuters'``, which has no scraper yet) yields an
            empty frame.

    Returns:
        A ``pandas.DataFrame`` with at most five rows and the columns
        ``headline``, ``url``, ``text``, ``summary``, ``sentiment`` and
        ``topic``. The last four columns are initialised to empty strings.

    Raises:
        requests.RequestException: If the CNBC page cannot be fetched or the
            server answers with an HTTP error status.
    """
    # Bound up front so that outlets without a scraper produce an empty frame
    # instead of failing on an undefined name further down.
    headlines = {}

    if choise == 'CNBC':
        url = "https://www.cnbc.com/finance/"
        # A timeout keeps the UI callback from hanging on a stalled connection.
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        for headline_element in soup.find_all('a', class_='Card-title'):
            link = headline_element.get('href')
            if link:  # ignore anchors that carry no target
                headlines[headline_element.text.strip()] = link

    df = pd.DataFrame({'headline': list(headlines.keys()),
                       'url': list(headlines.values())})

    # head() keeps the first five rows; the extra columns are placeholders
    # for values computed later per article.
    return df.head().assign(text='', summary='', sentiment='', topic='')
50
+
51
def sentiment_translation(curr_sentiment):
    """Translate a raw classifier label into a readable sentiment name.

    Args:
        curr_sentiment: Label string taken from the sentiment pipeline output,
            e.g. ``"LABEL_0"``.

    Returns:
        ``"NEGATIVE"``, ``"NEUTRAL"`` or ``"POSITIVE"`` for ``"LABEL_0"``,
        ``"LABEL_1"`` and ``"LABEL_2"`` respectively. Any other label is
        returned unchanged rather than raising on an unbound local.
    """
    label_names = {
        "LABEL_0": "NEGATIVE",
        "LABEL_1": "NEUTRAL",
        "LABEL_2": "POSITIVE",
    }
    return label_names.get(curr_sentiment, curr_sentiment)
59
+
60
def preprocess(text):
    """Turn raw text into the token list fed to the LDA dictionary.

    The text is lower-cased, every run of digits and non-word characters is
    replaced by a single space, and the result is split on whitespace. Words
    that are English stop words or have three characters or fewer are dropped;
    the remaining words are lemmatised.

    Args:
        text: The raw text to tokenise.

    Returns:
        A list of lemmatised tokens.
    """
    normalized = re.sub("(\\d|\\W)+", " ", text.lower())
    english_stop_words = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()

    tokens = []
    for candidate in normalized.split():
        # The filter looks at the word as written, before lemmatisation.
        if candidate in english_stop_words or len(candidate) <= 3:
            continue
        tokens.append(wordnet_lemmatizer.lemmatize(candidate))
    return tokens
68
+
69
def lda_topic_modeling(text):
    """Return the most probable LDA topic for a piece of text.

    The saved gensim model and dictionary are loaded from the relative
    ``lda_gensim_5t/`` directory on every call, the text is tokenised with
    ``preprocess`` and converted to a bag of words, and the topic with the
    highest probability is selected.

    Args:
        text: The raw article text.

    Returns:
        A tuple ``(topic_name, probability)``. The name is looked up from the
        topic id, falling back to ``"Unknown Topic"``. If the model reports no
        topics at all, ``("No Topic Found", 0.0)`` is returned.
    """
    # Human-readable names keyed by the topic id rendered as a string.
    topic_names = {
        '0': "Corporate Valuation & Performance",
        '1': "Quarterly Financial Reports",
        '2': "Stock Market & Investment Funds",
        '3': "Corporate Affairs & Products",
        '4': "Investment Research"
    }

    model = gensim.models.LdaModel.load("lda_gensim_5t/lda_model5.gensim")
    vocabulary = gensim.corpora.Dictionary.load("lda_gensim_5t/dictionary5.gensim")

    bag_of_words = vocabulary.doc2bow(preprocess(text))
    scored_topics = model.get_document_topics(bag_of_words, minimum_probability=0.0)

    if not scored_topics:
        # Nothing to rank: hand back a placeholder with zero probability.
        return ("No Topic Found", 0.0)

    # Each entry is (topic_id, probability); keep the highest-probability one.
    best_id, best_probability = max(scored_topics, key=lambda pair: pair[1])
    return (topic_names.get(str(best_id), "Unknown Topic"), best_probability)
94
+
95
def gradio_stocknews(source_ch, art_number):
    """Summarise, classify and topic-tag one of the latest scraped articles.

    Args:
        source_ch: Outlet name, forwarded to ``source_outlet``.
        art_number: 1-based position of the article among the scraped
            headlines.

    Returns:
        A tuple ``(headline, url, summary, sentiment, topic)``.

    Raises:
        gr.Error: If no article number was given, the requested article is not
            among the scraped headlines, or the article page has no
            ``ArticleBody-articleBody`` element to read text from.
        requests.RequestException: If the article page cannot be fetched.
    """
    # Validate the cheap things first so a bad selection fails before any
    # network traffic or model loading happens.
    if art_number is None:
        raise gr.Error("Please select an article number.")
    row = int(art_number) - 1

    # Identifying the articles
    articles = source_outlet(source_ch)
    if not 0 <= row < len(articles):
        raise gr.Error(
            f"Article {art_number} is not available for source {source_ch!r}."
        )
    headline = articles.loc[row, 'headline']
    url = articles.loc[row, 'url']

    # Scraping text for the chosen article
    response = requests.get(url, timeout=15)
    response.raise_for_status()
    sub_soup = BeautifulSoup(response.content, 'html.parser')
    article_body_element = sub_soup.find('div', class_='ArticleBody-articleBody')
    if article_body_element is None:
        # find() returns None when the page has no such element.
        raise gr.Error("Could not find the article text on the selected page.")
    article_text = article_body_element.get_text()  # Extracting only the text

    # Defining the summarizer and the sentiment analysis
    summarizer = summarizer_func()
    pipe_sentiment = sentiment_func()

    summary = summarizer(article_text)[0]['generated_text']
    label_sentiment = pipe_sentiment(article_text)[0]['label']
    sentiment = sentiment_translation(label_sentiment)

    # Get the human-readable topic name (first element of the returned tuple)
    topic = lda_topic_modeling(article_text)[0]

    return headline, url, summary, sentiment, topic
121
+
122
def main():
    """Build the Gradio dashboard around ``gradio_stocknews`` and launch it."""
    # Work from the script's own directory so that relative paths used elsewhere
    # in this file (e.g. the saved LDA model files) resolve against it.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    os.chdir(script_dir)

    input_widgets = [
        gr.Dropdown(choices=["CNBC"], label="Select Source"),
        gr.Dropdown(choices=[1, 2, 3, 4, 5], label="Select Article Number"),
    ]

    # One single-line textbox per value returned by gradio_stocknews, in order.
    output_captions = (
        "Article Title",
        "Article Link",
        "Article Summary",
        "Article Sentiment",
        "Article Topic",
    )
    output_widgets = [gr.Textbox(lines=1, label=caption) for caption in output_captions]

    iface = gr.Interface(
        fn=gradio_stocknews,
        inputs=input_widgets,
        outputs=output_widgets,
        title="Latest 5 Stock News Dashboard",
        description="Click the button to refresh the news summary.",
    )
    iface.launch()


# Start the dashboard only when this file is executed as a script.
if __name__ == "__main__":
    main()