# 1. Import Baseline Dependencies
import csv
import re

import requests
import streamlit as st
from bs4 import BeautifulSoup
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, pipeline
st.title('Stocks Analysis Machine')
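# Demo slider from the Streamlit starter example; its value is not used by the pipeline below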
x = st.slider('Select a value')
st.write(x, 'squared is', x * x)
# 2. Setup Model
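# Pegasus checkpoint fine-tuned for financial news summarization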
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
# 3. Select Tickers to Monitor
monitored_tickers = ['ETH']
# 4.1. Search for Stock News using Google and Yahoo Finance
print('Searching for stock news for', monitored_tickers)
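# Collect every anchor href from a Google News search for "yahoo finance <ticker>"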
def search_for_stock_news_links(ticker):
    search_url = 'https://www.google.com/search?q=yahoo+finance+{}&tbm=nws'.format(ticker)
    r = requests.get(search_url, timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags if link.has_attr('href')]  # skip anchors without an href
    return hrefs

raw_urls = {ticker: search_for_stock_news_links(ticker) for ticker in monitored_tickers}
# 4.2. Strip out unwanted URLs
print('Cleaning URLs.')
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']
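# Keep only article links: pull the https URL out of each Google redirect string
# and drop links to Google service pages named in exclude_list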
def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls:
        if 'https://' in url and not any(exc in url for exc in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

cleaned_urls = {ticker: strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
# 4.3. Search and Scrape Cleaned URLs
print('Scraping news links.')
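# Fetch each article, join its paragraph text, and truncate to the first 350 words
# to keep inputs short for the summarizer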
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs:
        r = requests.get(url, timeout=10)
        soup = BeautifulSoup(r.text, 'html.parser')
        results = soup.find_all('p')
        text = [res.text for res in results]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

articles = {ticker: scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
# 4.4. Summarise all Articles
print('Summarizing articles.')
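# Generate an abstractive summary (beam search, at most 55 tokens) per article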
def summarize(articles):
    summaries = []
    for article in articles:
        # truncation=True guards against articles longer than the model's max input length
        input_ids = tokenizer.encode(article, return_tensors="pt", truncation=True)
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

summaries = {ticker: summarize(articles[ticker]) for ticker in monitored_tickers}
# 5. Adding Sentiment Analysis
print('Calculating sentiment.')
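# No model is specified, so transformers falls back to its default English
# sentiment checkpoint (a DistilBERT model fine-tuned on SST-2)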
sentiment = pipeline("sentiment-analysis")
scores = {ticker: sentiment(summaries[ticker]) for ticker in monitored_tickers}
# 6. Exporting Results
print('Exporting results.')
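# One row per article: ticker, summary, sentiment label, sentiment score, source URL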
def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_this)
    return output

final_output = create_output_array(summaries, scores, cleaned_urls)
final_output.insert(0, ['Ticker', 'Summary', 'Sentiment', 'Sentiment Score', 'URL'])
with open('ethsummaries.csv', mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)
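
# Hedged addition (not in the original script): also render the results in the
# Streamlit UI, since the app otherwise only writes a CSV. Assumes pandas is
# available in the environment.
import pandas as pd
results_df = pd.DataFrame(final_output[1:], columns=final_output[0])
st.dataframe(results_df)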