Spaces:

KatGaw
/

ExpressMood

Sleeping

File size: 7,467 Bytes

0bf43ca


import os
from dotenv import load_dotenv
from transformers import pipeline
import pandas as pd
from GoogleNews import GoogleNews
from langchain_openai import ChatOpenAI
import praw
from datetime import datetime

load_dotenv()

def fetch_news(topic):
    """Fetch recent Google News results for a topic.

    The look-back period and the maximum number of results come from the
    ``DAYS_TO_FETCH_NEWS`` and ``NO_OF_NEWS_ARTICLES_TO_FETCH`` environment
    variables (a ``.env`` file is loaded first, if present).

    Args:
    - topic (str): Topic of interest

    Returns:
    - list: A list of dictionaries, each with a 'News_Article' key (an entry
      of ``GoogleNews.get_texts()``) and a 'URL' key (the entry of
      ``GoogleNews.get_links()`` at the same position).

    Raises:
    - KeyError: if either environment variable is not set.
    - ValueError: if NO_OF_NEWS_ARTICLES_TO_FETCH is not an integer.
    """
    load_dotenv()
    # Passed to GoogleNews.set_period() unchanged.
    days_to_fetch_news = os.environ["DAYS_TO_FETCH_NEWS"]
    # Parse the limit once, before any network call, so a bad value fails
    # early; a negative limit is treated as "no articles".
    max_articles = max(int(os.environ["NO_OF_NEWS_ARTICLES_TO_FETCH"]), 0)

    googlenews = GoogleNews()
    googlenews.set_period(days_to_fetch_news)
    googlenews.get_news(topic)
    texts = googlenews.get_texts()
    links = googlenews.get_links()

    # zip() stops at the shorter list, so a result without a matching link
    # is dropped instead of raising IndexError.
    paired = list(zip(texts, links))[:max_articles]
    return [{'News_Article': text, 'URL': link} for text, link in paired]

def _format_submission(submission):
    """Render one Reddit submission as 'title, Date: <UTC time>, URL:<url>'."""
    # Local import: the module only imports `datetime` from the datetime package.
    from datetime import timezone

    # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp();
    # the formatted text is the same because the format has no offset field.
    created = datetime.fromtimestamp(int(submission.created_utc), tz=timezone.utc)
    return (
        submission.title
        + ', Date: ' + created.strftime('%Y-%m-%d %H:%M:%S')
        + ', URL:' + submission.url
    )


def fetch_reddit_news(topic, subreddit='nova', min_headlines=10):
    """Collect Reddit submissions matching a topic, widening the time window as needed.

    The subreddit is searched for the past week first. While fewer than
    ``min_headlines`` distinct headlines have been collected, the search is
    repeated for the past year and then with the search's default time filter.

    Reddit credentials are read from the ``REDDIT_USER_AGENT``,
    ``REDDIT_CLIENT_ID`` and ``REDDIT_CLIENT_SECRET`` environment variables
    (a ``.env`` file is loaded first, if present).

    Args:
    - topic (str): Search query.
    - subreddit (str): Subreddit to search; defaults to 'nova'.
    - min_headlines (int): Stop widening the time window once this many
      headlines have been collected; defaults to 10.

    Returns:
    - set: Strings of the form '<title>, Date: <YYYY-MM-DD HH:MM:SS>, URL:<url>',
      with the date expressed in UTC.

    Raises:
    - KeyError: if any of the three environment variables is not set.
    """
    load_dotenv()
    user_agent = os.environ["REDDIT_USER_AGENT"]
    client_id = os.environ["REDDIT_CLIENT_ID"]
    client_secret = os.environ["REDDIT_CLIENT_SECRET"]
    #https://medium.com/geekculture/a-complete-guide-to-web-scraping-reddit-with-python-16e292317a52
    reddit = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent,
    )

    target = reddit.subreddit(subreddit)
    headlines = set()
    # Progressively wider searches; the empty dict leaves time_filter at the
    # library default, exactly like the original unfiltered call.
    for search_options in ({'time_filter': 'week'}, {'time_filter': 'year'}, {}):
        for submission in target.search(topic, **search_options):
            headlines.add(_format_submission(submission))
        if len(headlines) >= min_headlines:
            break
    return headlines

# Shared classifier, created on first use. Building the pipeline loads the
# model, so it is done once per process rather than once per article.
_SENTIMENT_CLASSIFIER = None


def _get_sentiment_classifier():
    """Return the shared sentiment pipeline, creating it on the first call."""
    global _SENTIMENT_CLASSIFIER
    if _SENTIMENT_CLASSIFIER is None:
        # Alternative model tried earlier:
        # mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis
        _SENTIMENT_CLASSIFIER = pipeline(model='tabularisai/robust-sentiment-analysis')
    return _SENTIMENT_CLASSIFIER


def analyze_sentiment(article):
    """
    Analyzes the sentiment of a given news article.

    Args:
    - article: The article to classify. It is converted with ``str()`` before
      being passed to the classifier.

    Returns:
    - dict: A dictionary with two keys: 'News_Article' (the input, unchanged)
      and 'Sentiment' (the raw classifier output; the script at the bottom of
      this file reads ``[0]['label']`` from it).
    """
    classifier = _get_sentiment_classifier()
    sentiment_result = classifier(str(article))

    return {
        'News_Article': article,
        'Sentiment': sentiment_result,
    }


def generate_summary_of_sentiment(sentiment_analysis_results):
    """
    Asks an OpenAI chat model to summarize the articles and their sentiments.

    The results are converted with ``str()`` and sent as the user message;
    the system message asks for a summary followed by markdown links.
    The input and the generated summary are also printed to stdout.

    Args:
    - sentiment_analysis_results: Articles with their sentiments; any object
      whose ``str()`` form is meaningful to the model.

    Returns:
    - str: The ``content`` of the model's response.

    Raises:
    - KeyError: if the OPENAI_API_KEY environment variable is not set.
    """
    news_article_sentiment = str(sentiment_analysis_results)
    print("News article sentiment : " + news_article_sentiment)

    # The client takes its key from the environment (no api_key is passed
    # below), so check for it explicitly and fail before building the client.
    if "OPENAI_API_KEY" not in os.environ:
        raise KeyError("OPENAI_API_KEY environment variable is not set")

    model = ChatOpenAI(
        model="gpt-4o",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        # api_key="...",  # if you prefer to pass the api key in directly instead of using env vars
        # base_url="...",
        # organization="...",
    )

    messages = [
        {"role": "system", "content": "You are a helpful assistant that looks at all news articles, their sentiment, along with domainant sentiment and generates a summary rationalizing dominant sentiment. At the end of the summary, add URL links with dates for all the articles in the markdown format for streamlit. Example of adding the URLs: The Check out the links: [link](%s) % url, 2024-03-01 "},
        {"role": "user", "content": f"News articles and their sentiments: {news_article_sentiment}"},
    ]
    response = model.invoke(messages)

    summary = response.content
    print ("+++++++++++++++++++++++++++++++++++++++++++++++")
    print(summary)
    print ("+++++++++++++++++++++++++++++++++++++++++++++++")
    return summary


def plot_sentiment_graph(sentiment_analysis_results):
    """
    Counts how many results carry each sentiment label.

    Despite its name, this function does not draw anything: the matplotlib
    pie-chart code was disabled, and the caller is expected to plot the
    returned counts itself. The intermediate DataFrame is printed to stdout.

    Args:
    - sentiment_analysis_results: Records (e.g. a list of dicts) that each
      have a 'Sentiment' key.

    Returns:
    - pandas.Series: Number of records per sentiment label, most frequent
      first. An empty Series is returned when there are no records.
    """
    df = pd.DataFrame(sentiment_analysis_results)
    print(df)

    if df.empty:
        # No records means no 'Sentiment' column; return empty counts instead
        # of failing with KeyError.
        return pd.Series(dtype='int64')

    return df['Sentiment'].value_counts()


def get_dominant_sentiment (sentiment_analysis_results):
    """
    Returns the sentiment label that occurs most often in the results.

    The intermediate DataFrame and the per-label counts are printed to stdout.

    Args:
    - sentiment_analysis_results: Records (e.g. a list of dicts) that each
      have a 'Sentiment' key.

    Returns:
    - The most frequent value of 'Sentiment', or None when there are no
      records.
    """
    df = pd.DataFrame(sentiment_analysis_results)
    print(df)

    if df.empty:
        # No records means no 'Sentiment' column; report "no dominant
        # sentiment" instead of failing with KeyError.
        return None

    # value_counts() is ordered by descending count, and idxmax() returns the
    # label of the first maximum, i.e. the most frequent sentiment.
    sentiment_counts = df['Sentiment'].value_counts()
    print(sentiment_counts)

    return sentiment_counts.idxmax()

#starting point of the program
if __name__ == '__main__':

    # Fetch news for a sample topic
    news_articles = fetch_news('AAPL')

    analysis_results = []

    # Perform sentiment analysis for each news article
    for article in news_articles:
        sentiment_analysis_result = analyze_sentiment(article['News_Article'])

        # Display sentiment analysis results
        print(f'News Article: {sentiment_analysis_result["News_Article"]} : Sentiment: {sentiment_analysis_result["Sentiment"]}', '\n')

        # Keep only the label of the first classifier result for aggregation
        analysis_results.append({
            'News_Article': sentiment_analysis_result["News_Article"],
            'Sentiment': sentiment_analysis_result["Sentiment"][0]['label'],
        })

    if not analysis_results:
        # Without any article there is no 'Sentiment' data to aggregate.
        print('No news articles found; nothing to analyze.')
    else:
        # Report the dominant sentiment across all analyzed articles
        dominant_sentiment = get_dominant_sentiment(analysis_results)
        print(dominant_sentiment)

        # Compute the per-sentiment counts (also prints the underlying table)
        plot_sentiment_graph(analysis_results)