## Variables

import os
import pathlib
from datetime import datetime as dt

import numpy as np
import pandas as pd
import plotly_express as px
import plotly.graph_objects as go
import streamlit as st
import tweepy
from datasets import Dataset
from huggingface_hub import Repository
from langchain import VectorDBQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from optimum.onnxruntime import ORTModelForSequenceClassification
from optimum.pipelines import pipeline  # ONNX-aware pipeline for the optimum models below
from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode
from transformers import AutoTokenizer


@st.experimental_singleton(suppress_st_warning=True)
def load_models():
    '''Load the sentiment and topic classification pipelines.'''
    sent_pipe = pipeline(task, model=sent_model_id, tokenizer=sent_model_id)
    topic_pipe = pipeline(task, model=topic_model_id, tokenizer=topic_model_id)

    return sent_pipe, topic_pipe


@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def process_tweets(df, df_users):
    '''Classify tweets and merge predictions with user metadata into one dataframe.'''
    df['author'] = df['author'].astype(np.int64)
    df_merged = df.merge(df_users, on='author')
    tweet_list = df_merged['tweet'].tolist()

    # Run both classifiers over the raw tweet text
    sentiment = pd.DataFrame(sentiment_classifier(tweet_list))
    topic = pd.DataFrame(topic_classifier(tweet_list))
    sentiment.rename(columns={'score': 'sentiment_confidence', 'label': 'sentiment'}, inplace=True)
    topic.rename(columns={'score': 'topic_confidence', 'label': 'topic'}, inplace=True)

    # Combine predictions with tweet metadata and express confidences as percentages
    df_group = pd.concat([df_merged, sentiment, topic], axis=1)
    df_group[['sentiment_confidence', 'topic_confidence']] = (
        df_group[['sentiment_confidence', 'topic_confidence']].round(2).mul(100)
    )

    df_tweets = df_group[['creation_time', 'username', 'tweet', 'sentiment', 'topic',
                          'sentiment_confidence', 'topic_confidence']]
    df_tweets = df_tweets.sort_values(by=['creation_time'], ascending=False)

    return df_tweets


@st.experimental_singleton(suppress_st_warning=True)
def embed_tweets(file, model, query, prompt):
    '''Embed the latest tweets and answer a query over them.'''
    # Split tweets into overlapping chunks for embedding
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_text(file)

    if model == "hkunlp/instructor-large":
        emb = HuggingFaceInstructEmbeddings(
            model_name=model,
            query_instruction='Represent the Financial question for retrieving supporting documents: ',
            embed_instruction='Represent the Financial document for retrieval: ',
        )
    elif model == "sentence-transformers/all-mpnet-base-v2":
        emb = HuggingFaceEmbeddings(model_name=model)
    else:
        raise ValueError(f"Unsupported embedding model: {model}")

    docsearch = FAISS.from_texts(texts, emb)

    chain_type_kwargs = {"prompt": prompt}
    chain = VectorDBQA.from_chain_type(
        ChatOpenAI(temperature=0),
        chain_type="stuff",
        vectorstore=docsearch,
        chain_type_kwargs=chain_type_kwargs,
    )

    result = chain({"query": query})

    return result


CONFIG = {
    "bearer_token": os.environ.get("bearer_token"),
}

sent_model_id = 'nickmuchi/optimum-finbert-tone-finetuned-fintwitter-classification'
topic_model_id = 'nickmuchi/optimum-finbert-tone-finetuned-finance-topic-classification'
task = 'text-classification'

sentiments = {"0": "Bearish", "1": "Bullish", "2": "Neutral"}
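
# A minimal sketch (not part of the original module) of how the label maps here
# are meant to be used: Hugging Face classification pipelines often emit generic
# labels such as "LABEL_2", and dicts like `sentiments` (and `topics` below)
# translate the trailing index to a readable name. `decode_label` is a
# hypothetical helper added for illustration only.
def decode_label(raw_label: str, mapping: dict) -> str:
    '''Map a pipeline label like "LABEL_2" (or a bare "2") to its display name.'''
    key = raw_label.rsplit('_', 1)[-1]
    return mapping.get(key, raw_label)

# e.g. decode_label("LABEL_1", sentiments) -> "Bullish"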
topics = {
    "0": "Analyst Update",
    "1": "Fed | Central Banks",
    "2": "Company | Product News",
    "3": "Treasuries | Corporate Debt",
    "4": "Dividend",
    "5": "Earnings",
    "6": "Energy | Oil",
    "7": "Financials",
    "8": "Currencies",
    "9": "General News | Opinion",
    "10": "Gold | Metals | Materials",
    "11": "IPO",
    "12": "Legal | Regulation",
    "13": "M&A | Investments",
    "14": "Macro",
    "15": "Markets",
    "16": "Politics",
    "17": "Personnel Change",
    "18": "Stock Commentary",
    "19": "Stock Movement",
}

# Accounts to track; user_id is the parallel list of numeric Twitter IDs
user_name = [
    "Investing.com",
    "(((The Daily Shot)))",
    "Bloomberg Markets",
    "FirstSquawk",
    "MarketWatch",
    "markets",
    "FinancialTimes",
    "CNBC",
    "ReutersBiz",
    "BreakingNews",
    "LiveSquawk",
    "NYSE",
    "WSJmarkets",
    "FT",
    "TheStreet",
    "ftfinancenews",
    "BloombergTV",
    "Nasdaq",
    "NYSE",
    "federalreserve",
    "NewYorkFed",
    "sffed",
    "WSJCentralBanks",
    "RichmondFed",
    "ecb",
    "stlouisfed",
    "WorldBank",
    "MarketCurrents",
    "OpenOutcrier",
    "BullTradeFinder",
    "WallStChatter",
    "Briefingcom",
    "SeekingAlpha",
    "realDonaldTrump",
    "AswathDamodaran",
    "ukarlewitz",
    "alphatrends",
    "Investor666",
    "ACInvestorBlog",
    "ZorTrades",
    "ScottNations",
    "TradersCorner",
    "TraderGoalieOne",
    "option_snipper",
    "jasonleavitt",
    "LMT978",
    "OptionsHawk",
    "andrewbtodd",
    "Terri1618",
    "SunriseTrader",
    "traderstewie",
    "TMLTrader",
    "IncredibleTrade",
    "NYFedResearch",
    "YahooFinance",
    "business",
    "economics",
    "IMFNews",
    "Market_Screener",
    "QuickTake",
    "NewsFromBW",
    "BNCommodities",
]

user_id = [
    "988955288",
    "423769635",
    "69620713",
    "59393368",
    "3295423333",
    "624413",
    "69620713",
    "4898091",
    "20402945",
    "15110357",
    "6017542",
    "21323268",
    "28164923",
    "18949452",
    "15281391",
    "11014272",
    "35002876",
    "18639734",
    "21323268",
    "26538229",
    "15072071",
    "117237387",
    "327484803",
    "16532451",
    "83466368",
    "71567590",
    "27860681",
    "15296897",
    "2334614718",
    "2222635612",
    "3382363841",
    "72928001",
    "23059499",
    "25073877",
    "33216611",
    "37284991",
    "15246621",
    "293458690",
    "55561590",
    "18560146",
    "244978426",
    "85523269",
    "276714687",
    "2806294664",
    "16205561",
    "1064700308",
    "61342056",
    "184126162",
    "405820375",
    "787439438964068352",
    "52166809",
    "2715646770",
    "47247213",
    "374672240",
    "19546277",
    "34713362",
    "144274618",
    "25098482",
    "102325185",
    "252751061",
    "976297820532518914",
    "804556370",
]

sentiment_classifier, topic_classifier = load_models()


def convert_user_names(user_name: list) -> str:
    '''Convert user names or IDs into the "from:a OR from:b" query format used by the Twitter search API.'''
    users = [f"from:{user}" for user in user_name]

    return " OR ".join(users)
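
# Hedged usage sketch (an assumption, not part of the original module): wiring
# convert_user_names into tweepy's v2 recent-search endpoint. The "from:"
# operator also accepts numeric IDs, so the user_id list is used here to avoid
# display-name entries like "(((The Daily Shot)))"; batching is an illustrative
# workaround for the 512-character query limit on recent search, and
# fetch_recent_tweets is a hypothetical helper name.
def fetch_recent_tweets(batch_size: int = 15) -> pd.DataFrame:
    '''Fetch recent tweets from the tracked accounts in query-size-safe batches.'''
    client = tweepy.Client(bearer_token=CONFIG["bearer_token"])
    rows = []
    for i in range(0, len(user_id), batch_size):
        query = f"({convert_user_names(user_id[i:i + batch_size])}) -is:retweet"
        response = client.search_recent_tweets(
            query=query,
            tweet_fields=["created_at", "author_id"],
            max_results=100,
        )
        for tweet in response.data or []:
            rows.append({
                "creation_time": tweet.created_at,
                "author": tweet.author_id,
                "tweet": tweet.text,
            })

    # Columns match what process_tweets expects from the raw tweet dataframe
    return pd.DataFrame(rows)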