# fintweet-GPT-Search / variables.py
# Repository file-view header: uploader nickmuchi, commit 5ec18f4 ("Update variables.py"), 4.92 kB.
##Variables
import os
import streamlit as st
import pathlib
from langchain.embeddings import HuggingFaceEmbeddings,HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chat_models.openai import ChatOpenAI
from langchain import VectorDBQA
import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
AIMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.schema import (
AIMessage,
HumanMessage,
SystemMessage
)
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer
from optimum.pipelines import pipeline
import tweepy
import pandas as pd
import numpy as np
import plotly_express as px
import plotly.graph_objects as go
from datetime import datetime as dt
from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode
from datasets import Dataset
from huggingface_hub import Repository
@st.experimental_singleton(suppress_st_warning=True)
def load_models():
    '''Build the sentiment and topic classification pipelines.

    Each pipeline is created for the module-level ``task`` and uses its model
    id (``sent_model_id`` / ``topic_model_id``) as both model and tokenizer.

    Returns:
        tuple: ``(sentiment_pipeline, topic_pipeline)``, in that order.
    '''
    model_ids = (sent_model_id, topic_model_id)
    # The generator is consumed left to right, so the sentiment pipeline is
    # built first and the topic pipeline second.
    sentiment_pipeline, topic_pipeline = (
        pipeline(task, model=model_id, tokenizer=model_id)
        for model_id in model_ids
    )
    return sentiment_pipeline, topic_pipeline
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def process_tweets(df,df_users):
    '''Classify tweets and return them as a display-ready dataframe.

    Args:
        df: Dataframe of tweets. The code reads its ``author`` and ``tweet``
            columns; ``creation_time`` must be present after the merge.
        df_users: Dataframe of users, joined to ``df`` on ``author``. It is
            expected to supply the ``username`` column used in the output
            (assumption based on the selected columns - confirm with caller).

    Returns:
        A dataframe with the columns ``creation_time``, ``username``,
        ``tweet``, ``sentiment``, ``topic``, ``sentiment_confidence`` and
        ``topic_confidence``, sorted by ``creation_time`` (newest first).
        Confidences are expressed as percentages.
    '''
    # Build a new frame instead of assigning into ``df`` so the caller's
    # dataframe is not modified as a side effect of this (cached) call.
    tweets = df.assign(author=df['author'].astype(np.int64))
    df_merged = tweets.merge(df_users, on='author')
    tweet_list = df_merged['tweet'].tolist()

    # Each classifier output is loaded into a dataframe whose ``score`` and
    # ``label`` columns are renamed to task-specific names.
    sentiment = pd.DataFrame(sentiment_classifier(tweet_list)).rename(
        columns={'score': 'sentiment_confidence', 'label': 'sentiment'}
    )
    topic = pd.DataFrame(topic_classifier(tweet_list)).rename(
        columns={'score': 'topic_confidence', 'label': 'topic'}
    )

    # The merge yields a fresh 0..n-1 index, so a column-wise concat lines the
    # predictions up with their tweets row by row.
    df_group = pd.concat([df_merged, sentiment, topic], axis=1)

    confidence_cols = ['sentiment_confidence', 'topic_confidence']
    # Round, scale to a percentage, then round again: the scaling alone can
    # leave float artifacts such as 56.99999999999999 instead of 57.0.
    df_group[confidence_cols] = (
        df_group[confidence_cols].round(2).mul(100).round(2)
    )

    output_cols = [
        'creation_time',
        'username',
        'tweet',
        'sentiment',
        'topic',
        'sentiment_confidence',
        'topic_confidence',
    ]
    return df_group[output_cols].sort_values(by=['creation_time'], ascending=False)
@st.experimental_singleton(suppress_st_warning=True)
def create_vectorstore(file,model):
    '''Create a FAISS vectorstore from a block of tweet text.

    Args:
        file: The text to index; it is split into overlapping chunks first.
        model: Embedding model id. Supported values are
            ``"hkunlp/instructor-large"`` and
            ``"sentence-transformers/all-mpnet-base-v2"``.

    Returns:
        The FAISS vectorstore built from the embedded text chunks.

    Raises:
        ValueError: If ``model`` is not one of the supported model ids.
    '''
    # Pick the embedding backend first so an unsupported model fails fast with
    # a clear message instead of an unbound-variable error further down.
    if model == "hkunlp/instructor-large":
        emb = HuggingFaceInstructEmbeddings(model_name=model,
                                            query_instruction='Represent the Financial question for retrieving supporting documents: ',
                                            embed_instruction='Represent the Financial document for retrieval: ')
    elif model == "sentence-transformers/all-mpnet-base-v2":
        emb = HuggingFaceEmbeddings(model_name=model)
    else:
        raise ValueError(
            f"Unsupported embedding model {model!r}; expected "
            "'hkunlp/instructor-large' or 'sentence-transformers/all-mpnet-base-v2'"
        )

    # Split tweets into chunks of up to 500 characters with a 50-character overlap
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_text(file)

    docsearch = FAISS.from_texts(texts, emb)
    return docsearch
@st.experimental_singleton(suppress_st_warning=True)
def embed_tweets(query,_prompt,_docsearch):
    '''Answer a query with a "stuff" question-answering chain over a vectorstore.

    Args:
        query: The question passed to the chain under the ``"query"`` key.
        _prompt: Prompt handed to the chain via ``chain_type_kwargs``.
        _docsearch: Vectorstore the chain retrieves from.

    Returns:
        The chain's result for ``query``; source documents are requested via
        ``return_source_documents=True``.

    NOTE(review): Streamlit does not hash parameters whose names start with an
    underscore, so the cached result appears to be keyed on ``query`` alone -
    confirm that reusing an answer after the vectorstore changes is intended.
    '''
    llm = ChatOpenAI(temperature=0)
    qa_chain = VectorDBQA.from_chain_type(
        llm,
        chain_type="stuff",
        vectorstore=_docsearch,
        chain_type_kwargs={"prompt": _prompt},
        return_source_documents=True,
        k=3,
    )
    return qa_chain({"query": query})
# API configuration; the bearer token is read from the environment and is
# None when the "bearer_token" variable is not set.
CONFIG = {"bearer_token": os.environ.get("bearer_token")}

# Pipeline task and model ids consumed by load_models().
task = 'text-classification'
sent_model_id = 'nickmuchi/optimum-finbert-tone-finetuned-fintwitter-classification'
topic_model_id = 'nickmuchi/optimum-finbert-tone-finetuned-finance-topic-classification'

# Label names in class-id order; position i becomes key str(i) below.
_SENTIMENT_LABELS = ("Bearish", "Bullish", "Neutral")
_TOPIC_LABELS = (
    "Analyst Update",
    "Fed | Central Banks",
    "Company | Product News",
    "Treasuries | Corporate Debt",
    "Dividend",
    "Earnings",
    "Energy | Oil",
    "Financials",
    "Currencies",
    "General News | Opinion",
    "Gold | Metals | Materials",
    "IPO",
    "Legal | Regulation",
    "M&A | Investments",
    "Macro",
    "Markets",
    "Politics",
    "Personnel Change",
    "Stock Commentary",
    "Stock Movement",
)

# Class id (as a string) -> human-readable label.
sentiments = {str(idx): label for idx, label in enumerate(_SENTIMENT_LABELS)}
topics = {str(idx): label for idx, label in enumerate(_TOPIC_LABELS)}

# Build both classification pipelines once, at import time.
sentiment_classifier, topic_classifier = load_models()
def convert_user_names(user_name: list) -> str:
    '''Build a search query fragment that matches tweets from any given user.

    Args:
        user_name: User names to include; each becomes a ``from:<name>`` term.

    Returns:
        The ``from:`` terms joined with ``" OR "``. An empty input yields an
        empty string; a single name yields one term with no ``OR``.
    '''
    return " OR ".join(f"from:{user}" for user in user_name)