Spaces:
Build error
Build error
##Variables | |
import os | |
import streamlit as st | |
import pathlib | |
from langchain.embeddings import HuggingFaceEmbeddings,HuggingFaceInstructEmbeddings | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.vectorstores import FAISS | |
from langchain.chat_models.openai import ChatOpenAI | |
from langchain import VectorDBQA | |
import pandas as pd | |
from langchain.chat_models import ChatOpenAI | |
from langchain.prompts.chat import ( | |
ChatPromptTemplate, | |
SystemMessagePromptTemplate, | |
AIMessagePromptTemplate, | |
HumanMessagePromptTemplate, | |
) | |
from langchain.schema import ( | |
AIMessage, | |
HumanMessage, | |
SystemMessage | |
) | |
from optimum.onnxruntime import ORTModelForSequenceClassification | |
from transformers import pipeline, AutoTokenizer | |
from optimum.pipelines import pipeline | |
import tweepy | |
import pandas as pd | |
import numpy as np | |
import plotly_express as px | |
import plotly.graph_objects as go | |
from datetime import datetime as dt | |
from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode | |
from datasets import Dataset | |
from huggingface_hub import Repository | |
def load_models():
    '''Create the sentiment and topic text-classification pipelines.

    Each pipeline is built with the module-level ``task`` and uses a single
    model id for both its model and its tokenizer (``sent_model_id`` for
    sentiment, ``topic_model_id`` for topic).

    Returns:
        tuple: ``(sentiment_pipeline, topic_pipeline)``, in that order.
    '''
    # Order matters: callers unpack the result as (sentiment, topic).
    return tuple(
        pipeline(task, model=model_id, tokenizer=model_id)
        for model_id in (sent_model_id, topic_model_id)
    )
def process_tweets(df,df_users):
    '''Merge tweets with their authors and label each tweet.

    The tweets are joined to ``df_users`` on ``author``, every tweet text is
    run through the module-level ``sentiment_classifier`` and
    ``topic_classifier``, and the result is returned newest first.

    Args:
        df: DataFrame of tweets. Needs an ``author`` column whose values can
            be cast to int64. The caller's frame is not modified.
        df_users: DataFrame of users, joined on its ``author`` column.
            After the join the frame must provide ``creation_time``,
            ``username`` and ``tweet``.

    Returns:
        DataFrame with the columns ``creation_time``, ``username``,
        ``tweet``, ``sentiment``, ``topic``, ``sentiment_confidence`` and
        ``topic_confidence``, sorted by ``creation_time`` descending. The
        confidence columns are the classifier scores rounded to two decimals
        and multiplied by 100. When the join yields no rows, an empty frame
        with these columns is returned and the classifiers are not called.
    '''
    output_columns = ['creation_time', 'username', 'tweet', 'sentiment', 'topic',
                      'sentiment_confidence', 'topic_confidence']
    confidence_columns = ['sentiment_confidence', 'topic_confidence']

    # Cast the join key on a copy: assigning to df['author'] directly would
    # silently change the dtype of the caller's DataFrame.
    tweets = df.assign(author=df['author'].astype(np.int64))
    df_merged = tweets.merge(df_users, on='author')

    if df_merged.empty:
        # With nothing to classify the classifier output would have no
        # 'score'/'label' columns (presumably an empty list), and selecting
        # the confidence columns below would raise KeyError.
        return df_merged.reindex(columns=output_columns)

    tweet_list = df_merged['tweet'].tolist()

    sentiment = pd.DataFrame(sentiment_classifier(tweet_list)).rename(
        columns={'score': 'sentiment_confidence', 'label': 'sentiment'}
    )
    topic = pd.DataFrame(topic_classifier(tweet_list)).rename(
        columns={'score': 'topic_confidence', 'label': 'topic'}
    )

    # merge() returns a fresh 0..n-1 index, so the positional classifier
    # results line up row-for-row with the merged tweets.
    df_group = pd.concat([df_merged, sentiment, topic], axis=1)
    df_group[confidence_columns] = df_group[confidence_columns].round(2).mul(100)

    df_tweets = df_group[output_columns]
    return df_tweets.sort_values(by=['creation_time'], ascending=False)
def create_vectorstore(file,model):
    '''Create a FAISS vectorstore from a block of text.

    Args:
        file: The text to index. It is passed straight to the splitter's
            ``split_text``, so despite the name it is the text itself
            (presumably the concatenated tweets), not a path or file object.
        model: Embedding model id. Supported values are
            ``"hkunlp/instructor-large"`` and
            ``"sentence-transformers/all-mpnet-base-v2"``.

    Returns:
        The FAISS vectorstore built from the text chunks.

    Raises:
        ValueError: If ``model`` is not one of the supported ids.
    '''
    instructor_model = "hkunlp/instructor-large"
    mpnet_model = "sentence-transformers/all-mpnet-base-v2"

    # Reject unknown models before doing any work. Previously an unsupported
    # id left `emb` unbound and failed later with an UnboundLocalError.
    if model not in (instructor_model, mpnet_model):
        raise ValueError(
            f"Unsupported embedding model {model!r}; "
            f"expected {instructor_model!r} or {mpnet_model!r}"
        )

    # Split the text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_text(file)

    if model == instructor_model:
        # Instructor models take separate instructions for queries and documents.
        emb = HuggingFaceInstructEmbeddings(
            model_name=model,
            query_instruction='Represent the Financial question for retrieving supporting documents: ',
            embed_instruction='Represent the Financial document for retrieval: ',
        )
    else:
        emb = HuggingFaceEmbeddings(model_name=model)

    docsearch = FAISS.from_texts(texts, emb)
    return docsearch
def embed_tweets(query,_prompt,_docsearch):
    '''Answer a question with a "stuff" QA chain over a vectorstore.

    Args:
        query: The question, passed to the chain under the ``"query"`` key.
        _prompt: Prompt handed to the chain via ``chain_type_kwargs``.
        _docsearch: Vectorstore the chain retrieves from.

    Returns:
        Whatever the chain returns for the query. The chain is built with
        ``return_source_documents=True``, so the result presumably also
        carries the retrieved source documents.
    '''
    qa_chain = VectorDBQA.from_chain_type(
        ChatOpenAI(temperature=0),
        chain_type="stuff",
        vectorstore=_docsearch,
        chain_type_kwargs={"prompt": _prompt},
        return_source_documents=True,
        k=3,  # presumably the number of chunks retrieved per query
    )
    return qa_chain({"query": query})
# Bearer token read from the environment (None when the variable is unset);
# presumably the Twitter API credential used with tweepy.
CONFIG = {"bearer_token": os.getenv("bearer_token")}

# Model ids and the pipeline task used by load_models().
sent_model_id = 'nickmuchi/optimum-finbert-tone-finetuned-fintwitter-classification'
topic_model_id = 'nickmuchi/optimum-finbert-tone-finetuned-finance-topic-classification'
task = 'text-classification'

# Label lookups keyed by the class index as a string ("0", "1", ...).
# The position of each label in the tuple IS its class index, so do not
# reorder the entries.
sentiments = {
    str(class_id): label
    for class_id, label in enumerate(("Bearish", "Bullish", "Neutral"))
}

topics = {
    str(class_id): label
    for class_id, label in enumerate((
        "Analyst Update",
        "Fed | Central Banks",
        "Company | Product News",
        "Treasuries | Corporate Debt",
        "Dividend",
        "Earnings",
        "Energy | Oil",
        "Financials",
        "Currencies",
        "General News | Opinion",
        "Gold | Metals | Materials",
        "IPO",
        "Legal | Regulation",
        "M&A | Investments",
        "Macro",
        "Markets",
        "Politics",
        "Personnel Change",
        "Stock Commentary",
        "Stock Movement",
    ))
}
# Build both pipelines once, when the module is imported. process_tweets()
# looks these two names up as module globals.
sentiment_classifier, topic_classifier = load_models()
def convert_user_names(user_name: list):
    '''Build a search query that matches tweets from any of the given users.

    Every name becomes a ``from:<name>`` term and the terms are joined with
    " OR ". An empty input produces an empty string.
    '''
    return " OR ".join(f"from:{handle}" for handle in user_name)