Spaces:

nickmuchi
/

fintweet-GPT-Search

Build error

App Files Files Community

fintweet-GPT-Search / variables.py

nickmuchi

Update variables.py

5ec18f4 over 1 year ago

raw

history blame

4.92 kB

	##Variables

	import os
	import streamlit as st
	import pathlib

	from langchain.embeddings import HuggingFaceEmbeddings,HuggingFaceInstructEmbeddings
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.vectorstores import FAISS
	from langchain.chat_models.openai import ChatOpenAI
	from langchain import VectorDBQA
	import pandas as pd

	from langchain.chat_models import ChatOpenAI
	from langchain.prompts.chat import (
	ChatPromptTemplate,
	SystemMessagePromptTemplate,
	AIMessagePromptTemplate,
	HumanMessagePromptTemplate,
	)
	from langchain.schema import (
	AIMessage,
	HumanMessage,
	SystemMessage
	)

	from optimum.onnxruntime import ORTModelForSequenceClassification
	from transformers import pipeline, AutoTokenizer
	from optimum.pipelines import pipeline
	import tweepy
	import pandas as pd
	import numpy as np
	import plotly_express as px
	import plotly.graph_objects as go
	from datetime import datetime as dt
	from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode
	from datasets import Dataset
	from huggingface_hub import Repository

	@st.experimental_singleton(suppress_st_warning=True)
	def load_models():
	    '''Build the sentiment and topic classification pipelines.

	    Both pipelines are created with the module-level ``task`` and use the
	    same model id for the model and its tokenizer.

	    Returns:
	        A ``(sentiment_pipeline, topic_pipeline)`` tuple, in that order.
	    '''
	    model_ids = (sent_model_id, topic_model_id)

	    return tuple(
	        pipeline(task, model=model_id, tokenizer=model_id)
	        for model_id in model_ids
	    )

	@st.cache(allow_output_mutation=True, suppress_st_warning=True)
	def process_tweets(df,df_users):
	    '''Merge tweets with their authors and attach sentiment/topic labels.

	    Args:
	        df: DataFrame of tweets. Its 'author' column is cast to int64 for
	            the join; the caller's frame is left untouched.
	        df_users: DataFrame of users, joined to ``df`` on 'author'.

	    Returns:
	        DataFrame with the columns 'creation_time', 'username', 'tweet',
	        'sentiment', 'topic', 'sentiment_confidence' and 'topic_confidence',
	        sorted by 'creation_time' descending. Confidences are rounded to
	        two decimals and then multiplied by 100.
	    '''
	    # Cast on a copy: mutating the caller's frame inside a cached function
	    # would make the side effect happen only on a cache miss.
	    tweets = df.assign(author=df['author'].astype(np.int64))

	    df_merged = tweets.merge(df_users, on='author')

	    tweet_list = df_merged['tweet'].tolist()

	    # Each classifier output is turned into a frame whose 'score'/'label'
	    # columns are renamed so the two results can sit side by side.
	    sentiment = pd.DataFrame(sentiment_classifier(tweet_list)).rename(
	        columns={'score':'sentiment_confidence','label':'sentiment'}
	    )
	    topic = pd.DataFrame(topic_classifier(tweet_list)).rename(
	        columns={'score':'topic_confidence','label':'topic'}
	    )

	    # Column-wise concat aligns on the index of the merged frame and the
	    # two prediction frames.
	    df_group = pd.concat([df_merged,sentiment,topic],axis=1)

	    confidence_cols = ['sentiment_confidence','topic_confidence']
	    df_group[confidence_cols] = df_group[confidence_cols].round(2).mul(100)

	    df_tweets = df_group[['creation_time','username','tweet','sentiment','topic','sentiment_confidence','topic_confidence']]

	    return df_tweets.sort_values(by=['creation_time'],ascending=False)

	@st.experimental_singleton(suppress_st_warning=True)
	def create_vectorstore(file,model):
	    '''Create a FAISS vectorstore from a block of text.

	    Args:
	        file: text to index; it is passed to the splitter's ``split_text``.
	        model: embedding model name. Supported values are
	            "hkunlp/instructor-large" and
	            "sentence-transformers/all-mpnet-base-v2".

	    Returns:
	        The FAISS store built from the text chunks and the chosen embeddings.

	    Raises:
	        ValueError: if ``model`` is not one of the supported names.
	    '''

	    # Split the text into chunks of up to 500 characters with 50 overlap
	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
	    texts = text_splitter.split_text(file)

	    if model == "hkunlp/instructor-large":
	        emb = HuggingFaceInstructEmbeddings(model_name=model,
	                                            query_instruction='Represent the Financial question for retrieving supporting documents: ',
	                                            embed_instruction='Represent the Financial document for retrieval: ')

	    elif model == "sentence-transformers/all-mpnet-base-v2":
	        emb = HuggingFaceEmbeddings(model_name=model)

	    else:
	        # Without this branch `emb` would be unbound and the call below
	        # would fail with an opaque UnboundLocalError.
	        raise ValueError(
	            f"Unsupported embedding model {model!r}; expected "
	            "'hkunlp/instructor-large' or "
	            "'sentence-transformers/all-mpnet-base-v2'"
	        )

	    docsearch = FAISS.from_texts(texts, emb)

	    return docsearch


	@st.experimental_singleton(suppress_st_warning=True)
	def embed_tweets(query,_prompt,_docsearch):
	    '''Answer ``query`` with a VectorDBQA "stuff" chain over ``_docsearch``.

	    Args:
	        query: question passed to the chain under the "query" key.
	        _prompt: prompt handed to the chain via ``chain_type_kwargs``.
	        _docsearch: vectorstore the chain retrieves from (``k=3``).

	    Returns:
	        Whatever the chain call returns; the chain is built with
	        ``return_source_documents=True``.

	    NOTE(review): the leading underscores presumably keep ``_prompt`` and
	    ``_docsearch`` out of Streamlit's cache key, so results would be cached
	    per ``query`` only -- confirm.
	    '''
	    # Deterministic chat model (temperature 0) drives the QA chain.
	    llm = ChatOpenAI(temperature=0)

	    qa_chain = VectorDBQA.from_chain_type(
	        llm,
	        chain_type="stuff",
	        vectorstore=_docsearch,
	        chain_type_kwargs={"prompt": _prompt},
	        return_source_documents=True,
	        k=3,
	    )

	    return qa_chain({"query": query})

	# Runtime configuration; the bearer token is read from the environment
	# when the module is imported and is None if the variable is unset.
	CONFIG = dict(bearer_token=os.getenv("bearer_token"))

	# Model ids and task name consumed by load_models().
	task = 'text-classification'
	sent_model_id = 'nickmuchi/optimum-finbert-tone-finetuned-fintwitter-classification'
	topic_model_id = 'nickmuchi/optimum-finbert-tone-finetuned-finance-topic-classification'

	# String index -> sentiment name.
	sentiments = {
	    str(index): name
	    for index, name in enumerate(("Bearish", "Bullish", "Neutral"))
	}

	# String index -> topic name.
	# Raw strings are used wherever a name contains "\|": in a normal literal
	# that is an invalid escape sequence and triggers a warning on modern
	# Python. The runtime values are unchanged (the backslash is kept).
	# NOTE(review): confirm whether consumers actually expect the backslash
	# before each "|".
	topics = {
	    "0": "Analyst Update",
	    "1": r"Fed \| Central Banks",
	    "2": r"Company \| Product News",
	    "3": r"Treasuries \| Corporate Debt",
	    "4": "Dividend",
	    "5": "Earnings",
	    "6": r"Energy \| Oil",
	    "7": "Financials",
	    "8": "Currencies",
	    "9": r"General News \| Opinion",
	    "10": r"Gold \| Metals \| Materials",
	    "11": "IPO",
	    "12": r"Legal \| Regulation",
	    "13": r"M&A \| Investments",
	    "14": "Macro",
	    "15": "Markets",
	    "16": "Politics",
	    "17": "Personnel Change",
	    "18": "Stock Commentary",
	    "19": "Stock Movement",
	}

	# Load both classifiers when the module is imported; process_tweets()
	# reads these two module-level names.
	sentiment_classifier, topic_classifier = load_models()

	def convert_user_names(user_name: list) -> str:
	    '''Build a search query string from a list of user names.

	    Each name is prefixed with ``from:`` and the terms are joined with
	    `` OR ``, e.g. ``["a", "b"]`` -> ``"from:a OR from:b"``.

	    Args:
	        user_name: user names to include in the query.

	    Returns:
	        The joined query string; an empty string for an empty list.
	    '''
	    return " OR ".join(f"from:{user}" for user in user_name)