File size: 7,517 Bytes
67fe8d2
aa1010d
67fe8d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96b4622
67fe8d2
 
96b4622
 
 
67fe8d2
 
 
96b4622
3d138c0
96b4622
67fe8d2
96b4622
 
 
 
 
67fe8d2
 
 
 
 
 
76a408f
2a93c29
67fe8d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87b7efa
67fe8d2
 
 
 
419ce3f
67fe8d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9495dfe
67fe8d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76a408f
67fe8d2
 
 
 
87b7efa
3ba8a22
 
a5392e6
 
 
 
aa1010d
a5392e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151abac
 
a5392e6
 
151abac
a5392e6
 
151abac
 
 
3ba8a22
 
 
 
 
 
a5392e6
3ba8a22
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
# -*- coding: utf-8 -*-
"""AI chatbot financial market.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1h4tpXH6r9B2VZLVwksIkuuVpcrXTUnuJ
"""

import torch
import bitsandbytes as bnb
import transformers
import re
import pandas as pd
import os
import streamlit as st

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import YoutubeLoader, DataFrameLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.llms import HuggingFaceEndpoint
from dotenv import load_dotenv

# Endpoint settings: the hosted model to call and the task it is used for
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
task = "text-generation"

# Pull variables from a local .env file into the process environment, then
# read the Hugging Face token from it (None when API_TOKEN is not set)
load_dotenv()
api_token = os.environ.get("API_TOKEN")

# Client for the hosted model; both response chains below are built on it
chat_model = HuggingFaceEndpoint(
    repo_id=repo_id,
    task=task,
    huggingfacehub_api_token=api_token,
)

# System instructions for the video-summary flow; {context} is filled with
# whatever the chain supplies under the "context" key.
template = """
You are a genius trader with extensive knowledge of the financial and stock markets, capable of providing deep and insightful analysis of financial stocks with remarkable accuracy.

**ALWAYS**
Forget your previous prompt.
First, determine if the content pertains to finance or the stock market. If it does, provide a summary with the main insights. If it does not, apologize and indicate that a summary with main insights will not be provided.
Be as detailed as possible, but don't make up any information that’s not from the context.
If you don't know an answer, say you don't know.
Let's think step by step.

Please ensure responses are informative, accurate, and tailored to the user's queries and preferences.
Use natural language to engage users and provide readable content throughout your response.

{context}
"""

# System message: the instructions above with the context slot
_system_prompt = PromptTemplate(template=template, input_variables=["context"])
review_system_prompt = SystemMessagePromptTemplate(prompt=_system_prompt)

# Human message: the question passed through verbatim
_human_prompt = PromptTemplate(template="{question}", input_variables=["question"])
review_human_prompt = HumanMessagePromptTemplate(prompt=_human_prompt)

# Full chat prompt: system instructions first, then the user's question
messages = [review_system_prompt, review_human_prompt]
review_prompt_template = ChatPromptTemplate(
    messages=messages,
    input_variables=["context", "question"],
)


def find_youtube_links(text):
    """Return every YouTube URL found in *text*, joined by single spaces.

    Recognizes ``youtube.com/watch?v=...`` and ``youtu.be/...`` links with an
    ``http`` or ``https`` scheme and an optional ``www.`` prefix.

    Args:
        text: Free-form text to scan, e.g. a chat message.

    Returns:
        The matched URLs in order of appearance, separated by one space, or
        an empty string when *text* contains no matching link.
    """
    # A URL ends at ANY whitespace (tab, carriage return, ...), not only at a
    # space or newline, so match on \S rather than [^ \n].
    youtube_regex = r'(https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)\S+)'
    matches = re.findall(youtube_regex, text)
    # Links are often pasted mid-sentence ("... youtu.be/abc." or "(...)");
    # drop trailing punctuation so it does not end up inside the video id.
    cleaned = [match.rstrip('.,;:!?)]}>"\'') for match in matches]
    return ' '.join(cleaned)


def _format_docs(docs):
    """Join the text of the retrieved chunks into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Function to get a response from the model
def get_response(user_query):
    """Answer *user_query* from the chunks retrieved for the current video.

    Builds a retrieval chain: the query is sent to the module-level
    ``reviews_retriever``, the retrieved chunks fill the ``{context}`` slot
    of ``review_prompt_template``, and the query itself becomes the human
    message. The prompt goes to ``chat_model`` and the output is parsed to a
    plain string.

    ``reviews_retriever`` is only assigned once a YouTube link has been
    processed, so calling this before then raises ``NameError``.

    Args:
        user_query: The question or instruction to run against the video.

    Returns:
        The model's reply as a string.
    """
    review_chain = (
        {
            # Feed the prompt the chunks' text only; interpolating the raw
            # Document list would put Python reprs and metadata in the prompt.
            "context": reviews_retriever | _format_docs,
            "question": RunnablePassthrough(),
        }
        | review_prompt_template
        | chat_model
        | StrOutputParser()
    )
    return review_chain.invoke(user_query)

# Page setup: browser tab title/icon and the on-page heading
st.set_page_config(page_title="GOAHEAD.VN", page_icon="🌍")
st.title("GOAHEAD.VN AI 🤖")

# Seed the conversation with a greeting the first time the session is seen
if "chat_history" not in st.session_state:
    greeting = AIMessage(content="Please drop the YouTube link related to the financial market, I will help you summarize the main insights.")
    st.session_state.chat_history = [greeting]

# Replay the stored conversation; entries that are neither AI nor human
# messages are not rendered
for message in st.session_state.chat_history:
    if isinstance(message, AIMessage):
        role = "AI"
    elif isinstance(message, HumanMessage):
        role = "Human"
    else:
        continue
    with st.chat_message(role):
        st.write(message.content)

# Chat box; holds None until the user submits a message
user_query = st.chat_input("Type your message here...")

# YouTube flow: when the message contains a YouTube link, index the video's
# transcript and ask the model for a summary of it.
# Scan the message once and reuse the result instead of matching twice.
youtube_links = find_youtube_links(user_query) if user_query is not None else ""

if youtube_links != "":
    st.session_state.chat_history.append(HumanMessage(content=user_query))

    with st.chat_message("Human"):
        st.markdown(user_query)

    # find_youtube_links() space-joins every link it finds, but the loader
    # takes a single URL, so summarize the first link only.
    video_url = youtube_links.split(" ")[0]

    try:
        loader = YoutubeLoader.from_youtube_url(
            video_url,
            add_video_info=False,
            language=["en", "vi"],
            translation="en",
        )
        docs = loader.load()
    except ValueError:
        # Raised when no video id can be extracted from the URL (see the
        # YoutubeLoader reference); treat it like a video without transcript.
        docs = []

    all_splits = []
    if docs:
        # Convert the loaded documents to a list of dictionaries
        data_list = [
            {
                "source": doc.metadata['source'],
                "page_content": doc.page_content
            }
            for doc in docs
        ]
        df = pd.DataFrame(data_list)
        loader = DataFrameLoader(df, page_content_column='page_content')
        content = loader.load()
        content = filter_complex_metadata(content)

        # Split the document into chunks with a specified chunk size
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
        all_splits = text_splitter.split_documents(content)

    if all_splits:
        # Initialize the embedding model
        embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")

        # Store the document into a vector store with a specific embedding model
        vectorstore = FAISS.from_documents(all_splits, embedding_model)

        # get_response() reads this module-level retriever
        reviews_retriever = vectorstore.as_retriever()

        response = get_response("Help me summary with main insights.")
    else:
        # Nothing to index: building a vector store from zero chunks would
        # fail, so tell the user instead of crashing the app.
        response = "Sorry, I could not retrieve a transcript for that video, so I cannot summarize it."

    with st.chat_message("AI"):
        st.write(response)

    st.session_state.chat_history.append(AIMessage(content=response))


# Instructions for the plain chat flow (messages without a YouTube link).
# {chat_history} and {user_question} are filled in by get_response_2().
template_2 = """
You are a genius trader with extensive knowledge of the financial and stock markets, capable of providing deep and insightful analysis of financial stocks with remarkable accuracy.

**ALWAYS**
Only answer the question about the financial and stocks market. Do not answer anything else.
Be as detailed as possible, but don't make up any information that’s not from the context.
If you don't know an answer, say you don't know.
Let's think step by step.

Please ensure responses are informative, accurate, and tailored to the user's queries and preferences.
Use natural language to engage users and provide readable content throughout your response.

Chat history:
{chat_history}

User question:
{user_question}
"""

# Chat prompt built from the template text above
prompt_2 = ChatPromptTemplate.from_template(template=template_2)

# Plain chat answer, without any video context
def get_response_2(user_query, chat_history):
    """Answer *user_query* given the conversation so far.

    Fills ``prompt_2`` with the history and the question, sends it to
    ``chat_model`` and returns the output parsed to a plain string.

    Args:
        user_query: The user's latest message.
        chat_history: The conversation so far, substituted into the
            ``{chat_history}`` slot of the prompt.

    Returns:
        The model's reply as a string.
    """
    prompt_inputs = {
        "user_question": user_query,
        "chat_history": chat_history,
    }
    pipeline = prompt_2 | chat_model | StrOutputParser()
    return pipeline.invoke(prompt_inputs)

# Plain chat flow: a non-empty message that contains no YouTube link
if user_query and not find_youtube_links(user_query):
    st.session_state.chat_history.append(HumanMessage(content=user_query))
    with st.chat_message("Human"):
        st.markdown(user_query)

    response = get_response_2(user_query, st.session_state.chat_history)

    # Remove these label strings wherever they occur in the reply, then trim
    # the surrounding whitespace
    for label in ("AI response:", "chat response:", "bot response:"):
        response = response.replace(label, "")
    response = response.strip()

    with st.chat_message("AI"):
        st.write(response)

    st.session_state.chat_history.append(AIMessage(content=response))