# -*- coding: utf-8 -*-
"""Untitled68.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1h4tpXH6r9B2VZLVwksIkuuVpcrXTUnuJ

Streamlit chat app: the user sends a message containing one or more YouTube
links, the transcripts are loaded, split, embedded into a FAISS index, and a
hosted Mixtral model is asked to summarize them using retrieved chunks as
context.
"""

# NOTE(review): torch, bitsandbytes, transformers and several langchain names
# below are not used by the code in this file; they are kept because other
# tooling or notebook cells may rely on them being imported.
import torch
import bitsandbytes as bnb
import transformers
import re
import pandas as pd
import os
import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain.llms import HuggingFacePipeline
from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import YoutubeLoader, DataFrameLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.llms import HuggingFaceEndpoint
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get the API token from environment variable
api_token = os.getenv("API_TOKEN")

# Define the repository ID and task
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
task = "text-generation"

# Initialize the Hugging Face Endpoint
chat_model = HuggingFaceEndpoint(
    huggingfacehub_api_token=api_token,
    repo_id=repo_id,
    task=task,
)

# System prompt; `{context}` is filled with the retrieved transcript chunks.
template = """
You are a genius trader with extensive knowledge of the financial and stock markets, capable of providing deep and insightful analysis of financial stocks with remarkable accuracy. **ALWAYS** Summarize and provide the main insights. Be as detailed as possible, but don't make up any information that’s not from the context. If you don't know an answer, say you don't know. Let's think step by step.
Please ensure responses are informative, accurate, and tailored to the user's queries and preferences. Use natural language to engage users and provide readable content throughout your response. {context}
"""

review_system_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=["context"],
        template=template,
    )
)

review_human_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=["question"],
        template="{question}",
    )
)

messages = [review_system_prompt, review_human_prompt]

review_prompt_template = ChatPromptTemplate(
    input_variables=["context", "question"],
    messages=messages,
)

# Regular expression for YouTube watch / youtu.be URLs; a match runs up to
# the next space or newline.
YOUTUBE_URL_PATTERN = re.compile(
    r'(https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)[^ \n]+)'
)

# Characters that commonly trail a URL pasted into a sentence ("... link.").
# Left attached, they end up inside the video id and the loader rejects it.
_TRAILING_PUNCTUATION = ".,;:!?)]}>\"'"

EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"

# Fixed question sent to the model for every submitted video.
SUMMARY_QUERY = "Help me summary and provide main insights."

NO_LINK_REPLY = (
    "Please send a message that contains a YouTube link "
    "(youtube.com/watch?v=... or youtu.be/...) so I can summarize it."
)


def _extract_youtube_links(text: str) -> list:
    """Return every YouTube URL found in *text*, in order of appearance.

    Trailing sentence punctuation is stripped from each match.
    """
    return [
        match.rstrip(_TRAILING_PUNCTUATION)
        for match in YOUTUBE_URL_PATTERN.findall(text)
    ]


def find_youtube_links(text: str) -> str:
    """Return the YouTube URLs found in *text* joined by single spaces.

    Returns an empty string when *text* contains no YouTube URL, so the
    result can be used directly as a "has a link" check.
    """
    return ' '.join(_extract_youtube_links(text))


def _format_docs(docs) -> str:
    """Join the page content of retrieved documents into one prompt string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Function to get a response from the model
def get_response(user_query, retriever=None):
    """Run the retrieval-augmented chain for *user_query* and return its text.

    Args:
        user_query: Question passed both to the retriever and to the prompt.
        retriever: Retriever supplying the context. When omitted, the
            module-level ``reviews_retriever`` is used, which only exists
            after a video has been indexed by the chat flow below.

    Returns:
        The model output parsed to a string.
    """
    if retriever is None:
        retriever = reviews_retriever

    review_chain = (
        # Documents are flattened to plain text so the prompt does not
        # contain the Python repr of a list of Document objects.
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | review_prompt_template
        | chat_model
        | StrOutputParser()
    )
    return review_chain.invoke(user_query)


@st.cache_resource
def _load_embedding_model():
    """Create the sentence-embedding model once and reuse it across reruns."""
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


def _build_retriever(links):
    """Load the transcripts for *links* and index them in a FAISS retriever.

    Args:
        links: Iterable of YouTube URLs, one video per entry.

    Returns:
        A retriever over the chunked transcripts.

    Raises:
        ValueError: If no transcript text could be loaded or split.
    """
    docs = []
    for link in links:
        # One loader per URL: the loader derives a single video id from it.
        loader = YoutubeLoader.from_youtube_url(
            link,
            add_video_info=False,
            language=["en", "vi"],
            translation="en",
        )
        docs.extend(loader.load())

    if not docs:
        raise ValueError("No transcript could be loaded from the provided link(s).")

    # Convert the loaded documents to a list of dictionaries
    data_list = [
        {
            "source": doc.metadata['source'],
            "page_content": doc.page_content,
        }
        for doc in docs
    ]
    df = pd.DataFrame(data_list)
    content = DataFrameLoader(df, page_content_column='page_content').load()
    content = filter_complex_metadata(content)

    # Split the document into chunks with a specified chunk size
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
    all_splits = text_splitter.split_documents(content)
    if not all_splits:
        raise ValueError("The transcript is empty; there is nothing to summarize.")

    # Store the document into a vector store with a specific embedding model
    vectorstore = FAISS.from_documents(all_splits, _load_embedding_model())
    return vectorstore.as_retriever()


# App config
st.set_page_config(page_title="GOAHEAD.VN", page_icon="🌍")
st.title("Summary and provide insights from youtube news.")

# Initialize session state
if "chat_history" not in st.session_state:
    st.session_state.chat_history = [
        AIMessage(content="Hello, how can I help you?"),
    ]

# Display chat history
for message in st.session_state.chat_history:
    if isinstance(message, AIMessage):
        with st.chat_message("AI"):
            st.write(message.content)
    elif isinstance(message, HumanMessage):
        with st.chat_message("Human"):
            st.write(message.content)

# User input
user_query = st.chat_input("Type your message here...")
if user_query is not None:
    st.session_state.chat_history.append(HumanMessage(content=user_query))
    with st.chat_message("Human"):
        st.markdown(user_query)

    # De-duplicate while keeping order so a repeated link is indexed once.
    youtube_links = list(dict.fromkeys(_extract_youtube_links(user_query)))

    response = None
    if not youtube_links:
        # Answer instead of silently ignoring a message without a link.
        response = NO_LINK_REPLY
    else:
        try:
            reviews_retriever = _build_retriever(youtube_links)
            response = get_response(SUMMARY_QUERY, reviews_retriever)
        except Exception as exc:
            # Top-level UI boundary: transcript download, embedding and the
            # remote endpoint can all fail; show the reason rather than a
            # raw traceback.
            st.error(f"Could not summarize the video: {exc}")

    if response is not None:
        with st.chat_message("AI"):
            st.write(response)
        st.session_state.chat_history.append(AIMessage(content=response))