import os
import pickle
import time

import faiss
import numpy as np
import streamlit as st
from dotenv import load_dotenv
from langchain.schema import SystemMessage, HumanMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain_community.llms import HuggingFaceEndpoint
from sentence_transformers import SentenceTransformer

load_dotenv()  # take environment variables from .env (especially HUGGINGFACEHUB_API_TOKEN)

def querypreprocess(query: str, model, index, sentences, k: int = 2) -> str:
    """Embed the query with the same sentence-transformer model used for the
    documents, search the FAISS index, and return the k nearest chunk texts."""
    # encode as a batch of one: FAISS expects a 2-D float32 array
    vec = np.asarray(model.encode([query]), dtype="float32")
    distances, indices = index.search(vec, k)
    return " ".join(sentences[i] for i in indices[0])
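
# A minimal sketch (not executed here) of the shape contract behind index.search,
# assuming a built `index` and the same sentence-transformer `model`:
#
#   q = np.asarray(model.encode(["example question"]), dtype="float32")  # shape (1, d)
#   D, I = index.search(q, 2)  # D: squared L2 distances, I: chunk row indices, both shape (1, 2)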
    
def augmented_prompt(query: str, source_knowledge: str) -> str:
    """Wrap the retrieved chunk texts and the user's question in one prompt."""
    return f"""Using the contexts below, answer the query.

Contexts:
{source_knowledge}

Question: {query}
Answer:"""
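
# For example, with source_knowledge "Stocks rallied on Friday." and the query
# "What did stocks do?", the rendered prompt looks roughly like:
#
#   Using the contexts below, answer the query.
#
#   Contexts:
#   Stocks rallied on Friday.
#
#   Question: What did stocks do?
#   Answer: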

   
st.title("RockyBot: News Research Tool πŸ“ˆ")
st.sidebar.title("News Article URLs")

urls = []
for i in range(3):
    url = st.sidebar.text_input(f"URL {i+1}")
    urls.append(url)

process_url_clicked = st.sidebar.button("Process URLs")
file_path = "sentence_embeddings.pkl"

main_placeholder = st.empty()
#llm = OpenAI(temperature=0.9, max_tokens=500)
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2")
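# HuggingFaceEndpoint reads the HUGGINGFACEHUB_API_TOKEN environment variable
# (loaded above via .env). A minimal sketch of calling it directly, assuming
# the token is set -- not executed here:
#
#   print(llm.invoke("What is retrieval-augmented generation?"))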

if process_url_clicked:
    # load data from the sidebar URLs (skip any left blank)
    loader = UnstructuredURLLoader(urls=[u for u in urls if u])
    main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
    data = loader.load()
    # split data into ~1000-character chunks
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000,
        chunk_overlap=0
    )
    main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
    docs = text_splitter.split_documents(data)
    # collect the raw chunk texts; they are needed again at query time
    sentences = [doc.page_content for doc in docs]
    # create sentence embeddings with a sentence-transformer model
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    sentence_embeddings = model.encode(sentences)
    main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
    time.sleep(2)

    # save the embeddings together with the chunk texts, so the query path
    # can rebuild the FAISS index and map hits back to their source text
    with open(file_path, "wb") as f:
        pickle.dump({"embeddings": sentence_embeddings, "sentences": sentences}, f)
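
    # An alternative sketch (not used here): persist the FAISS index itself
    # rather than the raw embeddings, assuming the same file-naming scheme:
    #
    #   idx = faiss.IndexFlatL2(sentence_embeddings.shape[1])
    #   idx.add(np.asarray(sentence_embeddings, dtype="float32"))
    #   faiss.write_index(idx, "sentence_embeddings.index")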

query = main_placeholder.text_input("Question: ")
if query:
    if os.path.exists(file_path):
        with open(file_path, "rb") as f:
            saved = pickle.load(f)  # load embeddings + chunk texts; do not overwrite `query`
        sentence_embeddings = np.asarray(saved["embeddings"], dtype="float32")
        sentences = saved["sentences"]
        # rebuild the index: d = size of the vectors, IndexFlatL2 = exact L2 search
        d = sentence_embeddings.shape[1]
        index = faiss.IndexFlatL2(d)
        index.add(sentence_embeddings)  # add all chunk vectors to the index
        # the query must be embedded with the same model used for indexing
        model = SentenceTransformer('bert-base-nli-mean-tokens')
        source_knowledge = querypreprocess(query, model, index, sentences)
        prompt = augmented_prompt(query, source_knowledge)
        messages = [
            SystemMessage(content="You are a helpful assistant."),
            HumanMessage(content=prompt),  # wrap the prompt in a message, not a bare string
        ]
        result = llm.invoke(messages)  # HuggingFaceEndpoint returns a plain string, not a dict
        st.header("Answer")
        st.write(result)
        # Note: a RetrievalQAWithSourcesChain would also return sources;
        # a raw LLM call does not, so none are shown here.
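
# To run (a typical invocation, assuming this file is saved as main.py):
#
#   pip install streamlit langchain langchain-community sentence-transformers faiss-cpu unstructured python-dotenv
#   streamlit run main.py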