#!pip install -qU langchain langchain-community faiss-cpu langchain-openai gradio pandas

import os
from uuid import uuid4

import faiss
import gradio as gr
import pandas as pd
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI, OpenAIEmbeddings

df = pd.read_csv('news_paper-Cleaned.csv', encoding='utf-8', on_bad_lines='skip')
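
# Fail fast if the CSV lacks a field the Document builder below relies on
# (assumed schema: title, author, description, full_text).
required_columns = {"title", "author", "description", "full_text"}
missing = required_columns - set(df.columns)
if missing:
    raise ValueError(f"news_paper-Cleaned.csv is missing columns: {missing}")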

os.environ["OPENAI_API_KEY"] = "sk-..."  # redacted: load from your environment or a secret store, never source control
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Probe the embedding model once to get its output dimension
# (3072 for text-embedding-3-large) and size the exact-L2 index to match.
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
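
# Note: FAISS.from_documents(docs, embeddings) would build an equivalent store
# in one call; the explicit construction above just keeps the index type
# (exact L2 search) visible and swappable.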

# One Document per article; a flat string of the fields embeds more cleanly
# than the repr of a Python dict.
source_docs = [
    Document(
        page_content=(
            f"Title: {row['title']}\n"
            f"Author: {row['author']}\n"
            f"Description: {row['description']}\n"
            f"Full text: {row['full_text']}"
        ),
        metadata={"source": "news"},
    )
    for _, row in df.iterrows()
]

text_splitter = RecursiveCharacterTextSplitter(
    # ~1,000-character chunks with 100 characters of overlap between them.
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)

text_split = text_splitter.split_documents(source_docs)

uuids = [str(uuid4()) for _ in range(len(text_split))]

vector_store.add_documents(documents=text_split, ids=uuids)
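
# Optional: persist the index so the corpus is not re-embedded on every run.
# save_local/load_local are the langchain_community FAISS helpers; the
# "faiss_index" path is just an example.
# vector_store.save_local("faiss_index")
# vector_store = FAISS.load_local(
#     "faiss_index", embeddings, allow_dangerous_deserialization=True
# )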

retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 10})
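
# Optional smoke test: retriever.invoke returns the k MMR-selected Documents
# for a query (replace the placeholder question with one about your corpus).
# for doc in retriever.invoke("example question about the news corpus"):
#     print(doc.metadata, doc.page_content[:120])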

def answer_question(query):
    template = """
    You are a helpful assistant that answers questions about specific data.
    Answer only from the context below.
    You will receive up to 10 retrieved passages; return every relevant answer,
    separated by new lines.

    Question: {question}
    Context: {context}
    Answer:
    """

    PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])

    qa_chain = RetrievalQA.from_chain_type(
        llm=OpenAI(),
        retriever=retriever,
        chain_type_kwargs={"prompt": PROMPT},
    )

    # Calling a chain directly is deprecated; invoke() is the current entry point.
    return qa_chain.invoke({"query": query})["result"]


demo = gr.Interface(fn=answer_question, inputs="text", outputs="text")
demo.launch()
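
# launch() serves the interface on a local URL; demo.launch(share=True) would
# additionally create a temporary public Gradio link if you need to share it.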