# Hugging Face Space source file (file size: 3,625 bytes; revision e9f8bde).
import openai
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.schema import Document
import pinecone
from langchain.vectorstores import FAISS
from pypdf import PdfReader
from langchain.llms.openai import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain import HuggingFaceHub
from langchain.document_loaders import DirectoryLoader
#Extract Information from PDF file
def get_pdf_text(pdf_doc):
    """Return the extracted text of every page of ``pdf_doc`` as one string.

    ``pdf_doc`` is passed unchanged to ``PdfReader``. The per-page results of
    ``extract_text()`` are concatenated in page order with no separator
    between pages.
    """
    reader = PdfReader(pdf_doc)
    return "".join(pg.extract_text() for pg in reader.pages)
# Iterate over the PDF files that the user uploaded, one by one,
# and wrap each one in a Document
def create_docs(user_pdf_list, unique_id):
    """Build one ``Document`` per uploaded PDF.

    Args:
        user_pdf_list: Iterable of uploaded-file objects. Each must be
            readable by ``get_pdf_text`` and expose ``name``, ``id``,
            ``type`` and ``size`` attributes (presumably Streamlit
            ``UploadedFile`` objects -- confirm against the caller).
        unique_id: Value stored under the ``"unique_id"`` metadata key of
            every document.

    Returns:
        A list of ``Document`` objects in the same order as
        ``user_pdf_list``. ``page_content`` holds the full text of the PDF
        (it is not split into chunks here).
    """
    documents = [
        Document(
            page_content=get_pdf_text(uploaded),
            # Data plus its metadata.
            # NOTE(review): the key "type=" (with a trailing "=") looks like
            # a typo for "type"; it is kept as-is because consumers may
            # already read it under this name -- confirm before renaming.
            metadata={
                "name": uploaded.name,
                "id": uploaded.id,
                "type=": uploaded.type,
                "size": uploaded.size,
                "unique_id": unique_id,
            },
        )
        for uploaded in user_pdf_list
    ]
    # Load Files from Directory (Local Version)
    #loader = DirectoryLoader('./Repository', glob='**/*')
    #docs1 = loader.load()
    #final_docs = docs + docs1
    return documents
#Create embeddings instance
def create_embeddings_load_data():
    """Create and return the embeddings instance used by the vector stores.

    Currently this is an ``OpenAIEmbeddings`` built with its default
    arguments.
    """
    # Alternative kept for reference:
    #embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    return OpenAIEmbeddings()
#Function to push data to Vector Store - Pinecone here
def push_to_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, docs):
    """Initialise the Pinecone client and upload ``docs`` to an index.

    Args:
        pinecone_apikey: API key passed to ``pinecone.init``.
        pinecone_environment: Environment name passed to ``pinecone.init``.
        pinecone_index_name: Name of the index the documents are written to.
        embeddings: Embeddings object handed to ``Pinecone.from_documents``.
        docs: Documents to embed and store.

    Returns:
        None. The vector store created by ``Pinecone.from_documents`` is
        discarded.
    """
    client_settings = {
        "api_key": pinecone_apikey,
        "environment": pinecone_environment,
    }
    pinecone.init(**client_settings)
    # Progress marker printed after the client is initialised.
    print("done......2")
    Pinecone.from_documents(docs, embeddings, index_name=pinecone_index_name)
#Function to pull information from Vector Store - Pinecone here
def pull_from_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings):
    """Initialise the Pinecone client and open an existing index.

    Args:
        pinecone_apikey: API key passed to ``pinecone.init``.
        pinecone_environment: Environment name passed to ``pinecone.init``.
        pinecone_index_name: Name of the already-existing index to open.
        embeddings: Embeddings object handed to
            ``Pinecone.from_existing_index``.

    Returns:
        The vector store returned by ``Pinecone.from_existing_index``.
    """
    client_settings = {
        "api_key": pinecone_apikey,
        "environment": pinecone_environment,
    }
    pinecone.init(**client_settings)
    return Pinecone.from_existing_index(pinecone_index_name, embeddings)
#Function to help us get relevant documents from vector store - based on user input
def similar_docs(query, k, pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, unique_id):
    """Return the ``k`` stored documents most similar to ``query``, with scores.

    Args:
        query: Text to search for.
        k: Number of results to request; converted with ``int(k)``, so a
            numeric string is accepted.
        pinecone_apikey: API key forwarded to ``pull_from_pinecone``.
        pinecone_environment: Environment name forwarded to
            ``pull_from_pinecone``.
        pinecone_index_name: Name of the existing Pinecone index to search.
        embeddings: Embeddings object used to open the index.
        unique_id: Currently unused. The metadata filter that consumed it is
            disabled (see the commented-out call below), so results are NOT
            restricted to a single upload session.

    Returns:
        Whatever ``similarity_search_with_score`` returns for the index
        (document/score pairs per the LangChain vector store API).
    """
    # pull_from_pinecone() already calls pinecone.init() with these same
    # credentials, so the client is not initialised a second time here.
    index = pull_from_pinecone(
        pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings
    )
    # Filtered variant, intentionally left disabled:
    #matches = index.similarity_search_with_score(query, int(k),{"unique_id":unique_id})
    matches = index.similarity_search_with_score(query, int(k))
    return matches
def close_matches(query, k, docs, embeddings):
    """Search ``docs`` for the ``k`` closest matches to ``query`` using FAISS.

    A FAISS store is built from ``docs`` and ``embeddings`` on every call and
    then queried once; ``k`` is converted with ``int(k)``.

    Returns:
        The result of ``similarity_search_with_score`` on that store.
    """
    #https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.faiss.FAISS.html#langchain.vectorstores.faiss.FAISS.similarity_search_with_score
    return FAISS.from_documents(docs, embeddings).similarity_search_with_score(
        query, int(k)
    )
# Helps us get the summary of a document
def get_summary(current_doc):
    """Return a summary of ``current_doc`` produced by a summarize chain.

    The chain is created with ``load_summarize_chain`` using
    ``chain_type="map_reduce"`` and an ``OpenAI`` LLM at ``temperature=0``,
    then run on a one-element list containing ``current_doc``.
    """
    # Alternative LLM kept for reference:
    #llm = HuggingFaceHub(repo_id="bigscience/bloom", model_kwargs={"temperature":1e-10})
    summarizer = load_summarize_chain(OpenAI(temperature=0), chain_type="map_reduce")
    return summarizer.run([current_doc])