ishaan-mital's picture
added embedding model and vector DB
4dd0f5b
raw
history blame
3.33 kB
import numpy as np
import gradio as gr
import os
import pinecone
import time
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import PyPDF2
import re
from langchain.vectorstores import Pinecone
import os
# Hugging Face model id of the sentence-transformers checkpoint to load.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Use the currently selected CUDA device when one is available, else the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# The same device string is handed to both the model constructor kwargs and
# the encode kwargs; encoding is requested in batches of 32.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)
# Credentials are read from the process environment: the API key is issued at
# app.pinecone.io and the environment name is shown in the Pinecone console.
# A variable that is not set is passed through as None.
pinecone.init(
    environment=os.getenv('PINECONE_ENVIRONMENT'),
    api_key=os.getenv('PINECONE_API_KEY'),
)
# Two placeholder sentences; their embeddings supply the vector dimension
# used below when the index has to be created.
docs = ["this is one document", "and another document"]
embeddings = embed_model.embed_documents(docs)

index_name = 'llama-rag'

# Create the index on first use, sized to the embedding width.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=len(embeddings[0]), metric='cosine')
    # Poll once per second until the index status reports ready.
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Handle to the index used by the rest of the script.
index = pinecone.Index(index_name)
# The returned stats are not used; the call is kept as in the original flow.
index.describe_index_stats()
# def extract_text_from_pdf(pdf_path):
# pdf_file = open(pdf_path, 'rb')
# pdf_reader = PyPDF2.PdfReader(pdf_file)
# text = ""
# for page_number in range(len(pdf_reader.pages)):
# page = pdf_reader.pages[page_number]
# text += page.extract_text()
# pdf_file.close()
# return text
# def identify_sections(text):
# # NOTE: the pattern below splits after a newline followed by one or more "1" characters, not on "Chapter" headings
# sections = re.split(r'\n1+', text)
# sections = [section.strip() for section in sections if section.strip()]
# return sections
# pdf_files = ['leph101.pdf', 'leph102.pdf','leph103.pdf','leph104.pdf','leph105.pdf','leph106.pdf','leph107.pdf','leph108.pdf'] # Add more file names as needed
# book_sections=[]
# for pdf_file in pdf_files:
# pdf_path = f'/content/{pdf_file}'
# book_text = extract_text_from_pdf(pdf_path)
# book_sections.append(identify_sections(book_text))
# print(len(book_sections))
# # Now you can organize and store the data as needed
# import pandas as pd
# data = pd.DataFrame({
# 'ID': range(len(book_sections)), # Sequential IDs
# 'Text': book_sections
# })
# print(data)
# batch_size = 4
# for i in range(0, len(data), batch_size):
# i_end = min(len(data), i+batch_size)
# batch = data.iloc[i:i_end]
# ids = [f"{x['ID']}" for i, x in batch.iterrows()]
# texts = [x['Text'] for i, x in batch.iterrows()]
# embeds = embed_model.embed_documents(texts)
# # get metadata to store in Pinecone
# metadata = [
# {'text': x['Text'],
# 'ID': x['ID']} for i, x in batch.iterrows()
# ]
# # add to Pinecone
# index.upsert(vectors=zip(ids, embeds,metadata))
# Metadata key that holds the text content of each stored vector.
text_field = 'text'

# LangChain vector store built from the Pinecone index, the query-embedding
# callable of the embedding model, and the metadata text key.
vectorstore = Pinecone(index, embed_model.embed_query, text_field)
def question(query):
    """Search the vector store for the chunks most similar to ``query``.

    Args:
        query: The search query, passed unchanged to the vector store.

    Returns:
        Whatever ``vectorstore.similarity_search`` returns for the top three
        most relevant chunks of text.
    """
    top_k = 3  # number of most relevant chunks to return
    matches = vectorstore.similarity_search(query, k=top_k)
    return matches
# Text-in / text-out Gradio interface that routes the input to `question`.
demo = gr.Interface(
    fn=question,
    inputs="text",
    outputs="text",
)

if __name__ == "__main__":
    # Launch the interface only when this file is run as a script.
    demo.launch()