# Hugging Face Space (status when captured: Sleeping)
import numpy as np | |
import gradio as gr | |
import os | |
import pinecone | |
import time | |
from torch import cuda | |
from langchain.embeddings.huggingface import HuggingFaceEmbeddings | |
import PyPDF2 | |
import re | |
from langchain.vectorstores import Pinecone | |
import os | |
# Sentence-transformers checkpoint used for every embedding in this script.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Use the currently selected CUDA device when one is available, else the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# The same device is passed for loading the model and for encoding;
# encoding is configured with a batch size of 32.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32},
)
# Pinecone credentials are read from environment variables: the API key is
# issued at app.pinecone.io and the environment name is shown in the console.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENVIRONMENT'),
)
# Embed two throwaway sentences; only the length of the resulting vectors is
# used below, to give a newly created index the right dimension.
docs = ["this is one document", "and another document"]
embeddings = embed_model.embed_documents(docs)

index_name = 'llama-rag'

# Create the index only if it is not already listed.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=len(embeddings[0]), metric='cosine')
    # Poll once per second until the new index reports itself as ready.
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

index = pinecone.Index(index_name)
# The returned stats are not stored or printed here.
index.describe_index_stats()
# def extract_text_from_pdf(pdf_path): | |
# pdf_file = open(pdf_path, 'rb') | |
# pdf_reader = PyPDF2.PdfReader(pdf_file) | |
# text = "" | |
# for page_number in range(len(pdf_reader.pages)): | |
# page = pdf_reader.pages[page_number] | |
# text += page.extract_text() | |
# pdf_file.close() | |
# return text | |
# def identify_sections(text): | |
# # Assuming sections start with "Chapter" headings (note: the pattern below actually splits on a newline followed by one or more "1" characters)
# sections = re.split(r'\n1+', text) | |
# sections = [section.strip() for section in sections if section.strip()] | |
# return sections | |
# pdf_files = ['leph101.pdf', 'leph102.pdf','leph103.pdf','leph104.pdf','leph105.pdf','leph106.pdf','leph107.pdf','leph108.pdf'] # Add more file names as needed | |
# book_sections=[] | |
# for pdf_file in pdf_files: | |
# pdf_path = f'/content/{pdf_file}' | |
# book_text = extract_text_from_pdf(pdf_path) | |
# book_sections.append(identify_sections(book_text)) | |
# print(len(book_sections)) | |
# # Now you can organize and store the data as needed | |
# import pandas as pd | |
# data = pd.DataFrame({ | |
# 'ID': range(len(book_sections)), # Sequential IDs | |
# 'Text': book_sections | |
# }) | |
# print(data) | |
# batch_size = 4 | |
# for i in range(0, len(data), batch_size): | |
# i_end = min(len(data), i+batch_size) | |
# batch = data.iloc[i:i_end] | |
# ids = [f"{x['ID']}" for i, x in batch.iterrows()] | |
# texts = [x['Text'] for i, x in batch.iterrows()] | |
# embeds = embed_model.embed_documents(texts) | |
# # get metadata to store in Pinecone | |
# metadata = [ | |
# {'text': x['Text'], | |
# 'ID': x['ID']} for i, x in batch.iterrows() | |
# ] | |
# # add to Pinecone | |
# index.upsert(vectors=zip(ids, embeds,metadata)) | |
# Metadata field that contains the text content of each stored vector.
text_field = 'text'

# Vector store built from the Pinecone index, the query-embedding function
# and the metadata text field.
vectorstore = Pinecone(index, embed_model.embed_query, text_field)
def question(query, k=3):
    """Return the stored text chunks most relevant to ``query``.

    Args:
        query: The search query text.
        k: Number of most relevant chunks to return. Defaults to 3, the
            value that was previously hard-coded, so existing callers that
            pass only ``query`` behave exactly as before.

    Returns:
        The result of ``vectorstore.similarity_search`` for the query,
        passed through unchanged.
    """
    return vectorstore.similarity_search(query, k=k)
# Gradio interface: one text input is passed to `question`, and its result is
# shown in one text output.
demo = gr.Interface(
    fn=question,
    inputs="text",
    outputs="text",
)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()