"""Set up the embedding model and the Pinecone index used for retrieval.

Importing this module loads a sentence-transformers embedding model,
initialises the Pinecone client from environment variables and creates the
``llama-rag`` index when it is not already listed.
"""

import os
import re
import time

import gradio as gr
import numpy as np
import pinecone
import PyPDF2
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from torch import cuda

# --- Embedding model --------------------------------------------------------
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Use the current CUDA device when torch reports one, otherwise the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32},
)

# --- Pinecone connection ----------------------------------------------------
# get API key from app.pinecone.io and environment from console; both values
# are read from the process environment.
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment=os.environ.get('PINECONE_ENVIRONMENT'),
)

# Two throwaway sentences are embedded here; below, only the length of the
# first vector is used, as the dimension of a newly created index.
docs = [
    "this is one document",
    "and another document",
]
embeddings = embed_model.embed_documents(docs)

index_name = 'llama-rag'

if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=len(embeddings[0]),
        metric='cosine',
    )
    # wait for index to finish initialization, polling once per second
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

index = pinecone.Index(index_name)
index.describe_index_stats()  # return value is not used here

# --- Disabled PDF ingestion code, kept for reference ------------------------
# def extract_text_from_pdf(pdf_path):
#     pdf_file = open(pdf_path, 'rb')
#     pdf_reader = PyPDF2.PdfReader(pdf_file)
#     text = ""
#     for page_number in range(len(pdf_reader.pages)):
#         page = pdf_reader.pages[page_number]
#         text += page.extract_text()
#     pdf_file.close()
#     return text

# def identify_sections(text):
#     # Assuming sections start with "Chapter" headings
#     sections = re.split(r'\n1+', text)
#     sections = [section.strip() for section in sections if section.strip()]
#     return sections

# pdf_files = ['leph101.pdf', 'leph102.pdf','leph103.pdf','leph104.pdf','leph105.pdf','leph106.pdf','leph107.pdf','leph108.pdf']  # Add more file names as needed
# book_sections=[]
# for pdf_file in pdf_files:
#     pdf_path = f'/content/{pdf_file}'
#     book_text = extract_text_from_pdf(pdf_path)
#     book_sections.append(identify_sections(book_text))
# --- Disabled Pinecone upsert code, kept for reference ----------------------
# print(len(book_sections))

# # Now you can organize and store the data as needed
# import pandas as pd
# data = pd.DataFrame({
#     'ID': range(len(book_sections)),  # Sequential IDs
#     'Text': book_sections
# })
# print(data)

# batch_size = 4
# for i in range(0, len(data), batch_size):
#     i_end = min(len(data), i+batch_size)
#     batch = data.iloc[i:i_end]
#     ids = [f"{x['ID']}" for i, x in batch.iterrows()]
#     texts = [x['Text'] for i, x in batch.iterrows()]
#     embeds = embed_model.embed_documents(texts)
#     # get metadata to store in Pinecone
#     metadata = [
#         {'text': x['Text'],
#          'ID': x['ID']} for i, x in batch.iterrows()
#     ]
#     # add to Pinecone
#     index.upsert(vectors=zip(ids, embeds,metadata))

text_field = 'text'  # field in metadata that contains text content

# LangChain wrapper around the Pinecone index; queries are embedded with the
# same model that is used for documents.
vectorstore = Pinecone(index, embed_model.embed_query, text_field)


def question(query, k=3):
    """Return the chunks of text most similar to *query*.

    Args:
        query: The search query.
        k: Number of most relevant chunks to return (default 3).

    Returns:
        The result of ``vectorstore.similarity_search`` for the query, or an
        empty list when the query is empty or only whitespace.
    """
    # A blank query carries nothing to match against, so skip the embedding
    # and the round trip to Pinecone.
    if not query or not query.strip():
        return []
    return vectorstore.similarity_search(query, k=k)


def answer(query):
    """Return the text of the top matches for *query* as one string.

    ``question`` returns document objects. Passed straight to a text output
    they are shown as the ``repr`` of a Python list, so this joins the
    ``page_content`` of each hit with a blank line between them.
    """
    return "\n\n".join(hit.page_content for hit in question(query))


demo = gr.Interface(fn=answer, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()