import gradio as gr
import os
import time

import pinecone
# from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# import PyPDF2
# import re
from langchain.vectorstores import Pinecone
# from sentence_transformers import SentenceTransformer  # not used directly; HuggingFaceEmbeddings loads the model

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
# device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    # model_kwargs={'device': device},
    # encode_kwargs={'device': device, 'batch_size': 32}
)

# get the API key from app.pinecone.io and the environment from the Pinecone console
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment=os.environ.get('PINECONE_ENVIRONMENT')
)

# two placeholder documents, embedded here to determine the index dimension
docs = [
    "this is one document",
    "and another document"
]
embeddings = embed_model.embed_documents(docs)

index_name = 'llama-rag'

if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=len(embeddings[0]),
        metric='cosine'
    )
    # wait for the index to finish initialization
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

index = pinecone.Index(index_name)
index.describe_index_stats()

# def extract_text_from_pdf(pdf_path):
#     pdf_file = open(pdf_path, 'rb')
#     pdf_reader = PyPDF2.PdfReader(pdf_file)
#     text = ""
#     for page_number in range(len(pdf_reader.pages)):
#         page = pdf_reader.pages[page_number]
#         text += page.extract_text()
#     pdf_file.close()
#     return text

# def identify_sections(text):
#     # Assuming sections start with "Chapter" headings
#     sections = re.split(r'\n1+', text)
#     sections = [section.strip() for section in sections if section.strip()]
#     return sections

# pdf_files = ['leph101.pdf', 'leph102.pdf', 'leph103.pdf', 'leph104.pdf',
#              'leph105.pdf', 'leph106.pdf', 'leph107.pdf', 'leph108.pdf']  # Add more file names as needed
# book_sections = []
# for pdf_file in pdf_files:
#     pdf_path = f'/content/{pdf_file}'
#     book_text = extract_text_from_pdf(pdf_path)
#     book_sections.append(identify_sections(book_text))
# print(len(book_sections))

# # Now you can organize and store the data as needed
# import pandas as pd
# data = pd.DataFrame({
#     'ID': range(len(book_sections)),  # Sequential IDs
#     'Text': book_sections
# })
# print(data)

# batch_size = 4
# for i in range(0, len(data), batch_size):
#     i_end = min(len(data), i + batch_size)
#     batch = data.iloc[i:i_end]
#     ids = [f"{x['ID']}" for i, x in batch.iterrows()]
#     texts = [x['Text'] for i, x in batch.iterrows()]
#     embeds = embed_model.embed_documents(texts)
#     # get metadata to store in Pinecone
#     metadata = [
#         {'text': x['Text'],
#          'ID': x['ID']} for i, x in batch.iterrows()
#     ]
#     # add to Pinecone
#     index.upsert(vectors=zip(ids, embeds, metadata))

text_field = 'text'  # field in the metadata that contains the text content

vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

def question(query):
    # retrieve the top 3 most relevant chunks and join them into a single string,
    # since the Gradio "text" output component expects a string, not Document objects
    results = vectorstore.similarity_search(
        query,  # the search query
        k=3     # returns the top 3 most relevant chunks of text
    )
    return "\n\n".join(doc.page_content for doc in results)

demo = gr.Interface(fn=question, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()
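# --- Optional sketch (not part of the original app): seeding the index ---
# On a freshly created index the similarity search above returns nothing, because
# no vectors have been upserted yet. The commented-out batch loop shows the full
# PDF pipeline; as a minimal sketch under the same metadata convention (the 'text'
# key must match `text_field` for the vectorstore to recover the content), the two
# placeholder `docs` could be inserted like this:
#
# demo_ids = [str(i) for i in range(len(docs))]          # string IDs, as in the batch loop
# demo_meta = [{'text': d} for d in docs]                # metadata carrying the raw text
# index.upsert(vectors=list(zip(demo_ids, embeddings, demo_meta)))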