"""Embed selected metadata fields of JSON records and upload them to Pinecone.

Usage:
    python <script> BEGIN END PATH INDEX_NAME

BEGIN and END are integer positions into the flattened list of records
(the concatenation of every page's ``data`` list). If BEGIN is greater
than END, every record from BEGIN to the end of the list is processed.
PATH is the JSON file to read and INDEX_NAME is the Pinecone index that
receives the vectors. The Pinecone API key is read from the
``PINECONE_API_KEY`` environment variable (a ``.env`` file is loaded first).
"""

import json
import os
import sys
import time
from uuid import uuid4

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

if len(sys.argv) < 5:
    sys.exit(f"usage: {sys.argv[0]} BEGIN END PATH INDEX_NAME")

BEGIN = int(sys.argv[1])
END = int(sys.argv[2])
PATH = sys.argv[3]
INDEX_NAME = sys.argv[4]

# Entries longer than this many characters are split into overlapping chunks.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100
# A progress line is printed every PROGRESS_EVERY items.
PROGRESS_EVERY = 1000

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# NOTE(review): these two settings are defined but are not passed to
# HuggingFaceEmbeddings below, so the library defaults are in effect.
# Passing model_kwargs would force CUDA -- confirm that is wanted before
# wiring them in.
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': False}

# Attribute names that are always embedded; any attribute whose name
# contains "note" is embedded as well (see the loop below).
fields = [
    'abstract_tsi',
    'title_info_primary_tsi',
    'title_info_primary_subtitle_tsi',
    'title_info_alternative_tsim',
]

# Pinecone setup
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(INDEX_NAME)

print("Loading JSON...")
# Use a context manager so the file handle is closed once parsing is done.
with open(PATH, encoding="utf-8") as json_file:
    meta = json.load(json_file)

print("Initializing Pinecone index...")
embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)

print("Beginning Embeddings...")
start = time.time()

# Flatten the per-page record lists into one list so that BEGIN/END index
# into the whole data set rather than into a single page.
full_data = []
for page in meta:
    full_data.extend(page['data'])

# BEGIN > END is treated as "from BEGIN to the end of the data".
if BEGIN > END:
    selected = full_data[BEGIN:]
else:
    selected = full_data[BEGIN:END]

for num, item in enumerate(selected):
    item_id = item["id"]
    item_data = item["attributes"]
    print(item_id, time.time())

    documents = []
    for field, value in item_data.items():
        if field not in fields and "note" not in field:
            continue
        entry = str(value)
        # Only long entries go through the splitter; short ones are stored
        # as a single document.
        if len(entry) > CHUNK_SIZE:
            pieces = text_splitter.split_text(entry)
        else:
            pieces = [entry]
        documents.extend(
            Document(page_content=piece, metadata={"source": item_id, "field": field})
            for piece in pieces
        )

    if num % PROGRESS_EVERY == 0:
        print(num, f"Added vectors to vectorstore at {time.time()} on id {item_id}")
    print(documents)

    # Skip the upload when the item produced no documents.
    if documents:
        uuids = [str(uuid4()) for _ in documents]
        vector_store.add_documents(documents=documents, ids=uuids)

end = time.time()
print(f"Embedded all documents in {end-start} seconds...")