"""Clone a Git repository into a local staging area and index it in Qdrant.

The public entry point is :func:`repository_loader`, which clones the
repository, splits its files into chunks, embeds them and stores them in a
Qdrant collection named after the repository, then removes the local clone.
"""

import os
import shutil
from urllib.parse import urlparse

import git
import qdrant_client
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import GitLoader
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Qdrant

# Clones are placed under "<current working directory>/staging/<repo name>".
local_dir = os.getcwd()

# Head reference of the most recently cloned repository; set by clone_repo()
# and read by load_repo().
branch = None


def get_repo_name(url):
    """Return the repository name encoded in a Git URL.

    The name is the last path component of the URL with a trailing ".git"
    removed when present. Trailing slashes are ignored.

    Args:
        url: Repository URL, e.g. "https://github.com/user/project.git".

    Returns:
        The repository name, e.g. "project".
    """
    parsed_url = urlparse(url)
    # Drop trailing slashes first; otherwise basename() of ".../project/"
    # would be the empty string.
    repo_name = os.path.basename(parsed_url.path.rstrip("/"))
    # Only strip the suffix when it is really there: an unconditional
    # [:-4] truncates names of URLs that do not end in ".git".
    if repo_name.endswith(".git"):
        repo_name = repo_name[:-4]
    return repo_name


def _staging_path(url):
    """Return the local directory in which the repository at *url* is staged."""
    return os.path.join(local_dir, "staging", get_repo_name(url))


def clone_repo(url):
    """Clone the repository at *url* into the staging directory.

    On success the module-level ``branch`` is updated to the head reference
    of the fresh clone.

    Args:
        url: Repository URL to clone.

    Returns:
        True if the repository was cloned; False if the staging directory
        already exists or if cloning failed (the error is printed).
    """
    global branch
    try:
        repo_name = get_repo_name(url)
        path = _staging_path(url)
        # An existing staging directory is treated as "already handled".
        if os.path.exists(path):
            print(f"{repo_name} already added in db")
            return False
        repo = git.Repo.clone_from(url, path)
        branch = repo.head.reference
        print(f"{repo_name} cloned succesfully")
        return True
    except Exception as e:
        print(f"Error cloning the git repository: {e}")
        return False


def delete_cloned_repo(url):
    """Remove the staged clone of the repository at *url*, if it exists.

    Removal is best effort: failures are printed, never raised.

    Args:
        url: Repository URL whose staged clone should be deleted.
    """
    local_path = _staging_path(url)
    try:
        if not os.path.exists(local_path):
            print(f"Repository at {local_path} does not exist.")
            return
        # ignore_errors=True removes as much as possible, but also hides
        # failures, so verify the outcome before reporting success.
        shutil.rmtree(local_path, ignore_errors=True)
        if os.path.exists(local_path):
            print(f"Error deleting repository: {local_path} could not be fully removed")
        else:
            print(f"Repository at {local_path} successfully deleted.")
    except Exception as e:
        print(f"Error deleting repository: {e}")


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)

# SECURITY: a hard-coded Together API key used to be committed in the
# commented-out lines below. The literal has been removed; the exposed key
# should be rotated, and any key must be read from the environment.
# from langchain_together.embeddings import TogetherEmbeddings
# embeddings2 = TogetherEmbeddings(
#     model="togethercomputer/m2-bert-80M-8k-retrieval",
#     together_api_key=os.getenv("TOGETHER_API_KEY"),
# )

client = qdrant_client.QdrantClient(
    os.getenv("QDRANT_HOST"),
    api_key=os.getenv("QDRANT_API_KEY"),
)

embeddings = FastEmbedEmbeddings(model_name="BAAI/bge-small-en-v1.5")

vectorstore = None


def load_repo(url):
    """Index the staged clone of *url* into a Qdrant collection.

    The collection is named after the repository and is recreated on every
    call. Files ending in "package-lock.json" are skipped.

    Args:
        url: Repository URL; its clone must already be in the staging area.

    Returns:
        True if the documents were loaded, split and added to the
        collection; False if that step failed (the error is printed).
        Errors raised while recreating the collection are not caught here.
    """
    repo_name = get_repo_name(url)

    collection_config = qdrant_client.http.models.VectorParams(
        # Must match the output size of the embedding model in use
        # (768 for instructor-xl, 1536 for OpenAI).
        size=384,
        distance=qdrant_client.http.models.Distance.COSINE,
    )
    client.recreate_collection(
        collection_name=repo_name,
        vectors_config=collection_config,
    )

    # NOTE(review): this binds a *local* name; the module-level
    # `vectorstore` stays None. Confirm whether `global` was intended.
    vectorstore = Qdrant(
        client=client,
        collection_name=repo_name,
        embeddings=embeddings,
    )
    print("collection created")

    try:
        loader = GitLoader(
            repo_path=_staging_path(url),
            branch=branch,
            file_filter=lambda file_path: not file_path.endswith("package-lock.json"),
        )
        data = loader.load()
        chunks = text_splitter.split_documents(data)
        print("chunks created")
        vectorstore.add_documents(chunks)
        return True
    except Exception as e:
        print(f"Error loading and indexing repository: {e}")
        return False


def repository_loader(url):
    """Clone, index and clean up the repository at *url*.

    Args:
        url: Repository URL to ingest.

    Returns:
        True if the repository was cloned and indexed, False otherwise.
    """
    if not clone_repo(url):
        return False
    try:
        return load_repo(url)
    finally:
        # Always remove the clone we created. A leftover staging directory
        # would make every later attempt stop at "already added in db".
        delete_cloned_repo(url)


print('HELLO FROM CONTAINER')
# answer_query("How is the routing done in this project and what are the routes used",'https://github.com/s0ham075/Google-Docs-Frontend.git')
# delete_cloned_repo()