"""Gradio app that builds a LangChain chatbot Space from a GitHub repo's markdown.

Two actions are exposed in the UI:

1. "Create Search index" clones the repo, embeds its ``.md``/``.mdx`` files
   with OpenAI embeddings into a FAISS index and pickles it to disk.
2. "Create YOur Chatbot" creates a Hugging Face Space and uploads the app
   template, the pickled index and ``requirements.txt`` to it.
"""
import os
import pathlib
import pickle
import subprocess
import tempfile
from typing import Iterator, List

import gradio as gr
import requests
from huggingface_hub import HfApi, list_models, upload_folder, whoami
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.docstore.document import Document
# using a vector space for our search
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

# Files exchanged between the "create index" and "create chatbot" steps.
SEARCH_INDEX_PATH = "search_index.pickle"
TEMPLATE_APP_SOURCE = "template/app_og.py"
GENERATED_APP_PATH = "template/app.py"
REQUIREMENTS_PATH = "template/requirements.txt"


def _repo_link_parts(repo_link: str) -> List[str]:
    """Split a repo link into its non-empty path segments.

    Accepts ``owner/repo`` as well as a full URL; surrounding whitespace,
    trailing slashes and a trailing ``.git`` on the last segment are ignored.

    Raises:
        ValueError: if the link contains no usable segment.
    """
    parts = [part for part in (repo_link or "").strip().split("/") if part]
    if parts and parts[-1].endswith(".git"):
        parts[-1] = parts[-1][: -len(".git")]
    if not parts or not parts[-1]:
        raise ValueError(f"Could not find a repository name in {repo_link!r}")
    return parts


def _repo_name(repo_link: str) -> str:
    """Return the repository name (last segment) of ``repo_link``."""
    return _repo_link_parts(repo_link)[-1]


def _space_basename(repo_name: str) -> str:
    """Return the Space name (without namespace) used for ``repo_name``."""
    return f"LangChain_{repo_name}Bot"  # example - LangChain_gradioBot


# Code for extracting the markdown files from a repo.
def get_github_docs(repo_link: str) -> Iterator[Document]:
    """Yield one ``Document`` per markdown file of a GitHub repository.

    The repo is cloned into a temporary directory that is removed once the
    generator is exhausted. Each document's ``source`` metadata is the GitHub
    blob URL of the file, pinned to the cloned commit SHA.

    Args:
        repo_link: ``owner/repo`` or a GitHub URL ending in ``owner/repo``.

    Raises:
        ValueError: if owner and repo name cannot both be extracted.
        subprocess.CalledProcessError: if a ``git`` command fails.
    """
    parts = _repo_link_parts(repo_link)
    if len(parts) < 2:
        raise ValueError(
            f"Expected '<owner>/<repo>' or a GitHub URL, got {repo_link!r}"
        )
    repo_owner, repo_name = parts[-2], parts[-1]

    with tempfile.TemporaryDirectory() as d:
        # Argument lists (no shell) so user input from the textbox can never
        # be interpreted as shell syntax.
        subprocess.check_call(
            [
                "git",
                "clone",
                f"https://github.com/{repo_owner}/{repo_name}.git",
                ".",
            ],
            cwd=d,
        )
        git_sha = (
            subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=d)
            .decode("utf-8")
            .strip()
        )
        repo_path = pathlib.Path(d)
        markdown_files = list(repo_path.rglob("*.md")) + list(
            repo_path.rglob("*.mdx")
        )
        for markdown_file in markdown_files:
            try:
                with open(markdown_file, "r", encoding="utf-8") as f:
                    content = f.read()
            except (OSError, UnicodeDecodeError):
                # Best effort: skip unreadable entries (broken symlinks,
                # directories named *.md, non-UTF-8 files) and keep going.
                print(f"Could not open file: {markdown_file}")
                continue
            relative_path = markdown_file.relative_to(repo_path).as_posix()
            github_url = (
                f"https://github.com/{repo_owner}/{repo_name}"
                f"/blob/{git_sha}/{relative_path}"
            )
            yield Document(page_content=content, metadata={"source": github_url})


# Code for creating a new space for the user.
def create_space(repo_link: str, hf_token: str):
    """Create a public Gradio Space named ``LangChain_<repo>Bot``.

    Args:
        repo_link: repo link whose last segment names the Space.
        hf_token: Hugging Face token used to create the Space.

    Returns:
        Whatever ``HfApi.create_repo`` returns for the new Space.
    """
    print("***********INSIDE CREATE SPACE***************")
    repo_name = _repo_name(repo_link)
    api = HfApi(token=hf_token)
    repo_url = api.create_repo(
        repo_id=_space_basename(repo_name),
        repo_type="space",
        space_sdk="gradio",
        private=False,
    )
    return repo_url


# Code for creating the search index and saving it to disk.
def create_search_index(repo_link: str, openai_api_key: str) -> str:
    """Build a FAISS index over the repo's markdown and pickle it to disk.

    Args:
        repo_link: ``owner/repo`` or GitHub URL of the repo to index.
        openai_api_key: key passed to ``OpenAIEmbeddings``.

    Returns:
        Path of the pickle file that was written.

    Raises:
        ValueError: if the repo yields no markdown text to index.
    """
    print("***********INSIDE CREATE SEARCH INDEX***************")
    splitter = CharacterTextSplitter(separator=" ", chunk_size=1024, chunk_overlap=0)
    source_chunks = [
        Document(page_content=chunk, metadata=source.metadata)
        for source in get_github_docs(repo_link)
        for chunk in splitter.split_text(source.page_content)
    ]
    if not source_chunks:
        # Fail with a clear message instead of handing an empty list to FAISS.
        raise ValueError(f"No markdown content found in {repo_link!r}")

    search_index = FAISS.from_documents(
        source_chunks, OpenAIEmbeddings(openai_api_key=openai_api_key)
    )

    # Saving FAISS search index to disk.
    with open(SEARCH_INDEX_PATH, "wb") as f:
        pickle.dump(search_index, f)
    return SEARCH_INDEX_PATH


def upload_files_to_space(repo_link: str, hf_token: str) -> str:
    """Upload the app, search index and requirements to the user's Space.

    ``template/app_og.py`` is copied to ``template/app.py`` with ``$RepoName``
    replaced by the repo name; that file, ``search_index.pickle`` and
    ``template/requirements.txt`` are then uploaded. The generated app file is
    always removed afterwards; the index is removed only after all uploads
    succeeded so a failed run can be retried.

    Args:
        repo_link: repo link whose last segment names the Space.
        hf_token: Hugging Face token used for ``whoami`` and the uploads.

    Returns:
        An HTML snippet linking to the created Space.

    Raises:
        FileNotFoundError: if the search index has not been created yet.
    """
    print("***********INSIDE UPLOAD FILES TO SPACE***************")
    repo_name = _repo_name(repo_link)
    if not os.path.isfile(SEARCH_INDEX_PATH):
        raise FileNotFoundError(
            f"{SEARCH_INDEX_PATH} not found - create the search index first"
        )

    api = HfApi(token=hf_token)
    user_name = whoami(token=hf_token)["name"]
    space_name = f"{user_name}/{_space_basename(repo_name)}"

    # Replacing the repo name in app.py.
    with open(TEMPLATE_APP_SOURCE, "r", encoding="utf-8") as f:
        app = f.read()
    app = app.replace("$RepoName", repo_name)

    # Saving the new app.py file to disk.
    with open(GENERATED_APP_PATH, "w", encoding="utf-8") as f:
        f.write(app)

    uploads = (
        (GENERATED_APP_PATH, "app.py"),
        (SEARCH_INDEX_PATH, "search_index.pickle"),
        (REQUIREMENTS_PATH, "requirements.txt"),
    )
    try:
        for local_path, path_in_repo in uploads:
            api.upload_file(
                path_or_fileobj=local_path,
                path_in_repo=path_in_repo,
                repo_id=space_name,
                token=hf_token,
                repo_type="space",
            )
    finally:
        # The generated app is a throwaway artifact; never leave it behind.
        os.remove(GENERATED_APP_PATH)
    os.remove(SEARCH_INDEX_PATH)

    repo_url = f"https://huggingface.co/spaces/{space_name}"
    # The result is rendered by a gr.HTML component, so return a real link.
    return (
        "Successfully created the Chatbot at: "
        f'<a href="{repo_url}" target="_blank">{space_name}</a>'
    )


def driver(repo_link: str, hf_token: str) -> str:
    """Create the Space and upload the chatbot files to it.

    The search index is not built here; ``upload_files_to_space`` expects
    ``search_index.pickle`` to already exist on disk.

    Args:
        repo_link: repo link whose last segment names the Space.
        hf_token: Hugging Face token used for all Hub calls.

    Returns:
        The HTML snippet produced by ``upload_files_to_space``.
    """
    print("***********INSIDE DRIVER***************")
    # create a new space
    create_space(repo_link, hf_token)
    # upload files to the new space
    html_tag = upload_files_to_space(repo_link, hf_token)
    print(f"html tag is : {html_tag}")
    return html_tag


# Gradio code for Repo as input and search index as output file
with gr.Blocks() as demo:
    with gr.Row():
        repo_link = gr.Textbox(label="Enter Github repo name")
        hf_token_in = gr.Textbox(type='password', label="Enter hf-token name")
        openai_api_key = gr.Textbox(type='password', label="Enter your OpenAI API key here")
    with gr.Row():
        btn_faiss = gr.Button("Create Search index")
        btn_create_space = gr.Button("Create YOur Chatbot")
    html_out = gr.HTML()
    search_index_file = gr.File()
    btn_faiss.click(create_search_index, [repo_link, openai_api_key], search_index_file)
    btn_create_space.click(driver, [repo_link, hf_token_in], html_out)

demo.queue()
demo.launch(debug=True)