from huggingface_hub import login, InferenceClient
import os
import gc
import time
import random
import datetime
import json
import re

HF_TOKEN = os.getenv('HF_TOKEN')
SERP_API_KEY = os.getenv('SERP_KEY')
# Only log in when a token is configured: login(token=None) falls back to an
# interactive prompt, which cannot be answered in a headless deployment.
if HF_TOKEN:
    login(token=HF_TOKEN)

import gradio as gr
from transformers import CodeAgent, Tool, ToolCollection, load_tool, ReactCodeAgent, ReactJsonAgent
from transformers.agents import PythonInterpreterTool
from langchain.memory import ConversationBufferMemory
import bs4
import requests
from llm_engine import HfEngine
import datasets
import spaces
import tqdm
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import VectorStore
from transformers.agents.prompts import DEFAULT_REACT_CODE_SYSTEM_PROMPT, DEFAULT_REACT_JSON_SYSTEM_PROMPT
from transformers.agents.default_tools import Tool, PythonInterpreterTool
from duckduckgo_search import DDGS
from web_surfer import (
    SearchInformationTool,
    NavigationalSearchTool,
    VisitTool,
    DownloadTool,
    PageUpTool,
    PageDownTool,
    FinderTool,
    FindNextTool,
    ArchiveSearchTool,
)
from mdconvert import MarkdownConverter
from visual_qa import VisualQATool, VisualQAGPT4Tool

# NOTE(review): these are ordinary module-level Python names, not environment
# variables, so they presumably have no effect on huggingface_hub. If the
# intent is to configure the library, they would need to be exported through
# os.environ before the imports above -- confirm the intent before changing.
HF_HUB_DISABLE_TELEMETRY = 1
DO_NOT_TRACK = 1
HF_HUB_ENABLE_HF_TRANSFER = 0


def search_ducky(query):
    """Run a DuckDuckGo text search and return the result bodies as one string.

    Args:
        query: The search query passed to ``DDGS.text``.

    Returns:
        The ``'body'`` field of up to 10 results, concatenated in result order
        with no separator; an empty string when there are no results.
    """
    with DDGS() as ddgs:
        # Materialize inside the context manager so the session is still open
        # while the results are being read.
        results = list(ddgs.text(query, max_results=10))
    return ''.join(result['body'] for result in results)


# --- Knowledge base: load the docs dataset and index it in FAISS -------------
knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")

# One Document per dataset row. The "source" metadata is the second
# "/"-separated segment of the row's source path (presumably the repository
# or project name -- verify against the dataset).
source_docs = [
    Document(
        page_content=doc["text"],
        metadata={"source": doc["source"].split("/")[1]},
    )
    for doc in knowledge_base
]

# Split with chunk_size=500 and keep only the first 1000 chunks, which bounds
# the amount of text that has to be embedded at startup.
docs_processed = RecursiveCharacterTextSplitter(chunk_size=500).split_documents(source_docs)[:1000]

embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
vectordb = FAISS.from_documents(documents=docs_processed, embedding=embedding_model)
# Distinct "source" values present in the indexed chunks; they are advertised
# to the agent through the retriever tool's "source" argument description.
all_sources = list({doc.metadata["source"] for doc in docs_processed})
print(all_sources)


class RetrieverTool(Tool):
    """Agent tool that runs a similarity search against a vector store.

    The tool returns the text of the three closest documents and can
    optionally restrict the search to one or more documentation sources.
    """

    name = "retriever"
    description = "Retrieves some documents from the knowledge base that have the closest embeddings to the input query."
    inputs = {
        "query": {
            "type": "text",
            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
        },
        "source": {
            "type": "text",
            "description": "",
        },
    }
    output_type = "text"

    def __init__(self, vectordb: VectorStore, all_sources: list, **kwargs):
        """Bind the tool to a vector store and advertise the valid sources.

        Args:
            vectordb: Vector store queried by ``forward``.
            all_sources: Source names listed in the "source" argument
                description shown to the agent.
            **kwargs: Forwarded unchanged to ``Tool.__init__``.
        """
        super().__init__(**kwargs)
        self.vectordb = vectordb
        # Build a per-instance copy of the schema instead of writing into the
        # class-level dict, which is shared by every RetrieverTool instance.
        self.inputs = {
            **self.inputs,
            "source": {
                **self.inputs["source"],
                "description": (
                    f"The source of the documents to search, as a str representation of a list. Possible values in the list are: {all_sources}. If this argument is not provided, all sources will be searched."
                ),
            },
        }

    def forward(self, query: str, source: str = None) -> str:
        """Return the three documents closest to ``query``.

        Args:
            query: Search text; must be a ``str``.
            source: Optional filter. Either a single source name or the
                string representation of a list of source names
                (single-quoted, Python-style lists are accepted).

        Returns:
            The retrieved page contents joined under a header, or a hint to
            drop the source filter when nothing matched.

        Raises:
            AssertionError: If ``query`` is not a string.
            json.JSONDecodeError: If ``source`` looks like a list but cannot
                be parsed.
        """
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        if not isinstance(query, str):
            raise AssertionError("Your search query must be a string")

        if source:
            if isinstance(source, str) and "[" not in source:
                # A bare source name: wrap it directly. Round-tripping it
                # through repr()/JSON would corrupt names containing quotes.
                source = [source]
            else:
                # A list literal (or a non-string value): normalise Python
                # single quotes to JSON double quotes, then parse.
                source = json.loads(str(source).replace("'", '"'))

        docs = self.vectordb.similarity_search(
            query,
            filter=({"source": source} if source else None),
            k=3,
        )
        if not docs:
            return "No documents found with this filtering. Try removing the source filter."
        return "Retrieved documents:\n\n" + "\n===Document===\n".join(
            [doc.page_content for doc in docs]
        )


memory = ConversationBufferMemory(memory_key="chat_history")
llm_engine = HfEngine(model="Jopmt/JoPmt")

# Disabled experiments with image/video generation tools, kept for reference.
##gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool()
##prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool)
##tools = [StableDiffusionTool().langchain, ImageCaptioningTool().langchain, StableDiffusionPromptGeneratorTool().langchain, TextToVideoTool().langchain]
##tools=[prompt_generator_tool(), image_generation_tool(), PythonInterpreterTool()]


class SearchTool(Tool):
    """Agent tool that delegates a question to the web-surfing sub-agent."""

    name = "ask_search_agent"
    description = "A search agent that will browse the internet to answer a question. Use it to gather informations, not for problem-solving."
    inputs = {
        "question": {
            "description": "Your question, as a natural language sentence. You are talking to an agent, so provide them with as much context as possible.",
            "type": "text",
        }
    }
    output_type = "text"

    def forward(self, question: str) -> str:
        """Run ``question`` through the module-level ``websurfer_agent``.

        ``websurfer_agent`` is defined further down in this module; it is
        resolved when the tool is called, not when the class is created.
        """
        return websurfer_agent.run(question)


tools = [PythonInterpreterTool(), SearchTool(), RetrieverTool(vectordb, all_sources)]

# NOTE(review): 'os' and 'requests' let agent-generated code read the process
# environment (including tokens) and make arbitrary HTTP calls -- confirm this
# is acceptable for the deployment.
additional_authorized_imports = ['requests', 'bs4', 'os', 'time', 'datetime', 'json', 're']

WEB_TOOLS = [
    SearchInformationTool(),
    NavigationalSearchTool(),
    VisitTool(),
    DownloadTool(),
    PageUpTool(),
    PageDownTool(),
    FinderTool(),
    FindNextTool(),
    ArchiveSearchTool(),
]

# Sub-agent used by SearchTool to browse the web.
websurfer_agent = ReactJsonAgent(
    tools=WEB_TOOLS,
    llm_engine=llm_engine,
    add_base_tools=True,
    max_iterations=1,
)

# Top-level agent driven by the UI.
reagent = ReactCodeAgent(
    tools=tools,
    llm_engine=llm_engine,
    add_base_tools=True,
    max_iterations=1,
    additional_authorized_imports=additional_authorized_imports,
)


def plix(inut, progress=gr.Progress(track_tqdm=True)):
    """Gradio click handler: run the prompt through ``reagent``.

    Args:
        inut: Prompt text from the UI textbox.
        progress: Gradio progress tracker; not used directly in the body.

    Returns:
        Whatever ``reagent.run`` returns for the prompt.
    """
    return reagent.run(inut)


# Theme factories rather than theme instances: only the randomly selected
# theme is built, so the hub-hosted themes are not fetched unless chosen.
_THEME_FACTORIES = [
    gr.themes.Monochrome,
    lambda: gr.themes.Base.from_hub("gradio/seafoam"),
    lambda: gr.themes.Base.from_hub("freddyaboulton/dracula_revamped"),
    gr.themes.Glass,
    gr.themes.Base,
]

with gr.Blocks(theme=random.choice(_THEME_FACTORIES)(), analytics_enabled=False) as iface:
    # NOTE(review): plix presumably returns plain text while the output
    # component is a MultimodalTextbox -- confirm the installed Gradio
    # version accepts that value.
    out = gr.MultimodalTextbox(label="🤗Output", lines=5, interactive=False)
    inut = gr.Textbox(label="Prompt")
    btn = gr.Button("GENERATE")
    btn.click(fn=plix, inputs=inut, outputs=out)

iface.queue(max_size=1, api_open=False)
iface.launch(max_threads=20, inline=False, show_api=False)