import os from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info from langchain_community.llms import HuggingFaceEndpoint from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings import gradio as gr import subprocess # Ensure Playwright installs required browsers and dependencies subprocess.run(["playwright", "install"]) #subprocess.run(["playwright", "install-deps"]) # Load environment variables load_dotenv() HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') # Initialize the model instances #repo_id = "mistralai/Mistral-7B-Instruct-v0.2" repo_id = "Qwen/Qwen2.5-72B-Instruct" llm_model_instance = HuggingFaceEndpoint( repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN ) embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" ) graph_config = { "llm": { "model_instance": llm_model_instance, "model_tokens": 100000, }, "embeddings": {"model_instance": embedder_model_instance} } def scrape_and_summarize(prompt, source): smart_scraper_graph = SmartScraperGraph( prompt=prompt, source=source, config=graph_config ) result = smart_scraper_graph.run() exec_info = smart_scraper_graph.get_execution_info() return result, prettify_exec_info(exec_info) # Gradio interface with gr.Blocks() as demo: gr.Markdown("# Scrape websites, no-code version") gr.Markdown("""Easily scrape and summarize web content using advanced AI models on the Hugging Face Hub without writing any code. Input your desired prompt and source URL to get started. This is a no-code version of the excellent lib [ScrapeGraphAI](https://github.com/VinciGit00/Scrapegraph-ai). It's a basic demo and a work in progress. Please contribute to it to make it more useful!""") with gr.Row(): with gr.Column(): model_dropdown = gr.Textbox(label="Model", value="Qwen/Qwen2.5-72B-Instruct") prompt_input = gr.Textbox(label="Prompt", value="List all the press releases with their headlines and urls.") source_input = gr.Textbox(label="Source URL", value="https://www.whitehouse.gov/") scrape_button = gr.Button("Scrape and Summarize") with gr.Column(): result_output = gr.Textbox(label="Result") exec_info_output = gr.Textbox(label="Execution Info") scrape_button.click( scrape_and_summarize, inputs=[prompt_input, source_input], outputs=[result_output, exec_info_output] ) # Launch the Gradio app if __name__ == "__main__": demo.launch()