Spaces:

mayank1101
/

gradio-web-search-application

Running

App Files Files Community

mayank1101 committed on Dec 6, 2024

Commit

28d8100

verified ·

1 Parent(s): 3180e31

Upload 7 files

Browse files

Files changed (7) hide show

gradio_app.py +180 -0
main.py +82 -0
model_registry.py +26 -0
requirements.txt +18 -0
utils.py +49 -0
websearch.py +99 -0
websites.yaml +135 -0

gradio_app.py ADDED Viewed

	@@ -0,0 +1,180 @@

+# import gradio as gr
+# import httpx
+# import json
+# from typing import Tuple, Any
+# # Define the FastAPI endpoint URL
+# FASTAPI_ENDPOINT = "http://localhost:8000/websearch"
+# def query_api(query: str) -> Tuple[Any, Any]:
+#     try:
+#         # Send POST request to FastAPI endpoint with streaming enabled
+#         with httpx.Client() as client:
+#             with client.stream("POST", FASTAPI_ENDPOINT, json={"query": query}, timeout=60.0) as response:
+#                 response.raise_for_status()  # Raise an exception for 4xx or 5xx status codes
+#                 # Process the streaming response
+#                 response_data = ""
+#                 for chunk in response.iter_text():
+#                     response_data += chunk
+#                 # Parse the accumulated response data as JSON
+#                 response_json = json.loads(response_data)
+#                 # Extract content and citations from the response JSON
+#                 content = response_json.get("content", "")
+#                 citations = response_json.get("citations", [])
+#                 # Beautify content using Markdown formatting
+#                 beautified_content = f"# Search Results\n\n{content}"
+#                 # Beautify citations by adding Markdown links
+#                 beautified_citations = "# Citations\n\n"
+#                 for i, citation in enumerate(citations, start=1):
+#                     beautified_citations += f"{i}. [{citation}]({citation})\n"
+#                 # Yield the beautified content and citations
+#                 yield beautified_content, beautified_citations
+#     except httpx.TimeoutException:
+#         yield "Request timed out. Please try again later.", ""
+#     except httpx.HTTPStatusError as e:
+#         yield f"HTTP error occurred: {e}", ""
+#     except Exception as e:
+#         yield f"An error occurred: {e}", ""
+# # Create Gradio interface
+# with gr.Blocks(css=".gradio-container { background-color: #f5f5f5; padding: 20px; border-radius: 10px; }") as demo:
+#     gr.Markdown("# Web Search Application")
+#     with gr.Row():
+#         with gr.Column():
+#             query = gr.Textbox(
+#                 label="Enter your query",
+#                 placeholder="Type your search query here...",
+#                 lines=2,
+#                 max_lines=4,
+#                 value="",
+#                 elem_id="query-input"
+#             )
+#             submit_button = gr.Button("Search")
+#         with gr.Column():
+#             output_content = gr.Textbox(
+#                 label="Response Content",
+#                 placeholder="Search results will appear here...",
+#                 lines=10,
+#                 max_lines=20,
+#                 value="",
+#                 elem_id="response-content"
+#             )
+#             output_citations = gr.Textbox(
+#                 label="Citations",
+#                 placeholder="Citations will appear here...",
+#                 lines=5,
+#                 max_lines=10,
+#                 value="",
+#                 elem_id="response-citations"
+#             )
+#     # Set up event listener
+#     submit_button.click(query_api, inputs=query, outputs=[output_content, output_citations])
+#     gr.Markdown("Powered by FastAPI and Gradio")
+# # Launch the Gradio application
+# demo.launch()
+import gradio as gr
+import httpx
+import json
+# Define the FastAPI endpoint URL
+FASTAPI_ENDPOINT = "http://localhost:8000/websearch"
+def query_api(query: str) -> tuple:
+    """Query the FastAPI web-search endpoint and yield Markdown for the UI.
+    Despite the ``tuple`` annotation this is a generator function: it yields
+    exactly one ``(content_markdown, citations_markdown)`` pair.
+    Args:
+        query (str): Search text entered by the user.
+    Yields:
+        tuple: ``(content_markdown, citations_markdown)``. On any failure
+        (timeout, HTTP error, backend-reported error, malformed JSON) the
+        first element is an error message and the second is ``""``.
+    """
+    try:
+        # Send POST request to FastAPI endpoint with streaming enabled
+        with httpx.Client() as client:
+            with client.stream("POST", FASTAPI_ENDPOINT, json={"query": query}, timeout=60.0) as response:
+                response.raise_for_status()  # Raise an exception for 4xx or 5xx status codes
+                # Collect the streamed body in one pass instead of repeated `+=`
+                response_data = "".join(response.iter_text())
+        # The connection is closed at this point; nothing below needs it,
+        # so it is not held open while the UI consumes the yielded value.
+        response_json = json.loads(response_data)
+        # The backend reports its own failures as a JSON object with an
+        # "error" key; surface that instead of an empty result page.
+        if isinstance(response_json, dict) and "error" in response_json:
+            yield f"# Error\n\nAn error occurred: {response_json['error']}", ""
+            return
+        # Extract content and citations from the response JSON
+        content = response_json.get("content", "")
+        citations = response_json.get("citations") or []  # tolerate an explicit null
+        # Beautify content using Markdown formatting
+        beautified_content = f"# Search Results\n\n{content}"
+        # Beautify citations as a numbered list of Markdown links
+        citation_lines = [
+            f"{i}. [{citation}]({citation})\n"
+            for i, citation in enumerate(citations, start=1)
+        ]
+        beautified_citations = "# Citations/Sources\n\n" + "".join(citation_lines)
+        # Yield the beautified content and citations
+        yield beautified_content, beautified_citations
+    except httpx.TimeoutException:
+        yield "# Request Timeout\n\nRequest timed out. Please try again later.", ""
+    except httpx.HTTPStatusError as e:
+        yield f"# HTTP Error\n\nHTTP error occurred: {e}", ""
+    except Exception as e:
+        # UI boundary: show any other failure (e.g. invalid JSON) to the user
+        yield f"# Error\n\nAn error occurred: {e}", ""
+# Create Gradio interface
+# The whole UI is declared inside the Blocks context; components are created
+# in the order they are written, nested under the enclosing Row/Column.
+with gr.Blocks(css=".gradio-container { background-color: #f5f5f5; padding: 20px; border-radius: 10px; }", theme=gr.themes.Citrus()) as demo:
+    gr.Markdown("# Web Search Application")
+    with gr.Row():
+        # First column: query input and the button that triggers the search
+        with gr.Column(
+            render=True,
+            show_progress=True
+        ):
+            query = gr.Textbox(
+                label="Enter your query",
+                placeholder="Type your search query here...",
+                lines=2,
+                max_lines=4,
+                value="",
+                elem_id="query-input"
+            )
+            submit_button = gr.Button("Search")
+        # Second column: Markdown panes for the answer and its sources
+        with gr.Column(
+            render=True,
+            show_progress=True
+        ):
+            output_content = gr.Markdown(
+                label="Response Content",
+                value="",
+                elem_id="response-content",
+                height="600px",
+                visible=True,
+                show_label=True
+            )
+            output_citations = gr.Markdown(
+                label="Citations",
+                value="",
+                elem_id="response-citations",
+                height="200px",
+                visible=True,
+                show_label=True
+            )
+    # Set up event listener
+    # query_api yields a (content, citations) pair, matched positionally to
+    # the two output components listed here.
+    submit_button.click(query_api, inputs=query, outputs=[output_content, output_citations])
+    gr.Markdown("Powered by FastAPI and Gradio")
+# Launch the Gradio application
+# NOTE: there is no `if __name__ == "__main__":` guard, so importing this
+# module also starts the server.
+demo.launch()

main.py ADDED Viewed

	@@ -0,0 +1,82 @@

+import os
+import json
+import asyncio
+from typing import AsyncGenerator
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse
+from fastapi import FastAPI, HTTPException
+from websearch import QueryRequest, PerplexityClient, parse_perplexity_response
+# load .env file
+from dotenv import load_dotenv
+load_dotenv()
+# Initialize FastAPI app
+app = FastAPI()
+# Add CORS middleware to allow frontend connections
+# NOTE(review): every origin, method and header is allowed while credentials
+# are enabled; restrict allow_origins to the real frontend before production.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Adjust this in production
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Initialize Perplexity client
+# The token is read with os.environ[...], so a missing PERPLEXITY_AUTH_TOKEN
+# raises KeyError while this module is being imported (after load_dotenv()).
+perplexity_client = PerplexityClient(
+    api_key=os.environ["PERPLEXITY_AUTH_TOKEN"]
+    )
+async def generate_stream(query: str) -> AsyncGenerator[str, None]:
+    """
+    Produce the JSON body for one web-search request.
+    The generator emits a single item: either the parsed Perplexity result
+    or, if anything goes wrong, an object of the form {"error": "<message>"}.
+    Args:
+        query (str): User query
+    Yields:
+        str: One JSON-encoded string
+    """
+    try:
+        # Ask Perplexity, reduce the raw reply to content + citations,
+        # and hand it to the caller as JSON text
+        raw_reply = await perplexity_client.generate_response(query)
+        yield json.dumps(parse_perplexity_response(raw_reply))
+    except Exception as exc:
+        # Failures are reported in-band so the HTTP response still completes
+        yield json.dumps({"error": str(exc)})
+@app.post("/websearch")
+async def handle_query(request: QueryRequest):
+    """
+    POST /websearch: run the user's query and return the result as a stream.
+    Args:
+        request (QueryRequest): Request body carrying the query text
+    Returns:
+        StreamingResponse: Response whose body is produced by generate_stream
+    """
+    json_stream = generate_stream(request.query)
+    return StreamingResponse(json_stream, media_type="application/json")
+# Optional: liveness probe
+@app.get("/health")
+async def health_check():
+    """Return a fixed payload indicating that the service is running."""
+    return dict(status="healthy")
+# Run the app with:
+# uvicorn main:app --reload
+# Make sure to set the PERPLEXITY_AUTH_TOKEN environment variable (the name read via os.environ above)

model_registry.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from utils import get_current_date, get_websites
+# Both values are computed once, when this module is first imported, and are
+# then baked into the prompt f-string below.
+# NOTE(review): a long-running process keeps the import-time date and website
+# list until it is restarted — confirm this is acceptable.
+current_date = get_current_date()  # returns current date
+websites = get_websites()  # returns list of websites
+# System prompt sent to the model: restricts retrieval to the configured
+# websites and asks for source URLs in the answer.
+perplexity_prompt = f"""
+You are an advanced information retrieval assistant. Your primary task is to provide accurate and concise answers to user queries. You must refer to the following LIST OF WEBSITES to retrieve the context required to answer the user query.
+LIST OF WEBSITES TO REFER - {', '.join(websites)}
+IMPORTANT NOTE - You must always include citations source URLs in the response for user transparency. Provide information that is relevant and up-to-date as of this {current_date} date.
+"""
+# Registry of model settings keyed by provider name; each entry holds the
+# model id, endpoint URL, inference parameters and system prompt.
+model_card = {
+  "perplexity": {
+    "model": "llama-3.1-sonar-large-128k-online",
+    "url": "https://api.perplexity.ai/chat/completions",
+    "inference_config": {
+      "max_tokens": 4094,
+      "temperature": 0
+    },
+    "prompt": perplexity_prompt
+  }
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+# NOTE(review): utils.py imports `yaml`, but PyYAML is not listed here;
+# presumably it arrives as a transitive dependency — confirm and pin it explicitly.
+python-dotenv==1.0.1
+langchain==0.3.7
+langchain-community==0.3.7
+langchain-aws==0.2.7
+tavily-python==0.5.0
+langchain-qdrant==0.2.0
+langchain-openai==0.2.9
+langchain-ollama==0.2.0
+# langchain-mistralai==0.2.2
+# langchain-huggingface==0.1.2
+faiss-cpu==1.9.0.post1
+rapidocr-onnxruntime==1.4.0
+fastapi==0.115.5
+httpx==0.28.0
+uvicorn==0.32.1
+pydantic==2.10.2
+aiofiles==23.2.1 # for gradio 24.1.0 for unstructured
+gradio==5.8.0

utils.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import os
+import yaml
+import datetime
+from typing import Dict, List
+def get_current_date() -> str:
+    """Return today's date as text.
+    Returns:
+        str: The local calendar date in ``YYYY-MM-DD`` form.
+    """
+    today = datetime.date.today()
+    # date.__format__ applies strftime, so this matches strftime("%Y-%m-%d")
+    return format(today, "%Y-%m-%d")
+def read_yaml(file_path: str) -> Dict:
+    """Load a YAML configuration file (e.g. websites.yaml).
+    Args:
+        file_path (str): Path of the YAML file to read.
+    Raises:
+        ValueError: If file_path is empty.
+        FileNotFoundError: If no file exists at file_path (raised by open()).
+    Returns:
+        Dict: The parsed YAML document (None when the file has no content).
+    """
+    # The previous guard tested the function object itself (`not read_yaml`),
+    # which is never falsy, so an empty path was never rejected.
+    if not file_path:
+        raise ValueError("Website yaml config file missing")
+    # Decode explicitly as UTF-8 rather than the platform default
+    with open(file_path, 'r', encoding='utf-8') as file:
+        # SafeLoader restricts the document to plain YAML data types
+        return yaml.load(file, Loader=yaml.SafeLoader)  # reads .yaml file
+def get_websites() -> List[str]:
+    """Read websites.yaml from the current working directory.
+    Returns:
+        List[str]: The entries under the ``public_websites`` key, or an empty
+        list when the file is empty, is not a mapping, or lacks that key.
+    """
+    file_path = os.path.join(os.getcwd(), 'websites.yaml')  # get websites.yaml file path
+    config = read_yaml(file_path)  # read websites.yaml file
+    # An empty file parses to None and a malformed one may parse to a
+    # non-mapping; treat both as "no websites configured".
+    if not isinstance(config, dict):
+        return []
+    # A missing or null key yields [] instead of raising KeyError
+    return config.get('public_websites') or []

websearch.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import asyncio
+from typing import Dict, Any
+import httpx
+from pydantic import BaseModel
+from model_registry import model_card
+perpelxity_card = model_card["perplexity"]
+# Request body model for the web-search endpoint; main.py imports it and
+# reads the `query` field.
+class QueryRequest(BaseModel):
+    query: str  # the user's search text
+class PerplexityClient:
+    """Small async client for the Perplexity chat-completions endpoint.
+    The model name, endpoint URL, system prompt and inference settings are
+    taken from the module-level ``perpelxity_card`` entry.
+    """
+    def __init__(self, api_key: str):
+        """Store the API key and prepare the request headers.
+        Args:
+            api_key (str): Bearer token sent in the Authorization header.
+        """
+        self.api_key = api_key
+        self.base_url = perpelxity_card["url"]
+        self.headers = {
+            "Authorization": f"Bearer {api_key}",
+            "Content-Type": "application/json"
+        }
+    async def generate_response(self, query: str) -> Dict[str, Any]:
+        """Send one non-streaming chat-completion request.
+        Args:
+            query (str): User question, sent as the "user" message after the
+                configured system prompt.
+        Returns:
+            Dict[str, Any]: The decoded JSON body of the API response.
+        Raises:
+            httpx.HTTPStatusError: On a 4xx/5xx response.
+            httpx.RequestError: On connection or timeout failures.
+        """
+        payload = {
+            "model": perpelxity_card["model"],
+            "messages": [
+                {
+                    "role": "system",
+                    "content": perpelxity_card["prompt"]
+                },
+                {
+                    "role": "user",
+                    "content": query
+                }
+            ],
+            "max_tokens": perpelxity_card["inference_config"]["max_tokens"],
+            "temperature": perpelxity_card["inference_config"]["temperature"],
+            "stream": False,  # We'll handle streaming separately
+        }
+        request_timeout = httpx.Timeout(
+            connect=10.0,    # Connection timeout
+            read=45.0,       # Read timeout
+            write=10.0,      # Write timeout
+            pool=10.0        # Connection pool timeout
+        )
+        try:
+            # A single client per call. The previous version also opened an
+            # outer AsyncClient that was never used and was shadowed by this one.
+            async with httpx.AsyncClient(timeout=request_timeout) as client:
+                response = await client.post(
+                    self.base_url,
+                    headers=self.headers,
+                    json=payload
+                )
+                response.raise_for_status()
+                return response.json()
+        except httpx.HTTPStatusError as e:
+            # Log the response body for diagnosis, then let the caller handle it
+            print(f"HTTP error occurred: {e}")
+            print(f"Response text: {e.response.text}")
+            raise
+        except httpx.RequestError as e:
+            print(f"Request error occurred: {e}")
+            raise
+        except Exception as e:
+            print(f"An unexpected error occurred: {e}")
+            raise
+def parse_perplexity_response(response: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Parse the Perplexity API response and extract key information.
+    When the response has no usable first choice (missing or empty "choices",
+    missing "message"/"content", or empty content) a fixed apology message is
+    returned as the content instead of raising.
+    Args:
+        response (Dict[str, Any]): Raw response from Perplexity API
+    Returns:
+        Dict[str, Any]: Parsed response with content and citations
+    """
+    print("parse_perplexity_response called...")
+    # default content to stream if no response from the Perplexity AI
+    default_content = (
+        "I'm sorry, I couldn't find any relevant information for your query from the available sources. "
+        "If you'd like, you can try rephrasing your question or provide more context to help refine the search. "
+        "Alternatively, let me know if you'd like assistance in a different area."
+        )
+    # `or []` also covers an explicit null value
+    citations = response.get("citations") or []
+    # The previous code indexed the fallback *string* with [0]["message"],
+    # which raised TypeError whenever "choices" was absent.
+    try:
+        content = response["choices"][0]["message"]["content"]
+    except (KeyError, IndexError, TypeError):
+        content = None
+    return {
+        "content": content or default_content,
+        "citations": citations
+    }

websites.yaml ADDED Viewed

	@@ -0,0 +1,135 @@

+public_websites:
+  - https://www.alimarket.es/base_de_datos
+  - https://www.bain.cn/
+  - http://zhgry.aiijournal.com/CN/1671-4393/home.shtml
+  - https://moloprom.ru/
+  - https://2023.dairyunion.ru/
+  - https://www2.deloitte.com/cn/zh.html
+  - https://www.emeoutlookmag.com/food-beverage
+  - https://fabnews.live/
+  - https://fmcgmagazine.co.uk/
+  - https://www.foodbusinessnews.net/
+  - https://www.fooddive.com/
+  - https://www.foodengineeringmag.com/articles/96940-new-advances-make-aseptic-packaging-more-popular
+  - https://foodindustryexecutive.com/
+  - http://spgykj.com/
+  - https://www.foodmanufacturing.com/
+  - https://www.foodprocessing.com/
+  - https://www.foodprocessing.com/
+  - https://www.foodsafetyafrica.net/
+  - https://www.foodtechbiz.com/packaging/tetra-pak-introduces-tetra-stelo-aseptic-package-with-minute-maid-juice-range-of-coca-cola-in-india
+  - https://www.fruit-processing.com/
+  - https://www.globalbrandsmagazine.com/beverage-brands-in-the-middle-east/
+  - https://www.greenqueen.com.hk/
+  - https://www.healthcaremea.com/
+  - https://www.ingredientsnetwork.com/
+  - https://www.iresearch.cn/
+  - https://issuu.com/ruralnewsgroup/docs/dn_508_november_22
+  - https://www.just-food.com/
+  - https://www.kantarworldpanel.com/cn
+  - https://www.labelsandlabeling.com/
+  - https://lenta.ru
+  - https://www.livemint.com/
+  - https://www.magzter.com/ar/HK/Ringier-Trade-Media-Ltd/Food-Manufacturing-Journal---Middle-East-&-Africa/Food-&-Beverage/194798
+  - https://www.mckinsey.com.cn/
+  - https://milknews.ru/
+  - https://www.nutritioninsight.com/
+  - https://www.packagingnetwork.com/doc/fruit-juice-maker-chooses-drink-box-for-grown-0001
+  - https://www.packagingnews.co.uk/
+  - https://packagingsouthasia.com/
+  - https://www.packaginglaw.com/
+  - https://www.pwccn.com/zh/research-and-insights.html
+  - https://www.retail.ru/
+  - https://retailer.ru/
+  - https://www.rbc.ru/
+  - https://rg.ru/
+  - https://sustainabilitymea.com/
+  - https://www.foodsafetyafrica.net/fda-announces-elimination-of-pfas-in-food-packaging-to-protect-public-health/
+  - https://tass.ru
+  - https://thebeet.com/category/plant-based-news/
+  - https://www.thegrocer.co.uk/
+  - https://www.theveganindians.com/
+  - https://www.thevegankind.com/
+  - https://www.totallyveganbuzz.com/
+  - https://www.unipack.ru/
+  - https://vegnews.com/
+  - https://www.veganfirst.com/
+  - https://www.theveganindians.com/indian-start-up-launches-first-ever-vegan-milk-made-from-sprouted-millets/
+  - https://vegoutmag.com/food-and-drink
+  - https://www.ecfr.gov/
+  - https://freepub.edqm.eu/publications/
+  - https://www.bfr.bund.de/de/bfr_empfehlungen_fuer_materialien_im_lebensmittelkontakt-308425.html
+  - https://cfsa.net.cn/
+  - https://www.icourse163.org/
+  - https://www.doc88.com/
+  - https://www.foodbev.com/news/category/industries/beverage/
+  - https://www.packagingdigest.com/food-beverage/beverage-packaging
+  - https://www.packagingnews.co.uk/news/materials/cartonboard
+  - https://www.dairyreporter.com/
+  - https://www.packaginginsights.com
+  - https://packagingeurope.com/
+  - https://www.foodnavigator-asia.com/
+  - https://www.newsnow.co.uk/h/Industry+Sectors/Food+&+Drink/Food+Manufacturers
+  - https://www.beveragedaily.com/
+  - https://www.fb101.com/
+  - https://www.dairyfoods.com/
+  - https://www.foodengineeringmag.com/
+  - https://www.preparedfoods.com/
+  - https://www.bevindustry.com/
+  - https://www.foodnavigator-usa.com/
+  - https://www.foodnavigator-latam.com/
+  - https://www.foodnavigator.com/
+  - https://www.apfoodonline.com/
+  - https://www.food-safety.com/
+  - https://www.foodbusinessgulf.com/
+  - https://www.bevnet.com/
+  - https://imbibemagazine.com/
+  - https://beveragedynamics.com/
+  - https://www.beverage-digest.com/
+  - https://www.thebeveragejournal.com/
+  - https://foodchainmagazine.com/news/category/insights/
+  - https://www.just-drinks.com/
+  - https://www.newfoodmagazine.com/
+  - https://beverage-master.com/
+  - https://drinksint.com/
+  - https://dairynews.today/global/
+  - https://globaldairyplatform.com/news_category/dairy-media-news/
+  - https://www.usdairy.com/media
+  - https://dairybusiness.com/
+  - https://www.thedairysite.com/
+  - https://californiadairymagazine.com/
+  - https://www.dairynewsaustralia.com.au/
+  - https://www.dairynz.co.nz/news/
+  - https://international-dairy.com/media-information/
+  - https://www.packaging-gateway.com/news/
+  - https://www.packworld.com/
+  - https://foodbeverageasia.com/
+  - https://www.nspackaging.com/
+  - https://www.packagingstrategies.com/
+  - https://spnews.com/
+  - https://www.foodbusinessafrica.com/
+  - https://www.foodmanufacture.co.uk/#
+  - https://www.weibo.com/u/2096538335?lpage=profileRecom
+  - http://www.csztv.cn/
+  - https://www.cbndata.com/
+  - https://news.qq.com/
+  - http://www.foodmate.net/
+  - https://www.foodingredientsfirst.com/
+  - https://www.beveragemarketing.com/strategist.asp
+  - https://www.ift.org/news-and-publications/food-technology-magazine
+  - https://www.packagingstrategies.com/flexible-packaging
+  - https://www.compacknews.news/en/
+  - https://www.italiaimballaggio.it/
+  - https://www.italiagrafica.com/
+  - https://www.portalspozywczy.pl/
+  - https://www.clal.it/
+  - https://www.pack.com.br/
+  - https://www.plastico.com.br/category/embalagens/
+  - https://www.canaldoleite.com/
+  - https://www.foodtalks.cn/
+  - https://www.foodaily.com/
+  - https://www.iimedia.cn/
+  - https://www.cdia.org.cn/
+  - https://www.chinacoatingnet.com/
+  - http://www.cppia.org.cn/
+  - https://www.dac.org.cn/