Commit 5721477
Parent(s): b558f4f

schedule collections refresh

Files changed:
- create_collections.py +128 -0
- main.py +20 -0
create_collections.py
ADDED
@@ -0,0 +1,128 @@
from typing import Any, Dict, Iterator, List

import requests
from huggingface_hub import add_collection_item, create_collection
from tqdm.auto import tqdm


class DatasetSearchClient:
    def __init__(
        self,
        base_url: str = "https://librarian-bots-dataset-column-search-api.hf.space",
    ):
        self.base_url = base_url

    def search(
        self, columns: List[str], match_all: bool = False, page_size: int = 100
    ) -> Iterator[Dict[str, Any]]:
        """
        Search datasets using the provided API, automatically handling pagination.

        Args:
            columns (List[str]): List of column names to search for.
            match_all (bool, optional): If True, match all columns. If False, match any column. Defaults to False.
            page_size (int, optional): Number of results per page. Defaults to 100.

        Yields:
            Dict[str, Any]: Each dataset result from all pages.

        Raises:
            requests.RequestException: If there's an error with the HTTP request.
            ValueError: If the API returns an unexpected response format.
        """
        page = 1
        total_results = None

        while total_results is None or (page - 1) * page_size < total_results:
            params = {
                "columns": columns,
                "match_all": str(match_all).lower(),
                "page": page,
                "page_size": page_size,
            }

            try:
                response = requests.get(f"{self.base_url}/search", params=params)
                response.raise_for_status()
                data = response.json()

                if not {"total", "page", "page_size", "results"}.issubset(data.keys()):
                    raise ValueError("Unexpected response format from the API")

                if total_results is None:
                    total_results = data["total"]

                yield from data["results"]
                page += 1

            except requests.RequestException as e:
                raise requests.RequestException(
                    f"Error connecting to the API: {str(e)}"
                ) from e
            except ValueError as e:
                raise ValueError(f"Error processing API response: {str(e)}") from e


# Create an instance of the client
client = DatasetSearchClient()


def update_collection_for_dataset(
    collection_name: str = None,
    dataset_columns: List[str] = None,
    collection_description: str = None,
    collection_namespace: str = None,
):
    if not collection_namespace:
        collection = create_collection(
            collection_name, exists_ok=True, description=collection_description
        )
    else:
        collection = create_collection(
            collection_name,
            exists_ok=True,
            description=collection_description,
            namespace=collection_namespace,
        )
    results = list(
        tqdm(
            client.search(dataset_columns, match_all=True),
            desc="Searching datasets...",
            leave=False,
        )
    )
    for result in tqdm(results, desc="Adding datasets to collection...", leave=False):
        try:
            add_collection_item(
                collection.slug, result["hub_id"], item_type="dataset", exists_ok=True
            )
        except Exception as e:
            print(
                f"Error adding dataset {result['hub_id']} to collection {collection_name}: {str(e)}"
            )
    return f"https://huggingface.co/collections/{collection.slug}"


collections = [
    {
        "dataset_columns": ["chosen", "rejected", "prompt"],
        "collection_description": "Datasets suitable for Direct Preference Optimization based on having 'chosen', 'rejected', and 'prompt' columns",
        "collection_name": "Direct Preference Optimization Datasets",
    },
    {
        "dataset_columns": ["image", "chosen", "rejected"],
        "collection_description": "Datasets suitable for Image Preference Optimization based on having 'image', 'chosen', and 'rejected' columns",
        "collection_name": "Image Preference Optimization Datasets",
    },
    {
        "collection_name": "Alpaca Style Datasets",
        "dataset_columns": ["instruction", "input", "output"],
        "collection_description": "Datasets which follow the Alpaca Style format based on having 'instruction', 'input', and 'output' columns",
    },
]

results = [
    update_collection_for_dataset(**collection, collection_namespace="librarian-bots")
    for collection in collections
]
print(results)
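For reference, a minimal sketch of how the new helper could be exercised on its own; the collection title and column list below are hypothetical examples, not part of this commit:

from create_collections import update_collection_for_dataset

# Hypothetical example: build a collection of datasets that expose a 'messages' column.
# Note that importing create_collections also runs its module-level refresh of the
# three collections defined above, just as the import in main.py does.
url = update_collection_for_dataset(
    collection_name="Chat Format Datasets",      # illustrative title, not in the commit
    dataset_columns=["messages"],                # illustrative column filter
    collection_description="Datasets with a 'messages' column",
    collection_namespace="librarian-bots",
)
print(url)  # URL of the created or updated collection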
main.py
CHANGED
@@ -17,6 +17,7 @@ from pandas import Timestamp
 from pydantic import BaseModel
 from starlette.responses import RedirectResponse
 
+from create_collections import collections, update_collection_for_dataset
 from data_loader import refresh_data
 
 login(token=os.getenv("HF_TOKEN"))
@@ -163,6 +164,23 @@ async def update_database():
         logger.error(f"Error uploading database file to Hugging Face Hub: {str(e)}")
 
 
+async def update_collections():
+    logger.info("Starting scheduled collection update")
+    try:
+        for collection in collections:
+            result = await asyncio.get_event_loop().run_in_executor(
+                None,
+                update_collection_for_dataset,
+                collection["collection_name"],
+                collection["dataset_columns"],
+                collection["collection_description"],
+                "librarian-bots",
+            )
+            logger.info(f"Updated collection: {result}")
+    except Exception as e:
+        logger.error(f"Error during collection update: {str(e)}")
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     setup_database()
@@ -173,6 +191,8 @@ async def lifespan(app: FastAPI):
     scheduler = AsyncIOScheduler()
     # Schedule the update_database function using the UPDATE_SCHEDULE configuration
     scheduler.add_job(update_database, CronTrigger(**UPDATE_SCHEDULE))
+    # Schedule the update_collections function to run daily at midnight
+    scheduler.add_job(update_collections, CronTrigger(hour=0, minute=0))
     scheduler.start()
 
     yield
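The new update_collections coroutine offloads update_collection_for_dataset (which blocks on HTTP and Hub calls) to the default thread-pool executor, so the FastAPI event loop stays responsive while the scheduled refresh runs; AsyncIOScheduler can then run the coroutine alongside the existing update_database job. A minimal standalone sketch of the same pattern, using a hypothetical blocking_refresh stand-in:

import asyncio


def blocking_refresh(name: str) -> str:
    # Stand-in for a blocking call such as update_collection_for_dataset.
    return f"refreshed {name}"


async def refresh_all():
    loop = asyncio.get_event_loop()
    # Offload the blocking function to the default thread-pool executor and await its
    # result, mirroring how update_collections wraps update_collection_for_dataset.
    result = await loop.run_in_executor(None, blocking_refresh, "example-collection")
    print(result)


asyncio.run(refresh_all())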