# Spaces: Runtime error
from itertools import islice
from typing import List

from datasets import load_dataset
from fastapi import FastAPI, Query
# NOTE(review): no route decorators are visible on the handlers below in this
# view - confirm each one is actually registered on `app`.
app = FastAPI()

# Load the dataset in streaming mode for memory efficiency.
# NOTE(review): this runs at import time, so a failure here stops the app
# from starting - confirm the dataset loads without extra arguments.
dataset = load_dataset("togethercomputer/RedPajama-Data-1T", streaming=True)
def greet_json() -> dict:
    """
    Return the static welcome payload for the API.

    Returns:
    - A dictionary with a single "message" key holding the greeting text.
    """
    return {"message": "Welcome to the RedPajama Dataset API"}
def get_data(chunk_size: int = 10) -> dict:
    """
    Return a small chunk from the start of the "train" split.

    Parameters:
    - chunk_size: The number of examples to return (default: 10). Zero or a
      negative value yields an empty list.

    Returns:
    - A dictionary whose "data" key holds a list of at most `chunk_size`
      examples, in the order the split yields them.
    """
    # Clamp to zero so a non-positive request returns nothing instead of
    # iterating the whole split (islice also rejects a negative stop).
    limit = max(chunk_size, 0)
    # islice stops pulling from the split as soon as `limit` items are taken.
    examples = islice(dataset["train"], limit)  # Adjust split if needed
    return {"data": list(examples)}
def search_data(keyword: str, max_results: int = 10) -> dict:
    """
    Search the "train" split for examples that mention a keyword.

    The match is a case-insensitive substring test against the string form
    of each whole example, so it also sees field names, not only values.
    The scan only stops early once `max_results` matches are collected; a
    keyword with fewer matches makes it read the split to the end.

    Parameters:
    - keyword: The keyword to search for.
    - max_results: The maximum number of results to return (default: 10).
      Zero or a negative value yields an empty list without scanning.

    Returns:
    - A dictionary whose "results" key holds the matching examples, in the
      order the split yields them.
    """
    # Without this guard a non-positive limit would never satisfy the stop
    # condition and every match in the split would be accumulated.
    if max_results <= 0:
        return {"results": []}

    # Lower-case the keyword once instead of on every example.
    needle = keyword.lower()
    matches = (
        example
        for example in dataset["train"]  # Adjust split if needed
        if needle in str(example).lower()
    )
    # The generator is lazy, so islice stops the scan at the last needed match.
    return {"results": list(islice(matches, max_results))}
def data_summary() -> dict:
    """
    Provide a basic summary of the dataset.

    Returns:
    - A dictionary whose "dataset_splits" key holds the list of split names
      available in the loaded dataset.
    """
    # Materialise the keys: a dict_keys view is not a plain sequence and
    # cannot be encoded as a JSON array.
    return {"dataset_splits": list(dataset.keys())}