import os
import json

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from typing import List
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
app = FastAPI()

class CrawlerInput(BaseModel):
    url: str = Field(..., description="URL to crawl")
    columns: List[str] = Field(..., description="List of required columns")
    descriptions: List[str] = Field(..., description="Descriptions for each column")


class CrawlerOutput(BaseModel):
    data: List[dict]

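# A sketch of the request payload the crawl endpoint below expects; the URL
# and column names here are illustrative, not from the source:
#
#   {
#       "url": "https://example.com/products",
#       "columns": ["name", "price"],
#       "descriptions": ["Product name", "Price in USD"]
#   }
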
async def simple_crawl():
    # Smoke test: crawl a single page and report the size of the rendered markdown.
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business")
        print(len(result.markdown))
        return result
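
# simple_crawl() can also be exercised outside the API, e.g. (a minimal sketch):
#
#   import asyncio
#   asyncio.run(simple_crawl())
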
@app.post("/crawl", response_model=CrawlerOutput)
async def crawl(input: CrawlerInput):
    if len(input.columns) != len(input.descriptions):
        raise HTTPException(status_code=400, detail="Number of columns must match number of descriptions")

    # Map each requested column to its description, then embed the mapping
    # in the extraction instruction as a JSON string.
    extraction_info = {col: desc for col, desc in zip(input.columns, input.descriptions)}
    instruction = f"Extract the following information: {json.dumps(extraction_info)}"

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=input.url,
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o-mini",
                api_token=os.getenv('OPENAI_API_KEY'),
                extraction_type="schema",
                verbose=True,
                instruction=instruction
            )
        )

    # The strategy returns its result as a JSON string; parse it into the response model.
    extracted_data = json.loads(result.extracted_content)
    return CrawlerOutput(data=extracted_data)

@app.get("/test")
async def test():
    # Demo endpoint: run the fixed-URL crawl and return the raw markdown.
    result = await simple_crawl()
    return {"markdown": result.markdown}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
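
# Example request against a locally running instance (a sketch, assuming the
# service is reachable on port 7860 and OPENAI_API_KEY is set in the environment):
#
#   curl -X POST http://localhost:7860/crawl \
#     -H "Content-Type: application/json" \
#     -d '{"url": "https://www.nbcnews.com/business",
#          "columns": ["headline", "summary"],
#          "descriptions": ["Article headline", "One-sentence summary"]}'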