import os
import json
from typing import List

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

app = FastAPI()


class CrawlerInput(BaseModel):
    url: str = Field(..., description="URL to crawl")
    columns: List[str] = Field(..., description="List of required columns")
    descriptions: List[str] = Field(..., description="Descriptions for each column")


class CrawlerOutput(BaseModel):
    data: List[dict]


async def simple_crawl():
    # Fetch a single fixed page and return the crawl result (markdown, HTML, etc.).
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business")
        print(len(result.markdown))
        return result


@app.post("/crawl", response_model=CrawlerOutput)
async def crawl(crawler_input: CrawlerInput):
    if len(crawler_input.columns) != len(crawler_input.descriptions):
        raise HTTPException(
            status_code=400,
            detail="Number of columns must match number of descriptions",
        )

    # Create a dictionary with columns as keys and descriptions as values,
    # then embed it as a JSON string in the extraction instruction for the LLM
    extraction_info = dict(zip(crawler_input.columns, crawler_input.descriptions))
    instruction = f"Extract the following information: {json.dumps(extraction_info)}"

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=crawler_input.url,
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o-mini",
                api_token=os.getenv("OPENAI_API_KEY"),
                extraction_type="schema",
                verbose=True,
                instruction=instruction,
            ),
        )

    # The strategy returns extracted content as a JSON string; parse it
    # before wrapping it in the response model.
    extracted_data = json.loads(result.extracted_content)
    return CrawlerOutput(data=extracted_data)


@app.get("/test")
async def test():
    # Smoke-test endpoint: crawl the fixed page and return its markdown.
    result = await simple_crawl()
    return {"markdown": result.markdown}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
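# --- Example usage (illustrative sketch, not part of the app above) ---
# With the server running on port 7860, the /crawl endpoint can be exercised
# like this; the target URL, column names, and descriptions below are
# assumptions chosen only to demonstrate the request shape.
#
#   curl -X POST http://localhost:7860/crawl \
#     -H "Content-Type: application/json" \
#     -d '{"url": "https://www.nbcnews.com/business",
#          "columns": ["headline", "summary"],
#          "descriptions": ["Article headline text", "One-sentence summary"]}'
#
# A successful response matches CrawlerOutput, e.g.:
#   {"data": [{"headline": "...", "summary": "..."}]}
#
# Note: the LLM extraction step requires OPENAI_API_KEY to be set in the
# environment before starting the server.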