web-crawling / main.py
import json
import os
from typing import List

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field

from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

app = FastAPI()


class CrawlerInput(BaseModel):
    """Request body for /crawl: target URL plus parallel lists of column names and descriptions."""
    url: str = Field(..., description="URL to crawl")
    columns: List[str] = Field(..., description="List of required columns")
    descriptions: List[str] = Field(..., description="Descriptions for each column")


class CrawlerOutput(BaseModel):
    """Response body for /crawl: one dict per extracted record."""
    data: List[dict]
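

# Illustrative request/response shapes for /crawl (a sketch with hypothetical
# values; the URL and column names below are examples, not defaults used by
# this service):
#
#   request:  {"url": "https://example.com/products",
#              "columns": ["name", "price"],
#              "descriptions": ["Product name", "Product price in USD"]}
#   response: {"data": [{"name": "...", "price": "..."}]}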


async def simple_crawl():
    # Basic crawl used by the /test endpoint: fetch a fixed page and return the result.
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business")
        print(len(result.markdown))
        return result


@app.post("/crawl", response_model=CrawlerOutput)
async def crawl(input: CrawlerInput):
    if len(input.columns) != len(input.descriptions):
        raise HTTPException(status_code=400, detail="Number of columns must match number of descriptions")

    # Map each column name to its description, then embed the mapping in the LLM instruction.
    extraction_info = {col: desc for col, desc in zip(input.columns, input.descriptions)}
    instruction = f"Extract the following information: {json.dumps(extraction_info)}"
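
    # For example, with columns=["name", "price"] and the matching descriptions
    # shown earlier (hypothetical values), the instruction becomes:
    #   Extract the following information: {"name": "Product name", "price": "Product price in USD"}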

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=input.url,
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o-mini",
                api_token=os.getenv('OPENAI_API_KEY'),
                extraction_type="schema",
                verbose=True,
                instruction=instruction
            )
        )

    # extracted_content is a JSON string produced by the extraction strategy.
    extracted_data = json.loads(result.extracted_content)
    return CrawlerOutput(data=extracted_data)


@app.get("/test")
async def test():
    result = await simple_crawl()
    return {"markdown": result.markdown}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
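

# --- Example client call (illustrative sketch) -------------------------------
# How a client might call the /crawl endpoint once the app is serving on port
# 7860. The target URL and column names are hypothetical placeholders, and the
# `requests` dependency is assumed to be installed separately.
#
# import requests
#
# payload = {
#     "url": "https://example.com/products",
#     "columns": ["name", "price"],
#     "descriptions": ["Product name", "Product price in USD"],
# }
# resp = requests.post("http://localhost:7860/crawl", json=payload, timeout=120)
# print(resp.json())  # -> {"data": [{"name": "...", "price": "..."}]}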