"""Web-scraping API.

Exposes three endpoints:
  * POST /scrape          — fetch a URL with a browser-like client, return raw HTML.
  * GET  /extract-article — fetch a URL and extract the main article text via trafilatura.
  * GET  /                — welcome message.
"""

from typing import Optional  # BUGFIX: Optional was used below but never imported (NameError at import time)

from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import hrequests
import trafilatura

app = FastAPI()


class URLRequest(BaseModel):
    """Request body for POST /scrape."""
    url: str


@app.post("/scrape")
async def scrape(url_request: URLRequest):
    """Fetch the given URL with a Chrome-like client and return the raw page text.

    Returns:
        {"content": <raw response body>}

    Raises:
        HTTPException(500): if the fetch fails for any reason.
    """
    try:
        # browser='chrome' makes hrequests emulate Chrome TLS/headers to avoid blocks.
        response = hrequests.get(url_request.url, browser='chrome')
        return {"content": response.text}
    except Exception as e:
        # BUGFIX: original did `raise e` (a no-op re-raise leaking a raw traceback);
        # surface a clean HTTP 500 with the error detail instead.
        raise HTTPException(status_code=500, detail=str(e)) from e


@app.get("/extract-article")
def extract_article(
    url: str,
    record_id: Optional[str] = Query(None, description="Add an ID to the metadata."),
    no_fallback: Optional[bool] = Query(False, description="Skip the backup extraction with readability-lxml and justext."),
    favor_precision: Optional[bool] = Query(False, description="Prefer less text but correct extraction."),
    favor_recall: Optional[bool] = Query(False, description="When unsure, prefer more text."),
    include_comments: Optional[bool] = Query(True, description="Extract comments along with the main text."),
    output_format: Optional[str] = Query('txt', description="Define an output format: 'csv', 'json', 'markdown', 'txt', 'xml', 'xmltei'.", enum=["csv", "json", "markdown", "txt", "xml", "xmltei"]),
    target_language: Optional[str] = Query(None, description="Define a language to discard invalid documents (ISO 639-1 format)."),
    include_tables: Optional[bool] = Query(True, description="Take into account information within the HTML element."),
    include_images: Optional[bool] = Query(False, description="Take images into account (experimental)."),
    include_links: Optional[bool] = Query(False, description="Keep links along with their targets (experimental)."),
    deduplicate: Optional[bool] = Query(False, description="Remove duplicate segments and documents."),
    max_tree_size: Optional[int] = Query(None, description="Discard documents with too many elements.")
):
    """Fetch *url* and extract its main article content with trafilatura.

    All query parameters map directly onto ``trafilatura.extract`` options.

    Returns:
        {"article": <extracted text>} on success, or
        {"error": ...} when trafilatura could not extract anything.
    """
    response = hrequests.get(url)
    filecontent = response.text
    extracted = trafilatura.extract(
        filecontent,
        url=url,
        record_id=record_id,
        no_fallback=no_fallback,
        favor_precision=favor_precision,
        favor_recall=favor_recall,
        include_comments=include_comments,
        output_format=output_format,
        target_language=target_language,
        include_tables=include_tables,
        include_images=include_images,
        include_links=include_links,
        deduplicate=deduplicate,
        max_tree_size=max_tree_size,
    )
    if extracted:
        return {"article": extracted}
    else:
        return {"error": "Could not extract the article"}


# Allow cross-origin calls from any host (public demo API).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/", tags=["Home"])
def api_home():
    """Landing endpoint pointing users at the interactive docs."""
    return {'detail': 'Welcome to Web-Scraping API! Visit https://pvanand-web-scraping.hf.space/docs to test'}