web-crawling

Running

File size: 2,965 Bytes

ffa393c
b5b2e6a
a3bb0bd
ffa393c
1d78f18
b5b2e6a
a3bb0bd
ea680b0
a3bb0bd
 
ea680b0
a3bb0bd
 
b5b2e6a
9300ae8
 
 
a3bb0bd
dbf8d54
 
949bf2b
ffa393c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c8510e0
 
 
 
 
 
 
 
676b3da
c8510e0
1a44639

from typing import Optional

import hrequests
import trafilatura
from fastapi import FastAPI, HTTPException, Query
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

# ASGI application instance; the route decorators and the CORS middleware
# further down in this module are registered on it.
app = FastAPI()

# JSON request body accepted by the POST /scrape endpoint.
class URLRequest(BaseModel):
    # Address of the page to fetch. Declared as a plain string, so no
    # URL-specific validation is applied by this model.
    url: str

@app.post("/scrape")
async def scrape(url_request: URLRequest):
    """Fetch a page and return its raw response text.

    The URL is requested through ``hrequests.get`` with ``browser='chrome'``.

    Args:
        url_request: Request body carrying the URL to fetch.

    Returns:
        A dict of the form ``{"content": <response text>}``.

    Raises:
        HTTPException: 502 when the page cannot be fetched or its body
            cannot be read.
    """
    # NOTE(review): the URL is caller-supplied and fetched server-side;
    # consider restricting schemes/hosts if this service can reach
    # internal networks.
    try:
        # hrequests.get is a synchronous call; run it in the worker
        # threadpool so this coroutine does not stall the event loop.
        response = await run_in_threadpool(
            hrequests.get, url_request.url, browser='chrome'
        )
        content = response.text
    except Exception as exc:
        # Surface upstream failures as a structured HTTP error instead of
        # letting them bubble up as an opaque 500.
        raise HTTPException(
            status_code=502,
            detail=f"Failed to fetch URL: {exc}",
        ) from exc

    return {"content": content}

@app.get("/extract-article")
def extract_article(
    url: str,
    record_id: Optional[str] = Query(None, description="Add an ID to the metadata."),
    no_fallback: Optional[bool] = Query(False, description="Skip the backup extraction with readability-lxml and justext."),
    favor_precision: Optional[bool] = Query(False, description="Prefer less text but correct extraction."),
    favor_recall: Optional[bool] = Query(False, description="When unsure, prefer more text."),
    include_comments: Optional[bool] = Query(True, description="Extract comments along with the main text."),
    output_format: Optional[str] = Query('txt', description="Define an output format: 'csv', 'json', 'markdown', 'txt', 'xml', 'xmltei'.", enum=["csv", "json", "markdown", "txt", "xml", "xmltei"]),
    target_language: Optional[str] = Query(None, description="Define a language to discard invalid documents (ISO 639-1 format)."),
    include_tables: Optional[bool] = Query(True, description="Take into account information within the HTML <table> element."),
    include_images: Optional[bool] = Query(False, description="Take images into account (experimental)."),
    include_links: Optional[bool] = Query(False, description="Keep links along with their targets (experimental)."),
    deduplicate: Optional[bool] = Query(False, description="Remove duplicate segments and documents."),
    max_tree_size: Optional[int] = Query(None, description="Discard documents with too many elements.")
):
    """Download a page and extract its main content with trafilatura.

    Every query parameter other than ``url`` is forwarded unchanged to
    ``trafilatura.extract``; see the individual parameter descriptions.

    Returns:
        ``{"article": <extracted content>}`` on success, or
        ``{"error": "Could not extract the article"}`` when trafilatura
        yields nothing.

    Raises:
        HTTPException: 502 when the page cannot be fetched or its body
            cannot be read.
    """
    # NOTE(review): the URL is caller-supplied and fetched server-side;
    # consider restricting schemes/hosts if this service can reach
    # internal networks.
    try:
        response = hrequests.get(url)
        filecontent = response.text
    except Exception as exc:
        # Report upstream failures as a structured HTTP error rather than
        # an unhandled exception (opaque 500).
        raise HTTPException(
            status_code=502,
            detail=f"Failed to fetch URL: {exc}",
        ) from exc

    extracted = trafilatura.extract(
        filecontent,
        url=url,
        record_id=record_id,
        no_fallback=no_fallback,
        favor_precision=favor_precision,
        favor_recall=favor_recall,
        include_comments=include_comments,
        output_format=output_format,
        target_language=target_language,
        include_tables=include_tables,
        include_images=include_images,
        include_links=include_links,
        deduplicate=deduplicate,
        max_tree_size=max_tree_size,
    )

    # Kept as a 200 response with an "error" key so existing clients that
    # inspect the payload keep working.
    if not extracted:
        return {"error": "Could not extract the article"}
    return {"article": extracted}

# Enable CORS for the whole application: every origin, HTTP method and
# request header is accepted, and credentialed requests are allowed.
# NOTE(review): wildcard origins combined with allow_credentials=True is
# very permissive; the FastAPI CORS docs advise listing origins explicitly
# when credentials are allowed — confirm this configuration is intended.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/", tags=["Home"])
def api_home():
    # Landing route: greets the caller and points at the hosted docs page.
    welcome_message = 'Welcome to Web-Scraping API! Visit https://pvanand-web-scraping.hf.space/docs to test'
    return dict(detail=welcome_message)