from typing import Optional

import hrequests
import pdfkit
import trafilatura
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import Response
from fastapi_cache import FastAPICache
from fastapi_cache.backends.inmemory import InMemoryBackend
from fastapi_cache.decorator import cache
from pydantic import BaseModel
from pytrends.request import TrendReq

app = FastAPI()
class URLRequest(BaseModel):
    url: str


@app.post("/scrape")
async def scrape(url_request: URLRequest):
    """Fetch a URL with a Chrome browser fingerprint and return the raw HTML."""
    try:
        response = hrequests.get(url_request.url, browser='chrome')
        return {"content": response.text}
    except Exception as e:
        # Surface fetch failures as a proper HTTP error instead of re-raising,
        # which would otherwise bubble up as an unhandled 500 with no detail.
        raise HTTPException(status_code=500, detail=str(e))
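# Example call (a minimal sketch, not executed by the app; the base URL comes
# from the home endpoint below, and httpx is an arbitrary client choice):
#
#   import httpx
#   r = httpx.post("https://pvanand-web-scraping.hf.space/scrape",
#                  json={"url": "https://example.com"})
#   print(r.json()["content"][:200])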
@app.get("/extract-article")
def extract_article(
url: str,
record_id: Optional[str] = Query(None, description="Add an ID to the metadata."),
no_fallback: Optional[bool] = Query(False, description="Skip the backup extraction with readability-lxml and justext."),
favor_precision: Optional[bool] = Query(False, description="Prefer less text but correct extraction."),
favor_recall: Optional[bool] = Query(False, description="When unsure, prefer more text."),
include_comments: Optional[bool] = Query(True, description="Extract comments along with the main text."),
output_format: Optional[str] = Query('txt', description="Define an output format: 'csv', 'json', 'markdown', 'txt', 'xml', 'xmltei'.", enum=["csv", "json", "markdown", "txt", "xml", "xmltei"]),
target_language: Optional[str] = Query(None, description="Define a language to discard invalid documents (ISO 639-1 format)."),
include_tables: Optional[bool] = Query(True, description="Take into account information within the HTML <table> element."),
include_images: Optional[bool] = Query(False, description="Take images into account (experimental)."),
include_links: Optional[bool] = Query(False, description="Keep links along with their targets (experimental)."),
deduplicate: Optional[bool] = Query(False, description="Remove duplicate segments and documents."),
max_tree_size: Optional[int] = Query(None, description="Discard documents with too many elements.")
):
response = hrequests.get(url)
filecontent = response.text
extracted = trafilatura.extract(
filecontent,
url=url,
record_id=record_id,
no_fallback=no_fallback,
favor_precision=favor_precision,
favor_recall=favor_recall,
include_comments=include_comments,
output_format=output_format,
target_language=target_language,
include_tables=include_tables,
include_images=include_images,
include_links=include_links,
deduplicate=deduplicate,
max_tree_size=max_tree_size
)
if extracted:
return {"article": trafilatura.utils.sanitize(extracted)}
else:
return {"error": "Could not extract the article"}
# Allow cross-origin requests from any domain so the API can be called
# directly from browser clients.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
pytrends = TrendReq()


@app.on_event("startup")
async def startup():
    # Back the @cache decorator with a simple in-process store.
    FastAPICache.init(InMemoryBackend(), prefix="fastapi-cache")


@app.get("/realtime_trending_searches")
@cache(expire=3600)  # Cache results for an hour to avoid hammering Google Trends.
async def get_realtime_trending_searches(pn: str = Query('US', description="Country code for trending searches")):
    trending_searches = pytrends.realtime_trending_searches(pn=pn)
    return trending_searches.to_dict(orient='records')
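# Example call (a sketch; 'US' is also the default country code):
#
#   import httpx
#   r = httpx.get("https://pvanand-web-scraping.hf.space/realtime_trending_searches",
#                 params={"pn": "US"})
#   print(r.json()[:3])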
@app.get("/", tags=["Home"])
def api_home():
return {'detail': 'Welcome to Web-Scraping API! Visit https://pvanand-web-scraping.hf.space/docs to test'}
class HTMLRequest(BaseModel):
    html_content: str


@app.post("/html_to_pdf")
async def convert_to_pdf(request: HTMLRequest):
    """Render an HTML string to PDF. Requires the wkhtmltopdf binary, which pdfkit wraps."""
    try:
        options = {
            'page-size': 'A4',
            'margin-top': '0.75in',
            'margin-right': '0.75in',
            'margin-bottom': '0.75in',
            'margin-left': '0.75in',
            'encoding': "UTF-8",
        }
        # Passing False as the output path makes pdfkit return the PDF as bytes.
        pdf = pdfkit.from_string(request.html_content, False, options=options)
        return Response(content=pdf, media_type="application/pdf")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
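# Example call (a sketch; writes the returned PDF bytes to disk):
#
#   import httpx
#   r = httpx.post("https://pvanand-web-scraping.hf.space/html_to_pdf",
#                  json={"html_content": "<h1>Hello</h1>"})
#   with open("out.pdf", "wb") as f:
#       f.write(r.content)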