web-crawling

Running

App Files Files Community

web-crawling / main.py

pvanand

Update main.py

ffa393c verified 11 months ago

raw

history blame

2.97 kB

	from typing import Optional

	import hrequests
	import trafilatura
	from fastapi import FastAPI, HTTPException, Query
	from fastapi.concurrency import run_in_threadpool
	from fastapi.middleware.cors import CORSMiddleware
	from pydantic import BaseModel

	# Application object; the route decorators and the CORS middleware
	# registration further down this file all attach to it.
	app = FastAPI()

	# Request body model for POST /scrape: carries the address of the page to fetch.
	class URLRequest(BaseModel):
	    # Address of the page to download. Declared as a plain string, so no
	    # URL-format validation is applied by this model.
	    url: str

	@app.post("/scrape")
	async def scrape(url_request: URLRequest):
	    """Fetch a page and return its body text.

	    The page is requested through ``hrequests`` with ``browser='chrome'``.

	    Args:
	        url_request: Request body carrying the ``url`` to fetch.

	    Returns:
	        A dict of the form ``{"content": <response text>}``.

	    Raises:
	        HTTPException: Status 500 when the fetch fails; ``detail`` carries
	            the underlying error message.
	    """
	    try:
	        # hrequests.get is a synchronous call. Running it in the worker
	        # thread pool keeps this coroutine from stalling the event loop
	        # (and every other in-flight request) while the download runs.
	        response = await run_in_threadpool(
	            hrequests.get, url_request.url, browser='chrome'
	        )
	    except Exception as exc:
	        # Request boundary: report the failure to the client with a readable
	        # message. The status stays 500, matching what an unhandled error
	        # produced before.
	        raise HTTPException(
	            status_code=500, detail=f"Failed to fetch URL: {exc}"
	        ) from exc

	    return {"content": response.text}

	@app.get("/extract-article")
	def extract_article(
	    url: str,
	    record_id: Optional[str] = Query(None, description="Add an ID to the metadata."),
	    no_fallback: Optional[bool] = Query(False, description="Skip the backup extraction with readability-lxml and justext."),
	    favor_precision: Optional[bool] = Query(False, description="Prefer less text but correct extraction."),
	    favor_recall: Optional[bool] = Query(False, description="When unsure, prefer more text."),
	    include_comments: Optional[bool] = Query(True, description="Extract comments along with the main text."),
	    output_format: Optional[str] = Query('txt', description="Define an output format: 'csv', 'json', 'markdown', 'txt', 'xml', 'xmltei'.", enum=["csv", "json", "markdown", "txt", "xml", "xmltei"]),
	    target_language: Optional[str] = Query(None, description="Define a language to discard invalid documents (ISO 639-1 format)."),
	    include_tables: Optional[bool] = Query(True, description="Take into account information within the HTML <table> element."),
	    include_images: Optional[bool] = Query(False, description="Take images into account (experimental)."),
	    include_links: Optional[bool] = Query(False, description="Keep links along with their targets (experimental)."),
	    deduplicate: Optional[bool] = Query(False, description="Remove duplicate segments and documents."),
	    max_tree_size: Optional[int] = Query(None, description="Discard documents with too many elements.")
	):
	    """Download ``url`` and extract its main content with trafilatura.

	    Every query parameter other than ``url`` is forwarded unchanged to
	    ``trafilatura.extract`` under the same keyword name.

	    Returns:
	        ``{"article": <extracted content>}`` when extraction produced a
	        result, otherwise ``{"error": "Could not extract the article"}``.

	    Raises:
	        HTTPException: Status 500 when the page cannot be fetched;
	            ``detail`` carries the underlying error message.
	    """
	    try:
	        response = hrequests.get(url)
	    except Exception as exc:
	        # Request boundary: give the client a readable reason instead of an
	        # opaque internal error. Same handling as the /scrape endpoint.
	        raise HTTPException(
	            status_code=500, detail=f"Failed to fetch URL: {exc}"
	        ) from exc

	    extracted = trafilatura.extract(
	        response.text,
	        url=url,
	        record_id=record_id,
	        no_fallback=no_fallback,
	        favor_precision=favor_precision,
	        favor_recall=favor_recall,
	        include_comments=include_comments,
	        output_format=output_format,
	        target_language=target_language,
	        include_tables=include_tables,
	        include_images=include_images,
	        include_links=include_links,
	        deduplicate=deduplicate,
	        max_tree_size=max_tree_size,
	    )

	    if not extracted:
	        # Deliberately still a normal response with an "error" key, so
	        # clients that already check for that key keep working.
	        return {"error": "Could not extract the article"}
	    return {"article": extracted}

	# CORS policy for the whole app: any origin, method and header is accepted,
	# and credentialed requests are enabled.
	# NOTE(review): FastAPI's CORS documentation says wildcard values should not
	# be combined with allow_credentials=True — confirm whether credentials are
	# actually needed by any client of this API.
	_CORS_OPTIONS = {
	    "allow_origins": ["*"],
	    "allow_credentials": True,
	    "allow_methods": ["*"],
	    "allow_headers": ["*"],
	}
	app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)

	# Landing route: returns a static welcome payload whose message points
	# visitors at the hosted /docs page.
	@app.get("/", tags=["Home"])
	def api_home():
	    welcome = 'Welcome to Web-Scraping API! Visit https://pvanand-web-scraping.hf.space/docs to test'
	    return dict(detail=welcome)