from fastapi import FastAPI, HTTPException, Query, Depends
from fastapi.responses import Response
from pydantic import BaseModel
import hrequests
import trafilatura
from fastapi.middleware.cors import CORSMiddleware
from typing import Optional
from pytrends.request import TrendReq
from datetime import datetime, timedelta
from fastapi_cache import FastAPICache
from fastapi_cache.backends.inmemory import InMemoryBackend
from fastapi_cache.decorator import cache
import pdfkit
app = FastAPI()
class URLRequest(BaseModel):
    url: str

@app.post("/scrape")
async def scrape(url_request: URLRequest):
    """Fetch a URL through hrequests with the Chrome browser profile and return the raw HTML."""
    try:
        response = hrequests.get(url_request.url, browser='chrome')
        return {"content": response.text}
    except Exception as e:
        # Surface the failure as an HTTP 500 with the error message, matching the other endpoints
        raise HTTPException(status_code=500, detail=str(e))
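# Example client call for POST /scrape (a sketch, not part of the app: the host is the
# Space URL advertised by the home endpoint, and example.com is a placeholder):
#
#   import requests
#   resp = requests.post(
#       "https://pvanand-web-scraping.hf.space/scrape",
#       json={"url": "https://example.com"},
#   )
#   print(resp.json()["content"][:200])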
@app.get("/extract-article")
def extract_article(
url: str,
record_id: Optional[str] = Query(None, description="Add an ID to the metadata."),
no_fallback: Optional[bool] = Query(False, description="Skip the backup extraction with readability-lxml and justext."),
favor_precision: Optional[bool] = Query(False, description="Prefer less text but correct extraction."),
favor_recall: Optional[bool] = Query(False, description="When unsure, prefer more text."),
include_comments: Optional[bool] = Query(True, description="Extract comments along with the main text."),
output_format: Optional[str] = Query('txt', description="Define an output format: 'csv', 'json', 'markdown', 'txt', 'xml', 'xmltei'.", enum=["csv", "json", "markdown", "txt", "xml", "xmltei"]),
target_language: Optional[str] = Query(None, description="Define a language to discard invalid documents (ISO 639-1 format)."),
include_tables: Optional[bool] = Query(True, description="Take into account information within the HTML <table> element."),
include_images: Optional[bool] = Query(False, description="Take images into account (experimental)."),
include_links: Optional[bool] = Query(False, description="Keep links along with their targets (experimental)."),
deduplicate: Optional[bool] = Query(False, description="Remove duplicate segments and documents."),
max_tree_size: Optional[int] = Query(None, description="Discard documents with too many elements.")
):
response = hrequests.get(url)
filecontent = response.text
extracted = trafilatura.extract(
filecontent,
url=url,
record_id=record_id,
no_fallback=no_fallback,
favor_precision=favor_precision,
favor_recall=favor_recall,
include_comments=include_comments,
output_format=output_format,
target_language=target_language,
include_tables=include_tables,
include_images=include_images,
include_links=include_links,
deduplicate=deduplicate,
max_tree_size=max_tree_size
)
if extracted:
return {"article": trafilatura.utils.sanitize(extracted)}
else:
return {"error": "Could not extract the article"}
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Shared pytrends client for Google Trends queries
pytrends = TrendReq()

@app.on_event("startup")
async def startup():
    # Initialise the in-memory cache backend used by the @cache decorator below
    FastAPICache.init(InMemoryBackend(), prefix="fastapi-cache")

@app.get("/realtime_trending_searches")
@cache(expire=3600)  # cache responses for one hour per country code
async def get_realtime_trending_searches(pn: str = Query('US', description="Country code for trending searches")):
    trending_searches = pytrends.realtime_trending_searches(pn=pn)
    return trending_searches.to_dict(orient='records')
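# Example client call for GET /realtime_trending_searches (a sketch; responses are
# cached in memory for an hour, so repeated calls with the same country code are cheap):
#
#   import requests
#   resp = requests.get(
#       "https://pvanand-web-scraping.hf.space/realtime_trending_searches",
#       params={"pn": "US"},
#   )
#   print(resp.json()[:3])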
@app.get("/", tags=["Home"])
def api_home():
return {'detail': 'Welcome to Web-Scraping API! Visit https://pvanand-web-scraping.hf.space/docs to test'}
class HTMLRequest(BaseModel):
    html_content: str

@app.post("/html_to_pdf")
async def convert_to_pdf(request: HTMLRequest):
    """Render an HTML string to PDF via pdfkit (a wrapper around wkhtmltopdf)."""
    try:
        options = {
            'page-size': 'A4',
            'margin-top': '0.75in',
            'margin-right': '0.75in',
            'margin-bottom': '0.75in',
            'margin-left': '0.75in',
            'encoding': "UTF-8",
        }
        # from_string with output_path=False returns the PDF as bytes
        pdf = pdfkit.from_string(request.html_content, False, options=options)
        return Response(content=pdf, media_type="application/pdf")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
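# Example client call for POST /html_to_pdf (a sketch; pdfkit drives the external
# wkhtmltopdf binary, which must be installed in the Space image for this to work):
#
#   import requests
#   resp = requests.post(
#       "https://pvanand-web-scraping.hf.space/html_to_pdf",
#       json={"html_content": "<h1>Hello</h1><p>PDF test</p>"},
#   )
#   open("out.pdf", "wb").write(resp.content)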
# FastAPI, HTTPException, Response and BaseModel are already imported above; only the
# DOCX converter and os are new dependencies for the endpoints below.
from html4docx import HtmlToDocx
import os

class HTMLInput(BaseModel):
    html: str

# Define the path to the temporary folder and make sure it exists
TEMP_FOLDER = "/app/temp"
os.makedirs(TEMP_FOLDER, exist_ok=True)
@app.post("/convert")
async def convert_html_to_docx(input_data: HTMLInput):
temp_filename = None
try:
# Create a new HtmlToDocx parser
parser = HtmlToDocx()
# Parse the HTML string to DOCX
docx = parser.parse_html_string(input_data.html)
# Create a unique filename in the temporary folder
temp_filename = os.path.join(TEMP_FOLDER, f"temp_{os.urandom(8).hex()}.docx")
# Save the DOCX to the temporary file
docx.save(temp_filename)
# Open the file and read its contents
with open(temp_filename, 'rb') as file:
file_contents = file.read()
# Return the DOCX file as a response
return Response(
content=file_contents,
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
headers={"Content-Disposition": "attachment; filename=converted.docx"}
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
finally:
# Clean up: remove the temporary file
if temp_filename and os.path.exists(temp_filename):
os.remove(temp_filename)
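# Example client call for POST /convert (a sketch; the body uses the HTMLInput
# model, i.e. a single "html" field):
#
#   import requests
#   resp = requests.post(
#       "https://pvanand-web-scraping.hf.space/convert",
#       json={"html": "<h1>Hello</h1><p>DOCX test</p>"},
#   )
#   open("converted.docx", "wb").write(resp.content)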
@app.post("/html_to_docx")
async def convert_html_to_docx(input_data: HTMLRequest):
temp_filename = None
try:
# Create a new HtmlToDocx parser
parser = HtmlToDocx()
# Parse the HTML string to DOCX
docx = parser.parse_html_string(input_data.html_content)
# Create a unique filename in the temporary folder
temp_filename = os.path.join(TEMP_FOLDER, f"temp_{os.urandom(8).hex()}.docx")
# Save the DOCX to the temporary file
docx.save(temp_filename)
# Open the file and read its contents
with open(temp_filename, 'rb') as file:
file_contents = file.read()
# Return the DOCX file as a response
return Response(
content=file_contents,
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
headers={"Content-Disposition": "attachment; filename=converted.docx"}
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
finally:
# Clean up: remove the temporary file
if temp_filename and os.path.exists(temp_filename):
os.remove(temp_filename)
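# Example client call for POST /html_to_docx (a sketch; identical to /convert except
# that the request body uses the HTMLRequest model's "html_content" field):
#
#   import requests
#   resp = requests.post(
#       "https://pvanand-web-scraping.hf.space/html_to_docx",
#       json={"html_content": "<h1>Hello</h1>"},
#   )
#   open("converted.docx", "wb").write(resp.content)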
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)