# web-crawling / file_conversion.py
# pvanand's picture
# revert md request
# 495fb46 verified
from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks, Response
from fastapi.responses import FileResponse
from pydantic import BaseModel
from pdf2docx import Converter
import os
import shutil
import pdfkit
import uuid
# Router that collects the file-conversion endpoints declared in this module.
router = APIRouter()
# Directory where intermediate PDF/DOCX files are written during a conversion.
# NOTE(review): this is an absolute path at the filesystem root — confirm the
# server process is allowed to create and write to it in every deployment.
TEMP_DIR = "/.tempfiles"
# Request body accepted by the HTML conversion endpoints in this module.
class HTMLRequest(BaseModel):
    # Raw HTML markup that will be rendered to a PDF.
    html_content: str
def ensure_temp_dir():
    """Create the temporary working directory, tolerating an existing one."""
    os.makedirs(name=TEMP_DIR, exist_ok=True)
def remove_file(path: str):
    """Delete the file at *path*; do nothing if it does not exist.

    The deletion is attempted directly instead of checking for existence
    first, so a file that disappears between the check and the unlink (for
    example because another cleanup already removed it) cannot raise.
    Dangling symlinks, which ``os.path.exists`` reports as missing, are
    removed as well.

    Args:
        path: Filesystem path of the file to delete.

    Raises:
        OSError: For failures other than the path being absent (e.g. the
            path is a directory or permissions are insufficient).
    """
    try:
        os.unlink(path)
    except (FileNotFoundError, NotADirectoryError):
        # Already gone (or never created): cleanup is best-effort by design.
        pass
def generate_temp_filepath(extension: str) -> str:
    """Return a fresh, UUID-based file path inside the temp directory.

    Args:
        extension: File extension to append, without the leading dot.

    Returns:
        A path of the form ``<TEMP_DIR>/temp_<uuid4>.<extension>``.
    """
    unique_name = f"temp_{uuid.uuid4()}.{extension}"
    return os.path.join(TEMP_DIR, unique_name)
def html_to_pdf(html_content: str, output_path: str) -> None:
    """Render an HTML string to a PDF file using pdfkit.

    The page is laid out as A4 with a 0.75 inch margin on every side and
    UTF-8 encoding.

    Args:
        html_content: HTML markup to render.
        output_path: Path the generated PDF is written to.
    """
    # NOTE(review): the endpoints in this file pass request-supplied HTML
    # here — confirm the renderer is configured to restrict local file and
    # network access.
    page_margin = '0.75in'
    render_options = {
        'page-size': 'A4',
        'margin-top': page_margin,
        'margin-right': page_margin,
        'margin-bottom': page_margin,
        'margin-left': page_margin,
        'encoding': "UTF-8",
    }
    pdfkit.from_string(html_content, output_path, options=render_options)
def pdf_to_docx(pdf_path: str, docx_path: str) -> None:
    """Convert the PDF at *pdf_path* into a DOCX file at *docx_path*.

    Args:
        pdf_path: Path of the source PDF.
        docx_path: Path the converted DOCX document is written to.

    Raises:
        Exception: Whatever the pdf2docx converter raises on failure; the
            converter is closed before the error propagates.
    """
    cv = Converter(pdf_path)
    try:
        cv.convert(docx_path)
    finally:
        # Always release the converter, even when the conversion fails.
        cv.close()
def handle_conversion(convert_func, input_path: str, output_path: str, background_tasks: BackgroundTasks):
    """Run a conversion and return the result as a DOCX download response.

    Args:
        convert_func: Callable invoked as ``convert_func(input_path,
            output_path)`` that must create the file at *output_path*.
        input_path: Path of the source file to convert.
        output_path: Path where the converted file is expected.
        background_tasks: Task collection used to schedule deletion of both
            files.

    Returns:
        A ``FileResponse`` serving *output_path* with the DOCX media type and
        a randomly suffixed download filename.

    Raises:
        HTTPException: Status 500 if the conversion raises or produces no
            output file. Both temp files are removed first, and the original
            error is chained as the cause.
    """
    try:
        convert_func(input_path, output_path)
        if not os.path.exists(output_path):
            raise FileNotFoundError(f"Converted file not found: {output_path}")
        # The output file must still exist while the response is served, so
        # deletion is deferred to background tasks instead of done inline.
        background_tasks.add_task(remove_file, input_path)
        background_tasks.add_task(remove_file, output_path)
        return FileResponse(
            output_path,
            media_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            filename=f"converted_document_{uuid.uuid4()}.docx"
        )
    except Exception as e:
        # No response will be served on failure, so clean up immediately.
        remove_file(input_path)
        remove_file(output_path)
        raise HTTPException(status_code=500, detail=f"Conversion failed: {e}") from e
@router.post("/convert/pdf_to_docx")
async def convert_pdf_to_docx(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
    """Convert an uploaded PDF to DOCX and return it as a file download.

    Args:
        background_tasks: Task collection handed to ``handle_conversion`` for
            deferred temp-file cleanup.
        file: The uploaded file; its name must end in ``.pdf`` (any case).

    Returns:
        The response produced by ``handle_conversion``.

    Raises:
        HTTPException: Status 400 if the upload has no ``.pdf`` filename;
            status 500 (from ``handle_conversion``) if the conversion fails.
    """
    # The filename can be missing; treat that like a non-PDF upload rather
    # than failing with an AttributeError. Accept ".PDF" as well as ".pdf".
    upload_name = file.filename or ""
    if not upload_name.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="File must be a PDF")
    ensure_temp_dir()
    pdf_temp_path = generate_temp_filepath("pdf")
    # Swap only the suffix; str.replace would touch every ".pdf" in the path.
    docx_temp_path = os.path.splitext(pdf_temp_path)[0] + '.docx'
    try:
        with open(pdf_temp_path, "wb") as pdf_file:
            shutil.copyfileobj(file.file, pdf_file)
    except Exception:
        # Do not leave a partially written upload behind.
        remove_file(pdf_temp_path)
        raise
    # NOTE(review): the conversion below runs synchronously inside this
    # coroutine — confirm that blocking the event loop is acceptable here.
    return handle_conversion(pdf_to_docx, pdf_temp_path, docx_temp_path, background_tasks)
@router.post("/convert/html_to_pdf")
async def convert_html_to_pdf(request: HTMLRequest):
    """Render the HTML in the request body to a PDF and return its bytes.

    Args:
        request: Request body carrying the HTML markup in ``html_content``.

    Returns:
        A ``Response`` whose body is the PDF content, with media type
        ``application/pdf``.

    Raises:
        HTTPException: Status 500 with the error text as detail if rendering
            or reading the PDF fails; the original error is chained.
    """
    ensure_temp_dir()
    pdf_temp_path = generate_temp_filepath("pdf")
    try:
        html_to_pdf(request.html_content, pdf_temp_path)
        with open(pdf_temp_path, "rb") as pdf_file:
            pdf_content = pdf_file.read()
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # The PDF is fully read into memory (or failed), so the temp file is
        # no longer needed on either path.
        remove_file(pdf_temp_path)
    return Response(content=pdf_content, media_type="application/pdf")
@router.post("/convert/html_to_docx")
async def convert_html_to_docx(background_tasks: BackgroundTasks, request: HTMLRequest):
    """Convert the HTML in the request body to DOCX via an intermediate PDF.

    Args:
        background_tasks: Task collection handed to ``handle_conversion`` for
            deferred temp-file cleanup.
        request: Request body carrying the HTML markup in ``html_content``.

    Returns:
        The response produced by ``handle_conversion``.

    Raises:
        HTTPException: Status 500 if either the HTML-to-PDF or the
            PDF-to-DOCX step fails.
    """
    ensure_temp_dir()
    pdf_temp_path = generate_temp_filepath("pdf")
    # Swap only the suffix; str.replace would touch every ".pdf" in the path.
    docx_temp_path = os.path.splitext(pdf_temp_path)[0] + '.docx'
    try:
        html_to_pdf(request.html_content, pdf_temp_path)
        return handle_conversion(pdf_to_docx, pdf_temp_path, docx_temp_path, background_tasks)
    except HTTPException:
        # handle_conversion has already cleaned up and built the error
        # response; re-wrapping it would duplicate the message prefix.
        raise
    except Exception as e:
        remove_file(pdf_temp_path)
        remove_file(docx_temp_path)
        raise HTTPException(status_code=500, detail=f"Conversion failed: {e}") from e