web-crawling

Running

App Files Files Community

web-crawling / file_conversion.py

pvanand

revert md request

495fb46 verified 2 months ago

raw

history blame contribute delete

3.64 kB

	from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks, Response
	from fastapi.responses import FileResponse
	from pydantic import BaseModel
	from pdf2docx import Converter
	import os
	import shutil
	import pdfkit
	import uuid

	# Router on which the conversion endpoints in this module are registered.
	router = APIRouter()

	# Directory for intermediate conversion files; created on demand by
	# ensure_temp_dir(). NOTE(review): this is an absolute path directly under
	# the filesystem root, so the process needs write access there.
	TEMP_DIR = "/.tempfiles"

	class HTMLRequest(BaseModel):
	    """Request body for the HTML conversion endpoints."""

	    # HTML markup that is handed to the PDF renderer.
	    html_content: str

	def ensure_temp_dir():
	    """Create TEMP_DIR (including missing parents) unless it already exists."""
	    os.makedirs(name=TEMP_DIR, exist_ok=True)

	def remove_file(path: str) -> None:
	    """Delete the file at *path*; a file that is already gone is not an error.

	    Args:
	        path: Filesystem path of the file to delete.

	    Raises:
	        OSError: For failures other than the file being absent (for example
	            when *path* is a directory or permissions forbid deletion).
	    """
	    # Unlink directly instead of checking os.path.exists() first: the file
	    # can disappear between the check and the delete when cleanups overlap,
	    # which would turn a harmless no-op into a FileNotFoundError.
	    try:
	        os.unlink(path)
	    except FileNotFoundError:
	        pass

	def generate_temp_filepath(extension: str) -> str:
	    """Build a path inside TEMP_DIR with a random UUID-based file name.

	    Only the path string is produced; no file is created.

	    Args:
	        extension: File extension without the leading dot.

	    Returns:
	        A path of the form ``<TEMP_DIR>/temp_<uuid4>.<extension>``.
	    """
	    unique_id = uuid.uuid4()
	    file_name = f"temp_{unique_id}.{extension}"
	    return os.path.join(TEMP_DIR, file_name)

	def html_to_pdf(html_content: str, output_path: str) -> None:
	    """Render an HTML string to a PDF file via pdfkit.

	    The page is rendered as A4 with a 0.75 inch margin on every side and
	    UTF-8 encoding.

	    Args:
	        html_content: HTML markup to render.
	        output_path: Destination path of the generated PDF.
	    """
	    margin = '0.75in'  # same margin on all four sides
	    render_options = {
	        'page-size': 'A4',
	        'margin-top': margin,
	        'margin-right': margin,
	        'margin-bottom': margin,
	        'margin-left': margin,
	        'encoding': "UTF-8",
	    }
	    pdfkit.from_string(html_content, output_path, options=render_options)

	def pdf_to_docx(pdf_path: str, docx_path: str) -> None:
	    """Convert the PDF at *pdf_path* into a DOCX file at *docx_path*.

	    Args:
	        pdf_path: Path of the source PDF.
	        docx_path: Path where the DOCX output is written.

	    Raises:
	        Exception: Whatever the pdf2docx converter raises is propagated.
	    """
	    cv = Converter(pdf_path)
	    try:
	        cv.convert(docx_path)
	    finally:
	        # Always release the converter, even when the conversion fails;
	        # otherwise it stays open and is never closed.
	        cv.close()

	def handle_conversion(convert_func, input_path: str, output_path: str, background_tasks: BackgroundTasks):
	    """Run a file conversion and return the result as a DOCX download.

	    Args:
	        convert_func: Callable invoked as ``convert_func(input_path, output_path)``.
	        input_path: Path of the temporary source file.
	        output_path: Path where the converted file is expected afterwards.
	        background_tasks: Task collection on which deletion of both
	            temporary files is scheduled.

	    Returns:
	        A FileResponse serving ``output_path`` with the DOCX media type.

	    Raises:
	        HTTPException: Status 500 when the conversion raises or produces no
	            output file; both temporary files are removed first.
	    """
	    temp_paths = (input_path, output_path)
	    try:
	        convert_func(input_path, output_path)
	        if not os.path.exists(output_path):
	            raise FileNotFoundError(f"Converted file not found: {output_path}")

	        # Deletion is scheduled as background work rather than done here,
	        # because the response still has to read the output file.
	        for temp_path in temp_paths:
	            background_tasks.add_task(remove_file, temp_path)

	        download_name = f"converted_document_{uuid.uuid4()}.docx"
	        return FileResponse(
	            output_path,
	            media_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document',
	            filename=download_name,
	        )
	    except Exception as exc:
	        # Nothing will be served, so clean up immediately.
	        for temp_path in temp_paths:
	            remove_file(temp_path)
	        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(exc)}")

	@router.post("/convert/pdf_to_docx")
	async def convert_pdf_to_docx(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
	    """Convert an uploaded PDF to DOCX and return it as a download.

	    Args:
	        background_tasks: Used to schedule removal of the temporary files.
	        file: The uploaded PDF; its name must end in ``.pdf`` (any case).

	    Returns:
	        The response produced by handle_conversion().

	    Raises:
	        HTTPException: Status 400 when the upload is not named like a PDF;
	            status 500 (from handle_conversion) when the conversion fails.
	    """
	    # The filename may be missing, and the extension may be upper-case;
	    # neither case should end in an AttributeError or a false rejection.
	    if not (file.filename or "").lower().endswith('.pdf'):
	        raise HTTPException(status_code=400, detail="File must be a PDF")

	    ensure_temp_dir()
	    pdf_temp_path = generate_temp_filepath("pdf")
	    # Swap only the extension; str.replace would rewrite every ".pdf"
	    # occurrence anywhere in the path.
	    docx_temp_path = os.path.splitext(pdf_temp_path)[0] + '.docx'

	    try:
	        with open(pdf_temp_path, "wb") as pdf_file:
	            shutil.copyfileobj(file.file, pdf_file)
	    except Exception:
	        # Do not leave a partially written upload behind.
	        remove_file(pdf_temp_path)
	        raise

	    # NOTE(review): the conversion runs synchronously inside this async
	    # endpoint, so the event loop is blocked while it runs.
	    return handle_conversion(pdf_to_docx, pdf_temp_path, docx_temp_path, background_tasks)

	@router.post("/convert/html_to_pdf")
	async def convert_html_to_pdf(request: HTMLRequest):
	    """Render the posted HTML to a PDF and return the PDF bytes.

	    The PDF is written to a temporary file, read back into memory and the
	    temporary file is removed before the response is returned.

	    Args:
	        request: Body containing the HTML markup to render.

	    Returns:
	        A Response with media type ``application/pdf``.

	    Raises:
	        HTTPException: Status 500 carrying the error text when rendering or
	            reading the file fails.
	    """
	    ensure_temp_dir()
	    tmp_pdf = generate_temp_filepath("pdf")

	    try:
	        html_to_pdf(request.html_content, tmp_pdf)
	        with open(tmp_pdf, "rb") as handle:
	            payload = handle.read()
	        # The bytes are in memory now, so the file is no longer needed.
	        remove_file(tmp_pdf)
	        return Response(content=payload, media_type="application/pdf")
	    except Exception as exc:
	        remove_file(tmp_pdf)
	        raise HTTPException(status_code=500, detail=str(exc))

	@router.post("/convert/html_to_docx")
	async def convert_html_to_docx(background_tasks: BackgroundTasks, request: HTMLRequest):
	    """Convert the posted HTML to DOCX by way of an intermediate PDF.

	    Args:
	        background_tasks: Used to schedule removal of the temporary files.
	        request: Body containing the HTML markup to convert.

	    Returns:
	        The response produced by handle_conversion().

	    Raises:
	        HTTPException: Status 500 when either the HTML-to-PDF step or the
	            PDF-to-DOCX step fails.
	    """
	    ensure_temp_dir()
	    pdf_temp_path = generate_temp_filepath("pdf")
	    # Swap only the extension; str.replace would rewrite every ".pdf"
	    # occurrence anywhere in the path.
	    docx_temp_path = os.path.splitext(pdf_temp_path)[0] + '.docx'

	    # Guard only the HTML-to-PDF step here. handle_conversion() cleans up
	    # and raises its own HTTPException; keeping it inside this try would
	    # catch that exception and wrap its message a second time.
	    try:
	        html_to_pdf(request.html_content, pdf_temp_path)
	    except Exception as e:
	        remove_file(pdf_temp_path)
	        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}") from e

	    return handle_conversion(pdf_to_docx, pdf_temp_path, docx_temp_path, background_tasks)