"""FastAPI service answering questions about uploaded images and PDF files.

Both endpoints feed page images to a LayoutLMv2 document-question-answering
pipeline and return a JSON object mapping each question to its answer.
"""
import logging
from io import BytesIO
from typing import Any, Dict, List, Optional

import fitz
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
from transformers import pipeline
from PIL import Image
from starlette.middleware import Middleware
from starlette.middleware.cors import CORSMiddleware

logger = logging.getLogger(__name__)

description = """
## Image-based Document QA

This API performs document question answering using a LayoutLMv2-based model.

### Endpoints:
- **POST /uploadfile/:** Upload an image file to extract text and answer provided questions.
- **POST /pdfQA/:** Provide a PDF file to extract text and answer provided questions.
"""

# Create the application exactly once. Previously a second FastAPI() instance
# replaced the one the CORS middleware had been attached to, so the routes
# were served without any CORS handling.
app = FastAPI(docs_url="/", description=description)

# Set up CORS middleware
# NOTE(review): a wildcard origin combined with allow_credentials=True is very
# permissive; confirm this is intended or list the allowed origins explicitly.
origins = ["*"]  # or specify your list of allowed origins
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Use a pipeline as a high-level helper (loaded once at import time).
nlp_qa = pipeline(
    "document-question-answering",
    model="tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa",
)

# Zoom factor applied when rasterising PDF pages, so the rendered image is
# larger than PyMuPDF's default page size before it is handed to the model.
_PDF_RENDER_ZOOM = 2.0


def _parse_questions(raw: str) -> List[str]:
    """Split a comma-separated question string into clean, non-empty questions.

    Surrounding whitespace and list-style square brackets are removed from
    every item, so ``"[q1, q2]"`` yields ``["q1", "q2"]``.
    """
    cleaned = (part.strip().strip("[]").strip() for part in raw.split(","))
    return [question for question in cleaned if question]


def _best_answer(result: Any) -> Optional[Dict[str, Any]]:
    """Return the first answer dict of a pipeline result, or None if empty.

    The original code indexed the result both as a list and as a dict, so
    both shapes are accepted here.
    """
    if isinstance(result, dict):
        return result
    return result[0] if result else None


def _answer_image_questions(image: Image.Image, questions: List[str]) -> Dict[str, str]:
    """Ask every question against a single image and map question -> answer."""
    answers: Dict[str, str] = {}
    for question in questions:
        top = _best_answer(nlp_qa(image, question))
        # An empty result means the model produced no answer span.
        answers[question] = top["answer"] if top else ""
    return answers


def _answer_pdf_questions(pdf_bytes: bytes, questions: List[str]) -> Dict[str, str]:
    """Ask every question against each PDF page and keep the best-scoring answer.

    Pages are rendered one at a time so only a single page image is held in
    memory, and the document is closed even if inference fails.
    """
    best: Dict[str, Any] = {question: ("", float("-inf")) for question in questions}
    zoom = fitz.Matrix(_PDF_RENDER_ZOOM, _PDF_RENDER_ZOOM)

    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
        for page_num in range(pdf_document.page_count):
            logger.info("Processing page %d...", page_num + 1)
            page = pdf_document.load_page(page_num)
            # alpha=False keeps the pixmap at three channels to match "RGB".
            pix = page.get_pixmap(matrix=zoom, alpha=False)
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            for question in questions:
                top = _best_answer(nlp_qa(image, question))
                if top is None:
                    continue
                score = top.get("score", 0.0)
                if score > best[question][1]:
                    best[question] = (top["answer"], score)

    return {question: answer for question, (answer, _score) in best.items()}


@app.post(
    "/uploadfile/",
    description="Upload an image file to extract text and answer provided questions.",
)
async def perform_document_qa(
    file: UploadFile = File(...),
    questions: str = Form(...),
):
    """Answer comma-separated questions about an uploaded image.

    Args:
        file: The uploaded image file.
        questions: Comma-separated questions; brackets and whitespace around
            each question are ignored.

    Returns:
        A dict mapping each cleaned question to the model's answer, a 400
        response when no usable question was supplied, or a 500 response
        carrying the error message if processing fails.
    """
    question_list = _parse_questions(questions)
    if not question_list:
        return JSONResponse(content="No questions provided.", status_code=400)

    try:
        # Read the uploaded file as bytes and open it with PIL.
        contents = await file.read()
        image = Image.open(BytesIO(contents))
        # NOTE: inference is synchronous and runs inside this coroutine.
        return _answer_image_questions(image, question_list)
    except Exception as e:  # endpoint boundary: report instead of crashing
        logger.exception("Error processing file")
        return JSONResponse(content=f"Error processing file: {e}", status_code=500)


@app.post(
    "/pdfQA/",
    description="Provide a PDF file to extract text and answer provided questions.",
)
async def pdf_question_answering(
    file: UploadFile = File(...),
    questions: str = Form(...),
):
    """Answer comma-separated questions about an uploaded PDF.

    Each page is rendered to an image and queried with the same pipeline used
    for image uploads; for every question the highest-scoring answer across
    all pages is returned.

    Args:
        file: The uploaded PDF file.
        questions: Comma-separated questions; brackets and whitespace around
            each question are ignored.

    Returns:
        A dict mapping each cleaned question to the best answer found, a 400
        response when no usable question was supplied, or a 500 response
        carrying the error message if processing fails.
    """
    question_list = _parse_questions(questions)
    if not question_list:
        return JSONResponse(content="No questions provided.", status_code=400)

    try:
        # Read the uploaded file as bytes.
        contents = await file.read()
        # NOTE: inference is synchronous and runs inside this coroutine.
        return _answer_pdf_questions(contents, question_list)
    except Exception as e:  # endpoint boundary: report instead of crashing
        logger.exception("Error processing PDF file")
        return JSONResponse(content=f"Error processing PDF file: {e}", status_code=500)