# document-vqa-v2 / main.py
import fitz
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
from transformers import pipeline
from PIL import Image
from io import BytesIO
from starlette.middleware.cors import CORSMiddleware
description = """
## Image-based Document QA
This API performs document question answering using a LayoutLMv2-based model.
### Endpoints:
- **POST /uploadfile/:** Upload an image file to extract text and answer provided questions.
- **POST /pdfQA/:** Provide a PDF file to extract text and answer provided questions.
"""
# Create the app once, so the CORS middleware added below is not discarded
app = FastAPI(docs_url="/", description=description)
# Set up CORS middleware
origins = ["*"] # or specify your list of allowed origins
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Use a pipeline as a high-level helper
nlp_qa = pipeline("document-question-answering", model="tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa")
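# The /pdfQA/ endpoint below answers questions over plain text extracted from a
# PDF. The document-QA pipeline above expects an image input, so an extractive
# text question-answering pipeline is used for that route. This is a sketch:
# pipeline("question-answering") loads a default SQuAD-style checkpoint, and any
# other extractive QA model could be substituted here.
nlp_qa_text = pipeline("question-answering")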
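# Example client call (illustrative only; the host, file name, and questions are
# placeholders). Questions are sent as a single comma-separated form field, and
# each endpoint returns a JSON object mapping every question to its answer:
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/uploadfile/",
#       files={"file": open("invoice.png", "rb")},
#       data={"questions": "What is the invoice number?, What is the total amount?"},
#   )
#   print(resp.json())
#
# The /pdfQA/ route is called the same way with a PDF file instead of an image.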
@app.post("/uploadfile/", description="Upload an image file to extract text and answer provided questions.")
async def perform_document_qa(
    file: UploadFile = File(...),
    questions: str = Form(...),
):
    try:
        # Read the uploaded file as bytes
        contents = await file.read()

        # Open the image using PIL
        image = Image.open(BytesIO(contents))

        # Answer each comma-separated question with the LayoutLMv2-based pipeline
        # (the pipeline runs OCR on the image internally, so pytesseract must be available)
        answers_dict = {}
        for question in questions.split(','):
            question = question.strip()
            result = nlp_qa(image, question)

            # The pipeline returns a list of candidate answers; take the top one
            answer = result[0]['answer']

            # Strip stray brackets so list-style input ("[q1, q2]") yields clean keys
            answers_dict[question.strip("[]")] = answer

        return answers_dict
    except Exception as e:
        return JSONResponse(content=f"Error processing file: {str(e)}", status_code=500)
@app.post("/pdfQA/", description="Provide a PDF file to extract text and answer provided questions.")
async def pdf_question_answering(
    file: UploadFile = File(...),
    questions: str = Form(...),
):
    try:
        # Read the uploaded file as bytes
        contents = await file.read()

        # Use PyMuPDF to open the PDF from memory
        pdf_document = fitz.open(stream=contents, filetype="pdf")

        # Collect the text of every page into a single context string
        all_text = ""
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            print(f"Processing page {page_num + 1}...")
            all_text += page.get_text() + '\n'
        pdf_document.close()

        # Answer each comma-separated question against the extracted text.
        # The extractive text-QA pipeline is used here because the document-QA
        # pipeline expects an image, not a plain-text context.
        qa_dict = {}
        for question in questions.split(','):
            question = question.strip()
            result = nlp_qa_text({
                'question': question,
                'context': all_text
            })
            qa_dict[question] = result['answer']

        return qa_dict
    except Exception as e:
        return JSONResponse(content=f"Error processing PDF file: {str(e)}", status_code=500)
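# Minimal local-run entry point (a sketch: assumes uvicorn is installed; the
# host and port below are placeholders, and hosting platforms may start the
# app differently).
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)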