import re
from io import BytesIO

import fitz  # PyMuPDF
import torch
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
from PIL import Image
from starlette.middleware.cors import CORSMiddleware
from transformers import DonutProcessor, VisionEncoderDecoderModel, pipeline

app = FastAPI()

# Donut model for visual question answering over document images (/donutQA/)
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Extractive QA pipeline used by /pdfQA/. The original file referenced `nlp_qa`
# without defining it; the pipeline's default question-answering model is assumed here.
nlp_qa = pipeline("question-answering")

@app.post("/donutQA/")
async def donut_question_answering(
    file: UploadFile = File(...),
    questions: str = Form(...),
):
    try:
        # Read the uploaded file as bytes
        contents = await file.read()

        # Open the image with PIL and convert to RGB so the Donut processor
        # always receives a 3-channel image
        image = Image.open(BytesIO(contents)).convert("RGB")

        # Split the comma-separated questions into a list, trimming stray whitespace
        question_list = [q.strip() for q in questions.split(',')]

        # Process document with Donut model for each question
        answers = process_document(image, question_list)

        # Return a dictionary with questions and corresponding answers
        result_dict = dict(zip(question_list, answers))
        return result_dict

    except Exception as e:
        return {"error": f"Error processing file: {str(e)}"}

def process_document(image, questions):
    # prepare encoder inputs
    pixel_values = processor(image, return_tensors="pt").pixel_values
    
    # prepare decoder inputs
    task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
    
    # Initialize a list to store answers for each question
    answers = []

    # Process each question
    for question in questions:
        prompt = task_prompt.replace("{user_input}", question)
        decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
              
        # generate answer
        outputs = model.generate(
            pixel_values.to(device),
            decoder_input_ids=decoder_input_ids.to(device),
            max_length=model.decoder.config.max_position_embeddings,
            early_stopping=True,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
            use_cache=True,
            num_beams=1,
            bad_words_ids=[[processor.tokenizer.unk_token_id]],
            return_dict_in_generate=True,
        )
        
        # postprocess
        sequence = processor.batch_decode(outputs.sequences)[0]
        sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
        sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token

        # Append the answer to the list
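        # token2json turns the generated tag sequence into a dict; for this DocVQA
        # checkpoint that is typically of the form {"question": ..., "answer": ...}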
        answers.append(processor.token2json(sequence))

    return answers

@app.post("/pdfQA/")
async def pdf_question_answering(
    file: UploadFile = File(...),
    questions: str = Form(...),
):
    try:
        # Read the uploaded file as bytes
        contents = await file.read()

        # Initialize an empty string to store the text content of the PDF
        all_text = ""

        # Use PyMuPDF to open the PDF from the uploaded bytes
        pdf_document = fitz.open(stream=contents, filetype="pdf")

        # Loop through each page and extract its text layer (no OCR is performed)
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            print(f"Processing page {page_num + 1}...")
            text = page.get_text()
            all_text += text + '\n'

        # Log the collected text (useful when debugging extraction issues)
        print(all_text)

        # Split the comma-separated questions into a list, trimming stray whitespace
        question_list = [q.strip() for q in questions.split(',')]

        # Initialize an empty dictionary to store questions and answers
        qa_dict = {}

        # Get answers for each question with the same context
        for question in question_list:
            result = nlp_qa({
                'question': question,
                'context': all_text
            })

            # Access the 'answer' key from the result
            answer = result['answer']

            # Store the question and answer in the dictionary
            qa_dict[question] = answer

        return qa_dict

    except Exception as e:
        return JSONResponse(content=f"Error processing PDF file: {str(e)}", status_code=500)

# Set up CORS middleware
origins = ["*"]  # or specify your list of allowed origins
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
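
# --- Usage sketch (not part of the original file; the module/file name and the
# sample file names below are assumptions for illustration) ---
#
# Run the API locally, assuming this file is saved as app.py:
#   uvicorn app:app --host 0.0.0.0 --port 8000
#
# Ask questions about a document image via the Donut endpoint:
#   curl -X POST http://localhost:8000/donutQA/ \
#        -F "file=@invoice.png" \
#        -F "questions=What is the invoice number?,What is the total amount?"
#
# Ask questions about a text-based PDF via the extractive QA endpoint:
#   curl -X POST http://localhost:8000/pdfQA/ \
#        -F "file=@report.pdf" \
#        -F "questions=Who is the author?,What is the publication date?"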