|
from transformers import NougatProcessor, VisionEncoderDecoderModel |
|
import gradio as gr |
|
from pdf2image import convert_from_path |
|
|
|
|
|
# Hugging Face Hub checkpoint shared by the processor and the model.
MODEL_ID = "MohamedRashad/arabic-small-nougat"

# Device used for inference. Inputs are moved here in extract_text_from_image,
# so the model is placed on the same device to keep the two in sync.
device = "cpu"

# Upper bound on the number of tokens generated per page.
context_length = 2048

processor = NougatProcessor.from_pretrained(MODEL_ID)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID).to(device)
|
|
|
def extract_text_from_image(image) -> str:
    """
    Extract text from a PIL image with the Nougat model.

    Args:
        image (PIL.Image | None): Input image. ``None`` (e.g. the form was
            submitted without an image) is accepted and yields no text.

    Returns:
        str: Extracted text from the image, or an empty string when no
            image was supplied.
    """
    # Nothing to transcribe: return early instead of failing in the processor.
    if image is None:
        return ""

    pixel_values = processor(image, return_tensors="pt").pixel_values

    outputs = model.generate(
        pixel_values.to(device),
        min_length=1,
        max_new_tokens=context_length,
        # Ban the tokenizer's unknown token from the generated sequence.
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
    )

    # A single image was passed in, so only the first decoded sequence is used.
    page_sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    return processor.post_process_generation(page_sequence, fix_markdown=False)
|
|
|
def extract_text_from_pdf(pdf_path, progress=gr.Progress()) -> str:
    """
    Extract text from every page of a PDF.

    Args:
        pdf_path (str | None): Path to the PDF file. A missing or empty path
            (e.g. the form was submitted without a file) yields no text.
        progress (gr.Progress): Progress bar. The ``gr.Progress()`` default is
            intentional: it is how the progress tracker is supplied to the
            handler, so it must not be replaced with ``None``.

    Returns:
        str: Text of all pages joined with newlines, or an empty string when
            no PDF was supplied.
    """
    # Nothing to convert: return early instead of failing in pdf2image.
    if not pdf_path:
        return ""

    progress(0, desc="Starting...")

    # Rasterize the PDF into one image per page, then OCR the pages in order.
    images = convert_from_path(pdf_path)
    texts = [extract_text_from_image(page) for page in progress.tqdm(images)]

    return "\n".join(texts)
|
|
|
# NOTE(review): the source's indentation was lost; the nesting below (inputs in
# a column, output beside it in the same row) is reconstructed — confirm.
with gr.Blocks(title="Arabic Small Nougat") as demo:
    gr.HTML("<h1 style='text-align: center'>Arabic End-to-End Structured OCR for textbooks</h1>")

    # Tab 1: OCR a single uploaded image.
    with gr.Tab("Extract Text from Image"):
        with gr.Row():
            with gr.Column():
                image = gr.Image(label="Input Image", type="pil")
                image_submit_button = gr.Button(value="Submit", variant="primary")
            image_output = gr.Markdown(label="Output Markdown", rtl=True)
        image_submit_button.click(
            extract_text_from_image,
            inputs=[image],
            outputs=image_output,
        )

    # Tab 2: OCR every page of an uploaded PDF.
    with gr.Tab("Extract Text from PDF"):
        with gr.Row():
            with gr.Column():
                pdf = gr.File(label="Input PDF", type="filepath")
                pdf_submit_button = gr.Button(value="Submit", variant="primary")
            pdf_output = gr.Markdown(label="Output Markdown", rtl=True)
        pdf_submit_button.click(
            extract_text_from_pdf,
            inputs=[pdf],
            outputs=pdf_output,
        )

# Enable request queuing, then start the app without a public share link.
demo.queue().launch(share=False)
|
|