# Spaces: Running on Zero
from transformers import NougatProcessor, VisionEncoderDecoderModel | |
import gradio as gr | |
import torch | |
from pdf2image import convert_from_path | |
# Load the model and processor once at import time so every request reuses them.
_CHECKPOINT = "MohamedRashad/arabic-small-nougat"
processor = NougatProcessor.from_pretrained(_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_CHECKPOINT)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Keep the weights on the same device the inputs are sent to; a CPU model fed
# CUDA tensors fails inside generate() with a device-mismatch error.
model.to(device)

# Passed as max_new_tokens when generating the transcription of one image.
context_length = 2048
def extract_text_from_image(image):
    """
    Extract text from PIL image

    Args:
        image (PIL.Image): Input image

    Returns:
        str: Extracted text from the image

    Raises:
        gr.Error: If no image was provided
    """
    # The Gradio image input yields None when the user submits without
    # uploading anything; report that clearly instead of failing in the processor.
    if image is None:
        raise gr.Error("Please provide an image first.")

    # prepare the image for the model
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # generate transcription; the inputs follow the model's own device so the
    # two can never end up on different devices
    outputs = model.generate(
        pixel_values.to(model.device),
        min_length=1,
        max_new_tokens=context_length,
        # never let the model emit the tokenizer's unknown token
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
    )

    # a single image was passed in, so only the first decoded sequence matters
    page_sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    return processor.post_process_generation(page_sequence, fix_markdown=False)
def extract_text_from_pdf(pdf_path, progress=gr.Progress()):
    """
    Extract text from PDF

    Args:
        pdf_path (str): Path to the PDF file
        progress (gr.Progress): Progress bar (the default instance is how
            Gradio injects its progress tracker into the handler)

    Returns:
        str: Extracted text from the PDF, one page after another, joined
            with newlines

    Raises:
        gr.Error: If no PDF file was provided
    """
    # Submitting without a file gives an empty path; stop early with a
    # readable message instead of failing inside pdf2image.
    if not pdf_path:
        raise gr.Error("Please provide a PDF file first.")

    progress(0, desc="Starting...")

    # Render every page of the PDF to a PIL image.
    images = convert_from_path(pdf_path)

    # Transcribe page by page; progress.tqdm advances the progress bar per page.
    texts = [extract_text_from_image(image) for image in progress.tqdm(images)]
    return "\n".join(texts)
# Markdown shown at the top of the demo page. Blank lines separate the intro,
# the bullet list and the note so the note is rendered as its own paragraph
# instead of being absorbed into the last bullet.
model_description = """
This is a demo for the Arabic Small Nougat model. It is an end-to-end OCR model that can extract text from images and PDFs.

- The model is trained on the [Khatt dataset](https://huggingface.co/datasets/Fakhraddin/khatt) and custom made dataset.
- The model is a finetune of [facebook/nougat-small](https://huggingface.co/facebook/nougat-small) model.

**Note**: The model is a prototype in my book and may not work well on all types of images and PDFs. **Check the output carefully before using it for any serious work.**
"""
# Build the two-tab Gradio UI: one tab for a single image, one for a whole PDF.
# NOTE(review): the nesting of the layout blocks below was reconstructed from
# the statement order; confirm it matches the intended layout.
with gr.Blocks(title="Arabic Small Nougat") as demo:
    gr.HTML("<h1 style='text-align: center'>Arabic End-to-End Structured OCR for textbooks</h1>")
    gr.Markdown(model_description)

    with gr.Tab("Extract Text from Image"):
        with gr.Row():
            with gr.Column():
                image = gr.Image(label="Input Image", type="pil")
                image_submit_button = gr.Button(value="Submit", variant="primary")
            # Each tab gets its own distinctly named output component so the
            # second tab does not rebind the name used by the first.
            image_output = gr.Markdown(label="Output Markdown", rtl=True)
        image_submit_button.click(extract_text_from_image, inputs=[image], outputs=image_output)

    with gr.Tab("Extract Text from PDF"):
        with gr.Row():
            with gr.Column():
                pdf = gr.File(label="Input PDF", type="filepath")
                pdf_submit_button = gr.Button(value="Submit", variant="primary")
            pdf_output = gr.Markdown(label="Output Markdown", rtl=True)
        pdf_submit_button.click(extract_text_from_pdf, inputs=[pdf], outputs=pdf_output)

# Only start the server when run as a script, so importing this module (for
# tests or reuse of `demo`) does not launch the app as a side effect.
if __name__ == "__main__":
    demo.queue().launch(share=False)