Spaces:

akhil-vaidya
/

GOT-OCR

Running

App Files Files Community

akhil-vaidya committed on Sep 26, 2024

Commit

c542962

0 Parent(s):

commit message

Browse files

Files changed (6) hide show

.devcontainer/devcontainer.json +33 -0
.github/workflows/sync_to_huggingface.yml +20 -0
README.md +11 -0
app.py +134 -0
archive/qwen_test.ipynb +324 -0
requirements.txt +8 -0

.devcontainer/devcontainer.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "name": "Python 3",
+  // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
+  "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
+  "customizations": {
+    "codespaces": {
+      "openFiles": [
+        "README.md",
+        "app.py"
+      ]
+    },
+    "vscode": {
+      "settings": {},
+      "extensions": [
+        "ms-python.python",
+        "ms-python.vscode-pylance"
+      ]
+    }
+  },
+  "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
+  "postAttachCommand": {
+    "server": "streamlit run app.py --server.enableCORS false --server.enableXsrfProtection false"
+  },
+  "portsAttributes": {
+    "8501": {
+      "label": "Application",
+      "onAutoForward": "openPreview"
+    }
+  },
+  "forwardPorts": [
+    8501
+  ]
+}

.github/workflows/sync_to_huggingface.yml ADDED Viewed

	@@ -0,0 +1,20 @@

+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [main]
+  # to run this workflow manually from the Actions tab
+  workflow_dispatch:
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          lfs: true
+      - name: Push to hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: git push --force https://akhil-vaidya:$HF_TOKEN@huggingface.co/spaces/akhil-vaidya/GOT-OCR main

README.md ADDED Viewed

	@@ -0,0 +1,11 @@

+---
+title: GOT OCR
+emoji: 👀
+colorFrom: green
+colorTo: indigo
+sdk: streamlit
+sdk_version: 1.38.0
+app_file: app.py
+pinned: false
+license: mit
+---

app.py ADDED Viewed

	@@ -0,0 +1,134 @@

+from transformers import AutoModel, AutoTokenizer, Qwen2VLForConditionalGeneration, AutoProcessor, MllamaForConditionalGeneration
+import streamlit as st
+import os
+from PIL import Image
+import requests
+import torch
+from torchvision import io
+from typing import Dict
+import base64
+import random
+def init_model():
+    tokenizer = AutoTokenizer.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True)
+    model = AutoModel.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True, use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
+    model = model.eval()
+    return model, tokenizer
+def init_gpu_model():
+    tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
+    model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
+    model = model.eval().cuda()
+    return model, tokenizer
+def init_qwen_model():
+    model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", device_map="cpu", torch_dtype=torch.float16)
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+    return model, processor
+def get_quen_op(image_file, model, processor):
+    try:
+        image = Image.open(image_file).convert('RGB')
+        conversation = [
+            {
+                "role":"user",
+                "content":[
+                    {
+                        "type":"image",
+                    },
+                    {
+                        "type":"text",
+                        "text":"Extract text from this image."
+                    }
+                ]
+            }
+        ]
+        text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+        inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
+        inputs = {k: v.to(torch.float32) if torch.is_floating_point(v) else v for k, v in inputs.items()}
+        generation_config = {
+            "max_new_tokens": 32,
+            "do_sample": False,
+            "top_k": 20,
+            "top_p": 0.90,
+            "temperature": 0.4,
+            "num_return_sequences": 1,
+            "pad_token_id": processor.tokenizer.pad_token_id,
+            "eos_token_id": processor.tokenizer.eos_token_id,
+        }
+        output_ids = model.generate(**inputs, **generation_config)
+        if 'input_ids' in inputs:
+                generated_ids = output_ids[:, inputs['input_ids'].shape[1]:]
+        else:
+            generated_ids = output_ids
+        output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+        return output_text[:] if output_text else "No text extracted from the image."
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
+def init_llama():
+    model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+    model = MllamaForConditionalGeneration.from_pretrained(
+        model_id,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+        token=os.getenv("access_token")
+    )
+    processor = AutoProcessor.from_pretrained(model_id, token=os.getenv("access_token"))
+    return model, processor
+def get_llama_op(image_file, model, processor):
+    with open(image_file, "rb") as f:
+        image = base64.b64encode(f.read()).decode('utf-8')
+    messages = [
+        {"role": "user", "content": [
+            {"type": "image"},
+            {"type": "text", "text": "You are an accurate OCR engine. From the given image, extract the text."}
+        ]}
+    ]
+    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(image, input_text, return_tensors="pt").to(model.device)
+    output = model.generate(**inputs, max_new_tokens=30)
+    return processor.decode(output[0])
+def get_text(image_file, model, tokenizer):
+    res = model.chat(tokenizer, image_file, ocr_type='ocr')
+    return res
+st.title("Image - Text OCR")
+st.write("Upload an image for OCR")
+MODEL, PROCESSOR = init_llama()
+random_value = random.randint(0, 100)
+st.write(f"Model loaded: build number - {random_value}")
+image_file = st.file_uploader("Upload Image", type=['jpg', 'png', 'jpeg'])
+if image_file:
+    if not os.path.exists("images"):
+        os.makedirs("images")
+    with open(f"images/{image_file.name}", "wb") as f:
+        f.write(image_file.getbuffer())
+    image_file = f"images/{image_file.name}"
+    # model, tokenizer = init_gpu_model()
+    # model, tokenizer = init_model()
+    # text = get_text(image_file, model, tokenizer)
+    # model, processor = init_llama()
+    text = get_llama_op(image_file, MODEL, PROCESSOR)
+    # model, processor = init_qwen_model()
+    # text = get_quen_op(image_file, model, processor)
+    print(text)
+    st.write(text)

archive/qwen_test.ipynb ADDED Viewed

	@@ -0,0 +1,324 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from PIL import Image\n",
+    "import requests\n",
+    "import torch\n",
+    "from torchvision import io\n",
+    "from typing import Dict\n",
+    "from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "29ac356cdb05492d8a2da9bceea03b37",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\Akhil PC\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\huggingface_hub\\file_download.py:157: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\Akhil PC\\.cache\\huggingface\\hub\\models--Qwen--Qwen2-VL-2B-Instruct. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+      "  warnings.warn(message)\n",
+      "Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3ca08388cd3a4bc58b5b3c84b57fcd7f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model.safetensors.index.json:   0%|          | 0.00/56.4k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7f667bff4c014fce85cb222f40508c78",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c4289d2bd8f0466586d20564fb8fef84",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "47d67996509f431abb0f99bab97a03d6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d3e49e52f64147e2b5043c76d9a507e6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5060e7d44d5b40fd8ca2d7e90542be21",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ac0500d6289442d88db22065e94c6df2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "99ff45911ba848f2bd3ccd3f57029641",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer_config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9d484f67779348d7b242a12de0505324",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b0e6345cf4cd4b61b7d6b10ab7ae6f23",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c108ffe24eab4d82a8aa8d5bda088bf7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c9385ab1782f49fcb59fbe2aa73a81c5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Load the model in half-precision on the available device(s)\n",
+    "model = Qwen2VLForConditionalGeneration.from_pretrained(\"Qwen/Qwen2-VL-2B-Instruct\", device_map=\"cpu\", torch_dtype=torch.float16)\n",
+    "processor = AutoProcessor.from_pretrained(\"Qwen/Qwen2-VL-2B-Instruct\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Image\n",
+    "url = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg\"\n",
+    "image = Image.open(requests.get(url, stream=True).raw)\n",
+    "\n",
+    "conversation = [\n",
+    "    {\n",
+    "        \"role\":\"user\",\n",
+    "        \"content\":[\n",
+    "            {\n",
+    "                \"type\":\"image\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"type\":\"text\",\n",
+    "                \"text\":\"Describe this image.\"\n",
+    "            }\n",
+    "        ]\n",
+    "    }\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Preprocess the inputs\n",
+    "text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors=\"pt\")\n",
+    "# inputs = inputs.to('cuda')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Inference: Generation of the output\n",
+    "output_ids = model.generate(**inputs, max_new_tokens=128)\n",
+    "generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n",
+    "print(output_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+transformers==4.45.0
+streamlit==1.30.0
+torch --index-url https://download.pytorch.org/whl/cpu
+torchvision --index-url https://download.pytorch.org/whl/cpu
+tiktoken
+verovio
+accelerate==0.28.0
+Pillow==10.3.0