Spaces:

AskUI
/

pta-text-v0.1

Sleeping

App Files Community

gitlost-murali committed on Feb 15, 2024

Commit

95431d3

1 Parent(s): 0aa610a

use askui-ml-helper library

Browse files

Files changed (3) hide show

app.py +2 -91
requirements.txt +2 -4
utils.py +0 -144

app.py CHANGED Viewed

@@ -1,98 +1,9 @@
 import gradio as gr
-from PIL import Image, ImageDraw
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch
-from transformers import Pix2StructProcessor, Pix2StructVisionModel
-from utils import download_default_font, render_header
-class Pix2StructForRegression(nn.Module):
-    def __init__(self, sourcemodel_path, device):
-        super(Pix2StructForRegression, self).__init__()
-        self.model = Pix2StructVisionModel.from_pretrained(sourcemodel_path)
-        self.regression_layer1 = nn.Linear(768, 1536)
-        self.dropout1 = nn.Dropout(0.1)
-        self.regression_layer2 = nn.Linear(1536, 768)
-        self.dropout2 = nn.Dropout(0.1)
-        self.regression_layer3 = nn.Linear(768, 2)
-        self.device = device
-    def forward(self, *args, **kwargs):
-        outputs = self.model(*args, **kwargs)
-        sequence_output = outputs.last_hidden_state
-        first_token_output = sequence_output[:, 0, :]
-        x = F.relu(self.regression_layer1(first_token_output))
-        x = F.relu(self.regression_layer2(x))
-        regression_output = torch.sigmoid(self.regression_layer3(x))
-        return regression_output
-    def load_state_dict_file(self, checkpoint_path, strict=True):
-        state_dict = torch.load(checkpoint_path, map_location=self.device)
-        self.load_state_dict(state_dict, strict=strict)
-class Inference:
-    def __init__(self) -> None:
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model, self.processor = self.load_model_and_processor("google/matcha-base", "model/pta-text-v0.1.pt")
-    def load_model_and_processor(self, model_name, checkpoint_path):
-        model = Pix2StructForRegression(sourcemodel_path=model_name, device=self.device)
-        model.load_state_dict_file(checkpoint_path=checkpoint_path)
-        model.eval()
-        model = model.to(self.device)
-        processor = Pix2StructProcessor.from_pretrained(model_name, is_vqa=False)
-        return model, processor
-    def prepare_image(self, image, prompt, processor):
-        image = image.resize((1920, 1080))
-        download_default_font_path = download_default_font()
-        rendered_image, _, render_variables = render_header(
-            image=image,
-            header=prompt,
-            bbox={"xmin": 0, "ymin": 0, "xmax": 0, "ymax": 0},
-            font_path=download_default_font_path,
-        )
-        encoding = processor(
-            images=rendered_image,
-            max_patches=2048,
-            add_special_tokens=True,
-            return_tensors="pt",
-        )
-        return encoding, render_variables
-    def predict_coordinates(self, encoding, model, render_variables):
-        with torch.no_grad():
-            pred_regression_outs = model(flattened_patches=encoding["flattened_patches"], attention_mask=encoding["attention_mask"])
-            new_height = render_variables["height"]
-            new_header_height = render_variables["header_height"]
-            new_total_height = render_variables["total_height"]
-            pred_regression_outs[:, 1] = (
-                (pred_regression_outs[:, 1] * new_total_height) - new_header_height
-            ) / new_height
-            pred_coordinates = pred_regression_outs.squeeze().tolist()
-        return pred_coordinates
-    def draw_circle_on_image(self, image, coordinates):
-        x, y = coordinates[0] * image.width, coordinates[1] * image.height
-        draw = ImageDraw.Draw(image)
-        radius = 5
-        draw.ellipse((x-radius, y-radius, x+radius, y+radius), fill="red")
-        return image
-    def process_image_and_draw_circle(self, image, prompt):
-        encoding, render_variables = self.prepare_image(image, prompt, self.processor)
-        pred_coordinates = self.predict_coordinates(encoding.to(self.device) , self.model, render_variables)
-        result_image = self.draw_circle_on_image(image, pred_coordinates)
-        return result_image
 def main():
-    inference = Inference()
     # Gradio Interface
     iface = gr.Interface(
         fn=inference.process_image_and_draw_circle,

 import gradio as gr
+from askui_ml_helper.utils.pta_text import PtaTextInference
 def main():
+    inference = PtaTextInference("model/pta-text-v0.1.pt")
     # Gradio Interface
     iface = gr.Interface(
         fn=inference.process_image_and_draw_circle,

requirements.txt CHANGED Viewed

@@ -1,4 +1,2 @@
-torch
-transformers
-gradio
-Pillow


+askui-ml-helper
+gradio

utils.py DELETED Viewed

@@ -1,144 +0,0 @@
-import io
-import os
-import textwrap
-from typing import Dict, Optional, Tuple
-from huggingface_hub import hf_hub_download
-from PIL import Image, ImageDraw, ImageFont
-DEFAULT_FONT_PATH = "ybelkada/fonts"
-def download_default_font():
-    font_path = hf_hub_download(DEFAULT_FONT_PATH, "Arial.TTF")
-    return font_path
-def render_text(
-    text: str,
-    text_size: int = 36,
-    text_color: str = "black",
-    background_color: str = "white",
-    left_padding: int = 5,
-    right_padding: int = 5,
-    top_padding: int = 5,
-    bottom_padding: int = 5,
-    font_bytes: Optional[bytes] = None,
-    font_path: Optional[str] = None,
-) -> Image.Image:
-    """
-    Render text. This script is entirely adapted from the original script that can be found here:
-    https://github.com/google-research/pix2struct/blob/main/pix2struct/preprocessing/preprocessing_utils.py
-    Args:
-        text (`str`, *optional*, defaults to ):
-            Text to render.
-        text_size (`int`, *optional*, defaults to 36):
-            Size of the text.
-        text_color (`str`, *optional*, defaults to `"black"`):
-            Color of the text.
-        background_color (`str`, *optional*, defaults to `"white"`):
-            Color of the background.
-        left_padding (`int`, *optional*, defaults to 5):
-            Padding on the left.
-        right_padding (`int`, *optional*, defaults to 5):
-            Padding on the right.
-        top_padding (`int`, *optional*, defaults to 5):
-            Padding on the top.
-        bottom_padding (`int`, *optional*, defaults to 5):
-            Padding on the bottom.
-        font_bytes (`bytes`, *optional*):
-            Bytes of the font to use. If `None`, the default font will be used.
-        font_path (`str`, *optional*):
-            Path to the font to use. If `None`, the default font will be used.
-    """
-    wrapper = textwrap.TextWrapper(
-        width=80
-    )  # Add new lines so that each line is no more than 80 characters.
-    lines = wrapper.wrap(text=text)
-    wrapped_text = "\n".join(lines)
-    if font_bytes is not None and font_path is None:
-        font = io.BytesIO(font_bytes)
-    elif font_path is not None:
-        font = font_path
-    else:
-        font = hf_hub_download(DEFAULT_FONT_PATH, "Arial.TTF")
-        raise ValueError(
-            "Either font_bytes or font_path must be provided. "
-            f"Using default font {font}."
-        )
-    font = ImageFont.truetype(font, encoding="UTF-8", size=text_size)
-    # Use a temporary canvas to determine the width and height in pixels when
-    # rendering the text.
-    temp_draw = ImageDraw.Draw(Image.new("RGB", (1, 1), background_color))
-    _, _, text_width, text_height = temp_draw.textbbox((0, 0), wrapped_text, font)
-    # Create the actual image with a bit of padding around the text.
-    image_width = text_width + left_padding + right_padding
-    image_height = text_height + top_padding + bottom_padding
-    image = Image.new("RGB", (image_width, image_height), background_color)
-    draw = ImageDraw.Draw(image)
-    draw.text(
-        xy=(left_padding, top_padding), text=wrapped_text, fill=text_color, font=font
-    )
-    return image
-# Adapted from https://github.com/google-research/pix2struct/blob/0e1779af0f4db4b652c1d92b3bbd2550a7399123/pix2struct/preprocessing/preprocessing_utils.py#L87
-def render_header(
-    image: Image.Image, header: str, bbox: Dict[str, float], font_path: str, **kwargs
-) -> Tuple[Image.Image, Tuple[float, float, float, float]]:
-    """
-    Renders the input text as a header on the input image and updates the bounding box.
-    Args:
-        image (Image.Image):
-            The image to render the header on.
-        header (str):
-            The header text.
-        bbox (Dict[str,float]):
-            The bounding box in relative position (0-1), format ("x_min": 0,
-                                                                 "y_min": 0,
-                                                                 "x_max": 0,
-                                                                 "y_max": 0).
-        input_data_format (Union[str, ChildProcessError], optional):
-            The data format of the image.
-    Returns:
-        Tuple[Image.Image, Dict[str, float] ]:
-        The image with the header rendered and the updated bounding box.
-    """
-    assert os.path.exists(font_path), f"Font path {font_path} does not exist."
-    header_image = render_text(text=header, font_path=font_path, **kwargs)
-    new_width = max(header_image.width, image.width)
-    new_height = int(image.height * (new_width / image.width))
-    new_header_height = int(header_image.height * (new_width / header_image.width))
-    new_image = Image.new("RGB", (new_width, new_height + new_header_height), "white")
-    new_image.paste(header_image.resize((new_width, new_header_height)), (0, 0))
-    new_image.paste(image.resize((new_width, new_height)), (0, new_header_height))
-    new_total_height = new_image.height
-    new_bbox = {
-        "xmin": bbox["xmin"],
-        "ymin": ((bbox["ymin"] * new_height) + new_header_height)
-        / new_total_height,  # shift y_min down by the header's relative height
-        "xmax": bbox["xmax"],
-        "ymax": ((bbox["ymax"] * new_height) + new_header_height)
-        / new_total_height,  # shift y_min down by the header's relative height
-    }
-    return (
-        new_image,
-        new_bbox,
-        {
-            "width": new_width,
-            "height": new_height,
-            "header_height": new_header_height,
-            "total_height": new_total_height,
-        },
-    )