import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import spaces
import io
import base64  # base64 decoding for data-URI image inputs
from PIL import Image
import subprocess

# Install the flash-attn dependency at startup, skipping its CUDA build step
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

# Load the model and processor
model_id = 'J-LAB/Florence-vl3'
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).to("cuda").eval()
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

DESCRIPTION = "# Product Describe by Fluxi IA\n### Base Model: [Florence-2](https://huggingface.co/microsoft/Florence-2-large)"

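# Run a single Florence-2 task prompt over a PIL image and return the parsed output dict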
@spaces.GPU
def run_example(task_prompt, image):
    inputs = processor(text=task_prompt, images=image, return_tensors="pt").to("cuda")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height)
    )
    return parsed_answer

# Process an image; supports base64 data URIs, raw bytes, NumPy arrays, and PIL images
def process_image(image, task_prompt):
    # Check whether the image arrived as a base64 data URI
    if isinstance(image, str) and image.startswith("data:image"):
        # Strip the "data:image/...;base64," prefix
        base64_image = image.split(",")[1]
        # Decode the base64 payload into a PIL image
        image = Image.open(io.BytesIO(base64.b64decode(base64_image)))
    elif isinstance(image, bytes):
        image = Image.open(io.BytesIO(image))
    elif not isinstance(image, Image.Image):
        image = Image.fromarray(image)  # Convert a NumPy array to a PIL image

    # Map the UI task names to Florence-2 task tokens
    if task_prompt == 'Product Caption':
        task_prompt = '<MORE_DETAILED_CAPTION>'
    elif task_prompt == 'OCR':
        task_prompt = '<OCR>'
    
    # Run inference with the processed image and task token
    results = run_example(task_prompt, image)
    
    # post_process_generation returns a dict keyed by the task token
    if results and task_prompt in results:
        output_text = results[task_prompt]
    else:
        output_text = ""

    # Convert newlines to HTML line breaks for the HTML output component
    output_text = output_text.replace("\n\n", "<br><br>").replace("\n", "<br>")

    return output_text
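
# Minimal local sanity check for process_image (kept commented out so it does not
# run at import time; "example.png" is a placeholder path):
# with open("example.png", "rb") as f:
#     data_uri = "data:image/png;base64," + base64.b64encode(f.read()).decode("utf-8")
# print(process_image(data_uri, "OCR"))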

css = """
  #output {
    overflow: auto; 
    border: 1px solid #ccc; 
    padding: 10px;
    background-color: rgb(31 41 55);
    color: #fff; 
  }
"""

js = """
function adjustHeight() {
    var outputElement = document.getElementById('output');
    outputElement.style.height = 'auto';  // Reset height to auto to get the actual content height
    var height = outputElement.scrollHeight + 'px';  // Get the scrollHeight
    outputElement.style.height = height;  // Set the height
}

// Attach the adjustHeight function to the click event of the submit button
document.querySelector('button').addEventListener('click', function() {
    setTimeout(adjustHeight, 500);  // Adjust the height after a small delay to ensure content is loaded
});
"""

single_task_list = ['Product Caption', 'OCR']

with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Product Image Select"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture", sources=["upload"], type="pil")  # PIL image input (Gradio 4 uses sources=[...]; Gradio 3.x used source="upload")
                task_prompt = gr.Dropdown(choices=single_task_list, label="Task Prompt", value="Product Caption")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.HTML(label="Output Text", elem_id="output")
        
        gr.Markdown(""" 
        ## How to use via API
        To use this model via API, you can follow the example code below:

        ```python
        import base64
        from PIL import Image
        import io
        import requests

        # Converting image to base64
        image_path = 'path_to_image.png'
        with open(image_path, 'rb') as image_file:
            image_base64 = base64.b64encode(image_file.read()).decode('utf-8')
        
        # Preparing the payload
        payload = {
            "image": f"data:image/png;base64,{image_base64}",
            "task_prompt": "Product Caption"
        }

        response = requests.post("http://your-space-url-here", json=payload)
        print(response.json())
        ``` 
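
        If the Space exposes the standard Gradio API, the official `gradio_client` package is an alternative. The Space id and endpoint name below are placeholders; check the Space's "Use via API" page for the exact values:

        ```python
        from gradio_client import Client, handle_file

        client = Client("your-username/your-space-name")  # placeholder Space id
        result = client.predict(
            handle_file("path_to_image.png"),  # image input
            "Product Caption",                 # task prompt
            api_name="/process_image"          # confirm via the Space's API docs
        )
        print(result)
        ```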
        """)
        
        submit_btn.click(process_image, [input_img, task_prompt], [output_text])

    demo.load(lambda: None, inputs=None, outputs=None, js=js)

demo.launch(debug=True)