File size: 3,021 Bytes
92a3de8
2d817c6
 
7df7460
8a04aff
92a3de8
 
2d817c6
8a04aff
 
2d817c6
 
 
 
8a04aff
2d817c6
 
 
 
84b0091
8a04aff
2d817c6
 
 
 
7df7460
2d817c6
7df7460
 
8a04aff
 
 
2d817c6
 
 
 
 
 
 
 
 
 
545e733
84b0091
 
 
 
 
 
 
 
 
 
 
 
545e733
84b0091
2d817c6
 
 
8a04aff
2d817c6
8a04aff
 
84b0091
8a04aff
 
 
 
 
 
84b0091
 
 
 
25d3022
84b0091
 
 
 
 
 
8a04aff
2d817c6
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import re
from markdownify import markdownify


# Eagerly load both Reader-LM checkpoints onto the GPU at import time so model
# switching in the UI is instant (trades startup time / VRAM for latency).
# NOTE(review): assumes a CUDA device is available — .to("cuda") raises otherwise.
models = {
    "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained("jinaai/reader-lm-0.5b", trust_remote_code=True).eval().to("cuda"),
    "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained("jinaai/reader-lm-1.5b", trust_remote_code=True).eval().to("cuda")
}

# Matching tokenizers, keyed identically to `models` so both lookups share a model_id.
tokenizers = {
    "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained("jinaai/reader-lm-0.5b", trust_remote_code=True),
    "jinaai/reader-lm-1.5b": AutoTokenizer.from_pretrained("jinaai/reader-lm-1.5b", trust_remote_code=True),
}


@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """Convert HTML to Markdown two ways: with a Reader-LM model and with markdownify.

    Args:
        html_content: Raw HTML string to convert.
        model_id: Key into the module-level ``models``/``tokenizers`` dicts.

    Returns:
        Tuple of (model-generated markdown, rule-based markdownify output).
    """
    print("Start Model Processing")
    model = models[model_id]
    tokenizer = tokenizers[model_id]
    # Reader-LM expects the raw HTML as a single user turn in its chat template.
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
    # Greedy decoding; `temperature` removed — it is ignored when do_sample=False
    # and newer transformers versions warn about the contradictory combination.
    outputs = model.generate(inputs, max_new_tokens=1024, do_sample=False, repetition_penalty=1.08)
    # Extract the assistant turn between the ChatML special tokens.
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    matches = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)
    # Guard: generation may hit max_new_tokens before emitting <|im_end|>, in which
    # case the regex finds nothing — return "" instead of raising IndexError.
    model_output = matches[0] if matches else ""
    print("Start Markdownify Processing")
    markdownify_output = markdownify(html_content)
    return model_output, markdownify_output


# CSS injected into the Blocks app; constrains the element with id="output"
# to a fixed, scrollable, bordered box. (Runtime string — passed to gr.Blocks.)
css = """
  #output {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""

# Sample HTML snippet (a small to-do-list fragment) used as the pre-filled
# gr.Examples input below. Runtime string — content must stay exactly as-is.
example_html = """<div id="myDIV" class="header">
  <h2>My To Do List</h2>
  <input type="text" id="myInput" placeholder="Title...">
  <span onclick="newElement()" class="addBtn">Add</span>
</div>

<ul id="myUL">
  <li>Hit the gym</li>
  <li class="checked">Pay bills</li>
  <li>Meet George</li>
  <li>Buy eggs</li>
  <li>Read a book</li>
  <li>Organize office</li>
</ul>"""

# UI layout: left column = model picker + HTML input + submit; right column =
# the two outputs (model-based and rule-based conversions).
with gr.Blocks(css=css) as demo:
    gr.Markdown("""
    # HTML-to-Markdown
    Try out model based HTML-to-Markdown with [Reader LM](https://huggingface.co/jinaai/reader-lm-1.5b) and rule based with [Markdownify](https://github.com/matthewwithanm/python-markdownify).
    """)
    with gr.Row():
        with gr.Column():
            model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="jinaai/reader-lm-1.5b")
            html_content = gr.Textbox(label="HTML")
            submit_btn = gr.Button(value="Submit")
        with gr.Column():
            model_output_text = gr.Textbox(label="Reader LM Output")
            markdownify_output = gr.Textbox(label="Markdownify Output")

    # Cached example: only html_content is supplied, so run_example falls back
    # to its default model_id (the 1.5b checkpoint), not the dropdown value.
    gr.Examples(
        examples=[
            [example_html],
        ],
        inputs=[html_content],
        outputs=[model_output_text, markdownify_output],
        fn=run_example,
        cache_examples=True,
        label="Try examples"
    )

    # The submit button, unlike the examples, does pass the selected model.
    submit_btn.click(run_example, [html_content, model_selector], [model_output_text, markdownify_output])

# debug=True surfaces tracebacks in the console while the Space runs.
demo.launch(debug=True)