Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,021 Bytes
92a3de8 2d817c6 7df7460 8a04aff 92a3de8 2d817c6 8a04aff 2d817c6 8a04aff 2d817c6 84b0091 8a04aff 2d817c6 7df7460 2d817c6 7df7460 8a04aff 2d817c6 545e733 84b0091 545e733 84b0091 2d817c6 8a04aff 2d817c6 8a04aff 84b0091 8a04aff 84b0091 25d3022 84b0091 8a04aff 2d817c6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import re
from markdownify import markdownify
# Reader LM checkpoints offered by the demo, in the order they are loaded
# and stored in the dicts below.
_MODEL_IDS = (
    "jinaai/reader-lm-0.5b",
    "jinaai/reader-lm-1.5b",
)

# Each checkpoint is loaded once at import time, switched to eval mode and
# moved to the CUDA device; the dict is keyed by model id.
models = {
    model_id: AutoModelForCausalLM.from_pretrained(
        model_id, trust_remote_code=True
    ).eval().to("cuda")
    for model_id in _MODEL_IDS
}

# Matching tokenizer for every model, keyed by the same model id.
tokenizers = {
    model_id: AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    for model_id in _MODEL_IDS
}
# Chat-template markers that delimit the assistant turn in the decoded text.
_ASSISTANT_START = "<|im_start|>assistant"
_TURN_END = "<|im_end|>"


def _extract_assistant_text(decoded):
    """Return the text of the last assistant turn in *decoded*.

    The last ``<|im_start|>assistant`` marker is used so that chat markers
    embedded in the user-supplied HTML (which is part of the decoded prompt)
    cannot be mistaken for the model's answer. If the turn is not closed by
    ``<|im_end|>`` (e.g. generation stopped at the token limit), everything
    after the marker is returned. If no assistant marker is present at all,
    an empty string is returned instead of raising.
    """
    _, marker, tail = decoded.rpartition(_ASSISTANT_START)
    if not marker:
        return ""
    reply, _, _ = tail.partition(_TURN_END)
    return reply


@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """Convert HTML to Markdown with Reader LM and with Markdownify.

    Args:
        html_content: HTML string to convert.
        model_id: Key into the module-level ``models`` and ``tokenizers``
            dicts selecting which checkpoint to run.

    Returns:
        A ``(reader_lm_output, markdownify_output)`` tuple of strings. The
        first item is the assistant turn extracted from the decoded model
        output (possibly partial if generation hit ``max_new_tokens``); the
        second is ``markdownify(html_content)``.

    Raises:
        KeyError: If ``model_id`` is not a key of ``models``.
    """
    print("Start Model Processing")
    model = models[model_id]
    tokenizer = tokenizers[model_id]

    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
    outputs = model.generate(
        inputs,
        max_new_tokens=1024,
        temperature=0,
        do_sample=False,
        repetition_penalty=1.08,
    )

    # The decoded sequence holds the prompt followed by the generated turn.
    # Previously a regex required a closing <|im_end|> and indexing the empty
    # match list raised IndexError when the output was cut off.
    assistant_response = _extract_assistant_text(tokenizer.decode(outputs[0]))

    print("Start Markdownify Processing")
    markdownify_output = markdownify(html_content)
    return assistant_response, markdownify_output
# Custom stylesheet handed to gr.Blocks(css=...). It styles an element with
# id "output" as a fixed-height, scrollable, bordered box.
# NOTE(review): no component in the visible code sets elem_id="output", so
# this rule may currently match nothing — confirm.
css = """
#output {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
# Sample HTML (a small to-do list page fragment) used as the single entry
# of the gr.Examples widget.
example_html = """<div id="myDIV" class="header">
<h2>My To Do List</h2>
<input type="text" id="myInput" placeholder="Title...">
<span onclick="newElement()" class="addBtn">Add</span>
</div>
<ul id="myUL">
<li>Hit the gym</li>
<li class="checked">Pay bills</li>
<li>Meet George</li>
<li>Buy eggs</li>
<li>Read a book</li>
<li>Organize office</li>
</ul>"""
# Introductory Markdown rendered at the top of the page.
_INTRO_MARKDOWN = """
# HTML-to-Markdown
Try out model based HTML-to-Markdown with [Reader LM](https://huggingface.co/jinaai/reader-lm-1.5b) and rule based with [Markdownify](https://github.com/matthewwithanm/python-markdownify).
"""

# NOTE(review): the layout nesting below (two columns inside one row, the
# examples and click handler at Blocks level) is reconstructed from the
# statement order; confirm against the intended layout.
with gr.Blocks(css=css) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Input side: model choice, HTML source and the submit button.
        with gr.Column():
            model_selector = gr.Dropdown(
                label="Model",
                choices=list(models),
                value="jinaai/reader-lm-1.5b",
            )
            html_content = gr.Textbox(label="HTML")
            submit_btn = gr.Button(value="Submit")

        # Output side: one box per conversion method.
        with gr.Column():
            model_output_text = gr.Textbox(label="Reader LM Output")
            markdownify_output = gr.Textbox(label="Markdownify Output")

    # The example supplies only the HTML input, so run_example is called
    # with its default model_id.
    gr.Examples(
        label="Try examples",
        examples=[[example_html]],
        fn=run_example,
        inputs=[html_content],
        outputs=[model_output_text, markdownify_output],
        cache_examples=True,
    )

    submit_btn.click(
        fn=run_example,
        inputs=[html_content, model_selector],
        outputs=[model_output_text, markdownify_output],
    )

demo.launch(debug=True)