maxiw's picture
clean up and add markdownify
8a04aff
raw
history blame
2.37 kB
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import re
from markdownify import markdownify
# Hub ids of the Reader LM checkpoints offered by the demo. Both lookup tables
# below are keyed by these ids so a dropdown choice selects a matching
# model/tokenizer pair.
_MODEL_IDS = ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")

# Every model is loaded at import time, switched to eval mode and moved to the
# "cuda" device.
models = {
    checkpoint: AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True).eval().to("cuda")
    for checkpoint in _MODEL_IDS
}

# One tokenizer per checkpoint, loaded after the models.
tokenizers = {
    checkpoint: AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    for checkpoint in _MODEL_IDS
}
# Captures the first assistant turn of the decoded transcript. The closing
# <|im_end|> marker is optional (the \Z alternative) so that a generation which
# stops at the max_new_tokens limit before emitting it still yields its partial
# text instead of producing no match at all.
_ASSISTANT_TURN = re.compile(r"<\|im_start\|>assistant(.*?)(?:<\|im_end\|>|\Z)", re.DOTALL)


def _extract_assistant_response(decoded):
    """Return the text of the first assistant turn in *decoded*, or "" if none is found."""
    match = _ASSISTANT_TURN.search(decoded)
    return match.group(1) if match else ""


@spaces.GPU
def run_example(html_content, model_id):
    """Convert HTML to Markdown with the selected Reader LM model and with markdownify.

    Args:
        html_content: HTML text to convert; it is sent to the model as the
            single user message and also passed to ``markdownify``.
        model_id: Key into the module-level ``models`` / ``tokenizers`` dicts.

    Returns:
        A ``(model_output, markdownify_output)`` tuple of strings.
        ``model_output`` is the text of the assistant turn found in the decoded
        generation; it is partial when generation ended without a closing
        ``<|im_end|>`` marker and empty when no assistant turn is present.

    Raises:
        KeyError: If ``model_id`` is not a key of ``models`` / ``tokenizers``.
    """
    print("Start Model Processing")
    model = models[model_id]
    tokenizer = tokenizers[model_id]

    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
    outputs = model.generate(inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08)

    # The decoded text keeps the chat markers, so the assistant turn is pulled
    # out of it. Indexing the result of re.findall used to raise IndexError
    # whenever the closing marker was missing from the output.
    assistant_response = _extract_assistant_response(tokenizer.decode(outputs[0]))

    print("Start Markdownify Processing")
    markdownify_output = markdownify(html_content)
    return assistant_response, markdownify_output
# Stylesheet passed to gr.Blocks: gives the element with id "output" a fixed
# 500px height, scrollable overflow and a thin grey border.
# NOTE(review): no component in the layout visible in this file sets
# elem_id="output" -- confirm the selector still has a target.
css = "\n".join(
    [
        "",
        "#output {",
        "height: 500px;",
        "overflow: auto;",
        "border: 1px solid #ccc;",
        "}",
        "",
    ]
)
# Demo page: a header, then one row with the inputs (model choice, HTML text,
# submit button) in the first column and the two conversion results in the
# second column.
with gr.Blocks(css=css) as demo:
    gr.Markdown(
        "\n"
        "# HTML-to-Markdown\n"
        "Try out model based HTML-to-Markdown with [Reader LM](https://huggingface.co/jinaai/reader-lm-1.5b) and rule based with [Markdownify](https://github.com/matthewwithanm/python-markdownify).\n"
    )

    with gr.Row():
        with gr.Column():
            # The dropdown lists every loaded checkpoint.
            model_selector = gr.Dropdown(
                label="Model",
                choices=list(models),
                value="jinaai/reader-lm-0.5b",
            )
            html_content = gr.Textbox(label="HTML")
            submit_btn = gr.Button(value="Submit")

        with gr.Column():
            model_output_text = gr.Textbox(label="Reader LM Output")
            markdownify_output = gr.Textbox(label="Markdownify Output")

    # run_example returns (model output, markdownify output), matching the
    # order of the two output textboxes.
    submit_btn.click(
        fn=run_example,
        inputs=[html_content, model_selector],
        outputs=[model_output_text, markdownify_output],
    )

demo.launch(debug=True)