Spaces:
Sleeping
Sleeping
File size: 3,293 Bytes
fc973c2 93013c6 6c94b18 93013c6 fc973c2 e5a75a4 6c94b18 fc973c2 6c94b18 93013c6 6c94b18 fc973c2 6c94b18 fc973c2 6c94b18 fc973c2 6c94b18 fc973c2 6c94b18 e5a75a4 fc973c2 6c94b18 fc973c2 6c94b18 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
import os
import gradio as gr
import pandas as pd
from gradio.themes import colors
from transformers import AutoTokenizer
# Set TOKENIZERS_PARALLELISM to "false" in the process environment at import
# time. NOTE(review): presumably this keeps the Hugging Face `tokenizers`
# backend from spawning worker threads / emitting fork warnings under the web
# server - confirm against the tokenizers documentation.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Tokenizers already loaded in this process, keyed by model ID, so repeated
# requests for the same model do not reload it from disk / the Hub.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_id):
    """Return the tokenizer for *model_id*, loading it only on first use."""
    try:
        return _TOKENIZER_CACHE[model_id]
    except KeyError:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        _TOKENIZER_CACHE[model_id] = tokenizer
        return tokenizer


def inference(
    text="",
    model_id="openai/clip-vit-large-patch14",
) -> tuple[list[tuple[str, str]], list[list[str]], pd.DataFrame]:
    """Tokenize *text* with the tokenizer of *model_id* and describe the result.

    Args:
        text: The text to tokenize.
        model_id: Hugging Face model identifier passed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        A 3-tuple ``(token_pairs, special_tokens, pos_count)``:

        * ``token_pairs`` - one ``(decoded_token, token_id_as_str)`` tuple per
          input ID produced by the tokenizer, in order.
        * ``special_tokens`` - ``[name, value]`` string pairs taken from the
          tokenizer's ``special_tokens_map``; the
          ``additional_special_tokens`` entry is skipped.
        * ``pos_count`` - a one-row DataFrame with the columns
          ``"Char Count"`` (``len(text)``) and ``"Token Count"``.

        For empty (or ``None``) text the result is two empty lists and an
        empty DataFrame, so the shape always matches the three outputs.
    """
    if not text:
        # Must yield three values, exactly like the normal path below.
        return [], [], pd.DataFrame()

    tokenizer = _get_tokenizer(model_id)

    # Without `return_tensors` the tokenizer returns the IDs as a plain list
    # of ints for a single string, so no tensor round-trip is needed.
    input_ids = tokenizer(text)["input_ids"]

    # Pair every ID with the text it decodes to on its own.
    token_pairs = [
        (tokenizer.decode([token_id]), str(token_id))
        for token_id in input_ids
    ]

    # Count the number of characters and tokens.
    pos_count = pd.DataFrame({
        "Char Count": [len(text)],
        "Token Count": [len(token_pairs)],
    })

    # List the tokenizer's named special tokens as [name, value] pairs.
    special_tokens = [
        [str(name), str(value)]
        for name, value in tokenizer.special_tokens_map.items()
        if name != 'additional_special_tokens'
    ]

    return token_pairs, special_tokens, pos_count
# Script entry point: build the Gradio demo around `inference` and serve it.
if __name__ == '__main__':
    default_model = "openai/clip-vit-large-patch14"

    # Models offered in the dropdown; the first one is preselected.
    model_choices = [
        default_model,
        "google/gemma-7b",
        "google-bert/bert-base-uncased",
        "google/flan-t5-base",
        "openai-community/gpt2",
        "rinna/japanese-gpt-1b",
        "cyberagent/open-calm-7b",
    ]

    # Inputs are passed to `inference` positionally: text, then model ID.
    input_components = [
        gr.Textbox(label="Text"),
        gr.Dropdown(label="Model", choices=model_choices, value=default_model),
    ]

    # One component per value returned by `inference`, in the same order.
    output_components = [
        gr.Highlightedtext(label="Highlighted Text"),
        gr.Highlightedtext(label="Special Tokens", combine_adjacent=True, adjacent_separator=' / '),
        gr.Dataframe(label="Position Count"),
    ]

    # Each example row is [text, model ID].
    example_rows = [
        ["When I told my computer I needed a break, it froze.", default_model],
        ["Yesterday, I thought my cat was studying for her degree in philosophy because she sat on my book, "
         "but turns out she was just trying to hatch a plot to steal my dinner.", default_model],
        ["The square root of x is the cube root of y. What is y to the power of 2, if x = 4?",
         "google/flan-t5-base"],
        ["In my home country, it's a custom to say 'いただきマサチューセッツ' before we start eating a meal.",
         "google/gemma-7b"],
        ["日本で一番高い山は富士山ですが、二番目に高い山は何ですか?", "rinna/japanese-gpt-1b"],
    ]

    iface = gr.Interface(
        fn=inference,
        inputs=input_components,
        outputs=output_components,
        examples=example_rows,
        cache_examples=True,
        title="TokenVisor 👀",
        description="Visualize how the Tokenizer used in Hugging Face's Transformers library tokenizes text.",
        theme=gr.Theme(primary_hue=colors.green, secondary_hue=colors.yellow),
        allow_flagging="never",
    )
    iface.launch()
|