Dataset-Tokens / app.py
GPT007's picture
Update app.py
7f60bc5 verified
raw history blame
No virus
1.19 kB
import token
import tokenize
import gradio as gr
from datasets import load_dataset
from tokenizers import Tokenizer
def ReturnTokens(dataset, tokenizer="openai-community/gpt2", split="train"):
    """Count the tokens in every string field of one split of a dataset.

    Args:
        dataset: Dataset identifier handed to ``load_dataset``.
        tokenizer: Tokenizer identifier handed to
            ``Tokenizer.from_pretrained``. An empty value falls back to
            ``"openai-community/gpt2"``.
        split: Name of the split to count. An empty value falls back to
            ``"train"``.

    Returns:
        The total number of token ids produced by encoding every ``str``
        value of every example in the selected split. Non-string values
        (numeric labels, nested structures, ...) are skipped.
    """
    global tokens_

    # Treat an empty identifier as "use the default" rather than forwarding
    # an empty string to the loaders.
    encoder = Tokenizer.from_pretrained(tokenizer or "openai-community/gpt2")
    # Passing ``split`` makes load_dataset return just that split instead of
    # a dict of all splits.
    rows = load_dataset(dataset, split=split or "train")

    # Iterate the rows directly instead of counting inside ``Dataset.map``:
    # map is a transform API whose output can be served from cache, which
    # would skip the callback and leave the count at zero. A local
    # accumulator also keeps overlapping calls from sharing state.
    total = 0
    for example in rows:
        for value in example.values():
            # Only text can be tokenized; skip labels and other non-text.
            if isinstance(value, str):
                total += len(encoder.encode(value).ids)

    # Still publish the last result to the module-level name so any code
    # that read ``tokens_`` after a call keeps working.
    tokens_ = total
    return total
# Build the UI: three text inputs (dataset, tokenizer, split) and a label
# that displays the token count returned by ReturnTokens.
with gr.Blocks(title="Dataset token counter") as app:
    gr.Markdown("# Token Counter")
    with gr.Row():
        prompt = gr.Textbox(label="Dataset", elem_id="dataset", info="", placeholder="")
        tokenizer = gr.Textbox(label="Tokenizer", elem_id="tokenizer", info="", placeholder="openai-community/gpt2", value="openai-community/gpt2")
        split = gr.Textbox(label="Split (default: train)", elem_id="split", info="", placeholder="train", value="train")
    # ``info`` is a Textbox option, not a Label one, so it is not passed here.
    tokens = gr.Label(label="Tokens", elem_id="tokens")
    # Register the handler on the submit event itself. Calling ``submit()``
    # with no function and chaining ``.success(...)`` never attaches
    # ReturnTokens to the dataset textbox.
    prompt.submit(
        ReturnTokens,
        inputs=[prompt, tokenizer, split],
        outputs=[tokens],
    )

app.launch()