"""Gradio app that counts the total number of tokens in a Hugging Face
dataset, using a tokenizer loaded from the Hugging Face Hub."""

import gradio as gr
from datasets import load_dataset
from tokenizers import Tokenizer


def ReturnTokens(dataset, tokenizer="openai-community/gpt2", split="train"):
    tokenizer = Tokenizer.from_pretrained(tokenizer)
    dataset = load_dataset(dataset, split=split)
    tokens_ = 0

    def CountTokens(example):
        # Accumulate the token count of every string column in the example.
        nonlocal tokens_
        for value in example.values():
            if isinstance(value, str):
                tokens_ += len(tokenizer.encode(value).ids)
        return example  # map() expects the example back; it is left unchanged

    dataset.map(CountTokens)
    return tokens_


with gr.Blocks(title="Dataset token counter") as app:
    gr.Markdown("# Token Counter")
    with gr.Row():
        prompt = gr.Textbox(label="Dataset", elem_id="dataset")
        tokenizer = gr.Textbox(
            label="Tokenizer",
            elem_id="tokenizer",
            placeholder="openai-community/gpt2",
            value="openai-community/gpt2",
        )
        split = gr.Textbox(
            label="Split (default: train)",
            elem_id="split",
            placeholder="train",
            value="train",
        )
    tokens = gr.Label(label="Tokens", elem_id="tokens")
    prompt.submit(
        ReturnTokens,
        inputs=[prompt, tokenizer, split],
        outputs=[tokens],
    )

app.launch()
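
# Example of exercising the counter without the UI (the Hub dataset id
# "stanfordnlp/imdb" is purely illustrative; any text dataset should work):
#
#     print(ReturnTokens("stanfordnlp/imdb", split="test"))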