import gradio as gr
import tiktoken
import json

# Load a JSON/JSONL file and count tokens for each entry
def count_tokens(json_file, encoding_name):
    encoding = tiktoken.get_encoding(encoding_name)

    # gr.File may pass a tempfile-like object with a .name attribute
    # or a plain filepath string, depending on the Gradio version
    path = json_file.name if hasattr(json_file, 'name') else json_file

    # Load the JSON or JSONL data
    with open(path, 'r') as f:
        if path.endswith('.jsonl'):
            data = [json.loads(line) for line in f if line.strip()]
        else:
            data = json.load(f)

    # Count tokens for each item in the dataset
    token_counts = []
    for item in data:
        if isinstance(item, dict):
            # Concatenate the string-valued fields of the record
            text = ' '.join(v for v in item.values() if isinstance(v, str))
        else:
            text = str(item)
        num_tokens = len(encoding.encode(text))
        token_counts.append({
            'text': text,
            'token_count': num_tokens
        })
    return token_counts

# Gradio callback
def token_counter(json_file, encoding_name):
    return count_tokens(json_file, encoding_name)

# Gradio UI
gr.Interface(
    fn=token_counter,
    inputs=[
        gr.File(label="Upload JSON/JSONL File"),
        gr.Dropdown(
            ["r50k_base", "p50k_base", "cl100k_base", "o200k_base"],
            label="Select Encoding",
            value="cl100k_base"
        )
    ],
    outputs=gr.JSON(label="Token Counts")
).launch()
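For a quick sanity check without the web UI, count_tokens can be exercised directly, since the hasattr() check lets it accept a plain filepath. This is a minimal sketch only: it assumes count_tokens is defined as above and that launch() is skipped or the server has been closed, and the file name data.jsonl and the prompt/completion field names are illustrative, not required by the script.

# Write a small illustrative JSONL file (file name and field names are
# hypothetical; any string-valued fields would be counted)
import json

sample = [
    {"prompt": "What is the capital of France?", "completion": "Paris."},
    {"prompt": "Translate 'hello' to Spanish.", "completion": "Hola."},
]
with open("data.jsonl", "w") as f:
    for record in sample:
        f.write(json.dumps(record) + "\n")

# Call count_tokens directly with a path string and the cl100k_base encoding
for entry in count_tokens("data.jsonl", "cl100k_base"):
    print(entry["token_count"], entry["text"])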