import gradio as gr
import tiktoken
import json


# Load a JSON/JSONL file and count tokens for each entry with the chosen encoding.
def count_tokens(json_file, encoding_name):
    encoding = tiktoken.get_encoding(encoding_name)

    # Newer Gradio versions pass the upload as a filepath string; older
    # versions pass a tempfile-like object with a .name attribute.
    json_path = json_file.name if hasattr(json_file, 'name') else json_file

    # Load the JSON or JSONL data (JSONL: one JSON object per line).
    with open(json_path, 'r', encoding='utf-8') as f:
        if json_path.endswith('.jsonl'):
            data = [json.loads(line) for line in f if line.strip()]
        else:
            data = json.load(f)

    # Count tokens for each item in the dataset.
    token_counts = []
    for item in data:
        if isinstance(item, dict):
            # Concatenate all string values of the record before encoding.
            text = ' '.join(str(v) for v in item.values() if isinstance(v, str))
        else:
            text = str(item)
        num_tokens = len(encoding.encode(text))
        token_counts.append({
            'text': text,
            'token_count': num_tokens
        })
    return token_counts


# Gradio callback: receives the uploaded file and the selected encoding.
def token_counter(json_file, encoding_name):
    token_data = count_tokens(json_file, encoding_name)
    return token_data


# Gradio UI
gr.Interface(
    fn=token_counter,
    inputs=[
        gr.File(label="Upload JSON/JSONL File"),
        gr.Dropdown(
            ["r50k_base", "p50k_base", "cl100k_base", "o200k_base"],
            label="Select Encoding",
            value="cl100k_base"
        ),
    ],
    outputs=gr.JSON(label="Token Counts"),
).launch()
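
For a quick sanity check of count_tokens without launching the Gradio UI, the function can be exercised directly on a small JSONL file. The sketch below is illustrative only: the file contents and the temporary-file setup are made up, and the printed token counts depend on the encoding selected.

# Minimal sketch: call count_tokens() (defined above) on a hypothetical JSONL file.
import json
import tempfile

rows = [
    {"text": "Hello world", "label": "greeting"},
    {"text": "Token counting with tiktoken", "label": "demo"},
]

# Write the sample rows to a throwaway .jsonl file.
with tempfile.NamedTemporaryFile('w', suffix='.jsonl', delete=False) as tmp:
    for row in rows:
        tmp.write(json.dumps(row) + '\n')
    path = tmp.name

# count_tokens accepts a plain filepath string.
for entry in count_tokens(path, 'cl100k_base'):
    print(entry['token_count'], '<-', entry['text'])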