token_counter / app.py
luminoussg's picture
Create app.py
e7d39a8 verified
raw
history blame
1.3 kB
import gradio as gr
import tiktoken
import json
# Function to load JSON/JSONL file and count tokens for each entry
def count_tokens(json_file, encoding_name):
    """Count tokens for every entry of an uploaded JSON or JSONL file.

    Args:
        json_file: Either a filesystem path (``str``) or an object whose
            ``.name`` attribute holds the path (e.g. an uploaded-file
            wrapper). A path ending in ``.jsonl`` is parsed line by line;
            anything else is parsed as a single JSON document.
        encoding_name: Name of the tiktoken encoding, passed straight to
            ``tiktoken.get_encoding``.

    Returns:
        A list with one ``{'text': ..., 'token_count': ...}`` dict per entry.
        For a dict entry, ``text`` is its string values joined by single
        spaces (non-string values are ignored); any other entry is
        converted with ``str()``.

    Raises:
        json.JSONDecodeError: If the file (or a non-blank JSONL line) is not
            valid JSON.
    """
    encoding = tiktoken.get_encoding(encoding_name)

    # Depending on the Gradio version / File `type` setting, the upload may
    # arrive as a plain path string or as a file wrapper exposing `.name`.
    path = json_file if isinstance(json_file, str) else json_file.name

    # JSON text is UTF-8; do not rely on the platform's locale default.
    with open(path, 'r', encoding='utf-8') as f:
        if path.endswith('.jsonl'):
            # Skip blank lines (e.g. a trailing newline) instead of crashing.
            data = [json.loads(line) for line in f if line.strip()]
        else:
            data = json.load(f)

    # A single top-level object/scalar is one entry; iterating it directly
    # would walk dict keys or string characters instead.
    if not isinstance(data, list):
        data = [data]

    token_counts = []
    for item in data:
        if isinstance(item, dict):
            text = ' '.join(v for v in item.values() if isinstance(v, str))
        else:
            text = str(item)
        # disallowed_special=() makes special-token strings such as
        # "<|endoftext|>" count as ordinary text rather than raising.
        num_tokens = len(encoding.encode(text, disallowed_special=()))
        token_counts.append({
            'text': text,
            'token_count': num_tokens,
        })
    return token_counts
# Gradio interface
def token_counter(json_file, encoding_name):
    """Gradio callback: forward the upload and encoding to ``count_tokens``.

    Returns whatever ``count_tokens`` returns, unchanged.
    """
    return count_tokens(json_file, encoding_name)
# Gradio UI
# Input widgets: the dataset file and the tiktoken encoding to count with.
upload_input = gr.File(label="Upload JSON/JSONL File")
encoding_input = gr.Dropdown(
    ["r50k_base", "p50k_base", "cl100k_base", "o200k_base"],
    label="Select Encoding",
    value="cl100k_base",
)

# Output widget: the per-entry results rendered as JSON.
counts_output = gr.JSON(label="Token Counts")

# Wire the callback to the widgets and start the app.
demo = gr.Interface(
    fn=token_counter,
    inputs=[upload_input, encoding_input],
    outputs=counts_output,
)
demo.launch()