Spaces:

luminoussg
/

token_counter

Sleeping

token_counter / app.py

Create app.py

e7d39a8 verified 2 months ago

1.3 kB

	import gradio as gr
	import tiktoken
	import json

	# Function to load JSON/JSONL file and count tokens for each entry
	def _load_records(path):
	    """Parse the JSON or JSONL file at *path* and return its records as a list."""
	    # 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order
	    # mark, which json would otherwise reject.
	    with open(path, 'r', encoding='utf-8-sig') as f:
	        if path.lower().endswith('.jsonl'):
	            # Skip blank lines (e.g. a trailing newline at end of file);
	            # json.loads('') would raise.
	            return [json.loads(line) for line in f if line.strip()]
	        data = json.load(f)

	    # A top-level object or scalar is a single record. Iterating a dict
	    # directly would walk its keys, not its contents.
	    return data if isinstance(data, list) else [data]


	def count_tokens(json_file, encoding_name):
	    """Count tokens for every record of a JSON or JSONL file.

	    Files whose name ends in ``.jsonl`` (case-insensitive) are parsed one
	    JSON document per non-blank line; any other file is parsed as a single
	    JSON document. A top-level value that is not a list is treated as one
	    record.

	    For a dict record the text is its string values joined by single
	    spaces (non-string values are ignored); any other record is converted
	    with ``str()``.

	    Args:
	        json_file: Either a filesystem path given as ``str`` or an object
	            exposing the path through a ``.name`` attribute.
	        encoding_name: Encoding name handed to ``tiktoken.get_encoding``.

	    Returns:
	        A list with one ``{'text': ..., 'token_count': ...}`` dict per
	        record, in file order.

	    Raises:
	        ValueError: If ``json_file`` is ``None``.
	    """
	    if json_file is None:
	        raise ValueError("No file provided: upload a JSON or JSONL file first.")

	    encoding = tiktoken.get_encoding(encoding_name)
	    path = json_file if isinstance(json_file, str) else json_file.name

	    token_counts = []
	    for item in _load_records(path):
	        if isinstance(item, dict):
	            text = ' '.join(v for v in item.values() if isinstance(v, str))
	        else:
	            text = str(item)

	        # disallowed_special=() makes special-token text such as
	        # "<|endoftext|>" inside a record count as ordinary text instead
	        # of raising ValueError.
	        num_tokens = len(encoding.encode(text, disallowed_special=()))
	        token_counts.append({
	            'text': text,
	            'token_count': num_tokens,
	        })

	    return token_counts

	# Gradio interface
	def token_counter(json_file, encoding_name):
	    """Callback wired into the Gradio interface.

	    Forwards the uploaded file and the selected encoding name to
	    ``count_tokens`` and returns its result unchanged.
	    """
	    return count_tokens(json_file, encoding_name)

	# Gradio UI
	# Build the web UI (file upload + encoding selector in, JSON viewer out)
	# and start serving it immediately.
	gr.Interface(
	    fn=token_counter,
	    inputs=[
	        gr.File(label="Upload JSON/JSONL File"),
	        gr.Dropdown(
	            choices=["r50k_base", "p50k_base", "cl100k_base", "o200k_base"],
	            value="cl100k_base",
	            label="Select Encoding",
	        ),
	    ],
	    outputs=gr.JSON(label="Token Counts"),
	).launch()