Spaces:
Sleeping
Sleeping
File size: 3,478 Bytes
e7d39a8 169e43b e7d39a8 0878173 ee3031b e7d39a8 0cc0797 ee3031b 0cc0797 0878173 1476a99 e7d39a8 0878173 1476a99 0878173 1476a99 e7d39a8 0878173 e7d39a8 f825e7c e7d39a8 0878173 ee3031b f825e7c ee3031b f825e7c e7d39a8 0878173 f825e7c 0cc0797 f825e7c 0cc0797 f825e7c 169e43b 51a3da8 f825e7c 47925da ee3031b f825e7c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import gradio as gr
import tiktoken
import json
import os
# Function to count tokens in the dataset based on the "messages" field
def count_tokens(json_file, encoding_name):
    """Count tiktoken tokens for each conversation in a JSON/JSONL dataset.

    Args:
        json_file: Uploaded file object (Gradio ``File``) exposing a ``.name``
            path. A ``.json`` file is parsed as one document; anything else is
            treated as JSONL (one JSON object per line).
        encoding_name: Name of a tiktoken encoding, e.g. ``"o200k_base"``.

    Returns:
        tuple: ``(token_counts, total_token_count)`` where ``token_counts`` is
        a list of dicts with the joined conversation text and its token count,
        and ``total_token_count`` is the sum over all conversations.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Load the JSON or JSONL data. Blank lines (e.g. a trailing newline in a
    # JSONL file) are skipped — json.loads("") would raise otherwise.
    with open(json_file.name, 'r', encoding='utf-8') as f:
        if json_file.name.endswith('.json'):
            data = json.load(f)
        else:
            data = [json.loads(line) for line in f if line.strip()]
    total_token_count = 0
    token_counts = []
    for entry in data:
        conversation_token_count = 0
        conversation_texts = []
        if "messages" in entry:
            for message in entry["messages"]:
                # "content" may be an explicit null in some datasets;
                # `or ""` covers both a missing key and a None value.
                content = message.get("content") or ""
                conversation_texts.append(content)
                conversation_token_count += len(encoding.encode(content))
        # Add conversation token count to the total
        total_token_count += conversation_token_count
        token_counts.append({
            'conversation': ' '.join(conversation_texts),
            'token_count': conversation_token_count
        })
    return token_counts, total_token_count
# Gradio interface function
def token_counter(json_file, encoding_with_model):
    """Gradio callback: resolve the encoding name, then delegate to count_tokens.

    The dropdown value looks like "o200k_base (gpt-4o, ...)"; the tiktoken
    encoding name is everything before the first space.
    """
    encoding_name, _, _ = encoding_with_model.partition(' ')
    # Returns (per-conversation token data, total token count) unchanged.
    return count_tokens(json_file, encoding_name)
# Dropdown choices, formatted as "<encoding_name> (<models using it>)".
# The callback extracts the encoding name from the text before the first space.
encoding_options = [
    "o200k_base (gpt-4o, gpt-4o-mini)",
    "cl100k_base (gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large)",
    "p50k_base (Codex models, text-davinci-002, text-davinci-003)",
    "r50k_base (GPT-3 models like davinci)",
]
# Gradio UI setup
# Builds the Blocks layout, wires the submit button to token_counter,
# and launches the app. All components are created inside the Blocks
# context so Gradio registers them with this app.
with gr.Blocks() as app:
    gr.Markdown("# Token Counter for JSON/JSONL Datasets")
    with gr.Row():
        # File upload and encoding selector side by side; the dropdown
        # default matches the first entry of encoding_options.
        json_input = gr.File(label="Upload JSON/JSONL File")
        encoding_dropdown = gr.Dropdown(choices=encoding_options, label="Select Encoding", value="o200k_base (gpt-4o, gpt-4o-mini)")
    # Example file (this will automatically upload when clicked)
    example_file_path = "keivalyaMedQuad-MedicalQnADataset_valid.jsonl"
    # Only offer the example when the file actually ships with the app,
    # so a missing file does not break the UI.
    if os.path.exists(example_file_path):
        example = gr.Examples(
            examples=[example_file_path],
            inputs=json_input,
            label="Click here to load the example file"
        )
    # Display credits for the dataset author
    gr.Markdown("### Dataset Credits")
    gr.Markdown(
        """
This dataset is provided by the [MedQuad-MedicalQnADataset](https://huggingface.co/datasets/keivalya/MedQuad-MedicalQnADataset) on Hugging Face.
All credit goes to the original creator, [keivalya](https://huggingface.co/keivalya).
"""
    )
    # Output for individual conversation token counts
    conversation_output = gr.JSON(label="Token Counts per Conversation")
    # Output for total token count
    total_output = gr.Number(label="Total Token Count", interactive=False)
    # Add a submit button to trigger token counting
    submit_button = gr.Button("Submit")
    # Link the button click event to the token counting function
    submit_button.click(token_counter, [json_input, encoding_dropdown], [conversation_output, total_output])
# Launch the app
app.launch()
|