Spaces:
Sleeping
Sleeping
File size: 3,478 Bytes
e7d39a8 169e43b e7d39a8 0878173 ee3031b e7d39a8 0cc0797 ee3031b 0cc0797 0878173 1476a99 e7d39a8 0878173 1476a99 0878173 1476a99 e7d39a8 0878173 e7d39a8 f825e7c e7d39a8 0878173 ee3031b f825e7c ee3031b f825e7c e7d39a8 0878173 f825e7c 0cc0797 f825e7c 0cc0797 f825e7c 169e43b 51a3da8 f825e7c 47925da ee3031b f825e7c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import gradio as gr
import tiktoken
import json
import os
# Function to count tokens in the dataset based on the "messages" field
def count_tokens(json_file, encoding_name):
    """Count tiktoken tokens for each conversation in a JSON/JSONL dataset.

    Args:
        json_file: Uploaded file object (Gradio ``File``) exposing a ``.name``
            path. A ``.json`` file is parsed as one document; anything else is
            treated as JSONL (one JSON object per line).
        encoding_name: Name of a tiktoken encoding, e.g. ``"o200k_base"``.

    Returns:
        tuple: ``(token_counts, total_token_count)`` where ``token_counts`` is
        a list of dicts with the joined conversation text and its token count,
        and ``total_token_count`` is the sum over all conversations.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Load the JSON or JSONL data. Blank lines (e.g. a trailing newline in a
    # JSONL file) are skipped — json.loads("") would raise otherwise.
    with open(json_file.name, 'r', encoding='utf-8') as f:
        if json_file.name.endswith('.json'):
            data = json.load(f)
        else:
            data = [json.loads(line) for line in f if line.strip()]
    total_token_count = 0
    token_counts = []
    for entry in data:
        conversation_token_count = 0
        conversation_texts = []
        if "messages" in entry:
            for message in entry["messages"]:
                # "content" may be an explicit null in some datasets;
                # `or ""` covers both a missing key and a None value.
                content = message.get("content") or ""
                conversation_texts.append(content)
                conversation_token_count += len(encoding.encode(content))
        # Add conversation token count to the total
        total_token_count += conversation_token_count
        token_counts.append({
            'conversation': ' '.join(conversation_texts),
            'token_count': conversation_token_count
        })
    return token_counts, total_token_count
# Gradio interface function
def token_counter(json_file, encoding_with_model):
    """Gradio callback: resolve the encoding name, then delegate to count_tokens.

    The dropdown value looks like "o200k_base (gpt-4o, ...)"; the tiktoken
    encoding name is everything before the first space.
    """
    encoding_name, _, _ = encoding_with_model.partition(' ')
    # Returns (per-conversation token data, total token count) unchanged.
    return count_tokens(json_file, encoding_name)
# Dropdown choices, formatted as "<encoding_name> (<models using it>)".
# The callback extracts the encoding name from the text before the first space.
encoding_options = [
    "o200k_base (gpt-4o, gpt-4o-mini)",
    "cl100k_base (gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large)",
    "p50k_base (Codex models, text-davinci-002, text-davinci-003)",
    "r50k_base (GPT-3 models like davinci)",
]
# Gradio UI setup
# Builds the Blocks layout, wires the submit button to token_counter,
# and launches the app. All components are created inside the Blocks
# context so Gradio registers them with this app.
with gr.Blocks() as app:
    gr.Markdown("# Token Counter for JSON/JSONL Datasets")
    with gr.Row():
        # File upload and encoding selector side by side; the dropdown
        # default matches the first entry of encoding_options.
        json_input = gr.File(label="Upload JSON/JSONL File")
        encoding_dropdown = gr.Dropdown(choices=encoding_options, label="Select Encoding", value="o200k_base (gpt-4o, gpt-4o-mini)")
    # Example file (this will automatically upload when clicked)
    example_file_path = "keivalyaMedQuad-MedicalQnADataset_valid.jsonl"
    # Only offer the example when the file actually ships with the app,
    # so a missing file does not break the UI.
    if os.path.exists(example_file_path):
        example = gr.Examples(
            examples=[example_file_path],
            inputs=json_input,
            label="Click here to load the example file"
        )
    # Display credits for the dataset author
    gr.Markdown("### Dataset Credits")
    gr.Markdown(
        """
This dataset is provided by the [MedQuad-MedicalQnADataset](https://huggingface.co/datasets/keivalya/MedQuad-MedicalQnADataset) on Hugging Face.
All credit goes to the original creator, [keivalya](https://huggingface.co/keivalya).
"""
    )
    # Output for individual conversation token counts
    conversation_output = gr.JSON(label="Token Counts per Conversation")
    # Output for total token count
    total_output = gr.Number(label="Total Token Count", interactive=False)
    # Add a submit button to trigger token counting
    submit_button = gr.Button("Submit")
    # Link the button click event to the token counting function
    submit_button.click(token_counter, [json_input, encoding_dropdown], [conversation_output, total_output])
# Launch the app
app.launch()
|