import gradio as gr
import tiktoken
import json
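
# Expected dataset layout: each entry carries a "messages" list in chat format, e.g.
# {"messages": [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]}
# (illustrative example; the actual role/content values come from the uploaded file)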

# Function to count tokens in the dataset based on the "messages" field
def count_tokens(json_file, encoding_name):
    encoding = tiktoken.get_encoding(encoding_name)
    
    # Load the JSON or JSONL data (blank lines in JSONL files are skipped)
    with open(json_file.name, 'r') as f:
        if json_file.name.endswith('.json'):
            data = json.load(f)
        else:
            data = [json.loads(line) for line in f if line.strip()]

    total_token_count = 0
    token_counts = []
    for entry in data:
        conversation_token_count = 0
        conversation_texts = []
        if "messages" in entry:
            for message in entry["messages"]:
                content = message.get("content") or ""  # treat missing or null content as empty
                conversation_texts.append(content)
                tokens = len(encoding.encode(content))
                conversation_token_count += tokens

        # Add conversation token count to the total
        total_token_count += conversation_token_count
        
        token_counts.append({
            'conversation': ' '.join(conversation_texts),
            'token_count': conversation_token_count
        })
    
    return token_counts, total_token_count
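
# Quick sanity check outside the UI (illustrative sketch; assumes a local "sample.jsonl" exists):
#   from types import SimpleNamespace
#   counts, total = count_tokens(SimpleNamespace(name="sample.jsonl"), "o200k_base")
#   print(f"{len(counts)} conversations, {total} tokens")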

# Gradio interface function
def token_counter(json_file, encoding_with_model):
    # Extract the encoding name (the token before the parenthesized model list)
    encoding_name = encoding_with_model.split()[0]
    
    # Get token counts
    token_data, total_token_count = count_tokens(json_file, encoding_name)
    
    return token_data, total_token_count

# Define the encoding choices with model information
encoding_options = [
    "o200k_base (gpt-4o, gpt-4o-mini)",
    "cl100k_base (gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large)",
    "p50k_base (Codex models, text-davinci-002, text-davinci-003)",
    "r50k_base (GPT-3 models like davinci)"
]

# Gradio UI setup
with gr.Blocks() as app:
    gr.Markdown("# Token Counter for JSON/JSONL Datasets")
    
    with gr.Row():
        json_input = gr.File(label="Upload JSON/JSONL File")
        encoding_dropdown = gr.Dropdown(choices=encoding_options, label="Select Encoding", value="o200k_base (gpt-4o, gpt-4o-mini)")
    
    # Output for individual conversation token counts
    conversation_output = gr.JSON(label="Token Counts per Conversation")
    
    # Output for total token count
    total_output = gr.Number(label="Total Token Count", interactive=False)
    
    # Add a submit button to trigger token counting
    submit_button = gr.Button("Submit")
    
    # Link the button click event to the token counting function
    submit_button.click(token_counter, [json_input, encoding_dropdown], [conversation_output, total_output])

# Launch the app
app.launch()
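
# Note: launch() serves on localhost by default; app.launch(share=True) would also
# create a temporary public Gradio link (useful when testing on a remote machine).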