import gradio as gr
import tiktoken
import json
import os

# Function to count tokens in the dataset based on the "messages" field
def count_tokens(json_file, encoding_name):
    encoding = tiktoken.get_encoding(encoding_name)
    
    # Load the JSON or JSONL data (JSONL: one JSON object per line)
    with open(json_file.name, 'r') as f:
        if json_file.name.endswith('.json'):
            data = json.load(f)
        else:
            data = [json.loads(line) for line in f if line.strip()]

    total_token_count = 0
    token_counts = []
    for entry in data:
        conversation_token_count = 0
        conversation_texts = []
        if "messages" in entry:
            for message in entry["messages"]:
                # Treat missing or null content as an empty string
                content = message.get("content") or ""
                conversation_texts.append(content)
                tokens = len(encoding.encode(content))
                conversation_token_count += tokens

        # Add conversation token count to the total
        total_token_count += conversation_token_count
        
        token_counts.append({
            'conversation': ' '.join(conversation_texts),
            'token_count': conversation_token_count
        })
    
    return token_counts, total_token_count

# Gradio interface function
def token_counter(json_file, encoding_with_model):
    # Extract the encoding name (the part before the model list) from the dropdown label
    encoding_name = encoding_with_model.split()[0]
    
    # Get token counts
    token_data, total_token_count = count_tokens(json_file, encoding_name)
    
    return token_data, total_token_count

# Define the encoding choices with model information
encoding_options = [
    "o200k_base (gpt-4o, gpt-4o-mini)",
    "cl100k_base (gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large)",
    "p50k_base (Codex models, text-davinci-002, text-davinci-003)",
    "r50k_base (GPT-3 models like davinci)"
]

# Gradio UI setup
with gr.Blocks() as app:
    gr.Markdown("# Token Counter for JSON/JSONL Datasets")
    
    with gr.Row():
        json_input = gr.File(label="Upload JSON/JSONL File")
        encoding_dropdown = gr.Dropdown(choices=encoding_options, label="Select Encoding", value="o200k_base (gpt-4o, gpt-4o-mini)")
    
    # Example file (clicking it loads the file into the upload input)
    example_file_path = "keivalyaMedQuad-MedicalQnADataset_valid.jsonl"
    if os.path.exists(example_file_path):
        example = gr.Examples(
            examples=[example_file_path],
            inputs=json_input,
            label="Click here to load the example file"
        )
    
    # Display credits for the dataset author
    gr.Markdown("### Dataset Credits")
    gr.Markdown(
        """
        This dataset is provided by the [MedQuad-MedicalQnADataset](https://huggingface.co/datasets/keivalya/MedQuad-MedicalQnADataset) on Hugging Face.
        All credit goes to the original creator, [keivalya](https://huggingface.co/keivalya).
        """
    )
    
    # Output for individual conversation token counts
    conversation_output = gr.JSON(label="Token Counts per Conversation")
    
    # Output for total token count
    total_output = gr.Number(label="Total Token Count", interactive=False)
    
    # Add a submit button to trigger token counting
    submit_button = gr.Button("Submit")
    
    # Link the button click event to the token counting function
    submit_button.click(token_counter, [json_input, encoding_dropdown], [conversation_output, total_output])

# Launch the app
app.launch()