Spaces:

luminoussg
/

token_counter

Sleeping

luminoussg committed on Oct 16, 2024

Commit

0878173

verified ·

1 Parent(s): 1232554

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,36 +2,37 @@ import gradio as gr
 import tiktoken
 import json
-# Function to load JSON/JSONL file and count tokens for each entry
 def count_tokens(json_file, encoding_name):
     encoding = tiktoken.get_encoding(encoding_name)
     # Load the JSON or JSONL data
     with open(json_file.name, 'r') as f:
-        data = [json.loads(line) for line in f.readlines()] if json_file.name.endswith('.jsonl') else json.load(f)
-    # Token counting for each item in the dataset
     token_counts = []
-    for item in data:
-        if isinstance(item, dict):
-            text = ' '.join([str(v) for v in item.values() if isinstance(v, str)])
-        else:
-            text = str(item)
-        num_tokens = len(encoding.encode(text))
         token_counts.append({
-            'text': text,
-            'token_count': num_tokens
         })
     return token_counts
-# Gradio interface
 def token_counter(json_file, encoding_name):
     token_data = count_tokens(json_file, encoding_name)
     return token_data
-# Gradio UI
 gr.Interface(
     fn=token_counter,
     inputs=[

 import tiktoken
 import json
+# Function to count tokens in the dataset based on the "messages" field
 def count_tokens(json_file, encoding_name):
     encoding = tiktoken.get_encoding(encoding_name)
     # Load the JSON or JSONL data
     with open(json_file.name, 'r') as f:
+        data = json.load(f) if json_file.name.endswith('.json') else [json.loads(line) for line in f.readlines()]
     token_counts = []
+    for entry in data:
+        conversation_token_count = 0
+        conversation_texts = []
+        if "messages" in entry:
+            for message in entry["messages"]:
+                content = message.get("content", "")
+                conversation_texts.append(content)
+                conversation_token_count += len(encoding.encode(content))
         token_counts.append({
+            'conversation': ' '.join(conversation_texts),
+            'token_count': conversation_token_count
         })
     return token_counts
+# Gradio interface function
 def token_counter(json_file, encoding_name):
     token_data = count_tokens(json_file, encoding_name)
     return token_data
+# Gradio UI setup
 gr.Interface(
     fn=token_counter,
     inputs=[