AlGe committed on
Commit
0be981f
·
verified ·
1 Parent(s): 0b052a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -5
app.py CHANGED
@@ -55,20 +55,40 @@ def generate_colored_html(tokens, decoded_tokens):
55
  return html_output
56
 
57
  def tokenize_text(text, tokenizer_name):
58
- if tokenizer_name.split("/")[0]=="Xenova":
59
  tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_name)
60
- else :
61
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
 
62
  tokens = tokenizer.encode(text, add_special_tokens=True)
63
  decoded_tokens = [tokenizer.decode(token) for token in tokens]
64
  html_output = generate_colored_html(tokens, decoded_tokens)
65
- return html_output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  def compare_tokenizers(text, selected_tokenizers):
68
  results = ""
69
  for tokenizer_name in selected_tokenizers:
70
  results += f"<h3>{tokenizer_name}</h3>"
71
- results += tokenize_text(text, tokenizer_name)
 
 
72
  results += "<hr>"
73
  return results
74
 
@@ -81,7 +101,7 @@ iface = gr.Interface(
81
  ],
82
  outputs=gr.HTML(label="Tokenization Results"),
83
  title="Tokenizer Comparison",
84
- description="Compare tokenization results from different tokenizers.",
85
  examples=[
86
  ["Allergies: \nNo Known Allergies / Adverse Drug Reactions\n\nChief Complaint: \nRight hand erythema/edema\n\nHistory of Present Illness: \nThe patient is a ___ y/o M with PMHx significant", ["allenai/longformer-base-4096", "gpt2"]],
87
  ["In the ED, initial vitals: 98.3 80 144/90 18 96% ra \nLabs notable for: Leukocytosis to 12.5. The patient endorsed \nsevere R hand tenderness but otherwise denied any fevers, \nchills, nausea, vomiting, numbness, tingling or weakness. Hand \nsurgery was consulted and recommended admission to medicine for \nIV antibiotics, possible ultrasound to evaluate for \nthrombophlebitis and pain control. In the ED, patient received \nZosyn and IV dilaudid 0.5mg x1.\n\nUpon arrival to the floor, patient endorses R hand pain- stable \nfrom prior. \n\nROS: Per HPI \n\n\nPast Medical History: \nCrohn's disease\nAnal fissure\nh/o DVT on coumadin\nRotator cuff repair in ___\n\n\nFamily History: \nNo family h/o IBD\n\n\nMedications on Admission: EXAMINATION: UNILAT UP EXT VEINS US RIGHT\n\nINDICATION: ___ year old man with concern for R hand cellulitis vs.\nthrombophlebitis ", ["roberta-base", "allenai/longformer-base-4096"]],
 
55
  return html_output
56
 
57
  def tokenize_text(text, tokenizer_name):
58
+ if tokenizer_name.split("/")[0] == "Xenova":
59
  tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_name)
60
+ else:
61
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
62
+
63
  tokens = tokenizer.encode(text, add_special_tokens=True)
64
  decoded_tokens = [tokenizer.decode(token) for token in tokens]
65
  html_output = generate_colored_html(tokens, decoded_tokens)
66
+
67
+ # Calculate tokenizer metrics
68
+ total_tokens = len(tokens)
69
+ unique_tokens = len(set(tokens))
70
+ vocab_size = len(tokenizer.get_vocab())
71
+ avg_token_length = sum(len(t) for t in decoded_tokens) / total_tokens if total_tokens > 0 else 0
72
+
73
+ metrics = f"""
74
+ <h4>Tokenizer Metrics:</h4>
75
+ <ul>
76
+ <li>Total Tokens: {total_tokens}</li>
77
+ <li>Unique Tokens: {unique_tokens}</li>
78
+ <li>Vocabulary Size: {vocab_size}</li>
79
+ <li>Average Token Length: {avg_token_length:.2f} characters</li>
80
+ </ul>
81
+ """
82
+
83
+ return html_output, metrics
84
 
85
  def compare_tokenizers(text, selected_tokenizers):
86
  results = ""
87
  for tokenizer_name in selected_tokenizers:
88
  results += f"<h3>{tokenizer_name}</h3>"
89
+ html_output, metrics = tokenize_text(text, tokenizer_name)
90
+ results += html_output
91
+ results += metrics
92
  results += "<hr>"
93
  return results
94
 
 
101
  ],
102
  outputs=gr.HTML(label="Tokenization Results"),
103
  title="Tokenizer Comparison",
104
+ description="Compare tokenization results and metrics from different tokenizers.",
105
  examples=[
106
  ["Allergies: \nNo Known Allergies / Adverse Drug Reactions\n\nChief Complaint: \nRight hand erythema/edema\n\nHistory of Present Illness: \nThe patient is a ___ y/o M with PMHx significant", ["allenai/longformer-base-4096", "gpt2"]],
107
  ["In the ED, initial vitals: 98.3 80 144/90 18 96% ra \nLabs notable for: Leukocytosis to 12.5. The patient endorsed \nsevere R hand tenderness but otherwise denied any fevers, \nchills, nausea, vomiting, numbness, tingling or weakness. Hand \nsurgery was consulted and recommended admission to medicine for \nIV antibiotics, possible ultrasound to evaluate for \nthrombophlebitis and pain control. In the ED, patient received \nZosyn and IV dilaudid 0.5mg x1.\n\nUpon arrival to the floor, patient endorses R hand pain- stable \nfrom prior. \n\nROS: Per HPI \n\n\nPast Medical History: \nCrohn's disease\nAnal fissure\nh/o DVT on coumadin\nRotator cuff repair in ___\n\n\nFamily History: \nNo family h/o IBD\n\n\nMedications on Admission: EXAMINATION: UNILAT UP EXT VEINS US RIGHT\n\nINDICATION: ___ year old man with concern for R hand cellulitis vs.\nthrombophlebitis ", ["roberta-base", "allenai/longformer-base-4096"]],