Prgckwb committed on
Commit
0a485e6
·
1 Parent(s): d9d3f4b

:tada: init

Browse files
Files changed (1) hide show
  1. app.py +32 -8
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import torch
3
  from diffusers import DiffusionPipeline
4
  from transformers import AutoTokenizer, CLIPTokenizerFast, T5TokenizerFast
5
-
6
 
7
  def load_tokenizers(model_id: str) -> list[CLIPTokenizerFast | T5TokenizerFast | None]:
8
  config = DiffusionPipeline.load_config(model_id)
@@ -25,19 +25,20 @@ def load_tokenizers(model_id: str) -> list[CLIPTokenizerFast | T5TokenizerFast |
25
 
26
 
27
  @torch.no_grad()
28
- def inference(model_id: str, input_text: str):
29
  tokenizers = load_tokenizers(model_id)
30
 
31
  text_pairs_components = []
32
  special_tokens_components = []
 
33
  for i, tokenizer in enumerate(tokenizers):
34
  if tokenizer:
35
  label_text = f"Tokenizer {i + 1}: {tokenizer.__class__.__name__}"
36
 
37
  # テキストとトークンIDのペアを作成
38
  input_ids = tokenizer(
39
- text=input_text,
40
- truncation=True,
41
  return_length=False,
42
  return_overflowing_tokens=False,
43
  ).input_ids
@@ -49,7 +50,6 @@ def inference(model_id: str, input_text: str):
49
  label=label_text,
50
  value=token_pairs,
51
  visible=True,
52
- show_legend=True,
53
  )
54
 
55
  # スペシャルトークンを追加
@@ -63,16 +63,32 @@ def inference(model_id: str, input_text: str):
63
  label=label_text,
64
  value=special_tokens,
65
  visible=True,
66
- show_legend=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  )
68
  else:
69
  output_text_pair_component = gr.HighlightedText(visible=False)
70
  output_special_tokens_component = gr.HighlightedText(visible=False)
 
71
 
72
  text_pairs_components.append(output_text_pair_component)
73
  special_tokens_components.append(output_special_tokens_component)
 
74
 
75
- return text_pairs_components + special_tokens_components
76
 
77
 
78
  if __name__ == "__main__":
@@ -110,6 +126,11 @@ if __name__ == "__main__":
110
  output_special_tokens_1 = gr.HighlightedText()
111
  output_special_tokens_2 = gr.HighlightedText()
112
  output_special_tokens_3 = gr.HighlightedText()
 
 
 
 
 
113
 
114
  with gr.Row():
115
  clear_button = gr.ClearButton(components=[input_text])
@@ -123,6 +144,9 @@ if __name__ == "__main__":
123
  output_special_tokens_1,
124
  output_special_tokens_2,
125
  output_special_tokens_3,
 
 
 
126
  ]
127
  submit_button.click(fn=inference, inputs=all_inputs, outputs=all_output)
128
 
@@ -141,4 +165,4 @@ if __name__ == "__main__":
141
  cache_examples=True,
142
  )
143
 
144
- demo.queue().launch()
 
2
  import torch
3
  from diffusers import DiffusionPipeline
4
  from transformers import AutoTokenizer, CLIPTokenizerFast, T5TokenizerFast
5
+ import pandas as pd
6
 
7
  def load_tokenizers(model_id: str) -> list[CLIPTokenizerFast | T5TokenizerFast | None]:
8
  config = DiffusionPipeline.load_config(model_id)
 
25
 
26
 
27
  @torch.no_grad()
28
+ def inference(model_id: str, text: str):
29
  tokenizers = load_tokenizers(model_id)
30
 
31
  text_pairs_components = []
32
  special_tokens_components = []
33
+ tokenizer_details_components = []
34
  for i, tokenizer in enumerate(tokenizers):
35
  if tokenizer:
36
  label_text = f"Tokenizer {i + 1}: {tokenizer.__class__.__name__}"
37
 
38
  # テキストとトークンIDのペアを作成
39
  input_ids = tokenizer(
40
+ text=text,
41
+ truncation=False,
42
  return_length=False,
43
  return_overflowing_tokens=False,
44
  ).input_ids
 
50
  label=label_text,
51
  value=token_pairs,
52
  visible=True,
 
53
  )
54
 
55
  # スペシャルトークンを追加
 
63
  label=label_text,
64
  value=special_tokens,
65
  visible=True,
66
+ )
67
+
68
+ # トークナイザーの詳細情報を追加
69
+ tokenizer_details = pd.DataFrame([
70
+ ("Type", tokenizer.__class__.__name__),
71
+ ("Vocab Size", tokenizer.vocab_size),
72
+ ("Model Max Length", tokenizer.model_max_length),
73
+ ("Padding Side", tokenizer.padding_side),
74
+ ("Truncation Side", tokenizer.truncation_side),
75
+ ], columns=["Attribute", "Value"])
76
+ output_tokenizer_details = gr.Dataframe(
77
+ headers=["Attribute", "Value"],
78
+ value=tokenizer_details,
79
+ label=label_text,
80
+ visible=True,
81
  )
82
  else:
83
  output_text_pair_component = gr.HighlightedText(visible=False)
84
  output_special_tokens_component = gr.HighlightedText(visible=False)
85
+ output_tokenizer_details = gr.Dataframe(visible=False)
86
 
87
  text_pairs_components.append(output_text_pair_component)
88
  special_tokens_components.append(output_special_tokens_component)
89
+ tokenizer_details_components.append(output_tokenizer_details)
90
 
91
+ return text_pairs_components + special_tokens_components + tokenizer_details_components
92
 
93
 
94
  if __name__ == "__main__":
 
126
  output_special_tokens_1 = gr.HighlightedText()
127
  output_special_tokens_2 = gr.HighlightedText()
128
  output_special_tokens_3 = gr.HighlightedText()
129
+ with gr.Tab(label="Tokenizer Details"):
130
+ with gr.Column():
131
+ output_tokenizer_details_1 = gr.Dataframe(headers=["Attribute", "Value"])
132
+ output_tokenizer_details_2 = gr.Dataframe(headers=["Attribute", "Value"])
133
+ output_tokenizer_details_3 = gr.Dataframe(headers=["Attribute", "Value"])
134
 
135
  with gr.Row():
136
  clear_button = gr.ClearButton(components=[input_text])
 
144
  output_special_tokens_1,
145
  output_special_tokens_2,
146
  output_special_tokens_3,
147
+ output_tokenizer_details_1,
148
+ output_tokenizer_details_2,
149
+ output_tokenizer_details_3,
150
  ]
151
  submit_button.click(fn=inference, inputs=all_inputs, outputs=all_output)
152
 
 
165
  cache_examples=True,
166
  )
167
 
168
+ demo.queue().launch()