CISCai committed on
Commit
ad78086
·
verified ·
1 Parent(s): afe8ab5

Implemented edit support for tokenizer.ggml.scores and token_types

Browse files
Files changed (3) hide show
  1. _hf_explorer.py +2 -2
  2. _hf_gguf.py +50 -0
  3. app.py +207 -26
_hf_explorer.py CHANGED
@@ -33,7 +33,7 @@ class FileExplorer(Component):
33
  value: str | list[str] | Callable | None = None,
34
  file_count: Literal["single", "multiple"] = "multiple",
35
  root_dir: str = None,
36
- branch: str = "main",
37
  token: str | None = None,
38
  ignore_glob: str | None = None,
39
  label: str | None = None,
@@ -76,7 +76,7 @@ class FileExplorer(Component):
76
  key: if assigned, will be used to assume identity across a re-render. Components that have the same key across a re-render will have their value preserved.
77
  """
78
  self.root_dir = root_dir
79
- self.branch = branch
80
  self.fs = HfFileSystem(token = token)
81
  self.glob = glob
82
  self.ignore_glob = ignore_glob
 
33
  value: str | list[str] | Callable | None = None,
34
  file_count: Literal["single", "multiple"] = "multiple",
35
  root_dir: str = None,
36
+ branch: str | None = None,
37
  token: str | None = None,
38
  ignore_glob: str | None = None,
39
  label: str | None = None,
 
76
  key: if assigned, will be used to assume identity across a re-render. Components that have the same key across a re-render will have their value preserved.
77
  """
78
  self.root_dir = root_dir
79
+ self.branch = branch or "main"
80
  self.fs = HfFileSystem(token = token)
81
  self.glob = glob
82
  self.ignore_glob = ignore_glob
_hf_gguf.py CHANGED
@@ -4,6 +4,56 @@ from fsspec.spec import AbstractBufferedFile
4
  from typing import Any, Iterator, NamedTuple
5
 
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  class GGUFValueType(IntEnum):
8
  UINT8 = 0
9
  INT8 = 1
 
4
  from typing import Any, Iterator, NamedTuple
5
 
6
 
7
+ class TokenType(IntEnum):
8
+ NORMAL = 1
9
+ UNKNOWN = 2
10
+ CONTROL = 3
11
+ USER_DEFINED = 4
12
+ UNUSED = 5
13
+ BYTE = 6
14
+
15
+
16
+ class LlamaFileType(IntEnum):
17
+ ALL_F32 = 0
18
+ MOSTLY_F16 = 1
19
+ MOSTLY_Q4_0 = 2
20
+ MOSTLY_Q4_1 = 3
21
+ MOSTLY_Q4_1_SOME_F16 = 4
22
+ MOSTLY_Q4_2 = 5
23
+ MOSTLY_Q4_3 = 6
24
+ MOSTLY_Q8_0 = 7
25
+ MOSTLY_Q5_0 = 8
26
+ MOSTLY_Q5_1 = 9
27
+ MOSTLY_Q2_K = 10
28
+ MOSTLY_Q3_K_S = 11
29
+ MOSTLY_Q3_K_M = 12
30
+ MOSTLY_Q3_K_L = 13
31
+ MOSTLY_Q4_K_S = 14
32
+ MOSTLY_Q4_K_M = 15
33
+ MOSTLY_Q5_K_S = 16
34
+ MOSTLY_Q5_K_M = 17
35
+ MOSTLY_Q6_K = 18
36
+ MOSTLY_IQ2_XXS = 19
37
+ MOSTLY_IQ2_XS = 20
38
+ MOSTLY_Q2_K_S = 21
39
+ MOSTLY_IQ3_XS = 22
40
+ MOSTLY_IQ3_XXS = 23
41
+ MOSTLY_IQ1_S = 24
42
+ MOSTLY_IQ4_NL = 25
43
+ MOSTLY_IQ3_S = 26
44
+ MOSTLY_IQ3_M = 27
45
+ MOSTLY_IQ2_S = 28
46
+ MOSTLY_IQ2_M = 29
47
+ MOSTLY_IQ4_XS = 30
48
+ MOSTLY_IQ1_M = 31
49
+ MOSTLY_BF16 = 32
50
+ MOSTLY_Q4_0_4_4 = 33
51
+ MOSTLY_Q4_0_4_8 = 34
52
+ MOSTLY_Q4_0_8_8 = 35
53
+ MOSTLY_TQ1_0 = 36
54
+ MOSTLY_TQ2_0 = 37
55
+
56
+
57
  class GGUFValueType(IntEnum):
58
  UINT8 = 0
59
  INT8 = 1
app.py CHANGED
@@ -9,7 +9,7 @@ from typing import Annotated, Any, NamedTuple
9
  from urllib.parse import urlencode
10
 
11
  from _hf_explorer import FileExplorer
12
- from _hf_gguf import standard_metadata, GGUFValueType, HuggingGGUFstream
13
 
14
 
15
  hfapi = HfApi()
@@ -49,6 +49,14 @@ def human_readable_metadata(
49
  val = str(val[:8])[:-1] + ', ...]'
50
  else:
51
  val = str(val)
 
 
 
 
 
 
 
 
52
  elif key.endswith('_token_id'):
53
  tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
54
 
@@ -113,8 +121,23 @@ with gr.Blocks(
113
  )
114
 
115
  with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  meta_lookup = gr.Dropdown(
117
  label = 'Lookup token',
 
118
  allow_custom_value = True,
119
  visible = False,
120
  )
@@ -169,6 +192,8 @@ with gr.Blocks(
169
  # BUG: For some reason using gr.State initial value turns tuple to list?
170
  meta_state.value = init_state()
171
 
 
 
172
  file_change_components = [
173
  meta_changes,
174
  file_meta,
@@ -400,6 +425,8 @@ with gr.Blocks(
400
  ],
401
  outputs = [
402
  meta_boolean,
 
 
403
  meta_lookup,
404
  meta_number,
405
  meta_string,
@@ -420,7 +447,21 @@ with gr.Blocks(
420
  elif not key:
421
  typ = None
422
 
423
- if isinstance(val, list):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
  # TODO: Support arrays?
425
  typ = GGUFValueType.ARRAY
426
 
@@ -435,15 +476,25 @@ with gr.Blocks(
435
  value = val if typ == GGUFValueType.BOOL and data is not None else False,
436
  visible = True if typ == GGUFValueType.BOOL else False,
437
  ),
 
 
 
 
 
 
 
 
 
438
  meta_lookup: gr.Dropdown(
439
  None,
440
- value = tokens[val] if is_number and data is not None and key.endswith('_token_id') and val < len(tokens) else '',
441
- visible = True if is_number and key.endswith('_token_id') else False,
442
  ),
443
  meta_number: gr.Number(
444
- value = val if is_number and data is not None else 0,
445
  precision = 10 if typ == GGUFValueType.FLOAT32 or typ == GGUFValueType.FLOAT64 else 0,
446
- visible = True if is_number else False,
 
447
  ),
448
  meta_string: gr.Textbox(
449
  value = val if typ == GGUFValueType.STRING else '',
@@ -483,8 +534,9 @@ with gr.Blocks(
483
  changes = [(k, 'rem') for k in meta.rem]
484
 
485
  for k, v in meta.add.items():
 
486
  changes.append((k, 'add'))
487
- changes.append((str(v[1]), None))
488
 
489
  m = []
490
  for k, v in meta.key.items():
@@ -498,7 +550,7 @@ with gr.Blocks(
498
  link += '&' + urlencode(
499
  {
500
  'rem': meta.rem,
501
- 'add': [json.dumps([k, *v], ensure_ascii = False) for k, v in meta.add.items()],
502
  },
503
  doseq = True,
504
  safe = '[]{}:"\',',
@@ -554,6 +606,97 @@ with gr.Blocks(
554
  )
555
 
556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
557
  @gr.on(
558
  triggers = [
559
  meta_lookup.key_up,
@@ -563,6 +706,7 @@ with gr.Blocks(
563
  ],
564
  outputs = [
565
  meta_lookup,
 
566
  ],
567
  show_progress = 'hidden',
568
  trigger_mode = 'always_last',
@@ -571,16 +715,13 @@ with gr.Blocks(
571
  meta: MetadataState,
572
  keyup: gr.KeyUpData,
573
  ):
574
- found = []
575
- value = keyup.input_value.lower()
576
- tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
577
-
578
- any(((found.append(t), len(found) > 5)[1] for i, t in enumerate(tokens) if value in t.lower()))
579
 
580
  return {
581
  meta_lookup: gr.Dropdown(
582
- found,
583
  ),
 
584
  }
585
 
586
 
@@ -590,6 +731,8 @@ with gr.Blocks(
590
  typ: int | None,
591
  val: Any,
592
  request: gr.Request,
 
 
593
  ):
594
  if not key or typ is None:
595
  if key:
@@ -603,7 +746,18 @@ with gr.Blocks(
603
  if key in meta.rem:
604
  meta.rem.remove(key)
605
 
606
- meta.key[key] = meta.add[key] = (typ, val)
 
 
 
 
 
 
 
 
 
 
 
607
 
608
  if key.startswith('tokenizer.chat_template.'):
609
  template = key[24:]
@@ -617,29 +771,25 @@ with gr.Blocks(
617
  )
618
 
619
 
620
- def token_to_id(
621
- meta: MetadataState,
622
- token: str,
623
  ):
624
- tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
625
-
626
- try:
627
- found = tokens.index(token)
628
- except Exception as e:
629
  raise gr.Error('Token not found')
630
 
631
  return {
632
  meta_number: gr.Number(
633
- found,
634
  ),
635
  }
636
 
637
 
638
  meta_lookup.input(
639
- token_to_id,
640
  inputs = [
641
- meta_state,
642
  meta_lookup,
 
643
  ],
644
  outputs = [
645
  meta_number,
@@ -668,6 +818,20 @@ with gr.Blocks(
668
  ] + state_change_components,
669
  )
670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
671
  meta_number.submit(
672
  add_metadata,
673
  inputs = [
@@ -675,6 +839,8 @@ with gr.Blocks(
675
  meta_keys,
676
  meta_types,
677
  meta_number,
 
 
678
  ],
679
  outputs = [
680
  ] + state_change_components,
@@ -736,9 +902,24 @@ def stream_repo_file(
736
  for k in rem_meta:
737
  gguf.remove_metadata(k)
738
 
 
739
  for k in add_meta:
740
  k = json.loads(k)
741
  if isinstance(k, list) and len(k) == 3:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
742
  gguf.add_metadata(*k)
743
 
744
  yield gguf.filesize
 
9
  from urllib.parse import urlencode
10
 
11
  from _hf_explorer import FileExplorer
12
+ from _hf_gguf import standard_metadata, TokenType, LlamaFileType, GGUFValueType, HuggingGGUFstream
13
 
14
 
15
  hfapi = HfApi()
 
49
  val = str(val[:8])[:-1] + ', ...]'
50
  else:
51
  val = str(val)
52
+ elif isinstance(val, dict):
53
+ val = '[' + ', '.join((f'{k}: {v}' for k, v in val.items())) + ']'
54
+ elif key == 'general.file_type':
55
+ try:
56
+ ftype = LlamaFileType(val).name
57
+ except:
58
+ ftype = 'UNKNOWN'
59
+ val = f'{ftype} ({val})'
60
  elif key.endswith('_token_id'):
61
  tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
62
 
 
121
  )
122
 
123
  with gr.Row():
124
+ meta_token_select = gr.Dropdown(
125
+ label = 'Select token',
126
+ type = 'index',
127
+ allow_custom_value = True,
128
+ visible = False,
129
+ )
130
+
131
+ meta_token_type = gr.Dropdown(
132
+ [e.name for e in TokenType],
133
+ label = 'Token type',
134
+ type = 'index',
135
+ visible = False,
136
+ )
137
+
138
  meta_lookup = gr.Dropdown(
139
  label = 'Lookup token',
140
+ type = 'index',
141
  allow_custom_value = True,
142
  visible = False,
143
  )
 
192
  # BUG: For some reason using gr.State initial value turns tuple to list?
193
  meta_state.value = init_state()
194
 
195
+ token_select_indices = gr.State([])
196
+
197
  file_change_components = [
198
  meta_changes,
199
  file_meta,
 
425
  ],
426
  outputs = [
427
  meta_boolean,
428
+ meta_token_select,
429
+ meta_token_type,
430
  meta_lookup,
431
  meta_number,
432
  meta_string,
 
447
  elif not key:
448
  typ = None
449
 
450
+ do_select_token = False
451
+ do_lookup_token = False
452
+ do_token_type = False
453
+ match key:
454
+ case 'tokenizer.ggml.scores':
455
+ do_select_token = True
456
+ case 'tokenizer.ggml.token_type':
457
+ do_select_token = True
458
+ do_token_type = True
459
+ case s if s.endswith('_token_id'):
460
+ do_lookup_token = True
461
+ case _:
462
+ pass
463
+
464
+ if isinstance(val, list) and not do_select_token:
465
  # TODO: Support arrays?
466
  typ = GGUFValueType.ARRAY
467
 
 
476
  value = val if typ == GGUFValueType.BOOL and data is not None else False,
477
  visible = True if typ == GGUFValueType.BOOL else False,
478
  ),
479
+ meta_token_select: gr.Dropdown(
480
+ None,
481
+ value = '',
482
+ visible = True if do_select_token else False,
483
+ ),
484
+ meta_token_type: gr.Dropdown(
485
+ interactive = False,
486
+ visible = True if do_token_type else False,
487
+ ),
488
  meta_lookup: gr.Dropdown(
489
  None,
490
+ value = tokens[val] if is_number and data is not None and do_lookup_token and val < len(tokens) else '',
491
+ visible = True if is_number and do_lookup_token else False,
492
  ),
493
  meta_number: gr.Number(
494
+ value = val if is_number and data is not None and not do_select_token else 0,
495
  precision = 10 if typ == GGUFValueType.FLOAT32 or typ == GGUFValueType.FLOAT64 else 0,
496
+ interactive = False if do_select_token else True,
497
+ visible = True if is_number and not do_token_type else False,
498
  ),
499
  meta_string: gr.Textbox(
500
  value = val if typ == GGUFValueType.STRING else '',
 
534
  changes = [(k, 'rem') for k in meta.rem]
535
 
536
  for k, v in meta.add.items():
537
+ key, typ, val = human_readable_metadata(meta, k, *v)
538
  changes.append((k, 'add'))
539
+ changes.append((str(val), None))
540
 
541
  m = []
542
  for k, v in meta.key.items():
 
550
  link += '&' + urlencode(
551
  {
552
  'rem': meta.rem,
553
+ 'add': [json.dumps([k, *v], ensure_ascii = False, separators = (',', ':')) for k, v in meta.add.items()],
554
  },
555
  doseq = True,
556
  safe = '[]{}:"\',',
 
606
  )
607
 
608
 
609
+ def token_search(
610
+ meta: MetadataState,
611
+ name: str,
612
+ ):
613
+ found = {}
614
+ name = name.lower()
615
+ tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
616
+
617
+ any(((len(found) > 5, found.setdefault(i, t))[0] for i, t in enumerate(tokens) if name in t.lower()))
618
+
619
+ return found
620
+
621
+
622
+ @gr.on(
623
+ triggers = [
624
+ meta_token_select.key_up,
625
+ ],
626
+ inputs = [
627
+ meta_state,
628
+ ],
629
+ outputs = [
630
+ meta_token_select,
631
+ token_select_indices,
632
+ ],
633
+ show_progress = 'hidden',
634
+ trigger_mode = 'always_last',
635
+ )
636
+ def token_select(
637
+ meta: MetadataState,
638
+ keyup: gr.KeyUpData,
639
+ ):
640
+ found = token_search(meta, keyup.input_value)
641
+
642
+ return {
643
+ meta_token_select: gr.Dropdown(
644
+ list(found.values()),
645
+ ),
646
+ token_select_indices: list(found.keys()),
647
+ }
648
+
649
+
650
+ @gr.on(
651
+ triggers = [
652
+ meta_token_select.input,
653
+ ],
654
+ inputs = [
655
+ meta_state,
656
+ meta_keys,
657
+ meta_token_select,
658
+ token_select_indices,
659
+ ],
660
+ outputs = [
661
+ meta_token_type,
662
+ meta_number,
663
+ ],
664
+ )
665
+ def token_selected(
666
+ meta: MetadataState,
667
+ key: str,
668
+ choice: int,
669
+ indices: list[int],
670
+ ):
671
+ if choice < 0 or choice >= len(indices) or (token := indices[choice]) < 0:
672
+ raise gr.Error('Token not found')
673
+
674
+ tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
675
+
676
+ if token >= len(tokens):
677
+ raise gr.Error('Invalid token')
678
+
679
+ data = meta.key.get(key, (-1, []))[1]
680
+
681
+ match key:
682
+ case 'tokenizer.ggml.scores':
683
+ return {
684
+ meta_number: gr.Number(
685
+ value = data[token] if data and len(data) > token else 0.0,
686
+ interactive = True,
687
+ ),
688
+ }
689
+ case 'tokenizer.ggml.token_type':
690
+ return {
691
+ meta_token_type: gr.Dropdown(
692
+ value = TokenType(data[token]).name if data and len(data) > token else TokenType.NORMAL.name,
693
+ interactive = True,
694
+ ),
695
+ }
696
+ case _:
697
+ raise gr.Error('Invalid metadata key')
698
+
699
+
700
  @gr.on(
701
  triggers = [
702
  meta_lookup.key_up,
 
706
  ],
707
  outputs = [
708
  meta_lookup,
709
+ token_select_indices,
710
  ],
711
  show_progress = 'hidden',
712
  trigger_mode = 'always_last',
 
715
  meta: MetadataState,
716
  keyup: gr.KeyUpData,
717
  ):
718
+ found = token_search(meta, keyup.input_value)
 
 
 
 
719
 
720
  return {
721
  meta_lookup: gr.Dropdown(
722
+ list(found.values()),
723
  ),
724
+ token_select_indices: list(found.keys()),
725
  }
726
 
727
 
 
731
  typ: int | None,
732
  val: Any,
733
  request: gr.Request,
734
+ choice: int | None = None,
735
+ indices: list[int] | None = None,
736
  ):
737
  if not key or typ is None:
738
  if key:
 
746
  if key in meta.rem:
747
  meta.rem.remove(key)
748
 
749
+ match key:
750
+ case 'tokenizer.ggml.scores' | 'tokenizer.ggml.token_type':
751
+ if choice >= 0 and choice < len(indices) and (token := indices[choice]) >= 0:
752
+ tok = meta.add.setdefault(key, (typ, {}))[1]
753
+ tok[str(token)] = val + 1 if key == 'tokenizer.ggml.token_type' else val
754
+
755
+ data = meta.key.setdefault(key, (typ, [0.0 if key == 'tokenizer.ggml.scores' else int(TokenType.NORMAL)] * len(meta.key.get('tokenizer.ggml.tokens', (-1, []))[1])))[1]
756
+ if data:
757
+ for k, v in tok.items():
758
+ data[int(k)] = v
759
+ case _:
760
+ meta.key[key] = meta.add[key] = (typ, val)
761
 
762
  if key.startswith('tokenizer.chat_template.'):
763
  template = key[24:]
 
771
  )
772
 
773
 
774
+ def token_select_to_id(
775
+ choice: int,
776
+ indices: list[int],
777
  ):
778
+ if choice < 0 or choice >= len(indices) or (token := indices[choice]) < 0:
 
 
 
 
779
  raise gr.Error('Token not found')
780
 
781
  return {
782
  meta_number: gr.Number(
783
+ token,
784
  ),
785
  }
786
 
787
 
788
  meta_lookup.input(
789
+ token_select_to_id,
790
  inputs = [
 
791
  meta_lookup,
792
+ token_select_indices,
793
  ],
794
  outputs = [
795
  meta_number,
 
818
  ] + state_change_components,
819
  )
820
 
821
+ meta_token_type.input(
822
+ add_metadata,
823
+ inputs = [
824
+ meta_state,
825
+ meta_keys,
826
+ meta_types,
827
+ meta_token_type,
828
+ meta_token_select,
829
+ token_select_indices,
830
+ ],
831
+ outputs = [
832
+ ] + state_change_components,
833
+ )
834
+
835
  meta_number.submit(
836
  add_metadata,
837
  inputs = [
 
839
  meta_keys,
840
  meta_types,
841
  meta_number,
842
+ meta_token_select,
843
+ token_select_indices,
844
  ],
845
  outputs = [
846
  ] + state_change_components,
 
902
  for k in rem_meta:
903
  gguf.remove_metadata(k)
904
 
905
+ tokens = gguf.metadata.get('tokenizer.ggml.tokens')
906
  for k in add_meta:
907
  k = json.loads(k)
908
  if isinstance(k, list) and len(k) == 3:
909
+ if isinstance(k[2], dict):
910
+ if tokens:
911
+ if (data := gguf.metadata.get(k[0])):
912
+ data = data.value
913
+ else:
914
+ data = [0.0 if k[0] == 'tokenizer.ggml.scores' else int(TokenType.NORMAL)] * len(tokens.value)
915
+
916
+ for i, v in k[2].items():
917
+ data[int(i)] = v
918
+
919
+ k[2] = data
920
+ else:
921
+ k[2] = []
922
+
923
  gguf.add_metadata(*k)
924
 
925
  yield gguf.filesize