Spaces:

CISCai
/

gguf-editor

Running

App Files Community

CISCai committed on Sep 6, 2024

Commit

ad78086

verified ·

1 Parent(s): afe8ab5

Implemented edit support for tokenizer.ggml.scores and token_types

Browse files

Files changed (3) hide show

_hf_explorer.py +2 -2
_hf_gguf.py +50 -0
app.py +207 -26

_hf_explorer.py CHANGED Viewed

@@ -33,7 +33,7 @@ class FileExplorer(Component):
         value: str | list[str] | Callable | None = None,
         file_count: Literal["single", "multiple"] = "multiple",
         root_dir: str = None,
-        branch: str = "main",
         token: str | None = None,
         ignore_glob: str | None = None,
         label: str | None = None,
@@ -76,7 +76,7 @@ class FileExplorer(Component):
             key: if assigned, will be used to assume identity across a re-render. Components that have the same key across a re-render will have their value preserved.
         """
         self.root_dir = root_dir
-        self.branch = branch
         self.fs = HfFileSystem(token = token)
         self.glob = glob
         self.ignore_glob = ignore_glob

         value: str | list[str] | Callable | None = None,
         file_count: Literal["single", "multiple"] = "multiple",
         root_dir: str = None,
+        branch: str | None = None,
         token: str | None = None,
         ignore_glob: str | None = None,
         label: str | None = None,
             key: if assigned, will be used to assume identity across a re-render. Components that have the same key across a re-render will have their value preserved.
         """
         self.root_dir = root_dir
+        self.branch = branch or "main"
         self.fs = HfFileSystem(token = token)
         self.glob = glob
         self.ignore_glob = ignore_glob

_hf_gguf.py CHANGED Viewed

@@ -4,6 +4,56 @@ from fsspec.spec import AbstractBufferedFile
 from typing import Any, Iterator, NamedTuple
 class GGUFValueType(IntEnum):
     UINT8   = 0
     INT8    = 1

 from typing import Any, Iterator, NamedTuple
+class TokenType(IntEnum):
+    # Per-token type codes, used to decode entries of the
+    # 'tokenizer.ggml.token_type' metadata array (one entry per token).
+    # Codes start at 1, so a zero-based dropdown index must be shifted by
+    # one to obtain the stored value.
+    NORMAL       = 1
+    UNKNOWN      = 2
+    CONTROL      = 3
+    USER_DEFINED = 4
+    UNUSED       = 5
+    BYTE         = 6
+class LlamaFileType(IntEnum):
+    # Symbolic names for the integer stored under the 'general.file_type'
+    # metadata key; used to show a readable label next to the raw number.
+    # NOTE(review): presumably mirrors llama.cpp's llama_ftype enum —
+    # confirm against upstream when adding new quantization types.
+    ALL_F32              = 0
+    MOSTLY_F16           = 1
+    MOSTLY_Q4_0          = 2
+    MOSTLY_Q4_1          = 3
+    MOSTLY_Q4_1_SOME_F16 = 4
+    MOSTLY_Q4_2          = 5
+    MOSTLY_Q4_3          = 6
+    MOSTLY_Q8_0          = 7
+    MOSTLY_Q5_0          = 8
+    MOSTLY_Q5_1          = 9
+    MOSTLY_Q2_K          = 10
+    MOSTLY_Q3_K_S        = 11
+    MOSTLY_Q3_K_M        = 12
+    MOSTLY_Q3_K_L        = 13
+    MOSTLY_Q4_K_S        = 14
+    MOSTLY_Q4_K_M        = 15
+    MOSTLY_Q5_K_S        = 16
+    MOSTLY_Q5_K_M        = 17
+    MOSTLY_Q6_K          = 18
+    MOSTLY_IQ2_XXS       = 19
+    MOSTLY_IQ2_XS        = 20
+    MOSTLY_Q2_K_S        = 21
+    MOSTLY_IQ3_XS        = 22
+    MOSTLY_IQ3_XXS       = 23
+    MOSTLY_IQ1_S         = 24
+    MOSTLY_IQ4_NL        = 25
+    MOSTLY_IQ3_S         = 26
+    MOSTLY_IQ3_M         = 27
+    MOSTLY_IQ2_S         = 28
+    MOSTLY_IQ2_M         = 29
+    MOSTLY_IQ4_XS        = 30
+    MOSTLY_IQ1_M         = 31
+    MOSTLY_BF16          = 32
+    MOSTLY_Q4_0_4_4      = 33
+    MOSTLY_Q4_0_4_8      = 34
+    MOSTLY_Q4_0_8_8      = 35
+    MOSTLY_TQ1_0         = 36
+    MOSTLY_TQ2_0         = 37
 class GGUFValueType(IntEnum):
     UINT8   = 0
     INT8    = 1

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ from typing import Annotated, Any, NamedTuple
 from urllib.parse import urlencode
 from _hf_explorer import FileExplorer
-from _hf_gguf import standard_metadata, GGUFValueType, HuggingGGUFstream
 hfapi = HfApi()
@@ -49,6 +49,14 @@ def human_readable_metadata(
             val = str(val[:8])[:-1] + ', ...]'
         else:
             val = str(val)
     elif key.endswith('_token_id'):
         tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
@@ -113,8 +121,23 @@ with gr.Blocks(
     )
     with gr.Row():
         meta_lookup = gr.Dropdown(
             label = 'Lookup token',
             allow_custom_value = True,
             visible = False,
         )
@@ -169,6 +192,8 @@ with gr.Blocks(
     # BUG: For some reason using gr.State initial value turns tuple to list?
     meta_state.value = init_state()
     file_change_components = [
         meta_changes,
         file_meta,
@@ -400,6 +425,8 @@ with gr.Blocks(
         ],
         outputs = [
             meta_boolean,
             meta_lookup,
             meta_number,
             meta_string,
@@ -420,7 +447,21 @@ with gr.Blocks(
         elif not key:
             typ = None
-        if isinstance(val, list):
             # TODO: Support arrays?
             typ = GGUFValueType.ARRAY
@@ -435,15 +476,25 @@ with gr.Blocks(
                 value = val if typ == GGUFValueType.BOOL and data is not None else False,
                 visible = True if typ == GGUFValueType.BOOL else False,
             ),
             meta_lookup: gr.Dropdown(
                 None,
-                value = tokens[val] if is_number and data is not None and key.endswith('_token_id') and val < len(tokens) else '',
-                visible = True if is_number and key.endswith('_token_id') else False,
             ),
             meta_number: gr.Number(
-                value = val if is_number and data is not None else 0,
                 precision = 10 if typ == GGUFValueType.FLOAT32 or typ == GGUFValueType.FLOAT64 else 0,
-                visible = True if is_number else False,
             ),
             meta_string: gr.Textbox(
                 value = val if typ == GGUFValueType.STRING else '',
@@ -483,8 +534,9 @@ with gr.Blocks(
         changes = [(k, 'rem') for k in meta.rem]
         for k, v in meta.add.items():
             changes.append((k, 'add'))
-            changes.append((str(v[1]), None))
         m = []
         for k, v in meta.key.items():
@@ -498,7 +550,7 @@ with gr.Blocks(
             link += '&' + urlencode(
                 {
                     'rem': meta.rem,
-                    'add': [json.dumps([k, *v], ensure_ascii = False) for k, v in meta.add.items()],
                 },
                 doseq = True,
                 safe = '[]{}:"\',',
@@ -554,6 +606,97 @@ with gr.Blocks(
         )
     @gr.on(
         triggers = [
             meta_lookup.key_up,
@@ -563,6 +706,7 @@ with gr.Blocks(
         ],
         outputs = [
             meta_lookup,
         ],
         show_progress = 'hidden',
         trigger_mode = 'always_last',
@@ -571,16 +715,13 @@ with gr.Blocks(
         meta: MetadataState,
         keyup: gr.KeyUpData,
     ):
-        found = []
-        value = keyup.input_value.lower()
-        tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
-        any(((found.append(t), len(found) > 5)[1] for i, t in enumerate(tokens) if value in t.lower()))
         return {
             meta_lookup: gr.Dropdown(
-                found,
             ),
         }
@@ -590,6 +731,8 @@ with gr.Blocks(
         typ: int | None,
         val: Any,
         request: gr.Request,
     ):
         if not key or typ is None:
             if key:
@@ -603,7 +746,18 @@ with gr.Blocks(
         if key in meta.rem:
             meta.rem.remove(key)
-        meta.key[key] = meta.add[key] = (typ, val)
         if key.startswith('tokenizer.chat_template.'):
             template = key[24:]
@@ -617,29 +771,25 @@ with gr.Blocks(
         )
-    def token_to_id(
-        meta: MetadataState,
-        token: str,
     ):
-        tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
-        try:
-            found = tokens.index(token)
-        except Exception as e:
             raise gr.Error('Token not found')
         return {
             meta_number: gr.Number(
-                found,
             ),
         }
     meta_lookup.input(
-        token_to_id,
         inputs = [
-            meta_state,
             meta_lookup,
         ],
         outputs = [
             meta_number,
@@ -668,6 +818,20 @@ with gr.Blocks(
         ] + state_change_components,
     )
     meta_number.submit(
         add_metadata,
         inputs = [
@@ -675,6 +839,8 @@ with gr.Blocks(
             meta_keys,
             meta_types,
             meta_number,
         ],
         outputs = [
         ] + state_change_components,
@@ -736,9 +902,24 @@ def stream_repo_file(
         for k in rem_meta:
             gguf.remove_metadata(k)
         for k in add_meta:
             k = json.loads(k)
             if isinstance(k, list) and len(k) == 3:
                 gguf.add_metadata(*k)
         yield gguf.filesize

 from urllib.parse import urlencode
 from _hf_explorer import FileExplorer
+from _hf_gguf import standard_metadata, TokenType, LlamaFileType, GGUFValueType, HuggingGGUFstream
 hfapi = HfApi()
             val = str(val[:8])[:-1] + ', ...]'
         else:
             val = str(val)
+    elif isinstance(val, dict):
+        val = '[' + ', '.join((f'{k}: {v}' for k, v in val.items())) + ']'
+    elif key == 'general.file_type':
+        try:
+            ftype = LlamaFileType(val).name
+        except:
+            ftype = 'UNKNOWN'
+        val = f'{ftype} ({val})'
     elif key.endswith('_token_id'):
         tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
     )
     with gr.Row():
+        meta_token_select = gr.Dropdown(
+            label = 'Select token',
+            type = 'index',
+            allow_custom_value = True,
+            visible = False,
+        )
+        meta_token_type = gr.Dropdown(
+            [e.name for e in TokenType],
+            label = 'Token type',
+            type = 'index',
+            visible = False,
+        )
         meta_lookup = gr.Dropdown(
             label = 'Lookup token',
+            type = 'index',
             allow_custom_value = True,
             visible = False,
         )
     # BUG: For some reason using gr.State initial value turns tuple to list?
     meta_state.value = init_state()
+    token_select_indices = gr.State([])
     file_change_components = [
         meta_changes,
         file_meta,
         ],
         outputs = [
             meta_boolean,
+            meta_token_select,
+            meta_token_type,
             meta_lookup,
             meta_number,
             meta_string,
         elif not key:
             typ = None
+        do_select_token = False
+        do_lookup_token = False
+        do_token_type = False
+        match key:
+            case 'tokenizer.ggml.scores':
+                do_select_token = True
+            case 'tokenizer.ggml.token_type':
+                do_select_token = True
+                do_token_type = True
+            case s if s.endswith('_token_id'):
+                do_lookup_token = True
+            case _:
+                pass
+        if isinstance(val, list) and not do_select_token:
             # TODO: Support arrays?
             typ = GGUFValueType.ARRAY
                 value = val if typ == GGUFValueType.BOOL and data is not None else False,
                 visible = True if typ == GGUFValueType.BOOL else False,
             ),
+            meta_token_select: gr.Dropdown(
+                None,
+                value = '',
+                visible = True if do_select_token else False,
+            ),
+            meta_token_type: gr.Dropdown(
+                interactive = False,
+                visible = True if do_token_type else False,
+            ),
             meta_lookup: gr.Dropdown(
                 None,
+                value = tokens[val] if is_number and data is not None and do_lookup_token and val < len(tokens) else '',
+                visible = True if is_number and do_lookup_token else False,
             ),
             meta_number: gr.Number(
+                value = val if is_number and data is not None and not do_select_token else 0,
                 precision = 10 if typ == GGUFValueType.FLOAT32 or typ == GGUFValueType.FLOAT64 else 0,
+                interactive = False if do_select_token else True,
+                visible = True if is_number and not do_token_type else False,
             ),
             meta_string: gr.Textbox(
                 value = val if typ == GGUFValueType.STRING else '',
         changes = [(k, 'rem') for k in meta.rem]
         for k, v in meta.add.items():
+            key, typ, val = human_readable_metadata(meta, k, *v)
             changes.append((k, 'add'))
+            changes.append((str(val), None))
         m = []
         for k, v in meta.key.items():
             link += '&' + urlencode(
                 {
                     'rem': meta.rem,
+                    'add': [json.dumps([k, *v], ensure_ascii = False, separators = (',', ':')) for k, v in meta.add.items()],
                 },
                 doseq = True,
                 safe = '[]{}:"\',',
         )
+    def token_search(
+        meta: MetadataState,
+        name: str,
+    ):
+        """Find tokens whose text contains ``name``, ignoring case.
+
+        Args:
+            meta: Metadata state; the token list is read from the value part
+                of its 'tokenizer.ggml.tokens' entry (empty list if absent).
+            name: Substring to look for.
+
+        Returns:
+            A dict mapping token index to token text, in token order.
+        """
+        found = {}
+        name = name.lower()
+        tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
+        # any() is used only for early exit: each match is recorded via
+        # setdefault() as a side effect, and iteration stops at the first
+        # True. The length check runs BEFORE the insert of the same match,
+        # so collection stops once 7 matches have been stored.
+        any(((len(found) > 5, found.setdefault(i, t))[0] for i, t in enumerate(tokens) if name in t.lower()))
+        return found
+    @gr.on(
+        triggers = [
+            meta_token_select.key_up,
+        ],
+        inputs = [
+            meta_state,
+        ],
+        outputs = [
+            meta_token_select,
+            token_select_indices,
+        ],
+        show_progress = 'hidden',
+        trigger_mode = 'always_last',
+    )
+    def token_select(
+        meta: MetadataState,
+        keyup: gr.KeyUpData,
+    ):
+        """Refresh the 'Select token' dropdown choices while the user types.
+
+        Searches the token list for the text currently typed and returns the
+        matching token strings as the new dropdown choices. The matching
+        token indices are stored, in the same order, in
+        ``token_select_indices`` so a later dropdown index can be mapped
+        back to a token id.
+        """
+        found = token_search(meta, keyup.input_value)
+        return {
+            meta_token_select: gr.Dropdown(
+                list(found.values()),
+            ),
+            token_select_indices: list(found.keys()),
+        }
+    @gr.on(
+        triggers = [
+            meta_token_select.input,
+        ],
+        inputs = [
+            meta_state,
+            meta_keys,
+            meta_token_select,
+            token_select_indices,
+        ],
+        outputs = [
+            meta_token_type,
+            meta_number,
+        ],
+    )
+    def token_selected(
+        meta: MetadataState,
+        key: str,
+        choice: int,
+        indices: list[int],
+    ):
+        """Show the selected token's current score or type for editing.
+
+        Args:
+            meta: Metadata state holding the token list and per-token arrays.
+            key: Metadata key being edited; only 'tokenizer.ggml.scores' and
+                'tokenizer.ggml.token_type' are handled.
+            choice: Index of the selected entry in the dropdown choices.
+            indices: Token ids parallel to the dropdown choices.
+
+        Returns:
+            An update for the number field (scores) or for the token type
+            dropdown (token types), made interactive. When the array is
+            missing or too short, 0.0 / NORMAL is shown instead.
+
+        Raises:
+            gr.Error: If the choice cannot be mapped to a token id, the id
+                is past the end of the token list, or the key is not one
+                of the two supported keys.
+        """
+        # NOTE(review): assumes choice is always an int; verify what Gradio
+        # passes for a custom (unmatched) value, since the dropdown allows
+        # custom values and None would make the comparison below fail.
+        if choice < 0 or choice >= len(indices) or (token := indices[choice]) < 0:
+            raise gr.Error('Token not found')
+        tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
+        if token >= len(tokens):
+            raise gr.Error('Invalid token')
+        data = meta.key.get(key, (-1, []))[1]
+        match key:
+            case 'tokenizer.ggml.scores':
+                return {
+                    meta_number: gr.Number(
+                        value = data[token] if data and len(data) > token else 0.0,
+                        interactive = True,
+                    ),
+                }
+            case 'tokenizer.ggml.token_type':
+                # NOTE(review): TokenType(...) raises ValueError for a stored
+                # code outside 1..6 — confirm such values cannot occur.
+                return {
+                    meta_token_type: gr.Dropdown(
+                        value = TokenType(data[token]).name if data and len(data) > token else TokenType.NORMAL.name,
+                        interactive = True,
+                    ),
+                }
+            case _:
+                raise gr.Error('Invalid metadata key')
     @gr.on(
         triggers = [
             meta_lookup.key_up,
         ],
         outputs = [
             meta_lookup,
+            token_select_indices,
         ],
         show_progress = 'hidden',
         trigger_mode = 'always_last',
         meta: MetadataState,
         keyup: gr.KeyUpData,
     ):
+        found = token_search(meta, keyup.input_value)
         return {
             meta_lookup: gr.Dropdown(
+                list(found.values()),
             ),
+            token_select_indices: list(found.keys()),
         }
         typ: int | None,
         val: Any,
         request: gr.Request,
+        choice: int | None = None,
+        indices: list[int] | None = None,
     ):
         if not key or typ is None:
             if key:
         if key in meta.rem:
             meta.rem.remove(key)
+        match key:
+            case 'tokenizer.ggml.scores' | 'tokenizer.ggml.token_type':
+                if choice >= 0 and choice < len(indices) and (token := indices[choice]) >= 0:
+                    tok = meta.add.setdefault(key, (typ, {}))[1]
+                    tok[str(token)] = val + 1 if key == 'tokenizer.ggml.token_type' else val
+                    data = meta.key.setdefault(key, (typ, [0.0 if key == 'tokenizer.ggml.scores' else int(TokenType.NORMAL)] * len(meta.key.get('tokenizer.ggml.tokens', (-1, []))[1])))[1]
+                    if data:
+                        for k, v in tok.items():
+                            data[int(k)] = v
+            case _:
+                meta.key[key] = meta.add[key] = (typ, val)
         if key.startswith('tokenizer.chat_template.'):
             template = key[24:]
         )
+    def token_select_to_id(
+        choice: int,
+        indices: list[int],
     ):
+        """Map a dropdown selection to its token id for the number field.
+
+        Args:
+            choice: Index of the selected entry in the dropdown choices.
+            indices: Token ids parallel to the dropdown choices.
+
+        Returns:
+            An update setting the number field to the selected token id.
+
+        Raises:
+            gr.Error: If ``choice`` is out of range or maps to a negative id.
+        """
+        if choice < 0 or choice >= len(indices) or (token := indices[choice]) < 0:
             raise gr.Error('Token not found')
         return {
             meta_number: gr.Number(
+                token,
             ),
         }
     meta_lookup.input(
+        token_select_to_id,
         inputs = [
             meta_lookup,
+            token_select_indices,
         ],
         outputs = [
             meta_number,
         ] + state_change_components,
     )
+    meta_token_type.input(
+        add_metadata,
+        inputs = [
+            meta_state,
+            meta_keys,
+            meta_types,
+            meta_token_type,
+            meta_token_select,
+            token_select_indices,
+        ],
+        outputs = [
+        ] + state_change_components,
+    )
     meta_number.submit(
         add_metadata,
         inputs = [
             meta_keys,
             meta_types,
             meta_number,
+            meta_token_select,
+            token_select_indices,
         ],
         outputs = [
         ] + state_change_components,
         for k in rem_meta:
             gguf.remove_metadata(k)
+        tokens = gguf.metadata.get('tokenizer.ggml.tokens')
         for k in add_meta:
             k = json.loads(k)
             if isinstance(k, list) and len(k) == 3:
+                if isinstance(k[2], dict):
+                    if tokens:
+                        if (data := gguf.metadata.get(k[0])):
+                            data = data.value
+                        else:
+                            data = [0.0 if k[0] == 'tokenizer.ggml.scores' else int(TokenType.NORMAL)] * len(tokens.value)
+                        for i, v in k[2].items():
+                            data[int(i)] = v
+                        k[2] = data
+                    else:
+                        k[2] = []
                 gguf.add_metadata(*k)
         yield gguf.filesize