# NOTE: hosting-page status text ("Spaces: Running") was captured here; it is not part of this module.
import functools | |
from collections import OrderedDict | |
import gradio as gr | |
from modules import shared | |
# Maps each loader name to the parameter names associated with it.
# get_all_params() and make_loader_params_visible() read this table; the
# literal 'gpu_memory' entry is a placeholder that those functions replace
# with every shared.gradio key starting with 'gpu_memory'.
loaders_and_params = OrderedDict({
    'Transformers': [
        'cpu_memory',
        'gpu_memory',
        'load_in_8bit',
        'bf16',
        'cpu',
        'disk',
        'auto_devices',
        'load_in_4bit',
        'use_double_quant',
        'quant_type',
        'compute_dtype',
        'trust_remote_code',
        'no_use_fast',
        'use_flash_attention_2',
        'alpha_value',
        'rope_freq_base',
        'compress_pos_emb',
        'disable_exllama',
        'disable_exllamav2',
        'transformers_info',
    ],
    'llama.cpp': [
        'n_ctx',
        'n_gpu_layers',
        'tensor_split',
        'n_batch',
        'threads',
        'threads_batch',
        'no_mmap',
        'mlock',
        'no_mul_mat_q',
        'alpha_value',
        'rope_freq_base',
        'compress_pos_emb',
        'cpu',
        'numa',
        'no_offload_kqv',
        'row_split',
        'tensorcores',
        'streaming_llm',
        'attention_sink_size',
    ],
    'llamacpp_HF': [
        'n_ctx',
        'n_gpu_layers',
        'tensor_split',
        'n_batch',
        'threads',
        'threads_batch',
        'no_mmap',
        'mlock',
        'no_mul_mat_q',
        'alpha_value',
        'rope_freq_base',
        'compress_pos_emb',
        'cpu',
        'numa',
        'cfg_cache',
        'trust_remote_code',
        'no_use_fast',
        'logits_all',
        'no_offload_kqv',
        'row_split',
        'tensorcores',
        'streaming_llm',
        'attention_sink_size',
        'llamacpp_HF_info',
    ],
    'ExLlamav2_HF': [
        'gpu_split',
        'max_seq_len',
        'cfg_cache',
        'no_flash_attn',
        'num_experts_per_token',
        'cache_8bit',
        'cache_4bit',
        'autosplit',
        'alpha_value',
        'compress_pos_emb',
        'trust_remote_code',
        'no_use_fast',
    ],
    'ExLlamav2': [
        'gpu_split',
        'max_seq_len',
        'no_flash_attn',
        'num_experts_per_token',
        'cache_8bit',
        'cache_4bit',
        'autosplit',
        'alpha_value',
        'compress_pos_emb',
        'exllamav2_info',
    ],
    'AutoGPTQ': [
        'triton',
        'no_inject_fused_attention',
        'no_inject_fused_mlp',
        'no_use_cuda_fp16',
        'wbits',
        'groupsize',
        'desc_act',
        'disable_exllama',
        'disable_exllamav2',
        'gpu_memory',
        'cpu_memory',
        'cpu',
        'disk',
        'auto_devices',
        'trust_remote_code',
        'no_use_fast',
        'autogptq_info',
    ],
    'AutoAWQ': [
        'cpu_memory',
        'gpu_memory',
        'auto_devices',
        'max_seq_len',
        'no_inject_fused_attention',
        'trust_remote_code',
        'no_use_fast',
    ],
    'GPTQ-for-LLaMa': [
        'wbits',
        'groupsize',
        'model_type',
        'pre_layer',
        'trust_remote_code',
        'no_use_fast',
        'gptq_for_llama_info',
    ],
    'QuIP#': [
        'trust_remote_code',
        'no_use_fast',
        'no_flash_attn',
        'quipsharp_info',
    ],
    'HQQ': [
        'hqq_backend',
        'trust_remote_code',
        'no_use_fast',
    ],
})
def transformers_samplers():
    """Return the sampler names shared by the Transformers-style loaders.

    A new set object is built on every call, so each caller receives its
    own independent copy.
    """
    return {
        'temperature',
        'temperature_last',
        'dynamic_temperature',
        'dynatemp_low',
        'dynatemp_high',
        'dynatemp_exponent',
        'smoothing_factor',
        'smoothing_curve',
        'top_p',
        'min_p',
        'top_k',
        'typical_p',
        'epsilon_cutoff',
        'eta_cutoff',
        'tfs',
        'top_a',
        'repetition_penalty',
        'presence_penalty',
        'frequency_penalty',
        'repetition_penalty_range',
        'encoder_repetition_penalty',
        'no_repeat_ngram_size',
        'seed',
        'do_sample',
        'penalty_alpha',
        'mirostat_mode',
        'mirostat_tau',
        'mirostat_eta',
        'grammar_file_row',
        'grammar_string',
        'guidance_scale',
        'negative_prompt',
        'ban_eos_token',
        'custom_token_bans',
        'sampler_priority',
        'add_bos_token',
        'skip_special_tokens',
        'auto_max_new_tokens',
        'prompt_lookup_num_tokens',
    }
# Maps each loader name to the set of sampler names it is listed with.
# list_all_samplers() and blacklist_samplers() read this table.
loaders_samplers = {
    # These loaders each get their own copy of the transformers_samplers() set.
    'Transformers': transformers_samplers(),
    'AutoGPTQ': transformers_samplers(),
    'GPTQ-for-LLaMa': transformers_samplers(),
    'AutoAWQ': transformers_samplers(),
    'QuIP#': transformers_samplers(),
    'HQQ': transformers_samplers(),
    'ExLlamav2': {
        'temperature',
        'temperature_last',
        'top_p',
        'min_p',
        'top_k',
        'typical_p',
        'tfs',
        'top_a',
        'repetition_penalty',
        'presence_penalty',
        'frequency_penalty',
        'repetition_penalty_range',
        'seed',
        'mirostat_mode',
        'mirostat_tau',
        'mirostat_eta',
        'ban_eos_token',
        'add_bos_token',
        'custom_token_bans',
        'skip_special_tokens',
        'auto_max_new_tokens',
    },
    'ExLlamav2_HF': {
        'temperature',
        'temperature_last',
        'dynamic_temperature',
        'dynatemp_low',
        'dynatemp_high',
        'dynatemp_exponent',
        'smoothing_factor',
        'smoothing_curve',
        'top_p',
        'min_p',
        'top_k',
        'typical_p',
        'epsilon_cutoff',
        'eta_cutoff',
        'tfs',
        'top_a',
        'repetition_penalty',
        'presence_penalty',
        'frequency_penalty',
        'repetition_penalty_range',
        'encoder_repetition_penalty',
        'no_repeat_ngram_size',
        'seed',
        'do_sample',
        'mirostat_mode',
        'mirostat_tau',
        'mirostat_eta',
        'grammar_file_row',
        'grammar_string',
        'guidance_scale',
        'negative_prompt',
        'ban_eos_token',
        'custom_token_bans',
        'sampler_priority',
        'add_bos_token',
        'skip_special_tokens',
        'auto_max_new_tokens',
    },
    'llama.cpp': {
        'temperature',
        'top_p',
        'min_p',
        'top_k',
        'typical_p',
        'tfs',
        'repetition_penalty',
        'presence_penalty',
        'frequency_penalty',
        'seed',
        'mirostat_mode',
        'mirostat_tau',
        'mirostat_eta',
        'grammar_file_row',
        'grammar_string',
        'ban_eos_token',
        'custom_token_bans',
    },
    'llamacpp_HF': {
        'temperature',
        'temperature_last',
        'dynamic_temperature',
        'dynatemp_low',
        'dynatemp_high',
        'dynatemp_exponent',
        'smoothing_factor',
        'smoothing_curve',
        'top_p',
        'min_p',
        'top_k',
        'typical_p',
        'epsilon_cutoff',
        'eta_cutoff',
        'tfs',
        'top_a',
        'repetition_penalty',
        'presence_penalty',
        'frequency_penalty',
        'repetition_penalty_range',
        'encoder_repetition_penalty',
        'no_repeat_ngram_size',
        'seed',
        'do_sample',
        'mirostat_mode',
        'mirostat_tau',
        'mirostat_eta',
        'grammar_file_row',
        'grammar_string',
        'guidance_scale',
        'negative_prompt',
        'ban_eos_token',
        'custom_token_bans',
        'sampler_priority',
        'add_bos_token',
        'skip_special_tokens',
        'auto_max_new_tokens',
    },
}
# Model-type choices per loader; get_model_types() falls back to ["None"]
# for any loader that has no entry here.
loaders_model_types = {
    'GPTQ-for-LLaMa': [
        "None",
        "llama",
        "opt",
        "gptj",
    ],
}
def list_all_samplers():
    """Return a sorted list of every sampler name found in loaders_samplers.

    Names that appear under several loaders are listed once.
    """
    # set().union() with no arguments yields an empty set, so an empty
    # table still produces [].
    return sorted(set().union(*loaders_samplers.values()))
def blacklist_samplers(loader, dynamic_temperature):
    """Build one visibility update per sampler for the given loader.

    Args:
        loader: A key of loaders_samplers, or the string 'All' to accept
            every sampler.
        dynamic_temperature: Value used as the visibility of accepted
            samplers whose name starts with 'dynatemp'.

    Returns:
        A list of gr.update(visible=...) objects, in the same order as
        list_all_samplers().

    Raises:
        KeyError: If loader is neither 'All' nor a key of loaders_samplers.
    """
    def visibility(sampler):
        # Samplers the loader is not listed with are hidden outright.
        if loader != 'All' and sampler not in loaders_samplers[loader]:
            return False

        # The dynatemp_* entries follow the dynamic_temperature value.
        if sampler.startswith('dynatemp'):
            return dynamic_temperature

        return True

    return [gr.update(visible=visibility(name)) for name in list_all_samplers()]
def get_model_types(loader):
    """Return the model-type choices for loader, or ["None"] if it has none."""
    return loaders_model_types.get(loader, ["None"])
def get_gpu_memory_keys():
    """Return the keys of shared.gradio that start with 'gpu_memory'."""
    return [key for key in shared.gradio if key.startswith('gpu_memory')]
def get_all_params():
    """Return a sorted list of every parameter name in loaders_and_params.

    If the placeholder 'gpu_memory' is present, it is removed and replaced
    by the keys returned from get_gpu_memory_keys().
    """
    all_params = set().union(*loaders_and_params.values())

    if 'gpu_memory' in all_params:
        all_params.discard('gpu_memory')
        all_params.update(get_gpu_memory_keys())

    return sorted(all_params)
def make_loader_params_visible(loader):
    """Build one visibility update per parameter for the given loader.

    Args:
        loader: A loader name. A name that is not a key of
            loaders_and_params hides every parameter.

    Returns:
        A list of gr.update(visible=...) objects, in the same order as
        get_all_params(); an entry is visible when its name belongs to
        the loader's parameter list.
    """
    all_params = get_all_params()

    # Work on a copy: the list stored in loaders_and_params is module-level
    # state, and the placeholder expansion below must not rewrite it.
    params = list(loaders_and_params.get(loader, ()))

    # Swap the 'gpu_memory' placeholder for the concrete gpu_memory* keys.
    if 'gpu_memory' in params:
        params.remove('gpu_memory')
        params += get_gpu_memory_keys()

    visible = set(params)
    return [gr.update(visible=name in visible) for name in all_params]