Text Generation • GGUF • PyTorch • English • instruct • finance • stock market • candlesticks • FinGPT • option trading • future stock prediction • trends prediction • Enterprise LLM • Enterprise • Enterprise ready • Banks • Wealth Management • quantized • quantization • imat • imatrix • static • 32bit • 16bit • 8bit • 6bit • 5bit • 4bit • 3bit • 2bit • 1bit • conversational
legraphista committed
Commit d90095e • Parent(s): b4bb68e
Upload imatrix.log with huggingface_hub
Files changed: imatrix.log (+150, -0)
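For context, an upload like this one is normally made with the huggingface_hub Python client. Below is a minimal sketch; the repo id, local path, and authentication details are assumptions based on this repository's layout, not the actual script used for this commit.

from huggingface_hub import HfApi

api = HfApi()  # authenticates via HF_TOKEN or a cached `huggingface-cli login`

# Push the locally written imatrix.log into the model repo.
# repo_id and the local path are assumptions for illustration.
api.upload_file(
    path_or_fileobj="Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.log",
    path_in_repo="imatrix.log",
    repo_id="legraphista/Palmyra-Fin-70B-32K-IMat-GGUF",
    repo_type="model",
    commit_message="Upload imatrix.log with huggingface_hub",
)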
imatrix.log
ADDED
@@ -0,0 +1,150 @@
+llama_model_loader: loaded meta data with 32 key-value pairs and 723 tensors from Palmyra-Fin-70B-32K-IMat-GGUF/Palmyra-Fin-70B-32K.Q8_0.gguf.hardlink.gguf (version GGUF V3 (latest))
+llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+llama_model_loader: - kv 0: general.architecture str = llama
+llama_model_loader: - kv 1: general.type str = model
+llama_model_loader: - kv 2: general.name str = Palmyra Fin 70B 32K
+llama_model_loader: - kv 3: general.organization str = Writer
+llama_model_loader: - kv 4: general.finetune str = 32k
+llama_model_loader: - kv 5: general.basename str = Palmyra-Fin
+llama_model_loader: - kv 6: general.size_label str = 70B
+llama_model_loader: - kv 7: general.license str = other
+llama_model_loader: - kv 8: general.license.name str = writer-open-model-license
+llama_model_loader: - kv 9: general.license.link str = https://writer.com/legal/open-model-l...
+llama_model_loader: - kv 10: general.tags arr[str,14] = ["instruct", "pytorch", "finance", "s...
+llama_model_loader: - kv 11: general.languages arr[str,1] = ["en"]
+llama_model_loader: - kv 12: llama.block_count u32 = 80
+llama_model_loader: - kv 13: llama.context_length u32 = 32768
+llama_model_loader: - kv 14: llama.embedding_length u32 = 8192
+llama_model_loader: - kv 15: llama.feed_forward_length u32 = 28672
+llama_model_loader: - kv 16: llama.attention.head_count u32 = 64
+llama_model_loader: - kv 17: llama.attention.head_count_kv u32 = 8
+llama_model_loader: - kv 18: llama.rope.freq_base f32 = 6315088.000000
+llama_model_loader: - kv 19: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
+llama_model_loader: - kv 20: general.file_type u32 = 7
+llama_model_loader: - kv 21: llama.vocab_size u32 = 128256
+llama_model_loader: - kv 22: llama.rope.dimension_count u32 = 128
+llama_model_loader: - kv 23: tokenizer.ggml.model str = gpt2
+llama_model_loader: - kv 24: tokenizer.ggml.pre str = smaug-bpe
+llama_model_loader: - kv 25: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ...
+llama_model_loader: - kv 26: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+llama_model_loader: - kv 27: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+llama_model_loader: - kv 28: tokenizer.ggml.bos_token_id u32 = 128000
+llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 128009
+llama_model_loader: - kv 30: tokenizer.chat_template str = {% set loop_messages = messages %}{% ...
+llama_model_loader: - kv 31: general.quantization_version u32 = 2
+llama_model_loader: - type f32: 161 tensors
+llama_model_loader: - type q8_0: 562 tensors
+llm_load_vocab: special tokens cache size = 256
+llm_load_vocab: token to piece cache size = 0.8000 MB
+llm_load_print_meta: format = GGUF V3 (latest)
+llm_load_print_meta: arch = llama
+llm_load_print_meta: vocab type = BPE
+llm_load_print_meta: n_vocab = 128256
+llm_load_print_meta: n_merges = 280147
+llm_load_print_meta: vocab_only = 0
+llm_load_print_meta: n_ctx_train = 32768
+llm_load_print_meta: n_embd = 8192
+llm_load_print_meta: n_layer = 80
+llm_load_print_meta: n_head = 64
+llm_load_print_meta: n_head_kv = 8
+llm_load_print_meta: n_rot = 128
+llm_load_print_meta: n_swa = 0
+llm_load_print_meta: n_embd_head_k = 128
+llm_load_print_meta: n_embd_head_v = 128
+llm_load_print_meta: n_gqa = 8
+llm_load_print_meta: n_embd_k_gqa = 1024
+llm_load_print_meta: n_embd_v_gqa = 1024
+llm_load_print_meta: f_norm_eps = 0.0e+00
+llm_load_print_meta: f_norm_rms_eps = 1.0e-05
+llm_load_print_meta: f_clamp_kqv = 0.0e+00
+llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+llm_load_print_meta: f_logit_scale = 0.0e+00
+llm_load_print_meta: n_ff = 28672
+llm_load_print_meta: n_expert = 0
+llm_load_print_meta: n_expert_used = 0
+llm_load_print_meta: causal attn = 1
+llm_load_print_meta: pooling type = 0
+llm_load_print_meta: rope type = 0
+llm_load_print_meta: rope scaling = linear
+llm_load_print_meta: freq_base_train = 6315088.0
+llm_load_print_meta: freq_scale_train = 1
+llm_load_print_meta: n_ctx_orig_yarn = 32768
+llm_load_print_meta: rope_finetuned = unknown
+llm_load_print_meta: ssm_d_conv = 0
+llm_load_print_meta: ssm_d_inner = 0
+llm_load_print_meta: ssm_d_state = 0
+llm_load_print_meta: ssm_dt_rank = 0
+llm_load_print_meta: model type = 70B
+llm_load_print_meta: model ftype = Q8_0
+llm_load_print_meta: model params = 70.55 B
+llm_load_print_meta: model size = 69.82 GiB (8.50 BPW)
+llm_load_print_meta: general.name = Palmyra Fin 70B 32K
+llm_load_print_meta: BOS token = 128000 '<|begin_of_text|>'
+llm_load_print_meta: EOS token = 128009 '<|eot_id|>'
+llm_load_print_meta: LF token = 128 'Ä'
+llm_load_print_meta: EOT token = 128009 '<|eot_id|>'
+llm_load_print_meta: max token length = 256
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 CUDA devices:
+ Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
+llm_load_tensors: ggml ctx size = 0.68 MiB
+llm_load_tensors: offloading 25 repeating layers to GPU
+llm_load_tensors: offloaded 25/81 layers to GPU
+llm_load_tensors: CPU buffer size = 71494.28 MiB
+llm_load_tensors: CUDA0 buffer size = 21676.56 MiB
+....................................................................................................
+llama_new_context_with_model: n_ctx = 512
+llama_new_context_with_model: n_batch = 512
+llama_new_context_with_model: n_ubatch = 512
+llama_new_context_with_model: flash_attn = 0
+llama_new_context_with_model: freq_base = 6315088.0
+llama_new_context_with_model: freq_scale = 1
+llama_kv_cache_init: CUDA_Host KV buffer size = 110.00 MiB
+llama_kv_cache_init: CUDA0 KV buffer size = 50.00 MiB
+llama_new_context_with_model: KV self size = 160.00 MiB, K (f16): 80.00 MiB, V (f16): 80.00 MiB
+llama_new_context_with_model: CUDA_Host output buffer size = 0.49 MiB
+llama_new_context_with_model: CUDA0 compute buffer size = 1331.12 MiB
+llama_new_context_with_model: CUDA_Host compute buffer size = 17.01 MiB
+llama_new_context_with_model: graph nodes = 2566
+llama_new_context_with_model: graph splits = 609
+
+system_info: n_threads = 25 / 32 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |
+compute_imatrix: tokenizing the input ..
+compute_imatrix: tokenization took 126.907 ms
+compute_imatrix: computing over 125 chunks with batch_size 512
+compute_imatrix: 6.04 seconds per pass - ETA 12.58 minutes
+[1]6.1440,[2]4.7452,[3]4.1216,[4]4.9314,[5]5.0081,[6]4.2135,[7]4.2735,[8]4.6681,[9]4.8739,
+save_imatrix: stored collected data after 10 chunks in Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat
+[10]4.5858,[11]5.0261,[12]5.4656,[13]5.9165,[14]6.2804,[15]6.4814,[16]6.7611,[17]6.9371,[18]6.6649,[19]6.3427,
+save_imatrix: stored collected data after 20 chunks in Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat
+[20]6.3391,[21]6.4281,[22]6.4250,[23]6.6474,[24]6.6518,[25]6.9319,[26]6.9256,[27]6.5617,[28]6.2843,[29]6.2872,
+save_imatrix: stored collected data after 30 chunks in Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat
+[30]6.2545,[31]5.9800,[32]5.7160,[33]5.6005,[34]5.5069,[35]5.5871,[36]5.6482,[37]5.6162,[38]5.6794,[39]5.8489,
+save_imatrix: stored collected data after 40 chunks in Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat
+[40]5.9275,[41]5.7433,[42]5.5645,[43]5.4150,[44]5.2665,[45]5.2281,[46]5.2048,[47]5.3152,[48]5.4025,[49]5.5079,
+save_imatrix: stored collected data after 50 chunks in Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat
+[50]5.4590,[51]5.5551,[52]5.6457,[53]5.7308,[54]5.7917,[55]5.8764,[56]5.9336,[57]6.0021,[58]6.0446,[59]6.0770,
+save_imatrix: stored collected data after 60 chunks in Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat
+[60]6.0581,[61]6.0574,[62]6.1048,[63]6.1625,[64]6.1075,[65]6.0976,[66]6.1092,[67]6.0971,[68]6.1110,[69]6.1082,
+save_imatrix: stored collected data after 70 chunks in Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat
+[70]6.1211,[71]6.1250,[72]6.1348,[73]6.1226,[74]6.0961,[75]6.0962,[76]6.1081,[77]6.0929,[78]6.0968,[79]6.1313,
+save_imatrix: stored collected data after 80 chunks in Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat
+[80]6.1531,[81]6.1465,[82]6.1606,[83]6.1917,[84]6.1254,[85]6.1244,[86]6.1312,[87]6.1491,[88]6.1875,[89]6.2506,
+save_imatrix: stored collected data after 90 chunks in Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat
+[90]6.2899,[91]6.3202,[92]6.3413,[93]6.3615,[94]6.3904,[95]6.4258,[96]6.3927,[97]6.4068,[98]6.4536,[99]6.5224,
+save_imatrix: stored collected data after 100 chunks in Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat
+[100]6.5836,[101]6.6276,[102]6.7285,[103]6.7596,[104]6.7928,[105]6.7456,[106]6.7562,[107]6.7182,[108]6.6500,[109]6.5811,
+save_imatrix: stored collected data after 110 chunks in Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat
+[110]6.6171,[111]6.6546,[112]6.6663,[113]6.6677,[114]6.6948,[115]6.7273,[116]6.7414,[117]6.7599,[118]6.7975,[119]6.7594,
+save_imatrix: stored collected data after 120 chunks in Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat
+[120]6.6840,[121]6.6151,[122]6.5444,[123]6.4784,[124]6.4269,[125]6.3698,
+save_imatrix: stored collected data after 125 chunks in Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat
+
+llama_print_timings: load time = 32479.28 ms
+llama_print_timings: sample time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+llama_print_timings: prompt eval time = 707989.47 ms / 64000 tokens ( 11.06 ms per token, 90.40 tokens per second)
+llama_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+llama_print_timings: total time = 735504.21 ms / 64001 tokens
+
+Final estimate: PPL = 6.3698 +/- 0.08949
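The log above is the output of llama.cpp's imatrix tool run against the Q8_0 GGUF with 25 of 81 layers offloaded to a single RTX 4090: 125 chunks of 512 tokens (64,000 calibration tokens in total) are processed, the collected data is periodically saved to Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat, and the run ends with a perplexity estimate of 6.3698 on the calibration text. A minimal sketch of driving such a run from Python is shown below; the binary name, calibration-file name, and exact flags are assumptions for illustration, not the command actually used here.

import subprocess

# Sketch: collect an importance matrix with llama.cpp's imatrix tool.
# Binary name, calibration file, and -ngl value are assumptions.
subprocess.run(
    [
        "./llama-imatrix",
        "-m", "Palmyra-Fin-70B-32K-IMat-GGUF/Palmyra-Fin-70B-32K.Q8_0.gguf",
        "-f", "calibration.txt",  # hypothetical calibration text file
        "-o", "Palmyra-Fin-70B-32K-IMat-GGUF/imatrix.dat",
        "-ngl", "25",             # offload 25 layers to the GPU, as in the log
    ],
    check=True,
)

The resulting imatrix.dat is then consumed by the quantize step (via its --imatrix option) to produce the IMat GGUF variants in this repository.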