
culturax-ar-spbpe32k-focus-embs-anneal-bf16-mixed-xassyy15

W&B run URL: https://wandb.ai/konstantinjdobler/tv/runs/xassyy15
W&B run ID: xassyy15
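The full training history for this run can be pulled programmatically. A minimal sketch using the public `wandb` API; the run path is read off the URL above and assumes the run is publicly accessible:

```python
import wandb

# Run path "<entity>/<project>/<run_id>" taken from the W&B URL above.
api = wandb.Api()
run = api.run("konstantinjdobler/tv/xassyy15")

print(run.config)   # logged hyperparameters (should match the Metadata JSON below)
print(run.summary)  # final logged metrics
```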

Metadata

```json
{
    "data_dir": "/raid/konstantin.dobler/culturax/ar/ar/tokenized/tokenizers_ar_sp-bpe-ar-32kauto",
    "model_path": "/raid/konstantin.dobler/checkpoints/culturax-ar-spbpe32k-smart-heuristics-attn-fix-infini-just-embs/ruxr78xn/step-0000100-ckpt",
    "from_scratch": false,
    "saved_checkpoint_path": null,
    "resume": false,
    "train_file": "train.txt",
    "val_file": "val.txt",
    "tokenizer_path": "/raid/konstantin.dobler/checkpoints/culturax-ar-spbpe32k-smart-heuristics-attn-fix-infini-just-embs/ruxr78xn/step-0000100-ckpt",
    "base_unit": "optimizer-steps",
    "training_goal": 7680,
    "eval_interval": 384,
    "eval_samples": 10000,
    "save_interval": 768,
    "log_interval": 1,
    "model_profiling_interval": 10,
    "warmup_period": 76,
    "lr_decay_period": 4608,
    "lr_final_annealing_period": 1075,
    "block_size": 4096,
    "decontaminated_packing": true,
    "max_lr": 3e-05,
    "batch_size": 256,
    "weight_decay": 0.05,
    "beta1": 0.9,
    "beta2": 0.95,
    "grad_clip": 1.0,
    "min_lr": 2e-06,
    "infinite_lr": 1.5e-05,
    "accelerator": "cuda",
    "num_devices": 4,
    "activation_checkpointing": false,
    "distributed_strategy": "auto",
    "use_fsdp": true,
    "fsdp_sharding_strategy": "SHARD_GRAD_OP",
    "fsdp_limit_all_gathers": false,
    "fsdp_cpu_offload": false,
    "fsdp_ram_friendly_checkpointing": false,
    "fsdp_backward_prefetch_post": false,
    "smart_cuda_alloc": false,
    "fast_model_loading": true,
    "micro_batch_size": 1,
    "eval_micro_batch_size": 10,
    "gradient_accumulation_steps": 64,
    "precision": "bf16-true",
    "use_anyprecision_adamw": false,
    "adamw_foreach": true,
    "compile": false,
    "use_additional_flash_attn_kernels": true,
    "workers": 8,
    "preprocessing_workers": 224,
    "run_name": "culturax-ar-spbpe32k-focus+justembs-attn-fix-infini",
    "seed": 42,
    "only_val": false,
    "val_before_training": true,
    "out_dir": "/raid/konstantin.dobler/checkpoints/culturax-ar-spbpe32k-focus+justembs-attn-fix-infini",
    "wandb_tags": [],
    "offline": false,
    "debug": false,
    "model_profiling": true,
    "force_deterministic": false,
    "fast_dev_run": false,
    "cross_tokenizer_val": false,
    "optimized_activation_checkpointing_policy": false,
    "train_embeddings": false,
    "train_only_embeddings": false,
    "focus_init": false,
    "refocus_init": false,
    "mean_init": false,
    "random_init": false,
    "zipf_init": false,
    "smart_heuristic_init": false,
    "wechsel_init": false,
    "deepfocus_init": false,
    "zett_init": false,
    "focus_fasttext_dim": 300,
    "focus_fasttext_epochs": 3,
    "focus_fasttext_min_count": 10,
    "focus_auxiliary_mode": "fasttext-tokenlevel",
    "focus_fasttext_model_path": null,
    "focus_exact_match_all": false,
    "focus_match_symbols": false,
    "focus_bilingual_dict": null,
    "focus_bilingual_dict_mode": "mean",
    "focus_fuzzy_match_all": false,
    "focus_random_init_source": "source",
    "lora_r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "lora_query": false,
    "lora_key": false,
    "lora_value": false,
    "lora_projection": false,
    "lora_mlp": false,
    "lora_head": false,
    "perf_benchmark": false
}
```
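For reference, the global batch size is consistent with the parallelism settings: `micro_batch_size` (1) × `gradient_accumulation_steps` (64) × `num_devices` (4) = 256, matching `batch_size` above.

The schedule fields (`warmup_period`, `lr_decay_period`, `infinite_lr`, `lr_final_annealing_period`) suggest an "infinite" learning-rate schedule with a final annealing phase, consistent with the `anneal` and `infini` tags in the run name. A sketch of one plausible reading of that schedule, assuming linear warmup, cosine decay to the constant "infinite" LR, a constant phase, and a final linear anneal; this is an interpretation of the config, not the training code's actual implementation:

```python
import math

def lr_at(step: int,
          max_lr: float = 3e-5,        # "max_lr"
          infinite_lr: float = 1.5e-5, # "infinite_lr"
          min_lr: float = 2e-6,        # "min_lr"
          warmup: int = 76,            # "warmup_period"
          decay_end: int = 4608,       # "lr_decay_period"
          anneal_steps: int = 1075,    # "lr_final_annealing_period"
          total: int = 7680) -> float: # "training_goal" (optimizer steps)
    anneal_start = total - anneal_steps
    if step < warmup:        # linear warmup to max_lr
        return max_lr * step / warmup
    if step < decay_end:     # cosine decay from max_lr down to infinite_lr
        t = (step - warmup) / (decay_end - warmup)
        return infinite_lr + 0.5 * (max_lr - infinite_lr) * (1 + math.cos(math.pi * t))
    if step < anneal_start:  # constant "infinite" phase
        return infinite_lr
    # final linear anneal from infinite_lr to min_lr
    t = (step - anneal_start) / anneal_steps
    return infinite_lr + t * (min_lr - infinite_lr)
```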
Model size: 7.25B parameters (Safetensors, tensor type F32)
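A minimal loading sketch, assuming the checkpoint is a standard Hugging Face causal LM with its tokenizer bundled in the repo; the repo ID below is hypothetical (substitute the actual namespace of this card):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical repo ID -- replace <namespace> with this card's owner.
repo_id = "<namespace>/culturax-ar-spbpe32k-focus-embs-anneal-bf16-mixed-xassyy15"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # training used bf16; stored weights are F32
    device_map="auto",
)

prompt = "..."  # Arabic text; the tokenizer is a 32k SentencePiece BPE trained on CulturaX ar
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```

At 7.25B parameters the model will not fit on small GPUs in F32; loading in bf16 as above roughly halves the memory footprint.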