tangledgroup
/

tangled-llama-n-128k-v0.1

Model card Files and versions Community

mtasic85 committed on 27 days ago

Commit

5a1e5e5

1 Parent(s): 511f511

pretrain datasets

Browse files

Files changed (3) hide show

scripts/prepare_pretrain_datasets.py +3 -0
scripts/pretrain-model-0.yaml +1 -1
scripts/requirements.txt +147 -0

scripts/prepare_pretrain_datasets.py CHANGED Viewed

@@ -44,3 +44,6 @@ for i, (block_size, subchunk_size) in enumerate([(4097, 4000)]):
     )
     print(f'{i=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')

     )
     print(f'{i=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')
+    total_tokens = sum(data['token_count'] for data in dataset)
+    print(f'Total number of tokens in the optimized dataset {input_dir!r} is {total_tokens}')

scripts/pretrain-model-0.yaml CHANGED Viewed

@@ -78,7 +78,7 @@ train:
   max_steps:
   # Limits the length of samples. Off by default (type: Optional[int], default: null)
-  max_seq_length: 513
   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
   tie_embeddings: true

   max_steps:
   # Limits the length of samples. Off by default (type: Optional[int], default: null)
+  max_seq_length: 4097
   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
   tie_embeddings: true

scripts/requirements.txt ADDED Viewed

@@ -0,0 +1,147 @@

+absl-py==2.1.0
+accelerate==1.2.1
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.11
+aiosignal==1.3.2
+annotated-types==0.7.0
+antlr4-python3-runtime==4.11.0
+anyio==4.8.0
+attrs==24.3.0
+bitsandbytes==0.44.1
+boto3==1.35.97
+botocore==1.35.97
+certifi==2024.12.14
+chardet==5.2.0
+charset-normalizer==3.4.1
+click==8.1.8
+colorama==0.4.6
+DataProperty==1.1.0
+datasets==3.2.0
+dill==0.3.8
+docker-pycreds==0.4.0
+docstring_parser==0.16
+evaluate==0.4.3
+fastapi==0.115.6
+filelock==3.16.1
+frozenlist==1.5.0
+fsspec==2024.9.0
+gitdb==4.0.12
+GitPython==3.1.44
+grokadamw==0.1.2
+grpcio==1.69.0
+h11==0.14.0
+hf_transfer==0.1.9
+httpcore==1.0.7
+httptools==0.6.4
+httpx==0.28.1
+huggingface-hub==0.27.1
+idna==3.10
+immutabledict==4.2.1
+importlib_resources==6.5.2
+Jinja2==3.1.5
+jmespath==1.0.1
+joblib==1.4.2
+jsonargparse==4.32.1
+jsonlines==4.0.0
+langdetect==1.0.9
+lightning==2.5.0.post0
+lightning-thunder @ git+https://github.com/Lightning-AI/lightning-thunder/@57e95630b9bf6490e6a98488f4893138f9a67308
+lightning-utilities==0.11.9
+litdata==0.2.17
+litgpt @ git+https://github.com/Lightning-AI/litgpt.git@a5021be4bb48e27779586b56b062a1749ecb232f
+litserve==0.2.4
+lm_eval==0.4.7
+looseversion==1.3.0
+lxml==5.3.0
+Markdown==3.7
+MarkupSafe==3.0.2
+mbstrdecoder==1.1.3
+more-itertools==10.5.0
+mpmath==1.3.0
+multidict==6.1.0
+multiprocess==0.70.16
+networkx==3.4.2
+nltk==3.9.1
+numexpr==2.10.2
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+opt_einsum==3.4.0
+optree==0.13.1
+packaging==24.2
+pandas==2.2.3
+pathvalidate==3.2.3
+peft==0.14.0
+pillow==11.1.0
+platformdirs==4.3.6
+portalocker==3.1.1
+propcache==0.2.1
+protobuf==5.29.3
+psutil==6.1.1
+pyarrow==18.1.0
+pybind11==2.13.6
+pydantic==2.10.5
+pydantic_core==2.27.2
+pytablewriter==1.2.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+pytorch-lightning==2.5.0.post0
+pytz==2024.2
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
+rouge_score==0.1.2
+s3transfer==0.10.4
+sacrebleu==2.5.1
+safetensors==0.5.2
+scikit-learn==1.6.1
+scipy==1.15.1
+sentencepiece==0.2.0
+sentry-sdk==2.19.2
+setproctitle==1.3.4
+setuptools==75.8.0
+six==1.17.0
+smmap==5.0.2
+sniffio==1.3.1
+sophia-opt==0.2.2
+sqlitedict==2.1.0
+starlette==0.41.3
+sympy==1.13.1
+tabledata==1.3.4
+tabulate==0.9.0
+tcolorpy==0.1.7
+tensorboard==2.18.0
+tensorboard-data-server==0.7.2
+threadpoolctl==3.5.0
+tokenizers==0.21.0
+torch==2.5.1
+torchmetrics==1.6.1
+tqdm==4.67.1
+tqdm-multiprocess==0.0.11
+transformers==4.48.0
+triton==3.1.0
+typepy==1.3.4
+typeshed_client==2.7.0
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.3.0
+uvicorn==0.34.0
+uvloop==0.21.0
+wandb==0.19.2
+watchfiles==1.0.4
+websockets==14.1
+Werkzeug==3.1.3
+word2number==1.1
+xxhash==3.5.0
+yarl==1.18.3
+zstandard==0.23.0