mtasic85 committed on
Commit
5a1e5e5
·
1 Parent(s): 511f511

pretrain datasets

Browse files
scripts/prepare_pretrain_datasets.py CHANGED
@@ -44,3 +44,6 @@ for i, (block_size, subchunk_size) in enumerate([(4097, 4000)]):
44
  )
45
 
46
  print(f'{i=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')
 
 
 
 
44
  )
45
 
46
  print(f'{i=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')
47
+
48
+ total_tokens = sum(data['token_count'] for data in dataset)
49
+ print(f'Total number of tokens in the optimized dataset {input_dir!r} is {total_tokens}')
scripts/pretrain-model-0.yaml CHANGED
@@ -78,7 +78,7 @@ train:
78
  max_steps:
79
 
80
  # Limits the length of samples. Off by default (type: Optional[int], default: null)
81
- max_seq_length: 513
82
 
83
  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
84
  tie_embeddings: true
 
78
  max_steps:
79
 
80
  # Limits the length of samples. Off by default (type: Optional[int], default: null)
81
+ max_seq_length: 4097
82
 
83
  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
84
  tie_embeddings: true
scripts/requirements.txt ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==1.2.1
3
+ aiohappyeyeballs==2.4.4
4
+ aiohttp==3.11.11
5
+ aiosignal==1.3.2
6
+ annotated-types==0.7.0
7
+ antlr4-python3-runtime==4.11.0
8
+ anyio==4.8.0
9
+ attrs==24.3.0
10
+ bitsandbytes==0.44.1
11
+ boto3==1.35.97
12
+ botocore==1.35.97
13
+ certifi==2024.12.14
14
+ chardet==5.2.0
15
+ charset-normalizer==3.4.1
16
+ click==8.1.8
17
+ colorama==0.4.6
18
+ DataProperty==1.1.0
19
+ datasets==3.2.0
20
+ dill==0.3.8
21
+ docker-pycreds==0.4.0
22
+ docstring_parser==0.16
23
+ evaluate==0.4.3
24
+ fastapi==0.115.6
25
+ filelock==3.16.1
26
+ frozenlist==1.5.0
27
+ fsspec==2024.9.0
28
+ gitdb==4.0.12
29
+ GitPython==3.1.44
30
+ grokadamw==0.1.2
31
+ grpcio==1.69.0
32
+ h11==0.14.0
33
+ hf_transfer==0.1.9
34
+ httpcore==1.0.7
35
+ httptools==0.6.4
36
+ httpx==0.28.1
37
+ huggingface-hub==0.27.1
38
+ idna==3.10
39
+ immutabledict==4.2.1
40
+ importlib_resources==6.5.2
41
+ Jinja2==3.1.5
42
+ jmespath==1.0.1
43
+ joblib==1.4.2
44
+ jsonargparse==4.32.1
45
+ jsonlines==4.0.0
46
+ langdetect==1.0.9
47
+ lightning==2.5.0.post0
48
+ lightning-thunder @ git+https://github.com/Lightning-AI/lightning-thunder/@57e95630b9bf6490e6a98488f4893138f9a67308
49
+ lightning-utilities==0.11.9
50
+ litdata==0.2.17
51
+ litgpt @ git+https://github.com/Lightning-AI/litgpt.git@a5021be4bb48e27779586b56b062a1749ecb232f
52
+ litserve==0.2.4
53
+ lm_eval==0.4.7
54
+ looseversion==1.3.0
55
+ lxml==5.3.0
56
+ Markdown==3.7
57
+ MarkupSafe==3.0.2
58
+ mbstrdecoder==1.1.3
59
+ more-itertools==10.5.0
60
+ mpmath==1.3.0
61
+ multidict==6.1.0
62
+ multiprocess==0.70.16
63
+ networkx==3.4.2
64
+ nltk==3.9.1
65
+ numexpr==2.10.2
66
+ numpy==1.26.4
67
+ nvidia-cublas-cu12==12.4.5.8
68
+ nvidia-cuda-cupti-cu12==12.4.127
69
+ nvidia-cuda-nvrtc-cu12==12.4.127
70
+ nvidia-cuda-runtime-cu12==12.4.127
71
+ nvidia-cudnn-cu12==9.1.0.70
72
+ nvidia-cufft-cu12==11.2.1.3
73
+ nvidia-curand-cu12==10.3.5.147
74
+ nvidia-cusolver-cu12==11.6.1.9
75
+ nvidia-cusparse-cu12==12.3.1.170
76
+ nvidia-nccl-cu12==2.21.5
77
+ nvidia-nvjitlink-cu12==12.4.127
78
+ nvidia-nvtx-cu12==12.4.127
79
+ opt_einsum==3.4.0
80
+ optree==0.13.1
81
+ packaging==24.2
82
+ pandas==2.2.3
83
+ pathvalidate==3.2.3
84
+ peft==0.14.0
85
+ pillow==11.1.0
86
+ platformdirs==4.3.6
87
+ portalocker==3.1.1
88
+ propcache==0.2.1
89
+ protobuf==5.29.3
90
+ psutil==6.1.1
91
+ pyarrow==18.1.0
92
+ pybind11==2.13.6
93
+ pydantic==2.10.5
94
+ pydantic_core==2.27.2
95
+ pytablewriter==1.2.1
96
+ python-dateutil==2.9.0.post0
97
+ python-dotenv==1.0.1
98
+ pytorch-lightning==2.5.0.post0
99
+ pytz==2024.2
100
+ PyYAML==6.0.2
101
+ regex==2024.11.6
102
+ requests==2.32.3
103
+ rouge_score==0.1.2
104
+ s3transfer==0.10.4
105
+ sacrebleu==2.5.1
106
+ safetensors==0.5.2
107
+ scikit-learn==1.6.1
108
+ scipy==1.15.1
109
+ sentencepiece==0.2.0
110
+ sentry-sdk==2.19.2
111
+ setproctitle==1.3.4
112
+ setuptools==75.8.0
113
+ six==1.17.0
114
+ smmap==5.0.2
115
+ sniffio==1.3.1
116
+ sophia-opt==0.2.2
117
+ sqlitedict==2.1.0
118
+ starlette==0.41.3
119
+ sympy==1.13.1
120
+ tabledata==1.3.4
121
+ tabulate==0.9.0
122
+ tcolorpy==0.1.7
123
+ tensorboard==2.18.0
124
+ tensorboard-data-server==0.7.2
125
+ threadpoolctl==3.5.0
126
+ tokenizers==0.21.0
127
+ torch==2.5.1
128
+ torchmetrics==1.6.1
129
+ tqdm==4.67.1
130
+ tqdm-multiprocess==0.0.11
131
+ transformers==4.48.0
132
+ triton==3.1.0
133
+ typepy==1.3.4
134
+ typeshed_client==2.7.0
135
+ typing_extensions==4.12.2
136
+ tzdata==2024.2
137
+ urllib3==2.3.0
138
+ uvicorn==0.34.0
139
+ uvloop==0.21.0
140
+ wandb==0.19.2
141
+ watchfiles==1.0.4
142
+ websockets==14.1
143
+ Werkzeug==3.1.3
144
+ word2number==1.1
145
+ xxhash==3.5.0
146
+ yarl==1.18.3
147
+ zstandard==0.23.0