Spaces:
Runtime error
Runtime error
victormiller
committed on
Update main.py
Browse files
main.py
CHANGED
@@ -183,43 +183,6 @@ def main():
|
|
183 |
)
|
184 |
|
185 |
|
186 |
-
intro_text = P(
|
187 |
-
"Pretraining performant large language models (LLMs) requires trillions of tokens of high-quality data. Many prior works, including our previous pretraining projects ",
|
188 |
-
A("Amber-7B", href="https://huggingface.co/LLM360/Amber"),
|
189 |
-
", ",
|
190 |
-
A("Crystal-7B", href="https://huggingface.co/LLM360/CrystalCoder"),
|
191 |
-
", ",
|
192 |
-
A("K2-65B", href="https://huggingface.co/LLM360/K2"),
|
193 |
-
" have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.",
|
194 |
-
)
|
195 |
-
|
196 |
-
intro_list = P(
|
197 |
-
"We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:"
|
198 |
-
)
|
199 |
-
|
200 |
-
intro_list1 = Ol(
|
201 |
-
Li(
|
202 |
-
"Curates commonly used pretraining datasets, including all CommonCrawl",
|
203 |
-
style="margin-bottom: 5px",
|
204 |
-
),
|
205 |
-
Li(
|
206 |
-
"Employs carefully selected filters designed for each data source",
|
207 |
-
style="margin-bottom: 5px",
|
208 |
-
),
|
209 |
-
Li(
|
210 |
-
"Provides only unique data elements via global deduplication across all datasets",
|
211 |
-
style="margin-bottom: 5px",
|
212 |
-
),
|
213 |
-
Li(
|
214 |
-
"Retains all deduplication metadata for custom upweighting",
|
215 |
-
style="margin-bottom: 5px",
|
216 |
-
),
|
217 |
-
Li(
|
218 |
-
"Is Production ready! Download here [link to HF repo]",
|
219 |
-
style="margin-bottom: 5px",
|
220 |
-
),
|
221 |
-
)
|
222 |
-
|
223 |
|
224 |
dataset_comparison1 = pd.DataFrame(
|
225 |
{
|
|
|
183 |
)
|
184 |
|
185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
dataset_comparison1 = pd.DataFrame(
|
188 |
{
|