Ezi committed on
Commit
46df0b6
1 Parent(s): f352acd

Upload 312 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. LICENSE +201 -0
  3. OBELICS _100.csv +0 -0
  4. README.md +47 -6
  5. __pycache__/app.cpython-311.pyc +0 -0
  6. __pycache__/test_text_len.cpython-311.pyc +0 -0
  7. app.py +265 -0
  8. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/associations/identity_terms.json +1 -0
  9. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/base_dset/data-00000-of-00002.arrow +3 -0
  10. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/base_dset/data-00001-of-00002.arrow +3 -0
  11. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/base_dset/dataset_info.json +55 -0
  12. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/base_dset/state.json +16 -0
  13. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/general_stats_dict.json +1 -0
  14. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/lengths/length_measurements.json +1 -0
  15. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/lengths/lengths_fig.png +0 -0
  16. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/lengths/lengths_table.json +3 -0
  17. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/sorted_top_vocab.json +1 -0
  18. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/text_dset/data-00000-of-00001.arrow +3 -0
  19. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/text_dset/dataset_info.json +37 -0
  20. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/text_dset/state.json +13 -0
  21. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/text_duplicates/text_duplicates.html +110 -0
  22. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/text_duplicates/text_duplicates.json +1 -0
  23. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/tokenized_df.json +3 -0
  24. cache_dir/HuggingFaceM4/OBELICS_default_train_texts/vocab_counts.json +3 -0
  25. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/base_dset/data-00000-of-00001.arrow +3 -0
  26. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/base_dset/dataset_info.json +51 -0
  27. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/base_dset/state.json +13 -0
  28. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/dset_peek.json +0 -0
  29. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/general_stats_dict.json +1 -0
  30. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/lengths/length_measurements.json +1 -0
  31. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/lengths/lengths_fig.png +0 -0
  32. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/lengths/lengths_table.json +1 -0
  33. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/sorted_top_vocab.json +1 -0
  34. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/text_dset/cache-f6aa4a70e38b4a04.arrow +3 -0
  35. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/text_dset/data-00000-of-00001.arrow +3 -0
  36. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/text_dset/dataset_info.json +33 -0
  37. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/text_dset/state.json +13 -0
  38. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/text_duplicates/text_duplicates.html +1 -0
  39. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/text_duplicates/text_duplicates.json +1 -0
  40. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/tokenized_df.json +1 -0
  41. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/vocab_counts.json +0 -0
  42. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/zipf/zipf_basic_stats.json +1 -0
  43. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/zipf/zipf_fig.html +0 -0
  44. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/zipf/zipf_fig.json +1 -0
  45. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_images/base_dset/data-00000-of-00001.arrow +3 -0
  46. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_images/base_dset/dataset_info.json +51 -0
  47. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_images/base_dset/state.json +13 -0
  48. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_images/dset_peek.json +0 -0
  49. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_images/general_stats_dict.json +1 -0
  50. cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_images/lengths/length_measurements.json +1 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ cache_dir/HuggingFaceM4/OBELICS_default_train_texts/lengths/lengths_table.json filter=lfs diff=lfs merge=lfs -text
37
+ cache_dir/HuggingFaceM4/OBELICS_default_train_texts/tokenized_df.json filter=lfs diff=lfs merge=lfs -text
38
+ cache_dir/HuggingFaceM4/OBELICS_default_train_texts/vocab_counts.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
OBELICS _100.csv ADDED
The diff for this file is too large to render. See raw diff
 
README.md CHANGED
@@ -1,12 +1,53 @@
1
  ---
2
- title: IDEFICS Data Measurement Tool
3
- emoji: 📊
4
- colorFrom: yellow
5
- colorTo: yellow
6
  sdk: streamlit
7
- sdk_version: 1.25.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: DataMeasurementsTool
3
+ emoji: 🤗
4
+ colorFrom: indigo
5
+ colorTo: red
6
  sdk: streamlit
7
+ sdk_version: 1.0.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # Data Measurements Tool
13
+
14
+ 🚧 Under Construction 🚧
15
+
16
+ [![Generic badge](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/huggingface/data-measurements-tool)
17
+
18
+ For more information, check out our [blog post](https://huggingface.co/blog/data-measurements-tool)!
19
+
20
+ # How to run:
21
+
22
+ After cloning (and potentially setting up your virtual environment), run:
23
+
24
+ `pip install -r requirements.txt`
25
+
26
+ This installs all the requirements for the tool.
27
+
28
+ ## Command Line Interface
29
+
30
+ From there, you can measure different aspects of different datasets by running `run_data_measurements.py` with different options.
31
+ The options specify the HF Dataset, the Dataset config, the Dataset columns being measured, the measurements to use, and further details about caching and saving.
32
+
33
+ To see the full list of options, do:
34
+
35
+ `python3 run_data_measurements.py -h` or `python3 run_data_measurements.py --help`
36
+
37
+ Example for hate_speech18 dataset:
38
+
39
+ `python3 run_data_measurements.py --dataset="hate_speech18" --config="default" --split="train" --feature="text"`
40
+
41
+ Example for getting *just* the nPMI measurement from hate_speech18:
42
+
43
+ `python3 run_data_measurements.py --dataset=hate_speech18 --config default --split train --feature text --calculation npmi`
44
+
45
+
46
+ Example for IMDB dataset:
47
+
48
+ `python3 run_data_measurements.py --dataset="imdb" --config="plain_text" --split="train" --label_field="label" --feature="text"`
49
+
50
+
51
+ ## User Interface
52
+
53
+ `gradio app.py`
__pycache__/app.cpython-311.pyc ADDED
Binary file (14.9 kB). View file
 
__pycache__/test_text_len.cpython-311.pyc ADDED
Binary file (11.5 kB). View file
 
app.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import ast
17
+ import gradio as gr
18
+ from os.path import isdir
19
+ from data_measurements.dataset_statistics import DatasetStatisticsCacheClass as dmt_cls
20
+ import utils
21
+ from utils import dataset_utils
22
+ from utils import gradio_utils as gr_utils
23
+ import widgets
24
+
25
+ logs = utils.prepare_logging(__file__)
26
+
27
+ # Utility for sidebar description and selection of the dataset
28
+ DATASET_NAME_TO_DICT = dataset_utils.get_dataset_info_dicts()
29
+
30
+
31
+ def get_load_prepare_list(dstats):
32
+ """
33
+ # Get load_or_prepare functions for the measurements we will display
34
+ """
35
+ # Measurement calculation:
36
+ # Add any additional modules and their load-prepare function here.
37
+ load_prepare_list = [("general stats", dstats.load_or_prepare_general_stats),
38
+ ("label distribution", dstats.load_or_prepare_labels),
39
+ ("text_lengths", dstats.load_or_prepare_text_lengths),
40
+ ("duplicates", dstats.load_or_prepare_text_duplicates),
41
+ ("npmi", dstats.load_or_prepare_npmi),
42
+ ("zipf", dstats.load_or_prepare_zipf)]
43
+
44
+ return load_prepare_list
45
+
46
+
47
+ def get_ui_widgets():
48
+ """Get the widgets that will be displayed in the UI."""
49
+ return [widgets.DatasetDescription(DATASET_NAME_TO_DICT),
50
+ widgets.GeneralStats(),
51
+ widgets.LabelDistribution(),
52
+ widgets.TextLengths(),
53
+ widgets.Duplicates(),
54
+ widgets.Npmi(),
55
+ widgets.Zipf()]
56
+
57
+
58
+ def get_widgets():
59
+ """
60
+ # A measurement widget requires 2 things:
61
+ # - A load or prepare function
62
+ # - A display function
63
+ # We define these in two separate functions get_load_prepare_list and get_ui_widgets;
64
+ # any widget can be added by modifying both functions and the rest of the app logic will work.
65
+ # get_load_prepare_list is a function since it requires a DatasetStatisticsCacheClass which will
66
+ # not be created until dataset and config values are selected in the ui
67
+ """
68
+ return get_load_prepare_list, get_ui_widgets()
69
+
70
+
71
+ def get_title(dstats):
72
+ title_str = f"### Showing: {dstats.dset_name} - {dstats.dset_config} - {dstats.split_name} - {'-'.join(dstats.text_field)}"
73
+ logs.info("showing header")
74
+ return title_str
75
+
76
+
77
+ def display_initial_UI():
78
+ """Displays the header in the UI"""
79
+ # Extract the selected arguments
80
+ dataset_args = gr_utils.sidebar_selection(DATASET_NAME_TO_DICT)
81
+ return dataset_args
82
+
83
+
84
+ def load_or_prepare_widgets(dstats, load_prepare_list, show_perplexities, live=True, pull_cache_from_hub=False):
85
+ """
86
+ Takes the dataset arguments from the GUI and uses them to load a dataset from the Hub or, if
87
+ a cache for those arguments is available, to load it from the cache.
88
+ Widget data is loaded only when the system is live (deployed for users).
89
+ Otherwise, the data is prepared if it doesn't yet exist.
90
+ Args:
91
+ dstats (DatasetStatisticsCacheClass): the dataset statistics object built from the arguments selected in the GUI
92
+ load_prepare_list (list): List of (widget_name, widget_load_or_prepare_function)
93
+ show_perplexities (Bool): whether perplexities should be loaded and displayed for this dataset
94
+ live (Bool): Whether the system is deployed for live use by users.
95
+ pull_cache_from_hub (Bool): Whether the cache should be pulled from the hub (vs locally)
96
+ Returns:
97
+ dstats: the computed dataset statistics (from the dataset_statistics class)
98
+ """
99
+
100
+ # When we're "live" (tool is being used by users on our servers),
101
+ # cache is used and the functions are instructed to only try to load the cache,
102
+ # not to prepare/compute anything anew.
103
+ if live:
104
+ # Only use what's cached; don't prepare anything
105
+ load_only = True
106
+ logs.info("Only using cache.")
107
+ else:
108
+ # Prepare things anew and cache them if we're not live.
109
+ load_only = False
110
+ logs.info("Making new calculations if cache is not there.")
111
+ if pull_cache_from_hub:
112
+ dataset_utils.pull_cache_from_hub(dstats.cache_path, dstats.dataset_cache_dir)
113
+
114
+ # Data common across DMT:
115
+ # Includes the dataset text/requested feature column,
116
+ # the dataset tokenized, and the vocabulary
117
+ dstats.load_or_prepare_text_dataset(load_only=load_only)
118
+ # Just a snippet of the dataset
119
+ dstats.load_or_prepare_dset_peek(load_only=load_only)
120
+ # Tokenized dataset
121
+ dstats.load_or_prepare_tokenized_df(load_only=load_only)
122
+ # Vocabulary (uses tokenized dataset)
123
+ dstats.load_or_prepare_vocab(load_only=load_only)
124
+ # Custom widgets
125
+ for widget_tuple in load_prepare_list:
126
+ widget_name = widget_tuple[0]
127
+ widget_fn = widget_tuple[1]
128
+ try:
129
+ widget_fn(load_only=load_only)
130
+ except Exception as e:
131
+ logs.warning("Issue with %s." % widget_name)
132
+ logs.exception(e)
133
+ # TODO: If these are cached, can't we just show them by default?
134
+ # It won't take up computation time.
135
+ if show_perplexities:
136
+ try:
137
+ dstats.load_or_prepare_text_perplexities(load_only=load_only)
138
+ except Exception as e:
139
+ logs.warning("Issue with %s." % "perplexities")
140
+ logs.exception(e)
141
+ return dstats
142
+
143
+
144
+ def show_column(dstats, display_list, show_perplexities, column_id=""):
145
+ """
146
+ Function for displaying the elements in the Gradio app.
147
+ Args:
148
+ dstats (class): The dataset_statistics.py DatasetStatisticsCacheClass
149
+ display_list (list): List of tuples for (widget_name, widget_display_function)
150
+ show_perplexities (Bool): Whether perplexities should be loaded and displayed for this dataset
151
+ column_id (str): Which column of the dataset the analysis is done on [DEPRECATED for v1]
152
+ """
153
+
154
+ # start showing stuff
155
+ gr_utils.expander_header(dstats, DATASET_NAME_TO_DICT)
156
+ for widget_tuple in display_list:
157
+ widget_type = widget_tuple[0]
158
+ widget_fn = widget_tuple[1]
159
+ logs.info("showing %s." % widget_type)
160
+ try:
161
+ widget_fn(dstats, column_id)
162
+ except Exception as e:
163
+ logs.warning("Jk jk jk. There was an issue with %s:" % widget_type)
164
+ logs.exception(e)
165
+ # TODO: Fix how this is a weird outlier.
166
+ if show_perplexities:
167
+ gr_utils.expander_text_perplexities(dstats, column_id)
168
+ logs.info("Have finished displaying the widgets.")
169
+
170
+
171
+ def create_demo(live: bool, pull_cache_from_hub: bool):
172
+ with gr.Blocks() as demo:
173
+ state = gr.State()
174
+ with gr.Row():
175
+ with gr.Column(scale=1):
176
+ dataset_args = display_initial_UI()
177
+ get_load_prepare_list_fn, widget_list = get_widgets()
178
+ # # TODO: Make this less of a weird outlier.
179
+ # Doesn't do anything right now
180
+ show_perplexities = gr.Checkbox(label="Show text perplexities")
181
+ with gr.Column(scale=4):
182
+ gr.Markdown("# Data Measurements Tool")
183
+ title = gr.Markdown()
184
+ for widget in widget_list:
185
+ widget.render()
186
+
187
+ def update_ui(dataset: str, config: str, split: str, feature: str):
188
+ feature = ast.literal_eval(feature)
189
+ label_field, label_names = gr_utils.get_label_names(dataset, config, DATASET_NAME_TO_DICT)
190
+ dstats = dmt_cls(dset_name=dataset, dset_config=config, split_name=split, text_field=feature,
191
+ label_field=label_field, label_names=label_names, use_cache=True)
192
+ load_prepare_list = get_load_prepare_list_fn(dstats)
193
+ dstats = load_or_prepare_widgets(dstats, load_prepare_list, show_perplexities=False,
194
+ live=live, pull_cache_from_hub=pull_cache_from_hub)
195
+ output = {title: get_title(dstats), state: dstats}
196
+ for widget in widget_list:
197
+ output.update(widget.update(dstats))
198
+ return output
199
+
200
+ def update_dataset(dataset: str):
201
+ new_values = gr_utils.update_dataset(dataset, DATASET_NAME_TO_DICT)
202
+ config = new_values[0][1]
203
+ feature = new_values[1][1]
204
+ split = new_values[2][1]
205
+ new_dropdown = {
206
+ dataset_args["dset_config"]: gr.Dropdown.update(choices=new_values[0][0], value=config),
207
+ dataset_args["text_field"]: gr.Dropdown.update(choices=new_values[1][0], value=feature),
208
+ dataset_args["split_name"]: gr.Dropdown.update(choices=new_values[2][0], value=split),
209
+ }
210
+ return new_dropdown
211
+
212
+ def update_config(dataset: str, config: str):
213
+ new_values = gr_utils.update_config(dataset, config, DATASET_NAME_TO_DICT)
214
+
215
+ feature = new_values[0][1]
216
+ split = new_values[1][1]
217
+ new_dropdown = {
218
+ dataset_args["text_field"]: gr.Dropdown.update(choices=new_values[0][0], value=feature),
219
+ dataset_args["split_name"]: gr.Dropdown.update(choices=new_values[1][0], value=split)
220
+ }
221
+ return new_dropdown
222
+
223
+ measurements = [comp for output in widget_list for comp in output.output_components]
224
+ demo.load(update_ui,
225
+ inputs=[dataset_args["dset_name"], dataset_args["dset_config"], dataset_args["split_name"], dataset_args["text_field"]],
226
+ outputs=[title, state] + measurements)
227
+
228
+ for widget in widget_list:
229
+ widget.add_events(state)
230
+ #dataset_args["text_field"] --> the text that could be returned
231
+ dataset_args["dset_name"].change(update_dataset,
232
+ inputs=[dataset_args["dset_name"]],
233
+ outputs=[dataset_args["dset_config"],
234
+ dataset_args["split_name"], dataset_args["text_field"],
235
+ title, state] + measurements)
236
+
237
+ dataset_args["dset_config"].change(update_config,
238
+ inputs=[dataset_args["dset_name"], dataset_args["dset_config"]],
239
+ outputs=[dataset_args["split_name"], dataset_args["text_field"],
240
+ title, state] + measurements)
241
+
242
+ dataset_args["calculate_btn"].click(update_ui,
243
+ inputs=[dataset_args["dset_name"], dataset_args["dset_config"],
244
+ dataset_args["split_name"], dataset_args["text_field"]],
245
+ outputs=[title, state] + measurements)
246
+ return demo
247
+
248
+
249
+ def main():
250
+ parser = argparse.ArgumentParser()
251
+ parser.add_argument(
252
+ "--live", default=False, required=False, action="store_true", help="Flag to specify that this is running live (only cached results are loaded; nothing new is computed).")
253
+ parser.add_argument(
254
+ "--pull_cache_from_hub", default=False, required=False, action="store_true", help="Flag to specify whether to look in the hub for measurements caches. If you are using this option, you must have HUB_CACHE_ORGANIZATION=<the organization you've set up on the hub to store your cache> and HF_TOKEN=<your hf token> on separate lines in a file named .env at the root of this repo.")
255
+ arguments = parser.parse_args()
256
+ live = arguments.live
257
+ pull_cache_from_hub = arguments.pull_cache_from_hub
258
+
259
+ # Create and initialize the demo
260
+ demo = create_demo(live, pull_cache_from_hub)
261
+
262
+ demo.launch()
263
+
264
+ if __name__ == "__main__":
265
+ main()
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/associations/identity_terms.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["man", "woman", "gay", "lesbian", "queer", "trans", "straight", "cis", "she", "her", "hers", "he", "him", "his", "they", "them", "their", "theirs", "himself", "herself"]
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/base_dset/data-00000-of-00002.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33f1296731fb15954bceb2aae92ea57bd8351aa21017eb5032eea1be391b32b3
3
+ size 259553152
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/base_dset/data-00001-of-00002.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89173d78ae611fdfdd580dc00877e2022df059141619d44b43f8bc1f9856c2ca
3
+ size 259689664
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/base_dset/dataset_info.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builder_name": "generator",
3
+ "citation": "",
4
+ "config_name": "default",
5
+ "dataset_name": "generator",
6
+ "dataset_size": 519190160,
7
+ "description": "",
8
+ "download_checksums": {},
9
+ "download_size": 0,
10
+ "features": {
11
+ "images": {
12
+ "feature": {
13
+ "dtype": "string",
14
+ "_type": "Value"
15
+ },
16
+ "_type": "Sequence"
17
+ },
18
+ "metadata": {
19
+ "dtype": "string",
20
+ "_type": "Value"
21
+ },
22
+ "general_metadata": {
23
+ "dtype": "string",
24
+ "_type": "Value"
25
+ },
26
+ "texts": {
27
+ "feature": {
28
+ "dtype": "string",
29
+ "_type": "Value"
30
+ },
31
+ "_type": "Sequence"
32
+ }
33
+ },
34
+ "homepage": "",
35
+ "license": "",
36
+ "size_in_bytes": 519190160,
37
+ "splits": {
38
+ "train": {
39
+ "name": "train",
40
+ "num_bytes": 519190160,
41
+ "num_examples": 100000,
42
+ "shard_lengths": [
43
+ 97000,
44
+ 3000
45
+ ],
46
+ "dataset_name": "generator"
47
+ }
48
+ },
49
+ "version": {
50
+ "version_str": "0.0.0",
51
+ "major": 0,
52
+ "minor": 0,
53
+ "patch": 0
54
+ }
55
+ }
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/base_dset/state.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00002.arrow"
5
+ },
6
+ {
7
+ "filename": "data-00001-of-00002.arrow"
8
+ }
9
+ ],
10
+ "_fingerprint": "e5b19719224d2bb9",
11
+ "_format_columns": null,
12
+ "_format_kwargs": {},
13
+ "_format_type": null,
14
+ "_output_all_columns": false,
15
+ "_split": "train"
16
+ }
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/general_stats_dict.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"total words": 458951, "total open words": 458695, "text_nan_count": 0, "duplicate_fraction": 0.0011676271846117192}
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/lengths/length_measurements.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"average_instance_length": 269.25647939302144, "standard_dev_instance_length": 304.0144488706206, "num_instance_lengths": 2024}
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/lengths/lengths_fig.png ADDED
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/lengths/lengths_table.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8687bd444d4f8d7a0aaa35bd51b51f25136e2b2078e8646df187f6a0caa4bf6b
3
+ size 348823286
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/sorted_top_vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"one": {"count": 162630, "proportion": 0.005280176118857242, "vocab": "one"}, "also": {"count": 118340, "proportion": 0.0038421941948322325, "vocab": "also"}, "said": {"count": 113455, "proportion": 0.003683590860019359, "vocab": "said"}, "new": {"count": 107660, "proportion": 0.0034954421752208735, "vocab": "new"}, "time": {"count": 107410, "proportion": 0.0034873253208292216, "vocab": "time"}, "like": {"count": 99338, "proportion": 0.0032252483262315726, "vocab": "like"}, "first": {"count": 90988, "proportion": 0.002954145389550407, "vocab": "first"}, "people": {"count": 89996, "proportion": 0.002921937711324333, "vocab": "people"}, "year": {"count": 77227, "proportion": 0.0025073612564163327, "vocab": "year"}, "two": {"count": 76345, "proportion": 0.0024787249941225858, "vocab": "two"}, "get": {"count": 66084, "proportion": 0.002145576822471635, "vocab": "get"}, "years": {"count": 65457, "proportion": 0.0021252197516573724, "vocab": "years"}, "many": {"count": 64031, "proportion": 0.0020789212142073915, "vocab": "many"}, "us": {"count": 61950, "proportion": 0.0020113565182512828, "vocab": "us"}, "well": {"count": 61646, "proportion": 0.0020014864233110345, "vocab": "well"}, "world": {"count": 61327, "proportion": 0.001991129317107287, "vocab": "world"}, "even": {"count": 60536, "proportion": 0.001965447589812101, "vocab": "even"}, "back": {"count": 56695, "proportion": 0.0018407402389387648, "vocab": "back"}, "way": {"count": 53417, "proportion": 0.0017343120441554282, "vocab": "way"}, "make": {"count": 52238, "proportion": 0.001696032958844399, "vocab": "make"}, "may": {"count": 50367, "proportion": 0.0016352864205772778, "vocab": "may"}, "see": {"count": 50134, "proportion": 0.0016277215122842586, "vocab": "see"}, "work": {"count": 49793, "proportion": 0.0016166501228940457, "vocab": "work"}, "much": {"count": 49767, "proportion": 0.0016158059700373138, "vocab": "much"}, "day": {"count": 49696, "proportion": 0.0016135007833900847, "vocab": "day"}, "last": 
{"count": 49458, "proportion": 0.0016057735380092324, "vocab": "last"}, "made": {"count": 48189, "proportion": 0.0015645723851172085, "vocab": "made"}, "good": {"count": 45788, "proportion": 0.001486618115539786, "vocab": "good"}, "life": {"count": 44834, "proportion": 0.0014556441991812432, "vocab": "life"}, "know": {"count": 44344, "proportion": 0.001439735164573606, "vocab": "know"}, "still": {"count": 43429, "proportion": 0.0014100274775001608, "vocab": "still"}, "game": {"count": 42254, "proportion": 0.0013718782618593979, "vocab": "game"}, "around": {"count": 41693, "proportion": 0.0013536640406045315, "vocab": "around"}, "take": {"count": 40628, "proportion": 0.0013190862408960955, "vocab": "take"}, "since": {"count": 39639, "proportion": 0.0012869759649227215, "vocab": "since"}, "three": {"count": 39214, "proportion": 0.0012731773124569138, "vocab": "three"}, "go": {"count": 38323, "proportion": 0.0012442488434050672, "vocab": "go"}, "going": {"count": 37231, "proportion": 0.0012087944234223328, "vocab": "going"}, "really": {"count": 37228, "proportion": 0.0012086970211696328, "vocab": "really"}, "long": {"count": 36636, "proportion": 0.0011894763099702017, "vocab": "long"}, "use": {"count": 36387, "proportion": 0.0011813919229961168, "vocab": "use"}, "state": {"count": 36205, "proportion": 0.0011754828529989942, "vocab": "state"}, "think": {"count": 36113, "proportion": 0.0011724958505828665, "vocab": "think"}, "best": {"count": 35958, "proportion": 0.0011674634008600423, "vocab": "best"}, "part": {"count": 35927, "proportion": 0.0011664569109154776, "vocab": "part"}, "right": {"count": 34951, "proportion": 0.0011347687113704694, "vocab": "right"}, "another": {"count": 34857, "proportion": 0.0011317167741192085, "vocab": "another"}, "however": {"count": 34535, "proportion": 0.0011212622656627611, "vocab": "however"}, "great": {"count": 34388, "proportion": 0.0011164895552804698, "vocab": "great"}, "home": {"count": 33717, "proportion": 
0.0010947039180932768, "vocab": "home"}, "city": {"count": 33439, "proportion": 0.0010856779760097602, "vocab": "city"}, "used": {"count": 33168, "proportion": 0.0010768793058492099, "vocab": "used"}, "need": {"count": 33129, "proportion": 0.0010756130765641123, "vocab": "need"}, "next": {"count": 32568, "proportion": 0.001057398855309246, "vocab": "next"}, "want": {"count": 32261, "proportion": 0.0010474313581162976, "vocab": "want"}, "team": {"count": 32133, "proportion": 0.0010432755286677719, "vocab": "team"}, "high": {"count": 31938, "proportion": 0.0010369443822422835, "vocab": "high"}, "old": {"count": 31307, "proportion": 0.0010164574417577548, "vocab": "old"}, "every": {"count": 30892, "proportion": 0.001002983463467613, "vocab": "every"}, "place": {"count": 30693, "proportion": 0.0009965224473718584, "vocab": "place"}, "end": {"count": 30667, "proportion": 0.0009956782945151265, "vocab": "end"}, "different": {"count": 29807, "proportion": 0.0009677563154078449, "vocab": "different"}, "family": {"count": 29380, "proportion": 0.0009538927281069038, "vocab": "family"}, "things": {"count": 29302, "proportion": 0.0009513602695367084, "vocab": "things"}, "little": {"count": 29181, "proportion": 0.000947431712011149, "vocab": "little"}, "show": {"count": 29068, "proportion": 0.0009437628938261225, "vocab": "show"}, "book": {"count": 28603, "proportion": 0.0009286655446576504, "vocab": "book"}, "come": {"count": 28403, "proportion": 0.000922172061144329, "vocab": "come"}, "something": {"count": 28362, "proportion": 0.0009208408970240981, "vocab": "something"}, "government": {"count": 27858, "proportion": 0.0009044773185705285, "vocab": "government"}, "second": {"count": 27836, "proportion": 0.0009037630353840631, "vocab": "second"}, "found": {"count": 27709, "proportion": 0.000899639673353104, "vocab": "found"}, "never": {"count": 27600, "proportion": 0.0008961007248383439, "vocab": "never"}, "including": {"count": 27573, "proportion": 0.0008952241045640455, 
"vocab": "including"}, "help": {"count": 27472, "proportion": 0.0008919448953898183, "vocab": "help"}, "love": {"count": 27435, "proportion": 0.0008907436009398538, "vocab": "love"}, "look": {"count": 27099, "proportion": 0.000879834548637474, "vocab": "look"}, "got": {"count": 27064, "proportion": 0.0008786981890226427, "vocab": "got"}, "set": {"count": 27038, "proportion": 0.0008778540361659109, "vocab": "set"}, "find": {"count": 27005, "proportion": 0.0008767826113862129, "vocab": "find"}, "story": {"count": 26842, "proportion": 0.0008714904223228561, "vocab": "story"}, "lot": {"count": 26791, "proportion": 0.0008698345840269592, "vocab": "lot"}, "say": {"count": 26642, "proportion": 0.0008649969388095347, "vocab": "say"}, "country": {"count": 26452, "proportion": 0.0008588281294718795, "vocab": "country"}, "man": {"count": 26091, "proportion": 0.0008471073917303344, "vocab": "man"}, "company": {"count": 26087, "proportion": 0.0008469775220600681, "vocab": "company"}, "says": {"count": 25932, "proportion": 0.000841945072337244, "vocab": "says"}, "season": {"count": 25360, "proportion": 0.000823373709489145, "vocab": "season"}, "school": {"count": 25352, "proportion": 0.0008231139701486122, "vocab": "school"}, "week": {"count": 25289, "proportion": 0.0008210685228419159, "vocab": "week"}, "always": {"count": 25064, "proportion": 0.0008137633538894294, "vocab": "always"}, "film": {"count": 24920, "proportion": 0.0008090880457598381, "vocab": "film"}, "without": {"count": 24878, "proportion": 0.0008077244142220406, "vocab": "without"}, "music": {"count": 24720, "proportion": 0.0008025945622465167, "vocab": "music"}, "big": {"count": 24691, "proportion": 0.0008016530071370851, "vocab": "big"}, "according": {"count": 24428, "proportion": 0.0007931140763170676, "vocab": "according"}, "better": {"count": 24423, "proportion": 0.0007929517392292345, "vocab": "better"}, "play": {"count": 24392, "proportion": 0.0007919452492846698, "vocab": "play"}, "group": {"count": 
24235, "proportion": 0.0007868478647267125, "vocab": "group"}, "top": {"count": 23990, "proportion": 0.0007788933474228939, "vocab": "top"}}
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/text_dset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b9c7e98ea4cc4b01f89eee3a3a7ef47fa6d9a30fcebb22d7273ba7aef52392f
3
+ size 334497112
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/text_dset/dataset_info.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builder_name": "generator",
3
+ "citation": "",
4
+ "config_name": "default",
5
+ "dataset_name": "generator",
6
+ "dataset_size": 519190160,
7
+ "description": "",
8
+ "download_checksums": {},
9
+ "download_size": 0,
10
+ "features": {
11
+ "text": {
12
+ "dtype": "string",
13
+ "_type": "Value"
14
+ }
15
+ },
16
+ "homepage": "",
17
+ "license": "",
18
+ "size_in_bytes": 519190160,
19
+ "splits": {
20
+ "train": {
21
+ "name": "train",
22
+ "num_bytes": 519190160,
23
+ "num_examples": 100000,
24
+ "shard_lengths": [
25
+ 97000,
26
+ 3000
27
+ ],
28
+ "dataset_name": "generator"
29
+ }
30
+ },
31
+ "version": {
32
+ "version_str": "0.0.0",
33
+ "major": 0,
34
+ "minor": 0,
35
+ "patch": 0
36
+ }
37
+ }
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/text_dset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "003d7ffa6618774c",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": "train"
13
+ }
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/text_duplicates/text_duplicates.html ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <table border="1"><tr><th>duplicate_fraction</th><td>0.0011676271846117192</td></tr><tr><th>duplicates_dict</th><td><table border="1"><tr><th>Church of the Holy Sepulchre</th><td>2</td></tr><tr><th>Get fresh music recommendations delivered to your inbox every Friday.
2
+ We&#x27;ve updated our Terms of Use. You can review the changes here.</th><td>4</td></tr><tr><th>The Batman – watch the Bat and the Cat trailer</th><td>2</td></tr><tr><th>END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED</th><td>140</td></tr><tr><th>My name is Geoff Le Pard. Once I was a lawyer; now I am a writer. I&#x27;ve published four books - Dead Flies and Sherry Trifle, My Father and Other Liars, Salisbury Square and Buster &amp; Moo. In addition I have published three anthologies of short stories and a memoir of my mother. More will appear soon. I will try and continue to blog regularly at geofflepard.com about whatever takes my fancy. I hope it does yours too. These are my thoughts and no one else is to blame. If you want to nab anything I post, please acknowledge where it came from.
3
+ View all posts by TanGental →
4
+ This entry was posted in #writephoto, flash fiction, miscellany and tagged #writephoto, flash fiction. Bookmark the permalink.</th><td>2</td></tr><tr><th>Community content is available under CC-BY-SA unless otherwise noted.
5
+ Advertisement</th><td>2</td></tr><tr><th>Save products on your wishlist to buy them later or share with your friends.</th><td>2</td></tr><tr><th>A €500m aid package for EU farmers, a derogation from greening obligations and supports for feed and fertiliser are being considered by the European Commission.</th><td>2</td></tr><tr><th>An 11-Year-Old Girl Advises Her Teacher On Punishment Methods – And...</th><td>2</td></tr><tr><th>Molly grew up in California but now lives in the oh-so-amazing state of Texas with her husband, daughter, and fur babies. When she’s not diving into the world of her characters, some of her hobbies include hiking, snowboarding, traveling, and long walks on the beach … which roughly translates to being a homebody with her hubby and dishing out movie quotes. She has a weakness for crude-humored movies and fried pickles, and loves curling up in a fluffy comforter during a thunderstorm … or under one in a bathtub if there are tornados. That way she can pretend they aren’t really happening.</th><td>2</td></tr><tr><th>The 9-year-old got into character, pairing her leather jacket and pants with Jackson’s own “Smooth Criminal” hat.</th><td>2</td></tr><tr><th>Highland&#x27;s Maddie Dortch runs at the start of the race during the Triad Invitational on Wednesday, September 30, 2020 at Triad High School in Troy, Ill. Paul Halfacre, STLhighschoolsports.com</th><td>2</td></tr><tr><th>After excellent first-cut silage crops, it is a case of keeping the shoulder to the wheel to ensure fodder reserves are met for the coming winter. Declan Marren reports.</th><td>2</td></tr><tr><th>Scroll back to top</th><td>3</td></tr><tr><th>Already got the injury now what ☺️
6
+
7
+ Suffer till it&#x27;s better jk lol</th><td>2</td></tr><tr><th>We will write the formula as below:</th><td>2</td></tr><tr><th>There was an error retrieving images from Instagram. An attempt will be remade in a few minutes.</th><td>3</td></tr><tr><th>You can find out more about which cookies we are using or switch them off in settings.
8
+
9
+ This website uses cookies so that we can provide you with the best user experience possible. Cookie information is stored in your browser and performs functions such as recognising you when you return to our website and helping our team to understand which sections of the website you find most interesting and useful.
10
+
11
+ Strictly Necessary Cookie should be enabled at all times so that we can save your preferences for cookie settings.
12
+
13
+ If you disable this cookie, we will not be able to save your preferences. This means that every time you visit this website you will need to enable or disable cookies again.</th><td>2</td></tr><tr><th>In the meantime, learn about Mobile Workers Compensation below through our articles and write-up!</th><td>2</td></tr><tr><th>Lowe&#x27;s in south Fort Myers is one of several area stores that have restocked on essentials to include water, gas containers and generators in preparation for Hurricane Dorian. A manager at the Lowe&#x27;s said, if needed, they will ship supplies to stores in areas hardest hit by Hurricane Dorian. Kinfay Moroti/The News-Press USA Today Network-Florida
14
+ Fullscreen</th><td>2</td></tr><tr><th>There are no reviews yet.</th><td>2</td></tr><tr><th>80 Hindu couples tie the knot at mass wedding in Karachi</th><td>2</td></tr><tr><th>This website uses cookies to improve your experience while you navigate through the website. Out of these, the cookies that are categorized as necessary are stored on your browser as they are essential for the working of basic functionalities of the website. We also use third-party cookies that help us analyze and understand how you use this website. These cookies will be stored in your browser only with your consent. You also have the option to opt-out of these cookies. But opting out of some of these cookies may affect your browsing experience.
15
+ Necessary Always Enabled
16
+
17
+ Any cookies that may not be particularly necessary for the website to function and is used specifically to collect user personal data via analytics, ads, other embedded contents are termed as non-necessary cookies. It is mandatory to procure user consent prior to running these cookies on your website.</th><td>2</td></tr><tr><th>This site uses Akismet to reduce spam. Learn how your comment data is processed.</th><td>8</td></tr><tr><th>SEE ALL OF VELOCITY’S SUPERCARS AT PUKEKOHE HERE</th><td>2</td></tr><tr><th>skip to main | skip to sidebar</th><td>3</td></tr><tr><th>Posted 3 years ago by Yahoo</th><td>2</td></tr><tr><th>Not since van Gogh lopped off his ear has an artist’s knife been put to such good use.—Tessa Laird
18
+
19
+ New Zealand collage artist Peter Madden draws much of his imagery from old issues of National Geographic. He plunders and reworks the magazine’s discredited ’empire of signs’ to forge his own. His surrealistic pictures, objects, and installations—with their watchmaker detail and intensity—have been described as ‘microcosms’ and ‘intricate kingdoms of flying forms’ Madden has one foot in the vanitas still-life tradition and the other in new-age thinking. On the one hand, he is death obsessed: a master of morbid decoupage. (Moths and butterflies—symbols of transient life—abound. His assemblages in bell jars suggest some Victorian taxidermist killing time in his parlour.) On the other hand, with his flocks, schools, and swarms of quivering animal energy, he revels in biodiversity and magic. Madden’s works manage to be at once morbid and abundant, rotting and blooming, creepy and fey. This book serveys Madden’s work of the last ten years</th><td>2</td></tr><tr><th>Fallout 4: How to Get Vertibird Support</th><td>2</td></tr><tr><th>For Fallout 4 on the PlayStation 4, a GameFAQs message board topic titled &quot;Vertibirds going down constantly?&quot;.</th><td>2</td></tr><tr><th>I am a committed Piano tutor and composer with over 15 years experience teaching a wide range of pupils from children to...</th><td>2</td></tr><tr><th>We use cookies on our website to give you the most relevant experience by remembering your preferences and repeat visits. By clicking “Accept All”, you consent to the use of ALL the cookies. However, you may visit &quot;Cookie Settings&quot; to provide a controlled consent.
20
+ Cookie SettingsAccept All
21
+ Manage consent
22
+
23
+ This website uses cookies to improve your experience while you navigate through the website. Out of these, the cookies that are categorized as necessary are stored on your browser as they are essential for the working of basic functionalities of the website. We also use third-party cookies that help us analyze and understand how you use this website. These cookies will be stored in your browser only with your consent. You also have the option to opt-out of these cookies. But opting out of some of these cookies may affect your browsing experience.
24
+ Necessary Always Enabled
25
+ Necessary cookies are absolutely essential for the website to function properly. These cookies ensure basic functionalities and security features of the website, anonymously.
26
+ Functional
27
+ Functional cookies help to perform certain functionalities like sharing the content of the website on social media platforms, collect feedbacks, and other third-party features.
28
+ Performance
29
+ Performance cookies are used to understand and analyze the key performance indexes of the website which helps in delivering a better user experience for the visitors.
30
+ Analytics
31
+ Analytical cookies are used to understand how visitors interact with the website. These cookies help provide information on metrics the number of visitors, bounce rate, traffic source, etc.
32
+ Advertisement
33
+ Advertisement cookies are used to provide visitors with relevant ads and marketing campaigns. These cookies track visitors across websites and collect information to provide customized ads.
34
+ Others
35
+ Other uncategorized cookies are those that are being analyzed and have not been classified into a category as yet.
36
+ SAVE &amp; ACCEPT</th><td>3</td></tr><tr><th>Serbia signs Memorandum of Understanding with USAID on energy efficiency
37
+
38
+ Keep up with the latest trends and news of the CEE energy market! Sign up for our newsletters to receive curated news across the energy agenda in 20+ countries in Central and South-eastern Europe.</th><td>2</td></tr><tr><th>Concerns over effect of Rotorua plan</th><td>2</td></tr><tr><th>Jet skier in our wake</th><td>2</td></tr><tr><th>You may have missed</th><td>2</td></tr><tr><th>Showing posts from July, 2018
39
+ Show all</th><td>2</td></tr><tr><th>EXCERPT
40
+ As the band played, the dance floor filled. Nate looked over the top of his beer bottle as Rachel asked Grant to dance. It was shaping up to be a line dance and Grant, not looking like the cowboy boogie-type, begged off a second time.
41
+ She flashed Caroline a hopeful grin. “Do you want to dance?”
42
+ Caroline’s eyes darted to the dance floor. “I don’t know how to do that.”
43
+ Rachel set her hands on her hips. She cocked her head toward the line forming behind them. “Come on. I’ll teach you.”
44
+ Caroline shot Nate a pleading look as if asking him to save her. He bumped her shoulder instead. “Go ahead. Knock ’em dead.”
45
+ And damn, if she didn’t. She picked up the steps quickly, laughing every time she turned the wrong way or kicked out the opposite foot. It wasn’t long before she was rocking the arms and rolling her hips, but with an ethereal quality Nate had never witnessed in a country line dance before. Beside her, Rachel moved to the music a little differently, more seductive, less inhibited. Side by side with Caroline, he began to suspect Rachel wasn’t as innocent and naive as her older brother wanted to believe. Nate continued to watch her dance, enthralled. He’d just as soon imagine his sisters naked as he would Caroline, but Rachel? She conjured up fantasies even he’d never imagined before.
46
+ Grant paid no mind to Nate. His eyes were locked on Rachel’s long lithe body on the dance floor. She had a type, and this guy was it—tall, fair-haired, destined for a corner office. Nate brushed a hand over his scruffy face. Rachel could look him square in the eye when she wore heels. The only office he hoped to get was a concrete box with a pushout window.
47
+ Jealousy spiked in his chest before he finally pushed back from the table and headed back to the bar.
48
+ Faces flushed and smiling, Rachel and Caroline wove their way back to the table after he returned. He set a glass of water in front of Caroline, relieved to see Rachel drinking water, too.
49
+ Good. He preferred her date tonight ended with her sober.
50
+ Grant looked down at his phone as the band took a break and then leaned sideways to say something to Rachel. Nate sent her a curious look after Grant passed the bouncer and went outside.
51
+ Rachel shrugged and set down her glass as recorded music started to play over the loudspeakers. “He said he had to take a call for work.”
52
+ Caroline touched Nate’s shoulder. “Do you know which way is the toilet?”
53
+ Rachel smiled when he pointed to the far end of the bar.
54
+ Caroline stood. “I’ll be right back.”
55
+ “It’s just called the toilet in Ireland,” Nate explained after Caroline disappeared into the crowd. “Tell me more about Kieran. How does he like his new home?”
56
+ Rachel leaned her elbows on the table, her expression turning all sweet and sappy. “I think he’s happy. He meets me at the door every day when I get home and he likes to sleep in bed with me at night.”
57
+ “Hmmm,” was the best Nate could do.
58
+ She dropped her chin into her hands. “Can I ask you something?”
59
+ “Sure.”
60
+ “How much Irish do you speak?”
61
+ He grinned, assuming cussing didn’t count. “I only know a few words that my father taught me.”
62
+ Rachel’s lips twitched.
63
+ “What?”
64
+ “Your accent. You’re starting to sound a little bit like your girlfriend.”
65
+ He could tell she was teasing him, but he still felt the color rising in his cheeks. “I told you, Caroline and I are friends.”
66
+ She sat back and laughed as Lonestar’s “Amazed” began to play. “Matt’s right. Your Irish does come out when you’ve been drinking.”
67
+ Nate just shrugged. His accent was a byproduct of parents born and raised in Ireland. His father was proud of his thick Irish accent. His mother tried not to speak with any accent at all, but sometimes it would sneak out when one of her four kids got her riled up. It snuck out on him, too, sometimes, and not just while he was drinking. Times Matt didn’t know about. Moments Nate wished Rachel did.
68
+ Leaning closer, enough so that he could feel her warm breath on his cheek, she looked at him. “I have to ask you…did that kiss mean anything at all to you?”
69
+ He didn’t know how to answer. He thought about lying or twisting the truth. Or just brushing her off altogether. But he couldn’t do it. “Of course it meant something to me. But it can’t happen again.”
70
+ She let out a short laugh. “Then it didn’t mean much at all, did it?”
71
+ He stared at her, his throat so tight he could barely breathe. He told himself to keep his mouth shut. Put her first. Forget her.
72
+ But no, he looked over his shoulder for Caroline instead and then damn near lost his head. “Rachel, I’m crazy about you.” I love you! He clenched his jaw, determined to salvage the big fat mess he’d made. “But be realistic. I’m not the right guy for you.”
73
+ She eased back with defiance. “Who says?”
74
+ “How about we start with your brother?”
75
+ Her lips pinched together. He’d hit a nerve. “Who says I’m looking for Mr. Right?”
76
+ “What is that supposed to mean?”
77
+ “It means I’m not looking for a ring, Nate. I want to go out, have fun, blow off a little steam. That doesn’t work for you, so I won’t bother you again.”</th><td>2</td></tr><tr><th>AUTHOR BIO
78
+ Suzanne Winslow writes the kind of stories she loves to read—contemporary romance with relatable characters, unsung heroes and heroines, and true-to-life stories. Nurses, teachers, firefighters, and Marines top her list of champions. Give her a book about strong, brave characters with hidden vulnerabilities and a secret passion, and she’ll binge read to the end!
79
+ Suzanne and her husband, along with their rescue dog, Murphy, call Upstate New York home. When she’s not reading or writing, she’s often planning a road trip, or if it’s summertime, hanging out at the lake. Connecting with readers through Instagram, Facebook, and newsletters is a favorite pastime.
80
+ AUTHOR LINKS
81
+ WEBSITE
82
+ INSTAGRAM
83
+ FACEBOOK
84
+ GOODREADS
85
+ AMAZON</th><td>2</td></tr><tr><th>After breaking the partition, a sturdy metal frame in placed to ensure the upper part of the wall is safely supported and to facilitate access to the roof.</th><td>2</td></tr><tr><th>From the window situated over the release module and behind glass we can watch the chicks without them seeing us.</th><td>2</td></tr><tr><th>During the release process a young one-year old male from the wild population, visited the release module, attracted by the Colony Environment effect. It is probable that it is an individual from the urban centre of San Vicente where at least two pairs of lesser kestrel breed.</th><td>2</td></tr><tr><th>I’ve had a long love of books, and some of my most prized books are art books. This is a review of books from my collection that can be found on shelves in my studio. I will provide links when possible.</th><td>2</td></tr><tr><th>The Fairy Tales of Oscar Wilde</th><td>2</td></tr><tr><th>Just added to your cart</th><td>2</td></tr><tr><th>The West Side Lofts, a mixed-use development in the heart of Red Bank&#x27;s antique district, brought a fresh infusion of downtown residents when it opened about four years ago. Tanya Breen
86
+ Fullscreen</th><td>2</td></tr><tr><th>Interior of one of the apartments during the opening of Element, a new high-end 35 unit apartment complex along the Navesink River in Red Bank, NJ Wednesday May 29, 2019. Tanya Breen
87
+ Fullscreen</th><td>2</td></tr><tr><th>How To Responsibly Donate To Ukrainian Causes</th><td>2</td></tr><tr><th>The Subtle Violence Of So...</th><td>2</td></tr><tr><th>Corona-virus: Fun things to do while social distancing</th><td>2</td></tr><tr><th>Barcelona try to make up for Messi’s lost time</th><td>2</td></tr><tr><th>The Milton and Tamar Maltz Performing Arts Center, located on East 105th Street and Ansel Road in Cleveland. Prior to being used by Case Western Reserve University, the building was The Temple-Tifereth Israel’s home until the 1970s.</th><td>2</td></tr><tr><th>Error: Twitter did not respond. Please wait a few minutes and refresh this page.</th><td>2</td></tr><tr><th>Back to Top
88
+ Close</th><td>3</td></tr><tr><th>It was all over before I knew it and I just could not believe I could see almost perfectly straight after the surgery. Read more...</th><td>2</td></tr><tr><th>Watch music on TV: AXS TV programming highlights for the week of April 15-21</th><td>2</td></tr><tr><th>The BL King’s Topographical Collection: &quot;THE NORTH-EAST VIEW OF SCALEBY-CASTLE, IN THE COUNTY OF CUMBERLAND. &quot;</th><td>2</td></tr><tr><th>Welcome to our store</th><td>2</td></tr><tr><th>We seek to promote lively discussion and debate. We believe that our users have the right to express themselves freely in a manner that is courteous and respectful of others&#x27; point of view and sensibility.
89
+
90
+ Your message may be removed if we consider it to be:
91
+
92
+ Repeated violations may lead to suspension and/or termination of your message posting privileges.
93
+
94
+ www.tributes.in takes abuse very seriously: We will co-operate fully with law enforcement, including disclosure of your user ID, IP address and messaging history.
95
+
96
+ Please review your message, you cannot delete/edit once it has been posted.
97
+
98
+ READY TO GIVE THE MOST MEANINGFUL GIFT TO YOUR FAMILY?
99
+
100
+ Give a Tribute to someone special and see how your family and friends react - it&#x27;ll be priceless (trust us)!</th><td>2</td></tr><tr><th>How to start? Making a plan …</th><td>2</td></tr><tr><th>Victorian Fashion This era in fashion ranged primarily from the mid-1800s to the early 1900s. It&#x27; PowerPoint Presentation</th><td>2</td></tr><tr><th>How did the crisis grow between 1900-1914? PowerPoint Presentation</th><td>2</td></tr><tr><th>Meet Your Match on Dating Site with</th><td>2</td></tr><tr><th>Data Beams Down to Planet Comicon 2020</th><td>2</td></tr><tr><th>NBA Scoring Title Should Go To Durant Over Carmelo</th><td>2</td></tr><tr><th>Any cookies that may not be particularly necessary for the website to function and is used specifically to collect user personal data via analytics, ads, other embedded contents are termed as non-necessary cookies. It is mandatory to procure user consent prior to running these cookies on your website.</th><td>3</td></tr><tr><th>Commendation: Made in Australia: The Future of Australian Cities by Dr Julian Bolleter and Professor Richard Weller (Perth).</th><td>2</td></tr><tr><th>WINNER: Dune
101
+ Nightmare Alley
102
+ The Power of the Dog
103
+ The Tragedy of Macbeth
104
+ West Side Story</th><td>2</td></tr><tr><th>Everything Women Need to Know About Triathlon</th><td>2</td></tr><tr><th>Police keep people away from the Century 16 theater in Aurora, CO, just outside Denver after a shooting at the Midnight Premier of the Dark Knight Rises where 12 people are confirmed dead and many more injured</th><td>2</td></tr><tr><th>You don&#x27;t have permission to register</th><td>2</td></tr><tr><th>Geoff Neal believes he “shut people up” by knocking out Vicente Luque, expects “everybody is going to try to wrestle me now”</th><td>2</td></tr><tr><th>The Great Famine and the Irish Diaspora in America ebook</th><td>2</td></tr><tr><th>Demystifying the Role of AI in Cybersecurity
105
+
106
+ There&#x27;s a lot of anticipation and expectation in business around the role of artificial...</th><td>2</td></tr><tr><th>“Pale Blue Dot” by The NaveBlues</th><td>2</td></tr><tr><th>D-Day for R. Kelly as sex-crimes trial gets underway
107
+ 1 month ago
108
+ 1 month ago</th><td>2</td></tr><tr><th>Culture Current: Teenagers Are Hosed, Here’s What We Can Do</th><td>2</td></tr><tr><th>Winter camouflage in the BC Cariboo!</th><td>2</td></tr><tr><th>How Science Denial Happens and What You Can Do About it</th><td>2</td></tr><tr><th>Processed with VSCO with c1 preset</th><td>2</td></tr><tr><th>The Late Late Show with James Corden on Carpool karaoke</th><td>2</td></tr><tr><th>PAUL HINCE AND NEIL YOUNG GRAB ALL THE POINTS FOR CITY</th><td>2</td></tr><tr><th>Details Taking place between the 1st May and the 31st October 2010, the Shanghai World Expo was the largest Expo the world had ever seen. Represent.....</th><td>2</td></tr><tr><th>The office buildings contrast with the old design from Tokyo Station.</th><td>2</td></tr><tr><th>The most northerly point of our road trip.</th><td>2</td></tr><tr><th>Pin On Anniversary Quotes And Wishes</th><td>2</td></tr><tr><th>Longeveron up 100% after FDA approves its Lomecel-B medical product</th><td>2</td></tr><tr><th>↓ Download Image
109
+ Caption: Paul Medlock-Walton demonstrates Gameblox, which was developed by researchers at the Education Arcade, and allows users to create their own games.
110
+ Credits: Photo: Casey Atkins</th><td>2</td></tr><tr><th>Is Buying Gold a Good Investment?</th><td>2</td></tr><tr><th>Team 2 – work together on this collaborative puzzle game</th><td>2</td></tr><tr><th>Meredith Rosenthal (center) spoke about pharmaceutical marketing&#x27;s role in the opioid crisis. She is Gray professor of health economics at the Harvard T. H. Chan School of Public Health.</th><td>2</td></tr><tr><th>Rehabilitated borehole in use</th><td>2</td></tr><tr><th>This image from video provided by the FBI, shows Aaron Alexis moves through the hallways of Building #197 at the Washington Navy Yard on Sept. 16 in Washington, carrying a Remington 870 shotgun. Alexis, a 34-year-old former Navy reservist and IT contractor, shot and killed 12 people inside a Navy Yard building last week before being killed in a shootout with police. (AP Photo/FBI)</th><td>2</td></tr><tr><th>Pin by Ryann McBride on Humanoids in 2021 Character art</th><td>2</td></tr><tr><th>The Lebanese tourist was spared serious harm due to the rescue by local surfer Alik Reyes Narag and a Frenchman lifeguard ’hero’. Photo: Pavida Anantarasmi</th><td>2</td></tr><tr><th>PEMUDA HARUS “I DO CARE”</th><td>2</td></tr><tr><th>Is GameStop the Next RadioShack?</th><td>2</td></tr></table></td></tr></table>
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/text_duplicates/text_duplicates.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"duplicate_fraction": 0.0011676271846117192, "duplicates_dict": {"Church of the Holy Sepulchre": 2, "Get fresh music recommendations delivered to your inbox every Friday.\nWe've updated our Terms of Use. You can review the changes here.": 4, "The Batman \u2013 watch the Bat and the Cat trailer": 2, "END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED": 140, "My name is Geoff Le Pard. Once I was a lawyer; now I am a writer. I've published four books - Dead Flies and Sherry Trifle, My Father and Other Liars, Salisbury Square and Buster & Moo. In addition I have published three anthologies of short stories and a memoir of my mother. More will appear soon. I will try and continue to blog regularly at geofflepard.com about whatever takes my fancy. I hope it does yours too. These are my thoughts and no one else is to blame. If you want to nab anything I post, please acknowledge where it came from.\nView all posts by TanGental \u2192\nThis entry was posted in #writephoto, flash fiction, miscellany and tagged #writephoto, flash fiction. Bookmark the permalink.": 2, "Community content is available under CC-BY-SA unless otherwise noted.\nAdvertisement": 2, "Save products on your wishlist to buy them later or share with your friends.": 2, "A \u20ac500m aid package for EU farmers, a derogation from greening obligations and supports for feed and fertiliser are being considered by the European Commission.": 2, "An 11-Year-Old Girl Advises Her Teacher On Punishment Methods \u2013 And...": 2, "Molly grew up in California but now lives in the oh-so-amazing state of Texas with her husband, daughter, and fur babies. When she\u2019s not diving into the world of her characters, some of her hobbies include hiking, snowboarding, traveling, and long walks on the beach \u2026 which roughly translates to being a homebody with her hubby and dishing out movie quotes. 
She has a weakness for crude-humored movies and fried pickles, and loves curling up in a fluffy comforter during a thunderstorm \u2026 or under one in a bathtub if there are tornados. That way she can pretend they aren\u2019t really happening.": 2, "The 9-year-old got into character, pairing her leather jacket and pants with Jackson\u2019s own \u201cSmooth Criminal\u201d hat.": 2, "Highland's Maddie Dortch runs at the start of the race during the Triad Invitational on Wednesday, September 30, 2020 at Triad High School in Troy, Ill. Paul Halfacre, STLhighschoolsports.com": 2, "After excellent first-cut silage crops, it is a case of keeping the shoulder to the wheel to ensure fodder reserves are met for the coming winter. Declan Marren reports.": 2, "Scroll back to top": 3, "Already got the injury now what \u263a\ufe0f\n\nSuffer till it's better jk lol": 2, "We will write the formula as below:": 2, "There was an error retrieving images from Instagram. An attempt will be remade in a few minutes.": 3, "You can find out more about which cookies we are using or switch them off in settings.\n\nThis website uses cookies so that we can provide you with the best user experience possible. Cookie information is stored in your browser and performs functions such as recognising you when you return to our website and helping our team to understand which sections of the website you find most interesting and useful.\n\nStrictly Necessary Cookie should be enabled at all times so that we can save your preferences for cookie settings.\n\nIf you disable this cookie, we will not be able to save your preferences. 
This means that every time you visit this website you will need to enable or disable cookies again.": 2, "In the meantime, learn about Mobile Workers Compensation below through our articles and write-up!": 2, "Lowe's in south Fort Myers is one of several area stores that have restocked on essentials to include water, gas containers and generators in preparation for Hurricane Dorian. A manager at the Lowe's said, if needed, they will ship supplies to stores in areas hardest hit by Hurricane Dorian. Kinfay Moroti/The News-Press USA Today Network-Florida\nFullscreen": 2, "There are no reviews yet.": 2, "80 Hindu couples tie the knot at mass wedding in Karachi": 2, "This website uses cookies to improve your experience while you navigate through the website. Out of these, the cookies that are categorized as necessary are stored on your browser as they are essential for the working of basic functionalities of the website. We also use third-party cookies that help us analyze and understand how you use this website. These cookies will be stored in your browser only with your consent. You also have the option to opt-out of these cookies. But opting out of some of these cookies may affect your browsing experience.\nNecessary Always Enabled\n\nAny cookies that may not be particularly necessary for the website to function and is used specifically to collect user personal data via analytics, ads, other embedded contents are termed as non-necessary cookies. It is mandatory to procure user consent prior to running these cookies on your website.": 2, "This site uses Akismet to reduce spam. Learn how your comment data is processed.": 8, "SEE ALL OF VELOCITY\u2019S SUPERCARS AT PUKEKOHE HERE": 2, "skip to main | skip to sidebar": 3, "Posted 3 years ago by Yahoo": 2, "Not since van Gogh lopped off his ear has an artist\u2019s knife been put to such good use.\u2014Tessa Laird\n\nNew Zealand collage artist Peter Madden draws much of his imagery from old issues of National Geographic. 
He plunders and reworks the magazine\u2019s discredited \u2019empire of signs\u2019 to forge his own. His surrealistic pictures, objects, and installations\u2014with their watchmaker detail and intensity\u2014have been described as \u2018microcosms\u2019 and \u2018intricate kingdoms of flying forms\u2019 Madden has one foot in the vanitas still-life tradition and the other in new-age thinking. On the one hand, he is death obsessed: a master of morbid decoupage. (Moths and butterflies\u2014symbols of transient life\u2014abound. His assemblages in bell jars suggest some Victorian taxidermist killing time in his parlour.) On the other hand, with his flocks, schools, and swarms of quivering animal energy, he revels in biodiversity and magic. Madden\u2019s works manage to be at once morbid and abundant, rotting and blooming, creepy and fey. This book serveys Madden\u2019s work of the last ten years": 2, "Fallout 4: How to Get Vertibird Support": 2, "For Fallout 4 on the PlayStation 4, a GameFAQs message board topic titled \"Vertibirds going down constantly?\".": 2, "I am a committed Piano tutor and composer with over 15 years experience teaching a wide range of pupils from children to...": 2, "We use cookies on our website to give you the most relevant experience by remembering your preferences and repeat visits. By clicking \u201cAccept All\u201d, you consent to the use of ALL the cookies. However, you may visit \"Cookie Settings\" to provide a controlled consent.\nCookie SettingsAccept All\nManage consent\n\nThis website uses cookies to improve your experience while you navigate through the website. Out of these, the cookies that are categorized as necessary are stored on your browser as they are essential for the working of basic functionalities of the website. We also use third-party cookies that help us analyze and understand how you use this website. These cookies will be stored in your browser only with your consent. 
You also have the option to opt-out of these cookies. But opting out of some of these cookies may affect your browsing experience.\nNecessary Always Enabled\nNecessary cookies are absolutely essential for the website to function properly. These cookies ensure basic functionalities and security features of the website, anonymously.\nFunctional\nFunctional cookies help to perform certain functionalities like sharing the content of the website on social media platforms, collect feedbacks, and other third-party features.\nPerformance\nPerformance cookies are used to understand and analyze the key performance indexes of the website which helps in delivering a better user experience for the visitors.\nAnalytics\nAnalytical cookies are used to understand how visitors interact with the website. These cookies help provide information on metrics the number of visitors, bounce rate, traffic source, etc.\nAdvertisement\nAdvertisement cookies are used to provide visitors with relevant ads and marketing campaigns. These cookies track visitors across websites and collect information to provide customized ads.\nOthers\nOther uncategorized cookies are those that are being analyzed and have not been classified into a category as yet.\nSAVE & ACCEPT": 3, "Serbia signs Memorandum of Understanding with USAID on energy efficiency\n\nKeep up with the latest trends and news of the CEE energy market! Sign up for our newsletters to receive curated news across the energy agenda in 20+ countries in Central and South-eastern Europe.": 2, "Concerns over effect of Rotorua plan": 2, "Jet skier in our wake": 2, "You may have missed": 2, "Showing posts from July, 2018\nShow all": 2, "EXCERPT\nAs the band played, the dance floor filled. Nate looked over the top of his beer bottle as Rachel asked Grant to dance. It was shaping up to be a line dance and Grant, not looking like the cowboy boogie-type, begged off a second time.\nShe flashed Caroline a hopeful grin. 
\u201cDo you want to dance?\u201d\nCaroline\u2019s eyes darted to the dance floor. \u201cI don\u2019t know how to do that.\u201d\nRachel set her hands on her hips. She cocked her head toward the line forming behind them. \u201cCome on. I\u2019ll teach you.\u201d\nCaroline shot Nate a pleading look as if asking him to save her. He bumped her shoulder instead. \u201cGo ahead. Knock \u2019em dead.\u201d\nAnd damn, if she didn\u2019t. She picked up the steps quickly, laughing every time she turned the wrong way or kicked out the opposite foot. It wasn\u2019t long before she was rocking the arms and rolling her hips, but with an ethereal quality Nate had never witnessed in a country line dance before. Beside her, Rachel moved to the music a little differently, more seductive, less inhibited. Side by side with Caroline, he began to suspect Rachel wasn\u2019t as innocent and naive as her older brother wanted to believe. Nate continued to watch her dance, enthralled. He\u2019d just as soon imagine his sisters naked as he would Caroline, but Rachel? She conjured up fantasies even he\u2019d never imagined before.\nGrant paid no mind to Nate. His eyes were locked on Rachel\u2019s long lithe body on the dance floor. She had a type, and this guy was it\u2014tall, fair-haired, destined for a corner office. Nate brushed a hand over his scruffy face. Rachel could look him square in the eye when she wore heels. The only office he hoped to get was a concrete box with a pushout window.\nJealousy spiked in his chest before he finally pushed back from the table and headed back to the bar.\nFaces flushed and smiling, Rachel and Caroline wove their way back to the table after he returned. He set a glass of water in front of Caroline, relieved to see Rachel drinking water, too.\nGood. He preferred her date tonight ended with her sober.\nGrant looked down at his phone as the band took a break and then leaned sideways to say something to Rachel. 
Nate sent her a curious look after Grant passed the bouncer and went outside.\nRachel shrugged and set down her glass as recorded music started to play over the loudspeakers. \u201cHe said he had to take a call for work.\u201d\nCaroline touched Nate\u2019s shoulder. \u201cDo you know which way is the toilet?\u201d\nRachel smiled when he pointed to the far end of the bar.\nCaroline stood. \u201cI\u2019ll be right back.\u201d\n\u201cIt\u2019s just called the toilet in Ireland,\u201d Nate explained after Caroline disappeared into the crowd. \u201cTell me more about Kieran. How does he like his new home?\u201d\nRachel leaned her elbows on the table, her expression turning all sweet and sappy. \u201cI think he\u2019s happy. He meets me at the door every day when I get home and he likes to sleep in bed with me at night.\u201d\n\u201cHmmm,\u201d was the best Nate could do.\nShe dropped her chin into her hands. \u201cCan I ask you something?\u201d\n\u201cSure.\u201d\n\u201cHow much Irish do you speak?\u201d\nHe grinned, assuming cussing didn\u2019t count. \u201cI only know a few words that my father taught me.\u201d\nRachel\u2019s lips twitched.\n\u201cWhat?\u201d\n\u201cYour accent. You\u2019re starting to sound a little bit like your girlfriend.\u201d\nHe could tell she was teasing him, but he still felt the color rising in his cheeks. \u201cI told you, Caroline and I are friends.\u201d\nShe sat back and laughed as Lonestar\u2019s \u201cAmazed\u201d began to play. \u201cMatt\u2019s right. Your Irish does come out when you\u2019ve been drinking.\u201d\nNate just shrugged. His accent was a byproduct of parents born and raised in Ireland. His father was proud of his thick Irish accent. His mother tried not to speak with any accent at all, but sometimes it would sneak out when one of her four kids got her riled up. It snuck out on him, too, sometimes, and not just while he was drinking. Times Matt didn\u2019t know about. 
Moments Nate wished Rachel did.\nLeaning closer, enough so that he could feel her warm breath on his cheek, she looked at him. \u201cI have to ask you\u2026did that kiss mean anything at all to you?\u201d\nHe didn\u2019t know how to answer. He thought about lying or twisting the truth. Or just brushing her off altogether. But he couldn\u2019t do it. \u201cOf course it meant something to me. But it can\u2019t happen again.\u201d\nShe let out a short laugh. \u201cThen it didn\u2019t mean much at all, did it?\u201d\nHe stared at her, his throat so tight he could barely breathe. He told himself to keep his mouth shut. Put her first. Forget her.\nBut no, he looked over his shoulder for Caroline instead and then damn near lost his head. \u201cRachel, I\u2019m crazy about you.\u201d I love you! He clenched his jaw, determined to salvage the big fat mess he\u2019d made. \u201cBut be realistic. I\u2019m not the right guy for you.\u201d\nShe eased back with defiance. \u201cWho says?\u201d\n\u201cHow about we start with your brother?\u201d\nHer lips pinched together. He\u2019d hit a nerve. \u201cWho says I\u2019m looking for Mr. Right?\u201d\n\u201cWhat is that supposed to mean?\u201d\n\u201cIt means I\u2019m not looking for a ring, Nate. I want to go out, have fun, blow off a little steam. That doesn\u2019t work for you, so I won\u2019t bother you again.\u201d": 2, "AUTHOR BIO\nSuzanne Winslow writes the kind of stories she loves to read\u2014contemporary romance with relatable characters, unsung heroes and heroines, and true-to-life stories. Nurses, teachers, firefighters, and Marines top her list of champions. Give her a book about strong, brave characters with hidden vulnerabilities and a secret passion, and she\u2019ll binge read to the end!\nSuzanne and her husband, along with their rescue dog, Murphy, call Upstate New York home. When she\u2019s not reading or writing, she\u2019s often planning a road trip, or if it\u2019s summertime, hanging out at the lake. 
Connecting with readers through Instagram, Facebook, and newsletters is a favorite pastime.\nAUTHOR LINKS\nWEBSITE\nINSTAGRAM\nFACEBOOK\nGOODREADS\nAMAZON": 2, "After breaking the partition, a sturdy metal frame in placed to ensure the upper part of the wall is safely supported and to facilitate access to the roof.": 2, "From the window situated over the release module and behind glass we can watch the chicks without them seeing us.": 2, "During the release process a young one-year old male from the wild population, visited the release module, attracted by the Colony Environment effect. It is probable that it is an individual from the urban centre of San Vicente where at least two pairs of lesser kestrel breed.": 2, "I\u2019ve had a long love of books, and some of my most prized books are art books. This is a review of books from my collection that can be found on shelves in my studio. I will provide links when possible.": 2, "The Fairy Tales of Oscar Wilde": 2, "Just added to your cart": 2, "The West Side Lofts, a mixed-use development in the heart of Red Bank's antique district, brought a fresh infusion of downtown residents when it opened about four years ago. Tanya Breen\nFullscreen": 2, "Interior of one of the apartments during the opening of Element, a new high-end 35 unit apartment complex along the Navesink River in Red Bank, NJ Wednesday May 29, 2019. Tanya Breen\nFullscreen": 2, "How To Responsibly Donate To Ukrainian Causes": 2, "The Subtle Violence Of So...": 2, "Corona-virus: Fun things to do while social distancing": 2, "Barcelona try to make up for Messi\u2019s lost time": 2, "The Milton and Tamar Maltz Performing Arts Center, located on East 105th Street and Ansel Road in Cleveland. Prior to being used by Case Western Reserve University, the building was The Temple-Tifereth Israel\u2019s home until the 1970s.": 2, "Error: Twitter did not respond. 
Please wait a few minutes and refresh this page.": 2, "Back to Top\nClose": 3, "It was all over before I knew it and I just could not believe I could see almost perfectly straight after the surgery. Read more...": 2, "Watch music on TV: AXS TV programming highlights for the week of April 15-21": 2, "The BL King\u2019s Topographical Collection: \"THE NORTH-EAST VIEW OF SCALEBY-CASTLE, IN THE COUNTY OF CUMBERLAND. \"": 2, "Welcome to our store": 2, "We seek to promote lively discussion and debate. We believe that our users have the right to express themselves freely in a manner that is courteous and respectful of others' point of view and sensibility.\n\nYour message may be removed if we consider it to be:\n\nRepeated violations may lead to suspension and/or termination of your message posting privileges.\n\nwww.tributes.in takes abuse very seriously: We will co-operate fully with law enforcement, including disclosure of your user ID, IP address and messaging history.\n\nPlease review your message, you cannot delete/edit once it has been posted.\n\nREADY TO GIVE THE MOST MEANINGFUL GIFT TO YOUR FAMILY?\n\nGive a Tribute to someone special and see how your family and friends react - it'll be priceless (trust us)!": 2, "How to start? Making a plan \u2026": 2, "Victorian Fashion This era in fashion ranged primarily from the mid-1800s to the early 1900s. It' PowerPoint Presentation": 2, "How did the crisis grow between 1900-1914? PowerPoint Presentation": 2, "Meet Your Match on Dating Site with": 2, "Data Beams Down to Planet Comicon 2020": 2, "NBA Scoring Title Should Go To Durant Over Carmelo": 2, "Any cookies that may not be particularly necessary for the website to function and is used specifically to collect user personal data via analytics, ads, other embedded contents are termed as non-necessary cookies. 
It is mandatory to procure user consent prior to running these cookies on your website.": 3, "Commendation: Made in Australia: The Future of Australian Cities by Dr Julian Bolleter and Professor Richard Weller (Perth).": 2, "WINNER: Dune\nNightmare Alley\nThe Power of the Dog\nThe Tragedy of Macbeth\nWest Side Story": 2, "Everything Women Need to Know About Triathlon": 2, "Police keep people away from the Century 16 theater in Aurora, CO, just outside Denver after a shooting at the Midnight Premier of the Dark Knight Rises where 12 people are confirmed dead and many more injured": 2, "You don't have permission to register": 2, "Geoff Neal believes he \u201cshut people up\u201d by knocking out Vicente Luque, expects \u201ceverybody is going to try to wrestle me now\u201d": 2, "The Great Famine and the Irish Diaspora in America ebook": 2, "Demystifying the Role of AI in Cybersecurity\n\nThere's a lot of anticipation and expectation in business around the role of artificial...": 2, "\u201cPale Blue Dot\u201d by The NaveBlues": 2, "D-Day for R. Kelly as sex-crimes trial gets underway\n1 month ago\n1 month ago": 2, "Culture Current: Teenagers Are Hosed, Here\u2019s What We Can Do": 2, "Winter camouflage in the BC Cariboo!": 2, "How Science Denial Happens and What You Can Do About it": 2, "Processed with VSCO with c1 preset": 2, "The Late Late Show with James Corden on Carpool karaoke": 2, "PAUL HINCE AND NEIL YOUNG GRAB ALL THE POINTS FOR CITY": 2, "Details Taking place between the 1st May and the 31st October 2010, the Shanghai World Expo was the largest Expo the world had ever seen. 
Represent.....": 2, "The office buildings contrast with the old design from Tokyo Station.": 2, "The most northerly point of our road trip.": 2, "Pin On Anniversary Quotes And Wishes": 2, "Longeveron up 100% after FDA approves its Lomecel-B medical product": 2, "\u2193 Download Image\nCaption: Paul Medlock-Walton demonstrates Gameblox, which was developed by researchers at the Education Arcade, and allows users to create their own games.\nCredits: Photo: Casey Atkins": 2, "Is Buying Gold a Good Investment?": 2, "Team 2 \u2013 work together on this collaborative puzzle game": 2, "Meredith Rosenthal (center) spoke about pharmaceutical marketing's role in the opioid crisis. She is Gray professor of health economics at the Harvard T. H. Chan School of Public Health.": 2, "Rehabilitated borehole in use": 2, "This image from video provided by the FBI, shows Aaron Alexis moves through the hallways of Building #197 at the Washington Navy Yard on Sept. 16 in Washington, carrying a Remington 870 shotgun. Alexis, a 34-year-old former Navy reservist and IT contractor, shot and killed 12 people inside a Navy Yard building last week before being killed in a shootout with police. (AP Photo/FBI)": 2, "Pin by Ryann McBride on Humanoids in 2021 Character art": 2, "The Lebanese tourist was spared serious harm due to the rescue by local surfer Alik Reyes Narag and a Frenchman lifeguard \u2019hero\u2019. Photo: Pavida Anantarasmi": 2, "PEMUDA HARUS \u201cI DO CARE\u201d": 2, "Is GameStop the Next RadioShack?": 2}}
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/tokenized_df.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55f75e007c2895c3141121690edf9d4028ae64b77bd83ca7ecb6f2ece27b51c2
3
+ size 844434693
cache_dir/HuggingFaceM4/OBELICS_default_train_texts/vocab_counts.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38ff40d8372ff15bead1766d2f6b14e349a64d057a6f298ed52726e31b1873c5
3
+ size 39026204
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/base_dset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f23eb11877f1bf934ba12d0ce910ccb71b9ce3865798dc6651d12425244b529
3
+ size 489144
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/base_dset/dataset_info.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builder_name": "generator",
3
+ "citation": "",
4
+ "config_name": "default",
5
+ "dataset_name": "generator",
6
+ "dataset_size": 487928,
7
+ "description": "",
8
+ "download_checksums": {},
9
+ "download_size": 0,
10
+ "features": {
11
+ "images": {
12
+ "feature": {
13
+ "dtype": "string",
14
+ "_type": "Value"
15
+ },
16
+ "_type": "Sequence"
17
+ },
18
+ "metadata": {
19
+ "dtype": "string",
20
+ "_type": "Value"
21
+ },
22
+ "general_metadata": {
23
+ "dtype": "string",
24
+ "_type": "Value"
25
+ },
26
+ "texts": {
27
+ "feature": {
28
+ "dtype": "string",
29
+ "_type": "Value"
30
+ },
31
+ "_type": "Sequence"
32
+ }
33
+ },
34
+ "homepage": "",
35
+ "license": "",
36
+ "size_in_bytes": 487928,
37
+ "splits": {
38
+ "train": {
39
+ "name": "train",
40
+ "num_bytes": 487928,
41
+ "num_examples": 100,
42
+ "dataset_name": "generator"
43
+ }
44
+ },
45
+ "version": {
46
+ "version_str": "0.0.0",
47
+ "major": 0,
48
+ "minor": 0,
49
+ "patch": 0
50
+ }
51
+ }
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/base_dset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "ae60b20f0290ac61",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": "train"
13
+ }
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/dset_peek.json ADDED
The diff for this file is too large to render. See raw diff
 
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/general_stats_dict.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"total words": 1465, "total open words": 1344, "text_nan_count": 0, "duplicate_fraction": 0.0}
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/lengths/length_measurements.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"average_instance_length": 36.29, "standard_dev_instance_length": 4.557810479126613, "num_instance_lengths": 21}
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/lengths/lengths_fig.png ADDED
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/lengths/lengths_table.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"0": {"text": "{\"url\": \"https://eppc.org/publication/declaration-of-disruption/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499654.54/warc/CC-MAIN-20230128184907-20230128214907-00327.warc.gz\", \"warc_record_offset\": 250370303, \"warc_record_length\": 20061}", "length": 30}, "1": {"text": "{\"url\": \"https://lamborghinichat.com/forum/news/vw-group-allegedly-receives-offer-to-sell-lamborghini-for-9-2-billion.728/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-25/segments/1623488528979.69/warc/CC-MAIN-20210623011557-20210623041557-00312.warc.gz\", \"warc_record_offset\": 322560850, \"warc_record_length\": 17143}", "length": 41}, "2": {"text": "{\"url\": \"https://slidelegend.com/the-influencing-factors-on-entrepreneurship-ti-journals_59d9b3c91723dd4b8ead833e.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-24/segments/1590347392057.6/warc/CC-MAIN-20200527013445-20200527043445-00580.warc.gz\", \"warc_record_offset\": 536988620, \"warc_record_length\": 14280}", "length": 34}, "3": {"text": "{\"url\": \"https://www.henryusa.com/flexforkal/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500076.87/warc/CC-MAIN-20230203221113-20230204011113-00015.warc.gz\", \"warc_record_offset\": 828569590, \"warc_record_length\": 16616}", "length": 28}, "4": {"text": "{\"url\": \"https://www.arout.net/hotel-occupancy-drops-due-to-covid-19-certificates-in-mazatlan/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-33/segments/1659882572063.65/warc/CC-MAIN-20220814173832-20220814203832-00008.warc.gz\", \"warc_record_offset\": 598837769, \"warc_record_length\": 14575}", "length": 37}, "5": {"text": "{\"url\": \"https://www.glamsham.com/world/technology/apple-may-launch-ar-contact-lenses-in-10-years\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-17/segments/1618038083007.51/warc/CC-MAIN-20210415035637-20210415065637-00636.warc.gz\", \"warc_record_offset\": 870013982, \"warc_record_length\": 84010}", "length": 38}, "6": {"text": 
"{\"url\": \"http://otlsports.com/patriots-vs-dolphins-biggest-nfl-spreads-this-decade/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-10/segments/1581875145621.28/warc/CC-MAIN-20200221233354-20200222023354-00398.warc.gz\", \"warc_record_offset\": 111531585, \"warc_record_length\": 25025}", "length": 34}, "7": {"text": "{\"url\": \"https://birmingcabbie.wordpress.com/2013/01/16/silver-and-gold/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-34/segments/1596439737206.16/warc/CC-MAIN-20200807172851-20200807202851-00477.warc.gz\", \"warc_record_offset\": 233367373, \"warc_record_length\": 27864}", "length": 33}, "8": {"text": "{\"url\": \"https://www.vagabondinn.com/blog/vagabond-inn-comes-to-utah\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-21/segments/1652662531762.30/warc/CC-MAIN-20220520061824-20220520091824-00060.warc.gz\", \"warc_record_offset\": 1239312380, \"warc_record_length\": 14681}", "length": 33}, "9": {"text": "{\"url\": \"https://www.weeklygripe.co.uk/overuse-of-the-word-issue-in-the-media\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-43/segments/1634323585322.63/warc/CC-MAIN-20211020152307-20211020182307-00649.warc.gz\", \"warc_record_offset\": 1288254429, \"warc_record_length\": 10413}", "length": 36}, "10": {"text": "{\"url\": \"https://www.heavenlytealeaves.com/blogs/heavenly-tea-leaves-blog/exploring-the-green-teas-of-japan\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-39/segments/1631780057202.68/warc/CC-MAIN-20210921101319-20210921131319-00614.warc.gz\", \"warc_record_offset\": 837566473, \"warc_record_length\": 58147}", "length": 38}, "11": {"text": "{\"url\": \"https://tm.koreaherald.com/view.php?ud=20210504001014\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-25/segments/1623487612537.23/warc/CC-MAIN-20210614135913-20210614165913-00601.warc.gz\", \"warc_record_offset\": 532769245, \"warc_record_length\": 25017}", "length": 31}, "12": {"text": "{\"url\": 
\"https://bricks.stackexchange.com/questions/14655/how-do-you-display-multiple-items-at-once-on-an-ev3-screen/15044\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-33/segments/1659882573630.12/warc/CC-MAIN-20220819070211-20220819100211-00319.warc.gz\", \"warc_record_offset\": 175500523, \"warc_record_length\": 66290}", "length": 42}, "13": {"text": "{\"url\": \"https://www.toptipz.com.ng/armed-robbers-police-uniforms-enugu-caught-gun-battle-photos/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-10/segments/1581875148375.36/warc/CC-MAIN-20200229022458-20200229052458-00295.warc.gz\", \"warc_record_offset\": 904154101, \"warc_record_length\": 11479}", "length": 37}, "14": {"text": "{\"url\": \"https://gamecocksonline.com/news/2017/08/22/breaking-down-the-offensive-line/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-40/segments/1664030337322.29/warc/CC-MAIN-20221002115028-20221002145028-00747.warc.gz\", \"warc_record_offset\": 303777204, \"warc_record_length\": 19373}", "length": 35}, "15": {"text": "{\"url\": \"https://paulinepark.com/2014/06/09/gauguin-exoticization-primitivism-paganism/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499697.75/warc/CC-MAIN-20230129012420-20230129042420-00524.warc.gz\", \"warc_record_offset\": 473272628, \"warc_record_length\": 17321}", "length": 33}, "16": {"text": "{\"url\": \"https://talesofthetravelbug.wordpress.com/2016/02/26/abel-tasman-national-park/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-34/segments/1596439737319.74/warc/CC-MAIN-20200808080642-20200808110642-00060.warc.gz\", \"warc_record_offset\": 516800798, \"warc_record_length\": 29736}", "length": 34}, "17": {"text": "{\"url\": \"https://minnesota.cbslocal.com/2018/08/20/new-brighton-house-fire-dogs-rescued/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-04/segments/1610703531702.36/warc/CC-MAIN-20210123001629-20210123031629-00630.warc.gz\", \"warc_record_offset\": 455460747, \"warc_record_length\": 21947}", "length": 36}, "18": {"text": 
"{\"url\": \"https://simonegallina.blogspot.com/2012/03/famous-painter-of-prehistoric-murals.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-16/segments/1585370500331.13/warc/CC-MAIN-20200331053639-20200331083639-00229.warc.gz\", \"warc_record_offset\": 640302554, \"warc_record_length\": 19781}", "length": 35}, "19": {"text": "{\"url\": \"https://www.podparadise.com/Podcast/1533560948/Listen/1607614191/0\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-31/segments/1627046157039.99/warc/CC-MAIN-20210805193327-20210805223327-00602.warc.gz\", \"warc_record_offset\": 1002652790, \"warc_record_length\": 7640}", "length": 32}, "20": {"text": "{\"url\": \"https://vidmid.com/news/sir-ian-mckellen-martin-freeman-and-the-lord-of-the-rings-cast-kickstart-fundraising-campaign-to-buy-author-j-r-r-tolkien-s-oxford-home?uid=226886\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-04/segments/1610703538082.57/warc/CC-MAIN-20210123125715-20210123155715-00108.warc.gz\", \"warc_record_offset\": 628149199, \"warc_record_length\": 9581}", "length": 54}, "21": {"text": "{\"url\": \"https://www.etoren.com/products/fujifilm-x-t200-body-champagne-gold-kit-box\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-33/segments/1659882571090.80/warc/CC-MAIN-20220809215803-20220810005803-00211.warc.gz\", \"warc_record_offset\": 683638756, \"warc_record_length\": 40224}", "length": 36}, "22": {"text": "{\"url\": \"https://ionigeria.com/james-webb-the-most-powerful-space-telescope-ever-built-blast-into-orbit/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499845.10/warc/CC-MAIN-20230131055533-20230131085533-00613.warc.gz\", \"warc_record_offset\": 348002670, \"warc_record_length\": 28496}", "length": 38}, "23": {"text": "{\"url\": \"https://www.marshfieldnewsherald.com/story/news/2019/05/02/rothschild-police-wausau-man-charged-with-owi-said-his-gum-blame-smell-alcohol/3645355002/\", \"warc_filename\": 
\"crawl-data/CC-MAIN-2020-05/segments/1579250593994.14/warc/CC-MAIN-20200118221909-20200119005909-00365.warc.gz\", \"warc_record_offset\": 972941129, \"warc_record_length\": 43188}", "length": 46}, "24": {"text": "{\"url\": \"https://brindisa.com/blogs/news/the-food-geek-guide-to-iberico-ham\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764494852.95/warc/CC-MAIN-20230127001911-20230127031911-00821.warc.gz\", \"warc_record_offset\": 176182646, \"warc_record_length\": 58668}", "length": 35}, "25": {"text": "{\"url\": \"https://healthyceleb.com/scottie-pippen-height-weight-body-statistics/54041/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-16/segments/1585371830894.88/warc/CC-MAIN-20200409055849-20200409090349-00200.warc.gz\", \"warc_record_offset\": 476834551, \"warc_record_length\": 28617}", "length": 33}, "26": {"text": "{\"url\": \"https://worldnews.su/world/kabul-terror-attack-multiple-explosions-as-gunfire-rings-out-with-deaths-feared-world-news.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-27/segments/1656103626162.35/warc/CC-MAIN-20220629084939-20220629114939-00694.warc.gz\", \"warc_record_offset\": 683956680, \"warc_record_length\": 14884}", "length": 42}, "27": {"text": "{\"url\": \"https://observer.com/2022/05/what-will-elon-musk-do-with-twitter-here-is-a-6-point-guide/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500758.20/warc/CC-MAIN-20230208092053-20230208122053-00639.warc.gz\", \"warc_record_offset\": 450138738, \"warc_record_length\": 24833}", "length": 41}, "28": {"text": "{\"url\": \"https://www.zwillgen.com/law-enforcement/court-decision-rules-geofence-warrant-violates-fourth-amendment/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-49/segments/1669446710789.95/warc/CC-MAIN-20221201021257-20221201051257-00652.warc.gz\", \"warc_record_offset\": 1124917887, \"warc_record_length\": 23522}", "length": 37}, "29": {"text": "{\"url\": 
\"https://www.idropnews.com/news/beats-releases-ultra-affordable-new-50-beats-flex-wireless-earbuds/145050/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764501066.53/warc/CC-MAIN-20230209014102-20230209044102-00740.warc.gz\", \"warc_record_offset\": 857199535, \"warc_record_length\": 30176}", "length": 39}, "30": {"text": "{\"url\": \"https://callbag.co/epub/the-love-song-of-j-alfred-prufrock-and-other-poems/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-27/segments/1656104496688.78/warc/CC-MAIN-20220704202455-20220704232455-00300.warc.gz\", \"warc_record_offset\": 199444381, \"warc_record_length\": 17548}", "length": 37}, "31": {"text": "{\"url\": \"https://kitchen-repair.info/daftar-film-mae-pong-38/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-31/segments/1627046154175.76/warc/CC-MAIN-20210801092716-20210801122716-00658.warc.gz\", \"warc_record_offset\": 358919877, \"warc_record_length\": 11838}", "length": 32}, "32": {"text": "{\"url\": \"https://www.austinchronicle.com/news/2008-06-27/640116/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-50/segments/1606141194982.45/warc/CC-MAIN-20201128011115-20201128041115-00175.warc.gz\", \"warc_record_offset\": 591253388, \"warc_record_length\": 17495}", "length": 32}, "33": {"text": "{\"url\": \"https://www.fame10.com/trending-news/will-smith-and-jada-pinkett-smith-slam-divorce-rumors/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499919.70/warc/CC-MAIN-20230201081311-20230201111311-00455.warc.gz\", \"warc_record_offset\": 766206361, \"warc_record_length\": 32672}", "length": 38}, "34": {"text": "{\"url\": \"https://ednews.net/en/news/politics/440902-iran-embassy-ill-wishers-trying-to-disrupt-relations-between-tehran-baku\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-50/segments/1606141750841.83/warc/CC-MAIN-20201205211729-20201206001729-00355.warc.gz\", \"warc_record_offset\": 263492130, \"warc_record_length\": 22120}", "length": 41}, "35": {"text": "{\"url\": 
\"https://www.hopefornigeriaonline.com/category/africa/page/17/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-17/segments/1618039388763.75/warc/CC-MAIN-20210420091336-20210420121336-00107.warc.gz\", \"warc_record_offset\": 930101659, \"warc_record_length\": 22399}", "length": 31}, "36": {"text": "{\"url\": \"https://cmeindia.in/covid-19-3rd-wave-experience-in-india-a-survey-of-5971-adults/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500028.12/warc/CC-MAIN-20230202133541-20230202163541-00248.warc.gz\", \"warc_record_offset\": 178693725, \"warc_record_length\": 42926}", "length": 38}, "37": {"text": "{\"url\": \"https://www.sitra.fi/en/blogs/moving-boldly-towards-participatory-foresight/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-05/segments/1579250607407.48/warc/CC-MAIN-20200122191620-20200122220620-00393.warc.gz\", \"warc_record_offset\": 1088907449, \"warc_record_length\": 11772}", "length": 34}, "38": {"text": "{\"url\": \"https://suesspiciousminds.com/2013/05/14/four-minute-warning-or-last-night-the-plans-for-a-future-war-was-all-i-saw-on-channel-four/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-43/segments/1634323588398.42/warc/CC-MAIN-20211028162638-20211028192638-00233.warc.gz\", \"warc_record_offset\": 664847335, \"warc_record_length\": 31698}", "length": 48}, "39": {"text": "{\"url\": \"https://www.exquisitecoasts.com/spectacular-coastal-drives.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500339.37/warc/CC-MAIN-20230206113934-20230206143934-00685.warc.gz\", \"warc_record_offset\": 782007633, \"warc_record_length\": 19267}", "length": 31}, "40": {"text": "{\"url\": \"https://petebowes.com/2011/01/02/china-the-first-night/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-40/segments/1664030337625.5/warc/CC-MAIN-20221005105356-20221005135356-00284.warc.gz\", \"warc_record_offset\": 486450658, \"warc_record_length\": 27715}", "length": 33}, "41": {"text": "{\"url\": 
\"https://bluemonsterprep.com/blogs/news/hawaii-races-to-secure-power-plant-threatened-by-lava-flows\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499713.50/warc/CC-MAIN-20230129112153-20230129142153-00439.warc.gz\", \"warc_record_offset\": 159639259, \"warc_record_length\": 72974}", "length": 38}, "42": {"text": "{\"url\": \"http://archiveislam.com/the-muslim-snow-patrol-cleaning-up-the-city.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-05/segments/1579251681412.74/warc/CC-MAIN-20200125191854-20200125221854-00304.warc.gz\", \"warc_record_offset\": 13209660, \"warc_record_length\": 11206}", "length": 35}, "43": {"text": "{\"url\": \"https://thegatorseye.com/3188/news/muslims-in-detainment-a-look-into-chinas-camps/?print=true\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-33/segments/1659882573145.32/warc/CC-MAIN-20220818003501-20220818033501-00506.warc.gz\", \"warc_record_offset\": 510157631, \"warc_record_length\": 8481}", "length": 38}, "44": {"text": "{\"url\": \"https://jewishnews.timesofisrael.com/video-project-twinning-shoah-survivor-with-celebs-wins-two-awards/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-31/segments/1627046155458.35/warc/CC-MAIN-20210805063730-20210805093730-00308.warc.gz\", \"warc_record_offset\": 347762646, \"warc_record_length\": 14792}", "length": 37}, "45": {"text": "{\"url\": \"https://www.rowadventures.com/specialist/malcolm-reed\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-33/segments/1659882571056.58/warc/CC-MAIN-20220809155137-20220809185137-00754.warc.gz\", \"warc_record_offset\": 870524278, \"warc_record_length\": 12466}", "length": 30}, "46": {"text": "{\"url\": \"https://theshelfofunreadbooks.wordpress.com/2019/02/03/reviewthe-library-book-by-susan-orlean/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-43/segments/1634323587606.8/warc/CC-MAIN-20211024204628-20211024234628-00141.warc.gz\", \"warc_record_offset\": 701475464, \"warc_record_length\": 55792}", "length": 36}, "47": {"text": 
"{\"url\": \"https://energy.agwired.com/2010/10/07/wasted-food-wasted-energy/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764495001.99/warc/CC-MAIN-20230127164242-20230127194242-00001.warc.gz\", \"warc_record_offset\": 241286356, \"warc_record_length\": 20698}", "length": 34}, "48": {"text": "{\"url\": \"https://www.uscleiden.com/news/2021/05/14/beat-the-microbead/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-21/segments/1652662521041.0/warc/CC-MAIN-20220518021247-20220518051247-00703.warc.gz\", \"warc_record_offset\": 1269439293, \"warc_record_length\": 6324}", "length": 34}, "49": {"text": "{\"url\": \"https://bedandbicycle.com/is-this-cool-idea-with-butterfly-doorways-the-subsequent-mazda-miata.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-49/segments/1669446710764.12/warc/CC-MAIN-20221130124353-20221130154353-00274.warc.gz\", \"warc_record_offset\": 161515301, \"warc_record_length\": 13553}", "length": 38}, "50": {"text": "{\"url\": \"https://www.greatfallstribune.com/story/opinion/tribune-editorials/2016/06/10/montana-gets-manufacturing-tax-fairness/85722960/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-16/segments/1585370506870.41/warc/CC-MAIN-20200402080824-20200402110824-00330.warc.gz\", \"warc_record_offset\": 935440331, \"warc_record_length\": 43051}", "length": 40}, "51": {"text": "{\"url\": \"https://thenextweb.com/media/2011/11/08/last-fm-reached-the-60-billion-scrobble-landmark-with-lady-gaga/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-50/segments/1606141176049.8/warc/CC-MAIN-20201124082900-20201124112900-00250.warc.gz\", \"warc_record_offset\": 525905978, \"warc_record_length\": 21310}", "length": 41}, "52": {"text": "{\"url\": \"https://www.fingerlakes1.com/2021/03/07/sabres-lose-seventh-straight-with-5-2-loss-to-islanders/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-49/segments/1669446710771.39/warc/CC-MAIN-20221130192708-20221130222708-00543.warc.gz\", \"warc_record_offset\": 792006189, 
\"warc_record_length\": 23266}", "length": 40}, "53": {"text": "{\"url\": \"https://movizark.com/2018/10/29/el-tonto-ray-liotta-adrien-brody-travis-fimmel-ken-jeong-join-charlie-days-film/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-21/segments/1652662539101.40/warc/CC-MAIN-20220521112022-20220521142022-00165.warc.gz\", \"warc_record_offset\": 444833137, \"warc_record_length\": 39518}", "length": 43}, "54": {"text": "{\"url\": \"https://paradigmshyft.com/2022/02/27/totalitarianism-authoritarianism-and-fascism-what-is-the-difference/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499524.28/warc/CC-MAIN-20230128054815-20230128084815-00673.warc.gz\", \"warc_record_offset\": 442133024, \"warc_record_length\": 33777}", "length": 37}, "55": {"text": "{\"url\": \"https://www.rte.ie/news/2019/1201/1096265-smith-isis-home-turkey/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-49/segments/1637964363292.82/warc/CC-MAIN-20211206103243-20211206133243-00587.warc.gz\", \"warc_record_offset\": 1036937141, \"warc_record_length\": 22407}", "length": 35}, "56": {"text": "{\"url\": \"http://www.cmt.com/news/1490421/bluegrass-legend-charlie-waller-dead-at-69/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-05/segments/1579250595787.7/warc/CC-MAIN-20200119234426-20200120022426-00503.warc.gz\", \"warc_record_offset\": 208318103, \"warc_record_length\": 11530}", "length": 36}, "57": {"text": "{\"url\": \"https://caribbeantales-worldwide.com/a-hand-full-of-anticipation-for-the-caribbeantales-2011-film-festival/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499890.39/warc/CC-MAIN-20230131190543-20230131220543-00762.warc.gz\", \"warc_record_offset\": 180032374, \"warc_record_length\": 29529}", "length": 38}, "58": {"text": "{\"url\": \"https://www.knollwood.ca/blog/what-do-deacons-do\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-25/segments/1623488567696.99/warc/CC-MAIN-20210625023840-20210625053840-00248.warc.gz\", \"warc_record_offset\": 
748368127, \"warc_record_length\": 14745}", "length": 32}, "59": {"text": "{\"url\": \"https://silverchips.mbhs.edu/content/nba-award-predictions-31146/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-49/segments/1669446710972.37/warc/CC-MAIN-20221204104311-20221204134311-00476.warc.gz\", \"warc_record_offset\": 559787302, \"warc_record_length\": 11801}", "length": 32}, "60": {"text": "{\"url\": \"https://realdealtheatre.webs.com/press.htm\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-49/segments/1669446711074.68/warc/CC-MAIN-20221206060908-20221206090908-00826.warc.gz\", \"warc_record_offset\": 503711292, \"warc_record_length\": 4512}", "length": 29}, "61": {"text": "{\"url\": \"https://www.parkablogs.com/content/book-review-daisuke-moriyama-art-works-chronicle\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-17/segments/1618038072366.31/warc/CC-MAIN-20210413122252-20210413152252-00153.warc.gz\", \"warc_record_offset\": 1044766559, \"warc_record_length\": 11425}", "length": 35}, "62": {"text": "{\"url\": \"https://www.ungeek.ph/2016/10/inspiring-watch-dwayne-the-rock-johnson-react-on-his-very-first-wwe-match/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500094.26/warc/CC-MAIN-20230204044030-20230204074030-00871.warc.gz\", \"warc_record_offset\": 1068673394, \"warc_record_length\": 27741}", "length": 42}, "63": {"text": "{\"url\": \"https://holaamericanews.com/to-my-bros-me-too-the-womens-march-and-my-birthday/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-27/segments/1656103334753.21/warc/CC-MAIN-20220627134424-20220627164424-00450.warc.gz\", \"warc_record_offset\": 353753500, \"warc_record_length\": 26850}", "length": 37}, "64": {"text": "{\"url\": \"https://nobeladventures.com/2019/04/14/riding-around-new-york/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764501555.34/warc/CC-MAIN-20230209081052-20230209111052-00418.warc.gz\", \"warc_record_offset\": 443600050, \"warc_record_length\": 28918}", "length": 33}, "65": 
{"text": "{\"url\": \"https://www.nationalhogfarmer.com/news/hog-prices-keep-sliding\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-40/segments/1664030337731.82/warc/CC-MAIN-20221006061224-20221006091224-00466.warc.gz\", \"warc_record_offset\": 944340676, \"warc_record_length\": 23482}", "length": 32}, "66": {"text": "{\"url\": \"https://www.listal.com/viewentry/67108\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-04/segments/1610703519843.24/warc/CC-MAIN-20210119232006-20210120022006-00714.warc.gz\", \"warc_record_offset\": 856244466, \"warc_record_length\": 6855}", "length": 29}, "67": {"text": "{\"url\": \"https://www.gungoddess.com/blogs/choosing-a-gun/which-concealed-carry-guns-are-the-most-popular-right-now\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-43/segments/1634323585305.53/warc/CC-MAIN-20211020090145-20211020120145-00247.warc.gz\", \"warc_record_offset\": 952624329, \"warc_record_length\": 36797}", "length": 41}, "68": {"text": "{\"url\": \"https://www.historytoday.com/history-today-issues/volume-38-issue-9-september-1988\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-40/segments/1600400209665.4/warc/CC-MAIN-20200923015227-20200923045227-00745.warc.gz\", \"warc_record_offset\": 868067474, \"warc_record_length\": 7789}", "length": 36}, "69": {"text": "{\"url\": \"https://www.gsa.gov/historic-buildings/african-burial-ground-memorial-new-york-ny\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-21/segments/1652662530066.45/warc/CC-MAIN-20220519204127-20220519234127-00495.warc.gz\", \"warc_record_offset\": 887707280, \"warc_record_length\": 18194}", "length": 36}, "70": {"text": "{\"url\": \"https://hd-report.com/2014/02/14/comcast-merger-hideous-netflix-service/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-27/segments/1656103036176.7/warc/CC-MAIN-20220625220543-20220626010543-00439.warc.gz\", \"warc_record_offset\": 331507927, \"warc_record_length\": 70103}", "length": 35}, "71": {"text": "{\"url\": 
\"https://www.lawyersandsettlements.com/legal-news/harvoni-denied-insurance-claim/harvoni-lawsuits-denied-insurance-claim-20731.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-43/segments/1634323588526.57/warc/CC-MAIN-20211028193601-20211028223601-00464.warc.gz\", \"warc_record_offset\": 1030808939, \"warc_record_length\": 12183}", "length": 40}, "72": {"text": "{\"url\": \"https://aceaviation.info/15107/msi-personal-cinema-15/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-24/segments/1590347409171.27/warc/CC-MAIN-20200530102741-20200530132741-00512.warc.gz\", \"warc_record_offset\": 230608992, \"warc_record_length\": 8405}", "length": 31}, "73": {"text": "{\"url\": \"http://www.antehoc.com/2012/05/collateral-at-bank-of-italy-2.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500076.87/warc/CC-MAIN-20230203221113-20230204011113-00490.warc.gz\", \"warc_record_offset\": 49049249, \"warc_record_length\": 11713}", "length": 36}, "74": {"text": "{\"url\": \"https://www.thehansindia.com/news/cities/vijayawada/vijayawada-cpi-wants-cm-ys-jagan-to-raise-voice-against-vizag-steel-plant-sell-off-704225\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-49/segments/1637964362999.66/warc/CC-MAIN-20211204154554-20211204184554-00197.warc.gz\", \"warc_record_offset\": 1117681212, \"warc_record_length\": 45453}", "length": 46}, "75": {"text": "{\"url\": \"https://wkuf.fm/shows/wkuf-celebrates-the-30th-anniversary-of-violator/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-16/segments/1585370494064.21/warc/CC-MAIN-20200329074745-20200329104745-00223.warc.gz\", \"warc_record_offset\": 747266560, \"warc_record_length\": 14190}", "length": 34}, "76": {"text": "{\"url\": \"https://dailytimes.com.pk/379160/pakistan-urges-iran-and-afghanistan-to-do-more/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-24/segments/1590347385193.5/warc/CC-MAIN-20200524210325-20200525000325-00286.warc.gz\", \"warc_record_offset\": 318772991, \"warc_record_length\": 14394}", 
"length": 36}, "77": {"text": "{\"url\": \"https://www.dailyfinland.fi/business/21426/KONE-Q1-results-improve/print\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-25/segments/1623488289268.76/warc/CC-MAIN-20210621181810-20210621211810-00385.warc.gz\", \"warc_record_offset\": 652955119, \"warc_record_length\": 4559}", "length": 34}, "78": {"text": "{\"url\": \"https://newcanaanlandtrust.org/hannan/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500095.4/warc/CC-MAIN-20230204075436-20230204105436-00431.warc.gz\", \"warc_record_offset\": 445539595, \"warc_record_length\": 36215}", "length": 27}, "79": {"text": "{\"url\": \"https://www.coppolacomment.com/2016/05/where-on-earth-is-growth-in-greece.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-04/segments/1610703519843.24/warc/CC-MAIN-20210119232006-20210120022006-00102.warc.gz\", \"warc_record_offset\": 739236885, \"warc_record_length\": 37583}", "length": 37}, "80": {"text": "{\"url\": \"https://philadelphia.cbslocal.com/2013/02/10/stars-sing-along-to-lumineers/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-27/segments/1656103269583.13/warc/CC-MAIN-20220626131545-20220626161545-00651.warc.gz\", \"warc_record_offset\": 500521076, \"warc_record_length\": 21082}", "length": 35}, "81": {"text": "{\"url\": \"http://www.tdisport.com/tdisport-diesel-news/passat-used-car-of-the-year/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-29/segments/1593657140746.69/warc/CC-MAIN-20200713002400-20200713032400-00490.warc.gz\", \"warc_record_offset\": 231800895, \"warc_record_length\": 24971}", "length": 36}, "82": {"text": "{\"url\": \"https://www.betarena.com/atlanta-united-vs-montreal-impact-betting-tip-and-prediction/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-10/segments/1581875145316.8/warc/CC-MAIN-20200220224059-20200221014059-00491.warc.gz\", \"warc_record_offset\": 600780066, \"warc_record_length\": 35158}", "length": 36}, "83": {"text": "{\"url\": 
\"http://climateemergencydeclaration.org/head-cracking-moments-of-the-declaration-campaign/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500719.31/warc/CC-MAIN-20230208060523-20230208090523-00280.warc.gz\", \"warc_record_offset\": 8674637, \"warc_record_length\": 16718}", "length": 33}, "84": {"text": "{\"url\": \"http://2smeraldi.com/home/wp-includes/certificates/pdf/online-Studies-in-Modern-Music%3A-Frederick-Chopin%2C-Antonin-Dvo%C5%99%C3%A1k%2C-Johannes-Brahms-1904%2C2012/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-21/segments/1620243991772.66/warc/CC-MAIN-20210517115207-20210517145207-00081.warc.gz\", \"warc_record_offset\": 667469, \"warc_record_length\": 15861}", "length": 51}, "85": {"text": "{\"url\": \"https://megamarathi.com/news/braveheart-movie-selected-in-third-eye-asian-film-festival/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-49/segments/1637964362219.5/warc/CC-MAIN-20211202114856-20211202144856-00451.warc.gz\", \"warc_record_offset\": 472202588, \"warc_record_length\": 24269}", "length": 36}, "86": {"text": "{\"url\": \"https://plantpowerednomad.com/got-soy-milk-a-guide-to-vegan-lattes-in-taipei/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-49/segments/1669446711475.44/warc/CC-MAIN-20221209181231-20221209211231-00135.warc.gz\", \"warc_record_offset\": 502614546, \"warc_record_length\": 17761}", "length": 36}, "87": {"text": "{\"url\": \"https://pnesterova.com/2021/04/03/english-listening-futurologist/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-05/segments/1642320304810.95/warc/CC-MAIN-20220125100035-20220125130035-00017.warc.gz\", \"warc_record_offset\": 487850033, \"warc_record_length\": 35314}", "length": 32}, "88": {"text": "{\"url\": \"https://shapingthefutureofpower.com/2020/03/02/covid19-and-china-africa-soft-power-considerations/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-21/segments/1652662573053.67/warc/CC-MAIN-20220524142617-20220524172617-00116.warc.gz\", \"warc_record_offset\": 576892098, 
\"warc_record_length\": 26350}", "length": 36}, "89": {"text": "{\"url\": \"https://tass.com/politics/1023811\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-05/segments/1642320304570.90/warc/CC-MAIN-20220124124654-20220124154654-00633.warc.gz\", \"warc_record_offset\": 605037761, \"warc_record_length\": 18206}", "length": 28}, "90": {"text": "{\"url\": \"https://popculturetimes.com/2020/07/30/edge-of-tomorrow-2-what-is-release-date-and-latest-update/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-17/segments/1618038921860.72/warc/CC-MAIN-20210419235235-20210420025235-00434.warc.gz\", \"warc_record_offset\": 566143162, \"warc_record_length\": 19036}", "length": 40}, "91": {"text": "{\"url\": \"https://www.amnews.com/2016/11/12/centre-college-womens-soccer-moves-on-to-second-round/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764494826.88/warc/CC-MAIN-20230126210844-20230127000844-00124.warc.gz\", \"warc_record_offset\": 671862269, \"warc_record_length\": 20224}", "length": 39}, "92": {"text": "{\"url\": \"https://dev.to/chiexplores/for-beginners-what-you-need-to-know-about-var-let-const-45pl\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500273.30/warc/CC-MAIN-20230205161658-20230205191658-00037.warc.gz\", \"warc_record_offset\": 228975360, \"warc_record_length\": 20205}", "length": 39}, "93": {"text": "{\"url\": \"https://v5.femalefirst.co.uk/celebrity/eddie-redmayne-uses-cbd-oil-ease-nerves-1286919.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-25/segments/1623488268274.66/warc/CC-MAIN-20210621055537-20210621085537-00406.warc.gz\", \"warc_record_offset\": 525187166, \"warc_record_length\": 12139}", "length": 38}, "94": {"text": "{\"url\": \"https://www.saturdayeveningpost.com/2017/10/coping-estranged-adult-children/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499710.49/warc/CC-MAIN-20230129080341-20230129110341-00391.warc.gz\", \"warc_record_offset\": 1002765638, \"warc_record_length\": 30197}", 
"length": 33}, "95": {"text": "{\"url\": \"https://feifa.eu/spectacular-tax-savings-for-expats-using-spanish-compliant-investments/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-21/segments/1652663012542.85/warc/CC-MAIN-20220528031224-20220528061224-00707.warc.gz\", \"warc_record_offset\": 312775777, \"warc_record_length\": 12729}", "length": 35}, "96": {"text": "{\"url\": \"http://blogs.reading.ac.uk/crg/advent-botany-2015-day-15-mahleb/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-49/segments/1637964358847.80/warc/CC-MAIN-20211129225145-20211130015145-00550.warc.gz\", \"warc_record_offset\": 13017767, \"warc_record_length\": 17509}", "length": 35}, "97": {"text": "{\"url\": \"https://www.wired.com/2009/11/alt-text-clever-murdoch-turns-news-into-hip-underground-club/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-34/segments/1596439739048.46/warc/CC-MAIN-20200813161908-20200813191908-00099.warc.gz\", \"warc_record_offset\": 898049141, \"warc_record_length\": 73991}", "length": 39}, "98": {"text": "{\"url\": \"https://www.statesmanjournal.com/story/news/2015/10/16/oregon-wolf-or3-cascade-mountains-crater-lake-national-park/74071992/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-10/segments/1581875145774.75/warc/CC-MAIN-20200223123852-20200223153852-00550.warc.gz\", \"warc_record_offset\": 893628626, \"warc_record_length\": 42687}", "length": 42}, "99": {"text": "{\"url\": \"https://www.businesstraveller.com/business-travel/2018/03/14/boeing-rolls-10000th-737-aircraft/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-21/segments/1620243992159.64/warc/CC-MAIN-20210517084550-20210517114550-00131.warc.gz\", \"warc_record_offset\": 690172719, \"warc_record_length\": 19143}", "length": 37}}
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/sorted_top_vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"main": {"count": 200, "proportion": 0.0613685179502915, "vocab": "main"}, "cc": {"count": 200, "proportion": 0.0613685179502915, "vocab": "cc"}, "warc": {"count": 200, "proportion": 0.0613685179502915, "vocab": "warc"}, "segments": {"count": 100, "proportion": 0.03068425897514575, "vocab": "segments"}, "crawl": {"count": 100, "proportion": 0.03068425897514575, "vocab": "crawl"}, "warc_record_length": {"count": 100, "proportion": 0.03068425897514575, "vocab": "warc_record_length"}, "gz": {"count": 100, "proportion": 0.03068425897514575, "vocab": "gz"}, "warc_record_offset": {"count": 100, "proportion": 0.03068425897514575, "vocab": "warc_record_offset"}, "url": {"count": 100, "proportion": 0.03068425897514575, "vocab": "url"}, "warc_filename": {"count": 100, "proportion": 0.03068425897514575, "vocab": "warc_filename"}, "data": {"count": 100, "proportion": 0.03068425897514575, "vocab": "data"}, "https": {"count": 92, "proportion": 0.02822951825713409, "vocab": "https"}, "com": {"count": 77, "proportion": 0.023626879410862226, "vocab": "com"}, "www": {"count": 43, "proportion": 0.013194231359312672, "vocab": "www"}, "2021": {"count": 31, "proportion": 0.009512120282295183, "vocab": "2021"}, "2022": {"count": 29, "proportion": 0.008898435102792267, "vocab": "2022"}, "06": {"count": 25, "proportion": 0.007671064743786438, "vocab": "06"}, "2020": {"count": 25, "proportion": 0.007671064743786438, "vocab": "2020"}, "2023": {"count": 22, "proportion": 0.006750536974532065, "vocab": "2023"}, "news": {"count": 22, "proportion": 0.006750536974532065, "vocab": "news"}, "05": {"count": 12, "proportion": 0.00368211107701749, "vocab": "05"}, "html": {"count": 10, "proportion": 0.003068425897514575, "vocab": "html"}, "http": {"count": 8, "proportion": 0.00245474071801166, "vocab": "http"}, "02": {"count": 8, "proportion": 0.00245474071801166, "vocab": "02"}, "03": {"count": 6, "proportion": 0.001841055538508745, "vocab": "03"}, "04": {"count": 6, "proportion": 
0.001841055538508745, "vocab": "04"}, "blogs": {"count": 6, "proportion": 0.001841055538508745, "vocab": "blogs"}, "2016": {"count": 5, "proportion": 0.0015342129487572874, "vocab": "2016"}, "film": {"count": 4, "proportion": 0.00122737035900583, "vocab": "film"}, "99": {"count": 4, "proportion": 0.00122737035900583, "vocab": "99"}, "2019": {"count": 4, "proportion": 0.00122737035900583, "vocab": "2019"}, "new": {"count": 4, "proportion": 0.00122737035900583, "vocab": "new"}, "org": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "org"}, "world": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "world"}, "2013": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "2013"}, "08": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "08"}, "wordpress": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "wordpress"}, "2011": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "2011"}, "story": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "story"}, "07": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "07"}, "2018": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "2018"}, "blog": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "blog"}, "guide": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "guide"}, "home": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "home"}, "smith": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "smith"}, "uk": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "uk"}, "co": {"count": 3, "proportion": 0.0009205277692543725, "vocab": "co"}, "certificates": {"count": 2, "proportion": 0.000613685179502915, "vocab": "certificates"}, "j": {"count": 2, "proportion": 0.000613685179502915, "vocab": "j"}, "beats": {"count": 2, "proportion": 0.000613685179502915, "vocab": "beats"}, "00060": {"count": 2, "proportion": 0.000613685179502915, "vocab": "00060"}, "book": {"count": 2, "proportion": 0.000613685179502915, 
"vocab": "book"}, "20230203221113": {"count": 2, "proportion": 0.000613685179502915, "vocab": "20230203221113"}, "last": {"count": 2, "proportion": 0.000613685179502915, "vocab": "last"}, "vs": {"count": 2, "proportion": 0.000613685179502915, "vocab": "vs"}, "campaign": {"count": 2, "proportion": 0.000613685179502915, "vocab": "campaign"}, "01": {"count": 2, "proportion": 0.000613685179502915, "vocab": "01"}, "cbslocal": {"count": 2, "proportion": 0.000613685179502915, "vocab": "cbslocal"}, "billion": {"count": 2, "proportion": 0.000613685179502915, "vocab": "billion"}, "00550": {"count": 2, "proportion": 0.000613685179502915, "vocab": "00550"}, "business": {"count": 2, "proportion": 0.000613685179502915, "vocab": "business"}, "00490": {"count": 2, "proportion": 0.000613685179502915, "vocab": "00490"}, "vijayawada": {"count": 2, "proportion": 0.000613685179502915, "vocab": "vijayawada"}, "womens": {"count": 2, "proportion": 0.000613685179502915, "vocab": "womens"}, "iran": {"count": 2, "proportion": 0.000613685179502915, "vocab": "iran"}, "wkuf": {"count": 2, "proportion": 0.000613685179502915, "vocab": "wkuf"}, "issue": {"count": 2, "proportion": 0.000613685179502915, "vocab": "issue"}, "harvoni": {"count": 2, "proportion": 0.000613685179502915, "vocab": "harvoni"}, "fi": {"count": 2, "proportion": 0.000613685179502915, "vocab": "fi"}, "tdisport": {"count": 2, "proportion": 0.000613685179502915, "vocab": "tdisport"}, "media": {"count": 2, "proportion": 0.000613685179502915, "vocab": "media"}, "claim": {"count": 2, "proportion": 0.000613685179502915, "vocab": "claim"}, "20230204011113": {"count": 2, "proportion": 0.000613685179502915, "vocab": "20230204011113"}, "tax": {"count": 2, "proportion": 0.000613685179502915, "vocab": "tax"}, "china": {"count": 2, "proportion": 0.000613685179502915, "vocab": "china"}, "festival": {"count": 2, "proportion": 0.000613685179502915, "vocab": "festival"}, "charlie": {"count": 2, "proportion": 0.000613685179502915, "vocab": 
"charlie"}, "wasted": {"count": 2, "proportion": 0.000613685179502915, "vocab": "wasted"}, "insurance": {"count": 2, "proportion": 0.000613685179502915, "vocab": "insurance"}, "body": {"count": 2, "proportion": 0.000613685179502915, "vocab": "body"}, "00439": {"count": 2, "proportion": 0.000613685179502915, "vocab": "00439"}, "00248": {"count": 2, "proportion": 0.000613685179502915, "vocab": "00248"}, "africa": {"count": 2, "proportion": 0.000613685179502915, "vocab": "africa"}, "york": {"count": 2, "proportion": 0.000613685179502915, "vocab": "york"}, "info": {"count": 2, "proportion": 0.000613685179502915, "vocab": "info"}, "national": {"count": 2, "proportion": 0.000613685179502915, "vocab": "national"}, "fm": {"count": 2, "proportion": 0.000613685179502915, "vocab": "fm"}, "energy": {"count": 2, "proportion": 0.000613685179502915, "vocab": "energy"}, "en": {"count": 2, "proportion": 0.000613685179502915, "vocab": "en"}, "food": {"count": 2, "proportion": 0.000613685179502915, "vocab": "food"}, "four": {"count": 2, "proportion": 0.000613685179502915, "vocab": "four"}, "2014": {"count": 2, "proportion": 0.000613685179502915, "vocab": "2014"}, "1610703519843": {"count": 2, "proportion": 0.000613685179502915, "vocab": "1610703519843"}, "gun": {"count": 2, "proportion": 0.000613685179502915, "vocab": "gun"}, "gold": {"count": 2, "proportion": 0.000613685179502915, "vocab": "gold"}, "content": {"count": 2, "proportion": 0.000613685179502915, "vocab": "content"}, "park": {"count": 2, "proportion": 0.000613685179502915, "vocab": "park"}, "spectacular": {"count": 2, "proportion": 0.000613685179502915, "vocab": "spectacular"}, "20210120022006": {"count": 2, "proportion": 0.000613685179502915, "vocab": "20210120022006"}, "covid": {"count": 2, "proportion": 0.000613685179502915, "vocab": "covid"}}
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/text_dset/cache-f6aa4a70e38b4a04.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07e5fe498e11eeb301bd8430d2f2c408c696d36bbc21b1d18f31c4cf957a8d47
3
+ size 67576
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/text_dset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d04dc98da752f00f252635eed55717c8a12dee20edb3c8347bd34e2228b16bb3
3
+ size 29512
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/text_dset/dataset_info.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builder_name": "generator",
3
+ "citation": "",
4
+ "config_name": "default",
5
+ "dataset_name": "generator",
6
+ "dataset_size": 487928,
7
+ "description": "",
8
+ "download_checksums": {},
9
+ "download_size": 0,
10
+ "features": {
11
+ "text": {
12
+ "dtype": "string",
13
+ "_type": "Value"
14
+ }
15
+ },
16
+ "homepage": "",
17
+ "license": "",
18
+ "size_in_bytes": 487928,
19
+ "splits": {
20
+ "train": {
21
+ "name": "train",
22
+ "num_bytes": 487928,
23
+ "num_examples": 100,
24
+ "dataset_name": "generator"
25
+ }
26
+ },
27
+ "version": {
28
+ "version_str": "0.0.0",
29
+ "major": 0,
30
+ "minor": 0,
31
+ "patch": 0
32
+ }
33
+ }
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/text_dset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "88b86fb59adda99e",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": "train"
13
+ }
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/text_duplicates/text_duplicates.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <table border="1"><tr><th>duplicate_fraction</th><td>0.0</td></tr><tr><th>duplicates_dict</th><td></td></tr></table>
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/text_duplicates/text_duplicates.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"duplicate_fraction": 0.0, "duplicates_dict": {}}
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/tokenized_df.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"0": {"text": "{\"url\": \"https://eppc.org/publication/declaration-of-disruption/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499654.54/warc/CC-MAIN-20230128184907-20230128214907-00327.warc.gz\", \"warc_record_offset\": 250370303, \"warc_record_length\": 20061}", "tokenized_text": ["url", "https", "eppc", "org", "publication", "declaration", "of", "disruption", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764499654", "54", "warc", "cc", "main", "20230128184907", "20230128214907", "00327", "warc", "gz", "warc_record_offset", "250370303", "warc_record_length", "20061"]}, "1": {"text": "{\"url\": \"https://lamborghinichat.com/forum/news/vw-group-allegedly-receives-offer-to-sell-lamborghini-for-9-2-billion.728/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-25/segments/1623488528979.69/warc/CC-MAIN-20210623011557-20210623041557-00312.warc.gz\", \"warc_record_offset\": 322560850, \"warc_record_length\": 17143}", "tokenized_text": ["url", "https", "lamborghinichat", "com", "forum", "news", "vw", "group", "allegedly", "receives", "offer", "to", "sell", "lamborghini", "for", "9", "2", "billion", "728", "warc_filename", "crawl", "data", "cc", "main", "2021", "25", "segments", "1623488528979", "69", "warc", "cc", "main", "20210623011557", "20210623041557", "00312", "warc", "gz", "warc_record_offset", "322560850", "warc_record_length", "17143"]}, "2": {"text": "{\"url\": \"https://slidelegend.com/the-influencing-factors-on-entrepreneurship-ti-journals_59d9b3c91723dd4b8ead833e.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-24/segments/1590347392057.6/warc/CC-MAIN-20200527013445-20200527043445-00580.warc.gz\", \"warc_record_offset\": 536988620, \"warc_record_length\": 14280}", "tokenized_text": ["url", "https", "slidelegend", "com", "the", "influencing", "factors", "on", "entrepreneurship", "ti", "journals_59d9b3c91723dd4b8ead833e", "html", "warc_filename", "crawl", "data", "cc", "main", "2020", "24", 
"segments", "1590347392057", "6", "warc", "cc", "main", "20200527013445", "20200527043445", "00580", "warc", "gz", "warc_record_offset", "536988620", "warc_record_length", "14280"]}, "3": {"text": "{\"url\": \"https://www.henryusa.com/flexforkal/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500076.87/warc/CC-MAIN-20230203221113-20230204011113-00015.warc.gz\", \"warc_record_offset\": 828569590, \"warc_record_length\": 16616}", "tokenized_text": ["url", "https", "www", "henryusa", "com", "flexforkal", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764500076", "87", "warc", "cc", "main", "20230203221113", "20230204011113", "00015", "warc", "gz", "warc_record_offset", "828569590", "warc_record_length", "16616"]}, "4": {"text": "{\"url\": \"https://www.arout.net/hotel-occupancy-drops-due-to-covid-19-certificates-in-mazatlan/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-33/segments/1659882572063.65/warc/CC-MAIN-20220814173832-20220814203832-00008.warc.gz\", \"warc_record_offset\": 598837769, \"warc_record_length\": 14575}", "tokenized_text": ["url", "https", "www", "arout", "net", "hotel", "occupancy", "drops", "due", "to", "covid", "19", "certificates", "in", "mazatlan", "warc_filename", "crawl", "data", "cc", "main", "2022", "33", "segments", "1659882572063", "65", "warc", "cc", "main", "20220814173832", "20220814203832", "00008", "warc", "gz", "warc_record_offset", "598837769", "warc_record_length", "14575"]}, "5": {"text": "{\"url\": \"https://www.glamsham.com/world/technology/apple-may-launch-ar-contact-lenses-in-10-years\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-17/segments/1618038083007.51/warc/CC-MAIN-20210415035637-20210415065637-00636.warc.gz\", \"warc_record_offset\": 870013982, \"warc_record_length\": 84010}", "tokenized_text": ["url", "https", "www", "glamsham", "com", "world", "technology", "apple", "may", "launch", "ar", "contact", "lenses", "in", "10", "years", "warc_filename", "crawl", 
"data", "cc", "main", "2021", "17", "segments", "1618038083007", "51", "warc", "cc", "main", "20210415035637", "20210415065637", "00636", "warc", "gz", "warc_record_offset", "870013982", "warc_record_length", "84010"]}, "6": {"text": "{\"url\": \"http://otlsports.com/patriots-vs-dolphins-biggest-nfl-spreads-this-decade/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-10/segments/1581875145621.28/warc/CC-MAIN-20200221233354-20200222023354-00398.warc.gz\", \"warc_record_offset\": 111531585, \"warc_record_length\": 25025}", "tokenized_text": ["url", "http", "otlsports", "com", "patriots", "vs", "dolphins", "biggest", "nfl", "spreads", "this", "decade", "warc_filename", "crawl", "data", "cc", "main", "2020", "10", "segments", "1581875145621", "28", "warc", "cc", "main", "20200221233354", "20200222023354", "00398", "warc", "gz", "warc_record_offset", "111531585", "warc_record_length", "25025"]}, "7": {"text": "{\"url\": \"https://birmingcabbie.wordpress.com/2013/01/16/silver-and-gold/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-34/segments/1596439737206.16/warc/CC-MAIN-20200807172851-20200807202851-00477.warc.gz\", \"warc_record_offset\": 233367373, \"warc_record_length\": 27864}", "tokenized_text": ["url", "https", "birmingcabbie", "wordpress", "com", "2013", "01", "16", "silver", "and", "gold", "warc_filename", "crawl", "data", "cc", "main", "2020", "34", "segments", "1596439737206", "16", "warc", "cc", "main", "20200807172851", "20200807202851", "00477", "warc", "gz", "warc_record_offset", "233367373", "warc_record_length", "27864"]}, "8": {"text": "{\"url\": \"https://www.vagabondinn.com/blog/vagabond-inn-comes-to-utah\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-21/segments/1652662531762.30/warc/CC-MAIN-20220520061824-20220520091824-00060.warc.gz\", \"warc_record_offset\": 1239312380, \"warc_record_length\": 14681}", "tokenized_text": ["url", "https", "www", "vagabondinn", "com", "blog", "vagabond", "inn", "comes", "to", "utah", "warc_filename", "crawl", 
"data", "cc", "main", "2022", "21", "segments", "1652662531762", "30", "warc", "cc", "main", "20220520061824", "20220520091824", "00060", "warc", "gz", "warc_record_offset", "1239312380", "warc_record_length", "14681"]}, "9": {"text": "{\"url\": \"https://www.weeklygripe.co.uk/overuse-of-the-word-issue-in-the-media\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-43/segments/1634323585322.63/warc/CC-MAIN-20211020152307-20211020182307-00649.warc.gz\", \"warc_record_offset\": 1288254429, \"warc_record_length\": 10413}", "tokenized_text": ["url", "https", "www", "weeklygripe", "co", "uk", "overuse", "of", "the", "word", "issue", "in", "the", "media", "warc_filename", "crawl", "data", "cc", "main", "2021", "43", "segments", "1634323585322", "63", "warc", "cc", "main", "20211020152307", "20211020182307", "00649", "warc", "gz", "warc_record_offset", "1288254429", "warc_record_length", "10413"]}, "10": {"text": "{\"url\": \"https://www.heavenlytealeaves.com/blogs/heavenly-tea-leaves-blog/exploring-the-green-teas-of-japan\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-39/segments/1631780057202.68/warc/CC-MAIN-20210921101319-20210921131319-00614.warc.gz\", \"warc_record_offset\": 837566473, \"warc_record_length\": 58147}", "tokenized_text": ["url", "https", "www", "heavenlytealeaves", "com", "blogs", "heavenly", "tea", "leaves", "blog", "exploring", "the", "green", "teas", "of", "japan", "warc_filename", "crawl", "data", "cc", "main", "2021", "39", "segments", "1631780057202", "68", "warc", "cc", "main", "20210921101319", "20210921131319", "00614", "warc", "gz", "warc_record_offset", "837566473", "warc_record_length", "58147"]}, "11": {"text": "{\"url\": \"https://tm.koreaherald.com/view.php?ud=20210504001014\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-25/segments/1623487612537.23/warc/CC-MAIN-20210614135913-20210614165913-00601.warc.gz\", \"warc_record_offset\": 532769245, \"warc_record_length\": 25017}", "tokenized_text": ["url", "https", "tm", "koreaherald", "com", 
"view", "php", "ud", "20210504001014", "warc_filename", "crawl", "data", "cc", "main", "2021", "25", "segments", "1623487612537", "23", "warc", "cc", "main", "20210614135913", "20210614165913", "00601", "warc", "gz", "warc_record_offset", "532769245", "warc_record_length", "25017"]}, "12": {"text": "{\"url\": \"https://bricks.stackexchange.com/questions/14655/how-do-you-display-multiple-items-at-once-on-an-ev3-screen/15044\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-33/segments/1659882573630.12/warc/CC-MAIN-20220819070211-20220819100211-00319.warc.gz\", \"warc_record_offset\": 175500523, \"warc_record_length\": 66290}", "tokenized_text": ["url", "https", "bricks", "stackexchange", "com", "questions", "14655", "how", "do", "you", "display", "multiple", "items", "at", "once", "on", "an", "ev3", "screen", "15044", "warc_filename", "crawl", "data", "cc", "main", "2022", "33", "segments", "1659882573630", "12", "warc", "cc", "main", "20220819070211", "20220819100211", "00319", "warc", "gz", "warc_record_offset", "175500523", "warc_record_length", "66290"]}, "13": {"text": "{\"url\": \"https://www.toptipz.com.ng/armed-robbers-police-uniforms-enugu-caught-gun-battle-photos/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-10/segments/1581875148375.36/warc/CC-MAIN-20200229022458-20200229052458-00295.warc.gz\", \"warc_record_offset\": 904154101, \"warc_record_length\": 11479}", "tokenized_text": ["url", "https", "www", "toptipz", "com", "ng", "armed", "robbers", "police", "uniforms", "enugu", "caught", "gun", "battle", "photos", "warc_filename", "crawl", "data", "cc", "main", "2020", "10", "segments", "1581875148375", "36", "warc", "cc", "main", "20200229022458", "20200229052458", "00295", "warc", "gz", "warc_record_offset", "904154101", "warc_record_length", "11479"]}, "14": {"text": "{\"url\": \"https://gamecocksonline.com/news/2017/08/22/breaking-down-the-offensive-line/\", \"warc_filename\": 
\"crawl-data/CC-MAIN-2022-40/segments/1664030337322.29/warc/CC-MAIN-20221002115028-20221002145028-00747.warc.gz\", \"warc_record_offset\": 303777204, \"warc_record_length\": 19373}", "tokenized_text": ["url", "https", "gamecocksonline", "com", "news", "2017", "08", "22", "breaking", "down", "the", "offensive", "line", "warc_filename", "crawl", "data", "cc", "main", "2022", "40", "segments", "1664030337322", "29", "warc", "cc", "main", "20221002115028", "20221002145028", "00747", "warc", "gz", "warc_record_offset", "303777204", "warc_record_length", "19373"]}, "15": {"text": "{\"url\": \"https://paulinepark.com/2014/06/09/gauguin-exoticization-primitivism-paganism/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499697.75/warc/CC-MAIN-20230129012420-20230129042420-00524.warc.gz\", \"warc_record_offset\": 473272628, \"warc_record_length\": 17321}", "tokenized_text": ["url", "https", "paulinepark", "com", "2014", "06", "09", "gauguin", "exoticization", "primitivism", "paganism", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764499697", "75", "warc", "cc", "main", "20230129012420", "20230129042420", "00524", "warc", "gz", "warc_record_offset", "473272628", "warc_record_length", "17321"]}, "16": {"text": "{\"url\": \"https://talesofthetravelbug.wordpress.com/2016/02/26/abel-tasman-national-park/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-34/segments/1596439737319.74/warc/CC-MAIN-20200808080642-20200808110642-00060.warc.gz\", \"warc_record_offset\": 516800798, \"warc_record_length\": 29736}", "tokenized_text": ["url", "https", "talesofthetravelbug", "wordpress", "com", "2016", "02", "26", "abel", "tasman", "national", "park", "warc_filename", "crawl", "data", "cc", "main", "2020", "34", "segments", "1596439737319", "74", "warc", "cc", "main", "20200808080642", "20200808110642", "00060", "warc", "gz", "warc_record_offset", "516800798", "warc_record_length", "29736"]}, "17": {"text": "{\"url\": 
\"https://minnesota.cbslocal.com/2018/08/20/new-brighton-house-fire-dogs-rescued/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-04/segments/1610703531702.36/warc/CC-MAIN-20210123001629-20210123031629-00630.warc.gz\", \"warc_record_offset\": 455460747, \"warc_record_length\": 21947}", "tokenized_text": ["url", "https", "minnesota", "cbslocal", "com", "2018", "08", "20", "new", "brighton", "house", "fire", "dogs", "rescued", "warc_filename", "crawl", "data", "cc", "main", "2021", "04", "segments", "1610703531702", "36", "warc", "cc", "main", "20210123001629", "20210123031629", "00630", "warc", "gz", "warc_record_offset", "455460747", "warc_record_length", "21947"]}, "18": {"text": "{\"url\": \"https://simonegallina.blogspot.com/2012/03/famous-painter-of-prehistoric-murals.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-16/segments/1585370500331.13/warc/CC-MAIN-20200331053639-20200331083639-00229.warc.gz\", \"warc_record_offset\": 640302554, \"warc_record_length\": 19781}", "tokenized_text": ["url", "https", "simonegallina", "blogspot", "com", "2012", "03", "famous", "painter", "of", "prehistoric", "murals", "html", "warc_filename", "crawl", "data", "cc", "main", "2020", "16", "segments", "1585370500331", "13", "warc", "cc", "main", "20200331053639", "20200331083639", "00229", "warc", "gz", "warc_record_offset", "640302554", "warc_record_length", "19781"]}, "19": {"text": "{\"url\": \"https://www.podparadise.com/Podcast/1533560948/Listen/1607614191/0\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-31/segments/1627046157039.99/warc/CC-MAIN-20210805193327-20210805223327-00602.warc.gz\", \"warc_record_offset\": 1002652790, \"warc_record_length\": 7640}", "tokenized_text": ["url", "https", "www", "podparadise", "com", "podcast", "1533560948", "listen", "1607614191", "0", "warc_filename", "crawl", "data", "cc", "main", "2021", "31", "segments", "1627046157039", "99", "warc", "cc", "main", "20210805193327", "20210805223327", "00602", "warc", "gz", 
"warc_record_offset", "1002652790", "warc_record_length", "7640"]}, "20": {"text": "{\"url\": \"https://vidmid.com/news/sir-ian-mckellen-martin-freeman-and-the-lord-of-the-rings-cast-kickstart-fundraising-campaign-to-buy-author-j-r-r-tolkien-s-oxford-home?uid=226886\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-04/segments/1610703538082.57/warc/CC-MAIN-20210123125715-20210123155715-00108.warc.gz\", \"warc_record_offset\": 628149199, \"warc_record_length\": 9581}", "tokenized_text": ["url", "https", "vidmid", "com", "news", "sir", "ian", "mckellen", "martin", "freeman", "and", "the", "lord", "of", "the", "rings", "cast", "kickstart", "fundraising", "campaign", "to", "buy", "author", "j", "r", "r", "tolkien", "s", "oxford", "home", "uid", "226886", "warc_filename", "crawl", "data", "cc", "main", "2021", "04", "segments", "1610703538082", "57", "warc", "cc", "main", "20210123125715", "20210123155715", "00108", "warc", "gz", "warc_record_offset", "628149199", "warc_record_length", "9581"]}, "21": {"text": "{\"url\": \"https://www.etoren.com/products/fujifilm-x-t200-body-champagne-gold-kit-box\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-33/segments/1659882571090.80/warc/CC-MAIN-20220809215803-20220810005803-00211.warc.gz\", \"warc_record_offset\": 683638756, \"warc_record_length\": 40224}", "tokenized_text": ["url", "https", "www", "etoren", "com", "products", "fujifilm", "x", "t200", "body", "champagne", "gold", "kit", "box", "warc_filename", "crawl", "data", "cc", "main", "2022", "33", "segments", "1659882571090", "80", "warc", "cc", "main", "20220809215803", "20220810005803", "00211", "warc", "gz", "warc_record_offset", "683638756", "warc_record_length", "40224"]}, "22": {"text": "{\"url\": \"https://ionigeria.com/james-webb-the-most-powerful-space-telescope-ever-built-blast-into-orbit/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499845.10/warc/CC-MAIN-20230131055533-20230131085533-00613.warc.gz\", \"warc_record_offset\": 348002670, 
\"warc_record_length\": 28496}", "tokenized_text": ["url", "https", "ionigeria", "com", "james", "webb", "the", "most", "powerful", "space", "telescope", "ever", "built", "blast", "into", "orbit", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764499845", "10", "warc", "cc", "main", "20230131055533", "20230131085533", "00613", "warc", "gz", "warc_record_offset", "348002670", "warc_record_length", "28496"]}, "23": {"text": "{\"url\": \"https://www.marshfieldnewsherald.com/story/news/2019/05/02/rothschild-police-wausau-man-charged-with-owi-said-his-gum-blame-smell-alcohol/3645355002/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-05/segments/1579250593994.14/warc/CC-MAIN-20200118221909-20200119005909-00365.warc.gz\", \"warc_record_offset\": 972941129, \"warc_record_length\": 43188}", "tokenized_text": ["url", "https", "www", "marshfieldnewsherald", "com", "story", "news", "2019", "05", "02", "rothschild", "police", "wausau", "man", "charged", "with", "owi", "said", "his", "gum", "blame", "smell", "alcohol", "3645355002", "warc_filename", "crawl", "data", "cc", "main", "2020", "05", "segments", "1579250593994", "14", "warc", "cc", "main", "20200118221909", "20200119005909", "00365", "warc", "gz", "warc_record_offset", "972941129", "warc_record_length", "43188"]}, "24": {"text": "{\"url\": \"https://brindisa.com/blogs/news/the-food-geek-guide-to-iberico-ham\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764494852.95/warc/CC-MAIN-20230127001911-20230127031911-00821.warc.gz\", \"warc_record_offset\": 176182646, \"warc_record_length\": 58668}", "tokenized_text": ["url", "https", "brindisa", "com", "blogs", "news", "the", "food", "geek", "guide", "to", "iberico", "ham", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764494852", "95", "warc", "cc", "main", "20230127001911", "20230127031911", "00821", "warc", "gz", "warc_record_offset", "176182646", "warc_record_length", "58668"]}, "25": 
{"text": "{\"url\": \"https://healthyceleb.com/scottie-pippen-height-weight-body-statistics/54041/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-16/segments/1585371830894.88/warc/CC-MAIN-20200409055849-20200409090349-00200.warc.gz\", \"warc_record_offset\": 476834551, \"warc_record_length\": 28617}", "tokenized_text": ["url", "https", "healthyceleb", "com", "scottie", "pippen", "height", "weight", "body", "statistics", "54041", "warc_filename", "crawl", "data", "cc", "main", "2020", "16", "segments", "1585371830894", "88", "warc", "cc", "main", "20200409055849", "20200409090349", "00200", "warc", "gz", "warc_record_offset", "476834551", "warc_record_length", "28617"]}, "26": {"text": "{\"url\": \"https://worldnews.su/world/kabul-terror-attack-multiple-explosions-as-gunfire-rings-out-with-deaths-feared-world-news.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-27/segments/1656103626162.35/warc/CC-MAIN-20220629084939-20220629114939-00694.warc.gz\", \"warc_record_offset\": 683956680, \"warc_record_length\": 14884}", "tokenized_text": ["url", "https", "worldnews", "su", "world", "kabul", "terror", "attack", "multiple", "explosions", "as", "gunfire", "rings", "out", "with", "deaths", "feared", "world", "news", "html", "warc_filename", "crawl", "data", "cc", "main", "2022", "27", "segments", "1656103626162", "35", "warc", "cc", "main", "20220629084939", "20220629114939", "00694", "warc", "gz", "warc_record_offset", "683956680", "warc_record_length", "14884"]}, "27": {"text": "{\"url\": \"https://observer.com/2022/05/what-will-elon-musk-do-with-twitter-here-is-a-6-point-guide/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500758.20/warc/CC-MAIN-20230208092053-20230208122053-00639.warc.gz\", \"warc_record_offset\": 450138738, \"warc_record_length\": 24833}", "tokenized_text": ["url", "https", "observer", "com", "2022", "05", "what", "will", "elon", "musk", "do", "with", "twitter", "here", "is", "a", "6", "point", "guide", "warc_filename", 
"crawl", "data", "cc", "main", "2023", "06", "segments", "1674764500758", "20", "warc", "cc", "main", "20230208092053", "20230208122053", "00639", "warc", "gz", "warc_record_offset", "450138738", "warc_record_length", "24833"]}, "28": {"text": "{\"url\": \"https://www.zwillgen.com/law-enforcement/court-decision-rules-geofence-warrant-violates-fourth-amendment/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-49/segments/1669446710789.95/warc/CC-MAIN-20221201021257-20221201051257-00652.warc.gz\", \"warc_record_offset\": 1124917887, \"warc_record_length\": 23522}", "tokenized_text": ["url", "https", "www", "zwillgen", "com", "law", "enforcement", "court", "decision", "rules", "geofence", "warrant", "violates", "fourth", "amendment", "warc_filename", "crawl", "data", "cc", "main", "2022", "49", "segments", "1669446710789", "95", "warc", "cc", "main", "20221201021257", "20221201051257", "00652", "warc", "gz", "warc_record_offset", "1124917887", "warc_record_length", "23522"]}, "29": {"text": "{\"url\": \"https://www.idropnews.com/news/beats-releases-ultra-affordable-new-50-beats-flex-wireless-earbuds/145050/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764501066.53/warc/CC-MAIN-20230209014102-20230209044102-00740.warc.gz\", \"warc_record_offset\": 857199535, \"warc_record_length\": 30176}", "tokenized_text": ["url", "https", "www", "idropnews", "com", "news", "beats", "releases", "ultra", "affordable", "new", "50", "beats", "flex", "wireless", "earbuds", "145050", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764501066", "53", "warc", "cc", "main", "20230209014102", "20230209044102", "00740", "warc", "gz", "warc_record_offset", "857199535", "warc_record_length", "30176"]}, "30": {"text": "{\"url\": \"https://callbag.co/epub/the-love-song-of-j-alfred-prufrock-and-other-poems/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-27/segments/1656104496688.78/warc/CC-MAIN-20220704202455-20220704232455-00300.warc.gz\", 
\"warc_record_offset\": 199444381, \"warc_record_length\": 17548}", "tokenized_text": ["url", "https", "callbag", "co", "epub", "the", "love", "song", "of", "j", "alfred", "prufrock", "and", "other", "poems", "warc_filename", "crawl", "data", "cc", "main", "2022", "27", "segments", "1656104496688", "78", "warc", "cc", "main", "20220704202455", "20220704232455", "00300", "warc", "gz", "warc_record_offset", "199444381", "warc_record_length", "17548"]}, "31": {"text": "{\"url\": \"https://kitchen-repair.info/daftar-film-mae-pong-38/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-31/segments/1627046154175.76/warc/CC-MAIN-20210801092716-20210801122716-00658.warc.gz\", \"warc_record_offset\": 358919877, \"warc_record_length\": 11838}", "tokenized_text": ["url", "https", "kitchen", "repair", "info", "daftar", "film", "mae", "pong", "38", "warc_filename", "crawl", "data", "cc", "main", "2021", "31", "segments", "1627046154175", "76", "warc", "cc", "main", "20210801092716", "20210801122716", "00658", "warc", "gz", "warc_record_offset", "358919877", "warc_record_length", "11838"]}, "32": {"text": "{\"url\": \"https://www.austinchronicle.com/news/2008-06-27/640116/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-50/segments/1606141194982.45/warc/CC-MAIN-20201128011115-20201128041115-00175.warc.gz\", \"warc_record_offset\": 591253388, \"warc_record_length\": 17495}", "tokenized_text": ["url", "https", "www", "austinchronicle", "com", "news", "2008", "06", "27", "640116", "warc_filename", "crawl", "data", "cc", "main", "2020", "50", "segments", "1606141194982", "45", "warc", "cc", "main", "20201128011115", "20201128041115", "00175", "warc", "gz", "warc_record_offset", "591253388", "warc_record_length", "17495"]}, "33": {"text": "{\"url\": \"https://www.fame10.com/trending-news/will-smith-and-jada-pinkett-smith-slam-divorce-rumors/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499919.70/warc/CC-MAIN-20230201081311-20230201111311-00455.warc.gz\", 
\"warc_record_offset\": 766206361, \"warc_record_length\": 32672}", "tokenized_text": ["url", "https", "www", "fame10", "com", "trending", "news", "will", "smith", "and", "jada", "pinkett", "smith", "slam", "divorce", "rumors", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764499919", "70", "warc", "cc", "main", "20230201081311", "20230201111311", "00455", "warc", "gz", "warc_record_offset", "766206361", "warc_record_length", "32672"]}, "34": {"text": "{\"url\": \"https://ednews.net/en/news/politics/440902-iran-embassy-ill-wishers-trying-to-disrupt-relations-between-tehran-baku\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-50/segments/1606141750841.83/warc/CC-MAIN-20201205211729-20201206001729-00355.warc.gz\", \"warc_record_offset\": 263492130, \"warc_record_length\": 22120}", "tokenized_text": ["url", "https", "ednews", "net", "en", "news", "politics", "440902", "iran", "embassy", "ill", "wishers", "trying", "to", "disrupt", "relations", "between", "tehran", "baku", "warc_filename", "crawl", "data", "cc", "main", "2020", "50", "segments", "1606141750841", "83", "warc", "cc", "main", "20201205211729", "20201206001729", "00355", "warc", "gz", "warc_record_offset", "263492130", "warc_record_length", "22120"]}, "35": {"text": "{\"url\": \"https://www.hopefornigeriaonline.com/category/africa/page/17/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-17/segments/1618039388763.75/warc/CC-MAIN-20210420091336-20210420121336-00107.warc.gz\", \"warc_record_offset\": 930101659, \"warc_record_length\": 22399}", "tokenized_text": ["url", "https", "www", "hopefornigeriaonline", "com", "category", "africa", "page", "17", "warc_filename", "crawl", "data", "cc", "main", "2021", "17", "segments", "1618039388763", "75", "warc", "cc", "main", "20210420091336", "20210420121336", "00107", "warc", "gz", "warc_record_offset", "930101659", "warc_record_length", "22399"]}, "36": {"text": "{\"url\": 
\"https://cmeindia.in/covid-19-3rd-wave-experience-in-india-a-survey-of-5971-adults/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500028.12/warc/CC-MAIN-20230202133541-20230202163541-00248.warc.gz\", \"warc_record_offset\": 178693725, \"warc_record_length\": 42926}", "tokenized_text": ["url", "https", "cmeindia", "in", "covid", "19", "3rd", "wave", "experience", "in", "india", "a", "survey", "of", "5971", "adults", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764500028", "12", "warc", "cc", "main", "20230202133541", "20230202163541", "00248", "warc", "gz", "warc_record_offset", "178693725", "warc_record_length", "42926"]}, "37": {"text": "{\"url\": \"https://www.sitra.fi/en/blogs/moving-boldly-towards-participatory-foresight/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-05/segments/1579250607407.48/warc/CC-MAIN-20200122191620-20200122220620-00393.warc.gz\", \"warc_record_offset\": 1088907449, \"warc_record_length\": 11772}", "tokenized_text": ["url", "https", "www", "sitra", "fi", "en", "blogs", "moving", "boldly", "towards", "participatory", "foresight", "warc_filename", "crawl", "data", "cc", "main", "2020", "05", "segments", "1579250607407", "48", "warc", "cc", "main", "20200122191620", "20200122220620", "00393", "warc", "gz", "warc_record_offset", "1088907449", "warc_record_length", "11772"]}, "38": {"text": "{\"url\": \"https://suesspiciousminds.com/2013/05/14/four-minute-warning-or-last-night-the-plans-for-a-future-war-was-all-i-saw-on-channel-four/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-43/segments/1634323588398.42/warc/CC-MAIN-20211028162638-20211028192638-00233.warc.gz\", \"warc_record_offset\": 664847335, \"warc_record_length\": 31698}", "tokenized_text": ["url", "https", "suesspiciousminds", "com", "2013", "05", "14", "four", "minute", "warning", "or", "last", "night", "the", "plans", "for", "a", "future", "war", "was", "all", "i", "saw", "on", "channel", "four", "warc_filename", 
"crawl", "data", "cc", "main", "2021", "43", "segments", "1634323588398", "42", "warc", "cc", "main", "20211028162638", "20211028192638", "00233", "warc", "gz", "warc_record_offset", "664847335", "warc_record_length", "31698"]}, "39": {"text": "{\"url\": \"https://www.exquisitecoasts.com/spectacular-coastal-drives.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500339.37/warc/CC-MAIN-20230206113934-20230206143934-00685.warc.gz\", \"warc_record_offset\": 782007633, \"warc_record_length\": 19267}", "tokenized_text": ["url", "https", "www", "exquisitecoasts", "com", "spectacular", "coastal", "drives", "html", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764500339", "37", "warc", "cc", "main", "20230206113934", "20230206143934", "00685", "warc", "gz", "warc_record_offset", "782007633", "warc_record_length", "19267"]}, "40": {"text": "{\"url\": \"https://petebowes.com/2011/01/02/china-the-first-night/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-40/segments/1664030337625.5/warc/CC-MAIN-20221005105356-20221005135356-00284.warc.gz\", \"warc_record_offset\": 486450658, \"warc_record_length\": 27715}", "tokenized_text": ["url", "https", "petebowes", "com", "2011", "01", "02", "china", "the", "first", "night", "warc_filename", "crawl", "data", "cc", "main", "2022", "40", "segments", "1664030337625", "5", "warc", "cc", "main", "20221005105356", "20221005135356", "00284", "warc", "gz", "warc_record_offset", "486450658", "warc_record_length", "27715"]}, "41": {"text": "{\"url\": \"https://bluemonsterprep.com/blogs/news/hawaii-races-to-secure-power-plant-threatened-by-lava-flows\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499713.50/warc/CC-MAIN-20230129112153-20230129142153-00439.warc.gz\", \"warc_record_offset\": 159639259, \"warc_record_length\": 72974}", "tokenized_text": ["url", "https", "bluemonsterprep", "com", "blogs", "news", "hawaii", "races", "to", "secure", "power", "plant", 
"threatened", "by", "lava", "flows", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764499713", "50", "warc", "cc", "main", "20230129112153", "20230129142153", "00439", "warc", "gz", "warc_record_offset", "159639259", "warc_record_length", "72974"]}, "42": {"text": "{\"url\": \"http://archiveislam.com/the-muslim-snow-patrol-cleaning-up-the-city.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-05/segments/1579251681412.74/warc/CC-MAIN-20200125191854-20200125221854-00304.warc.gz\", \"warc_record_offset\": 13209660, \"warc_record_length\": 11206}", "tokenized_text": ["url", "http", "archiveislam", "com", "the", "muslim", "snow", "patrol", "cleaning", "up", "the", "city", "html", "warc_filename", "crawl", "data", "cc", "main", "2020", "05", "segments", "1579251681412", "74", "warc", "cc", "main", "20200125191854", "20200125221854", "00304", "warc", "gz", "warc_record_offset", "13209660", "warc_record_length", "11206"]}, "43": {"text": "{\"url\": \"https://thegatorseye.com/3188/news/muslims-in-detainment-a-look-into-chinas-camps/?print=true\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-33/segments/1659882573145.32/warc/CC-MAIN-20220818003501-20220818033501-00506.warc.gz\", \"warc_record_offset\": 510157631, \"warc_record_length\": 8481}", "tokenized_text": ["url", "https", "thegatorseye", "com", "3188", "news", "muslims", "in", "detainment", "a", "look", "into", "chinas", "camps", "print", "true", "warc_filename", "crawl", "data", "cc", "main", "2022", "33", "segments", "1659882573145", "32", "warc", "cc", "main", "20220818003501", "20220818033501", "00506", "warc", "gz", "warc_record_offset", "510157631", "warc_record_length", "8481"]}, "44": {"text": "{\"url\": \"https://jewishnews.timesofisrael.com/video-project-twinning-shoah-survivor-with-celebs-wins-two-awards/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-31/segments/1627046155458.35/warc/CC-MAIN-20210805063730-20210805093730-00308.warc.gz\", \"warc_record_offset\": 
347762646, \"warc_record_length\": 14792}", "tokenized_text": ["url", "https", "jewishnews", "timesofisrael", "com", "video", "project", "twinning", "shoah", "survivor", "with", "celebs", "wins", "two", "awards", "warc_filename", "crawl", "data", "cc", "main", "2021", "31", "segments", "1627046155458", "35", "warc", "cc", "main", "20210805063730", "20210805093730", "00308", "warc", "gz", "warc_record_offset", "347762646", "warc_record_length", "14792"]}, "45": {"text": "{\"url\": \"https://www.rowadventures.com/specialist/malcolm-reed\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-33/segments/1659882571056.58/warc/CC-MAIN-20220809155137-20220809185137-00754.warc.gz\", \"warc_record_offset\": 870524278, \"warc_record_length\": 12466}", "tokenized_text": ["url", "https", "www", "rowadventures", "com", "specialist", "malcolm", "reed", "warc_filename", "crawl", "data", "cc", "main", "2022", "33", "segments", "1659882571056", "58", "warc", "cc", "main", "20220809155137", "20220809185137", "00754", "warc", "gz", "warc_record_offset", "870524278", "warc_record_length", "12466"]}, "46": {"text": "{\"url\": \"https://theshelfofunreadbooks.wordpress.com/2019/02/03/reviewthe-library-book-by-susan-orlean/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-43/segments/1634323587606.8/warc/CC-MAIN-20211024204628-20211024234628-00141.warc.gz\", \"warc_record_offset\": 701475464, \"warc_record_length\": 55792}", "tokenized_text": ["url", "https", "theshelfofunreadbooks", "wordpress", "com", "2019", "02", "03", "reviewthe", "library", "book", "by", "susan", "orlean", "warc_filename", "crawl", "data", "cc", "main", "2021", "43", "segments", "1634323587606", "8", "warc", "cc", "main", "20211024204628", "20211024234628", "00141", "warc", "gz", "warc_record_offset", "701475464", "warc_record_length", "55792"]}, "47": {"text": "{\"url\": \"https://energy.agwired.com/2010/10/07/wasted-food-wasted-energy/\", \"warc_filename\": 
\"crawl-data/CC-MAIN-2023-06/segments/1674764495001.99/warc/CC-MAIN-20230127164242-20230127194242-00001.warc.gz\", \"warc_record_offset\": 241286356, \"warc_record_length\": 20698}", "tokenized_text": ["url", "https", "energy", "agwired", "com", "2010", "10", "07", "wasted", "food", "wasted", "energy", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764495001", "99", "warc", "cc", "main", "20230127164242", "20230127194242", "00001", "warc", "gz", "warc_record_offset", "241286356", "warc_record_length", "20698"]}, "48": {"text": "{\"url\": \"https://www.uscleiden.com/news/2021/05/14/beat-the-microbead/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-21/segments/1652662521041.0/warc/CC-MAIN-20220518021247-20220518051247-00703.warc.gz\", \"warc_record_offset\": 1269439293, \"warc_record_length\": 6324}", "tokenized_text": ["url", "https", "www", "uscleiden", "com", "news", "2021", "05", "14", "beat", "the", "microbead", "warc_filename", "crawl", "data", "cc", "main", "2022", "21", "segments", "1652662521041", "0", "warc", "cc", "main", "20220518021247", "20220518051247", "00703", "warc", "gz", "warc_record_offset", "1269439293", "warc_record_length", "6324"]}, "49": {"text": "{\"url\": \"https://bedandbicycle.com/is-this-cool-idea-with-butterfly-doorways-the-subsequent-mazda-miata.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-49/segments/1669446710764.12/warc/CC-MAIN-20221130124353-20221130154353-00274.warc.gz\", \"warc_record_offset\": 161515301, \"warc_record_length\": 13553}", "tokenized_text": ["url", "https", "bedandbicycle", "com", "is", "this", "cool", "idea", "with", "butterfly", "doorways", "the", "subsequent", "mazda", "miata", "html", "warc_filename", "crawl", "data", "cc", "main", "2022", "49", "segments", "1669446710764", "12", "warc", "cc", "main", "20221130124353", "20221130154353", "00274", "warc", "gz", "warc_record_offset", "161515301", "warc_record_length", "13553"]}, "50": {"text": "{\"url\": 
\"https://www.greatfallstribune.com/story/opinion/tribune-editorials/2016/06/10/montana-gets-manufacturing-tax-fairness/85722960/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-16/segments/1585370506870.41/warc/CC-MAIN-20200402080824-20200402110824-00330.warc.gz\", \"warc_record_offset\": 935440331, \"warc_record_length\": 43051}", "tokenized_text": ["url", "https", "www", "greatfallstribune", "com", "story", "opinion", "tribune", "editorials", "2016", "06", "10", "montana", "gets", "manufacturing", "tax", "fairness", "85722960", "warc_filename", "crawl", "data", "cc", "main", "2020", "16", "segments", "1585370506870", "41", "warc", "cc", "main", "20200402080824", "20200402110824", "00330", "warc", "gz", "warc_record_offset", "935440331", "warc_record_length", "43051"]}, "51": {"text": "{\"url\": \"https://thenextweb.com/media/2011/11/08/last-fm-reached-the-60-billion-scrobble-landmark-with-lady-gaga/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-50/segments/1606141176049.8/warc/CC-MAIN-20201124082900-20201124112900-00250.warc.gz\", \"warc_record_offset\": 525905978, \"warc_record_length\": 21310}", "tokenized_text": ["url", "https", "thenextweb", "com", "media", "2011", "11", "08", "last", "fm", "reached", "the", "60", "billion", "scrobble", "landmark", "with", "lady", "gaga", "warc_filename", "crawl", "data", "cc", "main", "2020", "50", "segments", "1606141176049", "8", "warc", "cc", "main", "20201124082900", "20201124112900", "00250", "warc", "gz", "warc_record_offset", "525905978", "warc_record_length", "21310"]}, "52": {"text": "{\"url\": \"https://www.fingerlakes1.com/2021/03/07/sabres-lose-seventh-straight-with-5-2-loss-to-islanders/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-49/segments/1669446710771.39/warc/CC-MAIN-20221130192708-20221130222708-00543.warc.gz\", \"warc_record_offset\": 792006189, \"warc_record_length\": 23266}", "tokenized_text": ["url", "https", "www", "fingerlakes1", "com", "2021", "03", "07", "sabres", "lose", "seventh", 
"straight", "with", "5", "2", "loss", "to", "islanders", "warc_filename", "crawl", "data", "cc", "main", "2022", "49", "segments", "1669446710771", "39", "warc", "cc", "main", "20221130192708", "20221130222708", "00543", "warc", "gz", "warc_record_offset", "792006189", "warc_record_length", "23266"]}, "53": {"text": "{\"url\": \"https://movizark.com/2018/10/29/el-tonto-ray-liotta-adrien-brody-travis-fimmel-ken-jeong-join-charlie-days-film/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-21/segments/1652662539101.40/warc/CC-MAIN-20220521112022-20220521142022-00165.warc.gz\", \"warc_record_offset\": 444833137, \"warc_record_length\": 39518}", "tokenized_text": ["url", "https", "movizark", "com", "2018", "10", "29", "el", "tonto", "ray", "liotta", "adrien", "brody", "travis", "fimmel", "ken", "jeong", "join", "charlie", "days", "film", "warc_filename", "crawl", "data", "cc", "main", "2022", "21", "segments", "1652662539101", "40", "warc", "cc", "main", "20220521112022", "20220521142022", "00165", "warc", "gz", "warc_record_offset", "444833137", "warc_record_length", "39518"]}, "54": {"text": "{\"url\": \"https://paradigmshyft.com/2022/02/27/totalitarianism-authoritarianism-and-fascism-what-is-the-difference/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499524.28/warc/CC-MAIN-20230128054815-20230128084815-00673.warc.gz\", \"warc_record_offset\": 442133024, \"warc_record_length\": 33777}", "tokenized_text": ["url", "https", "paradigmshyft", "com", "2022", "02", "27", "totalitarianism", "authoritarianism", "and", "fascism", "what", "is", "the", "difference", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764499524", "28", "warc", "cc", "main", "20230128054815", "20230128084815", "00673", "warc", "gz", "warc_record_offset", "442133024", "warc_record_length", "33777"]}, "55": {"text": "{\"url\": \"https://www.rte.ie/news/2019/1201/1096265-smith-isis-home-turkey/\", \"warc_filename\": 
\"crawl-data/CC-MAIN-2021-49/segments/1637964363292.82/warc/CC-MAIN-20211206103243-20211206133243-00587.warc.gz\", \"warc_record_offset\": 1036937141, \"warc_record_length\": 22407}", "tokenized_text": ["url", "https", "www", "rte", "ie", "news", "2019", "1201", "1096265", "smith", "isis", "home", "turkey", "warc_filename", "crawl", "data", "cc", "main", "2021", "49", "segments", "1637964363292", "82", "warc", "cc", "main", "20211206103243", "20211206133243", "00587", "warc", "gz", "warc_record_offset", "1036937141", "warc_record_length", "22407"]}, "56": {"text": "{\"url\": \"http://www.cmt.com/news/1490421/bluegrass-legend-charlie-waller-dead-at-69/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-05/segments/1579250595787.7/warc/CC-MAIN-20200119234426-20200120022426-00503.warc.gz\", \"warc_record_offset\": 208318103, \"warc_record_length\": 11530}", "tokenized_text": ["url", "http", "www", "cmt", "com", "news", "1490421", "bluegrass", "legend", "charlie", "waller", "dead", "at", "69", "warc_filename", "crawl", "data", "cc", "main", "2020", "05", "segments", "1579250595787", "7", "warc", "cc", "main", "20200119234426", "20200120022426", "00503", "warc", "gz", "warc_record_offset", "208318103", "warc_record_length", "11530"]}, "57": {"text": "{\"url\": \"https://caribbeantales-worldwide.com/a-hand-full-of-anticipation-for-the-caribbeantales-2011-film-festival/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499890.39/warc/CC-MAIN-20230131190543-20230131220543-00762.warc.gz\", \"warc_record_offset\": 180032374, \"warc_record_length\": 29529}", "tokenized_text": ["url", "https", "caribbeantales", "worldwide", "com", "a", "hand", "full", "of", "anticipation", "for", "the", "caribbeantales", "2011", "film", "festival", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764499890", "39", "warc", "cc", "main", "20230131190543", "20230131220543", "00762", "warc", "gz", "warc_record_offset", "180032374", "warc_record_length", 
"29529"]}, "58": {"text": "{\"url\": \"https://www.knollwood.ca/blog/what-do-deacons-do\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-25/segments/1623488567696.99/warc/CC-MAIN-20210625023840-20210625053840-00248.warc.gz\", \"warc_record_offset\": 748368127, \"warc_record_length\": 14745}", "tokenized_text": ["url", "https", "www", "knollwood", "ca", "blog", "what", "do", "deacons", "do", "warc_filename", "crawl", "data", "cc", "main", "2021", "25", "segments", "1623488567696", "99", "warc", "cc", "main", "20210625023840", "20210625053840", "00248", "warc", "gz", "warc_record_offset", "748368127", "warc_record_length", "14745"]}, "59": {"text": "{\"url\": \"https://silverchips.mbhs.edu/content/nba-award-predictions-31146/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-49/segments/1669446710972.37/warc/CC-MAIN-20221204104311-20221204134311-00476.warc.gz\", \"warc_record_offset\": 559787302, \"warc_record_length\": 11801}", "tokenized_text": ["url", "https", "silverchips", "mbhs", "edu", "content", "nba", "award", "predictions", "31146", "warc_filename", "crawl", "data", "cc", "main", "2022", "49", "segments", "1669446710972", "37", "warc", "cc", "main", "20221204104311", "20221204134311", "00476", "warc", "gz", "warc_record_offset", "559787302", "warc_record_length", "11801"]}, "60": {"text": "{\"url\": \"https://realdealtheatre.webs.com/press.htm\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-49/segments/1669446711074.68/warc/CC-MAIN-20221206060908-20221206090908-00826.warc.gz\", \"warc_record_offset\": 503711292, \"warc_record_length\": 4512}", "tokenized_text": ["url", "https", "realdealtheatre", "webs", "com", "press", "htm", "warc_filename", "crawl", "data", "cc", "main", "2022", "49", "segments", "1669446711074", "68", "warc", "cc", "main", "20221206060908", "20221206090908", "00826", "warc", "gz", "warc_record_offset", "503711292", "warc_record_length", "4512"]}, "61": {"text": "{\"url\": 
\"https://www.parkablogs.com/content/book-review-daisuke-moriyama-art-works-chronicle\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-17/segments/1618038072366.31/warc/CC-MAIN-20210413122252-20210413152252-00153.warc.gz\", \"warc_record_offset\": 1044766559, \"warc_record_length\": 11425}", "tokenized_text": ["url", "https", "www", "parkablogs", "com", "content", "book", "review", "daisuke", "moriyama", "art", "works", "chronicle", "warc_filename", "crawl", "data", "cc", "main", "2021", "17", "segments", "1618038072366", "31", "warc", "cc", "main", "20210413122252", "20210413152252", "00153", "warc", "gz", "warc_record_offset", "1044766559", "warc_record_length", "11425"]}, "62": {"text": "{\"url\": \"https://www.ungeek.ph/2016/10/inspiring-watch-dwayne-the-rock-johnson-react-on-his-very-first-wwe-match/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500094.26/warc/CC-MAIN-20230204044030-20230204074030-00871.warc.gz\", \"warc_record_offset\": 1068673394, \"warc_record_length\": 27741}", "tokenized_text": ["url", "https", "www", "ungeek", "ph", "2016", "10", "inspiring", "watch", "dwayne", "the", "rock", "johnson", "react", "on", "his", "very", "first", "wwe", "match", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764500094", "26", "warc", "cc", "main", "20230204044030", "20230204074030", "00871", "warc", "gz", "warc_record_offset", "1068673394", "warc_record_length", "27741"]}, "63": {"text": "{\"url\": \"https://holaamericanews.com/to-my-bros-me-too-the-womens-march-and-my-birthday/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-27/segments/1656103334753.21/warc/CC-MAIN-20220627134424-20220627164424-00450.warc.gz\", \"warc_record_offset\": 353753500, \"warc_record_length\": 26850}", "tokenized_text": ["url", "https", "holaamericanews", "com", "to", "my", "bros", "me", "too", "the", "womens", "march", "and", "my", "birthday", "warc_filename", "crawl", "data", "cc", "main", "2022", "27", "segments", 
"1656103334753", "21", "warc", "cc", "main", "20220627134424", "20220627164424", "00450", "warc", "gz", "warc_record_offset", "353753500", "warc_record_length", "26850"]}, "64": {"text": "{\"url\": \"https://nobeladventures.com/2019/04/14/riding-around-new-york/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764501555.34/warc/CC-MAIN-20230209081052-20230209111052-00418.warc.gz\", \"warc_record_offset\": 443600050, \"warc_record_length\": 28918}", "tokenized_text": ["url", "https", "nobeladventures", "com", "2019", "04", "14", "riding", "around", "new", "york", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764501555", "34", "warc", "cc", "main", "20230209081052", "20230209111052", "00418", "warc", "gz", "warc_record_offset", "443600050", "warc_record_length", "28918"]}, "65": {"text": "{\"url\": \"https://www.nationalhogfarmer.com/news/hog-prices-keep-sliding\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-40/segments/1664030337731.82/warc/CC-MAIN-20221006061224-20221006091224-00466.warc.gz\", \"warc_record_offset\": 944340676, \"warc_record_length\": 23482}", "tokenized_text": ["url", "https", "www", "nationalhogfarmer", "com", "news", "hog", "prices", "keep", "sliding", "warc_filename", "crawl", "data", "cc", "main", "2022", "40", "segments", "1664030337731", "82", "warc", "cc", "main", "20221006061224", "20221006091224", "00466", "warc", "gz", "warc_record_offset", "944340676", "warc_record_length", "23482"]}, "66": {"text": "{\"url\": \"https://www.listal.com/viewentry/67108\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-04/segments/1610703519843.24/warc/CC-MAIN-20210119232006-20210120022006-00714.warc.gz\", \"warc_record_offset\": 856244466, \"warc_record_length\": 6855}", "tokenized_text": ["url", "https", "www", "listal", "com", "viewentry", "67108", "warc_filename", "crawl", "data", "cc", "main", "2021", "04", "segments", "1610703519843", "24", "warc", "cc", "main", "20210119232006", "20210120022006", 
"00714", "warc", "gz", "warc_record_offset", "856244466", "warc_record_length", "6855"]}, "67": {"text": "{\"url\": \"https://www.gungoddess.com/blogs/choosing-a-gun/which-concealed-carry-guns-are-the-most-popular-right-now\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-43/segments/1634323585305.53/warc/CC-MAIN-20211020090145-20211020120145-00247.warc.gz\", \"warc_record_offset\": 952624329, \"warc_record_length\": 36797}", "tokenized_text": ["url", "https", "www", "gungoddess", "com", "blogs", "choosing", "a", "gun", "which", "concealed", "carry", "guns", "are", "the", "most", "popular", "right", "now", "warc_filename", "crawl", "data", "cc", "main", "2021", "43", "segments", "1634323585305", "53", "warc", "cc", "main", "20211020090145", "20211020120145", "00247", "warc", "gz", "warc_record_offset", "952624329", "warc_record_length", "36797"]}, "68": {"text": "{\"url\": \"https://www.historytoday.com/history-today-issues/volume-38-issue-9-september-1988\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-40/segments/1600400209665.4/warc/CC-MAIN-20200923015227-20200923045227-00745.warc.gz\", \"warc_record_offset\": 868067474, \"warc_record_length\": 7789}", "tokenized_text": ["url", "https", "www", "historytoday", "com", "history", "today", "issues", "volume", "38", "issue", "9", "september", "1988", "warc_filename", "crawl", "data", "cc", "main", "2020", "40", "segments", "1600400209665", "4", "warc", "cc", "main", "20200923015227", "20200923045227", "00745", "warc", "gz", "warc_record_offset", "868067474", "warc_record_length", "7789"]}, "69": {"text": "{\"url\": \"https://www.gsa.gov/historic-buildings/african-burial-ground-memorial-new-york-ny\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-21/segments/1652662530066.45/warc/CC-MAIN-20220519204127-20220519234127-00495.warc.gz\", \"warc_record_offset\": 887707280, \"warc_record_length\": 18194}", "tokenized_text": ["url", "https", "www", "gsa", "gov", "historic", "buildings", "african", "burial", "ground", 
"memorial", "new", "york", "ny", "warc_filename", "crawl", "data", "cc", "main", "2022", "21", "segments", "1652662530066", "45", "warc", "cc", "main", "20220519204127", "20220519234127", "00495", "warc", "gz", "warc_record_offset", "887707280", "warc_record_length", "18194"]}, "70": {"text": "{\"url\": \"https://hd-report.com/2014/02/14/comcast-merger-hideous-netflix-service/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-27/segments/1656103036176.7/warc/CC-MAIN-20220625220543-20220626010543-00439.warc.gz\", \"warc_record_offset\": 331507927, \"warc_record_length\": 70103}", "tokenized_text": ["url", "https", "hd", "report", "com", "2014", "02", "14", "comcast", "merger", "hideous", "netflix", "service", "warc_filename", "crawl", "data", "cc", "main", "2022", "27", "segments", "1656103036176", "7", "warc", "cc", "main", "20220625220543", "20220626010543", "00439", "warc", "gz", "warc_record_offset", "331507927", "warc_record_length", "70103"]}, "71": {"text": "{\"url\": \"https://www.lawyersandsettlements.com/legal-news/harvoni-denied-insurance-claim/harvoni-lawsuits-denied-insurance-claim-20731.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-43/segments/1634323588526.57/warc/CC-MAIN-20211028193601-20211028223601-00464.warc.gz\", \"warc_record_offset\": 1030808939, \"warc_record_length\": 12183}", "tokenized_text": ["url", "https", "www", "lawyersandsettlements", "com", "legal", "news", "harvoni", "denied", "insurance", "claim", "harvoni", "lawsuits", "denied", "insurance", "claim", "20731", "html", "warc_filename", "crawl", "data", "cc", "main", "2021", "43", "segments", "1634323588526", "57", "warc", "cc", "main", "20211028193601", "20211028223601", "00464", "warc", "gz", "warc_record_offset", "1030808939", "warc_record_length", "12183"]}, "72": {"text": "{\"url\": \"https://aceaviation.info/15107/msi-personal-cinema-15/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-24/segments/1590347409171.27/warc/CC-MAIN-20200530102741-20200530132741-00512.warc.gz\", 
\"warc_record_offset\": 230608992, \"warc_record_length\": 8405}", "tokenized_text": ["url", "https", "aceaviation", "info", "15107", "msi", "personal", "cinema", "15", "warc_filename", "crawl", "data", "cc", "main", "2020", "24", "segments", "1590347409171", "27", "warc", "cc", "main", "20200530102741", "20200530132741", "00512", "warc", "gz", "warc_record_offset", "230608992", "warc_record_length", "8405"]}, "73": {"text": "{\"url\": \"http://www.antehoc.com/2012/05/collateral-at-bank-of-italy-2.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500076.87/warc/CC-MAIN-20230203221113-20230204011113-00490.warc.gz\", \"warc_record_offset\": 49049249, \"warc_record_length\": 11713}", "tokenized_text": ["url", "http", "www", "antehoc", "com", "2012", "05", "collateral", "at", "bank", "of", "italy", "2", "html", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764500076", "87", "warc", "cc", "main", "20230203221113", "20230204011113", "00490", "warc", "gz", "warc_record_offset", "49049249", "warc_record_length", "11713"]}, "74": {"text": "{\"url\": \"https://www.thehansindia.com/news/cities/vijayawada/vijayawada-cpi-wants-cm-ys-jagan-to-raise-voice-against-vizag-steel-plant-sell-off-704225\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-49/segments/1637964362999.66/warc/CC-MAIN-20211204154554-20211204184554-00197.warc.gz\", \"warc_record_offset\": 1117681212, \"warc_record_length\": 45453}", "tokenized_text": ["url", "https", "www", "thehansindia", "com", "news", "cities", "vijayawada", "vijayawada", "cpi", "wants", "cm", "ys", "jagan", "to", "raise", "voice", "against", "vizag", "steel", "plant", "sell", "off", "704225", "warc_filename", "crawl", "data", "cc", "main", "2021", "49", "segments", "1637964362999", "66", "warc", "cc", "main", "20211204154554", "20211204184554", "00197", "warc", "gz", "warc_record_offset", "1117681212", "warc_record_length", "45453"]}, "75": {"text": "{\"url\": 
\"https://wkuf.fm/shows/wkuf-celebrates-the-30th-anniversary-of-violator/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-16/segments/1585370494064.21/warc/CC-MAIN-20200329074745-20200329104745-00223.warc.gz\", \"warc_record_offset\": 747266560, \"warc_record_length\": 14190}", "tokenized_text": ["url", "https", "wkuf", "fm", "shows", "wkuf", "celebrates", "the", "30th", "anniversary", "of", "violator", "warc_filename", "crawl", "data", "cc", "main", "2020", "16", "segments", "1585370494064", "21", "warc", "cc", "main", "20200329074745", "20200329104745", "00223", "warc", "gz", "warc_record_offset", "747266560", "warc_record_length", "14190"]}, "76": {"text": "{\"url\": \"https://dailytimes.com.pk/379160/pakistan-urges-iran-and-afghanistan-to-do-more/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-24/segments/1590347385193.5/warc/CC-MAIN-20200524210325-20200525000325-00286.warc.gz\", \"warc_record_offset\": 318772991, \"warc_record_length\": 14394}", "tokenized_text": ["url", "https", "dailytimes", "com", "pk", "379160", "pakistan", "urges", "iran", "and", "afghanistan", "to", "do", "more", "warc_filename", "crawl", "data", "cc", "main", "2020", "24", "segments", "1590347385193", "5", "warc", "cc", "main", "20200524210325", "20200525000325", "00286", "warc", "gz", "warc_record_offset", "318772991", "warc_record_length", "14394"]}, "77": {"text": "{\"url\": \"https://www.dailyfinland.fi/business/21426/KONE-Q1-results-improve/print\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-25/segments/1623488289268.76/warc/CC-MAIN-20210621181810-20210621211810-00385.warc.gz\", \"warc_record_offset\": 652955119, \"warc_record_length\": 4559}", "tokenized_text": ["url", "https", "www", "dailyfinland", "fi", "business", "21426", "kone", "q1", "results", "improve", "print", "warc_filename", "crawl", "data", "cc", "main", "2021", "25", "segments", "1623488289268", "76", "warc", "cc", "main", "20210621181810", "20210621211810", "00385", "warc", "gz", "warc_record_offset", 
"652955119", "warc_record_length", "4559"]}, "78": {"text": "{\"url\": \"https://newcanaanlandtrust.org/hannan/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500095.4/warc/CC-MAIN-20230204075436-20230204105436-00431.warc.gz\", \"warc_record_offset\": 445539595, \"warc_record_length\": 36215}", "tokenized_text": ["url", "https", "newcanaanlandtrust", "org", "hannan", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764500095", "4", "warc", "cc", "main", "20230204075436", "20230204105436", "00431", "warc", "gz", "warc_record_offset", "445539595", "warc_record_length", "36215"]}, "79": {"text": "{\"url\": \"https://www.coppolacomment.com/2016/05/where-on-earth-is-growth-in-greece.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-04/segments/1610703519843.24/warc/CC-MAIN-20210119232006-20210120022006-00102.warc.gz\", \"warc_record_offset\": 739236885, \"warc_record_length\": 37583}", "tokenized_text": ["url", "https", "www", "coppolacomment", "com", "2016", "05", "where", "on", "earth", "is", "growth", "in", "greece", "html", "warc_filename", "crawl", "data", "cc", "main", "2021", "04", "segments", "1610703519843", "24", "warc", "cc", "main", "20210119232006", "20210120022006", "00102", "warc", "gz", "warc_record_offset", "739236885", "warc_record_length", "37583"]}, "80": {"text": "{\"url\": \"https://philadelphia.cbslocal.com/2013/02/10/stars-sing-along-to-lumineers/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-27/segments/1656103269583.13/warc/CC-MAIN-20220626131545-20220626161545-00651.warc.gz\", \"warc_record_offset\": 500521076, \"warc_record_length\": 21082}", "tokenized_text": ["url", "https", "philadelphia", "cbslocal", "com", "2013", "02", "10", "stars", "sing", "along", "to", "lumineers", "warc_filename", "crawl", "data", "cc", "main", "2022", "27", "segments", "1656103269583", "13", "warc", "cc", "main", "20220626131545", "20220626161545", "00651", "warc", "gz", "warc_record_offset", "500521076", 
"warc_record_length", "21082"]}, "81": {"text": "{\"url\": \"http://www.tdisport.com/tdisport-diesel-news/passat-used-car-of-the-year/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-29/segments/1593657140746.69/warc/CC-MAIN-20200713002400-20200713032400-00490.warc.gz\", \"warc_record_offset\": 231800895, \"warc_record_length\": 24971}", "tokenized_text": ["url", "http", "www", "tdisport", "com", "tdisport", "diesel", "news", "passat", "used", "car", "of", "the", "year", "warc_filename", "crawl", "data", "cc", "main", "2020", "29", "segments", "1593657140746", "69", "warc", "cc", "main", "20200713002400", "20200713032400", "00490", "warc", "gz", "warc_record_offset", "231800895", "warc_record_length", "24971"]}, "82": {"text": "{\"url\": \"https://www.betarena.com/atlanta-united-vs-montreal-impact-betting-tip-and-prediction/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-10/segments/1581875145316.8/warc/CC-MAIN-20200220224059-20200221014059-00491.warc.gz\", \"warc_record_offset\": 600780066, \"warc_record_length\": 35158}", "tokenized_text": ["url", "https", "www", "betarena", "com", "atlanta", "united", "vs", "montreal", "impact", "betting", "tip", "and", "prediction", "warc_filename", "crawl", "data", "cc", "main", "2020", "10", "segments", "1581875145316", "8", "warc", "cc", "main", "20200220224059", "20200221014059", "00491", "warc", "gz", "warc_record_offset", "600780066", "warc_record_length", "35158"]}, "83": {"text": "{\"url\": \"http://climateemergencydeclaration.org/head-cracking-moments-of-the-declaration-campaign/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500719.31/warc/CC-MAIN-20230208060523-20230208090523-00280.warc.gz\", \"warc_record_offset\": 8674637, \"warc_record_length\": 16718}", "tokenized_text": ["url", "http", "climateemergencydeclaration", "org", "head", "cracking", "moments", "of", "the", "declaration", "campaign", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764500719", "31", 
"warc", "cc", "main", "20230208060523", "20230208090523", "00280", "warc", "gz", "warc_record_offset", "8674637", "warc_record_length", "16718"]}, "84": {"text": "{\"url\": \"http://2smeraldi.com/home/wp-includes/certificates/pdf/online-Studies-in-Modern-Music%3A-Frederick-Chopin%2C-Antonin-Dvo%C5%99%C3%A1k%2C-Johannes-Brahms-1904%2C2012/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-21/segments/1620243991772.66/warc/CC-MAIN-20210517115207-20210517145207-00081.warc.gz\", \"warc_record_offset\": 667469, \"warc_record_length\": 15861}", "tokenized_text": ["url", "http", "2smeraldi", "com", "home", "wp", "includes", "certificates", "pdf", "online", "studies", "in", "modern", "music", "3a", "frederick", "chopin", "2c", "antonin", "dvo", "c5", "99", "c3", "a1k", "2c", "johannes", "brahms", "1904", "2c2012", "warc_filename", "crawl", "data", "cc", "main", "2021", "21", "segments", "1620243991772", "66", "warc", "cc", "main", "20210517115207", "20210517145207", "00081", "warc", "gz", "warc_record_offset", "667469", "warc_record_length", "15861"]}, "85": {"text": "{\"url\": \"https://megamarathi.com/news/braveheart-movie-selected-in-third-eye-asian-film-festival/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-49/segments/1637964362219.5/warc/CC-MAIN-20211202114856-20211202144856-00451.warc.gz\", \"warc_record_offset\": 472202588, \"warc_record_length\": 24269}", "tokenized_text": ["url", "https", "megamarathi", "com", "news", "braveheart", "movie", "selected", "in", "third", "eye", "asian", "film", "festival", "warc_filename", "crawl", "data", "cc", "main", "2021", "49", "segments", "1637964362219", "5", "warc", "cc", "main", "20211202114856", "20211202144856", "00451", "warc", "gz", "warc_record_offset", "472202588", "warc_record_length", "24269"]}, "86": {"text": "{\"url\": \"https://plantpowerednomad.com/got-soy-milk-a-guide-to-vegan-lattes-in-taipei/\", \"warc_filename\": 
\"crawl-data/CC-MAIN-2022-49/segments/1669446711475.44/warc/CC-MAIN-20221209181231-20221209211231-00135.warc.gz\", \"warc_record_offset\": 502614546, \"warc_record_length\": 17761}", "tokenized_text": ["url", "https", "plantpowerednomad", "com", "got", "soy", "milk", "a", "guide", "to", "vegan", "lattes", "in", "taipei", "warc_filename", "crawl", "data", "cc", "main", "2022", "49", "segments", "1669446711475", "44", "warc", "cc", "main", "20221209181231", "20221209211231", "00135", "warc", "gz", "warc_record_offset", "502614546", "warc_record_length", "17761"]}, "87": {"text": "{\"url\": \"https://pnesterova.com/2021/04/03/english-listening-futurologist/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-05/segments/1642320304810.95/warc/CC-MAIN-20220125100035-20220125130035-00017.warc.gz\", \"warc_record_offset\": 487850033, \"warc_record_length\": 35314}", "tokenized_text": ["url", "https", "pnesterova", "com", "2021", "04", "03", "english", "listening", "futurologist", "warc_filename", "crawl", "data", "cc", "main", "2022", "05", "segments", "1642320304810", "95", "warc", "cc", "main", "20220125100035", "20220125130035", "00017", "warc", "gz", "warc_record_offset", "487850033", "warc_record_length", "35314"]}, "88": {"text": "{\"url\": \"https://shapingthefutureofpower.com/2020/03/02/covid19-and-china-africa-soft-power-considerations/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-21/segments/1652662573053.67/warc/CC-MAIN-20220524142617-20220524172617-00116.warc.gz\", \"warc_record_offset\": 576892098, \"warc_record_length\": 26350}", "tokenized_text": ["url", "https", "shapingthefutureofpower", "com", "2020", "03", "02", "covid19", "and", "china", "africa", "soft", "power", "considerations", "warc_filename", "crawl", "data", "cc", "main", "2022", "21", "segments", "1652662573053", "67", "warc", "cc", "main", "20220524142617", "20220524172617", "00116", "warc", "gz", "warc_record_offset", "576892098", "warc_record_length", "26350"]}, "89": {"text": "{\"url\": 
\"https://tass.com/politics/1023811\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-05/segments/1642320304570.90/warc/CC-MAIN-20220124124654-20220124154654-00633.warc.gz\", \"warc_record_offset\": 605037761, \"warc_record_length\": 18206}", "tokenized_text": ["url", "https", "tass", "com", "politics", "1023811", "warc_filename", "crawl", "data", "cc", "main", "2022", "05", "segments", "1642320304570", "90", "warc", "cc", "main", "20220124124654", "20220124154654", "00633", "warc", "gz", "warc_record_offset", "605037761", "warc_record_length", "18206"]}, "90": {"text": "{\"url\": \"https://popculturetimes.com/2020/07/30/edge-of-tomorrow-2-what-is-release-date-and-latest-update/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-17/segments/1618038921860.72/warc/CC-MAIN-20210419235235-20210420025235-00434.warc.gz\", \"warc_record_offset\": 566143162, \"warc_record_length\": 19036}", "tokenized_text": ["url", "https", "popculturetimes", "com", "2020", "07", "30", "edge", "of", "tomorrow", "2", "what", "is", "release", "date", "and", "latest", "update", "warc_filename", "crawl", "data", "cc", "main", "2021", "17", "segments", "1618038921860", "72", "warc", "cc", "main", "20210419235235", "20210420025235", "00434", "warc", "gz", "warc_record_offset", "566143162", "warc_record_length", "19036"]}, "91": {"text": "{\"url\": \"https://www.amnews.com/2016/11/12/centre-college-womens-soccer-moves-on-to-second-round/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764494826.88/warc/CC-MAIN-20230126210844-20230127000844-00124.warc.gz\", \"warc_record_offset\": 671862269, \"warc_record_length\": 20224}", "tokenized_text": ["url", "https", "www", "amnews", "com", "2016", "11", "12", "centre", "college", "womens", "soccer", "moves", "on", "to", "second", "round", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764494826", "88", "warc", "cc", "main", "20230126210844", "20230127000844", "00124", "warc", "gz", "warc_record_offset", 
"671862269", "warc_record_length", "20224"]}, "92": {"text": "{\"url\": \"https://dev.to/chiexplores/for-beginners-what-you-need-to-know-about-var-let-const-45pl\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764500273.30/warc/CC-MAIN-20230205161658-20230205191658-00037.warc.gz\", \"warc_record_offset\": 228975360, \"warc_record_length\": 20205}", "tokenized_text": ["url", "https", "dev", "to", "chiexplores", "for", "beginners", "what", "you", "need", "to", "know", "about", "var", "let", "const", "45pl", "warc_filename", "crawl", "data", "cc", "main", "2023", "06", "segments", "1674764500273", "30", "warc", "cc", "main", "20230205161658", "20230205191658", "00037", "warc", "gz", "warc_record_offset", "228975360", "warc_record_length", "20205"]}, "93": {"text": "{\"url\": \"https://v5.femalefirst.co.uk/celebrity/eddie-redmayne-uses-cbd-oil-ease-nerves-1286919.html\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-25/segments/1623488268274.66/warc/CC-MAIN-20210621055537-20210621085537-00406.warc.gz\", \"warc_record_offset\": 525187166, \"warc_record_length\": 12139}", "tokenized_text": ["url", "https", "v5", "femalefirst", "co", "uk", "celebrity", "eddie", "redmayne", "uses", "cbd", "oil", "ease", "nerves", "1286919", "html", "warc_filename", "crawl", "data", "cc", "main", "2021", "25", "segments", "1623488268274", "66", "warc", "cc", "main", "20210621055537", "20210621085537", "00406", "warc", "gz", "warc_record_offset", "525187166", "warc_record_length", "12139"]}, "94": {"text": "{\"url\": \"https://www.saturdayeveningpost.com/2017/10/coping-estranged-adult-children/\", \"warc_filename\": \"crawl-data/CC-MAIN-2023-06/segments/1674764499710.49/warc/CC-MAIN-20230129080341-20230129110341-00391.warc.gz\", \"warc_record_offset\": 1002765638, \"warc_record_length\": 30197}", "tokenized_text": ["url", "https", "www", "saturdayeveningpost", "com", "2017", "10", "coping", "estranged", "adult", "children", "warc_filename", "crawl", "data", "cc", "main", 
"2023", "06", "segments", "1674764499710", "49", "warc", "cc", "main", "20230129080341", "20230129110341", "00391", "warc", "gz", "warc_record_offset", "1002765638", "warc_record_length", "30197"]}, "95": {"text": "{\"url\": \"https://feifa.eu/spectacular-tax-savings-for-expats-using-spanish-compliant-investments/\", \"warc_filename\": \"crawl-data/CC-MAIN-2022-21/segments/1652663012542.85/warc/CC-MAIN-20220528031224-20220528061224-00707.warc.gz\", \"warc_record_offset\": 312775777, \"warc_record_length\": 12729}", "tokenized_text": ["url", "https", "feifa", "eu", "spectacular", "tax", "savings", "for", "expats", "using", "spanish", "compliant", "investments", "warc_filename", "crawl", "data", "cc", "main", "2022", "21", "segments", "1652663012542", "85", "warc", "cc", "main", "20220528031224", "20220528061224", "00707", "warc", "gz", "warc_record_offset", "312775777", "warc_record_length", "12729"]}, "96": {"text": "{\"url\": \"http://blogs.reading.ac.uk/crg/advent-botany-2015-day-15-mahleb/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-49/segments/1637964358847.80/warc/CC-MAIN-20211129225145-20211130015145-00550.warc.gz\", \"warc_record_offset\": 13017767, \"warc_record_length\": 17509}", "tokenized_text": ["url", "http", "blogs", "reading", "ac", "uk", "crg", "advent", "botany", "2015", "day", "15", "mahleb", "warc_filename", "crawl", "data", "cc", "main", "2021", "49", "segments", "1637964358847", "80", "warc", "cc", "main", "20211129225145", "20211130015145", "00550", "warc", "gz", "warc_record_offset", "13017767", "warc_record_length", "17509"]}, "97": {"text": "{\"url\": \"https://www.wired.com/2009/11/alt-text-clever-murdoch-turns-news-into-hip-underground-club/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-34/segments/1596439739048.46/warc/CC-MAIN-20200813161908-20200813191908-00099.warc.gz\", \"warc_record_offset\": 898049141, \"warc_record_length\": 73991}", "tokenized_text": ["url", "https", "www", "wired", "com", "2009", "11", "alt", "text", 
"clever", "murdoch", "turns", "news", "into", "hip", "underground", "club", "warc_filename", "crawl", "data", "cc", "main", "2020", "34", "segments", "1596439739048", "46", "warc", "cc", "main", "20200813161908", "20200813191908", "00099", "warc", "gz", "warc_record_offset", "898049141", "warc_record_length", "73991"]}, "98": {"text": "{\"url\": \"https://www.statesmanjournal.com/story/news/2015/10/16/oregon-wolf-or3-cascade-mountains-crater-lake-national-park/74071992/\", \"warc_filename\": \"crawl-data/CC-MAIN-2020-10/segments/1581875145774.75/warc/CC-MAIN-20200223123852-20200223153852-00550.warc.gz\", \"warc_record_offset\": 893628626, \"warc_record_length\": 42687}", "tokenized_text": ["url", "https", "www", "statesmanjournal", "com", "story", "news", "2015", "10", "16", "oregon", "wolf", "or3", "cascade", "mountains", "crater", "lake", "national", "park", "74071992", "warc_filename", "crawl", "data", "cc", "main", "2020", "10", "segments", "1581875145774", "75", "warc", "cc", "main", "20200223123852", "20200223153852", "00550", "warc", "gz", "warc_record_offset", "893628626", "warc_record_length", "42687"]}, "99": {"text": "{\"url\": \"https://www.businesstraveller.com/business-travel/2018/03/14/boeing-rolls-10000th-737-aircraft/\", \"warc_filename\": \"crawl-data/CC-MAIN-2021-21/segments/1620243992159.64/warc/CC-MAIN-20210517084550-20210517114550-00131.warc.gz\", \"warc_record_offset\": 690172719, \"warc_record_length\": 19143}", "tokenized_text": ["url", "https", "www", "businesstraveller", "com", "business", "travel", "2018", "03", "14", "boeing", "rolls", "10000th", "737", "aircraft", "warc_filename", "crawl", "data", "cc", "main", "2021", "21", "segments", "1620243992159", "64", "warc", "cc", "main", "20210517084550", "20210517114550", "00131", "warc", "gz", "warc_record_offset", "690172719", "warc_record_length", "19143"]}}
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/vocab_counts.json ADDED
The diff for this file is too large to render. See raw diff
 
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/zipf/zipf_basic_stats.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"xmin": 3, "xmax": 22, "alpha": 1.792957327015756, "ks_distance": 0.08076463006320389, "p-value": 0.0011150885998064807, "word_counts_unique": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 16, 22, 25, 29, 31, 43, 200, 77, 92, 100], "word_ranks_unique": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]}
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/zipf/zipf_fig.html ADDED
The diff for this file is too large to render. See raw diff
 
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_general_metadata/zipf/zipf_fig.json ADDED
@@ -0,0 +1 @@
 
 
1
+ "{\"data\":[{\"hovertext\":[\"heavenlytealeaves,investments,growth,improve,heavenly,gum,includes,inspiring,flows,flexforkal,inn,gsa,influencing,healthyceleb,india,ionigeria,islanders,head,hd,japan,ham,fimmel,james,fingerlakes1,fire,jagan,jada,items,italy,issues,guns,flex,gungoddess,gunfire,hawaii,isis,height,ill,impact,hog,gaga,hotel,gamecocksonline,hopefornigeriaonline,gauguin,ground,geek,green,holaamericanews,historytoday,how,geofence,greece,gets,glamsham,history,historic,greatfallstribune,gov,hip,house,htm,got,frederick,henryusa,ie,idropnews,foresight,forum,idea,here,group,fourth,iberico,futurologist,freeman,ian,fujifilm,full,i,hideous,fundraising,hand,future,hannan,dvo,femalefirst,celebs,chinas,children,chiexplores,charged,channel,champagne,centre,celebrity,camps,celebrates,cbd,caught,category,cast,cascade,carry,choosing,chopin,chronicle,cinema,compliant,comes,comcast,college,collateral,coastal,cmt,cmeindia,cm,club,climateemergencydeclaration,clever,cleaning,city,cities,car,callbag,feifa,birmingcabbie,boeing,bluemonsterprep,bluegrass,blogspot,blast,blame,birthday,biggest,ca,between,betting,betarena,beginners,bedandbicycle,beat,battle,boldly,botany,box,brahms,c5,c3,buy,butterfly,businesstraveller,burial,built,buildings,bros,brody,brindisa,brighton,bricks,breaking,braveheart,concealed,considerations,const,el,eppc,enugu,entrepreneurship,english,enforcement,embassy,elon,edu,contact,ednews,editorials,edge,eddie,ease,earth,earbuds,epub,estranged,etoren,eu,feared,fascism,famous,fame10,fairness,factors,eye,exquisitecoasts,explosions,exploring,experience,expats,exoticization,ever,ev3,dwayne,jewishnews,due,deacons,day,date,daisuke,dailytimes,dailyfinland,daftar,crg,crater,cracking,cpi,covid19,court,coppolacomment,coping,cool,days,dead,drops,deaths,drives,down,doorways,dolphins,dogs,divorce,disruption,disrupt,display,difference,diesel,dev,detainment,decision,decade,jeong,mae,johannes,spreads,su,studies,straight,steel,statistics,statesmanjournal,stars,stackexchange,specia
list,slidelegend,spanish,space,soy,song,soft,soccer,snow,smell,subsequent,suesspiciousminds,survey,survivor,theshelfofunreadbooks,thenextweb,thehansindia,thegatorseye,text,terror,telescope,tehran,technology,teas,tea,tass,tasman,talesofthetravelbug,taipei,t200,susan,sliding,slam,threatened,rolls,sabres,s,rumors,rules,rte,rowadventures,round,rothschild,rock,sitra,robbers,right,riding,reviewthe,review,results,rescued,report,said,saturdayeveningpost,savings,saw,sir,sing,simonegallina,silverchips,silver,shows,shoah,shapingthefutureofpower,seventh,service,september,selected,secure,second,scrobble,screen,scottie,third,ti,johnson,waller,wave,wausau,watch,was,warrant,warning,war,wants,vw,very,volume,voice,vizag,violator,violates,viewentry,view,vidmid,webb,webs,weeklygripe,weight,ys,years,year,x,wwe,wp,worldwide,worldnews,works,word,wolf,wishers,wireless,wired,wins,which,where,video,vegan,timesofisrael,towards,turns,turkey,trying,true,tribune,trending,travis,travel,totalitarianism,var,toptipz,too,tonto,tomorrow,tolkien,today,tm,tip,twinning,twitter,two,ud,vagabondinn,vagabond,v5,utah,using,uses,used,uscleiden,urges,update,up,united,uniforms,ungeek,underground,ultra,uid,repair,releases,release,mazda,microbead,miata,merger,memorial,megamarathi,me,mckellen,mbhs,mazatlan,baku,may,match,martin,marshfieldnewsherald,march,manufacturing,man,malcolm,milk,minnesota,minute,modern,muslims,muslim,musk,music,murdoch,murals,msi,movizark,moving,movie,moves,mountains,moriyama,more,montreal,montana,moments,mahleb,lumineers,relations,know,latest,landmark,lamborghinichat,lamborghini,lake,lady,koreaherald,kone,knollwood,love,kitchen,kit,kickstart,ken,keep,kabul,journals_59d9b3c91723dd4b8ead833e,join,lattes,launch,lava,law,loss,lose,lord,look,listening,listen,listal,liotta,line,library,let,lenses,legend,legal,leaves,lawyersandsettlements,lawsuits,nationalhogfarmer,nba,need,podcast,prediction,powerful,popular,popculturetimes,pong,point,poems,podparadise,pnesterova,nerves,plantpowerednomad,plans,pk,
pippen,pinkett,php,photos,philadelphia,predictions,prehistoric,press,prices,reed,redmayne,receives,realdealtheatre,reading,react,reached,ray,raise,races,questions,q1,publication,prufrock,project,products,primitivism,ph,petebowes,personal,orbit,or,opinion,online,once,oil,offer,offensive,off,occupancy,observer,ny,now,nobeladventures,ng,nfl,newcanaanlandtrust,netflix,or3,oregon,pdf,orlean,paulinepark,patrol,patriots,passat,participatory,parkablogs,paradigmshyft,pakistan,painter,page,paganism,oxford,owi,overuse,out,otlsports,other,bank,ac,awards,1674764499654,1674764500094,1674764500028,1674764499919,1674764499890,1674764499845,1674764499713,1674764499710,1674764499697,1674764499524,1669446710764,1674764495001,1674764494852,1674764494826,16718,1669446711475,1669446711074,1669446710972,1669446710789,1674764500095,1674764500273,1674764500339,1674764500719,1904,19036,18206,18194,180032374,178693725,17761,176182646,175500523,17548,17509,17495,17321,17143,1674764501555,1674764501066,1674764500758,1669446710771,1664030337731,19267,1631780057202,1637964362999,1637964362219,1637964358847,1634323588526,1634323588398,1634323587606,1634323585322,1634323585305,1627046157039,1664030337625,1627046155458,1627046154175,1623488567696,1623488528979,1623488289268,1623488268274,1623487612537,1620243992159,1637964363292,1642320304570,1642320304810,1652662521041,1664030337322,16616,1659882573630,1659882573145,1659882572063,1659882571090,1659882571056,1656104496688,1656103626162,1656103334753,1656103269583,1656103036176,1652663012542,1652662573053,1652662539101,1652662531762,1652662530066,19143,19373,1618039388763,20205,20210415065637,20210415035637,20210413152252,20210413122252,20210123155715,20210123125715,20210123031629,20210123001629,20201206001729,20200808110642,20201205211729,20201128041115,20201128011115,20201124112900,20201124082900,20200923045227,20200923015227,20200813191908,20210419235235,20210420025235,20210420091336,20210420121336,20210801122716,20210801092716,20210625053840,2021
0625023840,20210623041557,20210623011557,20210621211810,20210621181810,20210621085537,20210621055537,20210614165913,20210614135913,20210517145207,20210517115207,20210517114550,20210517084550,20210504001014,20200813161908,20200808080642,19781,20200120022426,20200222023354,20200221233354,20200221014059,20200220224059,20200125221854,20200125191854,20200122220620,20200122191620,20200119234426,20200807202851,20200119005909,20200118221909,2010,2009,2008,20061,199444381,1988,20200223123852,20200223153852,20200229022458,20200229052458,20200807172851,20200713032400,20200713002400,20200530132741,20200530102741,20200527043445,20200527013445,20200525000325,20200524210325,20200409090349,20200409055849,20200402110824,20200402080824,20200331083639,20200331053639,20200329104745,20200329074745,1620243991772,1618038921860,20210805093730,00476,00543,00524,00512,00506,00503,00495,00491,00477,00466,00393,00464,00455,00451,00450,00434,00431,00418,00406,00580,00587,00601,00602,00740,00714,00707,00703,00694,00685,00673,00658,00652,00651,00649,00639,00636,00633,00630,00614,00613,00398,00391,00747,00116,00197,00175,00165,00153,00141,00135,00131,00124,00108,00385,00107,00102,00099,00081,00037,00017,00015,00008,00200,00211,00223,00229,00365,00355,00330,00327,00319,00312,00308,00304,00300,00295,00286,00284,00280,00274,00250,00247,00233,00745,00754,1618038083007,1579250593994,1585370494064,1581875148375,1581875145774,1581875145621,1581875145316,1579251681412,1579250607407,1579250595787,1533560948,145050,15107,15044,1490421,14884,14792,14745,14681,14655,1585370500331,1585370506870,1585371830894,15861,1618038072366,161515301,1610703538082,1610703531702,1607614191,1606141750841,1606141194982,1606141176049,1600400209665,1596439739048,1596439737319,1596439737206,159639259,1593657140746,1590347409171,1590347392057,1590347385193,14575,14394,00762,1036937141,11206,1117681212,111531585,1096265,1088907449,1068673394,1044766559,10413,1030808939,14280,1023811,1002765638,1002652790,10000th,09,00871,00826,008
21,1124917887,11425,11479,11530,14190,13553,13209660,13017767,1288254429,1286919,12729,1269439293,12466,1239312380,12183,12139,1201,11838,11801,11772,11713,20210805063730,20210805193327,award,566143162,60,598837769,5971,591253388,58668,58147,58,576892098,559787302,51,55792,54041,54,536988620,532769245,525905978,525187166,516800798,600780066,605037761,628149199,63,70,690172719,6855,683956680,683638756,671862269,67108,67,667469,664847335,66290,652955119,65,640302554,640116,64,6324,510157631,503711292,701475464,3a,43188,43051,42926,42687,42,41,40224,3rd,39518,502614546,379160,37583,36797,3645355002,36215,358919877,353753500,35314,44,440902,442133024,443600050,500521076,49049249,487850033,486450658,48,476834551,473272628,472202588,46,45pl,4559,455460747,45453,4512,450138738,445539595,444833137,70103,704225,348002670,agwired,amendment,alt,along,allegedly,all,alfred,alcohol,aircraft,against,about,african,afghanistan,affordable,advent,adults,adult,adrien,aceaviation,amnews,an,anniversary,antehoc,authoritarianism,author,austinchronicle,attack,atlanta,asian,as,art,arout,around,armed,are,archiveislam,ar,apple,antonin,anticipation,00001,abel,72,766206361,84010,837566473,83,828569590,792006189,782007633,78,7789,7640,a1k,748368127,747266560,74071992,73991,739236885,737,72974,728,8405,8481,85,856244466,972941129,9581,952624329,944340676,935440331,930101659,904154101,90,898049141,893628626,887707280,870524278,870013982,868067474,8674637,85722960,857199535,35158,347762646,20210805223327,20220818003501,20221006061224,20221005135356,20221005105356,20221002145028,20221002115028,20220819100211,20220819070211,20220818033501,20220814203832,20220627164424,20220814173832,20220810005803,20220809215803,20220809185137,20220809155137,20220704232455,20220704202455,20220629114939,20221006091224,20221130124353,20221130154353,20221130192708,20230128054815,20230127194242,20230127164242,20230127031911,20230127001911,20230127000844,20230126210844,20224,20221209211231,20221209181231,20221206090908,202
21206060908,20221204134311,20221204104311,20221201051257,20221201021257,20221130222708,20220629084939,20220627134424,20230128184907,20211028192638,20211204184554,20211204154554,20211202144856,20211202114856,20211130015145,20211129225145,20211028223601,20211028193601,20211028162638,20220626161545,20211024234628,20211024204628,20211020182307,20211020152307,20211020120145,20211020090145,20210921131319,20210921101319,20211206103243,20211206133243,20220124124654,20220124154654,20220626131545,20220626010543,20220625220543,20220528061224,20220528031224,20220524172617,20220524142617,20220521142022,20220521112022,20220520091824,20220520061824,20220519234127,20220519204127,20220518051247,20220518021247,20220125130035,20220125100035,20230128084815,20230128214907,33777,24971,27741,27715,26850,26350,263492130,250370303,25025,25017,24833,228975360,24269,241286356,23522,23482,233367373,23266,231800895,230608992,27864,28496,28617,28918,331507927,32672,322560850,32,3188,318772991,31698,312775777,31146,30th,303777204,30197,30176,2smeraldi,2c2012,29736,29529,23,226886,20230129012420,20230201081311,20230205161658,20230204105436,20230204075436,20230204074030,20230204044030,20230202163541,20230202133541,20230201111311,20230131220543,22407,20230131190543,20230131085533,20230131055533,20230129142153,20230129112153,20230129110341,20230129080341,20230129042420,20230205191658,20230206113934,20230206143934,20230208060523,22399,22120,22,21947,21426,21310,21082,208318103,20731,20698,20230209111052,20230209081052,20230209044102,20230209014102,20230208122053,20230208092053,20230208090523,zwillgen\",\"20,my,6,38,multiple,20210119232006,covid,20210120022006,57,spectacular,park,content,most,gun,15,68,7,19,national,53,1610703519843,2014,four,28,food,en,energy,fm,r,gold,2015,4,power,print,37,2017,45,26,politics,rings,1674764500076,police,36,night,2012,denied,2c,declaration,net,35,plant,first,sell,0,13,caribbeantales,info,00439,africa,vijayawada,00490,business,by,00550,campaign,9,88,cbslocal,87,01,certi
ficates,his,vs,last,20230203221113,book,you,york,00060,beats,j,womens,billion,wkuf,issue,will,00248,iran,body,insurance,wasted,charlie,festival,china,tax,74,this,20230204011113,claim,80,media,76,82,tdisport,fi,harvoni\",\"8,co,uk,95,smith,home,11,66,guide,blog,75,at,into,2018,69,08,29,story,07,30,world,2013,org,wordpress,39,2011\",\"2,99,12,34,film,5,2019,new\",\"33,is,17,do,50,40,for,31,43,24,what,2016,25\",\"on,03,04,14,blogs\",\"16,with,a\",\"27,http,02\",\"and,in,html,21\",\"49\",\"05\",\"10,of\",\"to\",\"news,2023\",\"06,the,2020\",\"2022\",\"2021\",\"www\",\"main,warc,cc\",\"com\",\"https\",\"data,warc_filename,url,warc_record_offset,segments,warc_record_length,gz,crawl\"],\"name\":\"Word Rank Frequency\",\"x\":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22],\"y\":[200,200,200,100,100,100,100,100,100,100,100,92,77,43,31,29,25,25,25,22,22,16,13,13,12,11,10,10,10,10,8,8,8,7,7,7,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],\"type\":\"bar\"},{\"hovertext\":[\"2,99,12,34,film,5,2019,new\",\"33,is,17,do,50,40,for,31,43,24,what,2016,25\",\"on,03,04,14,blogs\",\"16,with,a\",\"27,http,02\",\"and,in,html,21\",\"49\",\"05\",\"10,of\",\"to\",\"news,2023\",\"06,th
e,2020\",\"2022\",\"2021\",\"www\",\"main,warc,cc\",\"com\",\"https\",\"data,warc_filename,url,warc_record_offset,segments,warc_record_length,gz,crawl\"],\"line\":{\"color\":\"crimson\",\"width\":3},\"name\":\"Zipf Predicted Frequency\",\"x\":[4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22],\"y\":[31,21,15,11,9,7,6,5,4,4,3,3,3,2,2,2,2,2,1],\"type\":\"scatter\"}],\"layout\":{\"xaxis\":{\"range\":[0,100],\"title\":{\"text\":\"Word Rank\"}},\"template\":{\"data\":{\"candlestick\":[{\"decreasing\":{\"line\":{\"color\":\"#000033\"}},\"increasing\":{\"line\":{\"color\":\"#000032\"}},\"type\":\"candlestick\"}],\"contourcarpet\":[{\"colorscale\":[[0.0,\"#000011\"],[0.1111111111111111,\"#000012\"],[0.2222222222222222,\"#000013\"],[0.3333333333333333,\"#000014\"],[0.4444444444444444,\"#000015\"],[0.5555555555555556,\"#000016\"],[0.6666666666666666,\"#000017\"],[0.7777777777777778,\"#000018\"],[0.8888888888888888,\"#000019\"],[1.0,\"#000020\"]],\"type\":\"contourcarpet\"}],\"contour\":[{\"colorscale\":[[0.0,\"#000011\"],[0.1111111111111111,\"#000012\"],[0.2222222222222222,\"#000013\"],[0.3333333333333333,\"#000014\"],[0.4444444444444444,\"#000015\"],[0.5555555555555556,\"#000016\"],[0.6666666666666666,\"#000017\"],[0.7777777777777778,\"#000018\"],[0.8888888888888888,\"#000019\"],[1.0,\"#000020\"]],\"type\":\"contour\"}],\"heatmap\":[{\"colorscale\":[[0.0,\"#000011\"],[0.1111111111111111,\"#000012\"],[0.2222222222222222,\"#000013\"],[0.3333333333333333,\"#000014\"],[0.4444444444444444,\"#000015\"],[0.5555555555555556,\"#000016\"],[0.6666666666666666,\"#000017\"],[0.7777777777777778,\"#000018\"],[0.8888888888888888,\"#000019\"],[1.0,\"#000020\"]],\"type\":\"heatmap\"}],\"histogram2d\":[{\"colorscale\":[[0.0,\"#000011\"],[0.1111111111111111,\"#000012\"],[0.2222222222222222,\"#000013\"],[0.3333333333333333,\"#000014\"],[0.4444444444444444,\"#000015\"],[0.5555555555555556,\"#000016\"],[0.6666666666666666,\"#000017\"],[0.7777777777777778,\"#000018\"],[0.8888888888888888,\"#00001
9\"],[1.0,\"#000020\"]],\"type\":\"histogram2d\"}],\"icicle\":[{\"textfont\":{\"color\":\"white\"},\"type\":\"icicle\"}],\"sankey\":[{\"textfont\":{\"color\":\"#000036\"},\"type\":\"sankey\"}],\"scatter\":[{\"marker\":{\"line\":{\"width\":0}},\"type\":\"scatter\"}],\"table\":[{\"cells\":{\"fill\":{\"color\":\"#000038\"},\"font\":{\"color\":\"#000037\"},\"line\":{\"color\":\"#000039\"}},\"header\":{\"fill\":{\"color\":\"#000040\"},\"font\":{\"color\":\"#000036\"},\"line\":{\"color\":\"#000039\"}},\"type\":\"table\"}],\"waterfall\":[{\"connector\":{\"line\":{\"color\":\"#000036\",\"width\":2}},\"decreasing\":{\"marker\":{\"color\":\"#000033\"}},\"increasing\":{\"marker\":{\"color\":\"#000032\"}},\"totals\":{\"marker\":{\"color\":\"#000034\"}},\"type\":\"waterfall\"}]},\"layout\":{\"coloraxis\":{\"colorscale\":[[0.0,\"#000011\"],[0.1111111111111111,\"#000012\"],[0.2222222222222222,\"#000013\"],[0.3333333333333333,\"#000014\"],[0.4444444444444444,\"#000015\"],[0.5555555555555556,\"#000016\"],[0.6666666666666666,\"#000017\"],[0.7777777777777778,\"#000018\"],[0.8888888888888888,\"#000019\"],[1.0,\"#000020\"]]},\"colorscale\":{\"diverging\":[[0.0,\"#000021\"],[0.1,\"#000022\"],[0.2,\"#000023\"],[0.3,\"#000024\"],[0.4,\"#000025\"],[0.5,\"#000026\"],[0.6,\"#000027\"],[0.7,\"#000028\"],[0.8,\"#000029\"],[0.9,\"#000030\"],[1.0,\"#000031\"]],\"sequential\":[[0.0,\"#000011\"],[0.1111111111111111,\"#000012\"],[0.2222222222222222,\"#000013\"],[0.3333333333333333,\"#000014\"],[0.4444444444444444,\"#000015\"],[0.5555555555555556,\"#000016\"],[0.6666666666666666,\"#000017\"],[0.7777777777777778,\"#000018\"],[0.8888888888888888,\"#000019\"],[1.0,\"#000020\"]],\"sequentialminus\":[[0.0,\"#000011\"],[0.1111111111111111,\"#000012\"],[0.2222222222222222,\"#000013\"],[0.3333333333333333,\"#000014\"],[0.4444444444444444,\"#000015\"],[0.5555555555555556,\"#000016\"],[0.6666666666666666,\"#000017\"],[0.7777777777777778,\"#000018\"],[0.8888888888888888,\"#000019\"],[1.0,\"#000020\"]]},\"colorw
ay\":[\"#000001\",\"#000002\",\"#000003\",\"#000004\",\"#000005\",\"#000006\",\"#000007\",\"#000008\",\"#000009\",\"#000010\"]}},\"title\":{\"text\":\"Word Counts, Observed and Predicted by Zipf\"},\"yaxis\":{\"title\":{\"text\":\"Frequency\"}},\"legend\":{\"yanchor\":\"top\",\"y\":0.99,\"xanchor\":\"left\",\"x\":0.1}}}"
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_images/base_dset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f23eb11877f1bf934ba12d0ce910ccb71b9ce3865798dc6651d12425244b529
3
+ size 489144
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_images/base_dset/dataset_info.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builder_name": "generator",
3
+ "citation": "",
4
+ "config_name": "default",
5
+ "dataset_name": "generator",
6
+ "dataset_size": 487928,
7
+ "description": "",
8
+ "download_checksums": {},
9
+ "download_size": 0,
10
+ "features": {
11
+ "images": {
12
+ "feature": {
13
+ "dtype": "string",
14
+ "_type": "Value"
15
+ },
16
+ "_type": "Sequence"
17
+ },
18
+ "metadata": {
19
+ "dtype": "string",
20
+ "_type": "Value"
21
+ },
22
+ "general_metadata": {
23
+ "dtype": "string",
24
+ "_type": "Value"
25
+ },
26
+ "texts": {
27
+ "feature": {
28
+ "dtype": "string",
29
+ "_type": "Value"
30
+ },
31
+ "_type": "Sequence"
32
+ }
33
+ },
34
+ "homepage": "",
35
+ "license": "",
36
+ "size_in_bytes": 487928,
37
+ "splits": {
38
+ "train": {
39
+ "name": "train",
40
+ "num_bytes": 487928,
41
+ "num_examples": 100,
42
+ "dataset_name": "generator"
43
+ }
44
+ },
45
+ "version": {
46
+ "version_str": "0.0.0",
47
+ "major": 0,
48
+ "minor": 0,
49
+ "patch": 0
50
+ }
51
+ }
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_images/base_dset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "ae60b20f0290ac61",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": "train"
13
+ }
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_images/dset_peek.json ADDED
The diff for this file is too large to render. See raw diff
 
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_images/general_stats_dict.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"total words": 984, "total open words": 926, "text_nan_count": 0, "duplicate_fraction": 0.0}
cache_dir/HuggingFaceM4/OBELICS_opt_out_docs_removed_2023_07_12_train_images/lengths/length_measurements.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"average_instance_length": 13.27016129032258, "standard_dev_instance_length": 4.472542952248761, "num_instance_lengths": 23}