Anthonyg5005
committed on
Commit
•
3901d3e
1
Parent(s):
a14d22f
Update EXL2_Private_Quant_V3.ipynb
Browse files
ipynb/EXL2_Private_Quant_V3.ipynb
CHANGED
@@ -163,31 +163,35 @@
|
|
163 |
"#@markdown ![Example Image](https://huggingface.co/Anthonyg5005/hf-scripts/resolve/main/ipynb/dataset-example.jpg \"Copy from download button\")\\\n",
|
164 |
"#@markdown If dataset is jsonl then convert to parquet. (Not always reliable, must be formatted correctly)\n",
|
165 |
"convert_parquet = False # @param {type:\"boolean\"}\n",
|
|
|
|
|
166 |
"if convert_parquet == True:\n",
|
167 |
-
" import pandas as pd
|
168 |
-
" import pyarrow as pa
|
169 |
-
" import pyarrow.parquet as pq
|
170 |
-
" import json
|
171 |
"\n",
|
172 |
-
" def jsonl_to_parquet(jsonl_file, parquet_file)
|
173 |
" # Read JSONL file line by line with explicit encoding\n",
|
174 |
-
" with open(jsonl_file, 'r', encoding='utf-8') as f
|
175 |
-
" lines = f.readlines()
|
176 |
"\n",
|
177 |
" # Parse JSON lines and store as list of dictionaries\n",
|
178 |
-
" data = [json.loads(line.strip()) for line in lines]
|
179 |
"\n",
|
180 |
" # Convert to Pandas DataFrame\n",
|
181 |
-
" df = pd.DataFrame(data)
|
182 |
"\n",
|
183 |
" # Convert DataFrame to PyArrow Table\n",
|
184 |
-
" table = pa.Table.from_pandas(df)
|
185 |
"\n",
|
186 |
" # Write PyArrow Table to Parquet file\n",
|
187 |
-
" pq.write_table(table, parquet_file)
|
188 |
"\n",
|
189 |
-
" jsonl_to_parquet(dataset, f\"{dataset_jtp}.parquet\")
|
190 |
" dataset = f\"{dataset_jtp}.parquet\"\n",
|
|
|
|
|
191 |
"#@markdown Quantizing only allows for parquet datasets to be used. Enable convert_parquet if your dataset ends in a .jsonl extension.\\\n",
|
192 |
"#@markdown ![Example Image](https://huggingface.co/Anthonyg5005/hf-scripts/resolve/main/ipynb/jsonl-example.jpg \"File extension is .jsonl\")\\\n",
|
193 |
"#@markdown pippa is used as an example in this image. This dataset contains content that is not suitable for users under 18. This dataset also may or may not be against colab TOS. It won't be allowed under free colab usage although you're able to use it with paid compute units.\n",
|
|
|
163 |
"#@markdown ![Example Image](https://huggingface.co/Anthonyg5005/hf-scripts/resolve/main/ipynb/dataset-example.jpg \"Copy from download button\")\\\n",
|
164 |
"#@markdown If dataset is jsonl then convert to parquet. (Not always reliable, must be formatted correctly)\n",
|
165 |
"convert_parquet = False # @param {type:\"boolean\"}\n",
|
166 |
+
"\n",
|
167 |
+
" #GITHUB COPILOT GENERATED START\n",
|
168 |
"if convert_parquet == True:\n",
|
169 |
+
" import pandas as pd\n",
|
170 |
+
" import pyarrow as pa\n",
|
171 |
+
" import pyarrow.parquet as pq\n",
|
172 |
+
" import json\n",
|
173 |
"\n",
|
174 |
+
" def jsonl_to_parquet(jsonl_file, parquet_file):\n",
|
175 |
" # Read JSONL file line by line with explicit encoding\n",
|
176 |
+
" with open(jsonl_file, 'r', encoding='utf-8') as f:\n",
|
177 |
+
" lines = f.readlines()\n",
|
178 |
"\n",
|
179 |
" # Parse JSON lines and store as list of dictionaries\n",
|
180 |
+
" data = [json.loads(line.strip()) for line in lines]\n",
|
181 |
"\n",
|
182 |
" # Convert to Pandas DataFrame\n",
|
183 |
+
" df = pd.DataFrame(data)\n",
|
184 |
"\n",
|
185 |
" # Convert DataFrame to PyArrow Table\n",
|
186 |
+
" table = pa.Table.from_pandas(df)\n",
|
187 |
"\n",
|
188 |
" # Write PyArrow Table to Parquet file\n",
|
189 |
+
" pq.write_table(table, parquet_file)\n",
|
190 |
"\n",
|
191 |
+
" jsonl_to_parquet(dataset, f\"{dataset_jtp}.parquet\")\n",
|
192 |
" dataset = f\"{dataset_jtp}.parquet\"\n",
|
193 |
+
" #GITHUB COPILOT GENERATED END\n",
|
194 |
+
"\n",
|
195 |
"#@markdown Quantizing only allows for parquet datasets to be used. Enable convert_parquet if your dataset ends in a .jsonl extension.\\\n",
|
196 |
"#@markdown ![Example Image](https://huggingface.co/Anthonyg5005/hf-scripts/resolve/main/ipynb/jsonl-example.jpg \"File extension is .jsonl\")\\\n",
|
197 |
"#@markdown pippa is used as an example in this image. This dataset contains content that is not suitable for users under 18. This dataset also may or may not be against colab TOS. It won't be allowed under free colab usage although you're able to use it with paid compute units.\n",
|