Anthonyg5005 committed on
Commit
3901d3e
1 Parent(s): a14d22f

Update EXL2_Private_Quant_V3.ipynb

Browse files
Files changed (1) hide show
  1. ipynb/EXL2_Private_Quant_V3.ipynb +16 -12
ipynb/EXL2_Private_Quant_V3.ipynb CHANGED
@@ -163,31 +163,35 @@
163
  "#@markdown ![Example Image](https://huggingface.co/Anthonyg5005/hf-scripts/resolve/main/ipynb/dataset-example.jpg \"Copy from download button\")\\\n",
164
  "#@markdown If dataset is jsonl then convert to parquet. (Not always reliable, must be formatted correctly)\n",
165
  "convert_parquet = False # @param {type:\"boolean\"}\n",
 
 
166
  "if convert_parquet == True:\n",
167
- " import pandas as pd #GitHub Copilot generated\n",
168
- " import pyarrow as pa #GitHub Copilot generated\n",
169
- " import pyarrow.parquet as pq #GitHub Copilot generated\n",
170
- " import json #GitHub Copilot generated\n",
171
  "\n",
172
- " def jsonl_to_parquet(jsonl_file, parquet_file): #GitHub Copilot generated\n",
173
  " # Read JSONL file line by line with explicit encoding\n",
174
- " with open(jsonl_file, 'r', encoding='utf-8') as f: #GitHub Copilot generated\n",
175
- " lines = f.readlines() #GitHub Copilot generated\n",
176
  "\n",
177
  " # Parse JSON lines and store as list of dictionaries\n",
178
- " data = [json.loads(line.strip()) for line in lines] #GitHub Copilot generated\n",
179
  "\n",
180
  " # Convert to Pandas DataFrame\n",
181
- " df = pd.DataFrame(data) #GitHub Copilot generated\n",
182
  "\n",
183
  " # Convert DataFrame to PyArrow Table\n",
184
- " table = pa.Table.from_pandas(df) #GitHub Copilot generated\n",
185
  "\n",
186
  " # Write PyArrow Table to Parquet file\n",
187
- " pq.write_table(table, parquet_file) #GitHub Copilot generated\n",
188
  "\n",
189
- " jsonl_to_parquet(dataset, f\"{dataset_jtp}.parquet\") #GitHub Copilot generated\n",
190
  " dataset = f\"{dataset_jtp}.parquet\"\n",
 
 
191
  "#@markdown Quantizing only allows for parquet datasets to be used. Enable convert_parquet if your dataset ends in a .jsonl extension.\\\n",
192
  "#@markdown ![Example Image](https://huggingface.co/Anthonyg5005/hf-scripts/resolve/main/ipynb/jsonl-example.jpg \"File extension is .jsonl\")\\\n",
193
  "#@markdown pippa is used as an example in this image. This dataset contains content that is not suitable for users under 18. This dataset also may or may not be against colab TOS. It won't be allowed under free colab usage although you're able to use it with paid compute units.\n",
 
163
  "#@markdown ![Example Image](https://huggingface.co/Anthonyg5005/hf-scripts/resolve/main/ipynb/dataset-example.jpg \"Copy from download button\")\\\n",
164
  "#@markdown If dataset is jsonl then convert to parquet. (Not always reliable, must be formatted correctly)\n",
165
  "convert_parquet = False # @param {type:\"boolean\"}\n",
166
+ "\n",
167
+ " #GITHUB COPILOT GENERATED START\n",
168
  "if convert_parquet == True:\n",
169
+ " import pandas as pd\n",
170
+ " import pyarrow as pa\n",
171
+ " import pyarrow.parquet as pq\n",
172
+ " import json\n",
173
  "\n",
174
+ " def jsonl_to_parquet(jsonl_file, parquet_file):\n",
175
  " # Read JSONL file line by line with explicit encoding\n",
176
+ " with open(jsonl_file, 'r', encoding='utf-8') as f:\n",
177
+ " lines = f.readlines()\n",
178
  "\n",
179
  " # Parse JSON lines and store as list of dictionaries\n",
180
+ " data = [json.loads(line.strip()) for line in lines]\n",
181
  "\n",
182
  " # Convert to Pandas DataFrame\n",
183
+ " df = pd.DataFrame(data)\n",
184
  "\n",
185
  " # Convert DataFrame to PyArrow Table\n",
186
+ " table = pa.Table.from_pandas(df)\n",
187
  "\n",
188
  " # Write PyArrow Table to Parquet file\n",
189
+ " pq.write_table(table, parquet_file)\n",
190
  "\n",
191
+ " jsonl_to_parquet(dataset, f\"{dataset_jtp}.parquet\")\n",
192
  " dataset = f\"{dataset_jtp}.parquet\"\n",
193
+ " #GITHUB COPILOT GENERATED END\n",
194
+ "\n",
195
  "#@markdown Quantizing only allows for parquet datasets to be used. Enable convert_parquet if your dataset ends in a .jsonl extension.\\\n",
196
  "#@markdown ![Example Image](https://huggingface.co/Anthonyg5005/hf-scripts/resolve/main/ipynb/jsonl-example.jpg \"File extension is .jsonl\")\\\n",
197
  "#@markdown pippa is used as an example in this image. This dataset contains content that is not suitable for users under 18. This dataset also may or may not be against colab TOS. It won't be allowed under free colab usage although you're able to use it with paid compute units.\n",