Commit
·
d7a6ff4
1
Parent(s):
f949aa9
fix: Update batching logic
Browse files
src/distilabel_dataset_generator/apps/sft.py
CHANGED
@@ -92,31 +92,37 @@ def generate_dataset(
|
|
92 |
batch_size = DEFAULT_BATCH_SIZE
|
93 |
|
94 |
# create instructions
|
|
|
95 |
magpie_results = []
|
96 |
-
|
97 |
progress(
|
98 |
-
0.5 *
|
99 |
total=total_steps,
|
100 |
desc="(1/2) Generating instructions",
|
101 |
)
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
|
|
104 |
progress(0.5, desc="(1/2) Generating instructions")
|
105 |
|
106 |
# generate responses
|
|
|
107 |
response_results = []
|
108 |
if num_turns == 1:
|
109 |
-
|
110 |
progress(
|
111 |
-
0.5 + 0.5 *
|
112 |
total=total_steps,
|
113 |
desc="(2/2) Generating responses",
|
114 |
)
|
115 |
-
batch = magpie_results[
|
116 |
-
batch = [entry[0] for entry in batch]
|
117 |
responses = list(response_generator.process(inputs=batch))
|
118 |
-
response_results.extend(responses)
|
119 |
-
|
|
|
120 |
result["prompt"] = result["instruction"]
|
121 |
result["completion"] = result["generation"]
|
122 |
result["system_prompt"] = system_prompt
|
@@ -126,18 +132,17 @@ def generate_dataset(
|
|
126 |
0, {"role": "system", "content": system_prompt}
|
127 |
)
|
128 |
result[0]["messages"] = result[0]["conversation"]
|
129 |
-
|
130 |
progress(
|
131 |
-
0.5 + 0.5 *
|
132 |
total=total_steps,
|
133 |
desc="(2/2) Generating responses",
|
134 |
)
|
135 |
-
batch = magpie_results[
|
136 |
-
batch = [entry[0] for entry in batch]
|
137 |
responses = list(response_generator.process(inputs=batch))
|
138 |
-
response_results.extend(responses)
|
139 |
-
|
140 |
-
for result in response_results
|
141 |
result["messages"].append(
|
142 |
{"role": "assistant", "content": result["generation"]}
|
143 |
)
|
@@ -149,7 +154,7 @@ def generate_dataset(
|
|
149 |
|
150 |
# create distiset
|
151 |
distiset_results = []
|
152 |
-
for result in response_results
|
153 |
record = {}
|
154 |
for relevant_keys in [
|
155 |
"messages",
|
|
|
92 |
batch_size = DEFAULT_BATCH_SIZE
|
93 |
|
94 |
# create instructions
|
95 |
+
n_processed = 0
|
96 |
magpie_results = []
|
97 |
+
while n_processed < num_rows:
|
98 |
progress(
|
99 |
+
0.5 * n_processed / num_rows,
|
100 |
total=total_steps,
|
101 |
desc="(1/2) Generating instructions",
|
102 |
)
|
103 |
+
remaining_rows = num_rows - n_processed
|
104 |
+
batch_size = min(batch_size, remaining_rows)
|
105 |
+
inputs = [{"system_prompt": system_prompt} for _ in range(batch_size)]
|
106 |
+
batch = list(magpie_generator.process(inputs=inputs))
|
107 |
+
magpie_results.extend(batch[0])
|
108 |
+
n_processed += batch_size
|
109 |
progress(0.5, desc="(1/2) Generating instructions")
|
110 |
|
111 |
# generate responses
|
112 |
+
n_processed = 0
|
113 |
response_results = []
|
114 |
if num_turns == 1:
|
115 |
+
while n_processed < num_rows:
|
116 |
progress(
|
117 |
+
0.5 + 0.5 * n_processed / num_rows,
|
118 |
total=total_steps,
|
119 |
desc="(2/2) Generating responses",
|
120 |
)
|
121 |
+
batch = magpie_results[n_processed : n_processed + batch_size]
|
|
|
122 |
responses = list(response_generator.process(inputs=batch))
|
123 |
+
response_results.extend(responses[0])
|
124 |
+
n_processed += batch_size
|
125 |
+
for result in response_results:
|
126 |
result["prompt"] = result["instruction"]
|
127 |
result["completion"] = result["generation"]
|
128 |
result["system_prompt"] = system_prompt
|
|
|
132 |
0, {"role": "system", "content": system_prompt}
|
133 |
)
|
134 |
result[0]["messages"] = result[0]["conversation"]
|
135 |
+
while n_processed < num_rows:
|
136 |
progress(
|
137 |
+
0.5 + 0.5 * n_processed / num_rows,
|
138 |
total=total_steps,
|
139 |
desc="(2/2) Generating responses",
|
140 |
)
|
141 |
+
batch = magpie_results[n_processed : n_processed + batch_size]
|
|
|
142 |
responses = list(response_generator.process(inputs=batch))
|
143 |
+
response_results.extend(responses[0])
|
144 |
+
n_processed += batch_size
|
145 |
+
for result in response_results:
|
146 |
result["messages"].append(
|
147 |
{"role": "assistant", "content": result["generation"]}
|
148 |
)
|
|
|
154 |
|
155 |
# create distiset
|
156 |
distiset_results = []
|
157 |
+
for result in response_results:
|
158 |
record = {}
|
159 |
for relevant_keys in [
|
160 |
"messages",
|
src/distilabel_dataset_generator/pipelines/sft.py
CHANGED
@@ -4,7 +4,7 @@ from distilabel.distiset import Distiset
|
|
4 |
from distilabel.llms import InferenceEndpointsLLM
|
5 |
from distilabel.pipeline import Pipeline
|
6 |
from distilabel.steps import KeepColumns
|
7 |
-
from distilabel.steps.tasks import ChatGeneration,
|
8 |
|
9 |
from src.distilabel_dataset_generator.utils import HF_TOKENS
|
10 |
|
@@ -221,7 +221,7 @@ def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
|
|
221 |
input_mappings = _get_output_mappings(num_turns)
|
222 |
output_mappings = input_mappings.copy()
|
223 |
if num_turns == 1:
|
224 |
-
magpie_generator =
|
225 |
llm=InferenceEndpointsLLM(
|
226 |
model_id=MODEL,
|
227 |
tokenizer_id=MODEL,
|
@@ -234,15 +234,13 @@ def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
|
|
234 |
"stop_sequences": _STOP_SEQUENCES,
|
235 |
},
|
236 |
),
|
237 |
-
batch_size=DEFAULT_BATCH_SIZE,
|
238 |
n_turns=num_turns,
|
239 |
-
num_rows=num_rows,
|
240 |
system_prompt=system_prompt,
|
241 |
output_mappings=output_mappings,
|
242 |
only_instruction=True,
|
243 |
)
|
244 |
else:
|
245 |
-
magpie_generator =
|
246 |
llm=InferenceEndpointsLLM(
|
247 |
model_id=MODEL,
|
248 |
tokenizer_id=MODEL,
|
@@ -255,10 +253,8 @@ def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
|
|
255 |
"stop_sequences": _STOP_SEQUENCES,
|
256 |
},
|
257 |
),
|
258 |
-
batch_size=DEFAULT_BATCH_SIZE,
|
259 |
end_with_user=True,
|
260 |
n_turns=num_turns,
|
261 |
-
num_rows=num_rows,
|
262 |
system_prompt=system_prompt,
|
263 |
output_mappings=output_mappings,
|
264 |
)
|
|
|
4 |
from distilabel.llms import InferenceEndpointsLLM
|
5 |
from distilabel.pipeline import Pipeline
|
6 |
from distilabel.steps import KeepColumns
|
7 |
+
from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
|
8 |
|
9 |
from src.distilabel_dataset_generator.utils import HF_TOKENS
|
10 |
|
|
|
221 |
input_mappings = _get_output_mappings(num_turns)
|
222 |
output_mappings = input_mappings.copy()
|
223 |
if num_turns == 1:
|
224 |
+
magpie_generator = Magpie(
|
225 |
llm=InferenceEndpointsLLM(
|
226 |
model_id=MODEL,
|
227 |
tokenizer_id=MODEL,
|
|
|
234 |
"stop_sequences": _STOP_SEQUENCES,
|
235 |
},
|
236 |
),
|
|
|
237 |
n_turns=num_turns,
|
|
|
238 |
system_prompt=system_prompt,
|
239 |
output_mappings=output_mappings,
|
240 |
only_instruction=True,
|
241 |
)
|
242 |
else:
|
243 |
+
magpie_generator = Magpie(
|
244 |
llm=InferenceEndpointsLLM(
|
245 |
model_id=MODEL,
|
246 |
tokenizer_id=MODEL,
|
|
|
253 |
"stop_sequences": _STOP_SEQUENCES,
|
254 |
},
|
255 |
),
|
|
|
256 |
end_with_user=True,
|
257 |
n_turns=num_turns,
|
|
|
258 |
system_prompt=system_prompt,
|
259 |
output_mappings=output_mappings,
|
260 |
)
|