davidberenstein1957 committed
Commit 371c76b
Parent: 3c6a88c

Update version to 0.1.6, remove requirements.txt, and enhance dataset handling in pipelines. Add Gradio support and improve LLM class retrieval. Comment out HF_TOKEN in the example deployment script.

pyproject.toml CHANGED
@@ -1,6 +1,6 @@
 [project]
 name = "synthetic-dataset-generator"
-version = "0.1.5"
+version = "0.1.6"
 description = "Build datasets using natural language"
 authors = [
     {name = "davidberenstein1957", email = "david.m.berenstein@gmail.com"},
requirements.txt DELETED
@@ -1 +0,0 @@
--e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator
src/synthetic_dataset_generator/_distiset.py CHANGED
@@ -2,6 +2,7 @@ from typing import Optional
 
 import distilabel
 import distilabel.distiset
+import gradio as gr
 from distilabel.utils.card.dataset_card import (
     DistilabelDatasetCard,
     size_categories_parser,
@@ -81,14 +82,23 @@ class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
             dataset[0] if not isinstance(dataset, dict) else dataset["train"][0]
         )
 
-        keys = list(sample_records.keys())
-        if len(keys) != 2 or not (
-            ("label" in keys and "text" in keys)
-            or ("labels" in keys and "text" in keys)
+        columns = self["default"].column_names
+        columns = self["default"].column_names
+
+        if ("label" in columns and "text" in columns) or (
+            "labels" in columns and "text" in columns
         ):
             task_categories = ["text-classification"]
-        elif "prompt" in keys or "messages" in keys:
-            task_categories = ["text-generation", "text2text-generation"]
+        elif ("prompt" in columns and "completion" in columns) or (
+            "messages" in columns
+        ):
+            task_categories: list[str] = ["text-generation", "text2text-generation"]
+        else:
+            task_categories: list[str] = []
+            gr.Info(
+                f"No task categories found for dataset with columns: {columns}. "
+                "Please notify the distilabel team if you think this is an error."
+            )
 
         readme_metadata = {}
         if repo_id and token:
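
For reference, the new column-based detection reads as the following standalone sketch; `infer_task_categories` is a hypothetical name for illustration only, since in the commit the logic runs inline inside `CustomDistisetWithAdditionalTag`:

# Hypothetical standalone version of the inlined detection logic above.
def infer_task_categories(columns: list[str]) -> list[str]:
    # "label"/"labels" alongside "text" -> a text-classification dataset
    if ("label" in columns and "text" in columns) or (
        "labels" in columns and "text" in columns
    ):
        return ["text-classification"]
    # "prompt" + "completion", or a "messages" column -> a generation dataset
    if ("prompt" in columns and "completion" in columns) or "messages" in columns:
        return ["text-generation", "text2text-generation"]
    # anything else: no tag; the app surfaces a gr.Info notification instead
    return []


assert infer_task_categories(["text", "label"]) == ["text-classification"]
assert infer_task_categories(["messages"]) == ["text-generation", "text2text-generation"]
assert infer_task_categories(["id", "score"]) == []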
src/synthetic_dataset_generator/pipelines/base.py CHANGED
@@ -1,7 +1,6 @@
 import math
 import random
 
-import gradio as gr
 from distilabel.llms import ClientvLLM, InferenceEndpointsLLM, OllamaLLM, OpenAILLM
 from distilabel.steps.tasks import TextGeneration
 
@@ -9,7 +8,6 @@ from synthetic_dataset_generator.constants import (
     API_KEYS,
     DEFAULT_BATCH_SIZE,
     HUGGINGFACE_BASE_URL,
-    MAGPIE_PRE_QUERY_TEMPLATE,
     MODEL,
     OLLAMA_BASE_URL,
     OPENAI_BASE_URL,
@@ -62,6 +60,19 @@ def get_rewriten_prompts(prompt: str, num_rows: int):
     return prompt_rewrites
 
 
+def _get_llm_class() -> str:
+    if OPENAI_BASE_URL:
+        return "OpenAILLM"
+    elif OLLAMA_BASE_URL:
+        return "OllamaLLM"
+    elif HUGGINGFACE_BASE_URL:
+        return "InferenceEndpointsLLM"
+    elif VLLM_BASE_URL:
+        return "ClientvLLM"
+    else:
+        return "InferenceEndpointsLLM"
+
+
 def _get_llm(use_magpie_template=False, **kwargs):
     if OPENAI_BASE_URL:
         llm = OpenAILLM(
@@ -100,6 +111,7 @@ def _get_llm(use_magpie_template=False, **kwargs):
             model=MODEL,
             host=OLLAMA_BASE_URL,
             tokenizer_id=TOKENIZER_ID or MODEL,
+            use_magpie_template=use_magpie_template,
             **kwargs,
         )
     elif HUGGINGFACE_BASE_URL:
@@ -108,6 +120,7 @@ def _get_llm(use_magpie_template=False, **kwargs):
             api_key=_get_next_api_key(),
             base_url=HUGGINGFACE_BASE_URL,
             tokenizer_id=TOKENIZER_ID or MODEL,
+            use_magpie_template=use_magpie_template,
             **kwargs,
         )
     elif VLLM_BASE_URL:
@@ -119,6 +132,7 @@ def _get_llm(use_magpie_template=False, **kwargs):
             model=MODEL,
             tokenizer=TOKENIZER_ID or MODEL,
             api_key=_get_next_api_key(),
+            use_magpie_template=use_magpie_template,
             **kwargs,
         )
     else:
@@ -126,7 +140,7 @@ def _get_llm(use_magpie_template=False, **kwargs):
             api_key=_get_next_api_key(),
             tokenizer_id=TOKENIZER_ID or MODEL,
             model_id=MODEL,
-            magpie_pre_query_template=MAGPIE_PRE_QUERY_TEMPLATE,
+            use_magpie_template=use_magpie_template,
             **kwargs,
         )
 
@@ -138,4 +152,4 @@ try:
     llm.load()
     llm.generate([[{"content": "Hello, world!", "role": "user"}]])
 except Exception as e:
-    gr.Error(f"Error loading {llm.__class__.__name__}: {e}")
+    raise Exception(f"Error loading {llm.__class__.__name__}: {e}")
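
Taken together, `_get_llm_class()` returns the class name as a string while `_get_llm()` returns a configured instance, so the snippet templates below can embed a serialized copy of the active LLM. A minimal sketch of how such a template renders; `llm_class` and `llm_json` are stand-ins for `_get_llm_class()` and `_get_llm().model_dump_json()`, not real output:

# Sketch of the template rendering used by the generated snippets below.
# Real values depend on which backend (OpenAI, Ollama, HF, vLLM) is configured.
llm_class = "InferenceEndpointsLLM"
llm_json = '{"model_id": "my-model", "generation_kwargs": {}}'

# The JSON string is interpolated raw, so it appears in the generated code
# as a dict literal passed to from_json().
snippet = f"llm={llm_class}.from_json({llm_json}),"
print(snippet)
# llm=InferenceEndpointsLLM.from_json({"model_id": "my-model", "generation_kwargs": {}}),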
src/synthetic_dataset_generator/pipelines/chat.py CHANGED
@@ -1,12 +1,10 @@
 from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
 
 from synthetic_dataset_generator.constants import (
-    BASE_URL,
     MAGPIE_PRE_QUERY_TEMPLATE,
     MAX_NUM_TOKENS,
-    MODEL,
 )
-from synthetic_dataset_generator.pipelines.base import _get_llm
+from synthetic_dataset_generator.pipelines.base import _get_llm, _get_llm_class
 
 INFORMATION_SEEKING_PROMPT = (
     "You are an AI assistant designed to provide accurate and concise information on a wide"
@@ -237,28 +235,13 @@ import os
 from distilabel.pipeline import Pipeline
 from distilabel.steps import KeepColumns
 from distilabel.steps.tasks import MagpieGenerator
-from distilabel.llms import InferenceEndpointsLLM
+from distilabel.llms import {_get_llm_class()}
 
-MODEL = "{MODEL}"
-BASE_URL = "{BASE_URL}"
 SYSTEM_PROMPT = "{system_prompt}"
-os.environ["API_KEY"] = "hf_xxx"  # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
 
 with Pipeline(name="sft") as pipeline:
     magpie = MagpieGenerator(
-        llm=InferenceEndpointsLLM(
-            model_id=MODEL,
-            tokenizer_id=MODEL,
-            base_url=BASE_URL,
-            magpie_pre_query_template="llama3",
-            generation_kwargs={{
-                "temperature": {temperature},
-                "do_sample": True,
-                "max_new_tokens": {MAX_NUM_TOKENS},
-                "stop_sequences": {_STOP_SEQUENCES}
-            }},
-            api_key=os.environ["API_KEY"],
-        ),
+        llm={_get_llm_class()}.from_json({_get_llm().model_dump_json()}),
         n_turns={num_turns},
         num_rows={num_rows},
         batch_size=1,
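
With the placeholders filled in, the generated sft snippet now starts roughly like this. The values are illustrative, and the `from_json` payload is a stand-in for whatever `_get_llm().model_dump_json()` emits at runtime:

# Illustrative rendering of the new sft template; not output from a real run.
from distilabel.llms import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns
from distilabel.steps.tasks import MagpieGenerator

SYSTEM_PROMPT = "You are a helpful assistant."

with Pipeline(name="sft") as pipeline:
    magpie = MagpieGenerator(
        # The serialized LLM config is interpolated here as a dict literal.
        llm=InferenceEndpointsLLM.from_json({"model_id": "my-model"}),
        n_turns=1,
        num_rows=10,
        batch_size=1,
    )
    # ... remaining steps of the template are unchanged and elided here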
src/synthetic_dataset_generator/pipelines/textcat.py CHANGED
@@ -9,11 +9,9 @@ from distilabel.steps.tasks import (
 from pydantic import BaseModel, Field
 
 from synthetic_dataset_generator.constants import (
-    BASE_URL,
     MAX_NUM_TOKENS,
-    MODEL,
 )
-from synthetic_dataset_generator.pipelines.base import _get_llm
+from synthetic_dataset_generator.pipelines.base import _get_llm, _get_llm_class
 from synthetic_dataset_generator.utils import get_preprocess_labels
 
 PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
@@ -131,39 +129,21 @@ def generate_pipeline_code(
     temperature: float = 0.9,
 ) -> str:
     labels = get_preprocess_labels(labels)
-    MODEL_ARG = "model_id" if BASE_URL else "model"
-    MODEL_CLASS = "InferenceEndpointsLLM" if BASE_URL else "OpenAILLM"
     base_code = f"""
 # Requirements: `pip install distilabel[hf-inference-endpoints]`
 import os
 import random
-from distilabel.llms import InferenceEndpointsLLM
+from distilabel.llms import {_get_llm_class()}
 from distilabel.pipeline import Pipeline
 from distilabel.steps import LoadDataFromDicts, KeepColumns
 from distilabel.steps.tasks import {"GenerateTextClassificationData" if num_labels == 1 else "GenerateTextClassificationData, TextClassification"}
 
-MODEL = "{MODEL}"
-BASE_URL = "{BASE_URL}"
-TEXT_CLASSIFICATION_TASK = "{system_prompt}"
-os.environ["API_KEY"] = (
-    "hf_xxx"  # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
-)
-
 with Pipeline(name="textcat") as pipeline:
 
     task_generator = LoadDataFromDicts(data=[{{"task": TEXT_CLASSIFICATION_TASK}}])
 
     textcat_generation = GenerateTextClassificationData(
-        llm={MODEL_CLASS}(
-            {MODEL_ARG}=MODEL,
-            base_url=BASE_URL,
-            api_key=os.environ["API_KEY"],
-            generation_kwargs={{
-                "temperature": {temperature},
-                "max_new_tokens": {MAX_NUM_TOKENS},
-                "top_p": 0.95,
-            }},
-        ),
+        llm={_get_llm_class()}.from_json({_get_llm().model_dump_json()}),
         seed=random.randint(0, 2**32 - 1),
         difficulty={None if difficulty == "mixed" else repr(difficulty)},
         clarity={None if clarity == "mixed" else repr(clarity)},
@@ -196,15 +176,7 @@ with Pipeline(name="textcat") as pipeline:
     )
 
     textcat_labeller = TextClassification(
-        llm={MODEL_CLASS}(
-            {MODEL_ARG}=MODEL,
-            base_url=BASE_URL,
-            api_key=os.environ["API_KEY"],
-            generation_kwargs={{
-                "temperature": 0.8,
-                "max_new_tokens": {MAX_NUM_TOKENS},
-            }},
-        ),
+        llm={_get_llm_class()}.from_json({_get_llm().model_dump_json()}),
         n={num_labels},
         available_labels={labels},
         context=TEXT_CLASSIFICATION_TASK,
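
The textcat template renders the same way. An illustrative instance follows, with stand-in values; `TEXT_CLASSIFICATION_TASK` is defined here as a sample prompt, since the template interpolates it from the app's system prompt:

# Illustrative rendering of the new textcat template; not output from a real run.
import random

from distilabel.llms import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts
from distilabel.steps.tasks import GenerateTextClassificationData

TEXT_CLASSIFICATION_TASK = "Classify customer reviews as positive or negative."

with Pipeline(name="textcat") as pipeline:
    task_generator = LoadDataFromDicts(data=[{"task": TEXT_CLASSIFICATION_TASK}])

    textcat_generation = GenerateTextClassificationData(
        # Serialized LLM config interpolated as a dict literal, as in the template.
        llm=InferenceEndpointsLLM.from_json({"model_id": "my-model"}),
        seed=random.randint(0, 2**32 - 1),
        difficulty=None,  # "mixed" in the UI maps to None
        clarity=None,
    )
    # ... labelling step and column wiring elided, as in the template above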