Commit
•
d982700
1
Parent(s):
dd0124d
update failed token message
Browse files
pdm.lock
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
CHANGED
@@ -19,11 +19,11 @@ license = {text = "Apache 2"}
|
|
19 |
|
20 |
dependencies = [
|
21 |
"distilabel[hf-inference-endpoints,argilla,outlines,instructor]>=1.4.1",
|
22 |
-
"gradio[oauth]
|
23 |
"transformers>=4.44.2",
|
24 |
"sentence-transformers>=3.2.0",
|
25 |
"model2vec>=0.2.4",
|
26 |
-
"gradio-huggingfacehub-search>=0.0.
|
27 |
"argilla>=2.4.0",
|
28 |
]
|
29 |
|
|
|
19 |
|
20 |
dependencies = [
|
21 |
"distilabel[hf-inference-endpoints,argilla,outlines,instructor]>=1.4.1",
|
22 |
+
"gradio[oauth]>=5.4.0",
|
23 |
"transformers>=4.44.2",
|
24 |
"sentence-transformers>=3.2.0",
|
25 |
"model2vec>=0.2.4",
|
26 |
+
"gradio-huggingfacehub-search>=0.0.12",
|
27 |
"argilla>=2.4.0",
|
28 |
]
|
29 |
|
src/synthetic_dataset_generator/apps/eval.py
CHANGED
@@ -739,7 +739,6 @@ with gr.Blocks() as app:
|
|
739 |
dataframe = gr.Dataframe(
|
740 |
headers=["prompt", "completion", "evaluation"],
|
741 |
wrap=True,
|
742 |
-
height=500,
|
743 |
interactive=False,
|
744 |
elem_classes="table-view",
|
745 |
)
|
|
|
739 |
dataframe = gr.Dataframe(
|
740 |
headers=["prompt", "completion", "evaluation"],
|
741 |
wrap=True,
|
|
|
742 |
interactive=False,
|
743 |
elem_classes="table-view",
|
744 |
)
|
src/synthetic_dataset_generator/apps/sft.py
CHANGED
@@ -15,7 +15,11 @@ from synthetic_dataset_generator.apps.base import (
|
|
15 |
validate_argilla_user_workspace_dataset,
|
16 |
validate_push_to_hub,
|
17 |
)
|
18 |
-
from synthetic_dataset_generator.constants import
|
|
|
|
|
|
|
|
|
19 |
from synthetic_dataset_generator.pipelines.embeddings import (
|
20 |
get_embeddings,
|
21 |
get_sentence_embedding_dimensions,
|
@@ -82,7 +86,6 @@ def _get_dataframe():
|
|
82 |
return gr.Dataframe(
|
83 |
headers=["prompt", "completion"],
|
84 |
wrap=True,
|
85 |
-
height=500,
|
86 |
interactive=False,
|
87 |
elem_classes="table-view",
|
88 |
)
|
@@ -97,8 +100,12 @@ def generate_dataset(
|
|
97 |
progress=gr.Progress(),
|
98 |
) -> pd.DataFrame:
|
99 |
progress(0.0, desc="(1/2) Generating instructions")
|
100 |
-
magpie_generator = get_magpie_generator(
|
101 |
-
|
|
|
|
|
|
|
|
|
102 |
total_steps: int = num_rows * 2
|
103 |
batch_size = DEFAULT_BATCH_SIZE
|
104 |
|
@@ -520,7 +527,7 @@ with gr.Blocks() as app:
|
|
520 |
num_turns,
|
521 |
num_rows,
|
522 |
private,
|
523 |
-
temperature
|
524 |
],
|
525 |
outputs=[success_message],
|
526 |
show_progress=True,
|
|
|
15 |
validate_argilla_user_workspace_dataset,
|
16 |
validate_push_to_hub,
|
17 |
)
|
18 |
+
from synthetic_dataset_generator.constants import (
|
19 |
+
DEFAULT_BATCH_SIZE,
|
20 |
+
MODEL,
|
21 |
+
SFT_AVAILABLE,
|
22 |
+
)
|
23 |
from synthetic_dataset_generator.pipelines.embeddings import (
|
24 |
get_embeddings,
|
25 |
get_sentence_embedding_dimensions,
|
|
|
86 |
return gr.Dataframe(
|
87 |
headers=["prompt", "completion"],
|
88 |
wrap=True,
|
|
|
89 |
interactive=False,
|
90 |
elem_classes="table-view",
|
91 |
)
|
|
|
100 |
progress=gr.Progress(),
|
101 |
) -> pd.DataFrame:
|
102 |
progress(0.0, desc="(1/2) Generating instructions")
|
103 |
+
magpie_generator = get_magpie_generator(
|
104 |
+
system_prompt, num_turns, temperature, is_sample
|
105 |
+
)
|
106 |
+
response_generator = get_response_generator(
|
107 |
+
system_prompt, num_turns, temperature, is_sample
|
108 |
+
)
|
109 |
total_steps: int = num_rows * 2
|
110 |
batch_size = DEFAULT_BATCH_SIZE
|
111 |
|
|
|
527 |
num_turns,
|
528 |
num_rows,
|
529 |
private,
|
530 |
+
temperature,
|
531 |
],
|
532 |
outputs=[success_message],
|
533 |
show_progress=True,
|
src/synthetic_dataset_generator/apps/textcat.py
CHANGED
@@ -39,7 +39,6 @@ def _get_dataframe():
|
|
39 |
return gr.Dataframe(
|
40 |
headers=["labels", "text"],
|
41 |
wrap=True,
|
42 |
-
height=500,
|
43 |
interactive=False,
|
44 |
elem_classes="table-view",
|
45 |
)
|
@@ -96,7 +95,10 @@ def generate_dataset(
|
|
96 |
progress(0.0, desc="(1/2) Generating text classification data")
|
97 |
labels = get_preprocess_labels(labels)
|
98 |
textcat_generator = get_textcat_generator(
|
99 |
-
difficulty=difficulty,
|
|
|
|
|
|
|
100 |
)
|
101 |
labeller_generator = get_labeller_generator(
|
102 |
system_prompt=f"{system_prompt} {', '.join(labels)}",
|
@@ -541,7 +543,7 @@ with gr.Blocks() as app:
|
|
541 |
num_rows,
|
542 |
labels,
|
543 |
private,
|
544 |
-
temperature
|
545 |
],
|
546 |
outputs=[success_message],
|
547 |
show_progress=True,
|
@@ -558,7 +560,7 @@ with gr.Blocks() as app:
|
|
558 |
labels,
|
559 |
num_labels,
|
560 |
num_rows,
|
561 |
-
temperature
|
562 |
],
|
563 |
outputs=[pipeline_code],
|
564 |
).success(
|
|
|
39 |
return gr.Dataframe(
|
40 |
headers=["labels", "text"],
|
41 |
wrap=True,
|
|
|
42 |
interactive=False,
|
43 |
elem_classes="table-view",
|
44 |
)
|
|
|
95 |
progress(0.0, desc="(1/2) Generating text classification data")
|
96 |
labels = get_preprocess_labels(labels)
|
97 |
textcat_generator = get_textcat_generator(
|
98 |
+
difficulty=difficulty,
|
99 |
+
clarity=clarity,
|
100 |
+
temperature=temperature,
|
101 |
+
is_sample=is_sample,
|
102 |
)
|
103 |
labeller_generator = get_labeller_generator(
|
104 |
system_prompt=f"{system_prompt} {', '.join(labels)}",
|
|
|
543 |
num_rows,
|
544 |
labels,
|
545 |
private,
|
546 |
+
temperature,
|
547 |
],
|
548 |
outputs=[success_message],
|
549 |
show_progress=True,
|
|
|
560 |
labels,
|
561 |
num_labels,
|
562 |
num_rows,
|
563 |
+
temperature,
|
564 |
],
|
565 |
outputs=[pipeline_code],
|
566 |
).success(
|
src/synthetic_dataset_generator/utils.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import json
|
|
|
2 |
from typing import List, Optional, Union
|
3 |
|
4 |
import argilla as rg
|
@@ -38,9 +39,15 @@ def list_orgs(oauth_token: Union[OAuthToken, None] = None):
|
|
38 |
organizations = [org for org in organizations if org != data["name"]]
|
39 |
organizations = [data["name"]] + organizations
|
40 |
except Exception as e:
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
43 |
)
|
|
|
|
|
44 |
return organizations
|
45 |
|
46 |
|
|
|
1 |
import json
|
2 |
+
import warnings
|
3 |
from typing import List, Optional, Union
|
4 |
|
5 |
import argilla as rg
|
|
|
39 |
organizations = [org for org in organizations if org != data["name"]]
|
40 |
organizations = [data["name"]] + organizations
|
41 |
except Exception as e:
|
42 |
+
data = whoami(oauth_token.token)
|
43 |
+
warnings.warn(str(e))
|
44 |
+
gr.Info(
|
45 |
+
"Your user token does not have the necessary permissions to push to organizations."
|
46 |
+
"Please check your OAuth permissions in https://huggingface.co/settings/connected-applications."
|
47 |
+
"Update yout token permissions to include repo.write: https://huggingface.co/settings/tokens."
|
48 |
)
|
49 |
+
return [data["name"]]
|
50 |
+
|
51 |
return organizations
|
52 |
|
53 |
|