machineuser committed
Commit a6b2d88
1 Parent(s): 27a1c11

Sync widgets demo

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. packages/tasks/scripts/inference-codegen.ts +1 -1
  2. packages/tasks/src/tasks/audio-classification/inference.ts +3 -3
  3. packages/tasks/src/tasks/audio-classification/spec/input.json +2 -2
  4. packages/tasks/src/tasks/audio-classification/spec/output.json +1 -12
  5. packages/tasks/src/tasks/automatic-speech-recognition/inference.ts +34 -29
  6. packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json +1 -1
  7. packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json +30 -28
  8. packages/tasks/src/tasks/common-definitions.json +25 -17
  9. packages/tasks/src/tasks/depth-estimation/inference.ts +1 -1
  10. packages/tasks/src/tasks/document-question-answering/inference.ts +15 -7
  11. packages/tasks/src/tasks/document-question-answering/spec/input.json +7 -7
  12. packages/tasks/src/tasks/document-question-answering/spec/output.json +2 -2
  13. packages/tasks/src/tasks/fill-mask/inference.ts +3 -2
  14. packages/tasks/src/tasks/fill-mask/spec/input.json +1 -1
  15. packages/tasks/src/tasks/fill-mask/spec/output.json +1 -1
  16. packages/tasks/src/tasks/image-classification/inference.ts +2 -2
  17. packages/tasks/src/tasks/image-classification/spec/input.json +2 -2
  18. packages/tasks/src/tasks/image-segmentation/inference.ts +2 -2
  19. packages/tasks/src/tasks/image-segmentation/spec/input.json +2 -2
  20. packages/tasks/src/tasks/image-to-image/inference.ts +4 -4
  21. packages/tasks/src/tasks/image-to-image/spec/input.json +4 -4
  22. packages/tasks/src/tasks/image-to-text/inference.ts +24 -19
  23. packages/tasks/src/tasks/image-to-text/spec/input.json +1 -1
  24. packages/tasks/src/tasks/image-to-text/spec/output.json +8 -11
  25. packages/tasks/src/tasks/placeholder/spec/input.json +2 -2
  26. packages/tasks/src/tasks/placeholder/spec/output.json +1 -1
  27. packages/tasks/src/tasks/question-answering/inference.ts +7 -7
  28. packages/tasks/src/tasks/question-answering/spec/input.json +7 -7
  29. packages/tasks/src/tasks/summarization/inference.ts +4 -3
  30. packages/tasks/src/tasks/text-classification/inference.ts +2 -2
  31. packages/tasks/src/tasks/text-classification/spec/input.json +2 -2
  32. packages/tasks/src/tasks/text-generation/inference.ts +14 -11
  33. packages/tasks/src/tasks/text-generation/spec/input.json +8 -8
  34. packages/tasks/src/tasks/text-generation/spec/output.json +8 -11
  35. packages/tasks/src/tasks/text-to-audio/inference.ts +23 -18
  36. packages/tasks/src/tasks/text-to-audio/spec/output.json +10 -13
  37. packages/tasks/src/tasks/text-to-image/inference.ts +5 -7
  38. packages/tasks/src/tasks/text-to-image/spec/input.json +4 -4
  39. packages/tasks/src/tasks/text-to-image/spec/output.json +7 -9
  40. packages/tasks/src/tasks/text-to-speech/inference.ts +17 -16
  41. packages/tasks/src/tasks/text2text-generation/inference.ts +9 -7
  42. packages/tasks/src/tasks/text2text-generation/spec/input.json +2 -2
  43. packages/tasks/src/tasks/text2text-generation/spec/output.json +8 -11
  44. packages/tasks/src/tasks/token-classification/inference.ts +3 -3
  45. packages/tasks/src/tasks/token-classification/spec/input.json +2 -2
  46. packages/tasks/src/tasks/token-classification/spec/output.json +1 -1
  47. packages/tasks/src/tasks/translation/inference.ts +4 -3
  48. packages/tasks/src/tasks/video-classification/inference.ts +4 -4
  49. packages/tasks/src/tasks/video-classification/spec/input.json +4 -4
  50. packages/tasks/src/tasks/visual-question-answering/inference.ts +1 -1
packages/tasks/scripts/inference-codegen.ts CHANGED
@@ -57,7 +57,7 @@ async function generateTypescript(inputData: InputData): Promise<SerializedRende
 		indentation: "\t",
 		rendererOptions: {
 			"just-types": true,
-			"nice-property-names": true,
+			"nice-property-names": false,
 			"prefer-unions": true,
 			"prefer-const-values": true,
 			"prefer-unknown": true,
packages/tasks/src/tasks/audio-classification/inference.ts CHANGED
@@ -23,11 +23,11 @@ export interface AudioClassificationInput {
  * Additional inference parameters for Audio Classification
  */
 export interface AudioClassificationParameters {
-	functionToApply?: ClassificationOutputTransform;
+	function_to_apply?: ClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
-	topK?: number;
+	top_k?: number;
 	[property: string]: unknown;
 }
 /**
@@ -40,7 +40,7 @@ export type AudioClassificationOutput = AudioClassificationOutputElement[];
  */
 export interface AudioClassificationOutputElement {
 	/**
-	 * The predicted class label (model specific).
+	 * The predicted class label.
 	 */
 	label: string;
 	/**
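A usage sketch of the renamed parameters (values are illustrative; "softmax" is assumed to be one of the allowed ClassificationOutputTransform values, which this hunk does not show):

    import type { AudioClassificationParameters } from "./inference";

    const parameters: AudioClassificationParameters = {
        function_to_apply: "softmax", // was functionToApply
        top_k: 3, // was topK
    };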
packages/tasks/src/tasks/audio-classification/spec/input.json CHANGED
@@ -19,11 +19,11 @@
 		"description": "Additional inference parameters for Audio Classification",
 		"type": "object",
 		"properties": {
-			"functionToApply": {
+			"function_to_apply": {
 				"title": "AudioClassificationOutputTransform",
 				"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
 			},
-			"topK": {
+			"top_k": {
 				"type": "integer",
 				"description": "When specified, limits the output to the top K most probable classes."
 			}
packages/tasks/src/tasks/audio-classification/spec/output.json CHANGED
@@ -5,17 +5,6 @@
 	"description": "Outputs for Audio Classification inference",
 	"type": "array",
 	"items": {
-		"type": "object",
-		"properties": {
-			"label": {
-				"type": "string",
-				"description": "The predicted class label (model specific)."
-			},
-			"score": {
-				"type": "number",
-				"description": "The corresponding probability."
-			}
-		},
-		"required": ["label", "score"]
+		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }
packages/tasks/src/tasks/automatic-speech-recognition/inference.ts CHANGED
@@ -3,6 +3,7 @@
  *
  * Using src/scripts/inference-codegen
  */
+
 /**
  * Inputs for Automatic Speech Recognition inference
  */
@@ -17,6 +18,7 @@ export interface AutomaticSpeechRecognitionInput {
 	parameters?: AutomaticSpeechRecognitionParameters;
 	[property: string]: unknown;
 }
+
 /**
  * Additional inference parameters
  *
@@ -30,9 +32,10 @@ export interface AutomaticSpeechRecognitionParameters {
 	/**
 	 * Whether to output corresponding timestamps with the generated text
 	 */
-	returnTimestamps?: boolean;
+	return_timestamps?: boolean;
 	[property: string]: unknown;
 }
+
 /**
  * Parametrization of the text generation process
  *
@@ -42,18 +45,18 @@ export interface GenerationParameters {
 	/**
 	 * Whether to use sampling instead of greedy decoding when generating new tokens.
 	 */
-	doSample?: boolean;
+	do_sample?: boolean;
 	/**
 	 * Controls the stopping condition for beam-based methods.
 	 */
-	earlyStopping?: EarlyStoppingUnion;
+	early_stopping?: EarlyStoppingUnion;
 	/**
 	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
 	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
 	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
 	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
 	 */
-	epsilonCutoff?: number;
+	epsilon_cutoff?: number;
 	/**
 	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
 	 * float strictly between 0 and 1, a token is only considered if it is greater than either
@@ -63,37 +66,37 @@ export interface GenerationParameters {
 	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
 	 * for more details.
 	 */
-	etaCutoff?: number;
+	eta_cutoff?: number;
 	/**
 	 * The maximum length (in tokens) of the generated text, including the input.
 	 */
-	maxLength?: number;
+	max_length?: number;
 	/**
 	 * The maximum number of tokens to generate. Takes precedence over maxLength.
 	 */
-	maxNewTokens?: number;
+	max_new_tokens?: number;
 	/**
 	 * The minimum length (in tokens) of the generated text, including the input.
 	 */
-	minLength?: number;
+	min_length?: number;
 	/**
 	 * The minimum number of tokens to generate. Takes precedence over maxLength.
 	 */
-	minNewTokens?: number;
+	min_new_tokens?: number;
 	/**
 	 * Number of groups to divide num_beams into in order to ensure diversity among different
 	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
 	 */
-	numBeamGroups?: number;
+	num_beam_groups?: number;
 	/**
 	 * Number of beams to use for beam search.
 	 */
-	numBeams?: number;
+	num_beams?: number;
 	/**
 	 * The value balances the model confidence and the degeneration penalty in contrastive
 	 * search decoding.
 	 */
-	penaltyAlpha?: number;
+	penalty_alpha?: number;
 	/**
 	 * The value used to modulate the next token probabilities.
 	 */
@@ -101,12 +104,12 @@ export interface GenerationParameters {
 	/**
 	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
 	 */
-	topK?: number;
+	top_k?: number;
 	/**
 	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
 	 * that add up to top_p or higher are kept for generation.
 	 */
-	topP?: number;
+	top_p?: number;
 	/**
 	 * Local typicality measures how similar the conditional probability of predicting a target
 	 * token next is to the expected conditional probability of predicting a random token next,
@@ -114,33 +117,23 @@ export interface GenerationParameters {
 	 * most locally typical tokens with probabilities that add up to typical_p or higher are
 	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
 	 */
-	typicalP?: number;
+	typical_p?: number;
 	/**
 	 * Whether the model should use the past last key/values attentions to speed up decoding
 	 */
-	useCache?: boolean;
+	use_cache?: boolean;
 	[property: string]: unknown;
 }
+
 /**
  * Controls the stopping condition for beam-based methods.
  */
 export type EarlyStoppingUnion = boolean | "never";
-export interface AutomaticSpeechRecognitionOutputChunk {
-	/**
-	 * A chunk of text identified by the model
-	 */
-	text: string;
-	/**
-	 * The start and end timestamps corresponding with the text
-	 */
-	timestamps: number[];
-	[property: string]: unknown;
-}
-export type AutomaticSpeechRecognitionOutput = AutomaticSpeechRecognitionOutputElement[];
+
 /**
  * Outputs of inference for the Automatic Speech Recognition task
  */
-export interface AutomaticSpeechRecognitionOutputElement {
+export interface AutomaticSpeechRecognitionOutput {
 	/**
 	 * When returnTimestamps is enabled, chunks contains a list of audio chunks identified by
 	 * the model.
@@ -152,3 +145,15 @@ export interface AutomaticSpeechRecognitionOutputElement {
 	text: string;
 	[property: string]: unknown;
 }
+
+export interface AutomaticSpeechRecognitionOutputChunk {
+	/**
+	 * A chunk of text identified by the model
+	 */
+	text: string;
+	/**
+	 * The start and end timestamps corresponding with the text
+	 */
+	timestamps: number[];
+	[property: string]: unknown;
+}
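The net effect of this file: AutomaticSpeechRecognitionOutput is now a single object rather than an array of elements, with the chunk type declared after it. A sketch of a value matching the new shape (the timestamps are made up):

    import type { AutomaticSpeechRecognitionOutput } from "./inference";

    const output: AutomaticSpeechRecognitionOutput = {
        text: "hello world",
        chunks: [
            { text: "hello", timestamps: [0.0, 0.4] },
            { text: "world", timestamps: [0.5, 0.9] },
        ],
    };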
packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json CHANGED
@@ -19,7 +19,7 @@
 		"description": "Additional inference parameters for Automatic Speech Recognition",
 		"type": "object",
 		"properties": {
-			"returnTimestamps": {
+			"return_timestamps": {
 				"type": "boolean",
 				"description": "Whether to output corresponding timestamps with the generated text"
 			},
packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json CHANGED
@@ -3,34 +3,36 @@
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Automatic Speech Recognition task",
 	"title": "AutomaticSpeechRecognitionOutput",
-	"type": "array",
-	"items": {
-		"type": "object",
-		"properties": {
-			"text": {
-				"type": "string",
-				"description": "The recognized text."
-			},
-			"chunks": {
-				"type": "array",
-				"description": "When returnTimestamps is enabled, chunks contains a list of audio chunks identified by the model.",
-				"items": {
-					"type": "object",
-					"title": "AutomaticSpeechRecognitionOutputChunk",
-					"properties": {
-						"text": { "type": "string", "description": "A chunk of text identified by the model" },
-						"timestamps": {
-							"type": "array",
-							"description": "The start and end timestamps corresponding with the text",
-							"items": { "type": "number" },
-							"minLength": 2,
-							"maxLength": 2
-						}
-					},
-					"required": ["text", "timestamps"]
-				}
-			}
-		},
-		"required": ["text"]
-	}
+	"type": "object",
+	"properties": {
+		"text": {
+			"type": "string",
+			"description": "The recognized text."
+		},
+		"chunks": {
+			"type": "array",
+			"description": "When returnTimestamps is enabled, chunks contains a list of audio chunks identified by the model.",
+			"items": {
+				"type": "object",
+				"title": "AutomaticSpeechRecognitionOutputChunk",
+				"properties": {
+					"text": {
+						"type": "string",
+						"description": "A chunk of text identified by the model"
+					},
+					"timestamps": {
+						"type": "array",
+						"description": "The start and end timestamps corresponding with the text",
+						"items": {
+							"type": "number"
+						},
+						"minLength": 2,
+						"maxLength": 2
+					}
+				},
+				"required": ["text", "timestamps"]
+			}
+		}
+	},
+	"required": ["text"]
 }
packages/tasks/src/tasks/common-definitions.json CHANGED
@@ -43,63 +43,71 @@
 				"type": "number",
 				"description": "The value used to modulate the next token probabilities."
 			},
-			"topK": {
+			"top_k": {
 				"type": "integer",
 				"description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
 			},
-			"topP": {
+			"top_p": {
 				"type": "number",
 				"description": "If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation."
 			},
-			"typicalP": {
+			"typical_p": {
 				"type": "number",
 				"description": " Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to typical_p or higher are kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details."
 			},
-			"epsilonCutoff": {
+			"epsilon_cutoff": {
 				"type": "number",
 				"description": "If set to float strictly between 0 and 1, only tokens with a conditional probability greater than epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
 			},
-			"etaCutoff": {
+			"eta_cutoff": {
 				"type": "number",
 				"description": "Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details."
 			},
-			"maxLength": {
+			"max_length": {
 				"type": "integer",
 				"description": "The maximum length (in tokens) of the generated text, including the input."
 			},
-			"maxNewTokens": {
+			"max_new_tokens": {
 				"type": "integer",
 				"description": "The maximum number of tokens to generate. Takes precedence over maxLength."
 			},
-			"minLength": {
+			"min_length": {
 				"type": "integer",
 				"description": "The minimum length (in tokens) of the generated text, including the input."
 			},
-			"minNewTokens": {
+			"min_new_tokens": {
 				"type": "integer",
 				"description": "The minimum number of tokens to generate. Takes precedence over maxLength."
 			},
-			"doSample": {
+			"do_sample": {
 				"type": "boolean",
 				"description": "Whether to use sampling instead of greedy decoding when generating new tokens."
 			},
-			"earlyStopping": {
+			"early_stopping": {
 				"description": "Controls the stopping condition for beam-based methods.",
-				"oneOf": [{ "type": "boolean" }, { "const": "never", "type": "string" }]
-			},
-			"numBeams": {
+				"oneOf": [
+					{
+						"type": "boolean"
+					},
+					{
+						"const": "never",
+						"type": "string"
+					}
+				]
+			},
+			"num_beams": {
 				"type": "integer",
 				"description": "Number of beams to use for beam search."
 			},
-			"numBeamGroups": {
+			"num_beam_groups": {
 				"type": "integer",
 				"description": "Number of groups to divide num_beams into in order to ensure diversity among different groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details."
 			},
-			"penaltyAlpha": {
+			"penalty_alpha": {
 				"type": "number",
 				"description": "The value balances the model confidence and the degeneration penalty in contrastive search decoding."
 			},
-			"useCache": {
+			"use_cache": {
 				"type": "boolean",
 				"description": "Whether the model should use the past last key/values attentions to speed up decoding"
 			}
@@ -30,6 +30,6 @@ export interface DepthEstimationOutput {
30
  /**
31
  * The predicted depth as a tensor
32
  */
33
- predictedDepth?: unknown;
34
  [property: string]: unknown;
35
  }
 
30
  /**
31
  * The predicted depth as a tensor
32
  */
33
+ predicted_depth?: unknown;
34
  [property: string]: unknown;
35
  }
packages/tasks/src/tasks/document-question-answering/inference.ts CHANGED
@@ -42,11 +42,11 @@ export interface DocumentQuestionAnsweringParameters {
 	 * be split in several chunks with some overlap. This argument controls the size of that
 	 * overlap.
 	 */
-	docStride?: number;
+	doc_stride?: number;
 	/**
 	 * Whether to accept impossible as an answer
 	 */
-	handleImpossibleAnswer?: boolean;
+	handle_impossible_answer?: boolean;
 	/**
 	 * Language to use while running OCR. Defaults to english.
 	 */
@@ -55,27 +55,27 @@ export interface DocumentQuestionAnsweringParameters {
 	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
 	 * considered).
 	 */
-	maxAnswerLen?: number;
+	max_answer_len?: number;
 	/**
 	 * The maximum length of the question after tokenization. It will be truncated if needed.
 	 */
-	maxQuestionLen?: number;
+	max_question_len?: number;
 	/**
 	 * The maximum length of the total sentence (context + question) in tokens of each chunk
 	 * passed to the model. The context will be split in several chunks (using doc_stride as
 	 * overlap) if needed.
 	 */
-	maxSeqLen?: number;
+	max_seq_len?: number;
 	/**
 	 * The number of answers to return (will be chosen by order of likelihood). Can return less
 	 * than top_k answers if there are not enough options available within the context.
 	 */
-	topK?: number;
+	top_k?: number;
 	/**
 	 * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
 	 * skip the OCR step and use the provided bounding boxes instead.
 	 */
-	wordBoxes?: WordBox[];
+	word_boxes?: WordBox[];
 	[property: string]: unknown;
 }
 export type WordBox = number[] | string;
@@ -88,11 +88,19 @@ export interface DocumentQuestionAnsweringOutputElement {
 	 * The answer to the question.
 	 */
 	answer: string;
+	/**
+	 * The end word index of the answer (in the OCR’d version of the input or provided word
+	 * boxes).
+	 */
 	end: number;
 	/**
 	 * The probability associated to the answer.
 	 */
 	score: number;
+	/**
+	 * The start word index of the answer (in the OCR’d version of the input or provided word
+	 * boxes).
+	 */
 	start: number;
 	/**
 	 * The index of each word/box pair that is in the answer
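A sketch of the renamed Document Question Answering parameters in use (the word/box values are made up; a WordBox is either a word or a bounding box, per the WordBox type above):

    import type { DocumentQuestionAnsweringParameters, WordBox } from "./inference";

    const word_boxes: WordBox[] = ["Invoice", [12, 34, 180, 60]];

    const parameters: DocumentQuestionAnsweringParameters = {
        doc_stride: 128, // was docStride
        handle_impossible_answer: false, // was handleImpossibleAnswer
        max_answer_len: 30, // was maxAnswerLen
        top_k: 1, // was topK
        word_boxes, // skips the OCR step, per the doc comment above
    };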
packages/tasks/src/tasks/document-question-answering/spec/input.json CHANGED
@@ -31,11 +31,11 @@
 		"description": "Additional inference parameters for Document Question Answering",
 		"type": "object",
 		"properties": {
-			"docStride": {
+			"doc_stride": {
 				"type": "integer",
 				"description": "If the words in the document are too long to fit with the question for the model, it will be split in several chunks with some overlap. This argument controls the size of that overlap."
 			},
-			"handleImpossibleAnswer": {
+			"handle_impossible_answer": {
 				"type": "boolean",
 				"description": "Whether to accept impossible as an answer"
 			},
@@ -43,23 +43,23 @@
 				"type": "string",
 				"description": "Language to use while running OCR. Defaults to english."
 			},
-			"maxAnswerLen": {
+			"max_answer_len": {
 				"type": "integer",
 				"description": "The maximum length of predicted answers (e.g., only answers with a shorter length are considered)."
 			},
-			"maxSeqLen": {
+			"max_seq_len": {
 				"type": "integer",
 				"description": "The maximum length of the total sentence (context + question) in tokens of each chunk passed to the model. The context will be split in several chunks (using doc_stride as overlap) if needed."
 			},
-			"maxQuestionLen": {
+			"max_question_len": {
 				"type": "integer",
 				"description": "The maximum length of the question after tokenization. It will be truncated if needed."
 			},
-			"topK": {
+			"top_k": {
 				"type": "integer",
 				"description": "The number of answers to return (will be chosen by order of likelihood). Can return less than top_k answers if there are not enough options available within the context."
 			},
-			"wordBoxes": {
+			"word_boxes": {
 				"type": "array",
 				"description": "A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR step and use the provided bounding boxes instead.",
 				"items": {
packages/tasks/src/tasks/document-question-answering/spec/output.json CHANGED
@@ -17,11 +17,11 @@
 			},
 			"start": {
 				"type": "integer",
-				"descrtiption": "The start word index of the answer (in the OCR’d version of the input or provided word boxes)."
+				"description": "The start word index of the answer (in the OCR\u2019d version of the input or provided word boxes)."
 			},
 			"end": {
 				"type": "integer",
-				"descrtiption": "The end word index of the answer (in the OCR’d version of the input or provided word boxes)."
+				"description": "The end word index of the answer (in the OCR\u2019d version of the input or provided word boxes)."
 			},
 			"words": {
 				"type": "array",
packages/tasks/src/tasks/fill-mask/inference.ts CHANGED
@@ -33,7 +33,7 @@ export interface FillMaskParameters {
 	/**
 	 * When passed, overrides the number of predictions to return.
 	 */
-	topK?: number;
+	top_k?: number;
 	[property: string]: unknown;
 }
 export type FillMaskOutput = FillMaskOutputElement[];
@@ -53,9 +53,10 @@ export interface FillMaskOutputElement {
 	 * The predicted token id (to replace the masked one).
 	 */
 	token: number;
+	tokenStr: unknown;
 	/**
 	 * The predicted token (to replace the masked one).
 	 */
-	tokenStr: string;
+	token_str?: string;
 	[property: string]: unknown;
 }
packages/tasks/src/tasks/fill-mask/spec/input.json CHANGED
@@ -20,7 +20,7 @@
 		"description": "Additional inference parameters for Fill Mask",
 		"type": "object",
 		"properties": {
-			"topK": {
+			"top_k": {
 				"type": "integer",
 				"description": "When passed, overrides the number of predictions to return."
 			},
packages/tasks/src/tasks/fill-mask/spec/output.json CHANGED
@@ -19,7 +19,7 @@
 				"type": "integer",
 				"description": "The predicted token id (to replace the masked one)."
 			},
-			"tokenStr": {
+			"token_str": {
 				"type": "string",
 				"description": "The predicted token (to replace the masked one)."
 			}
packages/tasks/src/tasks/image-classification/inference.ts CHANGED
@@ -23,11 +23,11 @@ export interface ImageClassificationInput {
  * Additional inference parameters for Image Classification
  */
 export interface ImageClassificationParameters {
-	functionToApply?: ClassificationOutputTransform;
+	function_to_apply?: ClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
-	topK?: number;
+	top_k?: number;
 	[property: string]: unknown;
 }
 /**
packages/tasks/src/tasks/image-classification/spec/input.json CHANGED
@@ -19,11 +19,11 @@
 		"description": "Additional inference parameters for Image Classification",
 		"type": "object",
 		"properties": {
-			"functionToApply": {
+			"function_to_apply": {
 				"title": "ImageClassificationOutputTransform",
 				"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
 			},
-			"topK": {
+			"top_k": {
 				"type": "integer",
 				"description": "When specified, limits the output to the top K most probable classes."
 			}
packages/tasks/src/tasks/image-segmentation/inference.ts CHANGED
@@ -26,11 +26,11 @@ export interface ImageSegmentationParameters {
 	/**
 	 * Threshold to use when turning the predicted masks into binary values.
 	 */
-	maskThreshold?: number;
+	mask_threshold?: number;
 	/**
 	 * Mask overlap threshold to eliminate small, disconnected segments.
 	 */
-	overlapMaskAreaThreshold?: number;
+	overlap_mask_area_threshold?: number;
 	/**
 	 * Segmentation task to be performed, depending on model capabilities.
 	 */
packages/tasks/src/tasks/image-segmentation/spec/input.json CHANGED
@@ -19,11 +19,11 @@
 		"description": "Additional inference parameters for Image Segmentation",
 		"type": "object",
 		"properties": {
-			"maskThreshold": {
+			"mask_threshold": {
 				"type": "number",
 				"description": "Threshold to use when turning the predicted masks into binary values."
 			},
-			"overlapMaskAreaThreshold": {
+			"overlap_mask_area_threshold": {
 				"type": "number",
 				"description": "Mask overlap threshold to eliminate small, disconnected segments."
 			},
packages/tasks/src/tasks/image-to-image/inference.ts CHANGED
@@ -29,20 +29,20 @@ export interface ImageToImageParameters {
 	 * For diffusion models. A higher guidance scale value encourages the model to generate
 	 * images closely linked to the text prompt at the expense of lower image quality.
 	 */
-	guidanceScale?: number;
+	guidance_scale?: number;
 	/**
 	 * One or several prompt to guide what NOT to include in image generation.
 	 */
-	negativePrompt?: string[];
+	negative_prompt?: string[];
 	/**
 	 * For diffusion models. The number of denoising steps. More denoising steps usually lead to
 	 * a higher quality image at the expense of slower inference.
 	 */
-	numInferenceSteps?: number;
+	num_inference_steps?: number;
 	/**
 	 * The size in pixel of the output image
 	 */
-	targetSize?: TargetSize;
+	target_size?: TargetSize;
 	[property: string]: unknown;
 }
 
packages/tasks/src/tasks/image-to-image/spec/input.json CHANGED
@@ -19,22 +19,22 @@
 		"description": "Additional inference parameters for Image To Image",
 		"type": "object",
 		"properties": {
-			"guidanceScale": {
+			"guidance_scale": {
 				"type": "number",
 				"description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality."
 			},
-			"negativePrompt": {
+			"negative_prompt": {
 				"type": "array",
 				"items": {
 					"type": "string"
 				},
 				"description": "One or several prompt to guide what NOT to include in image generation."
 			},
-			"numInferenceSteps": {
+			"num_inference_steps": {
 				"type": "integer",
 				"description": "For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
 			},
-			"targetSize": {
+			"target_size": {
 				"type": "object",
 				"description": "The size in pixel of the output image",
 				"properties": {
@@ -3,6 +3,7 @@
3
  *
4
  * Using src/scripts/inference-codegen
5
  */
 
6
  /**
7
  * Inputs for Image To Text inference
8
  */
@@ -17,6 +18,7 @@ export interface ImageToTextInput {
17
  parameters?: ImageToTextParameters;
18
  [property: string]: unknown;
19
  }
 
20
  /**
21
  * Additional inference parameters
22
  *
@@ -30,9 +32,10 @@ export interface ImageToTextParameters {
30
  /**
31
  * The amount of maximum tokens to generate.
32
  */
33
- maxNewTokens?: number;
34
  [property: string]: unknown;
35
  }
 
36
  /**
37
  * Parametrization of the text generation process
38
  *
@@ -42,18 +45,18 @@ export interface GenerationParameters {
42
  /**
43
  * Whether to use sampling instead of greedy decoding when generating new tokens.
44
  */
45
- doSample?: boolean;
46
  /**
47
  * Controls the stopping condition for beam-based methods.
48
  */
49
- earlyStopping?: EarlyStoppingUnion;
50
  /**
51
  * If set to float strictly between 0 and 1, only tokens with a conditional probability
52
  * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
53
  * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
54
  * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
55
  */
56
- epsilonCutoff?: number;
57
  /**
58
  * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
59
  * float strictly between 0 and 1, a token is only considered if it is greater than either
@@ -63,37 +66,37 @@ export interface GenerationParameters {
63
  * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
64
  * for more details.
65
  */
66
- etaCutoff?: number;
67
  /**
68
  * The maximum length (in tokens) of the generated text, including the input.
69
  */
70
- maxLength?: number;
71
  /**
72
  * The maximum number of tokens to generate. Takes precedence over maxLength.
73
  */
74
- maxNewTokens?: number;
75
  /**
76
  * The minimum length (in tokens) of the generated text, including the input.
77
  */
78
- minLength?: number;
79
  /**
80
  * The minimum number of tokens to generate. Takes precedence over maxLength.
81
  */
82
- minNewTokens?: number;
83
  /**
84
  * Number of groups to divide num_beams into in order to ensure diversity among different
85
  * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
86
  */
87
- numBeamGroups?: number;
88
  /**
89
  * Number of beams to use for beam search.
90
  */
91
- numBeams?: number;
92
  /**
93
  * The value balances the model confidence and the degeneration penalty in contrastive
94
  * search decoding.
95
  */
96
- penaltyAlpha?: number;
97
  /**
98
  * The value used to modulate the next token probabilities.
99
  */
@@ -101,12 +104,12 @@ export interface GenerationParameters {
101
  /**
102
  * The number of highest probability vocabulary tokens to keep for top-k-filtering.
103
  */
104
- topK?: number;
105
  /**
106
  * If set to float < 1, only the smallest set of most probable tokens with probabilities
107
  * that add up to top_p or higher are kept for generation.
108
  */
109
- topP?: number;
110
  /**
111
  * Local typicality measures how similar the conditional probability of predicting a target
112
  * token next is to the expected conditional probability of predicting a random token next,
@@ -114,25 +117,27 @@ export interface GenerationParameters {
114
  * most locally typical tokens with probabilities that add up to typical_p or higher are
115
  * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
116
  */
117
- typicalP?: number;
118
  /**
119
  * Whether the model should use the past last key/values attentions to speed up decoding
120
  */
121
- useCache?: boolean;
122
  [property: string]: unknown;
123
  }
 
124
  /**
125
  * Controls the stopping condition for beam-based methods.
126
  */
127
  export type EarlyStoppingUnion = boolean | "never";
128
- export type ImageToTextOutput = ImageToTextOutputElement[];
129
  /**
130
  * Outputs of inference for the Image To Text task
131
  */
132
- export interface ImageToTextOutputElement {
 
133
  /**
134
  * The generated text.
135
  */
136
- generatedText: string;
137
  [property: string]: unknown;
138
  }
 
3
  *
4
  * Using src/scripts/inference-codegen
5
  */
6
+
7
  /**
8
  * Inputs for Image To Text inference
9
  */
 
18
  parameters?: ImageToTextParameters;
19
  [property: string]: unknown;
20
  }
21
+
22
  /**
23
  * Additional inference parameters
24
  *
 
32
  /**
33
  * The amount of maximum tokens to generate.
34
  */
35
+ max_new_tokens?: number;
36
  [property: string]: unknown;
37
  }
38
+
39
  /**
40
  * Parametrization of the text generation process
41
  *
 
45
  /**
46
  * Whether to use sampling instead of greedy decoding when generating new tokens.
47
  */
48
+ do_sample?: boolean;
49
  /**
50
  * Controls the stopping condition for beam-based methods.
51
  */
52
+ early_stopping?: EarlyStoppingUnion;
53
  /**
54
  * If set to float strictly between 0 and 1, only tokens with a conditional probability
55
  * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
56
  * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
57
  * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
58
  */
59
+ epsilon_cutoff?: number;
60
  /**
61
  * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
62
  * float strictly between 0 and 1, a token is only considered if it is greater than either
 
66
  * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
67
  * for more details.
68
  */
69
+ eta_cutoff?: number;
70
  /**
71
  * The maximum length (in tokens) of the generated text, including the input.
72
  */
73
+ max_length?: number;
74
  /**
75
  * The maximum number of tokens to generate. Takes precedence over maxLength.
76
  */
77
+ max_new_tokens?: number;
78
  /**
79
  * The minimum length (in tokens) of the generated text, including the input.
80
  */
81
+ min_length?: number;
82
  /**
83
  * The minimum number of tokens to generate. Takes precedence over maxLength.
84
  */
85
+ min_new_tokens?: number;
86
  /**
87
  * Number of groups to divide num_beams into in order to ensure diversity among different
88
  * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
89
  */
90
+ num_beam_groups?: number;
91
  /**
92
  * Number of beams to use for beam search.
93
  */
94
+ num_beams?: number;
95
  /**
96
  * The value balances the model confidence and the degeneration penalty in contrastive
97
  * search decoding.
98
  */
99
+ penalty_alpha?: number;
100
  /**
101
  * The value used to modulate the next token probabilities.
102
  */
 
104
  /**
105
  * The number of highest probability vocabulary tokens to keep for top-k-filtering.
106
  */
107
+ top_k?: number;
108
  /**
109
  * If set to float < 1, only the smallest set of most probable tokens with probabilities
110
  * that add up to top_p or higher are kept for generation.
111
  */
112
+ top_p?: number;
113
  /**
114
  * Local typicality measures how similar the conditional probability of predicting a target
115
  * token next is to the expected conditional probability of predicting a random token next,
 
117
  * most locally typical tokens with probabilities that add up to typical_p or higher are
118
  * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
119
  */
120
+ typical_p?: number;
121
  /**
122
  * Whether the model should use the past last key/values attentions to speed up decoding
123
  */
124
+ use_cache?: boolean;
125
  [property: string]: unknown;
126
  }
127
+
128
  /**
129
  * Controls the stopping condition for beam-based methods.
130
  */
131
  export type EarlyStoppingUnion = boolean | "never";
132
+
133
  /**
134
  * Outputs of inference for the Image To Text task
135
  */
136
+ export interface ImageToTextOutput {
137
+ generatedText: unknown;
138
  /**
139
  * The generated text.
140
  */
141
+ generated_text?: string;
142
  [property: string]: unknown;
143
  }
packages/tasks/src/tasks/image-to-text/spec/input.json CHANGED
@@ -19,7 +19,7 @@
19
  "description": "Additional inference parameters for Image To Text",
20
  "type": "object",
21
  "properties": {
22
- "maxNewTokens": {
23
  "type": "integer",
24
  "description": "The amount of maximum tokens to generate."
25
  },
 
19
  "description": "Additional inference parameters for Image To Text",
20
  "type": "object",
21
  "properties": {
22
+ "max_new_tokens": {
23
  "type": "integer",
24
  "description": "The amount of maximum tokens to generate."
25
  },
packages/tasks/src/tasks/image-to-text/spec/output.json CHANGED
@@ -3,15 +3,12 @@
 	"$schema": "http://json-schema.org/draft-06/schema#",
 	"description": "Outputs of inference for the Image To Text task",
 	"title": "ImageToTextOutput",
-	"type": "array",
-	"items": {
-		"type": "object",
-		"properties": {
-			"generatedText": {
-				"type": "string",
-				"description": "The generated text."
-			}
-		},
-		"required": ["generatedText"]
-	}
+	"type": "object",
+	"properties": {
+		"generated_text": {
+			"type": "string",
+			"description": "The generated text."
+		}
+	},
+	"required": ["generatedText"]
 }
packages/tasks/src/tasks/placeholder/spec/input.json CHANGED
@@ -20,11 +20,11 @@
 		"description": "TODO: describe additional parameters here.",
 		"type": "object",
 		"properties": {
-			"dummyParameterName": {
+			"dummy_parameter_name": {
 				"type": "boolean",
 				"description": "TODO: describe the parameter here"
 			},
-			"dummyParameterName2": {
+			"dummy_parameter_name2": {
 				"type": "integer",
 				"description": "TODO: describe the parameter here"
 			}
packages/tasks/src/tasks/placeholder/spec/output.json CHANGED
@@ -7,7 +7,7 @@
 	"items": {
 		"type": "object",
 		"properties": {
-			"meaningfulOutputName": {
+			"meaningful_output_name": {
 				"type": "string",
 				"description": "TODO: Describe what is outputed by the inference here"
 			}
packages/tasks/src/tasks/question-answering/inference.ts CHANGED
@@ -41,37 +41,37 @@ export interface QuestionAnsweringParameters {
 	 * Attempts to align the answer to real words. Improves quality on space separated
 	 * languages. Might hurt on non-space-separated languages (like Japanese or Chinese)
 	 */
-	alignToWords?: boolean;
+	align_to_words?: boolean;
 	/**
 	 * If the context is too long to fit with the question for the model, it will be split in
 	 * several chunks with some overlap. This argument controls the size of that overlap.
 	 */
-	docStride?: number;
+	doc_stride?: number;
 	/**
 	 * Whether to accept impossible as an answer.
 	 */
-	handleImpossibleAnswer?: boolean;
+	handle_impossible_answer?: boolean;
 	/**
 	 * The maximum length of predicted answers (e.g., only answers with a shorter length are
 	 * considered).
 	 */
-	maxAnswerLen?: number;
+	max_answer_len?: number;
 	/**
 	 * The maximum length of the question after tokenization. It will be truncated if needed.
 	 */
-	maxQuestionLen?: number;
+	max_question_len?: number;
 	/**
 	 * The maximum length of the total sentence (context + question) in tokens of each chunk
 	 * passed to the model. The context will be split in several chunks (using docStride as
 	 * overlap) if needed.
 	 */
-	maxSeqLen?: number;
+	max_seq_len?: number;
 	/**
 	 * The number of answers to return (will be chosen by order of likelihood). Note that we
 	 * return less than topk answers if there are not enough options available within the
 	 * context.
 	 */
-	topK?: number;
+	top_k?: number;
 	[property: string]: unknown;
 }
 export type QuestionAnsweringOutput = QuestionAnsweringOutputElement[];
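Taken together, these renames mean question-answering request parameters are written in snake_case end to end. A minimal sketch of filling in the renamed fields; the interface mirrors the diff above and the concrete values are illustrative only:

// Local mirror of the renamed interface from the diff above.
interface QuestionAnsweringParameters {
	align_to_words?: boolean;
	doc_stride?: number;
	handle_impossible_answer?: boolean;
	max_answer_len?: number;
	max_question_len?: number;
	max_seq_len?: number;
	top_k?: number;
}

const parameters: QuestionAnsweringParameters = {
	top_k: 3, // up to three candidate answers, fewer if the context is short
	doc_stride: 128, // overlap between context chunks
	max_seq_len: 384, // question + context tokens per chunk
	handle_impossible_answer: true,
};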
packages/tasks/src/tasks/question-answering/spec/input.json CHANGED
@@ -32,31 +32,31 @@
   "description": "Additional inference parameters for Question Answering",
   "type": "object",
   "properties": {
-    "topK": {
+    "top_k": {
       "type": "integer",
       "description": "The number of answers to return (will be chosen by order of likelihood). Note that we return less than topk answers if there are not enough options available within the context."
     },
-    "docStride": {
+    "doc_stride": {
       "type": "integer",
       "description": "If the context is too long to fit with the question for the model, it will be split in several chunks with some overlap. This argument controls the size of that overlap."
     },
-    "maxAnswerLen": {
+    "max_answer_len": {
       "type": "integer",
       "description": "The maximum length of predicted answers (e.g., only answers with a shorter length are considered)."
     },
-    "maxSeqLen": {
+    "max_seq_len": {
      "type": "integer",
       "description": "The maximum length of the total sentence (context + question) in tokens of each chunk passed to the model. The context will be split in several chunks (using docStride as overlap) if needed."
     },
-    "maxQuestionLen": {
+    "max_question_len": {
       "type": "integer",
       "description": "The maximum length of the question after tokenization. It will be truncated if needed."
     },
-    "handleImpossibleAnswer": {
+    "handle_impossible_answer": {
       "type": "boolean",
       "description": "Whether to accept impossible as an answer."
     },
-    "alignToWords": {
+    "align_to_words": {
       "type": "boolean",
       "description": "Attempts to align the answer to real words. Improves quality on space separated languages. Might hurt on non-space-separated languages (like Japanese or Chinese)"
     }
packages/tasks/src/tasks/summarization/inference.ts CHANGED
@@ -30,11 +30,11 @@ export interface Text2TextGenerationParameters {
 	/**
 	 * Whether to clean up the potential extra spaces in the text output.
 	 */
-	cleanUpTokenizationSpaces?: boolean;
+	clean_up_tokenization_spaces?: boolean;
 	/**
 	 * Additional parametrization of the text generation algorithm
 	 */
-	generateParameters?: { [key: string]: unknown };
+	generate_parameters?: { [key: string]: unknown };
 	/**
 	 * The truncation strategy to use
 	 */
@@ -50,9 +50,10 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
  * Outputs of inference for the Text2text Generation task
  */
 export interface SummarizationOutput {
+	generatedText: unknown;
 	/**
 	 * The generated text.
 	 */
-	generatedText: string;
+	generated_text?: string;
 	[property: string]: unknown;
 }
packages/tasks/src/tasks/text-classification/inference.ts CHANGED
@@ -23,11 +23,11 @@ export interface TextClassificationInput {
  * Additional inference parameters for Text Classification
  */
 export interface TextClassificationParameters {
-	functionToApply?: ClassificationOutputTransform;
+	function_to_apply?: ClassificationOutputTransform;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
-	topK?: number;
+	top_k?: number;
 	[property: string]: unknown;
 }
 /**
packages/tasks/src/tasks/text-classification/spec/input.json CHANGED
@@ -20,11 +20,11 @@
   "description": "Additional inference parameters for Text Classification",
   "type": "object",
   "properties": {
-    "functionToApply": {
+    "function_to_apply": {
       "title": "TextClassificationOutputTransform",
       "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
     },
-    "topK": {
+    "top_k": {
       "type": "integer",
       "description": "When specified, limits the output to the top K most probable classes."
     }
packages/tasks/src/tasks/text-generation/inference.ts CHANGED
@@ -3,6 +3,7 @@
  *
  * Using src/scripts/inference-codegen
  */
+
 /**
  * Inputs for Text Generation inference
  */
@@ -17,6 +18,7 @@ export interface TextGenerationInput {
 	parameters?: TextGenerationParameters;
 	[property: string]: unknown;
 }
+
 /**
  * Additional inference parameters
  *
@@ -26,24 +28,24 @@ export interface TextGenerationParameters {
 	/**
 	 * Whether to use logit sampling (true) or greedy search (false).
 	 */
-	doSample?: boolean;
+	do_sample?: boolean;
 	/**
 	 * Maximum number of generated tokens.
 	 */
-	maxNewTokens?: number;
+	max_new_tokens?: number;
 	/**
 	 * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this
 	 * paper](https://hf.co/papers/1909.05858) for more details.
 	 */
-	repetitionPenalty?: number;
+	repetition_penalty?: number;
 	/**
 	 * Whether to prepend the prompt to the generated text.
 	 */
-	returnFullText?: boolean;
+	return_full_text?: boolean;
 	/**
 	 * Stop generating tokens if a member of `stop_sequences` is generated.
 	 */
-	stopSequences?: string[];
+	stop_sequences?: string[];
 	/**
 	 * The value used to modulate the logits distribution.
 	 */
@@ -51,12 +53,12 @@ export interface TextGenerationParameters {
 	/**
 	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
 	 */
-	topK?: number;
+	top_k?: number;
 	/**
 	 * If set to < 1, only the smallest set of most probable tokens with probabilities that add
 	 * up to `top_p` or higher are kept for generation.
 	 */
-	topP?: number;
+	top_p?: number;
 	/**
 	 * Truncate input tokens to the given size.
 	 */
@@ -65,21 +67,22 @@ export interface TextGenerationParameters {
 	 * Typical Decoding mass. See [Typical Decoding for Natural Language
 	 * Generation](https://hf.co/papers/2202.00666) for more information
 	 */
-	typicalP?: number;
+	typical_p?: number;
 	/**
 	 * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)
 	 */
 	watermark?: boolean;
 	[property: string]: unknown;
 }
-export type TextGenerationOutput = TextGenerationOutputElement[];
+
 /**
  * Outputs for Text Generation inference
  */
-export interface TextGenerationOutputElement {
+export interface TextGenerationOutput {
+	generatedText: unknown;
 	/**
 	 * The generated text
 	 */
-	generatedText: string;
+	generated_text?: string;
 	[property: string]: unknown;
 }
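For callers, the upshot of this file is that every generation knob is now snake_case and the output is a single object rather than an array of elements. A hedged sketch of a raw HTTP call using the renamed fields; the endpoint URL and the fetch-based client are assumptions for illustration, not part of this package:

// Request body using the renamed snake_case parameters.
const body = {
	inputs: "Once upon a time",
	parameters: {
		do_sample: true,
		max_new_tokens: 64,
		repetition_penalty: 1.1,
		return_full_text: false,
		stop_sequences: ["\n\n"],
		top_k: 50,
		top_p: 0.95,
	},
};

const response = await fetch("https://example.invalid/models/some-text-generation-model", {
	method: "POST",
	headers: { "Content-Type": "application/json" },
	body: JSON.stringify(body),
});
// Per the updated output spec, the body is one object, not an array of elements.
const output = (await response.json()) as { generated_text?: string };
console.log(output.generated_text);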
packages/tasks/src/tasks/text-generation/spec/input.json CHANGED
@@ -20,23 +20,23 @@
   "description": "Additional inference parameters for Text Generation",
   "type": "object",
   "properties": {
-    "doSample": {
+    "do_sample": {
       "type": "boolean",
       "description": "Whether to use logit sampling (true) or greedy search (false)."
     },
-    "maxNewTokens": {
+    "max_new_tokens": {
       "type": "integer",
       "description": "Maximum number of generated tokens."
     },
-    "repetitionPenalty": {
+    "repetition_penalty": {
       "type": "number",
       "description": "The parameter for repetition penalty. A value of 1.0 means no penalty. See [this paper](https://hf.co/papers/1909.05858) for more details."
     },
-    "returnFullText": {
+    "return_full_text": {
       "type": "boolean",
       "description": "Whether to prepend the prompt to the generated text."
     },
-    "stopSequences": {
+    "stop_sequences": {
       "type": "array",
       "items": {
         "type": "string"
@@ -47,11 +47,11 @@
       "type": "number",
       "description": "The value used to modulate the logits distribution."
     },
-    "topK": {
+    "top_k": {
       "type": "integer",
       "description": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
     },
-    "topP": {
+    "top_p": {
       "type": "number",
       "description": "If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation."
     },
@@ -59,7 +59,7 @@
       "type": "integer",
       "description": "Truncate input tokens to the given size."
     },
-    "typicalP": {
+    "typical_p": {
       "type": "number",
       "description": "Typical Decoding mass. See [Typical Decoding for Natural Language Generation](https://hf.co/papers/2202.00666) for more information"
     },
packages/tasks/src/tasks/text-generation/spec/output.json CHANGED
@@ -3,15 +3,12 @@
   "$schema": "http://json-schema.org/draft-06/schema#",
   "description": "Outputs for Text Generation inference",
   "title": "TextGenerationOutput",
-  "type": "array",
-  "items": {
-    "type": "object",
-    "properties": {
-      "generatedText": {
-        "type": "string",
-        "description": "The generated text"
-      }
-    },
-    "required": ["generatedText"]
-  }
+  "type": "object",
+  "properties": {
+    "generated_text": {
+      "type": "string",
+      "description": "The generated text"
+    }
+  },
+  "required": ["generatedText"]
 }
packages/tasks/src/tasks/text-to-audio/inference.ts CHANGED
@@ -3,6 +3,7 @@
  *
  * Using src/scripts/inference-codegen
  */
+
 /**
  * Inputs for Text To Audio inference
  */
@@ -17,6 +18,7 @@ export interface TextToAudioInput {
 	parameters?: TextToAudioParameters;
 	[property: string]: unknown;
 }
+
 /**
  * Additional inference parameters
  *
@@ -29,6 +31,7 @@ export interface TextToAudioParameters {
 	generate?: GenerationParameters;
 	[property: string]: unknown;
 }
+
 /**
  * Parametrization of the text generation process
  *
@@ -38,18 +41,18 @@ export interface GenerationParameters {
 	/**
 	 * Whether to use sampling instead of greedy decoding when generating new tokens.
 	 */
-	doSample?: boolean;
+	do_sample?: boolean;
 	/**
 	 * Controls the stopping condition for beam-based methods.
 	 */
-	earlyStopping?: EarlyStoppingUnion;
+	early_stopping?: EarlyStoppingUnion;
 	/**
 	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
 	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
 	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
 	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
 	 */
-	epsilonCutoff?: number;
+	epsilon_cutoff?: number;
 	/**
 	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
 	 * float strictly between 0 and 1, a token is only considered if it is greater than either
@@ -59,37 +62,37 @@ export interface GenerationParameters {
 	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
 	 * for more details.
 	 */
-	etaCutoff?: number;
+	eta_cutoff?: number;
 	/**
 	 * The maximum length (in tokens) of the generated text, including the input.
 	 */
-	maxLength?: number;
+	max_length?: number;
 	/**
 	 * The maximum number of tokens to generate. Takes precedence over maxLength.
 	 */
-	maxNewTokens?: number;
+	max_new_tokens?: number;
 	/**
 	 * The minimum length (in tokens) of the generated text, including the input.
 	 */
-	minLength?: number;
+	min_length?: number;
 	/**
 	 * The minimum number of tokens to generate. Takes precedence over maxLength.
 	 */
-	minNewTokens?: number;
+	min_new_tokens?: number;
 	/**
 	 * Number of groups to divide num_beams into in order to ensure diversity among different
 	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
 	 */
-	numBeamGroups?: number;
+	num_beam_groups?: number;
 	/**
 	 * Number of beams to use for beam search.
 	 */
-	numBeams?: number;
+	num_beams?: number;
 	/**
 	 * The value balances the model confidence and the degeneration penalty in contrastive
 	 * search decoding.
 	 */
-	penaltyAlpha?: number;
+	penalty_alpha?: number;
 	/**
 	 * The value used to modulate the next token probabilities.
 	 */
@@ -97,12 +100,12 @@ export interface GenerationParameters {
 	/**
 	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
 	 */
-	topK?: number;
+	top_k?: number;
 	/**
 	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
 	 * that add up to top_p or higher are kept for generation.
 	 */
-	topP?: number;
+	top_p?: number;
 	/**
 	 * Local typicality measures how similar the conditional probability of predicting a target
 	 * token next is to the expected conditional probability of predicting a random token next,
@@ -110,29 +113,31 @@ export interface GenerationParameters {
 	 * most locally typical tokens with probabilities that add up to typical_p or higher are
 	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
 	 */
-	typicalP?: number;
+	typical_p?: number;
 	/**
 	 * Whether the model should use the past last key/values attentions to speed up decoding
 	 */
-	useCache?: boolean;
+	use_cache?: boolean;
 	[property: string]: unknown;
 }
+
 /**
  * Controls the stopping condition for beam-based methods.
 */
 export type EarlyStoppingUnion = boolean | "never";
-export type TextToAudioOutput = TextToAudioOutputElement[];
+
 /**
  * Outputs of inference for the Text To Audio task
 */
-export interface TextToAudioOutputElement {
+export interface TextToAudioOutput {
 	/**
 	 * The generated audio waveform.
 	 */
	audio: unknown;
+	samplingRate: unknown;
 	/**
 	 * The sampling rate of the generated audio waveform.
 	 */
-	samplingRate: number;
+	sampling_rate?: number;
 	[property: string]: unknown;
 }
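Since TextToAudioParameters nests these knobs under a `generate` key, a caller builds parameters after this change roughly as follows; only the field names come from the interfaces above, and the values are illustrative:

// Local mirror of the renamed generation knobs shown in this diff.
interface GenerationParameters {
	do_sample?: boolean;
	max_new_tokens?: number;
	num_beams?: number;
	top_k?: number;
	top_p?: number;
	use_cache?: boolean;
}

// Nested under `parameters.generate`, per TextToAudioParameters above.
const parameters: { generate?: GenerationParameters } = {
	generate: {
		do_sample: true,
		max_new_tokens: 256,
		top_p: 0.9,
		use_cache: true,
	},
};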
packages/tasks/src/tasks/text-to-audio/spec/output.json CHANGED
@@ -3,18 +3,15 @@
   "$schema": "http://json-schema.org/draft-06/schema#",
   "description": "Outputs of inference for the Text To Audio task",
   "title": "TextToAudioOutput",
-  "type": "array",
-  "items": {
-    "type": "object",
-    "properties": {
-      "audio": {
-        "description": "The generated audio waveform."
-      },
-      "samplingRate": {
-        "type": "number",
-        "description": "The sampling rate of the generated audio waveform."
-      }
-    },
-    "required": ["audio", "samplingRate"]
-  }
+  "type": "object",
+  "properties": {
+    "audio": {
+      "description": "The generated audio waveform."
+    },
+    "sampling_rate": {
+      "type": "number",
+      "description": "The sampling rate of the generated audio waveform."
+    }
+  },
+  "required": ["audio", "samplingRate"]
 }
packages/tasks/src/tasks/text-to-image/inference.ts CHANGED
@@ -29,16 +29,16 @@ export interface TextToImageParameters {
 	/**
 	 * For diffusion models. A higher guidance scale value encourages the model to generate
 	 * images closely linked to the text prompt at the expense of lower image quality.
 	 */
-	guidanceScale?: number;
+	guidance_scale?: number;
 	/**
 	 * One or several prompt to guide what NOT to include in image generation.
 	 */
-	negativePrompt?: string[];
+	negative_prompt?: string[];
 	/**
 	 * For diffusion models. The number of denoising steps. More denoising steps usually lead to
 	 * a higher quality image at the expense of slower inference.
 	 */
-	numInferenceSteps?: number;
+	num_inference_steps?: number;
 	/**
 	 * For diffusion models. Override the scheduler with a compatible one
 	 */
@@ -46,7 +46,7 @@ export interface TextToImageParameters {
 	/**
 	 * The size in pixel of the output image
 	 */
-	targetSize?: TargetSize;
+	target_size?: TargetSize;
 	[property: string]: unknown;
 }
@@ -62,9 +62,7 @@ export interface TargetSize {
 /**
  * Outputs of inference for the Text To Image task
 */
-export type TextToImageOutput = unknown[] | boolean | number | number | null | TextToImageOutputObject | string;
-
-export interface TextToImageOutputObject {
+export interface TextToImageOutput {
 	/**
 	 * The generated image
 	 */
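A sketch of the renamed text-to-image parameters in use. The `width`/`height` members of TargetSize sit outside this hunk, so they are an assumption here; all values are illustrative:

const parameters = {
	guidance_scale: 7.5, // stronger adherence to the prompt
	negative_prompt: ["blurry", "low quality"],
	num_inference_steps: 30, // more steps: slower but usually cleaner
	// width/height assumed from the TargetSize interface referenced above.
	target_size: { width: 768, height: 768 },
};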
packages/tasks/src/tasks/text-to-image/spec/input.json CHANGED
@@ -20,22 +20,22 @@
   "description": "Additional inference parameters for Text To Image",
   "type": "object",
   "properties": {
-    "guidanceScale": {
+    "guidance_scale": {
       "type": "number",
       "description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality."
     },
-    "negativePrompt": {
+    "negative_prompt": {
       "type": "array",
       "items": {
         "type": "string"
       },
       "description": "One or several prompt to guide what NOT to include in image generation."
     },
-    "numInferenceSteps": {
+    "num_inference_steps": {
       "type": "integer",
       "description": "For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
     },
-    "targetSize": {
+    "target_size": {
       "type": "object",
       "description": "The size in pixel of the output image",
       "properties": {
packages/tasks/src/tasks/text-to-image/spec/output.json CHANGED
@@ -3,13 +3,11 @@
   "$schema": "http://json-schema.org/draft-06/schema#",
   "description": "Outputs of inference for the Text To Image task",
   "title": "TextToImageOutput",
-  "type": "array",
-  "items": {
-    "properties": {
-      "image": {
-        "description": "The generated image"
-      }
-    },
-    "required": ["image"]
-  }
+  "type": "object",
+  "properties": {
+    "image": {
+      "description": "The generated image"
+    }
+  },
+  "required": ["image"]
 }
packages/tasks/src/tasks/text-to-speech/inference.ts CHANGED
@@ -43,18 +43,18 @@ export interface GenerationParameters {
 	/**
 	 * Whether to use sampling instead of greedy decoding when generating new tokens.
 	 */
-	doSample?: boolean;
+	do_sample?: boolean;
 	/**
 	 * Controls the stopping condition for beam-based methods.
 	 */
-	earlyStopping?: EarlyStoppingUnion;
+	early_stopping?: EarlyStoppingUnion;
 	/**
 	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
 	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
 	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
 	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
 	 */
-	epsilonCutoff?: number;
+	epsilon_cutoff?: number;
 	/**
 	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
 	 * float strictly between 0 and 1, a token is only considered if it is greater than either
@@ -64,37 +64,37 @@ export interface GenerationParameters {
 	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
 	 * for more details.
 	 */
-	etaCutoff?: number;
+	eta_cutoff?: number;
 	/**
 	 * The maximum length (in tokens) of the generated text, including the input.
 	 */
-	maxLength?: number;
+	max_length?: number;
 	/**
 	 * The maximum number of tokens to generate. Takes precedence over maxLength.
 	 */
-	maxNewTokens?: number;
+	max_new_tokens?: number;
 	/**
 	 * The minimum length (in tokens) of the generated text, including the input.
 	 */
-	minLength?: number;
+	min_length?: number;
 	/**
 	 * The minimum number of tokens to generate. Takes precedence over maxLength.
 	 */
-	minNewTokens?: number;
+	min_new_tokens?: number;
 	/**
 	 * Number of groups to divide num_beams into in order to ensure diversity among different
 	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
 	 */
-	numBeamGroups?: number;
+	num_beam_groups?: number;
 	/**
 	 * Number of beams to use for beam search.
 	 */
-	numBeams?: number;
+	num_beams?: number;
 	/**
 	 * The value balances the model confidence and the degeneration penalty in contrastive
 	 * search decoding.
 	 */
-	penaltyAlpha?: number;
+	penalty_alpha?: number;
 	/**
 	 * The value used to modulate the next token probabilities.
 	 */
@@ -102,12 +102,12 @@ export interface GenerationParameters {
 	/**
 	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
 	 */
-	topK?: number;
+	top_k?: number;
 	/**
 	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
 	 * that add up to top_p or higher are kept for generation.
 	 */
-	topP?: number;
+	top_p?: number;
 	/**
 	 * Local typicality measures how similar the conditional probability of predicting a target
 	 * token next is to the expected conditional probability of predicting a random token next,
@@ -115,11 +115,11 @@ export interface GenerationParameters {
 	 * most locally typical tokens with probabilities that add up to typical_p or higher are
 	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
 	 */
-	typicalP?: number;
+	typical_p?: number;
 	/**
 	 * Whether the model should use the past last key/values attentions to speed up decoding
 	 */
-	useCache?: boolean;
+	use_cache?: boolean;
 	[property: string]: unknown;
 }
@@ -138,9 +138,10 @@ export interface TextToSpeechOutput {
 	 * The generated audio waveform.
 	 */
 	audio: unknown;
+	samplingRate: unknown;
 	/**
 	 * The sampling rate of the generated audio waveform.
 	 */
-	samplingRate: number;
+	sampling_rate?: number;
 	[property: string]: unknown;
 }
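Note that the regenerated TextToSpeechOutput keeps a required `samplingRate: unknown` member alongside the new optional `sampling_rate`, so defensive consumers can read both keys. A minimal, illustrative helper:

// Reads the sampling rate from either the new or the legacy key.
function readSamplingRate(output: Record<string, unknown>): number | undefined {
	const value = output["sampling_rate"] ?? output["samplingRate"];
	return typeof value === "number" ? value : undefined;
}

readSamplingRate({ audio: null, sampling_rate: 16000 }); // 16000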
packages/tasks/src/tasks/text2text-generation/inference.ts CHANGED
@@ -3,6 +3,7 @@
  *
  * Using src/scripts/inference-codegen
 */
+
 /**
  * Inputs for Text2text Generation inference
 */
@@ -17,6 +18,7 @@ export interface Text2TextGenerationInput {
 	parameters?: Text2TextGenerationParameters;
 	[property: string]: unknown;
 }
+
 /**
  * Additional inference parameters
  *
@@ -26,28 +28,28 @@ export interface Text2TextGenerationParameters {
 	/**
 	 * Whether to clean up the potential extra spaces in the text output.
 	 */
-	cleanUpTokenizationSpaces?: boolean;
+	clean_up_tokenization_spaces?: boolean;
 	/**
 	 * Additional parametrization of the text generation algorithm
 	 */
-	generateParameters?: {
-		[key: string]: unknown;
-	};
+	generate_parameters?: { [key: string]: unknown };
 	/**
 	 * The truncation strategy to use
 	 */
 	truncation?: Text2TextGenerationTruncationStrategy;
 	[property: string]: unknown;
 }
+
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
-export type Text2TextGenerationOutput = Text2TextGenerationOutputElement[];
+
 /**
  * Outputs of inference for the Text2text Generation task
 */
-export interface Text2TextGenerationOutputElement {
+export interface Text2TextGenerationOutput {
+	generatedText: unknown;
 	/**
 	 * The generated text.
 	 */
-	generatedText: string;
+	generated_text?: string;
 	[property: string]: unknown;
 }
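The renamed `generate_parameters` bag stays an open, untyped object that is handed through to the generation algorithm. A small sketch of the renamed parameters for this task; the keys inside `generate_parameters` are illustrative, since the spec leaves them untyped:

const parameters = {
	clean_up_tokenization_spaces: true,
	truncation: "longest_first" as const,
	generate_parameters: {
		num_beams: 4, // arbitrary passthrough knob; not typed by this spec
	} as { [key: string]: unknown },
};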
packages/tasks/src/tasks/text2text-generation/spec/input.json CHANGED
@@ -20,7 +20,7 @@
   "description": "Additional inference parameters for Text2text Generation",
   "type": "object",
   "properties": {
-    "cleanUpTokenizationSpaces": {
+    "clean_up_tokenization_spaces": {
       "type": "boolean",
       "description": "Whether to clean up the potential extra spaces in the text output."
     },
@@ -43,7 +43,7 @@
         }
       ]
     },
-    "generateParameters": {
+    "generate_parameters": {
       "title": "generateParameters",
       "type": "object",
       "description": "Additional parametrization of the text generation algorithm"
packages/tasks/src/tasks/text2text-generation/spec/output.json CHANGED
@@ -3,15 +3,12 @@
   "$schema": "http://json-schema.org/draft-06/schema#",
   "description": "Outputs of inference for the Text2text Generation task",
   "title": "Text2TextGenerationOutput",
-  "type": "array",
-  "items": {
-    "type": "object",
-    "properties": {
-      "generatedText": {
-        "type": "string",
-        "description": "The generated text."
-      }
-    },
-    "required": ["generatedText"]
-  }
+  "type": "object",
+  "properties": {
+    "generated_text": {
+      "type": "string",
+      "description": "The generated text."
+    }
+  },
+  "required": ["generatedText"]
 }
packages/tasks/src/tasks/token-classification/inference.ts CHANGED
@@ -26,11 +26,11 @@ export interface TokenClassificationParameters {
 	/**
 	 * The strategy used to fuse tokens based on model predictions
 	 */
-	aggregationStrategy?: TokenClassificationAggregationStrategy;
+	aggregation_strategy?: TokenClassificationAggregationStrategy;
 	/**
 	 * A list of labels to ignore
 	 */
-	ignoreLabels?: string[];
+	ignore_labels?: string[];
 	/**
 	 * The number of overlapping tokens between chunks when splitting the input text.
 	 */
@@ -64,7 +64,7 @@ export interface TokenClassificationOutputElement {
 	/**
 	 * The predicted label for that group of tokens
 	 */
-	entityGroup?: string;
+	entity_group?: string;
 	label: unknown;
 	/**
 	 * The associated score / probability
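Downstream code that iterates over entities needs the same rename. A short sketch of consuming the updated output element; the local interface mirrors only the fields shown in this diff, and the sample entity is invented:

// Local mirror of the relevant output fields from the diff above.
interface TokenClassificationOutputElement {
	entity_group?: string; // replaces the former camelCase `entityGroup`
	score: number;
	[property: string]: unknown;
}

const output: TokenClassificationOutputElement[] = [{ entity_group: "PER", score: 0.99 }];
for (const entity of output) {
	console.log(entity.entity_group, entity.score);
}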
packages/tasks/src/tasks/token-classification/spec/input.json CHANGED
@@ -20,7 +20,7 @@
   "description": "Additional inference parameters for Token Classification",
   "type": "object",
   "properties": {
-    "ignoreLabels": {
+    "ignore_labels": {
       "type": "array",
       "items": {
         "type": "string"
@@ -31,7 +31,7 @@
       "type": "integer",
       "description": "The number of overlapping tokens between chunks when splitting the input text."
     },
-    "aggregationStrategy": {
+    "aggregation_strategy": {
       "title": "TokenClassificationAggregationStrategy",
       "type": "string",
       "description": "The strategy used to fuse tokens based on model predictions",
packages/tasks/src/tasks/token-classification/spec/output.json CHANGED
@@ -7,7 +7,7 @@
   "items": {
     "type": "object",
     "properties": {
-      "entityGroup": {
+      "entity_group": {
         "type": "string",
         "description": "The predicted label for that group of tokens"
       },
packages/tasks/src/tasks/translation/inference.ts CHANGED
@@ -30,11 +30,11 @@ export interface Text2TextGenerationParameters {
 	/**
 	 * Whether to clean up the potential extra spaces in the text output.
 	 */
-	cleanUpTokenizationSpaces?: boolean;
+	clean_up_tokenization_spaces?: boolean;
 	/**
 	 * Additional parametrization of the text generation algorithm
 	 */
-	generateParameters?: { [key: string]: unknown };
+	generate_parameters?: { [key: string]: unknown };
 	/**
 	 * The truncation strategy to use
 	 */
@@ -50,9 +50,10 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
  * Outputs of inference for the Text2text Generation task
 */
 export interface TranslationOutput {
+	generatedText: unknown;
 	/**
 	 * The generated text.
 	 */
-	generatedText: string;
+	generated_text?: string;
 	[property: string]: unknown;
 }
packages/tasks/src/tasks/video-classification/inference.ts CHANGED
@@ -26,16 +26,16 @@ export interface VideoClassificationParameters {
 	/**
 	 * The sampling rate used to select frames from the video.
 	 */
-	frameSamplingRate?: number;
-	functionToApply?: ClassificationOutputTransform;
+	frame_sampling_rate?: number;
+	function_to_apply?: ClassificationOutputTransform;
 	/**
 	 * The number of sampled frames to consider for classification.
 	 */
-	numFrames?: number;
+	num_frames?: number;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
-	topK?: number;
+	top_k?: number;
 	[property: string]: unknown;
 }
 /**
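A minimal sketch of the renamed video-classification parameters; the field names come from the interface above, the values are illustrative only:

const parameters = {
	frame_sampling_rate: 4, // keep every fourth frame
	num_frames: 16, // number of sampled frames fed to the classifier
	top_k: 5, // return the five most probable classes
};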
packages/tasks/src/tasks/video-classification/spec/input.json CHANGED
@@ -19,19 +19,19 @@
   "description": "Additional inference parameters for Video Classification",
   "type": "object",
   "properties": {
-    "functionToApply": {
+    "function_to_apply": {
       "title": "TextClassificationOutputTransform",
       "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
     },
-    "numFrames": {
+    "num_frames": {
       "type": "integer",
       "description": "The number of sampled frames to consider for classification."
     },
-    "frameSamplingRate": {
+    "frame_sampling_rate": {
       "type": "integer",
       "description": "The sampling rate used to select frames from the video."
     },
-    "topK": {
+    "top_k": {
       "type": "integer",
       "description": "When specified, limits the output to the top K most probable classes."
     }
packages/tasks/src/tasks/visual-question-answering/inference.ts CHANGED
@@ -42,7 +42,7 @@ export interface VisualQuestionAnsweringParameters {
 	 * return less than topk answers if there are not enough options available within the
 	 * context.
 	 */
-	topK?: number;
+	top_k?: number;
 	[property: string]: unknown;
 }
 export type VisualQuestionAnsweringOutput = VisualQuestionAnsweringOutputElement[];