diff --git a/packages/tasks/package.json b/packages/tasks/package.json index 0917d797567e6f4a28ccfdbb5007f456cbf460f6..e61a09163d070149b1c06d3d13fd1332fa22c11e 100644 --- a/packages/tasks/package.json +++ b/packages/tasks/package.json @@ -24,9 +24,10 @@ "format": "prettier --write .", "format:check": "prettier --check .", "prepublishOnly": "pnpm run build", - "build": "tsup src/index.ts --format cjs,esm --clean --dts", + "build": "tsup src/index.ts src/scripts/**.ts --format cjs,esm --clean --dts", "prepare": "pnpm run build", - "check": "tsc" + "check": "tsc", + "inference-codegen": "pnpm run build && node dist/scripts/inference-codegen.js" }, "files": [ "dist", @@ -40,5 +41,8 @@ ], "author": "Hugging Face", "license": "MIT", - "devDependencies": {} + "devDependencies": { + "@types/node": "^20.11.5", + "quicktype-core": "https://github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz" + } } diff --git a/packages/tasks/pnpm-lock.yaml b/packages/tasks/pnpm-lock.yaml index 2b9f1883a1f45e12d36aec8a56f5aa1342d48e44..0ff78350fc6063a4c1d21b5b31a8931ed097a817 100644 --- a/packages/tasks/pnpm-lock.yaml +++ b/packages/tasks/pnpm-lock.yaml @@ -3,3 +3,212 @@ lockfileVersion: '6.0' settings: autoInstallPeers: true excludeLinksFromLockfile: false + +devDependencies: + '@types/node': + specifier: ^20.11.5 + version: 20.11.5 + quicktype-core: + specifier: https://github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz + version: '@github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz' + +packages: + + /@glideapps/ts-necessities@2.1.3: + resolution: {integrity: sha512-q9U8v/n9qbkd2zDYjuX3qtlbl+OIyI9zF+zQhZjfYOE9VMDH7tfcUSJ9p0lXoY3lxmGFne09yi4iiNeQUwV7AA==} + dev: true + + /@types/node@20.11.5: + resolution: {integrity: sha512-g557vgQjUUfN76MZAN/dt1z3dzcUsimuysco0KeluHgrPdJXkP/XdAURgyO2W9fZWHRtRBiVKzKn8vyOAwlG+w==} + dependencies: + undici-types: 
5.26.5 + dev: true + + /@types/urijs@1.19.25: + resolution: {integrity: sha512-XOfUup9r3Y06nFAZh3WvO0rBU4OtlfPB/vgxpjg+NRdGU6CN6djdc6OEiH+PcqHCY6eFLo9Ista73uarf4gnBg==} + dev: true + + /abort-controller@3.0.0: + resolution: {integrity: sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==} + engines: {node: '>=6.5'} + dependencies: + event-target-shim: 5.0.1 + dev: true + + /base64-js@1.5.1: + resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} + dev: true + + /browser-or-node@2.1.1: + resolution: {integrity: sha512-8CVjaLJGuSKMVTxJ2DpBl5XnlNDiT4cQFeuCJJrvJmts9YrTZDizTX7PjC2s6W4x+MBGZeEY6dGMrF04/6Hgqg==} + dev: true + + /buffer@6.0.3: + resolution: {integrity: sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==} + dependencies: + base64-js: 1.5.1 + ieee754: 1.2.1 + dev: true + + /collection-utils@1.0.1: + resolution: {integrity: sha512-LA2YTIlR7biSpXkKYwwuzGjwL5rjWEZVOSnvdUc7gObvWe4WkjxOpfrdhoP7Hs09YWDVfg0Mal9BpAqLfVEzQg==} + dev: true + + /cross-fetch@4.0.0: + resolution: {integrity: sha512-e4a5N8lVvuLgAWgnCrLr2PP0YyDOTHa9H/Rj54dirp61qXnNq46m82bRhNqIA5VccJtWBvPTFRV3TtvHUKPB1g==} + dependencies: + node-fetch: 2.7.0 + transitivePeerDependencies: + - encoding + dev: true + + /event-target-shim@5.0.1: + resolution: {integrity: sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==} + engines: {node: '>=6'} + dev: true + + /events@3.3.0: + resolution: {integrity: sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==} + engines: {node: '>=0.8.x'} + dev: true + + /ieee754@1.2.1: + resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==} + dev: true + + /is-url@1.2.4: + resolution: {integrity: 
sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==} + dev: true + + /js-base64@3.7.6: + resolution: {integrity: sha512-NPrWuHFxFUknr1KqJRDgUQPexQF0uIJWjeT+2KjEePhitQxQEx5EJBG1lVn5/hc8aLycTpXrDOgPQ6Zq+EDiTA==} + dev: true + + /lodash@4.17.21: + resolution: {integrity: sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==} + dev: true + + /node-fetch@2.7.0: + resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==} + engines: {node: 4.x || >=6.0.0} + peerDependencies: + encoding: ^0.1.0 + peerDependenciesMeta: + encoding: + optional: true + dependencies: + whatwg-url: 5.0.0 + dev: true + + /pako@0.2.9: + resolution: {integrity: sha512-NUcwaKxUxWrZLpDG+z/xZaCgQITkA/Dv4V/T6bw7VON6l1Xz/VnrBqrYjZQ12TamKHzITTfOEIYUj48y2KXImA==} + dev: true + + /pako@1.0.11: + resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==} + dev: true + + /pluralize@8.0.0: + resolution: {integrity: sha512-Nc3IT5yHzflTfbjgqWcCPpo7DaKy4FnpB0l/zCAW0Tc7jxAiuqSxHasntB3D7887LSrA93kDJ9IXovxJYxyLCA==} + engines: {node: '>=4'} + dev: true + + /process@0.11.10: + resolution: {integrity: sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A==} + engines: {node: '>= 0.6.0'} + dev: true + + /readable-stream@4.4.2: + resolution: {integrity: sha512-Lk/fICSyIhodxy1IDK2HazkeGjSmezAWX2egdtJnYhtzKEsBPJowlI6F6LPb5tqIQILrMbx22S5o3GuJavPusA==} + engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0} + dependencies: + abort-controller: 3.0.0 + buffer: 6.0.3 + events: 3.3.0 + process: 0.11.10 + string_decoder: 1.3.0 + dev: true + + /safe-buffer@5.2.1: + resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==} + dev: true + + /string_decoder@1.3.0: + resolution: {integrity: 
sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==} + dependencies: + safe-buffer: 5.2.1 + dev: true + + /tiny-inflate@1.0.3: + resolution: {integrity: sha512-pkY1fj1cKHb2seWDy0B16HeWyczlJA9/WW3u3c4z/NiWDsO3DOU5D7nhTLE9CF0yXv/QZFY7sEJmj24dK+Rrqw==} + dev: true + + /tr46@0.0.3: + resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==} + dev: true + + /undici-types@5.26.5: + resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==} + dev: true + + /unicode-properties@1.4.1: + resolution: {integrity: sha512-CLjCCLQ6UuMxWnbIylkisbRj31qxHPAurvena/0iwSVbQ2G1VY5/HjV0IRabOEbDHlzZlRdCrD4NhB0JtU40Pg==} + dependencies: + base64-js: 1.5.1 + unicode-trie: 2.0.0 + dev: true + + /unicode-trie@2.0.0: + resolution: {integrity: sha512-x7bc76x0bm4prf1VLg79uhAzKw8DVboClSN5VxJuQ+LKDOVEW9CdH+VY7SP+vX7xCYQqzzgQpFqz15zeLvAtZQ==} + dependencies: + pako: 0.2.9 + tiny-inflate: 1.0.3 + dev: true + + /urijs@1.19.11: + resolution: {integrity: sha512-HXgFDgDommxn5/bIv0cnQZsPhHDA90NPHD6+c/v21U5+Sx5hoP8+dP9IZXBU1gIfvdRfhG8cel9QNPeionfcCQ==} + dev: true + + /webidl-conversions@3.0.1: + resolution: {integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==} + dev: true + + /whatwg-url@5.0.0: + resolution: {integrity: sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==} + dependencies: + tr46: 0.0.3 + webidl-conversions: 3.0.1 + dev: true + + /wordwrap@1.0.0: + resolution: {integrity: sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q==} + dev: true + + /yaml@2.3.4: + resolution: {integrity: sha512-8aAvwVUSHpfEqTQ4w/KMlf3HcRdt50E5ODIQJBw1fQ5RL34xabzxtUlzTXVqc4rkZsPbvrXKWnABCD7kWSmocA==} + engines: {node: '>= 14'} + dev: true + + 
'@github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz': + resolution: {tarball: https://github.com/huggingface/quicktype/raw/pack-18.0.15/packages/quicktype-core/quicktype-core-18.0.15.tgz} + name: quicktype-core + version: 18.0.15 + dependencies: + '@glideapps/ts-necessities': 2.1.3 + '@types/urijs': 1.19.25 + browser-or-node: 2.1.1 + collection-utils: 1.0.1 + cross-fetch: 4.0.0 + is-url: 1.2.4 + js-base64: 3.7.6 + lodash: 4.17.21 + pako: 1.0.11 + pluralize: 8.0.0 + readable-stream: 4.4.2 + unicode-properties: 1.4.1 + urijs: 1.19.11 + wordwrap: 1.0.0 + yaml: 2.3.4 + transitivePeerDependencies: + - encoding + dev: true diff --git a/packages/tasks/src/scripts/inference-codegen.ts b/packages/tasks/src/scripts/inference-codegen.ts new file mode 100644 index 0000000000000000000000000000000000000000..02c8e30031cbf274532052738a6931228ed27f28 --- /dev/null +++ b/packages/tasks/src/scripts/inference-codegen.ts @@ -0,0 +1,192 @@ +import type { SerializedRenderResult } from "quicktype-core"; +import { quicktype, InputData, JSONSchemaInput, FetchingJSONSchemaStore } from "quicktype-core"; +import * as fs from "fs/promises"; +import { existsSync as pathExists } from "fs"; +import * as path from "path"; +import * as ts from "typescript"; + +const TYPESCRIPT_HEADER_FILE = ` +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ + +`; + +const rootDirFinder = function (): string { + const parts = __dirname.split("/"); + let level = parts.length - 1; + while (level > 0) { + const currentPath = parts.slice(0, level).join("/"); + if (pathExists(`${currentPath}/package.json`)) { + return path.normalize(currentPath); + } + level--; + } + return ""; +}; + +/** + * + * @param taskId The ID of the task for which we are generating code + * @param taskSpecDir The path to the directory where the input.json & output.json files are + * @param allSpecFiles An array of paths to all 
the tasks specs. Allows resolving cross-file references ($ref). + */ +async function buildInputData(taskId: string, taskSpecDir: string, allSpecFiles: string[]): Promise { + const schema = new JSONSchemaInput(new FetchingJSONSchemaStore(), [], allSpecFiles); + await schema.addSource({ + name: `${taskId}-input`, + schema: await fs.readFile(`${taskSpecDir}/input.json`, { encoding: "utf-8" }), + }); + await schema.addSource({ + name: `${taskId}-output`, + schema: await fs.readFile(`${taskSpecDir}/output.json`, { encoding: "utf-8" }), + }); + const inputData = new InputData(); + inputData.addInput(schema); + return inputData; +} + +async function generateTypescript(inputData: InputData): Promise { + return await quicktype({ + inputData, + lang: "typescript", + alphabetizeProperties: true, + rendererOptions: { + "just-types": true, + "nice-property-names": true, + "prefer-unions": true, + "prefer-const-values": true, + "prefer-unknown": true, + "explicit-unions": true, + }, + }); +} +/** + * quicktype is unable to generate "top-level array types" that are defined in the output spec: https://github.com/glideapps/quicktype/issues/2481 + * We have to use the TypeScript API to generate those types when required. 
+ * This hacky function: + * - looks for the generated interface for output types + * - renames it with a `Element` suffix + * - generates type alias in the form `export type = Element[]; + * + * And writes that to the `inference.ts` file + * + */ +async function postProcessOutput(path2generated: string, outputSpec: Record): Promise { + const source = ts.createSourceFile( + path.basename(path2generated), + await fs.readFile(path2generated, { encoding: "utf-8" }), + ts.ScriptTarget.ES2022 + ); + const exportedName = outputSpec.title; + if (outputSpec.type !== "array" || typeof exportedName !== "string") { + console.log(" Nothing to do"); + return; + } + const topLevelNodes = source.getChildAt(0).getChildren(); + const hasTypeAlias = topLevelNodes.some( + (node) => + node.kind === ts.SyntaxKind.TypeAliasDeclaration && + (node as ts.TypeAliasDeclaration).name.escapedText === exportedName + ); + if (hasTypeAlias) { + return; + } + + const interfaceDeclaration = topLevelNodes.find((node): node is ts.InterfaceDeclaration => { + if (node.kind === ts.SyntaxKind.InterfaceDeclaration) { + return (node as ts.InterfaceDeclaration).name.getText(source) === exportedName; + } + return false; + }); + if (!interfaceDeclaration) { + console.log(" Nothing to do"); + return; + } + + console.log(" Inserting top-level array type alias..."); + + const updatedInterface = ts.factory.updateInterfaceDeclaration( + interfaceDeclaration, + interfaceDeclaration.modifiers, + ts.factory.createIdentifier(interfaceDeclaration.name.getText(source) + "Element"), + interfaceDeclaration.typeParameters, + interfaceDeclaration.heritageClauses, + interfaceDeclaration.members + ); + const arrayDeclaration = ts.factory.createTypeAliasDeclaration( + [ts.factory.createModifier(ts.SyntaxKind.ExportKeyword)], + exportedName, + undefined, + ts.factory.createArrayTypeNode(ts.factory.createTypeReferenceNode(updatedInterface.name)) + ); + + const printer = ts.createPrinter(); + + const newNodes = 
ts.factory.createNodeArray([ + ...topLevelNodes.filter((node) => node !== interfaceDeclaration), + arrayDeclaration, + updatedInterface, + ]); + + await fs.writeFile(path2generated, printer.printList(ts.ListFormat.MultiLine, newNodes, source), { + flag: "w+", + encoding: "utf-8", + }); + + return; +} + +async function main() { + const rootDir = rootDirFinder(); + const tasksDir = path.join(rootDir, "src", "tasks"); + const allTasks = await Promise.all( + (await fs.readdir(tasksDir, { withFileTypes: true })) + .filter((entry) => entry.isDirectory()) + .filter((entry) => entry.name !== "placeholder") + .map(async (entry) => ({ task: entry.name, dirPath: path.join(entry.path, entry.name) })) + ); + const allSpecFiles = [ + path.join(tasksDir, "common-definitions.json"), + ...allTasks + .flatMap(({ dirPath }) => [path.join(dirPath, "spec", "input.json"), path.join(dirPath, "spec", "output.json")]) + .filter((filepath) => pathExists(filepath)), + ]; + + for (const { task, dirPath } of allTasks) { + const taskSpecDir = path.join(dirPath, "spec"); + if (!(pathExists(path.join(taskSpecDir, "input.json")) && pathExists(path.join(taskSpecDir, "output.json")))) { + console.debug(`No spec found for task ${task} - skipping`); + continue; + } + console.debug(`✨ Generating types for task`, task); + + console.debug(" 📦 Building input data"); + const inputData = await buildInputData(task, taskSpecDir, allSpecFiles); + + console.debug(" 🏭 Generating typescript code"); + { + const { lines } = await generateTypescript(inputData); + await fs.writeFile(`${dirPath}/inference.ts`, [TYPESCRIPT_HEADER_FILE, ...lines].join(`\n`), { + flag: "w+", + encoding: "utf-8", + }); + } + + const outputSpec = JSON.parse(await fs.readFile(`${taskSpecDir}/output.json`, { encoding: "utf-8" })); + + console.log(" 🩹 Post-processing the generated code"); + await postProcessOutput(`${dirPath}/inference.ts`, outputSpec); + } + console.debug("✅ All done!"); +} + +let exit = 0; +main() + .catch((err) => { + 
console.error("Failure", err); + exit = 1; + }) + .finally(() => process.exit(exit)); diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..ae37f29acf02093b61d66d85a6fba074002998bf --- /dev/null +++ b/packages/tasks/src/tasks/audio-classification/inference.ts @@ -0,0 +1,51 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Audio Classification inference + */ +export interface AudioClassificationInput { + /** + * The input audio data + */ + data: unknown; + /** + * Additional inference parameters + */ + parameters?: AudioClassificationParameters; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Audio Classification + */ +export interface AudioClassificationParameters { + functionToApply?: ClassificationOutputTransform; + /** + * When specified, limits the output to the top K most probable classes. + */ + topK?: number; + [property: string]: unknown; +} +/** + * The function to apply to the model outputs in order to retrieve the scores. + */ +export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none"; +export type AudioClassificationOutput = AudioClassificationOutputElement[]; +/** + * Outputs for Audio Classification inference + */ +export interface AudioClassificationOutputElement { + /** + * The predicted class label (model specific). + */ + label: string; + /** + * The corresponding probability. 
+ */ + score: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..cfd5a54a6f8bbfd78cd84aafc324b6717fb0708b --- /dev/null +++ b/packages/tasks/src/tasks/audio-classification/spec/input.json @@ -0,0 +1,34 @@ +{ + "$id": "/inference/schemas/audio-classification/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Audio Classification inference", + "title": "AudioClassificationInput", + "type": "object", + "properties": { + "data": { + "description": "The input audio data" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/AudioClassificationParameters" + } + }, + "$defs": { + "AudioClassificationParameters": { + "title": "AudioClassificationParameters", + "description": "Additional inference parameters for Audio Classification", + "type": "object", + "properties": { + "functionToApply": { + "title": "AudioClassificationOutputTransform", + "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform" + }, + "topK": { + "type": "integer", + "description": "When specified, limits the output to the top K most probable classes." 
+ } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/audio-classification/spec/output.json b/packages/tasks/src/tasks/audio-classification/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..83e7abe71d093e3df42ffff9a5c04c45162c547c --- /dev/null +++ b/packages/tasks/src/tasks/audio-classification/spec/output.json @@ -0,0 +1,21 @@ +{ + "$id": "/inference/schemas/audio-classification/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "title": "AudioClassificationOutput", + "description": "Outputs for Audio Classification inference", + "type": "array", + "items": { + "type": "object", + "properties": { + "label": { + "type": "string", + "description": "The predicted class label (model specific)." + }, + "score": { + "type": "number", + "description": "The corresponding probability." + } + }, + "required": ["label", "score"] + } +} diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..d9e2adc859a380bd85aa3d7cd5d9babaff5d0591 --- /dev/null +++ b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts @@ -0,0 +1,154 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Automatic Speech Recognition inference + */ +export interface AutomaticSpeechRecognitionInput { + /** + * The input audio data + */ + data: unknown; + /** + * Additional inference parameters + */ + parameters?: AutomaticSpeechRecognitionParameters; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Automatic Speech Recognition + */ +export interface AutomaticSpeechRecognitionParameters { + /** + * Parametrization of the text generation process + */ + generate?: GenerationParameters; + /** + * Whether 
to output corresponding timestamps with the generated text + */ + returnTimestamps?: boolean; + [property: string]: unknown; +} +/** + * Parametrization of the text generation process + * + * Ad-hoc parametrization of the text generation process + */ +export interface GenerationParameters { + /** + * Whether to use sampling instead of greedy decoding when generating new tokens. + */ + doSample?: boolean; + /** + * Controls the stopping condition for beam-based methods. + */ + earlyStopping?: EarlyStoppingUnion; + /** + * If set to float strictly between 0 and 1, only tokens with a conditional probability + * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from + * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language + * Model Desmoothing](https://hf.co/papers/2210.15191) for more details. + */ + epsilonCutoff?: number; + /** + * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to + * float strictly between 0 and 1, a token is only considered if it is greater than either + * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter + * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In + * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. + * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) + * for more details. + */ + etaCutoff?: number; + /** + * The maximum length (in tokens) of the generated text, including the input. + */ + maxLength?: number; + /** + * The maximum number of tokens to generate. Takes precedence over maxLength. + */ + maxNewTokens?: number; + /** + * The minimum length (in tokens) of the generated text, including the input. + */ + minLength?: number; + /** + * The minimum number of tokens to generate. Takes precedence over maxLength. 
+ */ + minNewTokens?: number; + /** + * Number of groups to divide num_beams into in order to ensure diversity among different + * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details. + */ + numBeamGroups?: number; + /** + * Number of beams to use for beam search. + */ + numBeams?: number; + /** + * The value balances the model confidence and the degeneration penalty in contrastive + * search decoding. + */ + penaltyAlpha?: number; + /** + * The value used to modulate the next token probabilities. + */ + temperature?: number; + /** + * The number of highest probability vocabulary tokens to keep for top-k-filtering. + */ + topK?: number; + /** + * If set to float < 1, only the smallest set of most probable tokens with probabilities + * that add up to top_p or higher are kept for generation. + */ + topP?: number; + /** + * Local typicality measures how similar the conditional probability of predicting a target + * token next is to the expected conditional probability of predicting a random token next, + * given the partial text already generated. If set to float < 1, the smallest set of the + * most locally typical tokens with probabilities that add up to typical_p or higher are + * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details. + */ + typicalP?: number; + /** + * Whether the model should use the past last key/values attentions to speed up decoding + */ + useCache?: boolean; + [property: string]: unknown; +} +/** + * Controls the stopping condition for beam-based methods. 
+ */ +export type EarlyStoppingUnion = boolean | "never"; +export interface AutomaticSpeechRecognitionOutputChunk { + /** + * A chunk of text identified by the model + */ + text: string; + /** + * The start and end timestamps corresponding with the text + */ + timestamps: number[]; + [property: string]: unknown; +} +export type AutomaticSpeechRecognitionOutput = AutomaticSpeechRecognitionOutputElement[]; +/** + * Outputs of inference for the Automatic Speech Recognition task + */ +export interface AutomaticSpeechRecognitionOutputElement { + /** + * When returnTimestamps is enabled, chunks contains a list of audio chunks identified by + * the model. + */ + chunks?: AutomaticSpeechRecognitionOutputChunk[]; + /** + * The recognized text. + */ + text: string; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..2d31957ed260375435a6daa937814f1877bad298 --- /dev/null +++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json @@ -0,0 +1,34 @@ +{ + "$id": "/inference/schemas/automatic-speech-recognition/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Automatic Speech Recognition inference", + "title": "AutomaticSpeechRecognitionInput", + "type": "object", + "properties": { + "data": { + "description": "The input audio data" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/AutomaticSpeechRecognitionParameters" + } + }, + "$defs": { + "AutomaticSpeechRecognitionParameters": { + "title": "AutomaticSpeechRecognitionParameters", + "description": "Additional inference parameters for Automatic Speech Recognition", + "type": "object", + "properties": { + "returnTimestamps": { + "type": "boolean", + "description": "Whether to output corresponding timestamps with the 
generated text" + }, + "generate": { + "description": "Parametrization of the text generation process", + "$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters" + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..217f210b15ecc22d654de2039a9f2a49848f75e1 --- /dev/null +++ b/packages/tasks/src/tasks/automatic-speech-recognition/spec/output.json @@ -0,0 +1,36 @@ +{ + "$id": "/inference/schemas/automatic-speech-recognition/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Automatic Speech Recognition task", + "title": "AutomaticSpeechRecognitionOutput", + "type": "array", + "items": { + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "The recognized text." 
+ }, + "chunks": { + "type": "array", + "description": "When returnTimestamps is enabled, chunks contains a list of audio chunks identified by the model.", + "items": { + "type": "object", + "title": "AutomaticSpeechRecognitionOutputChunk", + "properties": { + "text": { "type": "string", "description": "A chunk of text identified by the model" }, + "timestamps": { + "type": "array", + "description": "The start and end timestamps corresponding with the text", + "items": { "type": "number" }, + "minLength": 2, + "maxLength": 2 + } + }, + "required": ["text", "timestamps"] + } + } + }, + "required": ["text"] + } +} diff --git a/packages/tasks/src/tasks/common-definitions.json b/packages/tasks/src/tasks/common-definitions.json new file mode 100644 index 0000000000000000000000000000000000000000..6e0ec532d478d345979f3b2ff5f0644226ce1f4e --- /dev/null +++ b/packages/tasks/src/tasks/common-definitions.json @@ -0,0 +1,109 @@ +{ + "$id": "/inference/schemas/common-definitions.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "(Incomplete!) Common type definitions shared by several tasks", + "definitions": { + "ClassificationOutputTransform": { + "title": "ClassificationOutputTransform", + "type": "string", + "description": "The function to apply to the model outputs in order to retrieve the scores.", + "oneOf": [ + { + "const": "sigmoid" + }, + { + "const": "softmax" + }, + { + "const": "none" + } + ] + }, + "ClassificationOutput": { + "title": "ClassificationOutput", + "type": "object", + "properties": { + "label": { + "type": "string", + "description": "The predicted class label." + }, + "score": { + "type": "number", + "description": "The corresponding probability." 
+ } + }, + "required": ["label", "score"] + }, + "GenerationParameters": { + "title": "GenerationParameters", + "description": "Ad-hoc parametrization of the text generation process", + "type": "object", + "properties": { + "temperature": { + "type": "number", + "description": "The value used to modulate the next token probabilities." + }, + "topK": { + "type": "integer", + "description": "The number of highest probability vocabulary tokens to keep for top-k-filtering." + }, + "topP": { + "type": "number", + "description": "If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation." + }, + "typicalP": { + "type": "number", + "description": " Local typicality measures how similar the conditional probability of predicting a target token next is to the expected conditional probability of predicting a random token next, given the partial text already generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that add up to typical_p or higher are kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details." + }, + "epsilonCutoff": { + "type": "number", + "description": "If set to float strictly between 0 and 1, only tokens with a conditional probability greater than epsilon_cutoff will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details." + }, + "etaCutoff": { + "type": "number", + "description": "Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between 0 and 1, a token is only considered if it is greater than either eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). 
In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) for more details." + }, + "maxLength": { + "type": "integer", + "description": "The maximum length (in tokens) of the generated text, including the input." + }, + "maxNewTokens": { + "type": "integer", + "description": "The maximum number of tokens to generate. Takes precedence over maxLength." + }, + "minLength": { + "type": "integer", + "description": "The minimum length (in tokens) of the generated text, including the input." + }, + "minNewTokens": { + "type": "integer", + "description": "The minimum number of tokens to generate. Takes precedence over maxLength." + }, + "doSample": { + "type": "boolean", + "description": "Whether to use sampling instead of greedy decoding when generating new tokens." + }, + "earlyStopping": { + "description": "Controls the stopping condition for beam-based methods.", + "oneOf": [{ "type": "boolean" }, { "const": "never", "type": "string" }] + }, + "numBeams": { + "type": "integer", + "description": "Number of beams to use for beam search." + }, + "numBeamGroups": { + "type": "integer", + "description": "Number of groups to divide num_beams into in order to ensure diversity among different groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details." + }, + "penaltyAlpha": { + "type": "number", + "description": "The value balances the model confidence and the degeneration penalty in contrastive search decoding." 
+ }, + "useCache": { + "type": "boolean", + "description": "Whether the model should use the past last key/values attentions to speed up decoding" + } + } + } + } +} diff --git a/packages/tasks/src/tasks/depth-estimation/inference.ts b/packages/tasks/src/tasks/depth-estimation/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..ca831fdb411f8abb7ce8a00468ca61cd2c013757 --- /dev/null +++ b/packages/tasks/src/tasks/depth-estimation/inference.ts @@ -0,0 +1,35 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ + +export type DepthEstimationOutput = unknown[]; + +/** + * Inputs for Depth Estimation inference + */ +export interface DepthEstimationInput { + /** + * The input image data + */ + data: unknown; + /** + * Additional inference parameters + */ + parameters?: DepthEstimationParameters; + [property: string]: unknown; +} + +/** + * Additional inference parameters + * + * Additional inference parameters for Depth Estimation + */ +export interface DepthEstimationParameters { + /** + * When specified, limits the output to the top K most probable classes. 
+ */ + topK?: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/depth-estimation/spec/input.json b/packages/tasks/src/tasks/depth-estimation/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..3d58c82ff689932560a3655150d692808f59a807 --- /dev/null +++ b/packages/tasks/src/tasks/depth-estimation/spec/input.json @@ -0,0 +1,30 @@ +{ + "$id": "/inference/schemas/depth-estimation/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Depth Estimation inference", + "title": "DepthEstimationInput", + "type": "object", + "properties": { + "data": { + "description": "The input image data" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/DepthEstimationParameters" + } + }, + "$defs": { + "DepthEstimationParameters": { + "title": "DepthEstimationParameters", + "description": "Additional inference parameters for Depth Estimation", + "type": "object", + "properties": { + "topK": { + "type": "integer", + "description": "When specified, limits the output to the top K most probable classes." 
+ } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/depth-estimation/spec/output.json b/packages/tasks/src/tasks/depth-estimation/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..72d6a714dd8e535715e40c534e89e6f600fd5e99 --- /dev/null +++ b/packages/tasks/src/tasks/depth-estimation/spec/output.json @@ -0,0 +1,10 @@ +{ + "$id": "/inference/schemas/depth-estimation/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Depth Estimation task", + "title": "DepthEstimationOutput", + "type": "array", + "items": { + "description": "The output depth labels" + } +} diff --git a/packages/tasks/src/tasks/document-question-answering/inference.ts b/packages/tasks/src/tasks/document-question-answering/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..cd2ab54051771d56db62d866a6b757d91d7ea6be --- /dev/null +++ b/packages/tasks/src/tasks/document-question-answering/inference.ts @@ -0,0 +1,102 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Document Question Answering inference + */ +export interface DocumentQuestionAnsweringInput { + /** + * One (document, question) pair to answer + */ + data: DocumentQuestionAnsweringInputData; + /** + * Additional inference parameters + */ + parameters?: DocumentQuestionAnsweringParameters; + [property: string]: unknown; +} +/** + * One (document, question) pair to answer + */ +export interface DocumentQuestionAnsweringInputData { + /** + * The image on which the question is asked + */ + image: unknown; + /** + * A question to ask of the document + */ + question: string; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Document Question Answering + */ +export interface DocumentQuestionAnsweringParameters { + /** + * If the words 
in the document are too long to fit with the question for the model, it will + * be split in several chunks with some overlap. This argument controls the size of that + * overlap. + */ + docStride?: number; + /** + * Whether to accept impossible as an answer + */ + handleImpossibleAnswer?: boolean; + /** + * Language to use while running OCR. Defaults to english. + */ + lang?: string; + /** + * The maximum length of predicted answers (e.g., only answers with a shorter length are + * considered). + */ + maxAnswerLen?: number; + /** + * The maximum length of the question after tokenization. It will be truncated if needed. + */ + maxQuestionLen?: number; + /** + * The maximum length of the total sentence (context + question) in tokens of each chunk + * passed to the model. The context will be split in several chunks (using doc_stride as + * overlap) if needed. + */ + maxSeqLen?: number; + /** + * The number of answers to return (will be chosen by order of likelihood). Can return less + * than top_k answers if there are not enough options available within the context. + */ + topK?: number; + /** + * A list of words and bounding boxes (normalized 0->1000). If provided, the inference will + * skip the OCR step and use the provided bounding boxes instead. + */ + wordBoxes?: WordBox[]; + [property: string]: unknown; +} +export type WordBox = number[] | string; +export type DocumentQuestionAnsweringOutput = DocumentQuestionAnsweringOutputElement[]; +/** + * Outputs of inference for the Document Question Answering task + */ +export interface DocumentQuestionAnsweringOutputElement { + /** + * The answer to the question. + */ + answer: string; + end: number; + /** + * The probability associated to the answer. 
+ */ + score: number; + start: number; + /** + * The index of each word/box pair that is in the answer + */ + words: number[]; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/document-question-answering/spec/input.json b/packages/tasks/src/tasks/document-question-answering/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..a607735e749af26778c8e8e41d31b10d865fc3d7 --- /dev/null +++ b/packages/tasks/src/tasks/document-question-answering/spec/input.json @@ -0,0 +1,85 @@ +{ + "$id": "/inference/schemas/document-question-answering/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Document Question Answering inference", + "title": "DocumentQuestionAnsweringInput", + "type": "object", + "properties": { + "data": { + "description": "One (document, question) pair to answer", + "type": "object", + "title": "DocumentQuestionAnsweringInputData", + "properties": { + "image": { + "description": "The image on which the question is asked" + }, + "question": { + "type": "string", + "description": "A question to ask of the document" + } + }, + "required": ["image", "question"] + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/DocumentQuestionAnsweringParameters" + } + }, + "$defs": { + "DocumentQuestionAnsweringParameters": { + "title": "DocumentQuestionAnsweringParameters", + "description": "Additional inference parameters for Document Question Answering", + "type": "object", + "properties": { + "docStride": { + "type": "integer", + "description": "If the words in the document are too long to fit with the question for the model, it will be split in several chunks with some overlap. This argument controls the size of that overlap." + }, + "handleImpossibleAnswer": { + "type": "boolean", + "description": "Whether to accept impossible as an answer" + }, + "lang": { + "type": "string", + "description": "Language to use while running OCR. 
Defaults to english." + }, + "maxAnswerLen": { + "type": "integer", + "description": "The maximum length of predicted answers (e.g., only answers with a shorter length are considered)." + }, + "maxSeqLen": { + "type": "integer", + "description": "The maximum length of the total sentence (context + question) in tokens of each chunk passed to the model. The context will be split in several chunks (using doc_stride as overlap) if needed." + }, + "maxQuestionLen": { + "type": "integer", + "description": "The maximum length of the question after tokenization. It will be truncated if needed." + }, + "topK": { + "type": "integer", + "description": "The number of answers to return (will be chosen by order of likelihood). Can return less than top_k answers if there are not enough options available within the context." + }, + "wordBoxes": { + "type": "array", + "description": "A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR step and use the provided bounding boxes instead.", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "number" + }, + "maxItems": 4, + "minItems": 4 + } + ] + } + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/document-question-answering/spec/output.json b/packages/tasks/src/tasks/document-question-answering/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..9f69584ae89696ca2d94b55ae60029ae868b8fb6 --- /dev/null +++ b/packages/tasks/src/tasks/document-question-answering/spec/output.json @@ -0,0 +1,36 @@ +{ + "$id": "/inference/schemas/document-question-answering/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Document Question Answering task", + "title": "DocumentQuestionAnsweringOutput", + "type": "array", + "items": { + "type": "object", + "properties": { + "answer": { + "type": "string", + "description": "The 
answer to the question." + }, + "score": { + "type": "number", + "description": "The probability associated to the answer." + }, + "start": { + "type": "integer", + "description": "The start word index of the answer (in the OCR’d version of the input or provided word boxes)." + }, + "end": { + "type": "integer", + "description": "The end word index of the answer (in the OCR’d version of the input or provided word boxes)." + }, + "words": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "The index of each word/box pair that is in the answer" + } + }, + "required": ["answer", "score", "start", "end", "words"] + } +} diff --git a/packages/tasks/src/tasks/feature-extraction/inference.ts b/packages/tasks/src/tasks/feature-extraction/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..22dc8dd1d925dac35620e4e4b5f7e90add421f2e --- /dev/null +++ b/packages/tasks/src/tasks/feature-extraction/inference.ts @@ -0,0 +1,22 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ + +export type FeatureExtractionOutput = unknown[]; + +/** + * Inputs for Text Embedding inference + */ +export interface FeatureExtractionInput { + /** + * The text to get the embeddings of + */ + data: string; + /** + * Additional inference parameters + */ + parameters?: { [key: string]: unknown }; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/feature-extraction/spec/input.json b/packages/tasks/src/tasks/feature-extraction/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..a61455f6ca13f1484efd1ed8024c98035693d824 --- /dev/null +++ b/packages/tasks/src/tasks/feature-extraction/spec/input.json @@ -0,0 +1,26 @@ +{ + "$id": "/inference/schemas/feature-extraction/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Text Embedding inference", + "title": "FeatureExtractionInput", + 
"type": "object", + "properties": { + "data": { + "description": "The text to get the embeddings of", + "type": "string" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/FeatureExtractionParameters" + } + }, + "$defs": { + "FeatureExtractionParameters": { + "title": "FeatureExtractionParameters", + "description": "Additional inference parameters for Feature Extraction", + "type": "object", + "properties": {} + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/feature-extraction/spec/output.json b/packages/tasks/src/tasks/feature-extraction/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..b51788daaf73d93fac51a116f5da01bf6c001433 --- /dev/null +++ b/packages/tasks/src/tasks/feature-extraction/spec/output.json @@ -0,0 +1,7 @@ +{ + "$id": "/inference/schemas/feature-extraction/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "The embedding for the input text, as a nested list (tensor) of floats", + "type": "array", + "title": "FeatureExtractionOutput" +} diff --git a/packages/tasks/src/tasks/fill-mask/inference.ts b/packages/tasks/src/tasks/fill-mask/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..b80383da64846f26c1ab802ab7f5bd8f072bbc47 --- /dev/null +++ b/packages/tasks/src/tasks/fill-mask/inference.ts @@ -0,0 +1,61 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Fill Mask inference + */ +export interface FillMaskInput { + /** + * The text with masked tokens + */ + data: string; + /** + * Additional inference parameters + */ + parameters?: FillMaskParameters; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Fill Mask + */ +export interface FillMaskParameters { + /** + * When passed, the model will limit the scores to the passed 
targets instead of looking up + * in the whole vocabulary. If the provided targets are not in the model vocab, they will be + * tokenized and the first resulting token will be used (with a warning, and that might be + * slower). + */ + targets?: string[]; + /** + * When passed, overrides the number of predictions to return. + */ + topK?: number; + [property: string]: unknown; +} +export type FillMaskOutput = FillMaskOutputElement[]; +/** + * Outputs of inference for the Fill Mask task + */ +export interface FillMaskOutputElement { + /** + * The corresponding probability + */ + score: number; + /** + * The corresponding input with the mask token prediction. + */ + sequence: string; + /** + * The predicted token id (to replace the masked one). + */ + token: number; + /** + * The predicted token (to replace the masked one). + */ + tokenStr: string; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/fill-mask/spec/input.json b/packages/tasks/src/tasks/fill-mask/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..00def602ef7afc3cd75cc63351906431f0e47f2a --- /dev/null +++ b/packages/tasks/src/tasks/fill-mask/spec/input.json @@ -0,0 +1,38 @@ +{ + "$id": "/inference/schemas/fill-mask/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Fill Mask inference", + "title": "FillMaskInput", + "type": "object", + "properties": { + "data": { + "description": "The text with masked tokens", + "type": "string" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/FillMaskParameters" + } + }, + "$defs": { + "FillMaskParameters": { + "title": "FillMaskParameters", + "description": "Additional inference parameters for Fill Mask", + "type": "object", + "properties": { + "topK": { + "type": "integer", + "description": "When passed, overrides the number of predictions to return." 
+ }, + "targets": { + "description": "When passed, the model will limit the scores to the passed targets instead of looking up in the whole vocabulary. If the provided targets are not in the model vocab, they will be tokenized and the first resulting token will be used (with a warning, and that might be slower).", + "type": "array", + "items": { + "type": "string" + } + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/fill-mask/spec/output.json b/packages/tasks/src/tasks/fill-mask/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..f8e91aeeaa0871f7f498fd627608ecc80f687f68 --- /dev/null +++ b/packages/tasks/src/tasks/fill-mask/spec/output.json @@ -0,0 +1,29 @@ +{ + "$id": "/inference/schemas/fill-mask/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Fill Mask task", + "title": "FillMaskOutput", + "type": "array", + "items": { + "type": "object", + "properties": { + "sequence": { + "type": "string", + "description": "The corresponding input with the mask token prediction." + }, + "score": { + "type": "number", + "description": "The corresponding probability" + }, + "token": { + "type": "integer", + "description": "The predicted token id (to replace the masked one)." + }, + "tokenStr": { + "type": "string", + "description": "The predicted token (to replace the masked one)." 
+ } + }, + "required": ["sequence", "score", "token", "tokenStr"] + } +} diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..7138a50735ece63d7fea100fbdeb9b7d5ae95625 --- /dev/null +++ b/packages/tasks/src/tasks/image-classification/inference.ts @@ -0,0 +1,51 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Image Classification inference + */ +export interface ImageClassificationInput { + /** + * The input image data + */ + data: unknown; + /** + * Additional inference parameters + */ + parameters?: ImageClassificationParameters; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Image Classification + */ +export interface ImageClassificationParameters { + functionToApply?: ClassificationOutputTransform; + /** + * When specified, limits the output to the top K most probable classes. + */ + topK?: number; + [property: string]: unknown; +} +/** + * The function to apply to the model outputs in order to retrieve the scores. + */ +export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none"; +export type ImageClassificationOutput = ImageClassificationOutputElement[]; +/** + * Outputs of inference for the Image Classification task + */ +export interface ImageClassificationOutputElement { + /** + * The predicted class label. + */ + label: string; + /** + * The corresponding probability. 
+ */ + score: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..362c0d517167e8a7676b9fee4cf5c3b4d338c129 --- /dev/null +++ b/packages/tasks/src/tasks/image-classification/spec/input.json @@ -0,0 +1,34 @@ +{ + "$id": "/inference/schemas/image-classification/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Image Classification inference", + "title": "ImageClassificationInput", + "type": "object", + "properties": { + "data": { + "description": "The input image data" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/ImageClassificationParameters" + } + }, + "$defs": { + "ImageClassificationParameters": { + "title": "ImageClassificationParameters", + "description": "Additional inference parameters for Image Classification", + "type": "object", + "properties": { + "functionToApply": { + "title": "ImageClassificationOutputTransform", + "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform" + }, + "topK": { + "type": "integer", + "description": "When specified, limits the output to the top K most probable classes." 
+ } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/image-classification/spec/output.json b/packages/tasks/src/tasks/image-classification/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..2a3264bce7511f590175cd3e3ecc0af7ffe84d14 --- /dev/null +++ b/packages/tasks/src/tasks/image-classification/spec/output.json @@ -0,0 +1,10 @@ +{ + "$id": "/inference/schemas/image-classification/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Image Classification task", + "title": "ImageClassificationOutput", + "type": "array", + "items": { + "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput" + } +} diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..b316715f54c1a8aa746c0287295c70d53c80c102 --- /dev/null +++ b/packages/tasks/src/tasks/image-segmentation/inference.ts @@ -0,0 +1,65 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Image Segmentation inference + */ +export interface ImageSegmentationInput { + /** + * The input image data + */ + data: unknown; + /** + * Additional inference parameters + */ + parameters?: ImageSegmentationParameters; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Image Segmentation + */ +export interface ImageSegmentationParameters { + /** + * Threshold to use when turning the predicted masks into binary values. + */ + maskThreshold?: number; + /** + * Mask overlap threshold to eliminate small, disconnected segments. + */ + overlapMaskAreaThreshold?: number; + /** + * Segmentation task to be performed, depending on model capabilities. 
+ */ + subtask?: ImageSegmentationSubtask; + /** + * Probability threshold to filter out predicted masks. + */ + threshold?: number; + [property: string]: unknown; +} +export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic"; +export type ImageSegmentationOutput = ImageSegmentationOutputElement[]; +/** + * Outputs of inference for the Image Segmentation task + * + * A predicted mask / segment + */ +export interface ImageSegmentationOutputElement { + /** + * The label of the predicted segment + */ + label: string; + /** + * The corresponding mask as a black-and-white image + */ + mask: unknown; + /** + * The score or confidence degree the model has + */ + score?: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..ae4adc70e902c266f0ef206da2063fab7a71eb87 --- /dev/null +++ b/packages/tasks/src/tasks/image-segmentation/spec/input.json @@ -0,0 +1,54 @@ +{ + "$id": "/inference/schemas/image-segmentation/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Image Segmentation inference", + "title": "ImageSegmentationInput", + "type": "object", + "properties": { + "data": { + "description": "The input image data" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/ImageSegmentationParameters" + } + }, + "$defs": { + "ImageSegmentationParameters": { + "title": "ImageSegmentationParameters", + "description": "Additional inference parameters for Image Segmentation", + "type": "object", + "properties": { + "maskThreshold": { + "type": "number", + "description": "Threshold to use when turning the predicted masks into binary values." + }, + "overlapMaskAreaThreshold": { + "type": "number", + "description": "Mask overlap threshold to eliminate small, disconnected segments." 
+ }, + "subtask": { + "title": "ImageSegmentationSubtask", + "type": "string", + "description": "Segmentation task to be performed, depending on model capabilities.", + "oneOf": [ + { + "const": "instance" + }, + { + "const": "panoptic" + }, + { + "const": "semantic" + } + ] + }, + "threshold": { + "type": "number", + "description": "Probability threshold to filter out predicted masks." + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/image-segmentation/spec/output.json b/packages/tasks/src/tasks/image-segmentation/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..b20aa415e058fd1b7f75f2915765cd0e89483075 --- /dev/null +++ b/packages/tasks/src/tasks/image-segmentation/spec/output.json @@ -0,0 +1,25 @@ +{ + "$id": "/inference/schemas/image-segmentation/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Image Segmentation task", + "title": "ImageSegmentationOutput", + "type": "array", + "items": { + "description": "A predicted mask / segment", + "type": "object", + "properties": { + "label": { + "type": "string", + "description": "The label of the predicted segment" + }, + "mask": { + "description": "The corresponding mask as a black-and-white image" + }, + "score": { + "type": "number", + "description": "The score or confidence degree the model has" + } + }, + "required": ["label", "mask"] + } +} diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..96a532b25277e4d72ce5977a16f423a42c2a49da --- /dev/null +++ b/packages/tasks/src/tasks/image-to-image/inference.ts @@ -0,0 +1,67 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ + +/** + * Inputs for Image To Image inference + */ +export interface ImageToImageInput { + /** + * The 
input image data + */ + data: unknown; + /** + * Additional inference parameters + */ + parameters?: ImageToImageParameters; + [property: string]: unknown; +} + +/** + * Additional inference parameters + * + * Additional inference parameters for Image To Image + */ +export interface ImageToImageParameters { + /** + * For diffusion models. A higher guidance scale value encourages the model to generate + * images closely linked to the text prompt at the expense of lower image quality. + */ + guidanceScale?: number; + /** + * One or several prompt to guide what NOT to include in image generation. + */ + negativePrompt?: string[]; + /** + * For diffusion models. The number of denoising steps. More denoising steps usually lead to + * a higher quality image at the expense of slower inference. + */ + numInferenceSteps?: number; + /** + * The size in pixel of the output image + */ + targetSize?: TargetSize; + [property: string]: unknown; +} + +/** + * The size in pixel of the output image + */ +export interface TargetSize { + height: number; + width: number; + [property: string]: unknown; +} + +/** + * Outputs of inference for the Image To Image task + */ +export interface ImageToImageOutput { + /** + * The output image + */ + image?: unknown; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..11d4bee8af287737b4e71c75582ee86ebd95d9f6 --- /dev/null +++ b/packages/tasks/src/tasks/image-to-image/spec/input.json @@ -0,0 +1,52 @@ +{ + "$id": "/inference/schemas/image-to-image/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Image To Image inference", + "title": "ImageToImageInput", + "type": "object", + "properties": { + "data": { + "description": "The input image data" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": 
"#/$defs/ImageToImageParameters" + } + }, + "$defs": { + "ImageToImageParameters": { + "title": "ImageToImageParameters", + "description": "Additional inference parameters for Image To Image", + "type": "object", + "properties": { + "guidanceScale": { + "type": "number", + "description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality." + }, + "negativePrompt": { + "type": "array", + "items": { "type": "string" }, + "description": "One or several prompt to guide what NOT to include in image generation." + }, + "numInferenceSteps": { + "type": "integer", + "description": "For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference." + }, + "targetSize": { + "type": "object", + "description": "The size in pixel of the output image", + "properties": { + "width": { + "type": "integer" + }, + "height": { + "type": "integer" + } + }, + "required": ["width", "height"] + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/image-to-image/spec/output.json b/packages/tasks/src/tasks/image-to-image/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..af4eff804604607f75ac96d1dc6ed6bcc1eb953c --- /dev/null +++ b/packages/tasks/src/tasks/image-to-image/spec/output.json @@ -0,0 +1,12 @@ +{ + "$id": "/inference/schemas/image-to-image/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Image To Image task", + "title": "ImageToImageOutput", + "type": "object", + "properties": { + "image": { + "description": "The output image" + } + } +} diff --git a/packages/tasks/src/tasks/image-to-text/inference.ts b/packages/tasks/src/tasks/image-to-text/inference.ts new file mode 100644 index 
0000000000000000000000000000000000000000..19bb147e2d14e0e7c79cfd4d905cf7fcd2fea51d --- /dev/null +++ b/packages/tasks/src/tasks/image-to-text/inference.ts @@ -0,0 +1,138 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Image To Text inference + */ +export interface ImageToTextInput { + /** + * The input image data + */ + data: unknown; + /** + * Additional inference parameters + */ + parameters?: ImageToTextParameters; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Image To Text + */ +export interface ImageToTextParameters { + /** + * Parametrization of the text generation process + */ + generate?: GenerationParameters; + /** + * The amount of maximum tokens to generate. + */ + maxNewTokens?: number; + [property: string]: unknown; +} +/** + * Parametrization of the text generation process + * + * Ad-hoc parametrization of the text generation process + */ +export interface GenerationParameters { + /** + * Whether to use sampling instead of greedy decoding when generating new tokens. + */ + doSample?: boolean; + /** + * Controls the stopping condition for beam-based methods. + */ + earlyStopping?: EarlyStoppingUnion; + /** + * If set to float strictly between 0 and 1, only tokens with a conditional probability + * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from + * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language + * Model Desmoothing](https://hf.co/papers/2210.15191) for more details. + */ + epsilonCutoff?: number; + /** + * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to + * float strictly between 0 and 1, a token is only considered if it is greater than either + * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). 
The latter + * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In + * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. + * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) + * for more details. + */ + etaCutoff?: number; + /** + * The maximum length (in tokens) of the generated text, including the input. + */ + maxLength?: number; + /** + * The maximum number of tokens to generate. Takes precedence over maxLength. + */ + maxNewTokens?: number; + /** + * The minimum length (in tokens) of the generated text, including the input. + */ + minLength?: number; + /** + * The minimum number of tokens to generate. Takes precedence over maxLength. + */ + minNewTokens?: number; + /** + * Number of groups to divide num_beams into in order to ensure diversity among different + * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details. + */ + numBeamGroups?: number; + /** + * Number of beams to use for beam search. + */ + numBeams?: number; + /** + * The value balances the model confidence and the degeneration penalty in contrastive + * search decoding. + */ + penaltyAlpha?: number; + /** + * The value used to modulate the next token probabilities. + */ + temperature?: number; + /** + * The number of highest probability vocabulary tokens to keep for top-k-filtering. + */ + topK?: number; + /** + * If set to float < 1, only the smallest set of most probable tokens with probabilities + * that add up to top_p or higher are kept for generation. + */ + topP?: number; + /** + * Local typicality measures how similar the conditional probability of predicting a target + * token next is to the expected conditional probability of predicting a random token next, + * given the partial text already generated. 
If set to float < 1, the smallest set of the + * most locally typical tokens with probabilities that add up to typical_p or higher are + * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details. + */ + typicalP?: number; + /** + * Whether the model should use the past last key/values attentions to speed up decoding + */ + useCache?: boolean; + [property: string]: unknown; +} +/** + * Controls the stopping condition for beam-based methods. + */ +export type EarlyStoppingUnion = boolean | "never"; +export type ImageToTextOutput = ImageToTextOutputElement[]; +/** + * Outputs of inference for the Image To Text task + */ +export interface ImageToTextOutputElement { + /** + * The generated text. + */ + generatedText: string; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/image-to-text/spec/input.json b/packages/tasks/src/tasks/image-to-text/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..0ef8ba1dc5fcde18fc86069887b54bba870c6f23 --- /dev/null +++ b/packages/tasks/src/tasks/image-to-text/spec/input.json @@ -0,0 +1,34 @@ +{ + "$id": "/inference/schemas/image-to-text/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Image To Text inference", + "title": "ImageToTextInput", + "type": "object", + "properties": { + "data": { + "description": "The input image data" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/ImageToTextParameters" + } + }, + "$defs": { + "ImageToTextParameters": { + "title": "ImageToTextParameters", + "description": "Additional inference parameters for Image To Text", + "type": "object", + "properties": { + "maxNewTokens": { + "type": "integer", + "description": "The amount of maximum tokens to generate." 
+ }, + "generate": { + "description": "Parametrization of the text generation process", + "$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters" + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/image-to-text/spec/output.json b/packages/tasks/src/tasks/image-to-text/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..e3283e34f7c71a3165e4bce52e9c5d51ccf7f810 --- /dev/null +++ b/packages/tasks/src/tasks/image-to-text/spec/output.json @@ -0,0 +1,17 @@ +{ + "$id": "/inference/schemas/image-to-text/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Image To Text task", + "title": "ImageToTextOutput", + "type": "array", + "items": { + "type": "object", + "properties": { + "generatedText": { + "type": "string", + "description": "The generated text." + } + }, + "required": ["generatedText"] + } +} diff --git a/packages/tasks/src/tasks/index.ts b/packages/tasks/src/tasks/index.ts index b0615dfeb34553d3e031a90ab1fd071904daadbb..9e94253763793fe0dd8131183152ea57a8e6078f 100644 --- a/packages/tasks/src/tasks/index.ts +++ b/packages/tasks/src/tasks/index.ts @@ -216,6 +216,7 @@ export interface TaskData { datasets: ExampleRepo[]; demo: TaskDemo; id: PipelineType; + canonicalId?: PipelineType; isPlaceholder?: boolean; label: string; libraries: ModelLibraryKey[]; diff --git a/packages/tasks/src/tasks/object-detection/inference.ts b/packages/tasks/src/tasks/object-detection/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..9650c781e072dcc73234d2e21d86dc82b75f5e2c --- /dev/null +++ b/packages/tasks/src/tasks/object-detection/inference.ts @@ -0,0 +1,62 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Object Detection inference + */ +export interface ObjectDetectionInput { + /** + * The input image data 
+ */ + data: unknown; + /** + * Additional inference parameters + */ + parameters?: ObjectDetectionParameters; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Object Detection + */ +export interface ObjectDetectionParameters { + /** + * The probability necessary to make a prediction. + */ + threshold?: number; + [property: string]: unknown; +} +/** + * The predicted bounding box. Coordinates are relative to the top left corner of the input + * image. + */ +export interface BoundingBox { + xmax: number; + xmin: number; + ymax: number; + ymin: number; + [property: string]: unknown; +} +export type ObjectDetectionOutput = ObjectDetectionOutputElement[]; +/** + * Outputs of inference for the Object Detection task + */ +export interface ObjectDetectionOutputElement { + /** + * The predicted bounding box. Coordinates are relative to the top left corner of the input + * image. + */ + box: BoundingBox; + /** + * The predicted label for the bounding box + */ + label: string; + /** + * The associated score / probability + */ + score: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/object-detection/spec/input.json b/packages/tasks/src/tasks/object-detection/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..e01ebf496542ba2e1f45cd8bf25f580cd8d882c4 --- /dev/null +++ b/packages/tasks/src/tasks/object-detection/spec/input.json @@ -0,0 +1,30 @@ +{ + "$id": "/inference/schemas/object-detection/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Object Detection inference", + "title": "ObjectDetectionInput", + "type": "object", + "properties": { + "data": { + "description": "The input image data" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/ObjectDetectionParameters" + } + }, + "$defs": { + "ObjectDetectionParameters": { + "title": 
"ObjectDetectionParameters", + "description": "Additional inference parameters for Object Detection", + "type": "object", + "properties": { + "threshold": { + "type": "number", + "description": "The probability necessary to make a prediction." + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/object-detection/spec/output.json b/packages/tasks/src/tasks/object-detection/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..20c92d5d30b219f4ac1117874ea0020d59e4a822 --- /dev/null +++ b/packages/tasks/src/tasks/object-detection/spec/output.json @@ -0,0 +1,46 @@ +{ + "$id": "/inference/schemas/object-detection/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Object Detection task", + "title": "ObjectDetectionOutput", + "type": "array", + "items": { + "type": "object", + "properties": { + "label": { + "type": "string", + "description": "The predicted label for the bounding box" + }, + "score": { + "type": "number", + "description": "The associated score / probability" + }, + "box": { + "$ref": "#/$defs/BoundingBox", + "description": "The predicted bounding box. Coordinates are relative to the top left corner of the input image." 
+ } + }, + "required": ["box", "label", "score"] + }, + "$defs": { + "BoundingBox": { + "type": "object", + "title": "BoundingBox", + "properties": { + "xmin": { + "type": "integer" + }, + "xmax": { + "type": "integer" + }, + "ymin": { + "type": "integer" + }, + "ymax": { + "type": "integer" + } + }, + "required": ["xmin", "xmax", "ymin", "ymax"] + } + } +} diff --git a/packages/tasks/src/tasks/placeholder/data.ts b/packages/tasks/src/tasks/placeholder/data.ts index 0cbc735ad92a29ad51e6db61be1dae02a8bb1dba..110b43703e5e9865db1551985ad588fa8cb7ce04 100644 --- a/packages/tasks/src/tasks/placeholder/data.ts +++ b/packages/tasks/src/tasks/placeholder/data.ts @@ -13,6 +13,9 @@ const taskData: TaskDataCustom = { summary: "", widgetModels: [], youtubeId: undefined, + /// If this is a subtask, link to the most general task ID + /// (eg, text2text-generation is the canonical ID of translation) + canonicalId: undefined, }; export default taskData; diff --git a/packages/tasks/src/tasks/placeholder/spec/input.json b/packages/tasks/src/tasks/placeholder/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..5c206baef38f8ce8c2eac0e7cc771c5dd6f5c16e --- /dev/null +++ b/packages/tasks/src/tasks/placeholder/spec/input.json @@ -0,0 +1,35 @@ +{ + "$id": "/inference/schemas//input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for inference", + "title": "PlaceholderInput", + "type": "object", + "properties": { + "data": { + "description": "TODO: describe the input here. 
This must be model & framework agnostic.", + "type": "string" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/Parameters" + } + }, + "$defs": { + "Parameters": { + "title": "Parameters", + "description": "TODO: describe additional parameters here.", + "type": "object", + "properties": { + "dummyParameterName": { + "type": "boolean", + "description": "TODO: describe the parameter here" + }, + "dummyParameterName2": { + "type": "integer", + "description": "TODO: describe the parameter here" + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/placeholder/spec/output.json b/packages/tasks/src/tasks/placeholder/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..8e3e132941936718b08c0cbcd961fcc277e57a38 --- /dev/null +++ b/packages/tasks/src/tasks/placeholder/spec/output.json @@ -0,0 +1,17 @@ +{ + "$id": "/inference/schemas//output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs for inference", + "title": "PlaceholderOutput", + "type": "array", + "items": { + "type": "object", + "properties": { + "meaningfulOutputName": { + "type": "string", + "description": "TODO: Describe what is outputed by the inference here" + } + }, + "required": ["meaningfulOutputName"] + } +} diff --git a/packages/tasks/src/tasks/question-answering/inference.ts b/packages/tasks/src/tasks/question-answering/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..bffc71cc6e29c0da0fe257e844731411fcdceba2 --- /dev/null +++ b/packages/tasks/src/tasks/question-answering/inference.ts @@ -0,0 +1,99 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Question Answering inference + */ +export interface QuestionAnsweringInput { + /** + * One (context, question) pair to answer + */ + data: QuestionAnsweringInputData; + /** + * Additional inference 
parameters + */ + parameters?: QuestionAnsweringParameters; + [property: string]: unknown; +} +/** + * One (context, question) pair to answer + */ +export interface QuestionAnsweringInputData { + /** + * The context to be used for answering the question + */ + context: string; + /** + * The question to be answered + */ + question: string; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Question Answering + */ +export interface QuestionAnsweringParameters { + /** + * Attempts to align the answer to real words. Improves quality on space separated + * languages. Might hurt on non-space-separated languages (like Japanese or Chinese) + */ + alignToWords?: boolean; + /** + * If the context is too long to fit with the question for the model, it will be split in + * several chunks with some overlap. This argument controls the size of that overlap. + */ + docStride?: number; + /** + * Whether to accept impossible as an answer. + */ + handleImpossibleAnswer?: boolean; + /** + * The maximum length of predicted answers (e.g., only answers with a shorter length are + * considered). + */ + maxAnswerLen?: number; + /** + * The maximum length of the question after tokenization. It will be truncated if needed. + */ + maxQuestionLen?: number; + /** + * The maximum length of the total sentence (context + question) in tokens of each chunk + * passed to the model. The context will be split in several chunks (using docStride as + * overlap) if needed. + */ + maxSeqLen?: number; + /** + * The number of answers to return (will be chosen by order of likelihood). Note that we + * return less than topk answers if there are not enough options available within the + * context. 
+ */ + topK?: number; + [property: string]: unknown; +} +export type QuestionAnsweringOutput = QuestionAnsweringOutputElement[]; +/** + * Outputs of inference for the Question Answering task + */ +export interface QuestionAnsweringOutputElement { + /** + * The answer to the question. + */ + answer: string; + /** + * The character position in the input where the answer ends. + */ + end: number; + /** + * The probability associated to the answer. + */ + score: number; + /** + * The character position in the input where the answer begins. + */ + start: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/question-answering/spec/input.json b/packages/tasks/src/tasks/question-answering/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..088e77200b4ac58f5e31df546e7b6741400d4e84 --- /dev/null +++ b/packages/tasks/src/tasks/question-answering/spec/input.json @@ -0,0 +1,67 @@ +{ + "$id": "/inference/schemas/question-answering/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Question Answering inference", + "title": "QuestionAnsweringInput", + "type": "object", + "properties": { + "data": { + "title": "QuestionAnsweringInputData", + "description": "One (context, question) pair to answer", + "type": "object", + "properties": { + "context": { + "type": "string", + "description": "The context to be used for answering the question" + }, + "question": { + "type": "string", + "description": "The question to be answered" + } + }, + "required": ["question", "context"] + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/QuestionAnsweringParameters" + } + }, + "$defs": { + "QuestionAnsweringParameters": { + "title": "QuestionAnsweringParameters", + "description": "Additional inference parameters for Question Answering", + "type": "object", + "properties": { + "topK": { + "type": "integer", + "description": "The number of answers to 
return (will be chosen by order of likelihood). Note that we return less than topk answers if there are not enough options available within the context." + }, + "docStride": { + "type": "integer", + "description": "If the context is too long to fit with the question for the model, it will be split in several chunks with some overlap. This argument controls the size of that overlap." + }, + "maxAnswerLen": { + "type": "integer", + "description": "The maximum length of predicted answers (e.g., only answers with a shorter length are considered)." + }, + "maxSeqLen": { + "type": "integer", + "description": "The maximum length of the total sentence (context + question) in tokens of each chunk passed to the model. The context will be split in several chunks (using docStride as overlap) if needed." + }, + "maxQuestionLen": { + "type": "integer", + "description": "The maximum length of the question after tokenization. It will be truncated if needed." + }, + "handleImpossibleAnswer": { + "type": "boolean", + "description": "Whether to accept impossible as an answer." + }, + "alignToWords": { + "type": "boolean", + "description": "Attempts to align the answer to real words. Improves quality on space separated languages. 
Might hurt on non-space-separated languages (like Japanese or Chinese)" + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/question-answering/spec/output.json b/packages/tasks/src/tasks/question-answering/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..9da8f988ad21668db65d18e22cb105a0da96a63d --- /dev/null +++ b/packages/tasks/src/tasks/question-answering/spec/output.json @@ -0,0 +1,29 @@ +{ + "$id": "/inference/schemas/question-answering/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "title": "QuestionAnsweringOutput", + "description": "Outputs of inference for the Question Answering task", + "type": "array", + "items": { + "type": "object", + "properties": { + "answer": { + "type": "string", + "description": "The answer to the question." + }, + "score": { + "type": "number", + "description": "The probability associated to the answer." + }, + "start": { + "type": "integer", + "description": "The character position in the input where the answer begins." + }, + "end": { + "type": "integer", + "description": "The character position in the input where the answer ends." 
+ } + }, + "required": ["answer", "score", "start", "end"] + } +} diff --git a/packages/tasks/src/tasks/sentence-similarity/inference.ts b/packages/tasks/src/tasks/sentence-similarity/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..f1b72447d5c8d94051c89ebe44a762bf3995e941 --- /dev/null +++ b/packages/tasks/src/tasks/sentence-similarity/inference.ts @@ -0,0 +1,32 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ + +export type SentenceSimilarityOutput = number[]; + +/** + * Inputs for Sentence similarity inference + */ +export interface SentenceSimilarityInput { + data: SentenceSimilarityInputData; + /** + * Additional inference parameters + */ + parameters?: { [key: string]: unknown }; + [property: string]: unknown; +} + +export interface SentenceSimilarityInputData { + /** + * A list of strings which will be compared against the source_sentence. + */ + sentences: string[]; + /** + * The string that you wish to compare the other strings with. This can be a phrase, + * sentence, or longer passage, depending on the model being used. 
+ */ + sourceSentence: string; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/sentence-similarity/spec/input.json b/packages/tasks/src/tasks/sentence-similarity/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..8bb9e2e5ade6da3becb9fbf1c45bcb154c6720a6 --- /dev/null +++ b/packages/tasks/src/tasks/sentence-similarity/spec/input.json @@ -0,0 +1,40 @@ +{ + "$id": "/inference/schemas/sentence-similarity/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Sentence similarity inference", + "title": "SentenceSimilarityInput", + "type": "object", + "properties": { + "data": { + "title": "SentenceSimilarityInputData", + "type": "object", + "properties": { + "sourceSentence": { + "description": "The string that you wish to compare the other strings with. This can be a phrase, sentence, or longer passage, depending on the model being used.", + "type": "string" + }, + "sentences": { + "type": "array", + "description": "A list of strings which will be compared against the source_sentence.", + "items": { + "type": "string" + } + } + }, + "required": ["sourceSentence", "sentences"] + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/SentenceSimilarityParameters" + } + }, + "$defs": { + "SentenceSimilarityParameters": { + "title": "SentenceSimilarityParameters", + "description": "Additional inference parameters for Sentence Similarity", + "type": "object", + "properties": {} + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/sentence-similarity/spec/output.json b/packages/tasks/src/tasks/sentence-similarity/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..ca13d98bd5f55bd581e99c8cc4d970b9b7735512 --- /dev/null +++ b/packages/tasks/src/tasks/sentence-similarity/spec/output.json @@ -0,0 +1,12 @@ +{ + "$id": "/inference/schemas/sentence-similarity/output.json", + "$schema": 
"http://json-schema.org/draft-06/schema#", + "title": "SentenceSimilarityOutput", + "description": "Outputs of inference for the Sentence Similarity task", + "type": "array", + "items": { + "description": "The associated similarity score for each of the given sentences", + "type": "number", + "title": "SentenceSimilarityScore" + } +} diff --git a/packages/tasks/src/tasks/summarization/data.ts b/packages/tasks/src/tasks/summarization/data.ts index b13fa3d163a4116b4b6a6f995ba1e37e28a3854e..bd04453da3f36af45f203e7c0039056677b59432 100644 --- a/packages/tasks/src/tasks/summarization/data.ts +++ b/packages/tasks/src/tasks/summarization/data.ts @@ -1,6 +1,7 @@ import type { TaskDataCustom } from ".."; const taskData: TaskDataCustom = { + canonicalId: "text2text-generation", datasets: [ { description: diff --git a/packages/tasks/src/tasks/summarization/inference.ts b/packages/tasks/src/tasks/summarization/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..16d30cf7a10d1c16d684f47a95a7956375c57542 --- /dev/null +++ b/packages/tasks/src/tasks/summarization/inference.ts @@ -0,0 +1,58 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ + +/** + * Inputs for Summarization inference + * + * Inputs for Text2text Generation inference + */ +export interface SummarizationInput { + /** + * The input text data + */ + data: string; + /** + * Additional inference parameters + */ + parameters?: Text2TextGenerationParameters; + [property: string]: unknown; +} + +/** + * Additional inference parameters + * + * Additional inference parameters for Text2text Generation + */ +export interface Text2TextGenerationParameters { + /** + * Whether to clean up the potential extra spaces in the text output. 
+ */ + cleanUpTokenizationSpaces?: boolean; + /** + * Additional parametrization of the text generation algorithm + */ + generateParameters?: { [key: string]: unknown }; + /** + * The truncation strategy to use + */ + truncation?: Text2TextGenerationTruncationStrategy; + [property: string]: unknown; +} + +export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second"; + +/** + * Outputs for Summarization inference + * + * Outputs of inference for the Text2text Generation task + */ +export interface SummarizationOutput { + /** + * The generated text. + */ + generatedText: string; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/summarization/spec/input.json b/packages/tasks/src/tasks/summarization/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..629da31ea67216b20f2314eb454b1f710367d9a2 --- /dev/null +++ b/packages/tasks/src/tasks/summarization/spec/input.json @@ -0,0 +1,7 @@ +{ + "$ref": "/inference/schemas/text2text-generation/input.json", + "$id": "/inference/schemas/summarization/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "title": "SummarizationInput", + "description": "Inputs for Summarization inference" +} diff --git a/packages/tasks/src/tasks/summarization/spec/output.json b/packages/tasks/src/tasks/summarization/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..9b1f8bf30341dec43244a169849c85483ee599d1 --- /dev/null +++ b/packages/tasks/src/tasks/summarization/spec/output.json @@ -0,0 +1,7 @@ +{ + "$ref": "/inference/schemas/text2text-generation/output.json", + "$id": "/inference/schemas/summarization/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "title": "SummarizationOutput", + "description": "Outputs for Summarization inference" +} diff --git a/packages/tasks/src/tasks/table-question-answering/inference.ts 
b/packages/tasks/src/tasks/table-question-answering/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..fe06dbbfe538d8089468d48b19413e0a69b8619a --- /dev/null +++ b/packages/tasks/src/tasks/table-question-answering/inference.ts @@ -0,0 +1,61 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Table Question Answering inference + */ +export interface TableQuestionAnsweringInput { + /** + * One (table, question) pair to answer + */ + data: TableQuestionAnsweringInputData; + /** + * Additional inference parameters + */ + parameters?: { + [key: string]: unknown; + }; + [property: string]: unknown; +} +/** + * One (table, question) pair to answer + */ +export interface TableQuestionAnsweringInputData { + /** + * The question to be answered about the table + */ + question: string; + /** + * The table to serve as context for the questions + */ + table: { + [key: string]: string[]; + }; + [property: string]: unknown; +} +export type TableQuestionAnsweringOutput = TableQuestionAnsweringOutputElement[]; +/** + * Outputs of inference for the Table Question Answering task + */ +export interface TableQuestionAnsweringOutputElement { + /** + * If the model has an aggregator, this returns the aggregator. + */ + aggregator?: string; + /** + * The answer of the question given the table. If there is an aggregator, the answer will be + * preceded by `AGGREGATOR >`. + */ + answer: string; + /** + * List of strings made up of the answer cell values. + */ + cells: string[]; + /** + * Coordinates of the cells of the answers. 
+ */ + coordinates: Array; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/table-question-answering/spec/input.json b/packages/tasks/src/tasks/table-question-answering/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..6309cf1f36a793cc45772f3dbb17a300f8610f1a --- /dev/null +++ b/packages/tasks/src/tasks/table-question-answering/spec/input.json @@ -0,0 +1,39 @@ +{ + "$id": "/inference/schemas/table-question-answering/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Table Question Answering inference", + "title": "TableQuestionAnsweringInput", + "type": "object", + "properties": { + "data": { + "description": "One (table, question) pair to answer", + "title": "TableQuestionAnsweringInputData", + "type": "object", + "properties": { + "table": { + "description": "The table to serve as context for the questions", + "type": "object", + "additionalProperties": { "type": "array", "items": { "type": "string" } } + }, + "question": { + "description": "The question to be answered about the table", + "type": "string" + } + }, + "required": ["table", "question"] + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/TableQuestionAnsweringParameters" + } + }, + "$defs": { + "TableQuestionAnsweringParameters": { + "title": "TableQuestionAnsweringParameters", + "description": "Additional inference parameters for Table Question Answering", + "type": "object", + "properties": {} + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/table-question-answering/spec/output.json b/packages/tasks/src/tasks/table-question-answering/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..9b43026ea12299dc83110c99d3983841a8d30c6e --- /dev/null +++ b/packages/tasks/src/tasks/table-question-answering/spec/output.json @@ -0,0 +1,40 @@ +{ + "$id": "/inference/schemas/table-question-answering/output.json", 
+ "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Table Question Answering task", + "title": "TableQuestionAnsweringOutput", + "type": "array", + "items": { + "type": "object", + "properties": { + "answer": { + "type": "string", + "description": "The answer of the question given the table. If there is an aggregator, the answer will be preceded by `AGGREGATOR >`." + }, + "coordinates": { + "type": "array", + "description": "Coordinates of the cells of the answers.", + "items": { + "type": "array", + "items": { + "type": "integer" + }, + "minLength": 2, + "maxLength": 2 + } + }, + "cells": { + "type": "array", + "description": "List of strings made up of the answer cell values.", + "items": { + "type": "string" + } + }, + "aggregator": { + "type": "string", + "description": "If the model has an aggregator, this returns the aggregator." + } + }, + "required": ["answer", "cells", "coordinates"] + } +} diff --git a/packages/tasks/src/tasks/text-classification/inference.ts b/packages/tasks/src/tasks/text-classification/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..9bc728a50c2257832b5055cca4ce4cc81cdf46a5 --- /dev/null +++ b/packages/tasks/src/tasks/text-classification/inference.ts @@ -0,0 +1,51 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Text Classification inference + */ +export interface TextClassificationInput { + /** + * The text to classify + */ + data: string; + /** + * Additional inference parameters + */ + parameters?: TextClassificationParameters; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Text Classification + */ +export interface TextClassificationParameters { + functionToApply?: ClassificationOutputTransform; + /** + * When specified, limits the output to the top K most probable classes. 
+ */ + topK?: number; + [property: string]: unknown; +} +/** + * The function to apply to the model outputs in order to retrieve the scores. + */ +export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none"; +export type TextClassificationOutput = TextClassificationOutputElement[]; +/** + * Outputs of inference for the Text Classification task + */ +export interface TextClassificationOutputElement { + /** + * The predicted class label. + */ + label: string; + /** + * The corresponding probability. + */ + score: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/text-classification/spec/input.json b/packages/tasks/src/tasks/text-classification/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..6ae6f1c39ccc11b2007b8b628b790e92c6e51d4b --- /dev/null +++ b/packages/tasks/src/tasks/text-classification/spec/input.json @@ -0,0 +1,35 @@ +{ + "$id": "/inference/schemas/text-classification/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Text Classification inference", + "title": "TextClassificationInput", + "type": "object", + "properties": { + "data": { + "description": "The text to classify", + "type": "string" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/TextClassificationParameters" + } + }, + "$defs": { + "TextClassificationParameters": { + "title": "TextClassificationParameters", + "description": "Additional inference parameters for Text Classification", + "type": "object", + "properties": { + "functionToApply": { + "title": "TextClassificationOutputTransform", + "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform" + }, + "topK": { + "type": "integer", + "description": "When specified, limits the output to the top K most probable classes." 
+ } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/text-classification/spec/output.json b/packages/tasks/src/tasks/text-classification/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..704b82225b78d6cf17f3ffc00d7f47fa8befd1a8 --- /dev/null +++ b/packages/tasks/src/tasks/text-classification/spec/output.json @@ -0,0 +1,10 @@ +{ + "$id": "/inference/schemas/text-classification/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Text Classification task", + "title": "TextClassificationOutput", + "type": "array", + "items": { + "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput" + } +} diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..94279336c88bb973d686050cc445292305739a20 --- /dev/null +++ b/packages/tasks/src/tasks/text-generation/inference.ts @@ -0,0 +1,85 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Text Generation inference + */ +export interface TextGenerationInput { + /** + * The text to initialize generation with + */ + data: string; + /** + * Additional inference parameters + */ + parameters?: TextGenerationParameters; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Text Generation + */ +export interface TextGenerationParameters { + /** + * Whether to use logit sampling (true) or greedy search (false). + */ + doSample?: boolean; + /** + * Maximum number of generated tokens. + */ + maxNewTokens?: number; + /** + * The parameter for repetition penalty. A value of 1.0 means no penalty. See [this + * paper](https://hf.co/papers/1909.05858) for more details. 
+ */ + repetitionPenalty?: number; + /** + * Whether to prepend the prompt to the generated text. + */ + returnFullText?: boolean; + /** + * Stop generating tokens if a member of `stop_sequences` is generated. + */ + stopSequences?: string[]; + /** + * The value used to modulate the logits distribution. + */ + temperature?: number; + /** + * The number of highest probability vocabulary tokens to keep for top-k-filtering. + */ + topK?: number; + /** + * If set to < 1, only the smallest set of most probable tokens with probabilities that add + * up to `top_p` or higher are kept for generation. + */ + topP?: number; + /** + * Truncate input tokens to the given size. + */ + truncate?: number; + /** + * Typical Decoding mass. See [Typical Decoding for Natural Language + * Generation](https://hf.co/papers/2202.00666) for more information + */ + typicalP?: number; + /** + * Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226) + */ + watermark?: boolean; + [property: string]: unknown; +} +export type TextGenerationOutput = TextGenerationOutputElement[]; +/** + * Outputs for Text Generation inference + */ +export interface TextGenerationOutputElement { + /** + * The generated text + */ + generatedText: string; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/text-generation/spec/input.json b/packages/tasks/src/tasks/text-generation/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..223561691304ec54972759cc26a6d5e92c325610 --- /dev/null +++ b/packages/tasks/src/tasks/text-generation/spec/input.json @@ -0,0 +1,74 @@ +{ + "$id": "/inference/schemas/text-generation/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Text Generation inference", + "title": "TextGenerationInput", + "type": "object", + "properties": { + "data": { + "description": "The text to initialize generation with", + "type": "string" + }, + "parameters": { + 
"description": "Additional inference parameters", + "$ref": "#/$defs/TextGenerationParameters" + } + }, + "$defs": { + "TextGenerationParameters": { + "title": "TextGenerationParameters", + "description": "Additional inference parameters for Text Generation", + "type": "object", + "properties": { + "doSample": { + "type": "boolean", + "description": "Whether to use logit sampling (true) or greedy search (false)." + }, + "maxNewTokens": { + "type": "integer", + "description": "Maximum number of generated tokens." + }, + "repetitionPenalty": { + "type": "number", + "description": "The parameter for repetition penalty. A value of 1.0 means no penalty. See [this paper](https://hf.co/papers/1909.05858) for more details." + }, + "returnFullText": { + "type": "boolean", + "description": "Whether to prepend the prompt to the generated text." + }, + "stopSequences": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Stop generating tokens if a member of `stop_sequences` is generated." + }, + "temperature": { + "type": "number", + "description": "The value used to modulate the logits distribution." + }, + "topK": { + "type": "integer", + "description": "The number of highest probability vocabulary tokens to keep for top-k-filtering." + }, + "topP": { + "type": "number", + "description": "If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation." + }, + "truncate": { + "type": "integer", + "description": "Truncate input tokens to the given size." + }, + "typicalP": { + "type": "number", + "description": "Typical Decoding mass. 
See [Typical Decoding for Natural Language Generation](https://hf.co/papers/2202.00666) for more information" + }, + "watermark": { + "type": "boolean", + "description": "Watermarking with [A Watermark for Large Language Models](https://hf.co/papers/2301.10226)" + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/text-generation/spec/output.json b/packages/tasks/src/tasks/text-generation/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..eacb907e2c75f02a866b9b963b6a2bbfefe18d8d --- /dev/null +++ b/packages/tasks/src/tasks/text-generation/spec/output.json @@ -0,0 +1,17 @@ +{ + "$id": "/inference/schemas/text-generation/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs for Text Generation inference", + "title": "TextGenerationOutput", + "type": "array", + "items": { + "type": "object", + "properties": { + "generatedText": { + "type": "string", + "description": "The generated text" + } + }, + "required": ["generatedText"] + } +} diff --git a/packages/tasks/src/tasks/text-to-audio/inference.ts b/packages/tasks/src/tasks/text-to-audio/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..14c484bf2fe212836200d2f1bbf5d4d6ddbc4b90 --- /dev/null +++ b/packages/tasks/src/tasks/text-to-audio/inference.ts @@ -0,0 +1,138 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Text To Audio inference + */ +export interface TextToAudioInput { + /** + * The input text data + */ + data: string; + /** + * Additional inference parameters + */ + parameters?: TextToAudioParameters; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Text To Audio + */ +export interface TextToAudioParameters { + /** + * Parametrization of the text generation process + */ + generate?: GenerationParameters; + [property: 
string]: unknown; +} +/** + * Parametrization of the text generation process + * + * Ad-hoc parametrization of the text generation process + */ +export interface GenerationParameters { + /** + * Whether to use sampling instead of greedy decoding when generating new tokens. + */ + doSample?: boolean; + /** + * Controls the stopping condition for beam-based methods. + */ + earlyStopping?: EarlyStoppingUnion; + /** + * If set to float strictly between 0 and 1, only tokens with a conditional probability + * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from + * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language + * Model Desmoothing](https://hf.co/papers/2210.15191) for more details. + */ + epsilonCutoff?: number; + /** + * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to + * float strictly between 0 and 1, a token is only considered if it is greater than either + * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter + * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In + * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. + * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) + * for more details. + */ + etaCutoff?: number; + /** + * The maximum length (in tokens) of the generated text, including the input. + */ + maxLength?: number; + /** + * The maximum number of tokens to generate. Takes precedence over maxLength. + */ + maxNewTokens?: number; + /** + * The minimum length (in tokens) of the generated text, including the input. + */ + minLength?: number; + /** + * The minimum number of tokens to generate. Takes precedence over maxLength. + */ + minNewTokens?: number; + /** + * Number of groups to divide num_beams into in order to ensure diversity among different + * groups of beams. 
See [this paper](https://hf.co/papers/1610.02424) for more details. + */ + numBeamGroups?: number; + /** + * Number of beams to use for beam search. + */ + numBeams?: number; + /** + * The value balances the model confidence and the degeneration penalty in contrastive + * search decoding. + */ + penaltyAlpha?: number; + /** + * The value used to modulate the next token probabilities. + */ + temperature?: number; + /** + * The number of highest probability vocabulary tokens to keep for top-k-filtering. + */ + topK?: number; + /** + * If set to float < 1, only the smallest set of most probable tokens with probabilities + * that add up to top_p or higher are kept for generation. + */ + topP?: number; + /** + * Local typicality measures how similar the conditional probability of predicting a target + * token next is to the expected conditional probability of predicting a random token next, + * given the partial text already generated. If set to float < 1, the smallest set of the + * most locally typical tokens with probabilities that add up to typical_p or higher are + * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details. + */ + typicalP?: number; + /** + * Whether the model should use the past last key/values attentions to speed up decoding + */ + useCache?: boolean; + [property: string]: unknown; +} +/** + * Controls the stopping condition for beam-based methods. + */ +export type EarlyStoppingUnion = boolean | "never"; +export type TextToAudioOutput = TextToAudioOutputElement[]; +/** + * Outputs of inference for the Text To Audio task + */ +export interface TextToAudioOutputElement { + /** + * The generated audio waveform. + */ + audio: unknown; + /** + * The sampling rate of the generated audio waveform. 
+ */ + samplingRate: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/text-to-audio/spec/input.json b/packages/tasks/src/tasks/text-to-audio/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..95bd8d16db3e0874c947138250778c889578b097 --- /dev/null +++ b/packages/tasks/src/tasks/text-to-audio/spec/input.json @@ -0,0 +1,31 @@ +{ + "$id": "/inference/schemas/text-to-audio/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Text To Audio inference", + "title": "TextToAudioInput", + "type": "object", + "properties": { + "data": { + "description": "The input text data", + "type": "string" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/TextToAudioParameters" + } + }, + "$defs": { + "TextToAudioParameters": { + "title": "TextToAudioParameters", + "description": "Additional inference parameters for Text To Audio", + "type": "object", + "properties": { + "generate": { + "description": "Parametrization of the text generation process", + "$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters" + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/text-to-audio/spec/output.json b/packages/tasks/src/tasks/text-to-audio/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..b0a25bd9ad4bcdb2e1f55a1fa65b7e2d9d8cf832 --- /dev/null +++ b/packages/tasks/src/tasks/text-to-audio/spec/output.json @@ -0,0 +1,20 @@ +{ + "$id": "/inference/schemas/text-to-audio/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Text To Audio task", + "title": "TextToAudioOutput", + "type": "array", + "items": { + "type": "object", + "properties": { + "audio": { + "description": "The generated audio waveform." 
+ }, + "samplingRate": { + "type": "number", + "description": "The sampling rate of the generated audio waveform." + } + }, + "required": ["audio", "samplingRate"] + } +} diff --git a/packages/tasks/src/tasks/text-to-image/inference.ts b/packages/tasks/src/tasks/text-to-image/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..c25031b29ef5bffdd1aa61c82c355e8d81bda31c --- /dev/null +++ b/packages/tasks/src/tasks/text-to-image/inference.ts @@ -0,0 +1,73 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ + +/** + * Inputs for Text To Image inference + */ +export interface TextToImageInput { + /** + * The input text data (sometimes called "prompt" + */ + data: string; + /** + * Additional inference parameters + */ + parameters?: TextToImageParameters; + [property: string]: unknown; +} + +/** + * Additional inference parameters + * + * Additional inference parameters for Text To Image + */ +export interface TextToImageParameters { + /** + * For diffusion models. A higher guidance scale value encourages the model to generate + * images closely linked to the text prompt at the expense of lower image quality. + */ + guidanceScale?: number; + /** + * One or several prompt to guide what NOT to include in image generation. + */ + negativePrompt?: string[]; + /** + * For diffusion models. The number of denoising steps. More denoising steps usually lead to + * a higher quality image at the expense of slower inference. + */ + numInferenceSteps?: number; + /** + * For diffusion models. 
Override the scheduler with a compatible one + */ + scheduler?: string; + /** + * The size in pixel of the output image + */ + targetSize?: TargetSize; + [property: string]: unknown; +} + +/** + * The size in pixel of the output image + */ +export interface TargetSize { + height: number; + width: number; + [property: string]: unknown; +} + +/** + * Outputs of inference for the Text To Image task + */ +export type TextToImageOutput = unknown[] | boolean | number | number | null | TextToImageOutputObject | string; + +export interface TextToImageOutputObject { + /** + * The generated image + */ + image: unknown; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/text-to-image/spec/input.json b/packages/tasks/src/tasks/text-to-image/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..cb1e1c6cf6cecb91e6c657c02cd5f2fc40a24ef8 --- /dev/null +++ b/packages/tasks/src/tasks/text-to-image/spec/input.json @@ -0,0 +1,57 @@ +{ + "$id": "/inference/schemas/text-to-image/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Text To Image inference", + "title": "TextToImageInput", + "type": "object", + "properties": { + "data": { + "description": "The input text data (sometimes called \"prompt\"", + "type": "string" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/TextToImageParameters" + } + }, + "$defs": { + "TextToImageParameters": { + "title": "TextToImageParameters", + "description": "Additional inference parameters for Text To Image", + "type": "object", + "properties": { + "guidanceScale": { + "type": "number", + "description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality." 
+ }, + "negativePrompt": { + "type": "array", + "items": { "type": "string" }, + "description": "One or several prompt to guide what NOT to include in image generation." + }, + "numInferenceSteps": { + "type": "integer", + "description": "For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference." + }, + "targetSize": { + "type": "object", + "description": "The size in pixel of the output image", + "properties": { + "width": { + "type": "integer" + }, + "height": { + "type": "integer" + } + }, + "required": ["width", "height"] + }, + "scheduler": { + "type": "string", + "description": "For diffusion models. Override the scheduler with a compatible one" + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/text-to-image/spec/output.json b/packages/tasks/src/tasks/text-to-image/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..5ab3ee7879b9833b97774a4db37254c3a76c2dbf --- /dev/null +++ b/packages/tasks/src/tasks/text-to-image/spec/output.json @@ -0,0 +1,15 @@ +{ + "$id": "/inference/schemas/text-to-image/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Text To Image task", + "title": "TextToImageOutput", + "type": "array", + "items": { + "properties": { + "image": { + "description": "The generated image" + } + }, + "required": ["image"] + } +} diff --git a/packages/tasks/src/tasks/text-to-speech/data.ts b/packages/tasks/src/tasks/text-to-speech/data.ts index 73560b7afc8e38cc003d1bc60f2af808abc895f1..26c6f48371fe4f847902139b989929285af57be3 100644 --- a/packages/tasks/src/tasks/text-to-speech/data.ts +++ b/packages/tasks/src/tasks/text-to-speech/data.ts @@ -1,6 +1,7 @@ import type { TaskDataCustom } from ".."; const taskData: TaskDataCustom = { + canonicalId: "text-to-audio", datasets: [ { description: "Thousands of short audio clips of a single 
speaker.", diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..f67e03652a1ee6a3250397bf15c2367390080908 --- /dev/null +++ b/packages/tasks/src/tasks/text-to-speech/inference.ts @@ -0,0 +1,146 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ + +/** + * Inputs for Text to Speech inference + * + * Inputs for Text To Audio inference + */ +export interface TextToSpeechInput { + /** + * The input text data + */ + data: string; + /** + * Additional inference parameters + */ + parameters?: TextToAudioParameters; + [property: string]: unknown; +} + +/** + * Additional inference parameters + * + * Additional inference parameters for Text To Audio + */ +export interface TextToAudioParameters { + /** + * Parametrization of the text generation process + */ + generate?: GenerationParameters; + [property: string]: unknown; +} + +/** + * Parametrization of the text generation process + * + * Ad-hoc parametrization of the text generation process + */ +export interface GenerationParameters { + /** + * Whether to use sampling instead of greedy decoding when generating new tokens. + */ + doSample?: boolean; + /** + * Controls the stopping condition for beam-based methods. + */ + earlyStopping?: EarlyStoppingUnion; + /** + * If set to float strictly between 0 and 1, only tokens with a conditional probability + * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from + * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language + * Model Desmoothing](https://hf.co/papers/2210.15191) for more details. + */ + epsilonCutoff?: number; + /** + * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. 
If set to + * float strictly between 0 and 1, a token is only considered if it is greater than either + * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter + * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In + * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. + * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191) + * for more details. + */ + etaCutoff?: number; + /** + * The maximum length (in tokens) of the generated text, including the input. + */ + maxLength?: number; + /** + * The maximum number of tokens to generate. Takes precedence over maxLength. + */ + maxNewTokens?: number; + /** + * The minimum length (in tokens) of the generated text, including the input. + */ + minLength?: number; + /** + * The minimum number of tokens to generate. Takes precedence over maxLength. + */ + minNewTokens?: number; + /** + * Number of groups to divide num_beams into in order to ensure diversity among different + * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details. + */ + numBeamGroups?: number; + /** + * Number of beams to use for beam search. + */ + numBeams?: number; + /** + * The value balances the model confidence and the degeneration penalty in contrastive + * search decoding. + */ + penaltyAlpha?: number; + /** + * The value used to modulate the next token probabilities. + */ + temperature?: number; + /** + * The number of highest probability vocabulary tokens to keep for top-k-filtering. + */ + topK?: number; + /** + * If set to float < 1, only the smallest set of most probable tokens with probabilities + * that add up to top_p or higher are kept for generation. 
+ */ + topP?: number; + /** + * Local typicality measures how similar the conditional probability of predicting a target + * token next is to the expected conditional probability of predicting a random token next, + * given the partial text already generated. If set to float < 1, the smallest set of the + * most locally typical tokens with probabilities that add up to typical_p or higher are + * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details. + */ + typicalP?: number; + /** + * Whether the model should use the past last key/values attentions to speed up decoding + */ + useCache?: boolean; + [property: string]: unknown; +} + +/** + * Controls the stopping condition for beam-based methods. + */ +export type EarlyStoppingUnion = boolean | "never"; + +/** + * Outputs for Text to Speech inference + * + * Outputs of inference for the Text To Audio task + */ +export interface TextToSpeechOutput { + /** + * The generated audio waveform. + */ + audio: unknown; + /** + * The sampling rate of the generated audio waveform. 
+ */ + samplingRate: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/text-to-speech/spec/input.json b/packages/tasks/src/tasks/text-to-speech/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..7d2bac0924d743b9a077d122df1c734533fa73d4 --- /dev/null +++ b/packages/tasks/src/tasks/text-to-speech/spec/input.json @@ -0,0 +1,7 @@ +{ + "$ref": "/inference/schemas/text-to-audio/input.json", + "$id": "/inference/schemas/text-to-speech/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "title": "TextToSpeechInput", + "description": "Inputs for Text to Speech inference" +} diff --git a/packages/tasks/src/tasks/text-to-speech/spec/output.json b/packages/tasks/src/tasks/text-to-speech/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..91654e2b506962a371791796abc0b862f6b73ce2 --- /dev/null +++ b/packages/tasks/src/tasks/text-to-speech/spec/output.json @@ -0,0 +1,7 @@ +{ + "$ref": "/inference/schemas/text-to-audio/output.json", + "$id": "/inference/schemas/text-to-speech/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "title": "TextToSpeechOutput", + "description": "Outputs for Text to Speech inference" +} diff --git a/packages/tasks/src/tasks/text2text-generation/inference.ts b/packages/tasks/src/tasks/text2text-generation/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..788845dd24eee041de676d8d9cafea7412fe0feb --- /dev/null +++ b/packages/tasks/src/tasks/text2text-generation/inference.ts @@ -0,0 +1,53 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Text2text Generation inference + */ +export interface Text2TextGenerationInput { + /** + * The input text data + */ + data: string; + /** + * Additional inference parameters + */ + parameters?: Text2TextGenerationParameters; + [property: string]: unknown; +} +/** + * 
Additional inference parameters + * + * Additional inference parameters for Text2text Generation + */ +export interface Text2TextGenerationParameters { + /** + * Whether to clean up the potential extra spaces in the text output. + */ + cleanUpTokenizationSpaces?: boolean; + /** + * Additional parametrization of the text generation algorithm + */ + generateParameters?: { + [key: string]: unknown; + }; + /** + * The truncation strategy to use + */ + truncation?: Text2TextGenerationTruncationStrategy; + [property: string]: unknown; +} +export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second"; +export type Text2TextGenerationOutput = Text2TextGenerationOutputElement[]; +/** + * Outputs of inference for the Text2text Generation task + */ +export interface Text2TextGenerationOutputElement { + /** + * The generated text. + */ + generatedText: string; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/text2text-generation/spec/input.json b/packages/tasks/src/tasks/text2text-generation/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..a00ae575fc4df0e1579890155b299ed29ac2b665 --- /dev/null +++ b/packages/tasks/src/tasks/text2text-generation/spec/input.json @@ -0,0 +1,55 @@ +{ + "$id": "/inference/schemas/text2text-generation/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Text2text Generation inference", + "title": "Text2TextGenerationInput", + "type": "object", + "properties": { + "data": { + "description": "The input text data", + "type": "string" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/Text2textGenerationParameters" + } + }, + "$defs": { + "Text2textGenerationParameters": { + "title": "Text2textGenerationParameters", + "description": "Additional inference parameters for Text2text Generation", + "type": "object", + "properties": { + 
"cleanUpTokenizationSpaces": { + "type": "boolean", + "description": "Whether to clean up the potential extra spaces in the text output." + }, + "truncation": { + "title": "Text2textGenerationTruncationStrategy", + "type": "string", + "description": "The truncation strategy to use", + "oneOf": [ + { + "const": "do_not_truncate" + }, + { + "const": "longest_first" + }, + { + "const": "only_first" + }, + { + "const": "only_second" + } + ] + }, + "generateParameters": { + "title": "generateParameters", + "type": "object", + "description": "Additional parametrization of the text generation algorithm" + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/text2text-generation/spec/output.json b/packages/tasks/src/tasks/text2text-generation/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..f60ba8933eecead6e159ca07e03edc5f1fb93284 --- /dev/null +++ b/packages/tasks/src/tasks/text2text-generation/spec/output.json @@ -0,0 +1,17 @@ +{ + "$id": "/inference/schemas/text2text-generation/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Text2text Generation task", + "title": "Text2TextGenerationOutput", + "type": "array", + "items": { + "type": "object", + "properties": { + "generatedText": { + "type": "string", + "description": "The generated text." 
+ } + }, + "required": ["generatedText"] + } +} diff --git a/packages/tasks/src/tasks/token-classification/inference.ts b/packages/tasks/src/tasks/token-classification/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..7a8da8dcfc00dfea605a8188a46d860c36125812 --- /dev/null +++ b/packages/tasks/src/tasks/token-classification/inference.ts @@ -0,0 +1,82 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Token Classification inference + */ +export interface TokenClassificationInput { + /** + * The input text data + */ + data: string; + /** + * Additional inference parameters + */ + parameters?: TokenClassificationParameters; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Token Classification + */ +export interface TokenClassificationParameters { + /** + * The strategy used to fuse tokens based on model predictions + */ + aggregationStrategy?: TokenClassificationAggregationStrategy; + /** + * A list of labels to ignore + */ + ignoreLabels?: string[]; + /** + * The number of overlapping tokens between chunks when splitting the input text. + */ + stride?: number; + [property: string]: unknown; +} +/** + * Do not aggregate tokens + * + * Group consecutive tokens with the same label in a single entity. + * + * Similar to "simple", also preserves word integrity (use the label predicted for the first + * token in a word). + * + * Similar to "simple", also preserves word integrity (uses the label with the highest + * score, averaged across the word's tokens). + * + * Similar to "simple", also preserves word integrity (uses the label with the highest score + * across the word's tokens). 
+ */ +export type TokenClassificationAggregationStrategy = "none" | "simple" | "first" | "average" | "max"; +export type TokenClassificationOutput = TokenClassificationOutputElement[]; +/** + * Outputs of inference for the Token Classification task + */ +export interface TokenClassificationOutputElement { + /** + * The character position in the input where this group ends. + */ + end?: number; + /** + * The predicted label for that group of tokens + */ + entityGroup?: string; + label: unknown; + /** + * The associated score / probability + */ + score: number; + /** + * The character position in the input where this group begins. + */ + start?: number; + /** + * The corresponding text + */ + word?: string; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/token-classification/spec/input.json b/packages/tasks/src/tasks/token-classification/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..2fd89ce34cda5763bcce15848658eada114cc4c5 --- /dev/null +++ b/packages/tasks/src/tasks/token-classification/spec/input.json @@ -0,0 +1,65 @@ +{ + "$id": "/inference/schemas/token-classification/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Token Classification inference", + "title": "TokenClassificationInput", + "type": "object", + "properties": { + "data": { + "description": "The input text data", + "type": "string" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/TokenClassificationParameters" + } + }, + "$defs": { + "TokenClassificationParameters": { + "title": "TokenClassificationParameters", + "description": "Additional inference parameters for Token Classification", + "type": "object", + "properties": { + "ignoreLabels": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A list of labels to ignore" + }, + "stride": { + "type": "integer", + "description": "The number of overlapping tokens between chunks 
when splitting the input text." + }, + "aggregationStrategy": { + "title": "TokenClassificationAggregationStrategy", + "type": "string", + "description": "The strategy used to fuse tokens based on model predictions", + "oneOf": [ + { + "const": "none", + "description": "Do not aggregate tokens" + }, + { + "const": "simple", + "description": "Group consecutive tokens with the same label in a single entity." + }, + { + "const": "first", + "description": "Similar to \"simple\", also preserves word integrity (use the label predicted for the first token in a word)." + }, + { + "const": "average", + "description": "Similar to \"simple\", also preserves word integrity (uses the label with the highest score, averaged across the word's tokens)." + }, + { + "const": "max", + "description": "Similar to \"simple\", also preserves word integrity (uses the label with the highest score across the word's tokens)." + } + ] + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/token-classification/spec/output.json b/packages/tasks/src/tasks/token-classification/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..8522d972a283821244e40b8c5f9e1107750464a9 --- /dev/null +++ b/packages/tasks/src/tasks/token-classification/spec/output.json @@ -0,0 +1,33 @@ +{ + "$id": "/inference/schemas/token-classification/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Token Classification task", + "title": "TokenClassificationOutput", + "type": "array", + "items": { + "type": "object", + "properties": { + "entityGroup": { + "type": "string", + "description": "The predicted label for that group of tokens" + }, + "score": { + "type": "number", + "description": "The associated score / probability" + }, + "word": { + "type": "string", + "description": "The corresponding text" + }, + "start": { + "type": "integer", + "description": "The character position in the input where this 
group begins." + }, + "end": { + "type": "integer", + "description": "The character position in the input where this group ends." + } + }, + "required": ["entityGroup", "score"] + } +} diff --git a/packages/tasks/src/tasks/translation/data.ts b/packages/tasks/src/tasks/translation/data.ts index c0e4c3a34035806a3321e0c068a9774bb39be044..0edfab7b889b9bf54b28873f3ffe922b6f2296a8 100644 --- a/packages/tasks/src/tasks/translation/data.ts +++ b/packages/tasks/src/tasks/translation/data.ts @@ -1,6 +1,7 @@ import type { TaskDataCustom } from ".."; const taskData: TaskDataCustom = { + canonicalId: "text2text-generation", datasets: [ { description: "A dataset of copyright-free books translated into 16 different languages.", diff --git a/packages/tasks/src/tasks/translation/inference.ts b/packages/tasks/src/tasks/translation/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..c932617a4036681d16df3c928ee9707a4f157a7e --- /dev/null +++ b/packages/tasks/src/tasks/translation/inference.ts @@ -0,0 +1,58 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ + +/** + * Inputs for Translation inference + * + * Inputs for Text2text Generation inference + */ +export interface TranslationInput { + /** + * The input text data + */ + data: string; + /** + * Additional inference parameters + */ + parameters?: Text2TextGenerationParameters; + [property: string]: unknown; +} + +/** + * Additional inference parameters + * + * Additional inference parameters for Text2text Generation + */ +export interface Text2TextGenerationParameters { + /** + * Whether to clean up the potential extra spaces in the text output. 
+ */ + cleanUpTokenizationSpaces?: boolean; + /** + * Additional parametrization of the text generation algorithm + */ + generateParameters?: { [key: string]: unknown }; + /** + * The truncation strategy to use + */ + truncation?: Text2TextGenerationTruncationStrategy; + [property: string]: unknown; +} + +export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second"; + +/** + * Outputs for Translation inference + * + * Outputs of inference for the Text2text Generation task + */ +export interface TranslationOutput { + /** + * The generated text. + */ + generatedText: string; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/translation/spec/input.json b/packages/tasks/src/tasks/translation/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..0695bc6728994e3b5ff72e62c517ac038b6871ad --- /dev/null +++ b/packages/tasks/src/tasks/translation/spec/input.json @@ -0,0 +1,7 @@ +{ + "$ref": "/inference/schemas/text2text-generation/input.json", + "$id": "/inference/schemas/translation/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "title": "TranslationInput", + "description": "Inputs for Translation inference" +} diff --git a/packages/tasks/src/tasks/translation/spec/output.json b/packages/tasks/src/tasks/translation/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..61b701db2c19d946b0ea49d283fe0fe01479b55d --- /dev/null +++ b/packages/tasks/src/tasks/translation/spec/output.json @@ -0,0 +1,7 @@ +{ + "$ref": "/inference/schemas/text2text-generation/output.json", + "$id": "/inference/schemas/translation/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "title": "TranslationOutput", + "description": "Outputs for Translation inference" +} diff --git a/packages/tasks/src/tasks/video-classification/inference.ts b/packages/tasks/src/tasks/video-classification/inference.ts new file mode 100644 
index 0000000000000000000000000000000000000000..1f765160f37ee54a0966953dc942d51d5a6fe158 --- /dev/null +++ b/packages/tasks/src/tasks/video-classification/inference.ts @@ -0,0 +1,59 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Video Classification inference + */ +export interface VideoClassificationInput { + /** + * The input video data + */ + data: unknown; + /** + * Additional inference parameters + */ + parameters?: VideoClassificationParameters; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Video Classification + */ +export interface VideoClassificationParameters { + /** + * The sampling rate used to select frames from the video. + */ + frameSamplingRate?: number; + functionToApply?: ClassificationOutputTransform; + /** + * The number of sampled frames to consider for classification. + */ + numFrames?: number; + /** + * When specified, limits the output to the top K most probable classes. + */ + topK?: number; + [property: string]: unknown; +} +/** + * The function to apply to the model outputs in order to retrieve the scores. + */ +export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none"; +export type VideoClassificationOutput = VideoClassificationOutputElement[]; +/** + * Outputs of inference for the Video Classification task + */ +export interface VideoClassificationOutputElement { + /** + * The predicted class label. + */ + label: string; + /** + * The corresponding probability. 
+ */ + score: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/video-classification/spec/input.json b/packages/tasks/src/tasks/video-classification/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..984670953b29c170ea3bba1488c295ace89aecae --- /dev/null +++ b/packages/tasks/src/tasks/video-classification/spec/input.json @@ -0,0 +1,42 @@ +{ + "$id": "/inference/schemas/video-classification/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Video Classification inference", + "title": "VideoClassificationInput", + "type": "object", + "properties": { + "data": { + "description": "The input video data" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/VideoClassificationParameters" + } + }, + "$defs": { + "VideoClassificationParameters": { + "title": "VideoClassificationParameters", + "description": "Additional inference parameters for Video Classification", + "type": "object", + "properties": { + "functionToApply": { + "title": "VideoClassificationOutputTransform", + "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform" + }, + "numFrames": { + "type": "integer", + "description": "The number of sampled frames to consider for classification." + }, + "frameSamplingRate": { + "type": "integer", + "description": "The sampling rate used to select frames from the video." + }, + "topK": { + "type": "integer", + "description": "When specified, limits the output to the top K most probable classes." 
+ } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/video-classification/spec/output.json b/packages/tasks/src/tasks/video-classification/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..4c24f5d577717994e0b4a8e329a7e063a967cb10 --- /dev/null +++ b/packages/tasks/src/tasks/video-classification/spec/output.json @@ -0,0 +1,10 @@ +{ + "$id": "/inference/schemas/video-classification/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Video Classification task", + "title": "VideoClassificationOutput", + "type": "array", + "items": { + "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput" + } +} diff --git a/packages/tasks/src/tasks/visual-question-answering/inference.ts b/packages/tasks/src/tasks/visual-question-answering/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..0eb513ebf6432dce2550bb3d9fadfb8e218d9797 --- /dev/null +++ b/packages/tasks/src/tasks/visual-question-answering/inference.ts @@ -0,0 +1,63 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Visual Question Answering inference + */ +export interface VisualQuestionAnsweringInput { + /** + * One (image, question) pair to answer + */ + data: VisualQuestionAnsweringInputData; + /** + * Additional inference parameters + */ + parameters?: VisualQuestionAnsweringParameters; + [property: string]: unknown; +} +/** + * One (image, question) pair to answer + */ +export interface VisualQuestionAnsweringInputData { + /** + * The image. + */ + image: unknown; + /** + * The question to answer based on the image. 
+ */ + question: unknown; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Visual Question Answering + */ +export interface VisualQuestionAnsweringParameters { + /** + * The number of answers to return (will be chosen by order of likelihood). Note that we + * return less than topk answers if there are not enough options available within the + * context. + */ + topK?: number; + [property: string]: unknown; +} +export type VisualQuestionAnsweringOutput = VisualQuestionAnsweringOutputElement[]; +/** + * Outputs of inference for the Visual Question Answering task + */ +export interface VisualQuestionAnsweringOutputElement { + /** + * The answer to the question + */ + answer?: string; + label: unknown; + /** + * The associated score / probability + */ + score: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/input.json b/packages/tasks/src/tasks/visual-question-answering/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..b6cb0e123cb842041dc930a8c0c80bb295179499 --- /dev/null +++ b/packages/tasks/src/tasks/visual-question-answering/spec/input.json @@ -0,0 +1,41 @@ +{ + "$id": "/inference/schemas/visual-question-answering/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Visual Question Answering inference", + "title": "VisualQuestionAnsweringInput", + "type": "object", + "properties": { + "data": { + "description": "One (image, question) pair to answer", + "type": "object", + "title": "VisualQuestionAnsweringInputData", + "properties": { + "image": { + "description": "The image." + }, + "question": { + "description": "The question to answer based on the image." 
+ } + }, + "required": ["question", "image"] + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/VisualQuestionAnsweringParameters" + } + }, + "$defs": { + "VisualQuestionAnsweringParameters": { + "title": "VisualQuestionAnsweringParameters", + "description": "Additional inference parameters for Visual Question Answering", + "type": "object", + "properties": { + "topK": { + "type": "integer", + "description": "The number of answers to return (will be chosen by order of likelihood). Note that we return less than topk answers if there are not enough options available within the context." + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/visual-question-answering/spec/output.json b/packages/tasks/src/tasks/visual-question-answering/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..32c9c6c26b8134412588731fdb894799a3f107e3 --- /dev/null +++ b/packages/tasks/src/tasks/visual-question-answering/spec/output.json @@ -0,0 +1,21 @@ +{ + "$id": "/inference/schemas/visual-question-answering/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Visual Question Answering task", + "title": "VisualQuestionAnsweringOutput", + "type": "array", + "items": { + "type": "object", + "properties": { + "answer": { + "type": "string", + "description": "The answer to the question" + }, + "score": { + "type": "number", + "description": "The associated score / probability" + } + }, + "required": ["answer", "score"] + } +} diff --git a/packages/tasks/src/tasks/zero-shot-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-classification/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..e0b43ec70b77c9ec760422e7d1f16cc36721403f --- /dev/null +++ b/packages/tasks/src/tasks/zero-shot-classification/inference.ts @@ -0,0 +1,67 @@ +/** + * Inference code generated from the JSON schema spec 
in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Zero Shot Classification inference + */ +export interface ZeroShotClassificationInput { + /** + * The input text data, with candidate labels + */ + data: ZeroShotClassificationInputData; + /** + * Additional inference parameters + */ + parameters?: ZeroShotClassificationParameters; + [property: string]: unknown; +} +/** + * The input text data, with candidate labels + */ +export interface ZeroShotClassificationInputData { + /** + * The set of possible class labels to classify the text into. + */ + candidateLabels: string[]; + /** + * The text to classify + */ + text: string; + [property: string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Zero Shot Classification + */ +export interface ZeroShotClassificationParameters { + /** + * The sentence used in conjunction with candidateLabels to attempt the text classification + * by replacing the placeholder with the candidate labels. + */ + hypothesisTemplate?: string; + /** + * Whether multiple candidate labels can be true. If false, the scores are normalized such + * that the sum of the label likelihoods for each sequence is 1. If true, the labels are + * considered independent and probabilities are normalized for each candidate. + */ + multiLabel?: boolean; + [property: string]: unknown; +} +export type ZeroShotClassificationOutput = ZeroShotClassificationOutputElement[]; +/** + * Outputs of inference for the Zero Shot Classification task + */ +export interface ZeroShotClassificationOutputElement { + /** + * The predicted class label. + */ + label: string; + /** + * The corresponding probability. 
+ */ + score: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..689c22769c8c1d1ec010f5b1d76f812a8fa33d97 --- /dev/null +++ b/packages/tasks/src/tasks/zero-shot-classification/spec/input.json @@ -0,0 +1,50 @@ +{ + "$id": "/inference/schemas/zero-shot-classification/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Zero Shot Classification inference", + "title": "ZeroShotClassificationInput", + "type": "object", + "properties": { + "data": { + "description": "The input text data, with candidate labels", + "type": "object", + "title": "ZeroShotClassificationInputData", + "properties": { + "text": { + "type": "string", + "description": "The text to classify" + }, + "candidateLabels": { + "type": "array", + "description": "The set of possible class labels to classify the text into.", + "items": { + "type": "string" + } + } + }, + "required": ["text", "candidateLabels"] + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/ZeroShotClassificationParameters" + } + }, + "$defs": { + "ZeroShotClassificationParameters": { + "title": "ZeroShotClassificationParameters", + "description": "Additional inference parameters for Zero Shot Classification", + "type": "object", + "properties": { + "hypothesisTemplate": { + "type": "string", + "description": "The sentence used in conjunction with candidateLabels to attempt the text classification by replacing the placeholder with the candidate labels." + }, + "multiLabel": { + "type": "boolean", + "description": "Whether multiple candidate labels can be true. If false, the scores are normalized such that the sum of the label likelihoods for each sequence is 1. 
If true, the labels are considered independent and probabilities are normalized for each candidate." + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/zero-shot-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..83ed1098fd139fe5373a5c5065596d3c1fffd491 --- /dev/null +++ b/packages/tasks/src/tasks/zero-shot-classification/spec/output.json @@ -0,0 +1,10 @@ +{ + "$id": "/inference/schemas/zero-shot-classification/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Zero Shot Classification task", + "title": "ZeroShotClassificationOutput", + "type": "array", + "items": { + "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput" + } +} diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..2bea5436b8882eccd3bd24de27dabac020e5b7c1 --- /dev/null +++ b/packages/tasks/src/tasks/zero-shot-image-classification/inference.ts @@ -0,0 +1,61 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Zero Shot Image Classification inference + */ +export interface ZeroShotImageClassificationInput { + /** + * The input image data, with candidate labels + */ + data: ZeroShotImageClassificationInputData; + /** + * Additional inference parameters + */ + parameters?: ZeroShotImageClassificationParameters; + [property: string]: unknown; +} +/** + * The input image data, with candidate labels + */ +export interface ZeroShotImageClassificationInputData { + /** + * The candidate labels for this image + */ + candidateLabels: string[]; + /** + * The image data to classify + */ + image: unknown; + [property: 
string]: unknown; +} +/** + * Additional inference parameters + * + * Additional inference parameters for Zero Shot Image Classification + */ +export interface ZeroShotImageClassificationParameters { + /** + * The sentence used in conjunction with candidateLabels to attempt the text classification + * by replacing the placeholder with the candidate labels. + */ + hypothesisTemplate?: string; + [property: string]: unknown; +} +export type ZeroShotImageClassificationOutput = ZeroShotImageClassificationOutputElement[]; +/** + * Outputs of inference for the Zero Shot Image Classification task + */ +export interface ZeroShotImageClassificationOutputElement { + /** + * The predicted class label. + */ + label: string; + /** + * The corresponding probability. + */ + score: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..d5b212918ff1ddbee93a1297735d12335527dc07 --- /dev/null +++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/input.json @@ -0,0 +1,45 @@ +{ + "$id": "/inference/schemas/zero-shot-image-classification/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Zero Shot Image Classification inference", + "title": "ZeroShotImageClassificationInput", + "type": "object", + "properties": { + "data": { + "description": "The input image data, with candidate labels", + "type": "object", + "title": "ZeroShotImageClassificationInputData", + "properties": { + "image": { + "description": "The image data to classify" + }, + "candidateLabels": { + "description": "The candidate labels for this image", + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["image", "candidateLabels"] + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": 
"#/$defs/ZeroShotImageClassificationParameters" + } + }, + "$defs": { + "ZeroShotImageClassificationParameters": { + "title": "ZeroShotImageClassificationParameters", + "description": "Additional inference parameters for Zero Shot Image Classification", + "type": "object", + "properties": { + "hypothesisTemplate": { + "type": "string", + "description": "The sentence used in conjunction with candidateLabels to attempt the text classification by replacing the placeholder with the candidate labels." + } + } + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..6b795fbdbae8b566845fb424f30a7d7908609358 --- /dev/null +++ b/packages/tasks/src/tasks/zero-shot-image-classification/spec/output.json @@ -0,0 +1,10 @@ +{ + "$id": "/inference/schemas/zero-shot-image-classification/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Zero Shot Image Classification task", + "title": "ZeroShotImageClassificationOutput", + "type": "array", + "items": { + "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput" + } +} diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts new file mode 100644 index 0000000000000000000000000000000000000000..edb51172eceace9896d5b12e20a3cb38aa4ab953 --- /dev/null +++ b/packages/tasks/src/tasks/zero-shot-object-detection/inference.ts @@ -0,0 +1,66 @@ +/** + * Inference code generated from the JSON schema spec in ./spec + * + * Using src/scripts/inference-codegen + */ +/** + * Inputs for Zero Shot Object Detection inference + */ +export interface ZeroShotObjectDetectionInput { + /** + * The input image data, with candidate labels + */ + data: 
ZeroShotObjectDetectionInputData; + /** + * Additional inference parameters + */ + parameters?: { + [key: string]: unknown; + }; + [property: string]: unknown; +} +/** + * The input image data, with candidate labels + */ +export interface ZeroShotObjectDetectionInputData { + /** + * The candidate labels for this image + */ + candidateLabels: string[]; + /** + * The image data to generate bounding boxes from + */ + image: unknown; + [property: string]: unknown; +} +/** + * The predicted bounding box. Coordinates are relative to the top left corner of the input + * image. + */ +export interface BoundingBox { + xmax: number; + xmin: number; + ymax: number; + ymin: number; + [property: string]: unknown; +} +export type ZeroShotObjectDetectionOutput = ZeroShotObjectDetectionOutputElement[]; +/** + * Outputs of inference for the Zero Shot Object Detection task + */ +export interface ZeroShotObjectDetectionOutputElement { + /** + * The predicted bounding box. Coordinates are relative to the top left corner of the input + * image. 
+ */ + box: BoundingBox; + /** + * A candidate label + */ + label: string; + /** + * The associated score / probability + */ + score: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json new file mode 100644 index 0000000000000000000000000000000000000000..63dce00edb3607cf062af797bcaa136692ab11f9 --- /dev/null +++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/input.json @@ -0,0 +1,40 @@ +{ + "$id": "/inference/schemas/zero-shot-object-detection/input.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Zero Shot Object Detection inference", + "title": "ZeroShotObjectDetectionInput", + "type": "object", + "properties": { + "data": { + "description": "The input image data, with candidate labels", + "type": "object", + "title": "ZeroShotObjectDetectionInputData", + "properties": { + "image": { + "description": "The image data to generate bounding boxes from" + }, + "candidateLabels": { + "description": "The candidate labels for this image", + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["image", "candidateLabels"] + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/ZeroShotObjectDetectionParameters" + } + }, + "$defs": { + "ZeroShotObjectDetectionParameters": { + "title": "ZeroShotObjectDetectionParameters", + "description": "Additional inference parameters for Zero Shot Object Detection", + "type": "object", + "properties": {} + } + }, + "required": ["data"] +} diff --git a/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json new file mode 100644 index 0000000000000000000000000000000000000000..8afa6052769f617ae365348d4d560ee43095ae4a --- /dev/null +++ b/packages/tasks/src/tasks/zero-shot-object-detection/spec/output.json 
@@ -0,0 +1,47 @@ +{ + "$id": "/inference/schemas/zero-shot-object-detection/output.json", + "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Outputs of inference for the Zero Shot Object Detection task", + "title": "ZeroShotObjectDetectionOutput", + "type": "array", + "items": { + "type": "object", + "title": "ZeroShotObjectDetectionOutputElement", + "properties": { + "label": { + "type": "string", + "description": "A candidate label" + }, + "score": { + "type": "number", + "description": "The associated score / probability" + }, + "box": { + "$ref": "#/$defs/BoundingBox", + "description": "The predicted bounding box. Coordinates are relative to the top left corner of the input image." + } + }, + "required": ["box", "label", "score"] + }, + "$defs": { + "BoundingBox": { + "title": "BoundingBox", + "type": "object", + "properties": { + "xmin": { + "type": "integer" + }, + "xmax": { + "type": "integer" + }, + "ymin": { + "type": "integer" + }, + "ymax": { + "type": "integer" + } + }, + "required": ["xmin", "xmax", "ymin", "ymax"] + } + } +}