LeaderboardsExplorer

Running

LeaderboardsExplorer / src /static /tag_info.py

Clémentine

fix typo in judge

64d2c90 4 months ago

No virus

4.35 kB

	from enum import Enum
	from dataclasses import dataclass

	@dataclass
	class Tag:
	    """Descriptor for a single leaderboard tag.

	    Attributes:
	        key: Identifier string of the tag; the enums in this file use
	            values such as ``"submission:automatic"``.
	        name: Label intended for display.
	        usage: Text explaining how the tag is used.
	        icon: Icon string; the enums in this file all pass ``""``.
	    """

	    key: str
	    name: str
	    usage: str
	    icon: str

	class SubmissionType(Enum):
	    """Tags describing how a leaderboard handles new submissions.

	    Each member's value is a ``Tag`` whose key starts with ``submission:``.
	    """

	    automatic = Tag(
	        key="submission:automatic",
	        name="Automatic",
	        usage=(
	            "users can submit their models as such to the leaderboard, "
	            "and evaluation is run automatically without human intervention"
	        ),
	        icon="",
	    )
	    semiautomatic = Tag(
	        key="submission:semiautomatic",
	        name="Semi Automatic",
	        usage=(
	            "the leaderboard requires the model owner to run evaluations "
	            "on his side and submit the results"
	        ),
	        icon="",
	    )
	    manual = Tag(
	        key="submission:manual",
	        name="Manual",
	        usage=(
	            "the leaderboard requires the leaderboard owner "
	            "to run evaluations for new submissions"
	        ),
	        icon="",
	    )
	    closed = Tag(
	        key="submission:closed",
	        name="Closed",
	        usage="the leaderboard does not accept submissions at the moment",
	        icon="",
	    )

	class TestSetStatus(Enum):
	    """Tags describing whether a leaderboard's test sets are public.

	    Each member's value is a ``Tag`` whose key starts with ``test:``.
	    """

	    public = Tag(
	        key="test:public",
	        name="Public",
	        usage=(
	            "all the test sets used are public, "
	            "the evaluations are completely reproducible"
	        ),
	        icon="",
	    )
	    mix = Tag(
	        key="test:mix",
	        name="Mix",
	        usage="some test sets are public and some private",
	        icon="",
	    )
	    private = Tag(
	        key="test:private",
	        name="Private",
	        usage=(
	            "all the test sets used are private, "
	            "the evaluations are hard to game"
	        ),
	        icon="",
	    )
	    rolling = Tag(
	        key="test:rolling",
	        name="Rolling",
	        usage=(
	            "the test sets used change regularly through time "
	            "and evaluation scores are refreshed"
	        ),
	        icon="",
	    )

	class Judge(Enum):
	    """Tags describing who or what scores the evaluated answers.

	    Each member's value is a ``Tag`` whose key starts with ``judge:``.
	    """

	    auto = Tag(
	        key="judge:auto",
	        name="Automatic metric",
	        usage=(
	            "evaluations are run automatically, using an evaluation suite "
	            "such as `lm_eval` or `lighteval`"
	        ),
	        icon="",
	    )
	    model = Tag(
	        key="judge:model",
	        name="Model",
	        usage=(
	            "evaluations are run using a model as a judge approach "
	            "to rate answer"
	        ),
	        icon="",
	    )
	    humans = Tag(
	        key="judge:humans",
	        name="Human",
	        usage="evaluations are done by humans to rate answer - this is an arena",
	        icon="",
	    )
	    vibe_check = Tag(
	        key="judge:vibe_check",
	        name="Vibe check",
	        usage="evaluations are done manually by one or several humans",
	        icon="",
	    )

	class Modality(Enum):
	    """Tags naming the modality a leaderboard is concerned with.

	    Each member's value is a ``Tag`` whose key starts with ``modality:``.
	    The text, image, audio and video members carry an empty usage string.
	    """

	    text = Tag(
	        key="modality:text",
	        name="Text",
	        usage="",
	        icon="",
	    )
	    image = Tag(
	        key="modality:image",
	        name="Image",
	        usage="",
	        icon="",
	    )
	    audio = Tag(
	        key="modality:audio",
	        name="Audio",
	        usage="",
	        icon="",
	    )
	    video = Tag(
	        key="modality:video",
	        name="Video",
	        usage="",
	        icon="",
	    )
	    tools = Tag(
	        key="modality:tools",
	        name="Tools",
	        usage=(
	            "requires added tool usage - mostly for assistant models "
	            "(a bit outside of usual modalities)"
	        ),
	        icon="",
	    )
	    artefacts = Tag(
	        key="modality:artefacts",
	        name="Artefacts",
	        usage=(
	            "the leaderboard concerns itself with machine learning artefacts "
	            "as themselves, for example, quality evaluation of text embeddings "
	            "(a bit outside of usual modalities)"
	        ),
	        icon="",
	    )

	class EvaluationCategory(Enum):
	    """Tags naming what an evaluation focuses on.

	    Each member's value is a ``Tag`` whose key starts with ``eval:``.
	    """

	    generation = Tag(
	        key="eval:generation",
	        name="Generation",
	        # The trailing space inside the literal is kept as-is on purpose.
	        usage=(
	            "the evaluation looks at generation capabilities specifically "
	            "(can be image generation, text generation, ...) "
	        ),
	        icon="",
	    )
	    math = Tag(
	        key="eval:math",
	        name="Math",
	        usage="the evaluation tests math abilities",
	        icon="",
	    )
	    code = Tag(
	        key="eval:code",
	        name="Code",
	        usage="the evaluation tests coding capabilities",
	        icon="",
	    )
	    performance = Tag(
	        key="eval:performance",
	        name="Performance",
	        usage="model performance (speed, energy consumption, ...)",
	        icon="",
	    )
	    safety = Tag(
	        key="eval:safety",
	        name="Safety",
	        usage="the evaluation considers safety, toxicity, bias",
	        icon="",
	    )