whisper-webui-translate

Running

App Files Files Community

whisper-webui-translate / config.json5

avans06

Added the "Whisper Segments Filter" option along with some configuration adjustments.

ec7cc5c 11 months ago

raw

history blame

14.1 kB

	{
	"models": {
	"whisper": [
	// Configuration for the built-in models. You can remove any of these
	// if you don't want to use the default models.
	{
	"name": "tiny",
	"url": "tiny"
	},
	{
	"name": "base",
	"url": "base"
	},
	{
	"name": "small",
	"url": "small"
	},
	{
	"name": "medium",
	"url": "medium"
	},
	{
	"name": "large",
	"url": "large"
	},
	{
	"name": "large-v1",
	"url": "large-v1"
	},
	{
	"name": "large-v2",
	"url": "large-v2"
	},
	{
	"name": "large-v3",
	"url": "large-v3"
	}
	// Uncomment to add custom Japanese models
	//{
	// "name": "whisper-large-v2-mix-jp",
	// "url": "vumichien/whisper-large-v2-mix-jp",
	// // The type of the model. Can be "huggingface" or "whisper" - "whisper" is the default.
	// // HuggingFace models are loaded using the HuggingFace transformers library and then converted to Whisper models.
	// "type": "huggingface",
	//},
	//{
	// "name": "local-model",
	// "url": "path/to/local/model",
	//},
	//{
	// "name": "remote-model",
	// "url": "https://example.com/path/to/model",
	//}
	],
	"m2m100": [
	{
	"name": "m2m100_1.2B-ct2fast/michaelfeil",
	"url": "michaelfeil/ct2fast-m2m100_1.2B",
	"type": "huggingface",
	"tokenizer_url": "facebook/m2m100_1.2B"
	},
	{
	"name": "m2m100_418M-ct2fast/michaelfeil",
	"url": "michaelfeil/ct2fast-m2m100_418M",
	"type": "huggingface",
	"tokenizer_url": "facebook/m2m100_418M"
	},
	//{
	// "name": "m2m100-12B-ct2fast/michaelfeil",
	// "url": "michaelfeil/ct2fast-m2m100-12B-last-ckpt",
	// "type": "huggingface",
	// "tokenizer_url": "facebook/m2m100-12B-last-ckpt"
	//},
	{
	"name": "m2m100_1.2B/facebook",
	"url": "facebook/m2m100_1.2B",
	"type": "huggingface"
	},
	{
	"name": "m2m100_418M/facebook",
	"url": "facebook/m2m100_418M",
	"type": "huggingface"
	}
	],
	"nllb": [
	{
	"name": "nllb-200-distilled-1.3B-ct2fast:int8_float16/michaelfeil",
	"url": "michaelfeil/ct2fast-nllb-200-distilled-1.3B",
	"type": "huggingface",
	"tokenizer_url": "facebook/nllb-200-distilled-1.3B"
	},
	{
	"name": "nllb-200-3.3B-ct2fast:int8_float16/michaelfeil",
	"url": "michaelfeil/ct2fast-nllb-200-3.3B",
	"type": "huggingface",
	"tokenizer_url": "facebook/nllb-200-3.3B"
	},
	{
	"name": "nllb-200-1.3B-ct2:float16/JustFrederik",
	"url": "JustFrederik/nllb-200-1.3B-ct2-float16",
	"type": "huggingface",
	"tokenizer_url": "facebook/nllb-200-1.3B"
	},
	{
	"name": "nllb-200-distilled-1.3B-ct2:float16/JustFrederik",
	"url": "JustFrederik/nllb-200-distilled-1.3B-ct2-float16",
	"type": "huggingface",
	"tokenizer_url": "facebook/nllb-200-distilled-1.3B"
	},
	{
	"name": "nllb-200-1.3B-ct2:int8/JustFrederik",
	"url": "JustFrederik/nllb-200-1.3B-ct2-int8",
	"type": "huggingface",
	"tokenizer_url": "facebook/nllb-200-1.3B"
	},
	{
	"name": "nllb-200-distilled-1.3B-ct2:int8/JustFrederik",
	"url": "JustFrederik/nllb-200-distilled-1.3B-ct2-int8",
	"type": "huggingface",
	"tokenizer_url": "facebook/nllb-200-distilled-1.3B"
	},
	{
	"name": "nllb-200-distilled-600M/facebook",
	"url": "facebook/nllb-200-distilled-600M",
	"type": "huggingface"
	},
	{
	"name": "nllb-200-distilled-600M-ct2/JustFrederik",
	"url": "JustFrederik/nllb-200-distilled-600M-ct2",
	"type": "huggingface",
	"tokenizer_url": "facebook/nllb-200-distilled-600M"
	},
	{
	"name": "nllb-200-distilled-600M-ct2:float16/JustFrederik",
	"url": "JustFrederik/nllb-200-distilled-600M-ct2-float16",
	"type": "huggingface",
	"tokenizer_url": "facebook/nllb-200-distilled-600M"
	},
	{
	"name": "nllb-200-distilled-600M-ct2:int8/JustFrederik",
	"url": "JustFrederik/nllb-200-distilled-600M-ct2-int8",
	"type": "huggingface",
	"tokenizer_url": "facebook/nllb-200-distilled-600M"
	}
	// Uncomment to add the official Facebook 1.3B and 3.3B models.
	// The official Facebook 1.3B and 3.3B model files are too large,
	// and to avoid occupying too much disk space on Hugging Face's free spaces,
	// these models are not included in the config.
	//{
	// "name": "nllb-200-distilled-1.3B/facebook",
	// "url": "facebook/nllb-200-distilled-1.3B",
	// "type": "huggingface"
	//},
	//{
	// "name": "nllb-200-1.3B/facebook",
	// "url": "facebook/nllb-200-1.3B",
	// "type": "huggingface"
	//},
	//{
	// "name": "nllb-200-3.3B/facebook",
	// "url": "facebook/nllb-200-3.3B",
	// "type": "huggingface"
	//},
	//{
	// "name": "nllb-200-distilled-1.3B-ct2/JustFrederik",
	// "url": "JustFrederik/nllb-200-distilled-1.3B-ct2",
	// "type": "huggingface",
	// "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
	//},
	//{
	// "name": "nllb-200-1.3B-ct2/JustFrederik",
	// "url": "JustFrederik/nllb-200-1.3B-ct2",
	// "type": "huggingface",
	// "tokenizer_url": "facebook/nllb-200-1.3B"
	//},
	//{
	// "name": "nllb-200-3.3B-ct2:float16/JustFrederik",
	// "url": "JustFrederik/nllb-200-3.3B-ct2-float16",
	// "type": "huggingface",
	// "tokenizer_url": "facebook/nllb-200-3.3B"
	//},
	],
	"mt5": [
	{
	"name": "mt5-zh-ja-en-trimmed/K024",
	"url": "K024/mt5-zh-ja-en-trimmed",
	"type": "huggingface"
	},
	{
	"name": "mt5-zh-ja-en-trimmed-fine-tuned-v1/engmatic-earth",
	"url": "engmatic-earth/mt5-zh-ja-en-trimmed-fine-tuned-v1",
	"type": "huggingface"
	}
	],
	"ALMA": [
	{
	"name": "ALMA-7B-GPTQ/TheBloke",
	"url": "TheBloke/ALMA-7B-GPTQ",
	"type": "huggingface"
	},
	{
	"name": "ALMA-13B-GPTQ/TheBloke",
	"url": "TheBloke/ALMA-13B-GPTQ",
	"type": "huggingface"
	},
	{
	"name": "ALMA-7B-GGUF-Q4_K_M/TheBloke",
	"url": "TheBloke/ALMA-7B-GGUF",
	"type": "huggingface",
	"model_file": "alma-7b.Q4_K_M.gguf",
	"tokenizer_url": "haoranxu/ALMA-7B"
	},
	{
	"name": "ALMA-13B-GGUF-Q4_K_M/TheBloke",
	"url": "TheBloke/ALMA-13B-GGUF",
	"type": "huggingface",
	"model_file": "alma-13b.Q4_K_M.gguf",
	"tokenizer_url": "haoranxu/ALMA-13B"
	},
	{
	"name": "ALMA-7B-ct2:int8_float16/avan",
	"url": "avans06/ALMA-7B-ct2-int8_float16",
	"type": "huggingface",
	"tokenizer_url": "haoranxu/ALMA-7B"
	},
	{
	"name": "ALMA-13B-ct2:int8_float16/avan",
	"url": "avans06/ALMA-13B-ct2-int8_float16",
	"type": "huggingface",
	"tokenizer_url": "haoranxu/ALMA-13B"
	},
	]
	},
	// Configuration options that will be used if they are not specified in the command line arguments.

	// * WEBUI options *

	// Maximum audio file length in seconds, or -1 for no limit. Ignored by CLI.
	"input_audio_max_duration": 1800,
	// True to share the app on HuggingFace.
	"share": false,
	// The host or IP to bind to. If None, bind to localhost.
	"server_name": null,
	// The port to bind to.
	"server_port": 7860,
	// The number of workers to use for the web server. Use -1 to disable queueing.
	"queue_concurrency_count": 1,
	// Whether or not to automatically delete all uploaded files, to save disk space
	"delete_uploaded_files": true,

	// * General options *

	// The default implementation to use for Whisper. Can be "whisper" or "faster-whisper".
	// Note that you must either install the requirements for faster-whisper (requirements-fasterWhisper.txt)
	// or whisper (requirements.txt)
	"whisper_implementation": "faster-whisper",

	// The default model name.
	"default_model_name": "large-v2",
	// The default VAD.
	"default_vad": "silero-vad",
	// A comma-delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.
	"vad_parallel_devices": "",
	// The number of CPU cores to use for VAD pre-processing.
	"vad_cpu_cores": 1,
	// The number of seconds before inactive processes are terminated. Use 0 to close processes immediately, or None for no timeout.
	"vad_process_timeout": 1800,
	// True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.
	"auto_parallel": false,
	// Directory to save the outputs (CLI will use the current directory if not specified)
	"output_dir": null,
	// The path to save model files; uses ~/.cache/whisper by default
	"model_dir": null,
	// Device to use for PyTorch inference, or Null to use the default device
	"device": null,
	// Whether to print out the progress and debug messages
	"verbose": true,
	// Whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')
	"task": "transcribe",
	// Language spoken in the audio, specify None to perform language detection
	"language": null,
	// The window size (in seconds) to merge voice segments
	"vad_merge_window": 5,
	// The maximum size (in seconds) of a voice segment
	"vad_max_merge_size": 90,
	// The padding (in seconds) to add to each voice segment
	"vad_padding": 1,
	// Whether or not to prepend the initial prompt to each VAD segment (prepend_all_segments), or just the first segment (prepend_first_segment)
	"vad_initial_prompt_mode": "prepend_first_segment",
	// The window size of the prompt to pass to Whisper
	"vad_prompt_window": 3,
	// Temperature to use for sampling
	"temperature": 0,
	// Number of candidates when sampling with non-zero temperature
	"best_of": 5,
	// Number of beams in beam search, only applicable when temperature is zero
	"beam_size": 5,
	// Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search
	"patience": 1,
	// Optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default
	"length_penalty": null,
	// Comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations
	"suppress_tokens": "-1",
	// Optional text to provide as a prompt for the first window
	"initial_prompt": null,
	// If True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop
	"condition_on_previous_text": true,
	// Whether to perform inference in fp16; True by default
	"fp16": true,
	// The compute type used by faster-whisper. Can be "int8", "int16" or "float16".
	"compute_type": "auto",
	// Temperature to increase when falling back when the decoding fails to meet either of the thresholds below
	"temperature_increment_on_fallback": 0.2,
	// If the gzip compression ratio is higher than this value, treat the decoding as failed
	"compression_ratio_threshold": 2.4,
	// If the average log probability is lower than this value, treat the decoding as failed
	"logprob_threshold": -1.0,
	// If the probability of the <no-speech> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence
	"no_speech_threshold": 0.6,
	// [faster-whisper] The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0.
	"repetition_penalty": 1.0,
	// [faster-whisper] The model ensures that a sequence of words of no_repeat_ngram_size isn’t repeated in the output sequence. If specified, it must be a positive integer greater than 1.
	"no_repeat_ngram_size": 0,

	// (experimental) extract word-level timestamps and refine the results based on them
	"word_timestamps": true,
	// if word_timestamps is True, merge these punctuation symbols with the next word
	"prepend_punctuations": "\"\'“¿([{-",
	// if word_timestamps is True, merge these punctuation symbols with the previous word
	"append_punctuations": "\"\'.。,，!！?？:：”)]}、",
	// (requires --word_timestamps True) underline each word as it is spoken in srt and vtt
	"highlight_words": false,

	// Diarization settings
	"auth_token": null,
	// Whether to perform speaker diarization
	"diarization": false,
	// The number of speakers to detect
	"diarization_speakers": 2,
	// The minimum number of speakers to detect
	"diarization_min_speakers": 1,
	// The maximum number of speakers to detect
	"diarization_max_speakers": 8,
	// The number of seconds before inactive processes are terminated. Use 0 to close processes immediately, or None for no timeout.
	"diarization_process_timeout": 60,

	// Whisper Segments Filter
	"whisper_segments_filter": false,
	"whisper_segments_filters": [
	"avg_logprob < -0.9",
	"(durationLen < 1.5 || segment_last), textLen > 5, avg_logprob < -0.4, no_speech_prob > 0.5",
	"(durationLen < 1.5 || segment_last), textLen > 5, avg_logprob < -0.4, no_speech_prob > 0.07, compression_ratio < 0.9",
	"(durationLen < 1.5 || segment_last), compression_ratio < 0.9, no_speech_prob > 0.1"
	],

	// Translation - The maximum batch size.
	"translation_batch_size": 2,
	// Translation - Prevent repetitions of ngrams with this size (set 0 to disable).
	"translation_no_repeat_ngram_size": 3,
	// Translation - Beam size (1 for greedy search).
	"translation_num_beams": 2,
	}