import { env, Tensor, AutoTokenizer, SpeechT5ForTextToSpeech, SpeechT5HifiGan } from '@xenova/transformers';

import { encodeWAV } from './utils';

// Skip the local model check and always fetch models from the Hugging Face Hub.
env.allowLocalModels = false;

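// Wraps the tokenizer, the SpeechT5 text-to-speech model, and the HiFi-GAN
// vocoder behind a lazily-initialised singleton, so the (large) model files
// are downloaded at most once per worker.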
class MyTextToSpeechPipeline {

  static BASE_URL = 'https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/';

  static model_id = 'Xenova/speecht5_tts';
  static vocoder_id = 'Xenova/speecht5_hifigan';

  static tokenizer_instance = null;
  static model_instance = null;
  static vocoder_instance = null;

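  // Load (or reuse) the tokenizer, model, and vocoder. The `from_pretrained`
  // promises are stored immediately, so concurrent callers share the same
  // in-flight downloads instead of starting new ones.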
  static async getInstance(progress_callback = null) {
    if (this.tokenizer_instance === null) {
      this.tokenizer_instance = AutoTokenizer.from_pretrained(this.model_id, { progress_callback });
    }

    if (this.model_instance === null) {
      this.model_instance = SpeechT5ForTextToSpeech.from_pretrained(this.model_id, {
        quantized: false,
        progress_callback,
      });
    }

    if (this.vocoder_instance === null) {
      this.vocoder_instance = SpeechT5HifiGan.from_pretrained(this.vocoder_id, {
        quantized: false,
        progress_callback,
      });
    }

    const result = await Promise.all([
      this.tokenizer_instance,
      this.model_instance,
      this.vocoder_instance,
    ]);

    // Notify the main thread that all three components are ready.
    self.postMessage({ status: 'ready' });

    return result;
  }

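  // Fetch a speaker's 512-dimensional x-vector from the CMU ARCTIC dataset
  // and wrap it in a [1, 512] float32 tensor for `generate_speech`.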
  static async getSpeakerEmbeddings(speaker_id) {
    const speaker_embeddings_url = `${this.BASE_URL}${speaker_id}.bin`;
    const speaker_embeddings = new Tensor(
      'float32',
      new Float32Array(await (await fetch(speaker_embeddings_url)).arrayBuffer()),
      [1, 512],
    );
    return speaker_embeddings;
  }
}

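// Cache speaker embeddings so each speaker's .bin file is fetched only once.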
const speaker_embeddings_cache = new Map();

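// Handle requests from the main thread. Each message is expected to carry
// `{ text, speaker_id }`; the worker replies with model-loading progress
// events, then a 'ready', 'error', or 'complete' status message.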
self.addEventListener('message', async (event) => {
  // Forward model-loading progress events to the main thread.
  const [tokenizer, model, vocoder] = await MyTextToSpeechPipeline.getInstance(x => {
    self.postMessage(x);
  });

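  // Tokenize the input text into model-ready input IDs.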
  const { input_ids } = tokenizer(event.data.text);

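  // Look up the requested speaker's embedding, fetching it on first use.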
  let speaker_embeddings = speaker_embeddings_cache.get(event.data.speaker_id);
  if (speaker_embeddings === undefined) {
    speaker_embeddings = await MyTextToSpeechPipeline.getSpeakerEmbeddings(event.data.speaker_id);
    speaker_embeddings_cache.set(event.data.speaker_id, speaker_embeddings);
  }

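  // Generate the speech waveform, reporting any failure back to the main
  // thread before re-throwing so the error also surfaces in the worker.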
  let response;
  try {
    response = await model.generate_speech(input_ids, speaker_embeddings, { vocoder });
  } catch (e) {
    self.postMessage({
      status: 'error',
      exception: e,
    });
    throw e;
  }
  const { waveform } = response;

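  // Serialise the raw Float32Array samples into a WAV file. SpeechT5's
  // vocoder produces 16 kHz mono audio; encodeWAV (from ./utils) is assumed
  // to write a matching RIFF/WAV header around the samples.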
  const wav = encodeWAV(waveform.data);

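  // Send the finished audio back as a Blob the main thread can play directly.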
  self.postMessage({
    status: 'complete',
    output: new Blob([wav], { type: 'audio/wav' }),
  });
});
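
// Example usage from the main thread, as a minimal sketch. The worker
// filename and the speaker ID below are assumptions; any x-vector ID from
// the Xenova/cmu-arctic-xvectors-extracted dataset should work.
//
//   const worker = new Worker(new URL('./worker.js', import.meta.url), { type: 'module' });
//   worker.addEventListener('message', (e) => {
//     if (e.data.status === 'complete') {
//       new Audio(URL.createObjectURL(e.data.output)).play();
//     }
//   });
//   worker.postMessage({ text: 'Hello, world!', speaker_id: 'cmu_us_slt_arctic-wav-arctic_a0001' });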