import {useCallback, useEffect, useLayoutEffect, useRef, useState} from 'react'; import Button from '@mui/material/Button'; import Typography from '@mui/material/Typography'; import InputLabel from '@mui/material/InputLabel'; import FormControl from '@mui/material/FormControl'; import Select, {SelectChangeEvent} from '@mui/material/Select'; import MenuItem from '@mui/material/MenuItem'; import Stack from '@mui/material/Stack'; import seamlessLogoUrl from './assets/seamless.svg'; import { AgentCapabilities, BaseResponse, BrowserAudioStreamConfig, DynamicConfig, PartialDynamicConfig, SUPPORTED_INPUT_SOURCES, SUPPORTED_OUTPUT_MODES, ServerExceptionData, ServerSpeechData, ServerState, ServerTextData, StartStreamEventConfig, StreamingStatus, SupportedInputSource, SupportedOutputMode, TranslationSentences, } from './types/StreamingTypes'; import FormLabel from '@mui/material/FormLabel'; import RadioGroup from '@mui/material/RadioGroup'; import FormControlLabel from '@mui/material/FormControlLabel'; import Radio from '@mui/material/Radio'; import './StreamingInterface.css'; import RoomConfig from './RoomConfig'; import Divider from '@mui/material/Divider'; import {useSocket} from './useSocket'; import {RoomState} from './types/RoomState'; import useStable from './useStable'; import float32To16BitPCM from './float32To16BitPCM'; import createBufferedSpeechPlayer from './createBufferedSpeechPlayer'; import Checkbox from '@mui/material/Checkbox'; import Alert from '@mui/material/Alert'; import isScrolledToDocumentBottom from './isScrolledToDocumentBottom'; import Box from '@mui/material/Box'; import Slider from '@mui/material/Slider'; import VolumeDown from '@mui/icons-material/VolumeDown'; import VolumeUp from '@mui/icons-material/VolumeUp'; import Mic from '@mui/icons-material/Mic'; import MicOff from '@mui/icons-material/MicOff'; import XRDialog from './react-xr/XRDialog'; import getTranslationSentencesFromReceivedData from './getTranslationSentencesFromReceivedData'; 
import { sliceTranslationSentencesUpToIndex, getTotalSentencesLength, } from './sliceTranslationSentencesUtils';
import Blink from './Blink';
import {CURSOR_BLINK_INTERVAL_MS} from './cursorBlinkInterval';
import {getURLParams} from './URLParams';
import debug from './debug';
import DebugSection from './DebugSection';
import Switch from '@mui/material/Switch';
import Grid from '@mui/material/Grid';
import {getLanguageFromThreeLetterCode} from './languageLookup';
import HeadphonesIcon from '@mui/icons-material/Headphones';

/**
 * Per-input-source audio constraints used when the caller does not supply
 * its own values for noise suppression / echo cancellation.
 */
const AUDIO_STREAM_DEFAULTS = {
  userMedia: {
    echoCancellation: false,
    noiseSuppression: true,
  },
  displayMedia: {
    echoCancellation: false,
    noiseSuppression: false,
  },
} as const;

/**
 * Requests a mono audio stream via `getUserMedia`.
 *
 * @param config Audio constraints merged into the request; defaults to
 *   `AUDIO_STREAM_DEFAULTS.userMedia`.
 * @returns The MediaStream returned by the browser.
 * @throws Whatever `getUserMedia` rejects with (e.g. permission denied).
 */
async function requestUserMediaAudioStream(
  config: BrowserAudioStreamConfig = AUDIO_STREAM_DEFAULTS['userMedia'],
) {
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: {...config, channelCount: 1},
  });
  console.debug(
    '[requestUserMediaAudioStream] stream created with settings:',
    stream.getAudioTracks()[0]?.getSettings(),
  );
  return stream;
}

/**
 * Requests a mono audio stream via `getDisplayMedia`.
 *
 * The browser lets the user pick a source without sharing its audio, in which
 * case the returned stream carries no audio track. Such a stream cannot be fed
 * into a MediaStreamAudioSourceNode, so it is released here and reported as an
 * error instead of being handed back to the caller.
 *
 * @param config Audio constraints merged into the request; defaults to
 *   `AUDIO_STREAM_DEFAULTS.displayMedia`.
 * @returns A MediaStream that has at least one audio track.
 * @throws Whatever `getDisplayMedia` rejects with, or an Error when the
 *   selected source provides no audio track.
 */
async function requestDisplayMediaAudioStream(
  config: BrowserAudioStreamConfig = AUDIO_STREAM_DEFAULTS['displayMedia'],
) {
  const stream = await navigator.mediaDevices.getDisplayMedia({
    audio: {...config, channelCount: 1},
  });
  const [audioTrack] = stream.getAudioTracks();
  if (audioTrack == null) {
    // Stop every captured track so the browser ends the screen-share session
    // instead of leaving it running with nothing consuming it.
    stream.getTracks().forEach((track) => track.stop());
    throw new Error(
      '[requestDisplayMediaAudioStream] the selected display source did not provide an audio track',
    );
  }
  console.debug(
    '[requestDisplayMediaAudioStream] stream created with settings:',
    audioTrack.getSettings(),
  );
  return stream;
}

/** Label shown on the start/stop control for each streaming status. */
const buttonLabelMap: {[key in StreamingStatus]: string} = {
  stopped: 'Start Streaming',
  running: 'Stop Streaming',
  starting: 'Starting...',
};

/** Sent to the server as `buffer_limit` in the `configure_stream` payload. */
const BUFFER_LIMIT = 1;

/** Threshold passed to `isScrolledToDocumentBottom` by the scroll listener. */
const SCROLLED_TO_BOTTOM_THRESHOLD_PX = 36;

/** Multiplier applied to the part of the gain slider value that exceeds 1. */
const GAIN_MULTIPLIER_OVER_1 = 3;

// Maps the slider value to the playback gain: values up to 1 pass through
// unchanged; the portion above 1 is multiplied by GAIN_MULTIPLIER_OVER_1.
const getGainScaledValue = (value: number) => value > 1 ? 
(value - 1) * GAIN_MULTIPLIER_OVER_1 + 1 : value;
/** Active-transcoder count at or above which the "Performance may be degraded" warning is rendered. */
const TOTAL_ACTIVE_TRANSCODER_WARNING_THRESHOLD = 2;
/** Maximum number of server exceptions kept in component state; the list is newest-first and truncated to this length. */
const MAX_SERVER_EXCEPTIONS_TRACKED = 500;
/** Delay in milliseconds between successive increments of the typing-animation index. */
export const TYPING_ANIMATION_DELAY_MS = 6;
/**
 * Streaming translation UI component.
 *
 * Obtains a socket and clientID from useSocket and mirrors 'room_state_update'
 * and 'server_state_update' events into state. While streaming, input audio is
 * converted to 16-bit PCM inside a ScriptProcessorNode callback and emitted as
 * 'incoming_audio'; 'translation_text' events are accumulated for the
 * transcript and 'translation_speech' payloads are handed to the buffered
 * speech player.
 *
 * NOTE(review): in this copy of the file the JSX element markup and several
 * generic type arguments (e.g. `useState>([])`, `Array =`) appear to have been
 * stripped. Confirm against the original source before editing the render
 * section or the state declarations.
 */
export default function StreamingInterface() { const urlParams = getURLParams(); const debugParam = urlParams.debug; const [animateTextDisplay, setAnimateTextDisplay] = useState( urlParams.animateTextDisplay, ); const socketObject = useSocket(); const {socket, clientID} = socketObject; const [serverState, setServerState] = useState(null); const [agent, setAgent] = useState(null); const model = agent?.name ?? null; const agentsCapabilities: Array = serverState?.agentsCapabilities ?? []; const currentAgent: AgentCapabilities | null = agentsCapabilities.find((agent) => agent.name === model) ?? null; const [serverExceptions, setServerExceptions] = useState< Array >([]); const [roomState, setRoomState] = useState(null); const roomID = roomState?.room_id ?? null; /** True when this client's ID is in the room's speakers list; false when clientID or roomState is missing. */ const isSpeaker = (clientID != null && roomState?.speakers.includes(clientID)) ?? false; /** True when this client's ID is in the room's listeners list; false when clientID or roomState is missing. */ const isListener = (clientID != null && roomState?.listeners.includes(clientID)) ?? false; const [streamingStatus, setStreamingStatus] = useState('stopped'); /** Set from the 'configure_stream' acknowledgement (true on status 'ok', false otherwise); onaudioprocess sends nothing while it is false. */ const isStreamConfiguredRef = useRef(false); const [outputMode, setOutputMode] = useState('s2s&t'); const [inputSource, setInputSource] = useState('userMedia'); /** null means "not chosen by the user"; startStreaming then falls back to AUDIO_STREAM_DEFAULTS for the selected input source. */ const [enableNoiseSuppression, setEnableNoiseSuppression] = useState< boolean | null >(null); const [enableEchoCancellation, setEnableEchoCancellation] = useState< boolean | null >(null); // Dynamic Params: const [targetLang, setTargetLang] = useState(null); const [serverDebugFlag, setServerDebugFlag] = useState( debugParam ?? 
false, ); const [receivedData, setReceivedData] = useState>([]); const [ translationSentencesAnimatedIndex, setTranslationSentencesAnimatedIndex, ] = useState(0); const lastTranslationResultRef = useRef(null); const [inputStream, setInputStream] = useState(null); const [inputStreamSource, setInputStreamSource] = useState(null); const audioContext = useStable(() => new AudioContext()); const [scriptNodeProcessor, setScriptNodeProcessor] = useState(null); const [muted, setMuted] = useState(false); // The onaudioprocess script needs an up-to-date reference to the muted state, so // we use a ref here and keep it in sync via useEffect const mutedRef = useRef(muted); useEffect(() => { mutedRef.current = muted; }, [muted]); /** Unscaled slider value; the value passed to bufferedSpeechPlayer.setGain is the result of getGainScaledValue. */ const [gain, setGain] = useState(1); const isScrolledToBottomRef = useRef(isScrolledToDocumentBottom()); // Some config options must be set when starting streaming and cannot be changed dynamically. // This controls whether they are disabled or not const streamFixedConfigOptionsDisabled = streamingStatus !== 'stopped' || roomID == null; const bufferedSpeechPlayer = useStable(() => { const player = createBufferedSpeechPlayer({ onStarted: () => { console.debug('📢 PLAYBACK STARTED 📢'); }, onEnded: () => { console.debug('🛑 PLAYBACK ENDED 🛑'); }, }); // Start the player now so it eagerly plays audio when it arrives player.start(); return player; }); const translationSentencesBase: TranslationSentences = getTranslationSentencesFromReceivedData(receivedData); const translationSentencesBaseTotalLength = getTotalSentencesLength( translationSentencesBase, ); const translationSentences: TranslationSentences = animateTextDisplay ? 
sliceTranslationSentencesUpToIndex( translationSentencesBase, translationSentencesAnimatedIndex, ) : translationSentencesBase; // We want the blinking cursor to show before any text has arrived, so let's add an empty string so that the cursor shows up const translationSentencesWithEmptyStartingString = streamingStatus === 'running' && translationSentences.length === 0 ? [''] : translationSentences; /****************************************** * Event Handlers ******************************************/ /** Stores the new agent; when its name differs from the previous agent's name, targetLang is reset to the new agent's first target language (or null). NOTE(review): setTargetLang is called from inside the setAgent updater callback; React expects updater functions to be side-effect free, so confirm this is intentional. */ const setAgentAndUpdateParams = useCallback( (newAgent: AgentCapabilities | null) => { setAgent((prevAgent) => { if (prevAgent?.name !== newAgent?.name) { setTargetLang(newAgent?.targetLangs[0] ?? null); } return newAgent; }); }, [], ); /** Emits 'set_dynamic_config' with the given partial config; the returned promise resolves when the acknowledgement status is 'ok' and rejects otherwise (or immediately when the socket is null). NOTE(review): the non-ok branch rejects with no Error value, so callers get `undefined` as the rejection reason. */ const onSetDynamicConfig = useCallback( async (partialConfig: PartialDynamicConfig) => { return new Promise((resolve, reject) => { if (socket == null) { reject(new Error('[onSetDynamicConfig] socket is null ')); return; } socket.emit( 'set_dynamic_config', partialConfig, (result: BaseResponse) => { console.log('[emit result: set_dynamic_config]', result); if (result.status === 'ok') { resolve(); } else { reject(); } }, ); }); }, [socket], ); /** Emits 'configure_stream' built from the current agent name, debug flag, output mode and the given sample rate; records the outcome in isStreamConfiguredRef and resolves on status 'ok', rejecting with an Error otherwise or when the socket or agent name is null. */ const configureStreamAsync = ({sampleRate}: {sampleRate: number}) => { return new Promise((resolve, reject) => { if (socket == null) { reject(new Error('[configureStreamAsync] socket is null ')); return; } const modelName = agent?.name ?? 
null; if (modelName == null) { reject(new Error('[configureStreamAsync] modelName is null ')); return; } const config: StartStreamEventConfig = { event: 'config', rate: sampleRate, model_name: modelName, debug: serverDebugFlag, // synchronous processing isn't implemented on the v2 pubsub server, so hardcode this to true async_processing: true, buffer_limit: BUFFER_LIMIT, model_type: outputMode, }; console.log('[configureStreamAsync] sending config', config); socket.emit('configure_stream', config, (statusObject) => { if (statusObject.status === 'ok') { isStreamConfiguredRef.current = true; console.debug( '[configureStreamAsync] stream configured!', statusObject, ); resolve(); } else { isStreamConfiguredRef.current = false; reject( new Error( `[configureStreamAsync] configure_stream returned status: ${statusObject.status}`, ), ); return; } }); }); }; /** Starts a streaming session. Returns early with a warning unless status is 'stopped'; otherwise sets status to 'starting', resumes the AudioContext if suspended, requests the input stream for the selected inputSource, wires source -> ScriptProcessor -> destination, sends the dynamic config followed by the stream config, and sets status to 'running'. Status goes back to 'stopped' when the media request or the configuration step fails. */ const startStreaming = async () => { if (streamingStatus !== 'stopped') { console.warn( `Attempting to start stream when status is ${streamingStatus}`, ); return; } setStreamingStatus('starting'); if (audioContext.state === 'suspended') { console.warn('audioContext was suspended! resuming...'); await audioContext.resume(); } let stream: MediaStream | null = null; try { if (inputSource === 'userMedia') { stream = await requestUserMediaAudioStream({ noiseSuppression: enableNoiseSuppression ?? AUDIO_STREAM_DEFAULTS['userMedia'].noiseSuppression, echoCancellation: enableEchoCancellation ?? AUDIO_STREAM_DEFAULTS['userMedia'].echoCancellation, }); } else if (inputSource === 'displayMedia') { stream = await requestDisplayMediaAudioStream({ noiseSuppression: enableNoiseSuppression ?? AUDIO_STREAM_DEFAULTS['displayMedia'].noiseSuppression, echoCancellation: enableEchoCancellation ?? 
AUDIO_STREAM_DEFAULTS['displayMedia'].echoCancellation, }); } else { throw new Error(`Unsupported input source requested: ${inputSource}`); } setInputStream(stream); } catch (e) { console.error('[startStreaming] media stream request failed:', e); setStreamingStatus('stopped'); return; } /* NOTE(review): this call and the node setup that follows are outside any try/catch; if one of them throws, streamingStatus is left at 'starting'. */ const mediaStreamSource = audioContext.createMediaStreamSource(stream); setInputStreamSource(mediaStreamSource); /** * NOTE: This currently uses a deprecated way of processing the audio (createScriptProcessor), but * which is easy and convenient for our purposes. * * Documentation for the deprecated way of doing it is here: https://developer.mozilla.org/en-US/docs/Web/API/BaseAudioContext/createScriptProcessor * * In an ideal world this would be migrated to something like this SO answer: https://stackoverflow.com/a/65448287 */ const scriptProcessor = audioContext.createScriptProcessor(16384, 1, 1); setScriptNodeProcessor(scriptProcessor); scriptProcessor.onaudioprocess = (event) => { if (isStreamConfiguredRef.current === false) { console.debug('[onaudioprocess] stream is not configured yet!'); return; } if (socket == null) { console.warn('[onaudioprocess] socket is null in onaudioprocess'); return; } if (mutedRef.current) { // We still want to send audio to the server when we're muted to ensure we // get any remaining audio back from the server, so let's pass an array length 1 with a value of 0 const mostlyEmptyInt16Array = new Int16Array(1); socket.emit('incoming_audio', mostlyEmptyInt16Array); } else { const float32Audio = event.inputBuffer.getChannelData(0); const pcm16Audio = float32To16BitPCM(float32Audio); socket.emit('incoming_audio', pcm16Audio); } debug()?.sentAudio(event); }; mediaStreamSource.connect(scriptProcessor); scriptProcessor.connect(audioContext.destination); bufferedSpeechPlayer.start(); /* NOTE(review): when the try block below fails, status returns to 'stopped' but the nodes connected above stay connected and the input tracks are not stopped; stopStreaming returns early when status is 'stopped', so confirm whether cleanup is needed on this path. */ try { if (targetLang == null) { throw new Error('[startStreaming] targetLang cannot be nullish'); } // When we are starting the stream we want to pass all the dynamic config 
values // available before actually configuring and starting the stream const fullDynamicConfig: DynamicConfig = { targetLanguage: targetLang, }; await onSetDynamicConfig(fullDynamicConfig); // NOTE: this needs to be the *audioContext* sample rate, not the sample rate of the input stream. Not entirely sure why. await configureStreamAsync({ sampleRate: audioContext.sampleRate, }); } catch (e) { console.error('configureStreamAsync failed', e); setStreamingStatus('stopped'); return; } setStreamingStatus('running'); }; /** Stops a streaming session. Returns early with a warning when status is already 'stopped'; otherwise stops playback, disconnects the audio nodes and stops the input tracks (only when both nodes exist), emits 'stop_stream' when a socket is available, and sets status to 'stopped'. NOTE(review): isStreamConfiguredRef is not reset here, so it keeps its value from this session until the next 'configure_stream' acknowledgement; confirm whether audio sent during the next 'starting' phase is intended. */ const stopStreaming = useCallback(async () => { if (streamingStatus === 'stopped') { console.warn( `Attempting to stop stream when status is ${streamingStatus}`, ); return; } // Stop the speech playback right away bufferedSpeechPlayer.stop(); if (inputStreamSource == null || scriptNodeProcessor == null) { console.error( 'inputStreamSource || scriptNodeProcessor is null in stopStreaming', ); } else { inputStreamSource.disconnect(scriptNodeProcessor); scriptNodeProcessor.disconnect(audioContext.destination); // Release the mic input so we stop showing the red recording icon in the browser inputStream?.getTracks().forEach((track) => track.stop()); } if (socket == null) { console.warn('Unable to emit stop_stream because socket is null'); } else { socket.emit('stop_stream', (result) => { console.debug('[emit result: stop_stream]', result); }); } setStreamingStatus('stopped'); }, [ audioContext.destination, bufferedSpeechPlayer, inputStream, inputStreamSource, scriptNodeProcessor, socket, streamingStatus, ]); /** Emits 'clear_transcript_for_all' when a socket is available; does nothing otherwise. */ const onClearTranscriptForAll = useCallback(() => { if (socket != null) { socket.emit('clear_transcript_for_all'); } }, [socket]); /****************************************** * Effects ******************************************/ /** Mirrors 'room_state_update' events into roomState. */ useEffect(() => { if (socket == null) { return; } const onRoomStateUpdate = (roomState: RoomState) => { setRoomState(roomState); }; socket.on('room_state_update', onRoomStateUpdate); return () => { socket.off('room_state_update', 
onRoomStateUpdate); }; }, [socket]); /** Appends each 'translation_text' event to receivedData and forwards each 'translation_speech' payload to the buffered speech player. */ useEffect(() => { if (socket != null) { const onTranslationText = (data: ServerTextData) => { setReceivedData((prev) => [...prev, data]); debug()?.receivedText(data.payload); }; const onTranslationSpeech = (data: ServerSpeechData) => { bufferedSpeechPlayer.addAudioToBuffer(data.payload, data.sample_rate); }; socket.on('translation_text', onTranslationText); socket.on('translation_speech', onTranslationSpeech); return () => { socket.off('translation_text', onTranslationText); socket.off('translation_speech', onTranslationSpeech); }; } }, [bufferedSpeechPlayer, socket]); /** Stores 'server_state_update' payloads, stops streaming when another client holds an active server lock while this client is running, and selects the first advertised agent when none is selected yet. NOTE(review): the listener is removed and re-added whenever agent, streamingStatus or stopStreaming change, per the dependency list. */ useEffect(() => { if (socket != null) { const onServerStateUpdate = (newServerState: ServerState) => { setServerState(newServerState); // If a client creates a server lock, we want to stop streaming if we're not them if ( newServerState.serverLock?.isActive === true && newServerState.serverLock?.clientID !== clientID && streamingStatus === 'running' ) { stopStreaming(); } const firstAgentNullable = newServerState.agentsCapabilities[0]; if (agent == null && firstAgentNullable != null) { setAgentAndUpdateParams(firstAgentNullable); } }; socket.on('server_state_update', onServerStateUpdate); return () => { socket.off('server_state_update', onServerStateUpdate); }; } }, [ agent, clientID, setAgentAndUpdateParams, socket, stopStreaming, streamingStatus, ]); /** Prepends each 'server_exception' (with a locale-formatted client-side time string derived from timeEpochMs) to serverExceptions, capped at MAX_SERVER_EXCEPTIONS_TRACKED, and logs it. */ useEffect(() => { if (socket != null) { const onServerException = ( exceptionDataWithoutClientTime: ServerExceptionData, ) => { const exceptionData = { ...exceptionDataWithoutClientTime, timeStringClient: new Date( exceptionDataWithoutClientTime['timeEpochMs'], ).toLocaleString(), }; setServerExceptions((prev) => [exceptionData, ...prev].slice(0, MAX_SERVER_EXCEPTIONS_TRACKED), ); console.error( `[server_exception] The server encountered an exception: ${exceptionData['message']}`, exceptionData, ); }; socket.on('server_exception', onServerException); return () => { socket.off('server_exception', onServerException); 
}; } }, [socket]); /** On 'clear_transcript', empties receivedData and resets the typing-animation index to 0. */ useEffect(() => { if (socket != null) { const onClearTranscript = () => { setReceivedData([]); setTranslationSentencesAnimatedIndex(0); }; socket.on('clear_transcript', onClearTranscript); return () => { socket.off('clear_transcript', onClearTranscript); }; } }, [socket]); /** Tracks, on every document scroll event, whether the page is within SCROLLED_TO_BOTTOM_THRESHOLD_PX of the bottom. */ useEffect(() => { const onScroll = () => { if (isScrolledToDocumentBottom(SCROLLED_TO_BOTTOM_THRESHOLD_PX)) { isScrolledToBottomRef.current = true; return; } isScrolledToBottomRef.current = false; return; }; document.addEventListener('scroll', onScroll); return () => { document.removeEventListener('scroll', onScroll); }; }, []); useLayoutEffect(() => { if ( lastTranslationResultRef.current != null && isScrolledToBottomRef.current ) { // Scroll the div to the most recent entry lastTranslationResultRef.current.scrollIntoView(); } // Run the effect every time data is received, so that // we scroll to the bottom even if we're just adding text to // a pre-existing chunk }, [receivedData]); /** Typing animation: while enabled and the animated index is below the total sentence length, schedules a one-step increment after TYPING_ANIMATION_DELAY_MS and clears the timer on cleanup. */ useEffect(() => { if (!animateTextDisplay) { return; } if ( translationSentencesAnimatedIndex < translationSentencesBaseTotalLength ) { const timeout = setTimeout(() => { setTranslationSentencesAnimatedIndex((prev) => prev + 1); debug()?.startRenderText(); }, TYPING_ANIMATION_DELAY_MS); return () => clearTimeout(timeout); } else { debug()?.endRenderText(); } }, [ animateTextDisplay, translationSentencesAnimatedIndex, translationSentencesBaseTotalLength, ]); /****************************************** * Sub-components ******************************************/ const volumeSliderNode = ( `${(value * 100).toFixed(0)}%`} valueLabelDisplay="auto" value={gain} onChange={(_event: Event, newValue: number | number[]) => { if (typeof newValue === 'number') { const scaledGain = getGainScaledValue(newValue); // We want the actual gain node to use the scaled value bufferedSpeechPlayer.setGain(scaledGain); // But we want react state to keep track of the non-scaled value setGain(newValue); } else { 
console.error( `[volume slider] Unexpected non-number value: ${newValue}`, ); } }} /> ); const xrDialogComponent = ( { setAnimateTextDisplay(urlParams.animateTextDisplay); }} onARVisible={() => setAnimateTextDisplay(false)} /> ); return (
Seamless Translation Logo
Seamless Translation
Welcome! This space is locked, please duplicate the space here. Unset the environment variable `LOCK_SERVER_COMPLETELY`.
In your duplicated space, join a room as speaker or listener (or both), and share the room code to invite listeners.
Check out the seamless_communication README for more information.
SeamlessStreaming model is a research model and is not released for production deployment. The streaming quality is closely related to proper VAD segmentation. It works best if you pause every couple of sentences, or you may wish adjust the VAD threshold in the model config. The real-time performance will degrade if you try streaming multiple speakers at the same time.
{ // If the user has switched from speaker to listener we need to tell the // player to play eagerly, since currently the listener doesn't have any stop/start controls bufferedSpeechPlayer.start(); }} /> {isListener && !isSpeaker && ( {volumeSliderNode} )} {isSpeaker && ( <> Model Model Output Target Language setOutputMode( e.target.value as SupportedOutputMode, ) } name="output-modes-radio-buttons-group"> { // TODO: Use supported modalities from agentCapabilities SUPPORTED_OUTPUT_MODES.map(({value, label}) => ( } label={label} /> )) } {isListener && ( {volumeSliderNode} )} Input Source ) => setInputSource( e.target.value as SupportedInputSource, ) } name="input-source-radio-buttons-group"> {SUPPORTED_INPUT_SOURCES.map(({label, value}) => ( } label={label} /> ))} Options , ) => setEnableNoiseSuppression(event.target.checked) } /> } label="Noise Suppression" /> , ) => setEnableEchoCancellation(event.target.checked) } /> } label="Echo Cancellation (not recommended)" /> , ) => setServerDebugFlag(event.target.checked)} /> } label="Enable Server Debugging" /> {isSpeaker && isListener && inputSource === 'userMedia' && !enableEchoCancellation && gain !== 0 && (
}> Headphones required to prevent feedback.
)} {isSpeaker && enableEchoCancellation && (
We don't recommend using echo cancellation as it may distort the input audio. If possible, use headphones and disable echo cancellation instead.
)} {streamingStatus === 'stopped' ? ( ) : ( )} {roomID == null ? null : ( {xrDialogComponent} )} {serverExceptions.length > 0 && (
{`The server encountered an exception. See the browser console for details. You may need to refresh the page to continue using the app.`}
)} {serverState != null && serverState.totalActiveTranscoders >= TOTAL_ACTIVE_TRANSCODER_WARNING_THRESHOLD && (
{`The server currently has ${serverState?.totalActiveTranscoders} active streaming sessions. Performance may be degraded.`}
)} {serverState?.serverLock != null && serverState.serverLock.clientID !== clientID && (
{`The server is currently locked. Priority will be given to that client when they are streaming, and your streaming session may be halted abruptly.`}
)} )}
{isListener && !isSpeaker && ( {xrDialogComponent} )}
{debugParam && roomID != null && }
Transcript {isSpeaker && ( )}
{translationSentencesWithEmptyStartingString.map( (sentence, index, arr) => { const isLast = index === arr.length - 1; const maybeRef = isLast ? {ref: lastTranslationResultRef} : {}; return (
{sentence} {animateTextDisplay && isLast && ( 0 }> {'|'} )}
); }, )}
); }