Update README.md (#2)
Opened by reach-vb

Files changed:
- Dockerfile +5 -21
- README.md +6 -20
- seamless_server/app_pubsub.py +3 -35
- seamless_server/models/Seamless/vad_s2st_sc_24khz_main.yaml +0 -25
- seamless_server/models/SeamlessStreaming/{vad_s2st_sc_main.yaml → vad_s2st_sc_24khz_main.yaml} +0 -0
- seamless_server/requirements.txt +5 -4
- seamless_server/run_docker.sh +0 -5
- seamless_server/src/simuleval_agent_directory.py +8 -29
- seamless_server/src/simuleval_transcoder.py +2 -7
- seamless_server/whl/seamless_communication-1.0.0-py3-none-any.whl +2 -2
- streaming-react-app/package.json +3 -2
- streaming-react-app/src/StreamingInterface.tsx +34 -88
- streaming-react-app/src/languageLookup.ts +103 -105
- streaming-react-app/src/react-xr/TextBlocks.tsx +119 -191
- streaming-react-app/src/react-xr/XRConfig.tsx +69 -9
- streaming-react-app/src/react-xr/XRDialog.tsx +16 -1
- streaming-react-app/src/types/StreamingTypes.ts +3 -5
- streaming-react-app/vite.config.ts +5 -0
- streaming-react-app/yarn.lock +5 -0
Dockerfile
CHANGED
````diff
@@ -45,21 +45,10 @@ RUN apt-get update && \
     # gradio dependencies \
     ffmpeg \
     # fairseq2 dependencies \
-    libjpeg8-dev \
-    libpng-dev \
     libsndfile-dev && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*

-USER root
-RUN ln -s /usr/lib/x86_64-linux-gnu/libsox.so.3 /usr/lib/x86_64-linux-gnu/libsox.so
-# install older versions libjpeg62-turbo and libpng15
-RUN wget http://ftp.us.debian.org/debian/pool/main/libj/libjpeg-turbo/libjpeg62-turbo_2.1.5-2_amd64.deb && \
-    dpkg -i libjpeg62-turbo_2.1.5-2_amd64.deb && \
-    rm libjpeg62-turbo_2.1.5-2_amd64.deb
-RUN wget https://master.dl.sourceforge.net/project/libpng/libpng15/1.5.30/libpng-1.5.30.tar.gz && \
-    tar -xvf libpng-1.5.30.tar.gz && cd libpng-1.5.30 && ./configure && make && make install && cd .. && rm -rf libpng-1.5.30.tar.gz libpng-1.5.30
-
 RUN useradd -m -u 1000 user
 USER user
 ENV HOME=/home/user \
@@ -76,18 +65,13 @@ RUN pyenv install $PYTHON_VERSION && \

 COPY --chown=user:user ./seamless_server ./seamless_server
 # change dir since pip needs to seed whl folder
-RUN cd seamless_server && \
-    pip install fairseq2 --pre --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/nightly/pt2.1.1/cu118 && \
-    pip install --no-cache-dir --upgrade -r requirements.txt
+RUN cd seamless_server && pip install --no-cache-dir --upgrade -r requirements.txt
 COPY --from=frontend /app/dist ./streaming-react-app/dist

 WORKDIR $HOME/app/seamless_server
-
-…
-    huggingface-cli download meta-private/SeamlessExpressive pretssel_melhifigan_wm-final.pt --local-dir ./models/Seamless/ || echo "HF_TOKEN error" && \
-    ln -s $(readlink -f models/Seamless/pretssel_melhifigan_wm-final.pt) models/Seamless/pretssel_melhifigan_wm.pt || true;
-
+USER root
+RUN ln -s /usr/lib/x86_64-linux-gnu/libsox.so.3 /usr/lib/x86_64-linux-gnu/libsox.so
 USER user
-
-
+CMD [ "uvicorn", "app_pubsub:app", "--host", "0.0.0.0", "--port", "7860" ]
+
````
README.md
CHANGED
````diff
@@ -5,9 +5,7 @@ colorFrom: blue
 colorTo: yellow
 sdk: docker
 pinned: false
-suggested_hardware: t4-…
-models:
-  - facebook/seamless-streaming
+suggested_hardware: t4-medium
 ---

 # Seamless Streaming demo
@@ -17,21 +15,12 @@ You can simply duplicate the space to run it.
 ## Running locally
 ### Install backend seamless_server dependencies

-
-> Please note: we *do not* recommend running the model on CPU. CPU inference will be slow and introduce noticable delays in the simultaneous translation.
+`cd seamless-experiences/seamless_vc/seamless_server`

-
-> The example below is for PyTorch stable (2.1.1) and variant cu118.
-> Check [here](https://pytorch.org/get-started/locally/) to find the torch/torchaudio command for your variant.
-> Check [here](https://github.com/facebookresearch/fairseq2#variants) to find the fairseq2 command for your variant.
-
-If running for the first time, create conda environment and install the desired torch version. Then install the rest of the requirements:
+If running for the first time, create conda environment:
 ```
-
-conda create --yes --name smlss_server python=3.8 libsndfile==1.0.31
+conda create --name smlss_server python=3.8
 conda activate smlss_server
-conda install --yes pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
-pip install fairseq2 --pre --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/nightly/pt2.1.1/cu118
 pip install -r requirements.txt
 ```
@@ -39,9 +28,8 @@ pip install -r requirements.txt
 ```
 conda install -c conda-forge nodejs
 cd streaming-react-app
 npm install
-
-yarn build # this will create the dist/ folder
+npm run build # this will create the dist/ folder
 ```
@@ -51,14 +39,12 @@ The server can be run locally with uvicorn below.
 Run the server in dev mode:

 ```
-cd seamless_server
 uvicorn app_pubsub:app --reload --host localhost
 ```

 Run the server in prod mode:

 ```
-cd seamless_server
 uvicorn app_pubsub:app --host 0.0.0.0
 ```
````
seamless_server/app_pubsub.py
CHANGED
````diff
@@ -123,26 +123,8 @@ class ServerLock(TypedDict):
     client_id: str
     member_object: Member

-if os.environ.get("LOCK_SERVER_COMPLETELY", "0") == "1":
-    logger.info("LOCK_SERVER_COMPLETELY is set. Server will be locked on startup.")
-if MAX_SPEAKERS is not None and int(MAX_SPEAKERS):
-    logger.info(f"MAX_SPEAKERS is set to: {MAX_SPEAKERS}")
-dummy_server_lock_member_object = Member(
-    client_id="seamless_user", session_id="dummy", name="Seamless User"
-)
-# Normally this would be an actual transcoder, but it's fine putting True here since currently we only check for the presence of the transcoder
-dummy_server_lock_member_object.transcoder = True
-server_lock: Optional[ServerLock] = (
-    {
-        "name": "Seamless User",
-        "client_id": "seamless_user",
-        "member_object": dummy_server_lock_member_object,
-    }
-    if os.environ.get("LOCK_SERVER_COMPLETELY", "0") == "1"
-    else None
-)
+server_lock: Optional[ServerLock] = None

 server_id = str(uuid4())
@@ -519,8 +501,6 @@ async def join_room(sid, client_id, room_id_from_client, config_dict):
         server_lock is not None
         and config_dict.get("lockServerName")
         == ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME
-        # If we are locking the server completely we don't want someone to be able to unlock it
-        and not os.environ.get("LOCK_SERVER_COMPLETELY", "0") == "1"
     ):
         server_lock = None
         logger.info(
@@ -528,7 +508,7 @@ async def join_room(sid, client_id, room_id_from_client, config_dict):
         )

     # If the server is not locked, set a lock. If it's already locked to this client, update the lock object
-    …
+    elif server_lock is None or server_lock.get("client_id") == client_id:
         # TODO: Add some sort of timeout as a backstop in case someone leaves the browser tab open after locking the server
         server_lock = {
             "name": config_dict.get("lockServerName"),
@@ -559,12 +539,6 @@ async def join_room(sid, client_id, room_id_from_client, config_dict):

     return {"roomsJoined": sio.rooms(sid), "roomID": room_id}

-def allow_speaker(room, client_id):
-    if MAX_SPEAKERS is not None and client_id in room.speakers:
-        room_statuses = {room_id: room.get_room_status_dict() for room_id, room in rooms.items()}
-        speakers = sum(room_status["activeTranscoders"] for room_status in room_statuses.values())
-        return speakers < int(MAX_SPEAKERS)
-    return True

 # TODO: Add code to prevent more than one speaker from connecting/streaming at a time
 @sio.event
@@ -585,12 +559,6 @@ async def configure_stream(sid, config):
         )
         return {"status": "error", "message": "member_or_room_is_none"}

-    if not allow_speaker(room, client_id):
-        logger.error(
-            f"In MAX_SPEAKERS mode we only allow one speaker at a time. Ignoring request to configure stream from client {client_id}."
-        )
-        return {"status": "error", "message": "max_speakers"}
-
     # If there is a server lock WITH an active transcoder session, prevent other users from configuring and starting a stream
     # If the server lock client does NOT have an active transcoder session allow this to proceed, knowing that
     # this stream will be interrupted if the server lock client starts streaming
````
seamless_server/models/Seamless/vad_s2st_sc_24khz_main.yaml
DELETED
````diff
@@ -1,25 +0,0 @@
-agent_class: seamless_communication.streaming.agents.seamless_s2st.SeamlessS2STDualVocoderVADAgent
-monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
-unity_model_name: seamless_streaming_unity
-sentencepiece_model: spm_256k_nllb100.model
-
-task: s2st
-tgt_lang: "eng"
-min_unit_chunk_size: 50
-decision_threshold: 0.7
-no_early_stop: True
-block_ngrams: True
-vocoder_name: vocoder_v2
-expr_vocoder_name: vocoder_pretssel
-gated_model_dir: .
-expr_vocoder_gain: 3.0
-upstream_idx: 1
-wav2vec_yaml: wav2vec.yaml
-min_starting_wait_w2vbert: 192
-
-config_yaml: cfg_fbank_u2t.yaml
-upstream_idx: 1
-detokenize_only: True
-device: cuda:0
-max_len_a: 0
-max_len_b: 1000
````
seamless_server/models/SeamlessStreaming/{vad_s2st_sc_main.yaml → vad_s2st_sc_24khz_main.yaml}
RENAMED
File without changes
seamless_server/requirements.txt
CHANGED
````diff
@@ -1,6 +1,7 @@
+--pre --extra-index-url https://fair.pkg.atmeta.com/fairseq2/pt2.1.1/cu118
+simuleval==1.1.3
 # seamless_communication
-
-# ./whl/seamless_communication-1.0.0-py3-none-any.whl
+./whl/seamless_communication-1.0.0-py3-none-any.whl
 Flask==2.1.3
 Flask_Sockets==0.2.1
 g2p_en==2.1.0
@@ -13,10 +14,10 @@ protobuf==4.24.2
 psola==0.0.1
 pydub==0.25.1
 silero==0.4.1
+# simuleval==1.1.1
 soundfile==0.11.0
 stable_ts==1.4.0
-
-# simuleval # to be installed by seamless_communication
+torch # specific torch version depends on fairseq2 installation
 Werkzeug==2.0.3
 whisper==1.1.10
 colorlog==6.7.0
````
seamless_server/run_docker.sh
DELETED
````diff
@@ -1,5 +0,0 @@
-# !/bin/bash
-if [ -f models/Seamless/pretssel_melhifigan_wm.pt ] ; then
-    export USE_EXPRESSIVE_MODEL=1;
-fi
-uvicorn app_pubsub:app --host 0.0.0.0 --port 7860
````
seamless_server/src/simuleval_agent_directory.py
CHANGED
````diff
@@ -1,7 +1,6 @@
 # Creates a directory in which to look up available agents

-import …
-from typing import List, Optional
+from typing import List
 from src.simuleval_transcoder import SimulevalTranscoder
 import json
 import logging
@@ -34,10 +33,8 @@ class AgentWithInfo:
         # Supported dynamic params are defined in StreamingTypes.ts
         dynamic_params: List[str] = [],
         description="",
-        has_expressive: Optional[bool] = None,
     ):
         self.agent = agent
-        self.has_expressive = has_expressive
         self.name = name
         self.description = description
         self.modalities = modalities
@@ -78,7 +75,6 @@
 class SimulevalAgentDirectory:
     # Available models. These are the directories where the models can be found, and also serve as an ID for the model.
     seamless_streaming_agent = "SeamlessStreaming"
-    seamless_agent = "Seamless"

     def __init__(self):
         self.agents = []
@@ -100,12 +96,7 @@
                 model_id,
             )
         except Exception as e:
-            from fairseq2.assets.error import AssetError
             logger.warning("Failed to build agent %s: %s" % (model_id, e))
-            if isinstance(e, AssetError):
-                logger.warning(
-                    "Please download gated assets and set `gated_model_dir` in the config"
-                )
             raise e

         return agent
@@ -119,32 +110,20 @@
             for agent_info in agent_infos:
                 self.add_agent(agent_info)
         else:
-            …
-                SimulevalAgentDirectory.seamless_agent,
-                config_name="vad_s2st_sc_24khz_main.yaml",
-            )
-            has_expressive = True
-            else:
-                logger.info("Building non-expressive model...")
-                s2s_agent = self.build_agent_if_available(
-                    SimulevalAgentDirectory.seamless_streaming_agent,
-                    config_name="vad_s2st_sc_main.yaml",
-                )
-                has_expressive = False
+            s2s_m4t_expr_agent = self.build_agent_if_available(
+                SimulevalAgentDirectory.seamless_streaming_agent,
+                config_name="vad_s2st_sc_24khz_main.yaml",
+            )

-            if …
+            if s2s_m4t_expr_agent:
                 self.add_agent(
                     AgentWithInfo(
-                        agent=…
+                        agent=s2s_m4t_expr_agent,
                         name=SimulevalAgentDirectory.seamless_streaming_agent,
                         modalities=["s2t", "s2s"],
                         target_langs=M4T_P0_LANGS,
                         dynamic_params=["expressive"],
                         description="multilingual expressive model that supports S2S and S2T",
-                        has_expressive=has_expressive,
                     )
                 )
@@ -158,7 +137,7 @@
     def get_agent(self, name):
         for agent in self.agents:
             if agent.name == name:
-                return agent
+                return agent.agent
         return None

     def get_agent_or_throw(self, name):
````
seamless_server/src/simuleval_transcoder.py
CHANGED
````diff
@@ -119,8 +119,7 @@ class OutputSegments:

 class SimulevalTranscoder:
     def __init__(self, agent, sample_rate, debug, buffer_limit):
         self.agent = agent
-        self.has_expressive = agent.has_expressive
         self.input_queue = asyncio.Queue()
         self.output_queue = asyncio.Queue()
         self.states = self.agent.build_states()
@@ -186,7 +185,7 @@
         logger.info(*args)

     @classmethod
-    def build_agent(cls, model_path, config_name):
+    def build_agent(cls, model_path, config_name="vad_s2st_main.yaml"):
         logger.info(f"Building simuleval agent: {model_path}, {config_name}")
         agent = build_system_from_dir(
             Path(__file__).resolve().parent.parent / f"models/{model_path}",
@@ -209,10 +208,6 @@
             tgt_lang=dynamic_config.get("targetLanguage"),
             config=dynamic_config,
         )
-        if dynamic_config.get("expressive") is True and self.has_expressive is False:
-            logger.warning(
-                "Passing 'expressive' but the agent does not support expressive output!"
-            )
         # # segment is array([0, 0, 0, ..., 0, 0, 0], dtype=int16)
         self.input_queue.put_nowait(segment)
````
seamless_server/whl/seamless_communication-1.0.0-py3-none-any.whl
CHANGED
````diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:1df10e0c85ee0ffbc9f2e1bf8896850a52c551383df0332a94d26d9d39770c85
+size 201552
````
streaming-react-app/package.json
CHANGED
````diff
@@ -1,11 +1,11 @@
 {
   "name": "streaming-react-app",
   "private": true,
-  "version": "0.0.…
+  "version": "0.0.13",
   "type": "module",
   "scripts": {
     "dev": "vite --host --strictPort",
-    "build": "vite build",
+    "build": "tsc && vite build",
     "preview": "vite preview",
     "clean:node-modules": "rm -rf node_modules/",
     "ts-check": "tsc --noEmit",
@@ -24,6 +24,7 @@
     "amazon-cognito-identity-js": "^6.3.6",
     "audiobuffer-to-wav": "^1.0.0",
     "aws-sdk": "^2.1472.0",
+    "iso-639-1": "^3.1.0",
     "js-cookie": "^3.0.5",
     "lodash": "4.17.21",
     "react": "^18.2.0",
````
streaming-react-app/src/StreamingInterface.tsx
CHANGED
````diff
@@ -57,12 +57,12 @@ import {CURSOR_BLINK_INTERVAL_MS} from './cursorBlinkInterval';
 import {getURLParams} from './URLParams';
 import debug from './debug';
 import DebugSection from './DebugSection';
-import …
-import Grid from '@mui/material/Grid';
+import {Grid} from '@mui/material';
 import {getLanguageFromThreeLetterCode} from './languageLookup';
-import HeadphonesIcon from '@mui/icons-material/Headphones';

-const AUDIO_STREAM_DEFAULTS …
+const AUDIO_STREAM_DEFAULTS: {
+  [key in SupportedInputSource]: BrowserAudioStreamConfig;
+} = {
   userMedia: {
     echoCancellation: false,
     noiseSuppression: true,
@@ -71,10 +71,13 @@ const AUDIO_STREAM_DEFAULTS = {
     echoCancellation: false,
     noiseSuppression: false,
   },
-}
+};

 async function requestUserMediaAudioStream(
-  config: BrowserAudioStreamConfig = …
+  config: BrowserAudioStreamConfig = {
+    echoCancellation: false,
+    noiseSuppression: true,
+  },
 ) {
   const stream = await navigator.mediaDevices.getUserMedia({
     audio: {...config, channelCount: 1},
@@ -87,7 +90,10 @@
 }

 async function requestDisplayMediaAudioStream(
-  config: BrowserAudioStreamConfig = …
+  config: BrowserAudioStreamConfig = {
+    echoCancellation: false,
+    noiseSuppression: false,
+  },
 ) {
   const stream = await navigator.mediaDevices.getDisplayMedia({
     audio: {...config, channelCount: 1},
@@ -152,7 +158,6 @@ export default function StreamingInterface() {
     useState<StreamingStatus>('stopped');

   const isStreamConfiguredRef = useRef<boolean>(false);
-  const [hasMaxSpeakers, setHasMaxSpeakers] = useState<boolean>(false);

   const [outputMode, setOutputMode] = useState<SupportedOutputMode>('s2s&t');
   const [inputSource, setInputSource] =
@@ -166,9 +171,6 @@

   // Dynamic Params:
   const [targetLang, setTargetLang] = useState<string | null>(null);
-  const [enableExpressive, setEnableExpressive] = useState<boolean | null>(
-    null,
-  );

   const [serverDebugFlag, setServerDebugFlag] = useState<boolean>(
     debugParam ?? false,
@@ -250,7 +252,6 @@
     setAgent((prevAgent) => {
       if (prevAgent?.name !== newAgent?.name) {
         setTargetLang(newAgent?.targetLangs[0] ?? null);
-        setEnableExpressive(null);
       }
       return newAgent;
     });
@@ -309,7 +310,6 @@
     console.log('[configureStreamAsync] sending config', config);

     socket.emit('configure_stream', config, (statusObject) => {
-      setHasMaxSpeakers(statusObject.message === 'max_speakers')
       if (statusObject.status === 'ok') {
         isStreamConfiguredRef.current = true;
         console.debug(
@@ -427,7 +427,6 @@
     // available before actually configuring and starting the stream
     const fullDynamicConfig: DynamicConfig = {
       targetLanguage: targetLang,
-      expressive: enableExpressive,
     };

     await onSetDynamicConfig(fullDynamicConfig);
@@ -757,23 +756,14 @@
           <div className="header-container-sra">
             <div>
               <Typography variant="body2" sx={{color: '#65676B'}}>
-                Welcome!
-
-                IP address may not work because it's running on different replicas.
-                Use headphones if you are both speaker and listener to prevent feedback.
-                <br/>
-                If max speakers reached, please duplicate the space <a target="_blank" rel="noopener noreferrer" href="https://huggingface.co/spaces/facebook/seamless-streaming?duplicate=true">here</a>.
-                In your duplicated space, join a room as speaker or listener (or both),
-                and share the room code to invite listeners.
-                <br/>
-                Check out the seamless_communication <a target="_blank" rel="noopener noreferrer" href="https://github.com/facebookresearch/seamless_communication/tree/main">README</a> for more information.
+                Welcome! Join a room as speaker or listener (or both), and share the
+                room code to invite listeners.
                 <br/>
                 SeamlessStreaming model is a research model and is not released
-                for production deployment.
-
-
-                in the model config.
-                if you try streaming multiple speakers at the same time.
+                for production deployment. The streaming quality is closely
+                related to proper VAD segmentation. It works best if you pause
+                every couple of sentences, or you may wish adjust the VAD threshold
+                in the model config.
               </Typography>
             </div>
           </div>
@@ -917,28 +907,6 @@
                       spacing={1}
                       alignItems="flex-start"
                       sx={{flexGrow: 1}}>
-                      {currentAgent?.dynamicParams?.includes(
-                        'expressive',
-                      ) && (
-                        <FormControlLabel
-                          control={
-                            <Switch
-                              checked={enableExpressive ?? false}
-                              onChange={(
-                                event: React.ChangeEvent<HTMLInputElement>,
-                              ) => {
-                                const newValue = event.target.checked;
-                                setEnableExpressive(newValue);
-                                onSetDynamicConfig({
-                                  expressive: newValue,
-                                });
-                              }}
-                            />
-                          }
-                          label="Expressive"
-                        />
-                      )}
-
                       {isListener && (
                         <Box
                           sx={{
@@ -955,6 +923,13 @@
               </Grid>
             </Stack>

+            <Typography variant="body2" sx={{color: '#65676B'}}>
+              Note: we don't recommend echo cancellation, as it may distort
+              the input audio (dropping words/sentences) if there is output
+              audio playing. Instead, you should use headphones if you'd like
+              to listen to the output audio while speaking.
+            </Typography>
+
             <Stack
               direction="row"
               spacing={2}
@@ -984,9 +959,8 @@
                 </RadioGroup>
               </FormControl>
             </Box>
-
-
-            <FormControl disabled={streamFixedConfigOptionsDisabled}>
+            <Box sx={{flex: 1}}>
+              <FormControl disabled={streamFixedConfigOptionsDisabled}>
                 <FormLabel>Options</FormLabel>
                 <FormControlLabel
                   control={
@@ -1003,9 +977,9 @@
                     }
                   />
                 }
-                label="Noise Suppression"
+                label="Noise Suppression (Browser)"
               />
-
+              <FormControlLabel
                 control={
                   <Checkbox
                     checked={
@@ -1020,7 +994,7 @@
                     }
                   />
                 }
-                label="Echo Cancellation (…
+                label="Echo Cancellation (Browser)"
               />
               <FormControlLabel
                 control={
@@ -1031,34 +1005,12 @@
                   ) => setServerDebugFlag(event.target.checked)}
                 />
               }
-              label="…
+              label="Server Debug Flag"
             />
           </FormControl>
         </Box>
       </Stack>

-      {isSpeaker &&
-        isListener &&
-        inputSource === 'userMedia' &&
-        !enableEchoCancellation &&
-        gain !== 0 && (
-          <div>
-            <Alert severity="warning" icon={<HeadphonesIcon />}>
-              Headphones required to prevent feedback.
-            </Alert>
-          </div>
-        )}
-
-      {isSpeaker && enableEchoCancellation && (
-        <div>
-          <Alert severity="warning">
-            We don't recommend using echo cancellation as it may
-            distort the input audio. If possible, use headphones and
-            disable echo cancellation instead.
-          </Alert>
-        </div>
-      )}
-
       <Stack direction="row" spacing={2}>
         {streamingStatus === 'stopped' ? (
           <Button
@@ -1120,13 +1072,7 @@
               </Alert>
             </div>
           )}
-
-            <div>
-              <Alert severity="error">
-                {`Maximum number of speakers reached. Please try again at a later time.`}
-              </Alert>
-            </div>
-          )}
+
           {serverState != null &&
             serverState.totalActiveTranscoders >=
               TOTAL_ACTIVE_TRANSCODER_WARNING_THRESHOLD && (
@@ -1141,7 +1087,7 @@
             serverState.serverLock.clientID !== clientID && (
               <div>
                 <Alert severity="warning">
-                  {`The server is currently locked. Priority will be given to that client when they are streaming, and your streaming session may be halted abruptly.`}
+                  {`The server is currently locked by "${serverState.serverLock.name}". Priority will be given to that client when they are streaming, and your streaming session may be halted abruptly.`}
                 </Alert>
               </div>
             )}
````
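One notable TypeScript change above is the mapped-type annotation on `AUDIO_STREAM_DEFAULTS`. Below is a minimal standalone sketch of that pattern, assuming `SupportedInputSource` is the two-member union the app uses (its actual definition lives in `StreamingTypes.ts`, which this PR also touches):

```typescript
// Sketch of the mapped-type pattern from the diff above. The union members
// are assumed from the object keys shown in the PR.
type SupportedInputSource = 'userMedia' | 'displayMedia';

type BrowserAudioStreamConfig = {
  echoCancellation: boolean;
  noiseSuppression: boolean;
};

// `[key in SupportedInputSource]` makes the compiler reject a missing or
// misspelled source, unlike an untyped object literal.
const AUDIO_STREAM_DEFAULTS: {
  [key in SupportedInputSource]: BrowserAudioStreamConfig;
} = {
  userMedia: {echoCancellation: false, noiseSuppression: true},
  displayMedia: {echoCancellation: false, noiseSuppression: false},
};

// Lookups are then fully type-safe:
function getDefaults(source: SupportedInputSource): BrowserAudioStreamConfig {
  return AUDIO_STREAM_DEFAULTS[source];
}
```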
streaming-react-app/src/languageLookup.ts
CHANGED
````diff
@@ -1,110 +1,108 @@
-const …
+const LANG3_FULL = {
+  eng: 'english',
+  arb: 'arabic',
+  asm: 'assamese',
+  bel: 'belarusian',
+  bul: 'bulgarian',
+  ben: 'bengali',
+  cat: 'catalan',
+  ces: 'czech',
+  cym: 'welsh',
+  dan: 'danish',
+  deu: 'german',
+  ell: 'greek',
+  spa: 'spanish',
+  est: 'estonian',
+  pes: 'persian',
+  fin: 'finnish',
+  fra: 'french',
+  hin: 'hindi',
+  hun: 'hungarian',
+  ind: 'indonesian',
+  ita: 'italian',
+  jpn: 'japanese',
+  kat: 'georgian',
+  lit: 'lithuanian',
+  lvs: 'latvian',
+  khk: 'mongolian',
+  mar: 'marathi',
+  mlt: 'maltese',
+  nld: 'dutch',
+  pan: 'punjabi',
+  pol: 'polish',
+  por: 'portuguese',
+  ron: 'romanian',
+  rus: 'russian',
+  slk: 'slovak',
+  slv: 'slovenian',
+  swe: 'swedish',
+  swh: 'swahili',
+  tam: 'tamil',
+  tha: 'thai',
+  tur: 'turkish',
+  ukr: 'ukrainian',
+  urd: 'urdu',
+  uzn: 'uzbek',
+  vie: 'vietnamese',
+  cmn: 'chinese',
+  afr: 'afrikaans',
+  isl: 'icelandic',
+  ltz: 'luxembourgish',
+  nob: 'norwegian',
+  glg: 'galician',
+  bos: 'bosnian',
+  hrv: 'croatian',
+  mkd: 'macedonian',
+  srp: 'serbian',
+  hye: 'armenian',
+  azj: 'azerbaijani',
+  kaz: 'kazakh',
+  kor: 'korean',
+  guj: 'gujarati',
+  kan: 'kannada',
+  npi: 'nepali',
+  snd: 'sindhi',
+  tel: 'telugu',
+  jav: 'javanese',
+  zlm: 'malay',
+  mal: 'malayalam',
+  tgl: 'tagalog',
+  mya: 'myanmar',
+  khm: 'khmer',
+  lao: 'lao',
+  heb: 'hebrew',
+  pbt: 'pashto',
+  tgk: 'tajik',
+  amh: 'amharic',
+  lin: 'lingala',
+  som: 'somali',
+  yor: 'yoruba',
+  sna: 'shona',
+  mri: 'maori',
+  hau: 'hausa',
+  oci: 'occitan',
+  bak: 'bashkir',
+  bre: 'breton',
+  yid: 'yiddish',
+  hat: 'haitian creole',
+  mlg: 'malagasy',
+  sin: 'sinhala',
+  sqi: 'albanian',
+  sun: 'sundanese',
+  eus: 'basque',
+  nno: 'nynorsk',
+  tat: 'tatar',
+  bod: 'tibetan',
+  fao: 'faroese',
+  haw: 'hawaiian',
+  lat: 'latin',
+  san: 'sanskrit',
+  tuk: 'turkmen'
 };

-export function getLanguageFromThreeLetterCode(
-  lang3Code: string,
-): string | null {
+export function getLanguageFromThreeLetterCode(code: string): string | null {
   try {
-    const name = …
+    const name = LANG3_FULL[code] ?? null;
     if (name == null) {
       return null;
     }
@@ -113,7 +111,7 @@
       .map((word: string) => word[0].toUpperCase() + word.slice(1));
     return capitalizedWords.join(' ');
   } catch (e) {
-    console.warn(`Unable to get language name for code ${…
+    console.warn(`Unable to get language name for code ${code}: ${e}`);
   }
   return null;
 }
````
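The new lookup replaces the external library call with a local map plus word-by-word capitalization. A hedged usage sketch (the keys come straight from the `LANG3_FULL` map shown above; the expected outputs assume the capitalization step splits on spaces, consistent with the `.map` shown in the diff):

```typescript
import {getLanguageFromThreeLetterCode} from './languageLookup';

// Single-word names are title-cased:
getLanguageFromThreeLetterCode('eng'); // expected: "English"
// Multi-word names capitalize each word:
getLanguageFromThreeLetterCode('hat'); // expected: "Haitian Creole"
// Unknown codes fall through to null rather than throwing:
getLanguageFromThreeLetterCode('xxx'); // expected: null
```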
streaming-react-app/src/react-xr/TextBlocks.tsx
CHANGED
````diff
@@ -1,8 +1,9 @@
-import {useEffect, useRef, useState} from 'react';
+import {JSX, useEffect, useRef, useState} from 'react';
 import robotoFontFamilyJson from '../assets/RobotoMono-Regular-msdf.json?url';
 import robotoFontTexture from '../assets/RobotoMono-Regular.png';
 import ThreeMeshUIText, {ThreeMeshUITextType} from './ThreeMeshUIText';
-import …
+import {getURLParams} from '../URLParams';
+import {CURSOR_BLINK_INTERVAL_MS} from '../cursorBlinkInterval';

 const NUM_LINES = 3;
@@ -21,80 +22,44 @@ const SCROLL_Y_DELTA = 0.001;
 const OFFSET = 0.01;
 const OFFSET_WIDTH = OFFSET * 3;

-
-// The tick interval
-const RENDER_INTERVAL = 300;
-
-const CURSOR_BLINK_INTERVAL_MS = 1000;
-
-type TextBlockProps = {
+type Props = {
   content: string;
   // The actual position or end position when animating
   y: number;
   // The start position when animating
   startY: number;
+  width: number;
+  height: number;
   textOpacity: number;
   backgroundOpacity: number;
-…
-};
-
-type TranscriptState = {
-  textBlocksProps: TextBlockProps[];
-  lastTranslationStringIndex: number;
-  lastTranslationLineStartIndex: number;
-  transcriptLines: string[];
-  lastRenderTime: number;
+  // Use this to keep track of sentence + line position for animation
+  index: string;
+  enableAnimation: boolean;
 };

 function TextBlock({
   content,
   y,
   startY,
+  width,
+  height,
   textOpacity,
   backgroundOpacity,
   index,
-…
-}: …
+  enableAnimation,
+}: Props) {
   const [scrollY, setScrollY] = useState<number>(y);
+
   // We are reusing text blocks so this keeps track of when we changed rows so we can restart animation
-  const lastIndex = useRef<…
+  const lastIndex = useRef<string>(index);
   useEffect(() => {
     if (index != lastIndex.current) {
       lastIndex.current = index;
-…
+      enableAnimation && setScrollY(startY);
     } else if (scrollY < y) {
       setScrollY((prev) => prev + SCROLL_Y_DELTA);
     }
-  }, […
-
-  const [cursorBlinkOn, setCursorBlinkOn] = useState(false);
-  useEffect(() => {
-    if (isBottomLine) {
-      const interval = setInterval(() => {
-        setCursorBlinkOn((prev) => !prev);
-      }, CURSOR_BLINK_INTERVAL_MS);
-
-      return () => clearInterval(interval);
-    } else {
-      setCursorBlinkOn(false);
-    }
-  }, [isBottomLine]);
-
-  const numChars = content.length;
-
-  if (cursorBlinkOn) {
-    content = content + '|';
-  }
-
-  // Accounting for potential cursor for block width (the +1)
-  const width =
-    (numChars + (isBottomLine ? 1.1 : 0) + (numChars < 10 ? 1 : 0)) *
-    CHAR_WIDTH;
-
-  const height = LINE_HEIGHT;
+  }, [enableAnimation, index, scrollY, setScrollY, startY, y]);

   // This is needed to update text content (doesn't work if we just update the content prop)
   const textRef = useRef<ThreeMeshUITextType>();
@@ -146,162 +111,125 @@
   );
 }

-…
+// Background behind the text so it covers any missing spaces
+function TranscriptionPanel() {
+  const panelHeight = LINE_HEIGHT * NUM_LINES + 2 * BLOCK_SPACING + 2 * OFFSET;
+  const xPosition = OFFSET_WIDTH;
+  return (
+    <block
+      args={[
+        {
+          backgroundOpacity: 1,
+          width:
+            MAX_WIDTH * ((CHARS_PER_LINE + 2) / CHARS_PER_LINE) +
+            2 * OFFSET_WIDTH,
+          height: panelHeight,
+          borderRadius: 0,
+        },
+      ]}
+      position={[
+        -OFFSET + xPosition,
+        Y_COORD_START + panelHeight / 2 - 2 * OFFSET,
+        Z_COORD,
+      ]}></block>
+  );
 }

 export default function TextBlocks({
-…
+  sentences,
+  blinkCursor,
 }: {
-…
+  sentences: string[][];
+  blinkCursor: boolean;
 }) {
-  const …
-    lastTranslationLineStartIndex: 0,
-    transcriptLines: [],
-    lastRenderTime: new Date().getTime(),
-  });
-
-  const transcriptState = transcriptStateRef.current;
-  const {textBlocksProps, lastTranslationStringIndex, lastRenderTime} =
-    transcriptState;
-
-  const [charsToRender, setCharsToRender] = useState<number>(0);
+  const showTranscriptionPanel =
+    getURLParams().ARTranscriptionType === 'lines_with_background';
+  const textBlocks: JSX.Element[] = [];

+  const [cursorBlinkOn, setCursorBlinkOn] = useState(false);
   useEffect(() => {
-    const …
-    );
-    setCharsToRender(charsToRender);
-    }, RENDER_INTERVAL);
-
-    return () => clearInterval(interval);
-  }, [lastRenderTime]);
-
-  const currentTime = new Date().getTime();
-  if (charsToRender < 1) {
-    return textBlocksProps.map((props, idx) => (
-      <TextBlock {...props} key={idx} />
-    ));
-  }
-
-  const nextTranslationStringIndex = Math.min(
-    lastTranslationStringIndex + charsToRender,
-    translationText.length,
-  );
-  const newString = translationText.substring(
-    lastTranslationStringIndex,
-    nextTranslationStringIndex,
-  );
-  if (nextTranslationStringIndex === lastTranslationStringIndex) {
-    transcriptState.lastRenderTime = currentTime;
-    return textBlocksProps.map((props, idx) => (
-      <TextBlock {...props} key={idx} />
-    ));
-  }
-
-  // Wait until more characters are accumulated if its just blankspace
-  if (/^\s*$/.test(newString)) {
-    transcriptState.lastRenderTime = currentTime;
-    return textBlocksProps.map((props, idx) => (
-      <TextBlock {...props} key={idx} />
-    ));
-  }
-
-  // Ideally we continue where we left off but this is complicated when we have mid-words. Recalculating for now
-  const runAll = true;
-  const newSentences = runAll
-    ? translationText.substring(0, nextTranslationStringIndex).split('\n')
-    : newString.split('\n');
-  const transcriptLines = runAll ? [''] : transcriptState.transcriptLines;
-  newSentences.forEach((newSentence, sentenceIdx) => {
-    const words = newSentence.split(/\s+/);
-    words.forEach((word) => {
-      const filteredWord = [...word]
-        .filter((c) => {
-          if (supportedCharSet().has(c)) {
-            return true;
-          }
-          console.error(
-            `Unsupported char ${c} - make sure this is supported in the font family msdf file`,
-          );
-          return false;
-        })
-        .join('');
-
-      const lastLineSoFar = transcriptLines[0];
-      const charCount = lastLineSoFar.length + filteredWord.length + 1;
-
-      if (charCount <= CHARS_PER_LINE) {
-        transcriptLines[0] = lastLineSoFar + ' ' + filteredWord;
-      } else {
-        transcriptLines.unshift(filteredWord);
-      }
-    });
-  });
-
-  transcriptState.transcriptLines = transcriptLines;
-  transcriptState.lastTranslationStringIndex = nextTranslationStringIndex;
+    if (blinkCursor) {
+      const interval = setInterval(() => {
+        setCursorBlinkOn((prev) => !prev);
+      }, CURSOR_BLINK_INTERVAL_MS);
+
+      return () => clearInterval(interval);
+    } else {
+      setCursorBlinkOn(false);
+    }
+  }, [blinkCursor]);

+  // Start from bottom and populate most recent sentences by line until we fill max lines.
   let currentY = Y_COORD_START;
-…
-        return;
-      }
-      const y = currentY + LINE_HEIGHT / 2;
-      const isBottomLine = newTextBlocksProps.length === 0;
-…
+  for (let i = sentences.length - 1; i >= 0; i--) {
+    const sentenceLines = sentences[i];
+    for (let j = sentenceLines.length - 1; j >= 0; j--) {
+      if (textBlocks.length == NUM_LINES) {
+        if (showTranscriptionPanel) {
+          textBlocks.push(<TranscriptionPanel key={textBlocks.length} />);
+        }
+        return textBlocks;
+      }
+
+      const isBottomSentence = i === sentences.length - 1;
+      const isBottomLine = isBottomSentence && textBlocks.length === 0;
+      const y = currentY + LINE_HEIGHT / 2;
+      let textBlockLine = sentenceLines[j];
+      const numChars = textBlockLine.length;
+
+      if (cursorBlinkOn && isBottomLine) {
+        textBlockLine = textBlockLine + '|';
+      }
+
+      // Accounting for potential cursor for block width (the +1)
+      const blockWidth =
+        (numChars + (isBottomLine ? 1.1 : 0) + (numChars < 10 ? 1 : 0)) *
+        CHAR_WIDTH;
+      const textOpacity = 1 - 0.1 * textBlocks.length;
+      textBlocks.push(
+        <TextBlock
+          key={textBlocks.length}
+          y={y}
+          startY={currentY}
+          index={`${sentences.length - i},${j}`}
+          textOpacity={textOpacity}
+          backgroundOpacity={0.98}
+          height={LINE_HEIGHT}
+          width={blockWidth}
+          // content={"BLOCK " + textBlocks.length + ": " + content}
+          content={textBlockLine}
+          enableAnimation={!isBottomLine}
+        />,
+      );
+
+      currentY = y + LINE_HEIGHT / 2;
+    }
+    currentY += showTranscriptionPanel ? BLOCK_SPACING / 3 : BLOCK_SPACING;
+  }

-  const numRemainingBlocks = …
+  const numRemainingBlocks = textBlocks.length - NUM_LINES;
   if (numRemainingBlocks > 0) {
-…
+    Array.from({length: numRemainingBlocks}).forEach(() => {
+      // Push in non display blocks because mesh UI crashes if elements are add / removed from screen.
+      textBlocks.push(
+        <TextBlock
+          key={textBlocks.length}
+          y={Y_COORD_START}
+          startY={0}
+          index="0,0"
+          textOpacity={0}
+          backgroundOpacity={0}
+          enableAnimation={false}
+          width={MAX_WIDTH}
+          height={LINE_HEIGHT}
+          content=""
+        />,
+      );
+    });
   }

-    ));
+  if (showTranscriptionPanel) {
+    textBlocks.push(<TranscriptionPanel key={textBlocks.length} />);
+  }
+  return textBlocks;
 }
````
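The rewritten component switches from a character-streaming buffer to a `sentences: string[][]` input (one array of wrapped lines per sentence) filled bottom-up. A minimal sketch, an assumed simplification of the loop shown in the diff, that isolates just the line-selection logic without the rendering:

```typescript
// Walk sentences and their wrapped lines from the most recent backwards,
// stopping once NUM_LINES lines are collected (as the new TextBlocks does).
const NUM_LINES = 3;

function visibleLines(sentences: string[][]): string[] {
  const lines: string[] = [];
  for (let i = sentences.length - 1; i >= 0; i--) {
    for (let j = sentences[i].length - 1; j >= 0; j--) {
      if (lines.length === NUM_LINES) {
        return lines;
      }
      lines.push(sentences[i][j]);
    }
  }
  return lines;
}

// The newest line ends up first, mirroring isBottomLine in the component:
visibleLines([['hello there'], ['line one', 'line two']]);
// -> ['line two', 'line one', 'hello there']
```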
streaming-react-app/src/react-xr/XRConfig.tsx
CHANGED
@@ -25,15 +25,29 @@ import {BLACK, WHITE} from './Colors';
 import robotoFontFamilyJson from '../assets/RobotoMono-Regular-msdf.json?url';
 import robotoFontTexture from '../assets/RobotoMono-Regular.png';
 import {getURLParams} from '../URLParams';
-import TextBlocks from './TextBlocks';
+import TextBlocks, {CHARS_PER_LINE} from './TextBlocks';
 import {BufferedSpeechPlayer} from '../createBufferedSpeechPlayer';
 import {CURSOR_BLINK_INTERVAL_MS} from '../cursorBlinkInterval';
-import supportedCharSet from './supportedCharSet';
 
 // Adds on react JSX for add-on libraries to react-three-fiber
 extend(ThreeMeshUI);
 extend({TextGeometry});
 
+async function fetchSupportedCharSet(): Promise<Set<string>> {
+  try {
+    const response = await fetch(robotoFontFamilyJson);
+    const fontFamily = await response.json();
+
+    return new Set(fontFamily.info.charset);
+  } catch (e) {
+    console.error('Failed to fetch supported XR charset', e);
+    return new Set();
+  }
+}
+
+let supportedCharSet = new Set();
+fetchSupportedCharSet().then((result) => (supportedCharSet = result));
+
 // This component wraps any children so it is positioned relative to the camera, rather than from the origin
 function CameraLinkedObject({children}) {
   const camera = useThree((state) => state.camera);
@@ -62,7 +76,10 @@ function ThreeMeshUIComponents({
         translationSentences={translationSentences}
       />
     ) : (
-      <TranscriptPanelBlocks
+      <TranscriptPanelBlocks
+        animateTextDisplay={animateTextDisplay}
+        translationSentences={translationSentences}
+      />
     )}
     {skipARIntro ? null : (
       <IntroPanel started={started} setStarted={setStarted} />
@@ -136,7 +153,7 @@ function TranscriptPanelSingleBlock({
       (wordChunks, currentWord) => {
         const filteredWord = [...currentWord]
           .filter((c) => {
-            if (supportedCharSet().has(c)) {
+            if (supportedCharSet.has(c)) {
               return true;
             }
             console.error(
@@ -206,14 +223,59 @@ function TranscriptPanelSingleBlock({
 // Splits up the lines into separate blocks to treat each one separately.
 // This allows changing of opacity, animating per line, changing height / width per line etc
 function TranscriptPanelBlocks({
+  animateTextDisplay,
   translationSentences,
 }: {
+  animateTextDisplay: boolean;
   translationSentences: TranslationSentences;
 }) {
+  const [didReceiveTranslationSentences, setDidReceiveTranslationSentences] =
+    // Currently causing issues with displaying dummy text, skip over
+    useState(false);
+
+  // Normally we don't setState in render, but here we need to for computed state, and this if statement assures it won't loop infinitely
+  if (!didReceiveTranslationSentences && translationSentences.length > 0) {
+    setDidReceiveTranslationSentences(true);
+  }
+
+  const initialPrompt = 'Listening...';
+  const transcriptSentences: string[] = didReceiveTranslationSentences
+    ? translationSentences
+    : [initialPrompt];
+
+  // The transcript is an array of sentences. For each sentence we break this down into an array of words per line.
+  // This is needed so we can "scroll" through without changing the order of words in the transcript
+  const sentenceLines = transcriptSentences.map((sentence) => {
+    const words = sentence.split(/\s+/);
+    // Here we break each sentence up with newlines so all words per line fit within the panel
+    return words.reduce(
+      (wordChunks, currentWord) => {
+        const filteredWord = [...currentWord]
+          .filter((c) => {
+            if (supportedCharSet.has(c)) {
+              return true;
+            }
+            console.error(
+              `Unsupported char ${c} - make sure this is supported in the font family msdf file`,
+            );
+            return false;
+          })
+          .join('');
+        const lastLineSoFar = wordChunks[wordChunks.length - 1];
+        const charCount = lastLineSoFar.length + filteredWord.length + 1;
+        if (charCount <= CHARS_PER_LINE) {
+          wordChunks[wordChunks.length - 1] =
+            lastLineSoFar + ' ' + filteredWord;
+        } else {
+          wordChunks.push(filteredWord);
+        }
+        return wordChunks;
+      },
+      [''],
+    );
+  });
   return (
-    <TextBlocks
-      translationText={'Listening...\n' + translationSentences.join('\n')}
-    />
+    <TextBlocks sentences={sentenceLines} blinkCursor={animateTextDisplay} />
   );
 }
@@ -299,8 +361,6 @@ export type XRConfigProps = {
   startStreaming: () => Promise<void>;
   stopStreaming: () => Promise<void>;
   debugParam: boolean | null;
-  onARVisible?: () => void;
-  onARHidden?: () => void;
 };
 
 export default function XRConfig(props: XRConfigProps) {
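A few notes on the XRConfig.tsx changes above. First, the new fetchSupportedCharSet() replaces the static ./supportedCharSet module by reading the charset straight out of the MSDF font JSON. One behavior worth knowing: supportedCharSet starts as an empty Set until the fetch resolves, so any text filtered in that window is dropped with a console error. A small sketch of the per-character check (filterToCharset is a hypothetical helper that mirrors the .filter logic in the diff):

    // Hypothetical helper, not in the diff: mirrors the per-character filter.
    function filterToCharset(word: string, charset: Set<string>): string {
      return [...word].filter((c) => charset.has(c)).join('');
    }

    // Example: against a latin-only charset, unsupported glyphs are dropped.
    const latinOnly = new Set([...'abcdefghijklmnopqrstuvwxyz']);
    console.log(filterToCharset('héllo', latinOnly)); // "hllo"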
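Second, TranscriptPanelBlocks uses React's "adjust state during render" pattern for didReceiveTranslationSentences: calling setState during render is acceptable here because the guard ensures it can fire at most once per mount. The same latch written as a generic hook, for illustration (useLatchedFlag is an invented name, not from the diff):

    import {useState} from 'react';

    // Once `trigger` has been true during any render, stay true forever.
    function useLatchedFlag(trigger: boolean): boolean {
      const [latched, setLatched] = useState(false);
      if (!latched && trigger) {
        setLatched(true); // fires once; React re-renders before committing
      }
      return latched;
    }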
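Third, the reduce inside TranscriptPanelBlocks is a greedy word-wrapper against CHARS_PER_LINE. Extracted as a standalone function for clarity (wrapWords is an invented name; CHARS_PER_LINE really comes from './TextBlocks' and 37 is only an assumed value; unlike the diff's reduce, this version also avoids the leading space on the first line):

    const CHARS_PER_LINE = 37; // assumed value, for illustration only

    function wrapWords(sentence: string): string[] {
      return sentence.split(/\s+/).reduce<string[]>(
        (lines, word) => {
          const last = lines[lines.length - 1];
          // +1 accounts for the space joining the word onto the current line.
          if (last.length + word.length + 1 <= CHARS_PER_LINE) {
            lines[lines.length - 1] = last === '' ? word : last + ' ' + word;
          } else {
            lines.push(word);
          }
          return lines;
        },
        [''],
      );
    }

    console.log(wrapWords('the quick brown fox jumps over the lazy dog'));
    // ["the quick brown fox jumps over the", "lazy dog"]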
streaming-react-app/src/react-xr/XRDialog.tsx
CHANGED
@@ -8,12 +8,27 @@ import {
   Typography,
 } from '@mui/material';
 import CloseIcon from '@mui/icons-material/Close';
-import {XRConfigProps} from './XRConfig';
 import {useEffect, useRef, useState} from 'react';
 import './XRDialog.css';
 import {getRenderer, init, updatetranslationText} from './XRRendering';
 import ARButton from './ARButton';
 import {getURLParams} from '../URLParams';
+import { BufferedSpeechPlayer } from '../createBufferedSpeechPlayer';
+import { TranslationSentences } from '../types/StreamingTypes';
+import { RoomState } from '../types/RoomState';
+
+type XRConfigProps = {
+  animateTextDisplay: boolean;
+  bufferedSpeechPlayer: BufferedSpeechPlayer;
+  translationSentences: TranslationSentences;
+  roomState: RoomState | null;
+  roomID: string | null;
+  startStreaming: () => Promise<void>;
+  stopStreaming: () => Promise<void>;
+  debugParam: boolean | null;
+  onARVisible?: () => void;
+  onARHidden?: () => void;
+};
 
 function XRContent(props: XRConfigProps) {
   const debugParam = getURLParams().debug;
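XRDialog.tsx now declares its own XRConfigProps (keeping the optional onARVisible / onARHidden callbacks that were dropped from XRConfig.tsx) instead of importing the type, which removes XRDialog's module dependency on XRConfig. If the two types had stayed identical, a type-only import would have achieved the same decoupling, since it is erased at compile time:

    // For comparison only, not what the diff does: a type-only import adds
    // no runtime dependency on './XRConfig'.
    import type {XRConfigProps} from './XRConfig';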
streaming-react-app/src/types/StreamingTypes.ts
CHANGED
@@ -55,9 +55,9 @@ export const SUPPORTED_INPUT_SOURCES: Array<{
   value: SupportedInputSource;
   label: string;
 }> = [
-
-
-];
+  { value: 'userMedia', label: 'Microphone' },
+  { value: 'displayMedia', label: 'Browser Tab' },
+];
 
 export type StartStreamEventConfig = {
   event: 'config';
@@ -70,7 +70,6 @@ export type StartStreamEventConfig = {
 };
 
 export interface BrowserAudioStreamConfig {
-  echoCancellation: boolean;
   noiseSuppression: boolean;
   echoCancellation: boolean;
 }
@@ -113,7 +112,6 @@ export type TranslationSentences = Array<string>;
 export type DynamicConfig = {
   // targetLanguage: a 3-letter string representing the desired output language.
   targetLanguage: string;
-  expressive: boolean | null;
 };
 
 export type PartialDynamicConfig = Partial<DynamicConfig>;
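The BrowserAudioStreamConfig change above also fixes a duplicated echoCancellation field in the interface. These two flags map directly onto standard getUserMedia audio constraints; a minimal sketch of a call site (openMicrophone is a hypothetical helper, not part of this diff):

    interface BrowserAudioStreamConfig {
      noiseSuppression: boolean;
      echoCancellation: boolean;
    }

    // Hypothetical call site showing how the config maps onto constraints.
    async function openMicrophone(
      config: BrowserAudioStreamConfig,
    ): Promise<MediaStream> {
      return navigator.mediaDevices.getUserMedia({
        audio: {
          noiseSuppression: config.noiseSuppression,
          echoCancellation: config.echoCancellation,
        },
      });
    }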
streaming-react-app/vite.config.ts
CHANGED
@@ -1,5 +1,10 @@
 import { defineConfig } from 'vite';
 import react from '@vitejs/plugin-react';
+// import {resolve} from 'path';
+
+// const rootDir = resolve(__dirname, 'src');
+// const assetsDir = resolve(rootDir, 'assets');
+// const typesDir = resolve(__dirname, 'types');
 
 // https://vitejs.dev/config/
 export default defineConfig(({ command }) => {
streaming-react-app/yarn.lock
CHANGED
@@ -1853,6 +1853,11 @@ isexe@^2.0.0:
   resolved "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz"
   integrity sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==
 
+iso-639-1@^3.1.0:
+  version "3.1.0"
+  resolved "https://registry.npmjs.org/iso-639-1/-/iso-639-1-3.1.0.tgz"
+  integrity sha512-rWcHp9dcNbxa5C8jA/cxFlWNFNwy5Vup0KcFvgA8sPQs9ZeJHj/Eq0Y8Yz2eL8XlWYpxw4iwh9FfTeVxyqdRMw==
+
 isomorphic-unfetch@^3.0.0:
   version "3.1.0"
   resolved "https://registry.npmjs.org/isomorphic-unfetch/-/isomorphic-unfetch-3.1.0.tgz"
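The new iso-639-1 lockfile entry matches the dependency added in streaming-react-app/package.json and presumably backs the languageLookup.ts changes. The library maps two-letter ISO 639-1 codes to language names, e.g.:

    import ISO6391 from 'iso-639-1';

    console.log(ISO6391.getName('es'));       // "Spanish"
    console.log(ISO6391.getNativeName('es')); // "Español"
    console.log(ISO6391.validate('xx'));      // false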