diff --git "a/big_vision_repo/big_vision/configs/proj/image_text/SigLIP_demo.ipynb" "b/big_vision_repo/big_vision/configs/proj/image_text/SigLIP_demo.ipynb" new file mode 100644--- /dev/null +++ "b/big_vision_repo/big_vision/configs/proj/image_text/SigLIP_demo.ipynb" @@ -0,0 +1,1023 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# General information\n", + "\n", + "Example colab for SigLIP models described in [the SigLIP paper](https://arxiv.org/abs/2303.15343).\n", + "\n", + "**These models are not official Google products and were trained and released for research purposes.**\n", + "\n", + "If you find our model(s) useful for your research, consider citing\n", + "\n", + "```\n", + "@article{zhai2023sigmoid,\n", + " title={Sigmoid loss for language image pre-training},\n", + " author={Zhai, Xiaohua and Mustafa, Basil and Kolesnikov, Alexander and Beyer, Lucas},\n", + " journal={International Conference on Computer Vision ({ICCV})},\n", + " year={2023}\n", + "}\n", + "```\n", + "\n", + "If you use our released models in your products, we will appreciate any direct feedback. We are reachable by xzhai@google.com, basilm@google.com, akolesnikov@google.com and lbeyer@google.com.\n", + "\n", + "\n", + "Only the models explicitly marked with `i18n` in the name are expected to perform reasonably well on non-english data." + ], + "metadata": { + "id": "wR53lePHuiP-" + } + }, + { + "cell_type": "code", + "source": [ + "#@markdown # Environment setup\n", + "#@markdown **IMPORTANT NOTE**: Modern jax (>0.4) does not support the Colab TPU\n", + "#@markdown anymore, so don't select TPU runtime here. 
CPU and GPU work and are both fast enough.\n", + "\n", + "# Install the right jax version for TPU/GPU/CPU\n", + "import os\n", + "if 'COLAB_TPU_ADDR' in os.environ:\n", + " raise \"TPU colab not supported.\"\n", + "elif 'NVIDIA_PRODUCT_NAME' in os.environ:\n", + " !nvidia-smi\n", + "import jax\n", + "jax.devices()\n", + "\n", + "\n", + "# Get latest version of big_vision codebase.\n", + "!git clone --quiet --branch=main --depth=1 https://github.com/google-research/big_vision\n", + "!cd big_vision && git pull --rebase --quiet\n", + "!pip -q install -r big_vision/big_vision/requirements.txt\n", + "# Gives us ~2x faster gsutil cp to get the model checkpoints.\n", + "!pip3 -q install --no-cache-dir -U crcmod\n", + "\n", + "%cd big_vision\n", + "\n", + "\n", + "import numpy as np\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "\n", + "%matplotlib inline\n", + "%config InlineBackend.figure_format = 'retina'\n", + "\n", + "import jax\n", + "import jax.numpy as jnp\n", + "import ml_collections\n", + "\n", + "from google.colab.output import _publish as publish" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kXSdSXVg2PAI", + "outputId": "ba908946-0cd3-4468-9034-cd108529986f", + "cellView": "form" + }, + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Thu Sep 28 09:08:47 2023 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\n", + "|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. 
|\n", + "|===============================+======================+======================|\n", + "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", + "| N/A 75C P8 14W / 70W | 0MiB / 15360MiB | 0% Default |\n", + "| | | N/A |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=============================================================================|\n", + "| No running processes found |\n", + "+-----------------------------------------------------------------------------+\n", + "fatal: destination path 'big_vision' already exists and is not an empty directory.\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "/content/big_vision\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Choose and load model, perform inference" + ], + "metadata": { + "id": "byHpmgAO6inM" + } + }, + { + "cell_type": "code", + "source": [ + "# Pick your hero: (WHEN CHANGING THIS, RERUN IMAGE/TEXT EMBEDDING CELLS)\n", + "# Give this cell 1-3mins.\n", + "\n", + "# VARIANT, RES = 'B/16', 224\n", + "# VARIANT, RES = 'B/16', 256\n", + "# VARIANT, RES = 'B/16', 384\n", + "# VARIANT, RES = 'B/16', 512\n", + "# VARIANT, RES = 'L/16', 256\n", + "VARIANT, RES = 'L/16', 384\n", + "# VARIANT, RES = 'So400m/14', 224\n", + "# VARIANT, RES = 'So400m/14', 384\n", + "# VARIANT, RES = 'B/16-i18n', 256\n", + "# VARIANT, RES = 'So400m/16-i18n', 256\n", + "\n", + "CKPT, TXTVARIANT, EMBDIM, SEQLEN, VOCAB = {\n", + " ('B/16', 224): ('webli_en_b16_224_63724782.npz', 'B', 768, 64, 32_000),\n", + " ('B/16', 256): ('webli_en_b16_256_60500360.npz', 'B', 768, 64, 32_000),\n", + " ('B/16', 384): ('webli_en_b16_384_68578854.npz', 'B', 768, 64, 32_000),\n", + " ('B/16', 512): ('webli_en_b16_512_68580893.npz', 'B', 768, 64, 32_000),\n", + " ('L/16', 256): ('webli_en_l16_256_60552751.npz', 'L', 1024, 64, 32_000),\n", + " ('L/16', 384): ('webli_en_l16_384_63634585.npz', 'L', 1024, 64, 32_000),\n", + " ('So400m/14', 224): ('webli_en_so400m_224_57633886.npz', 'So400m', 1152, 16, 32_000),\n", + " ('So400m/14', 384): ('webli_en_so400m_384_58765454.npz', 'So400m', 1152, 64, 32_000),\n", + " ('B/16-i18n', 256): ('webli_i18n_b16_256_66117334.npz', 'B', 768, 64, 250_000),\n", + " ('So400m/16-i18n', 256): ('webli_i18n_so400m_16_256_78061115.npz', 'So400m', None, 64, 250_000),\n", + "}[VARIANT, RES]\n", + "\n", + "# It is significantly faster to first copy the checkpoint (30s vs 8m30 for B and 1m vs ??? 
for L)\n", + "!test -f /tmp/{CKPT} || gsutil cp gs://big_vision/siglip/{CKPT} /tmp/\n", + "\n", + "if VARIANT.endswith('-i18n'):\n", + " VARIANT = VARIANT[:-len('-i18n')]\n", + "\n", + "import big_vision.models.proj.image_text.two_towers as model_mod\n", + "\n", + "model_cfg = ml_collections.ConfigDict()\n", + "model_cfg.image_model = 'vit' # TODO(lbeyer): remove later, default\n", + "model_cfg.text_model = 'proj.image_text.text_transformer' # TODO(lbeyer): remove later, default\n", + "model_cfg.image = dict(variant=VARIANT, pool_type='map')\n", + "model_cfg.text = dict(variant=TXTVARIANT, vocab_size=VOCAB)\n", + "model_cfg.out_dim = (None, EMBDIM) # (image_out_dim, text_out_dim)\n", + "model_cfg.bias_init = -10.0\n", + "model_cfg.temperature_init = 10.0\n", + "\n", + "model = model_mod.Model(**model_cfg)\n", + "\n", + "# Using `init_params` is slower but will lead to `load` below performing sanity-checks.\n", + "# init_params = jax.jit(model.init, backend=\"cpu\")(jax.random.PRNGKey(42), jnp.zeros([1, RES, RES, 3], jnp.float32), jnp.zeros([1, SEQLEN], jnp.int32))['params']\n", + "init_params = None # Faster but bypasses loading sanity-checks.\n", + "\n", + "params = model_mod.load(init_params, f'/tmp/{CKPT}', model_cfg)" + ], + "metadata": { + "id": "0DsOabGD7MRG", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "5afc9f52-7eb4-4a0d-b681-3ab5945ce9b4" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Copying gs://big_vision/siglip/webli_i18n_b16_256_66117334.npz...\n", + "- [1 files][ 1.3 GiB/ 1.3 GiB] 45.3 MiB/s \n", + "Operation completed over 1 objects/1.3 GiB. 
\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title Load and embed images\n", + "\n", + "import big_vision.pp.builder as pp_builder\n", + "import big_vision.pp.ops_general\n", + "import big_vision.pp.ops_image\n", + "import big_vision.pp.ops_text\n", + "import PIL\n", + "\n", + "!wget -q https://cdn.openai.com/multimodal-neurons/assets/apple/apple-ipod.jpg\n", + "!wget -q https://cdn.openai.com/multimodal-neurons/assets/apple/apple-blank.jpg\n", + "!wget -q 'https://images.unsplash.com/photo-1566467021888-b03548769dd1?ixlib=rb-4.0.3&q=85&fm=jpg&crop=entropy&cs=srgb&dl=svetlana-gumerova-hQHm2D1fH70-unsplash.jpg&w=640' -O cold_drink.jpg\n", + "!wget -q 'https://images.rawpixel.com/image_1300/czNmcy1wcml2YXRlL3Jhd3BpeGVsX2ltYWdlcy93ZWJzaXRlX2NvbnRlbnQvbHIvdXB3azU4ODU5NzY1LXdpa2ltZWRpYS1pbWFnZS1rb3diMmhkeC5qcGc.jpg' -O hot_drink.jpg\n", + "!wget -q https://storage.googleapis.com/big_vision/siglip/authors.jpg\n", + "!wget -q https://storage.googleapis.com/big_vision/siglip/siglip.jpg\n", + "!wget -q https://storage.googleapis.com/big_vision/siglip/caffeine.jpg\n", + "!wget -q https://storage.googleapis.com/big_vision/siglip/robosign.jpg\n", + "!wget -q https://storage.googleapis.com/big_vision/siglip/fried_fish.jpeg\n", + "!wget -q 'https://pbs.twimg.com/media/FTyEyxyXsAAyKPc?format=jpg&name=small' -O cow_beach.jpg\n", + "!wget -q 'https://storage.googleapis.com/big_vision/siglip/cow_beach2.jpg' -O cow_beach2.jpg\n", + "!wget -q 'https://pbs.twimg.com/media/Frb6NIEXwAA8-fI?format=jpg&name=medium' -O mountain_view.jpg\n", + "\n", + "\n", + "images = [PIL.Image.open(fname) for fname in [\n", + " 'apple-ipod.jpg',\n", + " 'apple-blank.jpg',\n", + " 'cold_drink.jpg',\n", + " 'hot_drink.jpg',\n", + " 'caffeine.jpg',\n", + " 'siglip.jpg',\n", + " 'authors.jpg',\n", + " 'robosign.jpg',\n", + " 'cow_beach.jpg',\n", + " 'cow_beach2.jpg',\n", + " 'mountain_view.jpg',\n", + "]]\n", + "\n", + "pp_img = 
pp_builder.get_preprocess_fn(f'resize({RES})|value_range(-1, 1)')\n", + "imgs = np.array([pp_img({'image': np.array(image)})['image'] for image in images])\n", + "zimg, _, out = model.apply({'params': params}, imgs, None)\n", + "\n", + "print(imgs.shape, zimg.shape)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xmuXfCfBjgeF", + "outputId": "3627819b-007e-4107-e1f4-06b7ad3ac03a" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(11, 384, 384, 3) (11, 1024)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title Tokenize and embed texts\n", + "\n", + "texts = [\n", + " 'an apple',\n", + " 'a picture of an apple',\n", + " 'an ipod',\n", + " 'granny smith',\n", + " 'an apple with a note saying \"ipod\"',\n", + " 'a cold drink on a hot day',\n", + " 'a hot drink on a cold day',\n", + " 'a photo of a cold drink on a hot day',\n", + " 'a photo of a hot drink on a cold day',\n", + " #\n", + " 'a photo of two guys in need of caffeine',\n", + " 'a photo of two guys in need of water',\n", + " 'a photo of the SigLIP authors',\n", + " 'a photo of a rock band',\n", + " 'a photo of researchers at Google Brain',\n", + " 'a photo of researchers at OpenAI',\n", + " #\n", + " 'a robot on a sign',\n", + " 'a photo of a robot on a sign',\n", + " 'an empty street',\n", + " 'autumn in Toronto',\n", + " 'a photo of autumn in Toronto',\n", + " 'a photo of Toronto in autumn',\n", + " 'a photo of Toronto in summer',\n", + " 'autumn in Singapore',\n", + " #\n", + " 'cow',\n", + " 'a cow in a tuxedo',\n", + " 'a cow on the beach',\n", + " 'a cow in the prairie',\n", + " #\n", + " 'the real mountain view',\n", + " 'Zürich',\n", + " 'San Francisco',\n", + " 'a picture of a laptop with the lockscreen on, a cup of cappucino, salt and pepper grinders. 
The view through the window reveals lake Zürich and the Alps in the background of the city.',\n", + "]\n", + "\n", + "TOKENIZERS = {\n", + " 32_000: 'c4_en',\n", + " 250_000: 'mc4',\n", + "}\n", + "pp_txt = pp_builder.get_preprocess_fn(f'tokenize(max_len={SEQLEN}, model=\"{TOKENIZERS[VOCAB]}\", eos=\"sticky\", pad_value=1, inkey=\"text\")')\n", + "txts = np.array([pp_txt({'text': text})['labels'] for text in texts])\n", + "_, ztxt, out = model.apply({'params': params}, None, txts)\n", + "\n", + "print(txts.shape, ztxt.shape)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KGrpkRTtjU-L", + "outputId": "7c43b56e-cd53-4801-b1e3-66774368a1d2" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(31, 64) (31, 1024)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# This is how to get all probabilities:\n", + "print(f\"Learned temperature {out['t'].item():.1f}, learned bias: {out['b'].item():.1f}\")\n", + "probs = jax.nn.sigmoid(zimg @ ztxt.T * out['t'] + out['b'])\n", + "print(f\"{probs[0][0]:.1%} that image 0 is '{texts[0]}'\")\n", + "print(f\"{probs[0][1]:.1%} that image 0 is '{texts[1]}'\")" + ], + "metadata": { + "id": "TIdAVw9VGEAw", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "22fc0d9a-8986-4679-ca89-6e4330a55c6e" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Learned temperature 118.2, learned bias: -12.7\n", + "10.4% that image 0 is 'an apple'\n", + "42.8% that image 0 is 'a picture of an apple'\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# @title Pretty demo (code)\n", + "from IPython.display import Javascript\n", + "\n", + "DEMO_IMG_SIZE = 96\n", + "\n", + "import base64\n", + "import io\n", + "\n", + "def bv2rgb(bv_img):\n", + " return (bv_img * 127.5 + 127.5).astype(np.uint8)\n", + "\n", + "def html_img(*, enc_img=None, pixels=None, 
id=None, size=100, max_size=None, max_height=None, style=\"\"):\n", + " if enc_img is None and pixels is not None:\n", + " with io.BytesIO() as buf:\n", + " PIL.Image.fromarray(np.asarray(pixels)).save(buf, format=\"JPEG\")\n", + " enc_img = buf.getvalue()\n", + "\n", + " img_data = base64.b64encode(np.ascontiguousarray(enc_img)).decode('ascii')\n", + "\n", + " id_spec = f'id={id}' if id else ''\n", + " if size is not None:\n", + " style_spec = f'style=\"{style}; width: {size}px; height: {size}px\"'\n", + " elif max_size is not None:\n", + " style_spec = f'style=\"{style}; width: auto; height: auto; max-width: {max_size}px; max-height: {max_size}px;\"'\n", + " elif max_height is not None:\n", + " style_spec = f'style=\"{style}; object-fit: cover; width: auto; height: {max_height}px;\"'\n", + " else: style_spec = ''\n", + "\n", + " return f''\n", + "\n", + "\n", + "def make_table(zimg, ztxt, out):\n", + " # The default learnable bias is a little conservative. Play around with it!\n", + " t, b = out['t'].item(), out['b'].item()\n", + " tempered_logits = zimg @ ztxt.T * t\n", + " probs = 1 / (1 + np.exp(-tempered_logits - b))\n", + " publish.javascript(f\"var logits = {tempered_logits.tolist()};\")\n", + "\n", + " def color(p):\n", + " return mpl.colors.rgb2hex(mpl.cm.Greens(p / 2)) if p >= 0.01 else \"transparent\"\n", + "\n", + " publish.javascript(f\"var cmap = {[color(x) for x in np.linspace(0, 1, 50)]};\")\n", + " def cell(x, iimg, itxt):\n", + " return f\"
{x * 100:>4.0f}%
\"\n", + "\n", + " html = f'''\n", + "

\n", + " \n", + " \n", + " \n", + "

\n", + " '''\n", + "\n", + " html += \"\\n\"\n", + " html += \"\"\n", + " html += \"\".join([f\"\" + \"\".join([cell(probs[iimg, itxt], iimg, itxt) for iimg in range(len(imgs))]) + f\"
\" + html_img(pixels=bv2rgb(img), size=DEMO_IMG_SIZE) for img in imgs])\n", + " html += \"\"\n", + " for itxt, txt in enumerate(texts):\n", + " html += f\"
{txt}\"\n", + "\n", + " publish.css(r\"\"\"\n", + " table {\n", + " border-collapse: collapse;\n", + " }\n", + "\n", + " tr {\n", + " border: 1px transparent;\n", + " }\n", + "\n", + " tr:nth-child(odd) {\n", + " background-color: #F5F5F5;\n", + " }\n", + "\n", + " tr:hover {\n", + " background-color: lightyellow;\n", + " border: 1px solid black;\n", + " }\n", + "\n", + " td.pct {\n", + " text-align: center;\n", + " }\n", + " \"\"\")\n", + " publish.html(html)\n", + "\n", + " # JS code to compute and write all probs from the logits.\n", + " display(Javascript('''\n", + " function update(b) {\n", + " for(var iimg = 0; iimg < logits.length; iimg++) {\n", + " for(var itxt = 0; itxt < logits[iimg].length; itxt++) {\n", + " const el = document.getElementById(`p_${iimg}_${itxt}`);\n", + " const p = Math.round(100 / (1 + Math.exp(-logits[iimg][itxt] - b)));\n", + " const pad = p < 10.0 ? ' ' : p < 100.0 ? ' ' : ''\n", + " el.innerHTML = pad + (p).toFixed(0) + '%';\n", + "\n", + " const td = document.getElementById(`td_${iimg}_${itxt}`);\n", + " const c = cmap[Math.round(p / 100 * (cmap.length - 1))];\n", + " td.style.backgroundColor = c;\n", + " }\n", + " }\n", + " }\n", + " '''))\n", + "\n", + " # JS code to connect the bias value slider\n", + " display(Javascript('''\n", + " const value = document.querySelector(\"#value\");\n", + " const input = document.querySelector(\"#b\");\n", + " value.textContent = input.value;\n", + " input.addEventListener(\"input\", (event) => {\n", + " value.textContent = event.target.value;\n", + " update(event.target.value);\n", + " });\n", + " '''))\n", + "\n", + " # Make the cell output as large as the table to avoid annoying scrollbars.\n", + " display(Javascript(f'update({b})'))\n", + " display(Javascript('google.colab.output.resizeIframeToContent()'))" + ], + "metadata": { + "cellView": "form", + "id": "eolOc7vd_ZSj" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "make_table(zimg, ztxt, 
out)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 767 + }, + "id": "mt5BIywzzA6c", + "outputId": "3b06cfb9-a3da-42d7-8caf-d5366d058f8b" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "var logits = [[10.509522438049316, 12.372017860412598, 13.07434368133545, 9.578202247619629, 21.19094467163086, 1.310517430305481, 1.2763848304748535, 3.0990359783172607, 2.360225200653076, -3.670855760574341, -4.780072212219238, -1.4530967473983765, -3.3108861446380615, -3.8945610523223877, -4.378420829772949, 0.35140618681907654, 2.7228779792785645, -6.806382656097412, -3.9012961387634277, -1.7843879461288452, -4.578653812408447, -7.306142807006836, -1.253274917602539, -1.8402824401855469, -6.329799175262451, -9.506726264953613, -5.78713846206665, -1.6370103359222412, -9.404793739318848, -4.342881202697754, -13.128281593322754], [12.365941047668457, 13.45022964477539, 0.9843839406967163, 12.809731483459473, 6.767915725708008, 2.808335304260254, 1.050551414489746, 3.6161491870880127, 1.152547001838684, -7.214369297027588, -5.146897792816162, -6.283102035522461, -11.463550567626953, -7.751645565032959, -11.252680778503418, -9.319047927856445, -8.11094856262207, -8.898587226867676, -2.15217661857605, -0.10237424820661545, -3.6214966773986816, -12.085700035095215, -1.599789023399353, -1.7422595024108887, -7.456813335418701, -8.457598686218262, -5.5325212478637695, -2.4997880458831787, -8.217476844787598, -8.986675262451172, -10.336335182189941], [-1.1052173376083374, -1.3570473194122314, -3.8713269233703613, 2.3654367923736572, -9.037796020507812, 11.620930671691895, 2.1417031288146973, 13.036051750183105, -0.11228565871715546, 0.33224615454673767, 3.9813454151153564, -6.005640029907227, -5.856462001800537, -7.669452667236328, -9.974565505981445, -11.242084503173828, -12.130292892456055, -5.630223274230957, -5.570030689239502, 
-6.117311000823975, -7.32966423034668, -5.952571392059326, 0.4303727447986603, -0.5507297515869141, -7.554576873779297, -3.3274905681610107, -3.4397053718566895, 0.9088093638420105, -4.845495700836182, -7.663942337036133, -10.328642845153809], [1.1323682069778442, 1.3157405853271484, 0.828519880771637, -1.6223008632659912, -7.967062950134277, 4.090002059936523, 14.007913589477539, 6.785359859466553, 16.369604110717773, 1.524818778038025, -4.911859035491943, -9.018620491027832, -9.306066513061523, -8.402979850769043, -11.57016658782959, -9.890503883361816, -10.68331527709961, -5.442021369934082, 4.999141216278076, 5.106411933898926, 4.015860080718994, -12.08991527557373, 6.171087741851807, -1.0262863636016846, -8.962656021118164, -6.404715538024902, -4.912563323974609, -2.5522496700286865, -6.039242744445801, -10.613517761230469, -6.997122287750244], [-3.4062156677246094, -3.2604005336761475, -4.109685897827148, -4.58593225479126, -9.489058494567871, 1.6483688354492188, 2.376404047012329, 0.7108156681060791, 0.5808579921722412, 17.98756980895996, 9.364227294921875, 1.8207945823669434, -6.545583724975586, 3.3331942558288574, 2.5704448223114014, -7.702937602996826, -9.870623588562012, -1.303507924079895, -5.957301616668701, -6.226568222045898, -6.917541980743408, -7.621560573577881, -0.5124773979187012, -2.2896718978881836, -12.721405029296875, -6.885163307189941, -9.90884780883789, -1.4125298261642456, 2.3772332668304443, -5.4370293617248535, -1.6405099630355835], [-3.2013378143310547, -3.3440065383911133, -1.2165169715881348, -4.172476291656494, -5.278318881988525, -2.3818702697753906, -3.210822582244873, -3.580622911453247, -5.1373138427734375, -1.7848750352859497, -1.4050911664962769, 16.463136672973633, -1.4766411781311035, 16.46843147277832, 11.259382247924805, -1.0086976289749146, -1.908290982246399, -4.666292667388916, -2.9601247310638428, -2.0503976345062256, -1.600439190864563, -1.4223682880401611, -2.251126289367676, -4.444605827331543, -9.10830020904541, 
-10.853714942932129, -11.52085018157959, -1.6640691757202148, 2.193969964981079, 2.127061367034912, -4.728240013122559], [-0.5153040289878845, -1.290441632270813, -1.3887863159179688, -2.88513445854187, -8.828889846801758, 1.3482768535614014, 0.010438825935125351, -0.6988681554794312, -2.9927048683166504, 2.8313045501708984, 2.5383071899414062, 6.094320297241211, -1.2357840538024902, 19.095901489257812, 12.049205780029297, -2.1667087078094482, -3.2871627807617188, -4.000303268432617, -2.7362473011016846, -1.7782089710235596, -1.643406629562378, -4.0933918952941895, -2.1210238933563232, -3.1019272804260254, -8.912919998168945, -8.04006290435791, -10.427931785583496, 0.8204227089881897, -1.7909467220306396, -0.8497583270072937, -5.065787315368652], [-1.4752472639083862, -0.13337232172489166, 1.7657679319381714, -2.7154576778411865, -2.644958257675171, -1.401767373085022, 0.21228086948394775, -0.5131799578666687, 1.4820858240127563, -2.5781843662261963, 3.075222969055176, -2.9382081031799316, -7.704923152923584, -3.6199238300323486, -3.213698625564575, 10.677529335021973, 12.515663146972656, 3.690605401992798, 10.979350090026855, 12.963836669921875, 11.986873626708984, 4.023745059967041, 0.9576215744018555, -4.142323970794678, -7.46238374710083, -9.735015869140625, -8.231826782226562, -1.0106267929077148, -2.2898473739624023, -2.2792820930480957, -6.5174055099487305], [-0.3335295617580414, 1.2584013938903809, -1.2919337749481201, -2.0686888694763184, -11.050207138061523, 5.148484706878662, 0.46310505270957947, 4.050027847290039, -1.6178984642028809, -6.791775703430176, -2.2926063537597656, -7.568892002105713, -10.240560531616211, -7.8912248611450195, -11.374415397644043, -7.808314323425293, -7.384036540985107, -5.577442646026611, -4.582977771759033, -4.019510746002197, -5.569993019104004, -2.2238216400146484, -0.21682055294513702, 12.080615043640137, 6.551390647888184, 17.416383743286133, 8.308161735534668, -0.3994586169719696, -1.8691462278366089, -2.187755823135376, 
-4.866983413696289], [-2.294813394546509, -1.4864670038223267, -1.4635752439498901, -2.9900710582733154, -14.971826553344727, 4.747520446777344, -0.9042328000068665, 3.1032114028930664, -3.679764747619629, -5.160387992858887, -1.1286523342132568, -7.035560607910156, -6.664344787597656, -7.769715309143066, -10.94699478149414, -6.526098251342773, -6.273430347442627, -6.723901271820068, -5.448723316192627, -5.721604824066162, -7.575157165527344, -4.370161056518555, -1.393196702003479, 11.913715362548828, 17.861845016479492, 15.086359024047852, 6.581197261810303, -0.31534600257873535, -2.1320040225982666, -4.305175304412842, -7.700469970703125], [-2.552478790283203, -1.305349349975586, 0.03923465311527252, -5.891383647918701, -7.833784580230713, 1.2974026203155518, 5.689708709716797, 2.8017938137054443, 7.800131320953369, -0.12797383964061737, -4.34028434753418, -4.815661430358887, -8.476018905639648, -1.2871994972229004, -1.1152652502059937, -6.992332458496094, -7.258864402770996, 0.09565334022045135, -6.82894229888916, -5.026597023010254, -3.2372162342071533, -7.9831085205078125, -3.8290252685546875, -0.595430850982666, -5.086977005004883, -4.143807888031006, -5.033395290374756, 4.200597763061523, 6.196822166442871, -4.807774066925049, 23.876855850219727]];\n", + "//# sourceURL=js_5e545691b3" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "var cmap = ['transparent', '#f6fcf4', '#f4fbf2', '#f3faf0', '#f1faee', '#f0f9ec', '#eff9eb', '#edf8e9', '#ecf8e8', '#eaf7e6', '#e8f6e4', '#e7f6e3', '#e5f5e1', '#e4f5df', '#e1f3dc', '#def2d9', '#dcf2d7', '#daf0d4', '#d7efd1', '#d5efcf', '#d2edcc', '#d0edca', '#cdecc7', '#cbeac4', '#c9eac2', '#c6e8bf', '#c3e7bc', '#c0e6b9', '#bce4b5', '#bae3b3', '#b6e2af', '#b4e1ad', '#b0dfaa', '#acdea6', '#aadda4', '#a7dba0', '#a3da9d', '#a0d99b', '#9cd797', '#99d595', '#95d391', '#91d28e', '#8ed08b', '#8ace88', '#87cd86', '#83cb82', '#7fc97f', '#7cc87c', 
'#78c679', '#73c476'];\n", + "//# sourceURL=js_b212ab59e1" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " \n", + "

\n", + " \n", + "
  10%
  43%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
an apple
  43%
  69%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
a picture of an apple
  60%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
an ipod
   4%
  54%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
granny smith
 100%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
an apple with a note saying \"ipod\"
   0%
   0%
  26%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
a cold drink on a hot day
   0%
   0%
   0%
  79%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
a hot drink on a cold day
   0%
   0%
  59%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
a photo of a cold drink on a hot day
   0%
   0%
   0%
  98%
   0%
   0%
   0%
   0%
   0%
   0%
   1%
a photo of a hot drink on a cold day
   0%
   0%
   0%
   0%
 100%
   0%
   0%
   0%
   0%
   0%
   0%
a photo of two guys in need of caffeine
   0%
   0%
   0%
   0%
   4%
   0%
   0%
   0%
   0%
   0%
   0%
a photo of two guys in need of water
   0%
   0%
   0%
   0%
   0%
  98%
   0%
   0%
   0%
   0%
   0%
a photo of the SigLIP authors
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
a photo of a rock band
   0%
   0%
   0%
   0%
   0%
  98%
 100%
   0%
   0%
   0%
   0%
a photo of researchers at Google Brain
   0%
   0%
   0%
   0%
   0%
  20%
  35%
   0%
   0%
   0%
   0%
a photo of researchers at OpenAI
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  12%
   0%
   0%
   0%
a robot on a sign
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  46%
   0%
   0%
   0%
a photo of a robot on a sign
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
an empty street
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  16%
   0%
   0%
   0%
autumn in Toronto
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  57%
   0%
   0%
   0%
a photo of autumn in Toronto
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  34%
   0%
   0%
   0%
a photo of Toronto in autumn
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
a photo of Toronto in summer
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
autumn in Singapore
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  36%
  32%
   0%
cow
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  99%
   0%
a cow in a tuxedo
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  99%
  92%
   0%
a cow on the beach
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   1%
   0%
   0%
a cow in the prairie
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
the real mountain view
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
Zürich
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
San Francisco
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
 100%
a picture of a laptop with the lockscreen on, a cup of cappucino, salt and pepper grinders. The view through the window reveals lake Zürich and the Alps in the background of the city." + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "\n", + " function update(b) {\n", + " for(var iimg = 0; iimg < logits.length; iimg++) {\n", + " for(var itxt = 0; itxt < logits[iimg].length; itxt++) {\n", + " const el = document.getElementById(`p_${iimg}_${itxt}`);\n", + " const p = Math.round(100 / (1 + Math.exp(-logits[iimg][itxt] - b)));\n", + " const pad = p < 10.0 ? ' ' : p < 100.0 ? ' ' : ''\n", + " el.innerHTML = pad + (p).toFixed(0) + '%';\n", + "\n", + " const td = document.getElementById(`td_${iimg}_${itxt}`);\n", + " const c = cmap[Math.round(p / 100 * (cmap.length - 1))];\n", + " td.style.backgroundColor = c;\n", + " }\n", + " }\n", + " }\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "\n", + " const value = document.querySelector(\"#value\");\n", + " const input = document.querySelector(\"#b\");\n", + " value.textContent = input.value;\n", + " input.addEventListener(\"input\", (event) => {\n", + " value.textContent = event.target.value;\n", + " update(event.target.value);\n", + " });\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "update(-12.661874771118164)" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "google.colab.output.resizeIframeToContent()" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# More international examples (choose i18n model for this)" + ], + "metadata": { + "id": "f5lIiaD700UK" + } + }, + { + "cell_type": "code", + "source": [ 
+ "#@title Load and embed images\n", + "\n", + "import big_vision.pp.builder as pp_builder\n", + "import big_vision.pp.ops_general\n", + "import big_vision.pp.ops_image\n", + "import big_vision.pp.ops_text\n", + "import PIL\n", + "\n", + "!wget -q 'https://live.staticflickr.com/4152/5189547658_3b2a7126cb_b.jpg' -O ants_climbing_a_tree_food.jpg\n", + "!wget -q 'https://storage.googleapis.com/big_vision/siglip/pexels-poranimm-athithawatthee-842401.jpg' -O ants_climbing_tree.jpg\n", + "!wget -q 'https://images.rawpixel.com/image_1300/cHJpdmF0ZS9zdGF0aWMvaW1hZ2Uvd2Vic2l0ZS8yMDIyLTA0L2xyL3B4OTE3NDYyLWltYWdlLWt3eW8ydmxrLmpwZw.jpg' -O lion_head.jpg\n", + "!wget -q 'https://images.rawpixel.com/image_1300/cHJpdmF0ZS9sci9pbWFnZXMvd2Vic2l0ZS8yMDIzLTA5L3Jhd3BpeGVsX29mZmljZV8yN19taW5pbWFsX3NpbXBsZV9fbGlvbl9fcGFwZXJfY29sbGFnZV9taW5pbWFsX183OGRlOGU3OS02ZTE3LTQ2YzAtYTUyOS02ZDAxM2YzNDg0OWVfMi5qcGc.jpg' -O lion_head_red.jpg\n", + "!wget -q https://live.staticflickr.com/232/551040940_87299a85ec_h.jpg -O meat_ball.jpg\n", + "!wget -q https://storage.googleapis.com/big_vision/siglip/squirrel_fish.jpg -O squirrel_fish.jpg\n", + "# !wget -q 'https://ideogram.ai/api/images/direct/F3lMxBprSk6ligq5Vy3XSw' -O squirrel_fish2.jpg # Seems like ideogram now forbits (403) direct downloads?\n", + "!wget -q 'https://pbs.twimg.com/media/FTyEyxyXsAAyKPc?format=jpg&name=small' -O cow_beach.jpg\n", + "!wget -q 'https://storage.googleapis.com/big_vision/siglip/cow_beach2.jpg' -O cow_beach2.jpg\n", + "\n", + "\n", + "images = [PIL.Image.open(fname) for fname in [\n", + " 'ants_climbing_a_tree_food.jpg',\n", + " 'ants_climbing_tree.jpg',\n", + " 'meat_ball.jpg',\n", + " 'lion_head.jpg',\n", + " 'lion_head_red.jpg',\n", + " 'fried_fish.jpeg',\n", + " 'squirrel_fish.jpg',\n", + " # 'squirrel_fish2.jpg',\n", + " 'cow_beach.jpg',\n", + " 'cow_beach2.jpg',\n", + "]]\n", + "\n", + "pp_img = pp_builder.get_preprocess_fn(f'resize({RES})|value_range(-1, 1)')\n", + "imgs = np.array([pp_img({'image': 
np.array(image)})['image'] for image in images])\n", + "zimg, _, out = model.apply({'params': params}, imgs, None)\n", + "\n", + "print(imgs.shape, zimg.shape)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YsK74v2J04Xp", + "outputId": "63f024ad-205c-4dd3-a5af-4dfd5ff198ca" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tensorflow_addons/utils/tfa_eol_msg.py:23: UserWarning: \n", + "\n", + "TensorFlow Addons (TFA) has ended development and introduction of new features.\n", + "TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.\n", + "Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). \n", + "\n", + "For more information see: https://github.com/tensorflow/addons/issues/2807 \n", + "\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(10, 256, 256, 3) (10, 768)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title Tokenize and embed texts\n", + "\n", + "texts = [\n", + " '蚂蚁上树',\n", + " '肉末粉丝',\n", + " 'ants climbing a tree',\n", + " 'minced pork rice noodle',\n", + " #\n", + " '红烧狮子头',\n", + " 'red burned lion head',\n", + " 'lion head',\n", + " 'meat ball with soy sauce',\n", + " #\n", + " '松鼠鳜鱼',\n", + " 'squirrel',\n", + " 'squirrel and fish',\n", + " 'squirrel mandarinfish',\n", + " 'squirrel mandarin fish',\n", + " 'sweet and sour mandarin fish',\n", + " #\n", + " 'cow',\n", + " 'a cow in a tuxedo',\n", + " 'a cow on the beach',\n", + " 'a cow in the prairie',\n", + " 'une vache sur la plage',\n", + " 'eine Kuh am Strand',\n", + " 'วัวอยู่ที่ชายหาด',\n", + " '一只躺在沙滩上的牛',\n", + " '一只沙滩上的牛',\n", + " 'корова на пляже',\n", + " 'بقرة على الشاطئ',\n", + "]\n", + "\n", + "TOKENIZERS = {\n", + " 32_000: 'c4_en',\n", + " 
250_000: 'mc4',\n", + "}\n", + "pp_txt = pp_builder.get_preprocess_fn(f'tokenize(max_len={SEQLEN}, model=\"{TOKENIZERS[VOCAB]}\", eos=\"sticky\", pad_value=1, inkey=\"text\")')\n", + "txts = np.array([pp_txt({'text': text})['labels'] for text in texts])\n", + "_, ztxt, out = model.apply({'params': params}, None, txts)\n", + "\n", + "print(txts.shape, ztxt.shape)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dAzAuYJh1eQ3", + "outputId": "6c07c1a2-c236-4b68-b7e3-f92dcc070fcc" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(25, 64) (25, 768)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "make_table(zimg, ztxt, out)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 633 + }, + "id": "JlMwn6K1-62i", + "outputId": "6b8fa113-06f3-492c-ffa7-942d4799cae3" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "var logits = [[15.194855690002441, 14.548081398010254, 4.362802505493164, 8.915352821350098, 0.12249733507633209, -1.8669313192367554, -2.1026358604431152, 4.83571195602417, -1.48772132396698, -2.885380744934082, -3.757584571838379, -9.74438190460205, -6.739628791809082, 1.0982742309570312, -1.8383992910385132, -8.639388084411621, -8.514564514160156, -8.664950370788574, -9.010446548461914, -8.695591926574707, -0.29446348547935486, -2.3145699501037598, 0.3301776945590973, -9.183826446533203, -7.548545837402344], [3.1235272884368896, -2.662849187850952, 15.499628067016602, -5.6270432472229, -8.800381660461426, -5.2857537269592285, -4.901862621307373, -8.64078426361084, -8.457619667053223, -0.7642378211021423, -6.292320251464844, -6.919025421142578, -5.699285984039307, -6.146625518798828, -1.7575650215148926, -9.384129524230957, -6.215198040008545, -6.763903617858887, -6.789668560028076, -6.646523952484131, 
2.078498125076294, 0.1571565568447113, 1.2640687227249146, -4.958133697509766, -4.504084587097168], [2.4513118267059326, 3.711794853210449, -2.7506296634674072, 6.2139153480529785, 12.623679161071777, -2.242187261581421, -0.873506486415863, 12.75291633605957, 5.779244422912598, -3.411043405532837, -2.7684485912323, 0.8032691478729248, 2.4132730960845947, 10.139656066894531, -1.5548374652862549, -7.363276481628418, -10.937602043151855, -10.354545593261719, -12.12853717803955, -11.330802917480469, -3.7032158374786377, -4.167450428009033, -2.857227087020874, -12.429163932800293, -10.023411750793457], [-7.848373889923096, -8.82786750793457, -4.246535301208496, -11.672212600708008, -4.754408836364746, 5.023717403411865, 10.245930671691895, -9.671830177307129, -5.305540561676025, 0.939210832118988, -3.7660276889801025, -6.9834089279174805, -5.540616512298584, -7.520627498626709, 0.6897578239440918, -4.008193016052246, -3.137038230895996, -2.492392063140869, -3.349771022796631, -2.571514129638672, -0.5961494445800781, 1.920261025428772, -0.5972135066986084, -3.192373275756836, -2.797152280807495], [-7.591951370239258, -9.57149887084961, -7.410569667816162, -10.887884140014648, -2.1018383502960205, 10.839365005493164, 12.306414604187012, -8.755990028381348, -6.4970011711120605, 1.732677698135376, -1.484777808189392, -3.788830280303955, -2.954533338546753, -4.137475967407227, 1.2805907726287842, -4.848579406738281, -4.63262939453125, -4.869859218597412, -4.654362201690674, -4.7860589027404785, -0.6505587697029114, -0.741170346736908, -1.2220640182495117, -5.068485260009766, -4.302990913391113], [0.38381102681159973, -0.5291793346405029, -4.558042049407959, -0.798613965511322, 1.3992505073547363, -3.269932508468628, -2.243269205093384, 3.4091484546661377, 13.690838813781738, -3.199730396270752, 2.4068713188171387, 4.793602466583252, 6.522286415100098, 12.24045467376709, -0.973887026309967, -5.842926025390625, -8.813263893127441, -10.347548484802246, -10.193572044372559, 
-9.09493350982666, 0.17290785908699036, -2.690534830093384, 0.4429348409175873, -10.299919128417969, -7.2381591796875], [-11.066581726074219, -10.138232231140137, -5.7180986404418945, -11.073030471801758, -9.701227188110352, 1.2774648666381836, 0.6818075776100159, -11.766871452331543, 7.582111358642578, 6.539462089538574, 13.692913055419922, 11.608633041381836, 12.523263931274414, 2.838015556335449, 0.06712919473648071, -8.434947967529297, -5.371018409729004, -7.046348571777344, -5.160297393798828, -4.178375244140625, -1.4383944272994995, -1.4511940479278564, -0.826172947883606, -4.657361030578613, -4.185240745544434], [-3.598116874694824, -6.576178073883057, -2.7102479934692383, -8.999201774597168, -6.829661846160889, -5.066120147705078, -1.7694122791290283, -7.724926471710205, 0.23896828293800354, 11.48562240600586, 18.98163414001465, 10.054450035095215, 10.879026412963867, -0.23405185341835022, 1.1370410919189453, -4.135552406311035, -0.34031882882118225, -1.2078852653503418, -1.5318009853363037, -3.0245869159698486, -0.7356898188591003, 2.346902847290039, 1.158348560333252, -1.281561017036438, -1.2338509559631348], [-9.843914985656738, -9.799589157104492, -6.7716383934021, -9.883660316467285, -12.059309005737305, -6.143594264984131, -3.1696691513061523, -7.953651428222656, -14.6300048828125, -5.153632164001465, -9.101214408874512, -8.86422061920166, -7.411843299865723, -9.261401176452637, 12.271851539611816, 7.439639091491699, 19.08420181274414, 9.05471420288086, 18.37834930419922, 18.505441665649414, 14.171286582946777, 12.338602066040039, 14.924001693725586, 17.368127822875977, 17.931604385375977], [-9.439372062683105, -8.37105941772461, -9.730523109436035, -9.263359069824219, -7.634936809539795, -5.775638580322266, -0.2548319399356842, -6.097734451293945, -12.719864845275879, -5.2038702964782715, -8.733600616455078, -8.040817260742188, -6.40618896484375, -8.534762382507324, 11.509172439575195, 18.91118049621582, 14.150744438171387, 6.8233747482299805, 
13.563973426818848, 13.099942207336426, 10.563776016235352, 10.233851432800293, 11.005309104919434, 15.13718032836914, 14.48193359375]];\n", + "//# sourceURL=js_ca0f68d49c" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "var cmap = ['transparent', '#f6fcf4', '#f4fbf2', '#f3faf0', '#f1faee', '#f0f9ec', '#eff9eb', '#edf8e9', '#ecf8e8', '#eaf7e6', '#e8f6e4', '#e7f6e3', '#e5f5e1', '#e4f5df', '#e1f3dc', '#def2d9', '#dcf2d7', '#daf0d4', '#d7efd1', '#d5efcf', '#d2edcc', '#d0edca', '#cdecc7', '#cbeac4', '#c9eac2', '#c6e8bf', '#c3e7bc', '#c0e6b9', '#bce4b5', '#bae3b3', '#b6e2af', '#b4e1ad', '#b0dfaa', '#acdea6', '#aadda4', '#a7dba0', '#a3da9d', '#a0d99b', '#9cd797', '#99d595', '#95d391', '#91d28e', '#8ed08b', '#8ace88', '#87cd86', '#83cb82', '#7fc97f', '#7cc87c', '#78c679', '#73c476'];\n", + "//# sourceURL=js_b212ab59e1" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " \n", + "

\n", + " \n", + "
  91%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
蚂蚁上树
  84%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
肉末粉丝
   0%
  93%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
ants climbing a tree
   2%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
minced pork rice noodle
   0%
   0%
  43%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
红烧狮子头
   0%
   0%
   0%
   0%
  11%
   0%
   0%
   0%
   0%
   0%
red burned lion head
   0%
   0%
   0%
   7%
  36%
   0%
   0%
   0%
   0%
   0%
lion head
   0%
   0%
  47%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
meat ball with soy sauce
   0%
   0%
   0%
   0%
   0%
  69%
   0%
   0%
   0%
   0%
松鼠鳜鱼
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  20%
   0%
   0%
squirrel
   0%
   0%
   0%
   0%
   0%
   0%
  69%
 100%
   0%
   0%
squirrel and fish
   0%
   0%
   0%
   0%
   0%
   0%
  22%
   6%
   0%
   0%
squirrel mandarinfish
   0%
   0%
   0%
   0%
   0%
   0%
  41%
  12%
   0%
   0%
squirrel mandarin fish
   0%
   0%
   6%
   0%
   0%
  34%
   0%
   0%
   0%
   0%
sweet and sour mandarin fish
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  35%
  20%
cow
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
 100%
a cow in a tuxedo
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
 100%
  78%
a cow on the beach
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   2%
   0%
a cow in the prairie
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
 100%
  66%
une vache sur la plage
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
 100%
  55%
eine Kuh am Strand
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  78%
   9%
วัวอยู่ที่ชายหาด
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  37%
   7%
一只躺在沙滩上的牛
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  88%
  13%
一只沙滩上的牛
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  99%
  90%
корова на пляже
   0%
   0%
   0%
   0%
   0%
   0%
   0%
   0%
  99%
  83%
بقرة على الشاطئ" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "\n", + " function update(b) {\n", + " for(var iimg = 0; iimg < logits.length; iimg++) {\n", + " for(var itxt = 0; itxt < logits[iimg].length; itxt++) {\n", + " const el = document.getElementById(`p_${iimg}_${itxt}`);\n", + " const p = Math.round(100 / (1 + Math.exp(-logits[iimg][itxt] - b)));\n", + " const pad = p < 10.0 ? ' ' : p < 100.0 ? ' ' : ''\n", + " el.innerHTML = pad + (p).toFixed(0) + '%';\n", + "\n", + " const td = document.getElementById(`td_${iimg}_${itxt}`);\n", + " const c = cmap[Math.round(p / 100 * (cmap.length - 1))];\n", + " td.style.backgroundColor = c;\n", + " }\n", + " }\n", + " }\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "\n", + " const value = document.querySelector(\"#value\");\n", + " const input = document.querySelector(\"#b\");\n", + " value.textContent = input.value;\n", + " input.addEventListener(\"input\", (event) => {\n", + " value.textContent = event.target.value;\n", + " update(event.target.value);\n", + " });\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "update(-12.885268211364746)" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "google.colab.output.resizeIframeToContent()" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Explanation for non-Chinese speakers:\n", + "\n", + "- The first dish is literally called \"ants climbing a tree\" in Chinese.\n", + "- The second dish is literally called \"red burned lion head\" in Chinese.\n", + "- The third dish is literally called \"squirrel mandarinfish\" in Chinese.\n", + "\n", + "We are looking 
for more interesting examples that highlight culture-language aspects and where a non-EN model should \"get it\" while an EN-only model does not." + ], + "metadata": { + "id": "bNGoftU3y4UQ" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Example image credits\n", + "\n", + "- The apple and apple + iPod images are from OpenAI.\n", + "- [Cold drink on hot day](https://unsplash.com/fr/photos/hQHm2D1fH70).\n", + "- [Hot drink on cold day](https://www.rawpixel.com/image/3282934).\n", + "- The cows-on-the-beach images were created by Chitwan Saharia using the Imagen model and shared with permission.\n", + "- [\"ants climbing a tree\" noodles](https://www.flickr.com/photos/avlxyz/5189547658)\n", + "- [actual ants climbing on a tree](https://www.pexels.com/photo/macro-photo-of-five-orange-ants-842401/)\n", + "- [real lion head](https://www.rawpixel.com/image/5941715/free-public-domain-cc0-photo)\n", + "- [cartoon red lion head](https://www.rawpixel.com/image/12447997/image-texture-paper-png)\n", + "- Collaged [squirrel](https://www.pexels.com/photo/brown-squirrel-47547/) and [fish](https://zh.wikipedia.org/zh-hans/%E9%B3%9C%E9%B1%BC) images.\n", + "- cartoon [squirrel and fish](https://ideogram.ai/g/zgoma01ASS21U1YwIC7MrA/2) generated by [ideogram.ai](http://ideogram.ai) [with permission](https://x.com/ideogram_ai/status/1697428471184515316?s=20).\n", + "- The remaining pictures are personal photos taken by the authors, long after the models were trained." + ], + "metadata": { + "id": "etDZ3sl4kZ_q" + } + } + ] +}