Spaces:

flax-community
/

dalle-mini

Running

File size: 9,965 Bytes

16f038a

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d0b72877",
   "metadata": {},
   "source": [
    "# vqgan-jax-encoding-with-captions"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "875c82b3",
   "metadata": {},
   "source": [
    "Notebook based on [vqgan-jax-reconstruction](https://colab.research.google.com/drive/1mdXXsMbV6K_LTvCh3IImRsFIWcKU5m1w?usp=sharing) by @surajpatil.\n",
    "\n",
    "We process a `tsv` file with `image_file` and `caption` fields, and write a new `tsv` that adds an `encoding` column with the indices extracted from a VQGAN-JAX model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3b59489e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import io\n",
    "\n",
    "import requests\n",
    "from PIL import Image\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "\n",
    "import torch\n",
    "import torchvision.transforms as T\n",
    "import torchvision.transforms.functional as TF\n",
    "from torchvision.transforms import InterpolationMode\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "\n",
    "import jax\n",
    "from jax import pmap"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "511c3b9e",
   "metadata": {},
   "source": [
    "## VQGAN-JAX model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bb408f6c",
   "metadata": {},
   "source": [
    "`dalle_mini` is a local package that contains the VQGAN-JAX model and other utilities."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "2ca50dc7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from dalle_mini.vqgan_jax.modeling_flax_vqgan import VQModel"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b60da9a",
   "metadata": {},
   "source": [
    "We'll use a VQGAN trained by using Taming Transformers and converted to a JAX model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "29ce8b15",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "db406bdfc5d5428eaeae1631a04989dd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3e37f07fba6d48fca70313ae1fa8cc32",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/304M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:absl:Starting the local TPU driver.\n",
      "INFO:absl:Unable to initialize backend 'tpu_driver': Not found: Unable to find driver in registry given worker: local://\n",
      "INFO:absl:Unable to initialize backend 'gpu': Not found: Could not find registered platform with name: \"cuda\". Available platform names are: Interpreter Host TPU\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Working with z of shape (1, 256, 16, 16) = 65536 dimensions.\n"
     ]
    }
   ],
   "source": [
    "# Load the pretrained VQGAN by its model id. The recorded output below shows two files\n",
    "# being downloaded (433 B and 304 MB) -- presumably the config and the weights.\n",
    "# `model` is a notebook-level global that `encode_captioned_dataset` reads later on.\n",
    "model = VQModel.from_pretrained(\"flax-community/vqgan_f16_16384\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7c4c1e6",
   "metadata": {},
   "source": [
    "## Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7014a7ce",
   "metadata": {},
   "source": [
    "We use Luke Melas-Kyriazi's `dataset.py` which reads image paths and captions from a tsv file that contains both. We only need the images for encoding."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "85832702",
   "metadata": {},
   "outputs": [],
   "source": [
    "from dalle_mini.dataset import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "81b19eca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Locations of the CC12M inputs and of the encoded output.\n",
    "# For a quick trial, point `cc12m_list` at '/data/CC12M/images-10000.tsv' instead.\n",
    "cc12m_images, cc12m_list, cc12m_output = (\n",
    "    '/data/CC12M/images',                 # root directory of the image files\n",
    "    '/data/CC12M/images-list-clean.tsv',  # input list of images and captions\n",
    "    '/data/CC12M/images-encoded.tsv',     # destination of the encoded rows\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fecc9a00",
   "metadata": {},
   "outputs": [],
   "source": [
    "image_size = 256\n",
    "\n",
    "\n",
    "def image_transform(image):\n",
    "    \"\"\"Resize, center-crop and convert `image` into a channels-last NumPy array.\n",
    "\n",
    "    The shorter side is scaled to `image_size` with Lanczos resampling, the result is\n",
    "    center-cropped to `image_size` x `image_size`, converted with `ToTensor`, and returned\n",
    "    with a leading axis of length 1 and the channel axis moved last.\n",
    "    \"\"\"\n",
    "    width, height = image.size[0], image.size[1]\n",
    "    scale = image_size / min(image.size)\n",
    "    # `TF.resize` takes (height, width), the reverse of the order in `image.size`.\n",
    "    target_hw = (round(scale * height), round(scale * width))\n",
    "    resized = TF.resize(image, target_hw, interpolation=InterpolationMode.LANCZOS)\n",
    "    cropped = TF.center_crop(resized, output_size=[image_size, image_size])\n",
    "    # Add the leading axis, then reorder (0, 1, 2, 3) -> (0, 2, 3, 1).\n",
    "    batched = T.ToTensor()(cropped).unsqueeze(0)\n",
    "    return batched.permute(0, 2, 3, 1).numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4ce2211f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the dataset from the image root and the tsv list defined above.\n",
    "# `include_captions=False` because only the images are needed for encoding; the captions\n",
    "# are read back later from `dataset.captions` when the output rows are written.\n",
    "dataset = CaptionDataset(\n",
    "    images_root=cc12m_images,\n",
    "    captions_path=cc12m_list,\n",
    "    image_transform=image_transform,\n",
    "    image_transform_type='torchvision',\n",
    "    include_captions=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "cc922704",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8592141"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: number of entries the dataset reports (presumably one per tsv row).\n",
    "len(dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "62ad01c3",
   "metadata": {},
   "source": [
    "## Encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "88f36d0b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def encode(model, batch):\n",
    "    \"\"\"Return the indices that `model.encode` produces for `batch`.\n",
    "\n",
    "    `model.encode` must return exactly two values; the first is discarded and the second\n",
    "    (the indices) is returned unchanged.\n",
    "    \"\"\"\n",
    "    _discarded, codes = model.encode(batch)\n",
    "    return codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1f35f0cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def superbatch_generator(dataloader, num_tpus):\n",
    "    \"\"\"Yield stacks of up to `num_tpus` consecutive batches drawn from `dataloader`.\n",
    "\n",
    "    Every batch is passed through `squeeze(1)` and the group is stacked along a new first\n",
    "    axis. After the first batch of a group, batches whose length differs from\n",
    "    `dataloader.batch_size` are left out, so the final group may hold fewer than\n",
    "    `num_tpus` batches.\n",
    "    \"\"\"\n",
    "    batches = iter(dataloader)\n",
    "    for head in batches:\n",
    "        group = [head.squeeze(1)]\n",
    "        for _ in range(num_tpus - 1):\n",
    "            try:\n",
    "                follower = next(batches)\n",
    "            except StopIteration:\n",
    "                # The loader ran dry: emit whatever has been gathered so far.\n",
    "                break\n",
    "            if follower is None:\n",
    "                break\n",
    "            # Skip incomplete last batch\n",
    "            if follower.shape[0] == dataloader.batch_size:\n",
    "                group.append(follower.squeeze(1))\n",
    "        yield torch.stack(group, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "2210705b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "def encode_captioned_dataset(dataset, output_tsv, batch_size=32, num_workers=16):\n",
    "    \"\"\"Encode the images of `dataset` with the global `model` and write them to `output_tsv`.\n",
    "\n",
    "    Each output row holds `image_file`, `caption` and `encoding`, the latter being the\n",
    "    comma-separated indices returned by `encode`. Nothing is written when `output_tsv`\n",
    "    already exists.\n",
    "\n",
    "    Args:\n",
    "        dataset: dataset with a length and a `captions` table providing `image_file` and\n",
    "            `caption` columns; rows are assumed to be in the order the images are yielded.\n",
    "        output_tsv: path of the tab-separated file to create.\n",
    "        batch_size: number of images per batch, i.e. per device and step.\n",
    "        num_workers: number of `DataLoader` workers.\n",
    "    \"\"\"\n",
    "    if os.path.isfile(output_tsv):\n",
    "        print(f\"Destination file {output_tsv} already exists, please move away.\")\n",
    "        return\n",
    "\n",
    "    # Use the devices that are actually attached instead of assuming exactly 8.\n",
    "    num_devices = jax.local_device_count()\n",
    "    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)\n",
    "    superbatches = superbatch_generator(dataloader, num_tpus=num_devices)\n",
    "\n",
    "    p_encoder = pmap(lambda batch: encode(model, batch))\n",
    "\n",
    "    # Ceiling division: the last superbatch may hold fewer than `num_devices` batches and\n",
    "    # must still be encoded, otherwise the tail of the dataset is silently dropped.\n",
    "    total_superbatches = -(-len(dataloader) // num_devices)\n",
    "\n",
    "    # We save each superbatch to avoid reallocation of buffers as we process them.\n",
    "    # We keep the file open to prevent excessive file seeks.\n",
    "    with open(output_tsv, \"w\") as file:\n",
    "        # Position of the next unwritten row of `dataset.captions`. It advances by the\n",
    "        # number of rows actually encoded, so partial superbatches stay aligned. The only\n",
    "        # batch `superbatch_generator` can skip is an incomplete final one, which comes\n",
    "        # after every row written here.\n",
    "        next_row = 0\n",
    "        for n, superbatch in enumerate(tqdm(superbatches, total=total_superbatches)):\n",
    "            # Bring the result to the host once rather than once per row.\n",
    "            encoded = np.asarray(p_encoder(superbatch.numpy()))\n",
    "            encoded = encoded.reshape(-1, encoded.shape[-1])\n",
    "\n",
    "            # Extract fields from the dataset internal `captions` property, and save to disk\n",
    "            end_row = next_row + len(encoded)\n",
    "            paths = dataset.captions[\"image_file\"].iloc[next_row:end_row].values\n",
    "            captions = dataset.captions[\"caption\"].iloc[next_row:end_row].values\n",
    "            next_row = end_row\n",
    "\n",
    "            encoded_as_string = [\n",
    "                np.array2string(item, separator=',', max_line_width=50000, formatter={'int': str})\n",
    "                for item in encoded\n",
    "            ]\n",
    "            batch_df = pd.DataFrame.from_dict({\"image_file\": paths, \"caption\": captions, \"encoding\": encoded_as_string})\n",
    "            batch_df.to_csv(file, sep='\\t', header=(n == 0), index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7704863d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  4%|██▋                                                                      | 621/16781 [07:09<3:02:46,  1.47it/s]"
     ]
    }
   ],
   "source": [
    "# Run the encoding over the whole dataset, writing to `cc12m_output`.\n",
    "# This is a long job: the progress bar recorded below estimates about three hours left.\n",
    "encode_captioned_dataset(dataset, cc12m_output, batch_size=64, num_workers=16)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8953dd84",
   "metadata": {},
   "source": [
    "----"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}