Spaces:

ndhieunguyen
/

Lang2mol-Diff

Sleeping

App Files Files Community

ndhieunguyen committed on Jun 5, 2024

Commit

7dd9869

1 Parent(s): 925f3c0

Add application file

Browse files

Files changed (46) hide show

.gitignore +160 -0
README.md +1 -1
app.py +110 -0
checkpoints/PLAIN_ema_0.9999_360000.pt +3 -0
dataset/selfies_dict.txt +2944 -0
environment.yaml +129 -0
inference.py +202 -0
inference_submission.py +189 -0
requirements.txt +0 -0
src/__init__.py +0 -0
src/anlg_infill/anlg.py +130 -0
src/anlg_infill/mbr_eval.py +351 -0
src/anlg_infill/post_process.py +35 -0
src/anlg_infill/run_evaluation.py +81 -0
src/control_gen/baseline_control.py +500 -0
src/control_gen/eval_control.py +567 -0
src/ev.py +117 -0
src/evaluation/fcd_metric.py +54 -0
src/evaluation/fingerprint_metrics.py +81 -0
src/evaluation/mol_translation_metrics.py +129 -0
src/improved_diffusion/__init__.py +0 -0
src/improved_diffusion/dist_util.py +87 -0
src/improved_diffusion/fp16_util.py +76 -0
src/improved_diffusion/gaussian_diffusion.py +1606 -0
src/improved_diffusion/image_datasets.py +120 -0
src/improved_diffusion/logger.py +498 -0
src/improved_diffusion/losses.py +119 -0
src/improved_diffusion/nn.py +170 -0
src/improved_diffusion/resample.py +154 -0
src/improved_diffusion/respace.py +131 -0
src/improved_diffusion/rounding.py +119 -0
src/improved_diffusion/script_util.py +201 -0
src/improved_diffusion/test_util.py +108 -0
src/improved_diffusion/text_datasets.py +948 -0
src/improved_diffusion/train_util.py +445 -0
src/improved_diffusion/transformer_model.py +118 -0
src/improved_diffusion/transformer_utils.py +450 -0
src/scripts/__init__.py +0 -0
src/scripts/batch_decode.py +149 -0
src/scripts/batch_nll.py +29 -0
src/scripts/infill_util.py +355 -0
src/scripts/mydatasets.py +326 -0
src/scripts/mytokenizers.py +249 -0
src/scripts/nll.py +241 -0
src/scripts/tree_helper.py +110 -0
train.py +177 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,160 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: Lang2mol Diff
-emoji: 🏆
 colorFrom: pink
 colorTo: pink
 sdk: streamlit

 ---
 title: Lang2mol Diff
+emoji: 🧬
 colorFrom: pink
 colorTo: pink
 sdk: streamlit

app.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import torch
+import argparse
+import selfies as sf
+from tqdm import tqdm
+from transformers import T5EncoderModel
+from transformers import set_seed
+from src.scripts.mytokenizers import Tokenizer
+from src.improved_diffusion import gaussian_diffusion as gd
+from src.improved_diffusion import dist_util, logger
+from src.improved_diffusion.respace import SpacedDiffusion
+from src.improved_diffusion.transformer_model import TransformerNetModel
+from src.improved_diffusion.script_util import (
+    model_and_diffusion_defaults,
+    add_dict_to_argparser,
+)
+from src.scripts.mydatasets import Lang2molDataset_submission
+import streamlit as st
+import os
+@st.cache_resource
+def get_encoder():
+    model = T5EncoderModel.from_pretrained("QizhiPei/biot5-base-text2mol")
+    model.eval()
+    return model
+@st.cache_resource
+def get_tokenizer():
+    return Tokenizer()
+@st.cache_resource
+def get_model():
+    model = TransformerNetModel(
+        in_channels=32,
+        model_channels=128,
+        dropout=0.1,
+        vocab_size=35073,
+        hidden_size=1024,
+        num_attention_heads=16,
+        num_hidden_layers=12,
+    )
+    model.load_state_dict(
+        dist_util.load_state_dict(
+            os.path.join("checkpoints", "PLAIN_ema_0.9999_360000.pt"),
+            map_location="cpu",
+        )
+    )
+    model.eval()
+    return model
+@st.cache_resource
+def get_diffusion():
+    return SpacedDiffusion(
+        use_timesteps=[i for i in range(0, 2000, 10)],
+        betas=gd.get_named_beta_schedule("sqrt", 2000),
+        model_mean_type=(gd.ModelMeanType.START_X),
+        model_var_type=((gd.ModelVarType.FIXED_LARGE)),
+        loss_type=gd.LossType.E2E_MSE,
+        rescale_timesteps=True,
+        model_arch="transformer",
+        training_mode="e2e",
+    )
+tokenizer = get_tokenizer()
+encoder = get_encoder()
+model = get_model()
+diffusion = get_diffusion()
+sample_fn = diffusion.ddim_sample_loop
+text_input = st.text_area("Enter molecule description")
+output = tokenizer(
+    text_input,
+    max_length=256,
+    truncation=True,
+    padding="max_length",
+    add_special_tokens=True,
+    return_tensors="pt",
+    return_attention_mask=True,
+)
+caption_state = encoder(
+    input_ids=output["input_ids"],
+    attention_mask=output["attention_mask"],
+).last_hidden_state
+caption_mask = output["attention_mask"]
+outputs = sample_fn(
+    model,
+    (1, 256, 32),
+    clip_denoised=False,
+    denoised_fn=None,
+    model_kwargs={},
+    top_p=1.0,
+    progress=True,
+    caption=(caption_state, caption_mask),
+)
+logits = model.get_logits(torch.tensor(outputs))
+cands = torch.topk(logits, k=1, dim=-1)
+outputs = cands.indices
+outputs = outputs.squeeze(-1)
+outputs = tokenizer.decode(outputs)
+result = sf.decoder(
+    outputs[0].replace("<pad>", "").replace("</s>", "").replace("\t", "")
+).replace("\t", "")
+st.write(result)

checkpoints/PLAIN_ema_0.9999_360000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d77c45acf5644b5e42e68000b1b2f94a25c1f3b4eb1dde26fdfcca3d7482f11b
+size 1021819692

dataset/selfies_dict.txt ADDED Viewed

	@@ -0,0 +1,2944 @@

+[U-5]
+[V]
+[40Ca]
+[SbH3]
+[232Np]
+[127Sn]
+[SnH2+2]
+[195Pt+2]
+[21NH3]
+[SiH1+1]
+[ClH0]
+[175Yb+3]
+[184Ta]
+[Pt+1]
+[81Sr]
+[=32P]
+[116Sn]
+[C@@]
+[ClH3+2]
+[99Tc+5]
+[=Mo+4]
+[238Th]
+[141Pr]
+[SiH4]
+[/SiH2]
+[=Branch3]
+[PoH2]
+[52Fe]
+[66Cu]
+[226Rn]
+[138Xe]
+[PH4+1]
+[Zn+1]
+[V+1]
+[253Fm]
+[121IH1]
+[199Po]
+[62Cu+2]
+[12BH2]
+[I+3]
+[Te]
+[208Bi]
+[O-1]
+[Cu-3]
+[#Branch3]
+[198Au]
+[224Ra]
+[156Ho]
+[=Dy]
+[CH2]
+[\N]
+[125Sn]
+[220Ra]
+[/13C@H1]
+[Ta-1]
+[/SH0]
+[=WH4]
+[#11C]
+[65Cu]
+[169Lu]
+[=Si+2]
+[72As]
+[=U]
+[/O+1]
+[ClH1+1]
+[98Tc+4]
+[/Al-1]
+[#Ce]
+[GeH3]
+[N@]
+[107Cd]
+[202Bi]
+[CuH1+1]
+[248Cm]
+[\O]
+[=TeH2]
+[72Ge]
+[#Yb]
+[/Te]
+[=Al]
+[#121Sb]
+[246Pu]
+[18OH2]
+[=Si-2]
+[\N@+1]
+[Ni+2]
+[Nb-1]
+[171Tm]
+[Co-2]
+[71Zn]
+[/Hg]
+[PtH2]
+[86Y+3]
+[18O-2]
+[Ta+2]
+[IH1]
+[153Tb]
+[169Er+3]
+[211Bi]
+[=11C]
+[Li-1]
+[107Rh]
+[=Cu]
+[126Xe]
+[88Rb]
+[Ge]
+[123I]
+[\NH1+1]
+[Rh-3]
+[184W]
+[\CH1]
+[9C-1]
+[110Cd]
+[=AlH1]
+[MnH2]
+[Ge@H1]
+[108Ag]
+[141Pm]
+[In+3]
+[13NH2-1]
+[#Cr]
+[=P@]
+[8BH2]
+[94Zr+4]
+[130Ba]
+[13NH4+1]
+[=V]
+[12C-1]
+[Mg+2]
+[#SH1-1]
+[19F]
+[89Zr+3]
+[232Th]
+[112Cd]
+[In]
+[RhH3]
+[91Y]
+[I]
+[184Re]
+[92Sr]
+[BiH3]
+[\P+1]
+[/P-1]
+[151Sm]
+[Au-3]
+[69Ge]
+[=TeH1]
+[SmH3]
+[183Re]
+[ReH2]
+[17F]
+[122Te]
+[195Pt]
+[167Tm+3]
+[2H-1]
+[232Pa]
+[113In+3]
+[=95Tc+4]
+[=InH1]
+[Ag-1]
+[NiH2+2]
+[AuH3]
+[70Zn+2]
+[160Tb]
+[/131I]
+[14CH1]
+[35P]
+[Ni-2]
+[=W]
+[/NH1+1]
+[13OH2]
+[197Po]
+[RuH2+2]
+[39ClH1]
+[FeH1]
+[NH1]
+[7Be]
+[144Ce+4]
+[Po@]
+[33ClH1]
+[\AlH1]
+[18CH3]
+[SnH1]
+[45Ca]
+[As]
+[Sn@@]
+[/BH1-1]
+[107Pd]
+[Tm]
+[SiH2]
+[ZrH3]
+[20OH1]
+[SH1+1]
+[44Ti]
+[AlH5-2]
+[MoH1]
+[149Pr]
+[#Ta]
+[176Ta]
+[=20CH1]
+[Ru+3]
+[=W-1]
+[14C@@]
+[33PH3]
+[16OH1]
+[=GaH1]
+[53Ni]
+[35Cl-1]
+[92Zr]
+[83Kr]
+[32Cl]
+[TeH1]
+[Ir-4]
+[13N+1]
+[19BH2]
+[=18O]
+[31PH1]
+[#Dy]
+[PH1]
+[Se+4]
+[146Nd]
+[125Sb]
+[XeH1]
+[186Pt]
+[BiH2]
+[=Tc+4]
+[44Sc]
+[BiH2+2]
+[CoH3]
+[SiH1-1]
+[\PH0]
+[203Tl]
+[=Ta]
+[Ge-1]
+[Y]
+[68Ga]
+[=CoH1]
+[Cl+3]
+[=16O]
+[/As+1]
+[103Ru+2]
+[62Co]
+[207Bi]
+[191Po]
+[\F]
+[Rb]
+[113Sn]
+[Ti+2]
+[Sm+3]
+[#PH1+1]
+[V+2]
+[125Xe]
+[SbH1+1]
+[Tc+6]
+[AsH1]
+[-/Ring2]
+[#16O+1]
+[CuH1]
+[Zr-2]
+[#GeH1]
+[58Ni]
+[77Ge]
+[Co+2]
+[87Sr+2]
+[\PH2+1]
+[93Y]
+[=Mg]
+[172Ta]
+[=CrH2]
+[#Tb]
+[\2H]
+[139Cs]
+[136Nd]
+[Ca+1]
+[#P]
+[36SH2]
+[49Ca]
+[19CH3]
+[CH1-1]
+[80Br-1]
+[49Ti]
+[88Y]
+[TlH2]
+[FeH4]
+[226Ra]
+[BH4-1]
+[=14C-1]
+[13CH2+1]
+[Ge@]
+[=Zr]
+[47Ti]
+[111IH1]
+[\SH2+1]
+[/9C]
+[58Co]
+[=NH2+1]
+[206Pb]
+[12CH1]
+[93Mo]
+[34S-2]
+[77Kr]
+[/Si-1]
+[=32S]
+[240Cm]
+[249Bk]
+[20CH2]
+[128Sb]
+[Zn-2]
+[In+1]
+[203Pb]
+[18CH1]
+[GaH1]
+[\NH1-1]
+[124Sn]
+[Re]
+[/NH1]
+[/C-1]
+[94Tc]
+[118Sb]
+[186Os]
+[Co]
+[47Ca+2]
+[=SbH2]
+[Branch3]
+[30Si]
+[Ring1]
+[/Tl]
+[S-1]
+[96Mo]
+[15N]
+[SiH3-1]
+[PH3+1]
+[143Nd]
+[=SbH3]
+[\Ge]
+[36Ar]
+[=Th]
+[=Pb]
+[=Tc+3]
+[/13CH1-1]
+[AlH1]
+[141Ba]
+[177Ta]
+[BrH1+1]
+[=19O]
+[156Gd]
+[N@@H1+1]
+[16OH2]
+[N-1]
+[254Fm]
+[186Lu]
+[18C]
+[246Am]
+[#Th]
+[194Po]
+[#Mo+1]
+[=34S]
+[110Ru]
+[92Mo]
+[169Yb+3]
+[89Y+3]
+[15NH2]
+[173Yb]
+[185Ir]
+[3H+1]
+[/79Br]
+[IH0]
+[121I]
+[\15NH1]
+[=Gd]
+[=SnH1]
+[151Nd]
+[Os+7]
+[74Kr]
+[Bi-1]
+[78Kr]
+[119Sb]
+[9CH4]
+[=Ring1]
+[\SiH2]
+[#Nd]
+[19Ne]
+[#Ti]
+[=CH0]
+[95Tc]
+[138Ba]
+[16NH2]
+[31P]
+[120Xe]
+[Se@@]
+[15NH2-1]
+[Pt+4]
+[13NH3]
+[85Sr+2]
+[197Hg+2]
+[14C@]
+[Tl-3]
+[233U]
+[146Pm]
+[221Fr]
+[/Hg+1]
+[N@H1+1]
+[#12CH1]
+[AlH3-1]
+[/Ge]
+[181Ta]
+[#Y]
+[143Ce]
+[33S]
+[=La]
+[#In]
+[Cu+1]
+[Nb+3]
+[65Cu+1]
+[Zn+2]
+[\OH1+1]
+[=SH0]
+[10Be]
+[74As]
+[164Er]
+[Sn+2]
+[188W]
+[157Tb]
+[84BrH1]
+[71Se]
+[/S]
+[55Fe+3]
+[208Tl]
+[199Pt]
+[WH6]
+[151Pm]
+[AlH3-3]
+[65Zn]
+[=Ag]
+[77As]
+[Co+3]
+[132IH1]
+[Rh-1]
+[15NH1]
+[PoH1]
+[100Rh]
+[8He]
+[168Yb]
+[#Ge]
+[29Si]
+[27Mg]
+[205Bi+3]
+[109Ag]
+[13CH3-1]
+[237Np]
+[=Cd]
+[35ClH1]
+[137Cs]
+[/Se-1]
+[64Cu]
+[AlH1-1]
+[172Hf]
+[92Nb]
+[97Ru]
+[2H+1]
+[Cr+6]
+[#14N]
+[122Sn]
+[=Pr]
+[146Ce]
+[SnH2]
+[174Hf]
+[212Pb+2]
+[164Ho]
+[TaH2]
+[=Mo]
+[104Cd]
+[140Ce]
+[98Mo]
+[126Ba]
+[Sn+3]
+[=YH1]
+[137Ce]
+[85Kr]
+[222Fr]
+[CeH3]
+[111Cd+2]
+[Pd+1]
+[24Mg]
+[241Pu]
+[/80Br]
+[19O]
+[129Cs+1]
+[=PH1]
+[127I-1]
+.
+[=14C]
+[65Ga]
+[12C@]
+[GeH1]
+[Ga-3]
+[Ge-2]
+[3HH1]
+[/Br-1]
+[33SH2]
+[16OH1-1]
+[133Xe]
+[\123I]
+[#MoH1]
+[244Am]
+[LaH3]
+[\SnH3]
+[/Al+2]
+[157Gd]
+[132Ba]
+[Tl-1]
+[10BH1-1]
+[212Pb]
+[Si+1]
+[161Gd]
+[=BH2-1]
+[52Cr]
+[30PH3]
+[\CH1-1]
+[238Pu]
+[#Ta+1]
+[69Ga+3]
+[144Nd]
+[=Be]
+[97Nb]
+[#N]
+[206Tl]
+[UH3]
+[=P-1]
+[141Nd]
+[83Sr+2]
+[109Cd+2]
+[185W]
+[46Sc]
+[Ir-3]
+[32S]
+[75Se]
+[/PH1-1]
+[250Cm]
+[BiH4-1]
+[\PH3+1]
+[166Tm]
+[203Hg+1]
+[Mg]
+[Gd+2]
+[11C-1]
+[91Y+3]
+[Tb+3]
+[\C+1]
+[FeH6-4]
+[12C]
+[141Sm]
+[S]
+[ReH7]
+[P@H1]
+[/SnH2]
+[13OH1]
+[IH1+1]
+[Fe+3]
+[Ge@@H1]
+[=12CH1]
+[S@@]
+[Mo-2]
+[182W]
+[=13O]
+[190Po]
+[131La]
+[13CH1+1]
+[157Gd+3]
+[BiH1+1]
+[109In]
+[OsH3]
+[#Si+1]
+[137Ba]
+[211Po]
+[130I-1]
+[/123I]
+[Kr]
+[228Rn]
+[25Mg]
+[13CH1]
+[Sc]
+[Rn]
+[\I]
+[228Ac]
+[22Na+1]
+[Cu]
+[=Tc+2]
+[Ti-1]
+[55Fe+2]
+[=Se]
+[Ni+1]
+[Po]
+[149Eu]
+[ThH2]
+[=S]
+[CoH1+2]
+[#Cl]
+[#SiH1]
+[13CH3+1]
+[224Ac]
+[60Co+3]
+[\As]
+[9Be]
+[BH1]
+[245Pu]
+[#PH2]
+[249Cm]
+[138La]
+[#Branch2]
+[SiH3+1]
+[231Th]
+[-\Ring1]
+[122IH1]
+[117Sn+4]
+[180Os]
+[126Sb]
+[209Tl]
+[\Si]
+[\Sn]
+[67Ga+3]
+[=Ca]
+[208Pb]
+[137Ba+2]
+[99Tc]
+[Ru+8]
+[\11C]
+[=FeH1]
+[BH2+1]
+[IH2+1]
+[243Pu]
+[32PH2]
+[MoH2]
+[TiH2]
+[/Al+1]
+[237Pu]
+[\76Br]
+[H]
+[B-2]
+[WH2]
+[Nb]
+[GaH2]
+[\Pb]
+[60Ni]
+[238Cm]
+[\C@@H1]
+[218AtH1]
+[P@H1+1]
+[Co-1]
+[\Sn+1]
+[159Ho]
+[BH2]
+[11B-1]
+[Ta-2]
+[70Ge]
+[/34S]
+[134IH1]
+[Rb+1]
+[153Gd]
+[135La]
+[=Al-1]
+[YbH2]
+[/127I]
+[Ho+3]
+[44Sc+3]
+[48V]
+[104Ag]
+[ClH2+2]
+[12B]
+[ReH3]
+[43K+1]
+[=NH0]
+[\N-1]
+[22CH3-1]
+[Bi+2]
+[82Kr]
+[102Rh]
+[#Sc]
+[192Po]
+[228Th+4]
+[225Ra]
+[/Sn+3]
+[31PH3]
+[#Ga]
+[101Mo]
+[232U]
+[BiH1]
+[220Fr]
+[#17O+1]
+[128Sn]
+[18FH1]
+[SiH2-2]
+[=16N]
+[75As]
+[99Tc+4]
+[210Pb]
+[BrH1]
+[\Bi]
+[SnH3+1]
+[\CH2+1]
+[Al-3]
+[254Es]
+[66Zn+2]
+[S@@H1]
+[Ni-3]
+[94Nb]
+[217Bi]
+[11C]
+[166Tb]
+[CH3]
+[175Hf]
+[AlH1+1]
+[SbH2+1]
+[162Ho]
+[90Mo]
+[Os+4]
+[=Si-1]
+[204Tl]
+[13CH1-1]
+[U+3]
+[\P@]
+[Cl+1]
+[155Eu]
+[215Po]
+[33PH1]
+[Cd]
+[AtH1]
+[57Fe]
+[/CH2-1]
+[142La]
+[Se-1]
+[14CH2]
+[Cu-4]
+[Sr+2]
+[/C]
+[35Cl]
+[191Pt+2]
+[169Er]
+[15NH4+1]
+[23Na]
+[38Ar]
+[/Sn+2]
+[143La]
+[43Ca]
+[\I+1]
+[213BiH1]
+[SH2+1]
+[13C@@]
+[14CH3]
+[194Hg]
+[70Se]
+[Zr+3]
+[18O]
+[=Ru]
+[EuH2]
+[#13C]
+[SiH3]
+[=13C]
+[\14C@H1]
+[-\Ring2]
+[14C]
+[/15N]
+[\-Ring3]
+[14CH4]
+[46Ca]
+[10B]
+[#B]
+[66Zn]
+[#Sb]
+[Os+1]
+[=99Tc+2]
+[#17C-1]
+[Au]
+[75SeH1]
+[179Ta]
+[139Pr]
+[89Y]
+[Branch2]
+[/O-1]
+[200Bi]
+[2HH1]
+[=13CH1]
+[Fr]
+[166Yb]
+[239Pu]
+[11CH3-1]
+[103Ru]
+[61Co]
+[106Pd]
+[103Rh]
+[35SH1]
+[Sb]
+[18OH3+1]
+[47V]
+[50Cr+3]
+[121Sn]
+[171Lu]
+[184Hf]
+[110In]
+[247Bk]
+[AsH2]
+[184Os]
+[Er+3]
+[86Zr]
+[#Ni]
+[126I]
+[14NH3]
+[32PH3]
+[Si-1]
+[125Te]
+[#Ru]
+[Ru-2]
+[76Br-1]
+[227Ra]
+[/OH0]
+[=14CH2]
+[NH0]
+[227Ac]
+[234Pa]
+[OsH1-1]
+[69Ga]
+[182Re]
+[U+4]
+[239Np]
+[WH3]
+[Ru+2]
+[/N@+1]
+[=In]
+[201Bi]
+[126Sb+3]
+[Pd-1]
+[#188Re]
+[=C]
+[OsH1]
+[45Sc]
+[/S-1]
+[=99Tc+1]
+[=VH1]
+[GeH2-1]
+[/NH2+1]
+[NbH3]
+[Sn-1]
+[230U]
+[37SH2]
+[180W]
+[105Ag]
+[67Ge]
+[91Zr]
+[Tb+4]
+[\14CH1]
+[=WH1]
+[UH2]
+[258Md]
+[Dy+3]
+[220Rn]
+[TeH3]
+[86Sr]
+[#Branch1]
+[=15NH2+1]
+[#Br]
+[42Ca]
+[46Ti]
+[IrH1]
+[133I-1]
+[3H]
+[/Se]
+[/Ga]
+[11CH4]
+[Bi+1]
+[MnH1]
+[#18CH1]
+[Zn-4]
+[156Sm]
+[113Ag]
+[\BiH1]
+[128Xe]
+[175Ta]
+[\NH3+1]
+[=SeH1]
+[69Zn]
+[\Al]
+[#W+1]
+[233Np]
+[253Cf]
+[134Cs]
+[\Br]
+[253Es]
+[C@@H1]
+[#13N]
+[/P@]
+[173Ta]
+[Nb+2]
+[VH1]
+[126I-1]
+[121I-1]
+[207At]
+[\S]
+[182Os]
+[7Li]
+[SH1]
+[/AlH1+1]
+[115In]
+[AlH4-1]
+[59Ni]
+[123IH1]
+[FH1+1]
+[82Br-1]
+[Cl@@-1]
+[137Pr]
+[SbH5]
+[67Zn+2]
+[132I-1]
+[\SiH3]
+[AlH3]
+[AsH3]
+[111In-1]
+[/76Br]
+[164Dy+3]
+[50Cr]
+[=Tc+5]
+[82Se+6]
+[SeH3+1]
+[#W-1]
+[Ir-2]
+[\13C@@H1]
+[/AlH2]
+[99Mo]
+[/14C@H1]
+[76Br]
+[Ag]
+[145Eu]
+[135I]
+[/PH1]
+[141Ce+3]
+[84Sr]
+[B+2]
+[Th+2]
+[117SnH2]
+[=64Zn]
+[Mg+1]
+[38Cl-1]
+[140Ba]
+[22Ne]
+[118Sn]
+[145Pr]
+[202Pb]
+[125Sn+4]
+[61Ni]
+[233U+4]
+[/18F]
+[SeH1-1]
+[12CH4]
+[Cu-5]
+[/NH0]
+[=SH1+1]
+[#U]
+[153Sm]
+[76Ge]
+[207Tl]
+[BiH5]
+[Ru+4]
+[ZrH1]
+[131I]
+[81Kr]
+[66Ge]
+[9C]
+[193Os]
+[59Co]
+[Pb]
+[Cr-1]
+[95Zr]
+[Gd+3]
+[#PbH1]
+[18OH1]
+[134La]
+[15CH2]
+[Al+2]
+[214Pb]
+[17NH3]
+[134Ba]
+[\Si+1]
+[17B]
+[145Pm]
+[/12C]
+[Tl+1]
+[=Fe]
+[170Lu]
+[182Ta]
+[95Nb]
+[SnH4+2]
+[=As+1]
+[\CH0]
+[#S]
+[79Rb]
+[47Sc]
+[49V]
+[Nb-2]
+[=As]
+[81Se]
+[19FH1]
+[75Ge]
+[99Y]
+[79Br]
+[193Au]
+[210BiH3]
+[73Se]
+[54Mn]
+[51Ti]
+[ClH2+1]
+[90Sr+2]
+[TiH1+1]
+[129IH1]
+[/15N+1]
+[Fe+2]
+[199Hg]
+[74Br-1]
+[\15NH2]
+[85Rb+1]
+[42K+1]
+[203Tl+1]
+[#Er]
+[=76As]
+[SnH4]
+[/C@@]
+[182Ir]
+[VH2]
+[150Nd]
+[PH2+1]
+[137La]
+[135Xe]
+[179Hf]
+[HgH1]
+[Nd+3]
+[#O+1]
+[ReH4]
+[\Al-1]
+[Bi]
+[133Ba+2]
+[138Cs+1]
+[231Pa]
+[90Zr]
+[\CH1+1]
+[105Rh]
+[166Er]
+[34Cl-1]
+[PtH2+2]
+[/CH1-1]
+[=12CH2]
+[U]
+[Zn-1]
+[/IH1]
+[=13C-1]
+[=18O+1]
+[S@@+1]
+[154Eu+3]
+[97Zr]
+[178Yb]
+[InH1]
+[24Na]
+[82Br]
+[137Xe]
+[132La]
+[218Rn]
+[37S]
+[53Mn]
+[\W]
+[CeH1]
+[RuH5]
+[/PH2+1]
+[Re-2]
+[/Po]
+[28Si]
+[135Cs+1]
+[68Ga+3]
+[Co-4]
+[Sb+5]
+[177Yb]
+[=Ti]
+[246Cf]
+[196Bi]
+[22CH3]
+[90Nb]
+[#V+1]
+[GeH2+1]
+[243Am]
+[\B]
+[#Ir+1]
+[127Xe]
+[191Ir]
+[KrH1]
+[No]
+[#La]
+[194Ir]
+[89Sr+2]
+[/13CH1]
+[185Re]
+[\Cl]
+[/N+1]
+[\S@]
+[Tc+5]
+[60Cu]
+[/C@]
+[BiH2+1]
+[193Hg]
+[102Pd]
+[=188Re]
+[AsH3+1]
+[203Bi]
+[Pr]
+[/Cl+1]
+[94Zr]
+[43K]
+[138Cs]
+[153Gd+3]
+[\-Ring2]
+[OsH6]
+[=Er]
+[MnH1+1]
+[159Gd+3]
+[12NH3]
+[67Cu]
+[/XeH1]
+[77Br-1]
+[=14N]
+[=C-1]
+[MgH1]
+[#13C-1]
+[Hg+1]
+[SeH2]
+[=99Tc+4]
+[28Al]
+[Cm]
+[82Rb+1]
+[252Cf]
+[159Dy]
+[52Fe+3]
+[Se@]
+[BH0]
+[81Rb]
+[106Rh]
+[74BrH1]
+[210Bi]
+[206Bi]
+[\C@]
+[73As]
+[Cu-1]
+[\SiH2+1]
+[\Po]
+[Te+1]
+[144Ce+3]
+[41Ca+2]
+[132Xe]
+[=Xe]
+[87Y]
+[187Ir]
+[Br-1]
+[17O-1]
+[Cl+2]
+[229Th]
+[#Re]
+[146Eu]
+[238Am]
+[79Se]
+[136Ce]
+[SbH3+1]
+[58Co+2]
+[AsH2-1]
+[#C]
+[150Tb]
+[/18O]
+[109Cd]
+[B@@H1-1]
+[=11CH2]
+[124Xe]
+[1H]
+[#Nb]
+[219Rn]
+[Al]
+[90Y]
+[Cu-2]
+[170Er]
+[15OH2]
+[149Pm]
+[=O]
+[Rh]
+[228Th]
+[SbH6+3]
+[250Cf]
+[197Pb]
+[/CH2+1]
+[Pd+2]
+[12C@@]
+[10B-1]
+[#Pd]
+[=18C]
+[Ce+4]
+[\CH2-1]
+[13CH2-1]
+[181Ta+2]
+[\14C@]
+[117Cd]
+[186Ta]
+[#15N]
+[\SeH1]
+[\Se]
+[/SiH1]
+[HgH2]
+[32P+1]
+[V-1]
+[Cr+1]
+[SiH1-2]
+[=13N]
+[1H-1]
+[/35S]
+[13C-1]
+[74Se]
+[64Zn]
+[Cl]
+[142Pr]
+[72Br-1]
+[Pd]
+[200Tl]
+[92Sr+2]
+[=B-1]
+[79BrH1]
+[122I-1]
+[86Rb+1]
+[C-1]
+[187Re]
+[202Hg]
+[213Bi+3]
+[PtH3]
+[=35S]
+[39Ar]
+[13C+1]
+[152Sm+3]
+[161Ho]
+[181Hf]
+[26Mg]
+[/32P]
+[#C-1]
+[203Hg]
+[131Ba]
+[AsH4+1]
+[=SiH1]
+[FeH2]
+[227Th]
+[89Rb+1]
+[\14CH3]
+[152Tb]
+[Zr-4]
+[124IH1]
+[154Tb]
+[12CH3]
+[62Cu]
+[133I]
+[SiH2+1]
+[#SeH1]
+[39K+1]
+[As+3]
+[82BrH1]
+[/SiH3]
+[195Pb]
+[PdH1]
+[FeH3]
+[Pt-2]
+[=Mo+2]
+[/14CH1]
+[GaH4-1]
+[Ni-4]
+[Rh-2]
+[\Hg+1]
+[146Sm]
+[173Tm]
+[Pt+2]
+[P-3]
+[/I+1]
+[199Au]
+[66Ni]
+[78BrH1]
+[211Rn]
+[157Sm]
+[=Ni]
+[BrH2+1]
+[=S+1]
+[136Cs]
+[130Xe]
+[144Pr+3]
+[210At]
+[Cr+4]
+[128IH1]
+[174Lu]
+[185Ta]
+[=Y]
+[148Eu]
+[13N]
+[55Fe]
+[149Nd]
+[120IH1]
+[205Pb]
+[=125Te]
+[=GeH1]
+[=Ce]
+[90Zr+4]
+[105Pd]
+[32ClH1]
+[Mo-3]
+[/TlH1]
+[242Pu]
+[84Rb]
+[51Mn]
+[97Tc]
+[11CH3+1]
+[PbH1]
+[40K+1]
+[254Cf]
+[130IH1]
+[88Nb]
+[Ti]
+[90Y+3]
+[132Cs]
+[129Te]
+[/I-1]
+[182Hf]
+[CoH2]
+[TeH2]
+[#15O+1]
+[B]
+[131Cs+1]
+[59Co+3]
+[RhH1]
+[NiH1+1]
+[Zr-1]
+[Os-3]
+[204Hg+1]
+[193Pt+2]
+[I-1]
+[35S-1]
+[=15N]
+[\SnH1]
+[H-1]
+[108Cd]
+[11CH1]
+[176Yb]
+[TiH1]
+[48Ca]
+[=PH1+1]
+[195Ir]
+[La+3]
+[Se]
+[153Eu]
+[Hg+2]
+[138Pr]
+[Sb+1]
+[101Tc]
+[112Sn]
+[/InH2]
+[Tm+3]
+[#Zr]
+[PbH2+2]
+[\N@@+1]
+[114Cd]
+[Nb+5]
+[194Au]
+[BH4+1]
+[/GeH3]
+[66Ga]
+[\C-1]
+[96Zr]
+[204Po]
+[SiH2-1]
+[63Ni]
+[167Er]
+[234U]
+[Os+6]
+[201Po]
+[130Te]
+[/ClH1+1]
+[129I-1]
+[/Al]
+[Cr+5]
+[173Hf]
+[14C@@H1]
+[YH1]
+[57Mn]
+[111Cd]
+[102Ru]
+[/Sn]
+[21Ne]
+[160Dy]
+[139La]
+[89Sr]
+[257Fm]
+[Zn-3]
+[40PH1]
+[#Pb]
+[136Xe]
+[213Pb]
+[101Pd]
+[\BH0]
+[=17O]
+[1H+1]
+[87Kr]
+[158Gd]
+[NiH2]
+[\P@@]
+[PH1+1]
+[Al-1]
+[Cr]
+[99Tc+7]
+[#Fe+1]
+[172Yb]
+[=Ti+2]
+[235Pu]
+[\Se-1]
+[198Po]
+[134Te]
+[18CH2]
+[171Er]
+[69As]
+[/CH1+1]
+[Ho]
+[IrH2]
+[40PH3]
+[AsH5]
+[\Te+1]
+[Tc+4]
+[Te@]
+[Lr]
+[75As+3]
+[119Sn]
+[203Pb+2]
+[68Ge]
+[197Tl]
+[BH1+1]
+[15CH4]
+[209Bi]
+[75Br-1]
+[44Ca+2]
+[TeH3+1]
+[17C]
+[/14CH2-1]
+[=BiH1]
+[112In]
+[=Tc+1]
+[=15N-1]
+[61Cu+1]
+[4He]
+[51Cr]
+[Au+3]
+[=Tm]
+[222Rn]
+[72Ga]
+[P@+1]
+[193Pt+4]
+[Rf]
+[=P]
+[178Lu]
+[172Er]
+[110Pd]
+[200Pt]
+[SnH1+2]
+[83Se]
+[196Po]
+[111InH3]
+[=Nd]
+[\125I]
+[Br]
+[P@@]
+[70As]
+[SbH4]
+[Fe]
+[144Pr]
+[151Eu+3]
+[45Ca+2]
+[11CH2]
+[66Ga+3]
+[Cd+2]
+[64Zn+2]
+[152Dy]
+[15O-2]
+[AlH1+2]
+[106Ag]
+[=OH1+1]
+[120I]
+[OH3+1]
+[106Cd]
+[=15N+1]
+[52V]
+[116Cd]
+[177W]
+[#Pr]
+[As+1]
+[GaH1-1]
+[230Pu]
+[=Sb+1]
+[IrH3]
+[218At]
+[234Np]
+[155Ho]
+[118Pd+2]
+[192Os]
+[/13CH2]
+[#14CH1]
+[/Te+1]
+[134Xe]
+[10BH2]
+[169Yb]
+[/37Cl]
+[76As]
+[=Ba]
+[=Re]
+[/C@H1]
+[SnH1-1]
+[\HgH1]
+[223Ac]
+[SnH3-1]
+[143Pr]
+[\IH1+1]
+[=BrH1]
+[103Cd]
+[Si@]
+[FeH6]
+[\PH1+1]
+[Pt-1]
+[#Tc+1]
+[96Nb]
+[103Pd]
+[Br+1]
+[19C]
+[=Os+2]
+[83BrH1]
+[#Tl]
+[#18C-1]
+[244Pu]
+[136Eu]
+[Mn+1]
+[54Cr]
+[\O+1]
+[S@+1]
+[201Tl]
+[\C@@]
+[SH3+1]
+[/125I]
+[144Pm]
+[123Sn]
+[Na]
+[161Tb+3]
+[68Zn]
+[=70Zn]
+[Nd]
+[/13C@@H1]
+[86Y]
+[Fe+6]
+[Al-2]
+[121Xe]
+[Mo+4]
+[Es]
+[19B]
+[115Sb]
+[38SH2]
+[14CH2-1]
+[=SiH2]
+[=Si+1]
+[201Au]
+[11CH1-1]
+[28SiH3]
+[Mo]
+[109Pd+2]
+[YH2]
+[#17CH1]
+[Au+1]
+[127Te]
+[#W]
+[S+1]
+[173Lu]
+[Xe]
+[104Pd]
+[/N]
+[SH0]
+[14O]
+[Ca-2]
+[=XeH1]
+[InH4-1]
+[Si-2]
+[AsH4]
+[99Ru+2]
+[Zn]
+[\S-1]
+[=Te]
+[Br+2]
+[198Tl]
+[25Mg+2]
+[/N-1]
+[10BH3]
+[195Pt+4]
+[236Pu]
+[I+1]
+[/SiH1-1]
+[InH2]
+[\B@-1]
+[60Fe]
+[14OH2]
+[233Pa]
+[199Tl+1]
+[Am]
+[Eu]
+[=GeH2]
+[158Tb]
+[=Hf]
+[=WH2]
+[AlH2+1]
+[Er]
+[189Pt]
+[172Tm]
+[Pt-4]
+[16CH2]
+[16N+1]
+[BH1-1]
+[148Pm]
+[225Ac]
+[=19C]
+[99Rh]
+[125I]
+[\79Br]
+[ReH1]
+[27Al+3]
+[Ir]
+[\AsH2]
+[23Na+1]
+[Md]
+[119In]
+[56Co]
+[104Rh]
+[\C@H1]
+[235U]
+[MoH3]
+[\In]
+[247Cm]
+[\O-1]
+[/P@@]
+[36Cl]
+[153Sm+3]
+[236Np]
+[164Dy]
+[U+2]
+[/Sn+1]
+[16C]
+[KH1]
+[Zr-3]
+[241Am]
+[131IH1]
+[ClH1+2]
+[121SnH2]
+[MoH5]
+[/AsH1]
+[#18O+1]
+[Re+1]
+[187Os]
+[=SiH1-1]
+[170Hf]
+[37Cl]
+[184Ir]
+[\TeH1]
+[\Sn-1]
+[/11CH3]
+[#Tm]
+[189Os]
+[48Cr]
+[120Te]
+[201Hg]
+[PH1-1]
+[=AsH2]
+[I+2]
+[\ClH1+1]
+[62Cu+1]
+[Si@@]
+[\I-1]
+[=PH0]
+[BrH0]
+[Li]
+[O+1]
+[117Sn]
+[199Tl]
+[148Nd]
+[NaH1]
+[62Zn+2]
+[S-2]
+[3He]
+[Ta+5]
+[In-1]
+[82Sr+2]
+[194Tl]
+[C]
+[GeH4]
+[36ClH1]
+[14N]
+[73Ga]
+[=99Tc+5]
+[TeH2+1]
+[SbH2]
+[210Tl]
+[13C]
+[=Tl]
+[\15N]
+[/SeH1]
+[181W]
+[9Li]
+[82Rb]
+[72Zn]
+[124Te]
+[Ac]
+[/P]
+[156Eu]
+[203PbH1]
+[110Ag]
+[144Sm]
+[Li+1]
+[Ni]
+[71Ga]
+[65Cu+2]
+[63Ni+2]
+[CuH2-1]
+[113Cd]
+[Cl@-1]
+[178Hf]
+[=S@]
+[45K]
+[127Cs+1]
+[RuH1-1]
+[171Yb]
+[TiH4]
+[58Fe+3]
+[231U]
+[Cr-2]
+[ClH1-1]
+[OH0]
+[37Ar]
+[94Y]
+[EuH3]
+[P@@H1+1]
+[P-1]
+[Co+1]
+[131Te]
+[18F-1]
+[=Mn]
+[67Cu+2]
+[200Po]
+[=14CH1]
+[Os+5]
+[86Rb]
+[SeH5]
+[Lu+3]
+[106Ru+3]
+[/C@@H1]
+[/124I]
+[=Ru+1]
+[91Sr]
+[#14C-1]
+[/GeH2]
+[15NH1-1]
+[201Pb]
+[240Pu]
+[192Bi]
+[Si@@H1]
+[38K+1]
+[As+5]
+[Cd-2]
+[197Hg]
+[=Sb]
+[CH1+1]
+[18O-1]
+[Np]
+[Ru-4]
+[F]
+[=Tc]
+[CH2-1]
+[Ir+1]
+[109Pd]
+[SnH2-1]
+[\P-1]
+[17OH1]
+[142Pm]
+[Ca-4]
+[116Te]
+[Hf]
+[7Li+1]
+[18F]
+[Cr-3]
+[/Si+1]
+[ScH3]
+[51Fe]
+[155Dy]
+[191Pt+4]
+[178Ta]
+[126Sn]
+[148Gd]
+[NH1+1]
+[94Ru]
+[123I-1]
+[38S]
+[64Ni]
+[/14CH3]
+[=Sr]
+[192Ir]
+[=Th+2]
+[Ni+3]
+[PH1-2]
+[85Br]
+[=Zn]
+[=B]
+[Au-1]
+[=RhH1]
+[211At]
+[65Zn+2]
+[OH1-1]
+[P@@+1]
+[/SH2+1]
+[BH2-1]
+[CaH2]
+[N+1]
+[113In]
+[33P]
+[InH1-1]
+[90Tc]
+[Ti+1]
+[\ClH1]
+[Pt-3]
+[213Bi]
+[170Tm+3]
+[=PH2+1]
+[/TeH1]
+[76BrH1]
+[200Pb]
+[82Se-2]
+[191Os]
+[PtH1]
+[75BrH1]
+[Db]
+[/NH1-1]
+[\PH1-1]
+[218Pb]
+[=Co]
+[/In]
+[=Yb]
+[100Tc+4]
+[NH4+1]
+[=Si]
+[Ga]
+[=Pd]
+[64Cu+1]
+[Ce]
+[86Tc]
+[Ru-1]
+[120I-1]
+[217At]
+[\GeH1]
+[234Pu]
+[TeH1+1]
+[/14CH2]
+[180Re]
+[62Ga]
+[=S@@]
+[15O]
+[59Fe+3]
+[168Er]
+[246Bk]
+[BH3+1]
+[81Br-1]
+[53Cr]
+[122I]
+[/Cl-1]
+[=100Tc+1]
+[#14C]
+[127IH1]
+[PtH1+1]
+[126IH1]
+[/-Ring1]
+[/GeH1]
+[TeH4]
+[16NH1]
+[108Pd]
+[35S-2]
+[127I]
+[161Er]
+[145Nd]
+[187W]
+[\NH1]
+[Mn-2]
+[10C]
+[=Lu]
+[38K]
+[Se+1]
+[28Mg+2]
+[135IH1]
+[227Pa]
+[238Np]
+[/S@@]
+[239U]
+[\Te]
+[\BH2-1]
+[#S+1]
+[XeH2]
+[154Gd]
+[Pa]
+[\N+1]
+[/BH0]
+[AlH2-1]
+[=Ga]
+[223Fr]
+[194Os]
+[161Tb]
+[#Bi]
+[K+1]
+[58Fe+2]
+[Ra]
+[OH1]
+[SiH3-2]
+[/18C]
+[AsH2+1]
+[147Sm]
+[SnH3]
+[AsH3-1]
+[RuH3]
+[181Os]
+[63Zn]
+[81Rb+1]
+[78As]
+[162Dy]
+[=Nb]
+[=Sn]
+[177Lu+3]
+[13NH1]
+[233Ra]
+[129I]
+[118Pd]
+[131Xe]
+[=Te-1]
+[142Ba]
+[10CH3]
+[32Si]
+[234Th]
+[250Bk]
+[\14C]
+[10CH2]
+[/15NH1]
+[135I-1]
+[157Dy]
+[Ba+2]
+[/B]
+[SbH1]
+[OH2+1]
+[15CH3]
+[Ring3]
+[WH1]
+[136Pr]
+[82Sr]
+[Sn@]
+[196Pb]
+[76Kr]
+[#Mo]
+[Os-2]
+[\Ga]
+[208Tl+1]
+[138Ce]
+[#NH1+1]
+[87Rb]
+[195Tl]
+[Zr+4]
+[8B]
+[112Ag]
+[/N@@+1]
+[150Pm]
+[106Ru]
+[13C@@H1]
+[3H-1]
+[37ClH1]
+[227Th+4]
+[IrH4]
+[16CH3]
+[/Bi]
+[Th+4]
+[AlH2-2]
+[/C+1]
+[/Sb]
+[242Cm]
+[39K]
+[155Gd]
+[Branch1]
+[=TaH1]
+[208Po]
+[98Nb]
+[196Au]
+[=Rh]
+[17NH1]
+[K]
+[57Fe+2]
+[218Po]
+[/SnH1]
+[=13CH2]
+[TlH2+1]
+[Sr]
+[88Rb+1]
+[68GaH3]
+[36SH1]
+[\SH1+1]
+[165Er]
+[/S+1]
+[RuH1]
+[=Tb]
+[Mn+3]
+[12CH2]
+[98Tc+5]
+[#99Tc]
+[/19F]
+[Be+2]
+[15C-1]
+[Os-1]
+[=MoH2]
+[191Pt]
+[134Cs+1]
+[120Sn]
+[6Li+1]
+[141Pr+3]
+[SeH1]
+[\GeH3]
+[AgH1]
+[168Tm]
+[26Al]
+[/S@]
+[ZrH2+2]
+[130Sb]
+[GeH2]
+[170Yb]
+[129Xe]
+[15N-1]
+[228Pa]
+[/Ru]
+[#B-1]
+[As-1]
+[41Ar]
+[103Ag]
+[Tc]
+[120Sb]
+[P-2]
+[/W]
+[22NH1]
+[=15NH1+1]
+[\At]
+[Pb+2]
+[242Am]
+[148Sm]
+[56Fe]
+[222Ra]
+[251Cf]
+[1HH1]
+[RuH1+2]
+[61Cu]
+[#As+1]
+[114In]
+[38PH3]
+[=12C]
+[88Kr]
+[/CH0]
+[HH1]
+[123Te]
+[F-1]
+[117Sb]
+[IH2]
+[152Sm]
+[42K]
+[189Re]
+[115Sn]
+[212Bi]
+[Mn]
+[31Si]
+[/18OH1]
+[Ba+1]
+[Ni-1]
+[245Am]
+[#Te]
+[104Tc]
+[Ir+3]
+[PdH2]
+[V+4]
+[Cr+2]
+[=Pd-3]
+[12C@H1]
+[94Mo]
+[RhH2]
+[89Zr]
+[\NH2+1]
+[13C@H1]
+[\35Cl]
+[12C@@H1]
+[TiH1+3]
+[\3H]
+[=BH0]
+[13O]
+[\14CH2]
+[205Tl]
+[167Yb]
+[27Al]
+[51Cr+3]
+[178Re]
+[Fe-3]
+[Eu+3]
+[84Kr]
+[166Ho]
+[244Cf]
+[PH0]
+[111Ag]
+[=IH1]
+[51V]
+[FeH4-3]
+[NH2+1]
+[\BH3-1]
+[245Bk]
+[\SiH1]
+[151Gd]
+[100Tc]
+[/14NH1]
+[98Tc+7]
+[=Eu]
+[197Pt]
+[\BH1-1]
+[80Rb]
+[216Po]
+[Mo+2]
+[88Zr]
+[/-Ring2]
+[230Pa]
+[123Xe]
+[/Si@]
+[34S-1]
+[At]
+[Hg-1]
+[126Te]
+[44Ca]
+[Yb]
+[Fe+1]
+[/Br]
+[14N+1]
+[99Y+3]
+[75As+5]
+[100Mo]
+[205Bi]
+[Si+3]
+[=Bi+1]
+[148Tb]
+[212Ra]
+[#AsH1]
+[142Nd]
+[127Sb]
+[Sb-1]
+[=77Se]
+[17OH1-1]
+[18N]
+[128I]
+[Sb+3]
+[=Re+1]
+[20Ne]
+[TlH3]
+[151Eu]
+[/Si]
+[99Ru]
+[124I-1]
+[CrH2]
+[MoH4]
+[240U]
+[162Yb]
+[22Na]
+[AsH1-1]
+[ThH4]
+[#Os-1]
+[90Sr]
+[74Ge]
+[19OH2]
+[149Tb]
+[\13CH1]
+[43Sc]
+[188Ir]
+[255Fm]
+[197Au]
+[SeH1+1]
+[Rh+2]
+[Tl+3]
+[\Br-1]
+[36Cl-1]
+[/I]
+[121Te]
+[ClH1]
+[Sn]
+[\SH0]
+[186Re]
+[188Pt]
+[\13CH3]
+[Si]
+[15NH2+1]
+[/2H]
+[=Fe+1]
+[209BiH3]
+[152Eu]
+[/CH2]
+[20CH1]
+[38Cl]
+[Bi-2]
+[94Tc+7]
+[\GeH2]
+[11B]
+[/Si@H1]
+[68Cu]
+[#Mn]
+[181Re]
+[Os]
+[Br+3]
+[230Ra]
+[156Tb]
+[152Gd]
+[/NH3+1]
+[Bk]
+[190Os]
+[ClH4+3]
+[Cl-1]
+[\C]
+[\SiH1-1]
+[#I]
+[Lu]
+[SnH1+1]
+[162Tm]
+[236U]
+[Cr+3]
+[122Sb]
+[131Sb]
+[209Po]
+[Ar]
+[166Ho+3]
+[114Sn]
+[48Ti]
+[Ti+4]
+[121Sb]
+[190Ir]
+[W]
+[Cs]
+[SnH1+3]
+[105Rh+3]
+[Mo-1]
+[C@H1]
+[MgH2]
+[AlH2]
+[20CH3]
+[Tb]
+[92Y]
+[/15NH2]
+[#C+1]
+[17O]
+[144Ce]
+[162Er]
+[175Yb]
+[80Br]
+[127Sb+3]
+[77Se]
+[177Hf]
+[64Ga]
+[144Cs]
+[Al+1]
+[139Ba]
+[=CH1]
+[\Sb]
+[89Rb]
+[142Sm]
+[89Kr]
+[=15NH1]
+[=Branch2]
+[Y+3]
+[13NH2]
+[14NH4+1]
+[=10B]
+[67Ga]
+[=P@@]
+[57Ni]
+[CH3-1]
+[223Ra]
+[62Zn]
+[SH1-1]
+[=Ir]
+[CH2+1]
+[212PbH2]
+[GeH6-2]
+[=Ho]
+[\CH2]
+[125IH1]
+[65Ni]
+[124Sb]
+[246Cm]
+[167Dy]
+[CH0]
+[224Rn]
+[Th]
+[B-1]
+[=11CH1]
+[=106Ru]
+[LiH1]
+[241Cm]
+[=99Tc]
+[\Tl]
+[RuH1+1]
+[OsH2]
+[ZrH2]
+[93Tc]
+[71Ge]
+[Te+4]
+[143Cs]
+[140La]
+[131I-1]
+[172Lu]
+[78Se]
+[6He]
+[238U]
+[#As]
+[#Ru-1]
+[=ZrH2]
+[204Pb]
+[82Se+4]
+[205Po]
+[=B+1]
+[=CH1-1]
+[=ReH1]
+[191Os+4]
+[60Co+2]
+[Pd-2]
+[/B-1]
+[/14C]
+[Ne]
+[51Cr+6]
+[SeH3]
+[183Hf]
+[\AlH2]
+[Ru]
+[B@-1]
+[186W]
+[S@]
+[SiH4-1]
+[194Pb]
+[239Th]
+[105Ru]
+[SbH1-1]
+[=BH1-1]
+[107Ag]
+[115Ag]
+[PtH4]
+[154Eu]
+[14NH1]
+[BiH4]
+[70Zn]
+[#Al]
+[\AsH1]
+[174Hf+4]
+[#15N+1]
+[CH1]
+[157Lu]
+[89Nb]
+[GeH5-1]
+[50Ti]
+[207Po]
+[31P-3]
+[\S@@]
+[47Ca]
+[Dy]
+[Ag+1]
+[147Pr]
+[=238U]
+[139Nd]
+[CrH1+2]
+[230Th]
+[216Bi]
+[OH1+1]
+[55Co]
+[#Se]
+[83Sr]
+[158Dy]
+[#Co]
+[35SH2]
+[C@]
+[185Os]
+[161Dy]
+[/F]
+[\SbH1]
+[210Po]
+[34ClH1]
+[\-Ring1]
+[125Te+4]
+[141La]
+[NH2-1]
+[30S]
+[166Dy]
+[11CH3]
+[TlH1]
+[OsH4]
+[Re-1]
+[AlH6-3]
+[202Po]
+[=C+1]
+[=Se+1]
+[N]
+[32SH2]
+[=Branch1]
+[P@@H1]
+[Pd-3]
+[17OH2]
+[Si+2]
+[#Tc]
+[188Os]
+[195Hg]
+[244Cm]
+[133Ba]
+[PH2-1]
+[15NH1+1]
+[6Li]
+[138Nd]
+[PbH3]
+[10CH4]
+[#Os+2]
+[22CH2]
+[/At]
+[214Bi]
+[228Ra]
+[Ba]
+[14C-1]
+[Cs+1]
+[239Am]
+[85Sr]
+[/OH1+1]
+[29Al]
+[NbH2]
+[70Ga]
+[59Fe]
+[RuH1+3]
+[111Sn]
+[Ta]
+[112Pd]
+[Rh+3]
+[Ru-3]
+[245Cm]
+[=N]
+[Ge+4]
+[\13CH2]
+[=SiH1+1]
+[59Fe+2]
+[202Tl]
+[117Sn+2]
+[40Ar]
+[156Dy]
+[79Rb+1]
+[/HgH1]
+[15N+1]
+[O]
+[125I-1]
+[99Tc+6]
+[186Ir]
+[SiH1]
+[/13C]
+[/SnH3]
+[131Cs]
+[111In+3]
+[Pm]
+[150Sm]
+[117In]
+[20C]
+[194Bi]
+[16O]
+[Si+4]
+[=I]
+[Mo+1]
+[Pr+3]
+[80Kr]
+[=10CH1]
+[49Cr]
+[248Cf]
+[160Gd]
+[Ca]
+[132Te]
+[/P+1]
+[48Sc]
+[=RuH1]
+[150Eu]
+[79Kr]
+[Al+3]
+[#Si]
+[Ca+2]
+[SeH2+1]
+[#Si-1]
+[Ga-1]
+[/OH2+1]
+[Se-2]
+[195Au]
+[102Ag]
+[#P+1]
+[115Cd]
+[14NH2]
+[=RuH2]
+[243Cm]
+[Se+6]
+[209Pb]
+[Ge@@]
+[ClH3+3]
+[16NH3]
+[248Am]
+[#34S+1]
+[12N+1]
+[#WH1]
+[135Ce]
+[240Am]
+[=SbH1]
+[SbH4+1]
+[32PH1]
+[80Sr]
+[=CH1+1]
+[=33S]
+[ZnH2]
+[\Se+1]
+[11BH3]
+[203Hg+2]
+[15OH1]
+[Tl]
+[Hs]
+[/PH0]
+[87Sr]
+[=N+1]
+[\Hg]
+[=15O]
+[100Pd]
+[10CH1]
+[Pd-4]
+[98Tc]
+[226Ac]
+[13CH2]
+[#Lu]
+[B@H1-1]
+[240Np]
+[110Ag+1]
+[137Cs+1]
+[=15CH1]
+[147Eu]
+[257Md]
+[#Hf+1]
+[=Mn-1]
+[\OH0]
+[=SnH2]
+[Se@@H1]
+[Zr]
+[32SH1]
+[#TaH1]
+[198Au+3]
+[38ClH1]
+[33SH1]
+[\Cl-1]
+[38PH1]
+[11C@H1]
+[9CH1]
+[134Ce]
+[Si@H1]
+[=Au]
+[AsH1+1]
+[15CH1]
+[/PH1+1]
+[Ce+3]
+[CoH1+1]
+[Os+8]
+[/125Te]
+[145Gd]
+[204Hg]
+[=Pt]
+[#13CH1]
+[W+2]
+[RuH2]
+[#Sn]
+[=Ge]
+[Tc+7]
+[37Cl-1]
+[237U]
+[16N]
+[/Si-2]
+[63Cu]
+[WH4]
+[Yb+2]
+[=SH1-1]
+[121Sn+2]
+[176Hf]
+[217Po]
+[177Lu]
+[176Lu]
+[78Ge]
+[130Cs+1]
+[211Pb]
+[Hg]
+[81Br]
+[=NiH1]
+[116In]
+[GeH3-1]
+[45Ti]
+[15C]
+[=OsH1]
+[BH3-1]
+[128Ba]
+[165Tm]
+[40K]
+[SnH2+1]
+[=Sm]
+[41K]
+[154Sm]
+[158Eu]
+[97Mo]
+[116Sb]
+[207Pb]
+[11C@@H1]
+[Ti+3]
+[Eu+2]
+[=14NH1]
+[=IH2]
+[142Ce]
+[=14O]
+[Cd-1]
+[Os+2]
+[#Os-2]
+[Sn+4]
+[Fe-2]
+[P]
+[226Th]
+[SrH2]
+[34SH2]
+[193Ir]
+[TaH3]
+[N@@+1]
+[41Ca]
+[125Cs]
+[200Au]
+[155Tb]
+[13CH4]
+[34SH1]
+[#Ring2]
+[111In]
+[=235U+2]
+[149Sm]
+[19CH2]
+[132Cs+1]
+[44K]
+[18OH1-1]
+[=Ring3]
+[/CH1]
+[64Cu+2]
+[159Gd]
+[\OH2+1]
+[#11CH1]
+[=U+2]
+[82Se]
+[RuH6]
+[249Cf]
+[Na+1]
+[O-2]
+[#Zr+1]
+[201Tl+1]
+[86Kr]
+[/11C]
+[/3H]
+[As@@]
+[124I]
+[Fe-4]
+[Fe+4]
+[75Br]
+[147Nd]
+[128Te]
+[141Ce]
+[Bi+3]
+[103Pd+2]
+[198Hg]
+[199Pb]
+[101Rh]
+[=Cr]
+[136Ba]
+[127Cs]
+[135Cs]
+[56Mn]
+[NiH1]
+[55Mn]
+[=V+2]
+[178W]
+[139Ce]
+[167Tm]
+[147Pm]
+[#11C-1]
+[188Re]
+[Fm]
+[Yb+3]
+[Gd]
+[Fe+5]
+[NH2]
+[57Co]
+[88Sr+2]
+[147Gd]
+[Cf]
+[79Br-1]
+[=Sc]
+[#CH0]
+[22CH4]
+[135Ba]
+[237Am]
+[146Gd]
+[Te@@]
+[N@@]
+[/13CH3]
+[Sm]
+[73Ge]
+[71As]
+[PbH2]
+[TaH1]
+[122Xe]
+[165Dy]
+[123Sb]
+[67GaH3]
+[/Se+1]
+[B+1]
+[83Rb]
+[Cu+2]
+[13C@]
+[AuH1]
+[\P]
+[157Eu]
+[85Rb]
+[Pt]
+[235Np]
+[80BrH1]
+[\18F]
+[P@]
+[203Po]
+[125Cs+1]
+[P+1]
+[=18CH2]
+[45K+1]
+[Co-3]
+[ZnH1+1]
+[57Co+2]
+[=PbH2]
+[=Ti+1]
+[174Ta]
+[#Ho]
+[/B+1]
+[\37Cl]
+[100Tc+5]
+[2H]
+[13B]
+[155Sm]
+[#N+1]
+[NH1-1]
+[32P]
+[58Co+3]
+[/35Cl]
+[=NH1+1]
+[=Pr+1]
+[Ir+2]
+[/Pb]
+[15NH3]
+[CuH2]
+[114In+3]
+[Ru+1]
+[Fe-1]
+[198Bi]
+[SH2]
+[RhH1+2]
+[176W]
+[200Hg]
+[Hf+4]
+[10BH1]
+[Hg-2]
+[179W]
+[252Fm]
+[PbH4]
+[/O]
+[He]
+[=Hg]
+[183W]
+[157Ho]
+[Be]
+[#Ti+1]
+[Rh-4]
+[=S-1]
+[72Se]
+[#Sm]
+[=9C]
+[Be+1]
+[180Ta]
+[/-Ring3]
+[/IH1+1]
+[Ring2]
+[/H]
+[129Sb]
+[174Yb]
+[149Gd]
+[=Br]
+[Mn+2]
+[36S]
+[14C@H1]
+[34S]
+[CoH1]
+[\TlH1]
+[170Tm]
+[68Ge+4]
+[210PoH2]
+[=Os]
+[179Lu]
+[/AlH1]
+[195Po]
+[Ru+5]
+[81BrH1]
+[17FH1]
+[#S-1]
+[136Eu+3]
+[NH3+1]
+[68GaH1]
+[28Mg]
+[=O+1]
+[#Fe]
+[60Ni+2]
+[Rh+1]
+[43Ca+2]
+[/As]
+[PdH1+1]
+[141Cs]
+[=AsH1]
+[#V]
+[229Rn]
+[17CH1]
+[95Ru]
+[67Zn]
+[153Pm]
+[#P-1]
+[Bh]
+[=Cl]
+[80Se]
+[RuH4]
+[143Pm]
+[=N-1]
+[#Os]
+[N@+1]
+[/Si@@H1]
+[Sg]
+[76Se]
+[=AsH3]
+[96Tc]
+[=P+1]
+[167Ho]
+[InH3]
+[193Po]
+[165Dy+3]
+[95Y]
+[C+1]
+[=Zr+2]
+[24Na+1]
+[89Zr+4]
+[189Ir]
+[=Bi]
+[198Pb]
+[#Gd]
+[La]
+[=Hf+2]
+[B@@-1]
+[/Cl]
+[GaH3]
+[93Zr]
+[251Es]
+[111InH2]
+[175Tm]
+[/SiH2+1]
+[H+1]
+[163Dy]
+[#Eu]
+[31S]
+[16O-1]
+[Mt]
+[110Sn]
+[Ti-2]
+[54Fe]
+[Mo+3]
+[/SH1+1]
+[72BrH1]
+[=TlH1]
+[Sn+1]
+[\H]
+[14CH3-1]
+[57Co+3]
+[14CH1-1]
+[145Sm]
+[Zr+2]
+[197Hg+1]
+[Ru+6]
+[17NH4+1]
+[60Co]
+[77Br]
+[193Pt]
+[35S]
+[133IH1]
+[147Tb]
+[95Mo]
+[52Ti]
+[129Cs]
+[133Te]
+[FH0]
+[=Ring2]
+[\B-1]
+[52Mn]
+[/PH3+1]
+[58Fe]
+[177Re]
+[49Sc]
+[52Mn+2]
+[250Es]
+[=99Tc+3]
+[53Cr+6]
+[206Po]
+[Pu]
+[/Si@@]
+[130Cs]
+[=SeH2]
+[IrH1+2]
+[180Hf]
+[83Rb+1]
+[15NH3+1]
+[Ga+3]
+[56Ni]
+[\Si-1]
+[13CH3]
+[62Ni]
+[110Te]
+[93Nb]
+[Sc+3]
+[88Sr]
+[12CH1-1]
+[CH3+1]
+[\13C]
+[151Tb]
+[77BrH1]
+[\S+1]
+[PH2]
+[\NH0]

environment.yaml ADDED Viewed

	@@ -0,0 +1,129 @@

+name: molecule
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h5eee18b_5
+  - ca-certificates=2024.3.11=h06a4308_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - libffi=3.4.4=h6a678d5_0
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgfortran-ng=7.5.0=ha8ba4b0_17
+  - libgfortran4=7.5.0=ha8ba4b0_17
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libuuid=1.41.5=h5eee18b_0
+  - mpi=1.0=mpich
+  - mpi4py=3.1.4=py311hfc96bbd_0
+  - mpich=3.3.2=hc856adb_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.0.13=h7f8727e_0
+  - pip=23.3.1=py311h06a4308_0
+  - python=3.11.9=h955ad1f_0
+  - readline=8.2=h5eee18b_0
+  - setuptools=68.2.2=py311h06a4308_0
+  - sqlite=3.41.2=h5eee18b_0
+  - tk=8.6.12=h1ccaba5_0
+  - wheel=0.41.2=py311h06a4308_0
+  - xz=5.4.6=h5eee18b_0
+  - zlib=1.2.13=h5eee18b_0
+  - pip:
+      - aiohttp==3.9.5
+      - aiosignal==1.3.1
+      - annotated-types==0.6.0
+      - appdirs==1.4.4
+      - attrs==23.2.0
+      - blis==0.7.11
+      - blobfile==2.1.1
+      - catalogue==2.0.10
+      - certifi==2024.2.2
+      - charset-normalizer==3.3.2
+      - click==8.1.7
+      - cloudpathlib==0.16.0
+      - confection==0.1.4
+      - cymem==2.0.8
+      - datasets==2.19.0
+      - dill==0.3.8
+      - docker-pycreds==0.4.0
+      - fcd==1.2.2
+      - filelock==3.13.4
+      - frozenlist==1.4.1
+      - fsspec==2024.3.1
+      - gitdb==4.0.11
+      - gitpython==3.1.43
+      - huggingface-hub==0.22.2
+      - idna==3.7
+      - jinja2==3.1.3
+      - joblib==1.4.0
+      - levenshtein==0.25.1
+      - lxml==4.9.4
+      - markupsafe==2.1.5
+      - mpmath==1.3.0
+      - multidict==6.0.5
+      - multiprocess==0.70.16
+      - murmurhash==1.0.10
+      - networkx==3.2.1
+      - nltk==3.8.1
+      - numpy==1.26.4
+      - nvidia-cublas-cu12==12.1.3.1
+      - nvidia-cuda-cupti-cu12==12.1.105
+      - nvidia-cuda-nvrtc-cu12==12.1.105
+      - nvidia-cuda-runtime-cu12==12.1.105
+      - nvidia-cudnn-cu12==8.9.2.26
+      - nvidia-cufft-cu12==11.0.2.54
+      - nvidia-curand-cu12==10.3.2.106
+      - nvidia-cusolver-cu12==11.4.5.107
+      - nvidia-cusparse-cu12==12.1.0.106
+      - nvidia-nccl-cu12==2.20.5
+      - nvidia-nvjitlink-cu12==12.4.127
+      - nvidia-nvtx-cu12==12.1.105
+      - packaging==24.0
+      - pandas==2.2.1
+      - pfzy==0.3.4
+      - pillow==10.3.0
+      - preshed==3.0.9
+      - prompt-toolkit==3.0.43
+      - protobuf==4.25.3
+      - psutil==5.9.8
+      - pyarrow==15.0.2
+      - pyarrow-hotfix==0.6
+      - pycryptodomex==3.20.0
+      - pydantic==2.6.4
+      - pydantic-core==2.16.3
+      - python-dateutil==2.9.0.post0
+      - pytz==2024.1
+      - pyyaml==6.0.1
+      - rapidfuzz==3.8.1
+      - rdkit==2023.9.5
+      - regex==2023.12.25
+      - requests==2.31.0
+      - safetensors==0.4.2
+      - scipy==1.13.0
+      - selfies==2.1.1
+      - sentry-sdk==1.44.1
+      - setproctitle==1.3.3
+      - six==1.16.0
+      - smart-open==6.4.0
+      - smmap==5.0.1
+      - spacy-legacy==3.0.12
+      - spacy-loggers==1.0.5
+      - srsly==2.4.8
+      - sympy==1.12
+      - thinc==8.2.3
+      - tokenizers==0.15.2
+      - torch==2.3.0
+      - tqdm==4.66.2
+      - transformers==4.39.3
+      - triton==2.3.0
+      - typer==0.9.4
+      - typing-extensions==4.10.0
+      - tzdata==2024.1
+      - urllib3==2.2.1
+      - wandb==0.16.6
+      - wasabi==1.1.2
+      - wcwidth==0.2.13
+      - weasel==0.3.4
+      - xxhash==3.4.1
+      - yarl==1.9.4
+prefix: /opt/conda/envs/molecule

inference.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import torch
+import argparse
+import selfies as sf
+from tqdm import tqdm
+from transformers import set_seed
+from src.scripts.mytokenizers import Tokenizer
+from src.improved_diffusion import gaussian_diffusion as gd
+from src.improved_diffusion import dist_util, logger
+from src.improved_diffusion.respace import SpacedDiffusion
+from src.improved_diffusion.transformer_model import TransformerNetModel
+from src.improved_diffusion.script_util import (
+    model_and_diffusion_defaults,
+    add_dict_to_argparser,
+)
+from src.scripts.mydatasets import Lang2molDataset_eval
+def main():
+    set_seed(42)
+    args = create_argparser().parse_args()
+    # dist_util.setup_dist()
+    logger.configure()
+    args.sigma_small = True
+    # args.diffusion_steps = 200 #500  # DEBUG
+    if args.experiment == "random1":
+        args.experiment = "random"
+    logger.log("creating model and diffusion...")
+    tokenizer = Tokenizer()
+    model = TransformerNetModel(
+        in_channels=args.model_in_channels,
+        model_channels=args.model_model_channels,
+        dropout=args.model_dropout,
+        vocab_size=len(tokenizer),
+        hidden_size=args.model_hidden_size,
+        num_attention_heads=args.model_num_attention_heads,
+        num_hidden_layers=args.model_num_hidden_layers,
+    )
+    model.eval()
+    diffusion = SpacedDiffusion(
+        use_timesteps=[i for i in range(0, args.diffusion_steps, 10)],
+        betas=gd.get_named_beta_schedule("sqrt", args.diffusion_steps),
+        model_mean_type=(gd.ModelMeanType.START_X),
+        model_var_type=((gd.ModelVarType.FIXED_LARGE)),
+        loss_type=gd.LossType.E2E_MSE,
+        rescale_timesteps=True,
+        model_arch="transformer",
+        training_mode="e2e",
+    )
+    model.load_state_dict(
+        dist_util.load_state_dict(args.model_path, map_location="cpu")
+    )
+    pytorch_total_params = sum(p.numel() for p in model.parameters())
+    logger.log(f"the parameter count is {pytorch_total_params}")
+    model.to(dist_util.dev())
+    model.eval()
+    logger.log("sampling...")
+    print("--" * 30)
+    print(f"Loading {args.split} set")
+    print("--" * 30)
+    validation_dataset = Lang2molDataset_eval(
+        dir=args.dataset_path,
+        tokenizer=tokenizer,
+        split=args.split,
+        corrupt_prob=0.0,
+        token_max_length=args.token_max_length,
+        dataset_name=args.dataset_name,
+    )
+    print("-------------------- DATASET INFO --------------------")
+    print(f"Size: {len(validation_dataset)} samples")
+    print(f'Sample shape: {validation_dataset[0]["caption_state"].shape}')
+    print(f"Use DDIM: {args.use_ddim}")
+    sample_fn = (
+        diffusion.p_sample_loop if not args.use_ddim else diffusion.ddim_sample_loop
+    )
+    print(f"Batch size: {args.batch_size}")
+    next_batch_start = args.start
+    next_batch_end = next_batch_start + args.batch_size
+    all_outputs = []
+    all_caption = []
+    all_smiles = []
+    pbar = tqdm(
+        total=len(validation_dataset) // args.batch_size + 1
+        if len(validation_dataset) % args.batch_size != 0
+        else len(validation_dataset) // args.batch_size
+    )
+    while True:
+        sample = [
+            (
+                validation_dataset[i]["caption_state"],
+                validation_dataset[i]["caption_mask"],
+                validation_dataset[i]["caption"],
+                validation_dataset[i]["smiles"],
+            )
+            for i in range(next_batch_start, next_batch_end)
+        ]
+        caption_state = torch.concat([i[0] for i in sample], dim=0)
+        caption_mask = torch.concat([i[1] for i in sample], dim=0)
+        caption = [i[2] for i in sample]
+        smiles = [i[3] for i in sample]
+        outputs = sample_fn(
+            model,
+            (args.batch_size, 256, model.in_channels),
+            clip_denoised=args.clip_denoised,
+            denoised_fn=None,
+            model_kwargs={},
+            top_p=args.top_p,
+            progress=True,
+            caption=(caption_state, caption_mask),
+        )
+        logits = model.get_logits(torch.tensor(outputs).cuda())
+        cands = torch.topk(logits, k=1, dim=-1)
+        outputs = cands.indices
+        outputs = outputs.squeeze(-1)
+        outputs = tokenizer.decode(outputs)
+        with open(args.outputdir, "a") as f:
+            for i, x in enumerate(outputs):
+                f.write(
+                    caption[i]
+                    + "\t"
+                    + smiles[i]
+                    + "\t"
+                    + sf.decoder(x.replace("<pad>", "").replace("</s>", ""))
+                    + "\n"
+                )
+        all_outputs += outputs
+        all_caption += caption
+        all_smiles += smiles
+        next_batch_start = next_batch_end
+        next_batch_end = min(next_batch_end + args.batch_size, len(validation_dataset))
+        pbar.update(1)
+        if next_batch_start == len(validation_dataset):
+            break
+    with open(args.outputdir.replace(".txt", "_final.txt"), "w") as f:
+        for i, x in enumerate(all_outputs):
+            f.write(
+                all_caption[i]
+                + "\t"
+                + all_smiles[i]
+                + "\t"
+                + sf.decoder(x.replace("<pad>", "").replace("</s>", ""))
+                + "\n"
+            )
+def create_argparser():
+    defaults = dict(
+        clip_denoised=False,
+        mbr_sample=1,
+        model_path="",
+        model_arch="conv-unet",
+        verbose="yes",
+    )
+    text_defaults = dict(
+        modality="text",
+        dataset_name="",
+        dataset_config_name="wikitext-2-raw-v1",
+        dataset_path="dataset",
+        experiment="gpt2_pre_compress",
+        model_arch="trans-unet",
+        model_in_channels=32,
+        model_model_channels=128,
+        model_dropout=0.1,
+        model_hidden_size=1024,
+        model_num_attention_heads=16,
+        model_num_hidden_layers=12,
+        preprocessing_num_workers=1,
+        emb_scale_factor=1.0,
+        clamp="clamp",
+        split="validation",
+        model_path="",
+        use_ddim=False,
+        batch_size=16,
+        top_p=1.0,
+        outputdir="output.txt",
+        diffusion_steps=2000,
+        token_max_length=256,
+        start=0,
+    )
+    defaults.update(model_and_diffusion_defaults())
+    defaults.update(text_defaults)
+    parser = argparse.ArgumentParser()
+    add_dict_to_argparser(parser, defaults)
+    return parser
+# Run the sampling pipeline only when this file is executed as a script.
+if __name__ == "__main__":
+    main()

inference_submission.py ADDED Viewed

	@@ -0,0 +1,189 @@

+import torch
+import argparse
+import selfies as sf
+from tqdm import tqdm
+from transformers import set_seed
+from src.scripts.mytokenizers import Tokenizer
+from src.improved_diffusion import gaussian_diffusion as gd
+from src.improved_diffusion import dist_util, logger
+from src.improved_diffusion.respace import SpacedDiffusion
+from src.improved_diffusion.transformer_model import TransformerNetModel
+from src.improved_diffusion.script_util import (
+    model_and_diffusion_defaults,
+    add_dict_to_argparser,
+)
+from src.scripts.mydatasets import Lang2molDataset_submission
+def main():
+    set_seed(42)
+    args = create_argparser().parse_args()
+    # dist_util.setup_dist()
+    logger.configure()
+    args.sigma_small = True
+    # args.diffusion_steps = 200 #500  # DEBUG
+    if args.experiment == "random1":
+        args.experiment = "random"
+    logger.log("creating model and diffusion...")
+    tokenizer = Tokenizer()
+    model = TransformerNetModel(
+        in_channels=args.model_in_channels,
+        model_channels=args.model_model_channels,
+        dropout=args.model_dropout,
+        vocab_size=len(tokenizer),
+        hidden_size=args.model_hidden_size,
+        num_attention_heads=args.model_num_attention_heads,
+        num_hidden_layers=args.model_num_hidden_layers,
+    )
+    model.eval()
+    diffusion = SpacedDiffusion(
+        use_timesteps=[i for i in range(0, args.diffusion_steps, 10)],
+        betas=gd.get_named_beta_schedule("sqrt", args.diffusion_steps),
+        model_mean_type=(gd.ModelMeanType.START_X),
+        model_var_type=((gd.ModelVarType.FIXED_LARGE)),
+        loss_type=gd.LossType.E2E_MSE,
+        rescale_timesteps=True,
+        model_arch="transformer",
+        training_mode="e2e",
+    )
+    model.load_state_dict(
+        dist_util.load_state_dict(args.model_path, map_location="cpu")
+    )
+    pytorch_total_params = sum(p.numel() for p in model.parameters())
+    logger.log(f"the parameter count is {pytorch_total_params}")
+    model.to(dist_util.dev())
+    model.eval()
+    logger.log("sampling...")
+    print("--" * 30)
+    print(f"Loading {args.split} set")
+    print("--" * 30)
+    validation_dataset = Lang2molDataset_submission(
+        dir=args.dataset_path,
+        tokenizer=tokenizer,
+        split=args.split,
+        corrupt_prob=0.0,
+        token_max_length=args.token_max_length,
+        dataset_name=args.dataset_name,
+    )
+    print("-------------------- DATASET INFO --------------------")
+    print(f"Size: {len(validation_dataset)} samples")
+    print(f'Sample shape: {validation_dataset[0]["caption_state"].shape}')
+    print(f"Use DDIM: {args.use_ddim}")
+    sample_fn = (
+        diffusion.p_sample_loop if not args.use_ddim else diffusion.ddim_sample_loop
+    )
+    print(f"Batch size: {args.batch_size}")
+    next_batch_start = args.start
+    next_batch_end = next_batch_start + args.batch_size
+    all_outputs = []
+    all_caption = []
+    pbar = tqdm(
+        total=len(validation_dataset) // args.batch_size + 1
+        if len(validation_dataset) % args.batch_size != 0
+        else len(validation_dataset) // args.batch_size
+    )
+    while True:
+        sample = [
+            (
+                validation_dataset[i]["caption_state"],
+                validation_dataset[i]["caption_mask"],
+                validation_dataset[i]["caption"],
+            )
+            for i in range(next_batch_start, next_batch_end)
+        ]
+        caption_state = torch.concat([i[0] for i in sample], dim=0)
+        caption_mask = torch.concat([i[1] for i in sample], dim=0)
+        caption = [i[2] for i in sample]
+        outputs = sample_fn(
+            model,
+            (args.batch_size, 256, model.in_channels),
+            clip_denoised=args.clip_denoised,
+            denoised_fn=None,
+            model_kwargs={},
+            top_p=args.top_p,
+            progress=True,
+            caption=(caption_state, caption_mask),
+        )
+        logits = model.get_logits(torch.tensor(outputs).cuda())
+        cands = torch.topk(logits, k=1, dim=-1)
+        outputs = cands.indices
+        outputs = outputs.squeeze(-1)
+        outputs = tokenizer.decode(outputs)
+        with open(args.outputdir, "a") as f:
+            for i, x in enumerate(outputs):
+                f.write(
+                    sf.decoder(
+                        x.replace("<pad>", "").replace("</s>", "").replace("\t", "")
+                    ).replace("\t", "")
+                    + "\n"
+                )
+        all_outputs += outputs
+        all_caption += caption
+        next_batch_start = next_batch_end
+        next_batch_end = min(next_batch_end + args.batch_size, len(validation_dataset))
+        pbar.update(1)
+        if next_batch_start == len(validation_dataset):
+            break
+    with open(args.outputdir.replace(".txt", "_final.txt"), "w") as f:
+        for i, x in enumerate(all_outputs):
+            f.write(sf.decoder(x.replace("<pad>", "").replace("</s>", "")) + "\n")
+def create_argparser():
+    defaults = dict(
+        clip_denoised=False,
+        mbr_sample=1,
+        model_path="",
+        model_arch="conv-unet",
+        verbose="yes",
+    )
+    text_defaults = dict(
+        modality="text",
+        dataset_name="language-plus-molecules/LPM-24_eval-molgen",
+        dataset_config_name="wikitext-2-raw-v1",
+        dataset_path="dataset",
+        experiment="gpt2_pre_compress",
+        model_arch="trans-unet",
+        model_in_channels=32,
+        model_model_channels=128,
+        model_dropout=0.1,
+        model_hidden_size=1024,
+        model_num_attention_heads=16,
+        model_num_hidden_layers=12,
+        preprocessing_num_workers=1,
+        emb_scale_factor=1.0,
+        clamp="clamp",
+        split="train",
+        model_path="",
+        use_ddim=False,
+        batch_size=7,
+        top_p=1.0,
+        outputdir="output.txt",
+        diffusion_steps=2000,
+        token_max_length=256,
+        start=0,
+    )
+    defaults.update(model_and_diffusion_defaults())
+    defaults.update(text_defaults)
+    parser = argparse.ArgumentParser()
+    add_dict_to_argparser(parser, defaults)
+    return parser
+# Run the submission sampling pipeline only when this file is executed as a script.
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

Binary file (128 Bytes). View file

src/__init__.py ADDED Viewed

File without changes

src/anlg_infill/anlg.py ADDED Viewed

	@@ -0,0 +1,130 @@

+import json
+import sys, os, torch
+from spacy.lang.en import English
+from improved_diffusion.rounding import rounding_func, load_models, load_tokenizer
+from transformers import AutoModelForCausalLM
+# read files.
+# with open('diffusion_lm/ROCstory/anlg/anlg/dev_cleanup.json', 'r') as f:
+SPLIT = 'test'
+if SPLIT == 'val':
+    source_file = 'diffusion_lm/ROCstory/anlg/anlg/dev_cleanup.json'
+elif SPLIT == 'test':
+    source_file = 'diffusion_lm/ROCstory/anlg/anlg/test_cleanup_no_label.json'
+else:
+    assert False, "invalid split"
+with open(source_file, 'r') as f:
+    sent_lst = json.load(f)
+nlp = English()
+tokenizer = nlp.tokenizer
+MODE = 'ar'
+'''
+ "00b9adb2-b3b6-4737-902a-50f308bac4b5-1": {
+        "gold_labels": [
+            "I put my baby in the car and drove around.",
+            "I realized he needed his blanket, which I had forgotten at a faraway hotel.",
+            "I took a drive to get my baby to sleep.",
+            "I took my baby for a drive and she fell asleep in the car."
+        ],
+        "obs1": "My baby would not go to sleep last night.",
+        "obs2": "I wound up driving for hours."
+    },
+'''
+print(len(sent_lst))
+if MODE == 'ar':
+    model_name = 'predictability/diff_models/roc_e=20_b=32_m=gpt2_wikitext-103-raw-v1_101_wp_pad_infill'
+    model_name = 'predictability/diff_models/roc_e=6_b=10_m=gpt2_wikitext-103-raw-v1_101_wp_pad_infill_v2'
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,  # path to the AR model trained for LMing this task.
+    ).cuda()
+    tokenizer2 = load_tokenizer('roc', 'random',
+                               'predictability/diffusion_models_v7/diff_roc_pad_rand16_transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd108_xstart')
+    vocab = {v: k for k, v in tokenizer2.items()}
+    print(len(tokenizer2), len(vocab), 'loaded vocabs')
+    outfile='ar_sample_full_test_v2.json'
+    filehandle = open(outfile, 'w')
+for idx, (key, val) in enumerate(sent_lst.items()):
+    # if idx <= 499:
+    #     continue
+    # if idx >= 500:
+    #     continue
+    # if idx != 684:
+    #     continue
+    if MODE == 'diff':
+        partial_seq = f"{val['obs1']} " + "PAD "*10 + f"{val['obs2']}"
+        word_lst = [x.text for x in tokenizer(partial_seq)]
+        partial_seq = " ".join(word_lst)
+        print(partial_seq, idx)
+        # partial_seq = "Brenna and I used to be best friends . PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD We never talked again ."
+        COMMAND = "python ../scripts/infill.py " \
+                  "--model_path predictability/diffusion_models_v7/diff_roc_pad_rand128_transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd108_xstart_e2e_long/ema_0.9999_800000.pt " \
+                  " --batch_size 50  " \
+                  f"--partial_seq \'{partial_seq}\' " \
+                  f"--eval_task_ infill --notes {SPLIT}_{idx} " \
+                  f"--out_dir ../anlg_results"
+        os.system(COMMAND)
+        torch.cuda.empty_cache()
+    elif MODE == 'ar':
+        partial_seq = f"{val['obs1']} " + f"{val['obs2']}"
+        print(partial_seq)
+        word_idx_lst = [vocab['START']] + [vocab.get(x.text, vocab['UNK']) for x in tokenizer(partial_seq)]
+        init_prompt = torch.LongTensor(word_idx_lst).cuda().unsqueeze(0)
+        print(init_prompt.shape)
+        # sample_out = model.generate(init_prompt, do_sample=True, max_length=64, top_k=len(vocab))
+        if 'sample' in outfile:
+            print('sampling 50 examples.')
+            init_prompt = init_prompt.expand(50, -1)
+            sample_out = model.generate(init_prompt, do_sample=True, max_length=64, top_k=len(vocab))
+        else:
+            sample_out = model.generate(init_prompt, do_sample=False, num_beam=4, max_length=64, top_k=len(vocab))
+        print(sample_out.shape)
+        sample_out = sample_out[:, init_prompt.size(1):]
+        # decode
+        if 'sample' in outfile:
+            sample_lst = []
+            for examp in sample_out:
+                sample = examp.tolist()
+                words_sample = [tokenizer2[s] for s in sample]
+                tempsent = [x for x in words_sample if x != 'PAD']
+                if tempsent[0] == 'START':
+                    tempsent = tempsent[1:]
+                if tempsent[-1] == 'END':
+                    tempsent = tempsent[:-1]
+                result_sent = " ".join(tempsent)
+                sample_lst.append(result_sent)
+            out_dict = {'idx': idx,
+                        'obs1': val['obs1'],
+                        'obs2': val['obs2'],
+                        'samples': sample_lst}
+            print(json.dumps(out_dict), file=filehandle)
+        else:
+            sample = sample_out[0].tolist()
+            words_sample = [tokenizer2[s] for s in sample]
+            tempsent = [x for x in words_sample if x != 'PAD']
+            if tempsent[0] == 'START':
+                tempsent = tempsent[1:]
+            if tempsent[-1] == 'END':
+                tempsent = tempsent[:-1]
+            result_sent = " ".join(tempsent)
+            out_dict = {'idx':idx,
+                        'obs1':val['obs1'],
+                        'obs2':val['obs2'],
+                        'sample':result_sent}
+            print(json.dumps(out_dict), file=filehandle)
+filehandle.close()
+print(f'written to {outfile}')

src/anlg_infill/mbr_eval.py ADDED Viewed

	@@ -0,0 +1,351 @@

+import os, sys, json
+import glob
+from functools import partial
+sys.path.insert(0, 'e2e-metrics')
+import numpy as np
+from pycocotools.coco import COCO
+from pycocoevalcap.eval import COCOEvalCap
+from metrics.pymteval import BLEUScore, NISTScore
+from nltk.translate.meteor_score import meteor_score
+from parse import *
+import json
+import sys, os, torch
+from spacy.lang.en import English
+import ast
+from transformers import BertForMaskedLM, BertTokenizer
+# Positional command-line arguments: MODE SPLIT OUT_PATH INPUT_PATH.
+MODE = sys.argv[1] # ar or diff
+SPLIT = sys.argv[2] # val or test
+OUT_PATH = sys.argv[3] # output path.
+INPUT_PATH = sys.argv[4] # input path. e.g. diffusion_lm/improved-diffusion/anlg_results/diff_roc_pad_rand128_transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd108_xstart_e2e_long.ema_0.9999_800000.pt.infill_infill
+def load_results_simple(path):
+    with open(path, 'r') as f:
+        full_result_dict = json.load(f)
+    return full_result_dict
+def post_process(filename, fileout, tokenizer_spacy):
+    # filename = 'diffusion_lm/improved-diffusion/anlg_results/diff_roc_mbr.json2'
+    bert_model = 'bert-base-cased'
+    tokenizer = BertTokenizer.from_pretrained(bert_model)
+    model = BertForMaskedLM.from_pretrained(bert_model).cuda()
+    fileout_handle = open(fileout, 'w')
+    full_lst = []
+    with open(filename, 'r') as f:
+        for line in f:
+            line = json.loads(line)
+            full_lst.append(line)
+    for example in full_lst:
+        sent = example['sample']
+        obs1 = example['obs1']
+        obs2 = example['obs2']
+        if 'UNK' in sent:
+            sent = obs1 + sent.replace('UNK', tokenizer.mask_token) + obs2
+            print(sent)
+            model_inputs = tokenizer(sent, return_tensors="pt")
+            model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}
+            model_out = model(**model_inputs)
+            mask_words = model_inputs['input_ids'] == tokenizer.mask_token_id
+            masked_logits = model_out.logits[mask_words].view(-1, model_out.logits.size(-1))
+            # take argmax from this.
+            max_cands = torch.max(masked_logits, dim=-1)
+            indices = max_cands.indices
+            model_inputs['input_ids'][mask_words] = indices
+            out = tokenizer.batch_decode(model_inputs['input_ids'].tolist(),
+                                         skip_special_tokens=True)[0]
+            print(out)
+            word_lstout = [x.text for x in tokenizer_spacy(out)]
+            word_lst1 = [x.text for x in tokenizer_spacy(example['obs1'])]
+            word_lst2 = [x.text for x in tokenizer_spacy(example['obs2'])]
+            example['sample'] = " ".join(word_lstout[len(word_lst1):-len(word_lst2)])
+            print(example['sample'])
+            print()
+        else:
+            print('NO NEED THIS FIX. ')
+        print(json.dumps(example), file=fileout_handle)
+    fileout_handle.close()
+def load_results(sent_lst, tokenizer):
+    """Collect the generated infills for every example from per-example result files.
+
+    For each entry of ``sent_lst`` the matching result file is located with a
+    glob built from the module-level ``INPUT_PATH`` and ``SPLIT``, the words of
+    ``obs1`` and ``obs2`` are cut off both ends of every generated sequence,
+    and what is left is stored as ``pred_samples``. The collected dictionary is
+    also dumped to ``full_diff_test_outputs_aug.json``.
+
+    Args:
+        sent_lst: mapping whose values are dicts with ``obs1`` and ``obs2`` strings.
+        tokenizer: callable that splits a string into tokens exposing ``.text``.
+
+    Returns:
+        dict mapping the integer example index to a dict with the keys
+        ``pred_samples``, ``sample`` (always ``None`` here), ``obs1`` and ``obs2``.
+    """
+    # target_file = f"{INPUT_PATH}_*.json"
+    # target_file = glob.glob(target_file)
+    # print([x for x in target_file if 'val' not in x and 'test' not in x])
+    # 10/0
+    full_result_dict = {}
+    failed_instances = []  # never used below
+    found_idx = []  # filled in the batched branch but not returned
+    sent_lst_lst = list(sent_lst.items())
+    for idx, (key, val) in enumerate(sent_lst_lst):
+        # if idx < 2500: continue
+        # Indices already filled by a batched file (see the else branch) are skipped.
+        if idx in full_result_dict.keys(): continue
+        word_lst1 = [x.text for x in tokenizer(val['obs1'])]
+        word_lst2 = [x.text for x in tokenizer(val['obs2'])]
+        # target_file = f"diffusion_lm/improved-diffusion/anlg_results/diff_roc_pad_rand128_" \
+        #               f"transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd108_xstart_e2e_long.ema" \
+        #               f"_0.9999_800000.pt.infill_infill_*_{SPLIT}_{idx}.json"
+        target_file = f"{INPUT_PATH}_*_{SPLIT}_{idx}.json"
+        file_lst = glob.glob(target_file)
+        # print(file_lst, target_file)
+        # Exactly one split-specific file is expected; otherwise fall back to a
+        # pattern without the split name.
+        # NOTE(review): the bare `except` also hides errors other than the
+        # failed assertion.
+        try:
+            assert len(file_lst) == 1
+        except:
+            print('the file must have existed in a batched version')
+            # if SPLIT == 'val': assert False
+            # if idx % 100 == 1: idx = idx-1
+            target_file = f"{INPUT_PATH}_*_{idx}.json"
+            file_lst = glob.glob(target_file)
+            print(file_lst, target_file)
+            print(file_lst)
+        # Raises IndexError when neither pattern matched any file.
+        target_file = file_lst[0]
+        if "x128" in target_file:
+            # Single-example file: every line is a JSON list whose first
+            # element is one generated sequence.
+            infill_lst = []
+            with open(target_file, 'r') as f:
+                for line in f:
+                    example = json.loads(line)[0]
+                    # NOTE(review): when obs2 has no tokens the `-0` stop makes
+                    # this slice empty - confirm that cannot happen.
+                    infill_ = example.split()[len(word_lst1):-len(word_lst2)]
+                    # print(len(infill_))
+                    # print(infill_, example)
+                    # assert len(infill_) == 10
+                    infill_=' '.join(infill_)
+                    # print(infill_)
+                    infill_lst.append(infill_)
+            result_dict = {
+                "pred_samples": infill_lst,
+                "sample": None,
+                "obs1": val['obs1'],
+                "obs2": val['obs2']
+            }
+            full_result_dict[idx] = result_dict
+        else:
+            # Batched file: every line is a Python-literal dict with a single
+            # (index, template) key whose value is the list of generated sequences.
+            with open(target_file, 'r') as f:
+                for line in f:
+                    example = ast.literal_eval(line.strip())
+                    index, template = list(example.keys())[0]
+                    print(index, idx)
+                    # Skip lines that belong to examples before the current one.
+                    if int(index) < int(idx):
+                        continue
+                    assert int(index) == int(idx)
+                    found_idx.append(idx)
+                    example = list(example.values())[0]
+                    # Re-read the observations for the index currently being filled.
+                    kk, val = sent_lst_lst[idx]
+                    word_lst1 = [x.text for x in tokenizer(val['obs1'])]
+                    word_lst2 = [x.text for x in tokenizer(val['obs2'])]
+                    infill_lst = [" ".join(xx.split()[len(word_lst1):-len(word_lst2)]) for xx in example]
+                    result_dict = {
+                        "pred_samples": infill_lst,
+                        "sample": None,
+                        "obs1": val['obs1'],
+                        "obs2": val['obs2']
+                    }
+                    full_result_dict[idx] = result_dict
+                    # Move on to the next example stored in the same file; the
+                    # outer enumerate() rebinds idx on its next iteration.
+                    idx += 1
+    # json.dump writes the integer keys as strings.
+    with open('full_diff_test_outputs_aug.json', 'w') as f:
+        json.dump(full_result_dict, f)
+    return full_result_dict
+# read files.
+def mbr(result_lst, total_len, sample_size, utility):
+    result = []
+    for i in range(total_len):
+        example_set = result_lst[i * sample_size:(i + 1) * sample_size]
+        # print(example_set)
+        score_dict = {}
+        for idx in range(len(example_set)):
+            y = example_set[idx]
+            utility_lst = []
+            for idx_x in range(len(example_set)):
+                if idx_x != idx:
+                    utility_lst.append(utility(example_set[idx_x], y))
+            score_dict[idx] = np.array(utility_lst).mean()
+        # print(score_dict)
+        best_y = sorted(score_dict.items(), key=lambda item: item[1])[-1]
+        result.append(example_set[best_y[0]])
+        # print(best_y)
+    return result
+def bleu_score(scorer, sent_sys, sents_ref):
+    scorer.reset()
+    scorer.append(sent_sys, [sents_ref])
+    return scorer.score()
+def meteor_score2(pred, ref):
+    meteor = meteor_score([ref.split()], pred.split())
+    return meteor
+def apply_mbr_func(full_result_dict, outpath, sent_lst):
+    assert len(sent_lst) == len(full_result_dict)
+    out_handle = open(outpath, 'w')
+    count = 0
+    for idx, val in full_result_dict.items():
+        infill_lst = val['pred_samples']
+        print(count, idx )
+        assert count == int(idx)
+        count += 1
+        sample_size = len(infill_lst)
+        total_len = 1
+        mteval_scorers = [BLEUScore(), BLEUScore(smoothing=1.0), NISTScore()]
+        result_lst = mbr(infill_lst, total_len, sample_size, partial(bleu_score, mteval_scorers[1]))
+        print(infill_lst)
+        print(result_lst)
+        result_str = result_lst[0]
+        result_dict = {
+            "pred_samples": infill_lst,
+            "sample": result_str,
+            "obs1": val['obs1'],
+            "obs2": val['obs2']
+        }
+        print(json.dumps(result_dict), file=out_handle)
+    out_handle.close()
+    print(f'written to {outpath}')
+    return
+if SPLIT == 'val':
+    source_file = 'diffusion_lm/ROCstory/anlg/anlg/dev_cleanup.json'
+elif SPLIT == 'test':
+    source_file = 'diffusion_lm/ROCstory/anlg/anlg/test_cleanup_no_label.json'
+else:
+    assert False, "invalid split"
+with open(source_file, 'r') as f:
+    sent_lst = json.load(f)
+# 'diff' mode: load the decoded samples from INPUT_PATH, select one infill per
+# example with MBR, write the result to OUT_PATH and post-process it.
+if MODE == 'diff':
+    nlp = English()
+    tokenizer = nlp.tokenizer
+    # load_results(sent_lst, tokenizer)
+    # 10/0
+    decoded_dict = load_results_simple(INPUT_PATH)
+    ############3
+    # Disabled debugging aid: restrict the run to the first 10 examples.
+    # small_decoded_dict = {}
+    # for i in range(10):
+    #     small_decoded_dict[i] = decoded_dict[str(i)]
+    # decoded_dict = small_decoded_dict
+    # small_sent_lst = {}
+    # for k, v in sent_lst.items():
+    #     if len(small_sent_lst) > 9: break
+    #     small_sent_lst[k] = v
+    # sent_lst = small_sent_lst
+    ############3
+    outpath = OUT_PATH
+    apply_mbr_func(decoded_dict, outpath, sent_lst)
+    # NOTE(review): post_process is not defined in the visible part of this file;
+    # confirm where it is imported from.
+    post_process(outpath, outpath+'.clean.json', tokenizer)
+    #
+    # # load_results(sent_lst, tokenizer)
+    # # 10/0
+    # print(len(sent_lst))
+    # for idx, (key, val) in enumerate(sent_lst.items()):
+    #     # if idx < 518: continue
+    #     if idx > 900:
+    #         break
+    #     # change the matching method.
+    #     word_lst1 = [x.text for x in tokenizer(val['obs1'])]
+    #     word_lst2 = [x.text for x in tokenizer(val['obs2'])]
+    #     # partial_seq = f"{val['obs1']} " + "PAD " + f"{val['obs2']}"
+    #     # word_lst = [x.text for x in tokenizer(partial_seq)]
+    #     # partial_seq = " ".join(word_lst)
+    #     # partial_seq = partial_seq.replace('PAD', '{}')
+    #     # print(partial_seq, idx)
+    #
+    #     # target_file = f"diffusion_lm/improved-diffusion/anlg_results/diff_roc_pad_rand128_" \
+    #     #               f"transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd108_xstart_e2e_long.ema" \
+    #     #               f"_0.9999_800000.pt.infill_infill_*_{SPLIT}_{idx}.json"
+    #     target_file = f"{INPUT_PATH}_*_{SPLIT}_{idx}.json"
+    #
+    #     file_lst = glob.glob(target_file)
+    #     print(file_lst, target_file)
+    #     assert len(file_lst) == 1
+    #     target_file = file_lst[0]
+    #     # print(target_file)
+    #     infill_lst = []
+    #     with open(target_file, 'r') as f:
+    #         for line in f:
+    #             example = json.loads(line)[0]
+    #             # print(example, partial_seq)
+    #             # infill_ = parse(partial_seq, example)
+    #             # print(example)
+    #             infill_ = example.split()[len(word_lst1):-len(word_lst2)]
+    #             # print(len(infill_))
+    #             # print(infill_, example)
+    #             # assert len(infill_) == 10
+    #             infill_=' '.join(infill_)
+    #             # print(infill_)
+    #             infill_lst.append(infill_)
+    #     infill_lst = infill_lst
+    #     sample_size = len(infill_lst)
+    #     total_len = 1
+    #     mteval_scorers = [BLEUScore(), BLEUScore(smoothing=1.0), NISTScore()]
+    #     result_lst = mbr(infill_lst, total_len, sample_size, partial(bleu_score, mteval_scorers[1]))
+    #     print(infill_lst)
+    #     print(result_lst)
+    #     result_str = result_lst[0]
+    #     result_dict = {
+    #         "pred_samples": infill_lst,
+    #         "sample":result_str,
+    #         "obs1": val['obs1'],
+    #         "obs2": val['obs2']
+    #     }
+    #     print(json.dumps(result_dict), file=out_handle)
+    #
+    # out_handle.close()
+    # print(f'written to {outpath}')
+elif MODE == 'ar':
+    outpath = OUT_PATH #'diffusion_lm/improved-diffusion/anlg_results/ar_full_mbr.json'
+    out_handle = open(outpath, 'w')
+    sample_file = INPUT_PATH #'diffusion_lm/improved-diffusion/anlg_results/ar_sample_500_v2.json'
+    nlp = English()
+    tokenizer = nlp.tokenizer
+    print(len(sent_lst))
+    sample_lst = []
+    with open(sample_file, 'r') as f:
+        for line in f:
+            sample_dict = json.loads(line)
+            sample_lst.append(sample_dict)
+    for idx, (key, val) in enumerate(sent_lst.items()):
+        # if idx < 109: continue
+        # if idx > 499:
+        #     break
+        infill_lst = sample_lst[idx]['samples']
+        sample_size = len(infill_lst)
+        total_len = 1
+        mteval_scorers = [BLEUScore(), BLEUScore(smoothing=1.0), NISTScore()]
+        result_lst = mbr(infill_lst, total_len, sample_size, partial(bleu_score, mteval_scorers[1]))
+        print(infill_lst)
+        print(result_lst)
+        result_str = result_lst[0]
+        result_dict = {
+            "pred_samples": infill_lst,
+            "sample": result_str,
+            "obs1": val['obs1'],
+            "obs2": val['obs2']
+        }
+        print(json.dumps(result_dict), file=out_handle)
+    out_handle.close()
+    print(f'written to {outpath}')
+    post_process(outpath, outpath + '.clean.json', tokenizer)
+# print(file+'.clean')
+# with open(file+'.clean', 'w') as f:
+#     for line in result_lst:
+#         print(line, file=f)

src/anlg_infill/post_process.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import torch
+import json
+from transformers import BertForMaskedLM, BertTokenizer
+filename = 'diffusion_lm/improved-diffusion/anlg_results/diff_roc_mbr.json2'
+bert_model = 'bert-base-uncased'
+tokenizer = BertTokenizer.from_pretrained(bert_model)
+model = BertForMaskedLM.from_pretrained(bert_model).cuda()
+full_lst = []
+with open(filename, 'r') as f:
+    for line in f:
+        line = json.loads(line)
+        full_lst.append(line)
+for example in full_lst:
+    sent = example['sample']
+    obs1 = example['obs1']
+    obs2 = example['obs2']
+    if 'UNK' in sent:
+        sent = obs1 + sent.replace('UNK', tokenizer.mask_token) + obs2
+        print(sent)
+        model_inputs = tokenizer(sent,return_tensors="pt")
+        model_inputs = {k:v.to(model.device) for k,v in model_inputs.items()}
+        model_out = model(**model_inputs)
+        mask_words = model_inputs['input_ids'] == tokenizer.mask_token_id
+        masked_logits = model_out.logits[mask_words].view(-1, model_out.logits.size(-1))
+        if masked_logits.size(0) > 0:
+            # take argmax from this.
+            max_cands = torch.max(masked_logits, dim=-1)
+            indices = max_cands.indices
+        model_inputs['input_ids'][mask_words] = indices
+        print(tokenizer.batch_decode(model_inputs['input_ids'].tolist()))
+    else:
+        print('NO NEED THIS FIX. ')

src/anlg_infill/run_evaluation.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import torch, json, sys
+SPLIT = sys.argv[1] # val or test
+MBR_PATH = sys.argv[2] # output path.
+# read files.
+if SPLIT == 'val':
+    source_file = '/diffusion_lm/ROCstory/anlg/anlg/dev_cleanup.json'
+elif SPLIT == 'test':
+    source_file = '/diffusion_lm/ROCstory/anlg/anlg/test_cleanup_no_label.json'
+else:
+    assert False, "invalid split"
+with open(source_file, 'r') as f:
+    sent_lst = json.load(f)
+# read generation
+generated_lst = []
+# with open('/diffusion_lm/improved-diffusion/anlg_results/ar_beam_500.json', 'r') as f:
+# with open('/diffusion_lm/improved-diffusion/anlg_results/ar_beam_500_v2.json', 'r') as f:
+# with open('/diffusion_lm/improved-diffusion/anlg_results/ar_full_mbr.json', 'r') as f:
+# with open('/diffusion_lm/improved-diffusion/anlg_results/diff_full.json', 'r') as f:
+with open(MBR_PATH, 'r') as f:
+    for line in f:
+        generated_lst.append(json.loads(line))
+print(len(generated_lst), len(sent_lst))
+# eval_file_gen = "/diffusion_lm/improved-diffusion/anlg_results/ar_gen_mbr_v2.txt"
+# eval_file_gold = "/diffusion_lm/improved-diffusion/anlg_results/ar_ref_mbr_v2.txt"
+if SPLIT == 'val':
+    eval_file_gen = f"{MBR_PATH}_gen.txt"
+    fgen = open(eval_file_gen, 'w')
+    eval_file_gold = f"{MBR_PATH}_ref.txt"  # "/diffusion_lm/improved-diffusion/anlg_results/diff_ref_v1.txt"
+    fgold = open(eval_file_gold, 'w')
+    for gen, gold in zip(generated_lst, sent_lst.items()):
+        print(gen['sample'], file=fgen)
+        gold = gold[1]
+        for x in gold['gold_labels']:
+            print(x, file=fgold)
+        print('', file=fgold)
+    fgold.close()
+    fgen.close()
+elif SPLIT == 'test':
+    eval_file_prediction = f"{MBR_PATH}_prediction.json"  # "/diffusion_lm/improved-diffusion/anlg_results/diff_ref_v1.txt"
+    # fpred = open(eval_file_prediction, 'w')
+    full_dict = {}
+    for gen, gold in zip(generated_lst, sent_lst.items()):
+        print(gold)
+        print(gen['sample'])
+        full_dict[gold[0]] = gen['sample']
+        # temp_dict = {gold[0]:gen['sample']}
+        # print(temp_dict)
+        # print(json.dumps(temp_dict), file=fpred)
+        # gold = gold[1]
+        # for x in gold['gold_labels']:
+        #     print(x, file=fgold)
+        # print('', file=fgold)
+    with open(eval_file_prediction, 'w') as fpred:
+        json.dump(full_dict, fpred)
+    ###########
+    test_ref = '/diffusion_lm/ROCstory/anlg/anlg/test_cleanup_ref.json'
+    with open(test_ref, 'r') as f:
+        test_ref_lst = json.load(f)
+    eval_file_gen = f"{MBR_PATH}_gen.txt"
+    fgen = open(eval_file_gen, 'w')
+    eval_file_gold = f"{MBR_PATH}_ref.txt"  # "/diffusion_lm/improved-diffusion/anlg_results/diff_ref_v1.txt"
+    fgold = open(eval_file_gold, 'w')
+    for gen, gold in zip(generated_lst, sent_lst.items()):
+        story_id = gold[0]
+        print(gen['sample'], file=fgen)
+        for x in test_ref_lst[story_id]:
+            print(x, file=fgold)
+        print('', file=fgold)
+    fgold.close()
+    fgen.close()
+# generate prediction.json

src/control_gen/baseline_control.py ADDED Viewed

	@@ -0,0 +1,500 @@

+# syntax, semantics, etc...
+import torch, json
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import argparse
+import os
+import numpy as np
+import torch as th
+import torch.distributed as dist
+from transformers import set_seed
+from improved_diffusion.rounding import rounding_func, load_models, load_tokenizer
+from improved_diffusion import dist_util, logger
+from improved_diffusion.script_util import (
+    NUM_CLASSES,
+    model_and_diffusion_defaults,
+    create_model_and_diffusion,
+    add_dict_to_argparser,
+    args_to_dict,
+)
+from nltk.tree import Tree
+from improved_diffusion.test_util import  load_results
+def remove_leaves(tree_):
+    # simple_increm = 0
+    for s in tree_.subtrees(lambda t: t.height() == 2):
+        s[0] = '*'
+        s._label = ''
+    return tree_
+def main():
+    args = create_argparser().parse_args()
+    set_seed(42)
+    # toy1 = 'START Alimentum is not a family - friendly place , located in city centre . \n END'.split()
+    # toy1 = 'START Located in riverside area , Alimentum restaurant is a place to bring the whole family . \n END'.split()
+    toy1 = ['START', 'The', 'Vaults', 'pub', 'near', 'Café', 'Adriatic', 'has', 'a', '5', 'star', 'rating',
+            '.', 'Prices', 'start', 'at', '£', '30', '.', 'END']
+    if args.mode == 'tree':
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model_name_or_path, # path to the AR model trained for LMing this task.
+        ).cuda()
+        model.eval()
+        if args.finetune == 'yes':
+            tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
+        else:
+            pass
+        control_label_lst = []
+        with open('diffusion_lm/improved-diffusion/control_gen/target_tree.json', 'r') as controlf:
+            for line in controlf:
+                control_label_lst.append(json.loads(line))
+        result_dict = {}
+        for label_class_dict in control_label_lst:  # control_label_lst[:100]:
+            '''
+                input_strings = [" ".join(pos_) + tokenizer.bos_token + " ".join(seq) + tokenizer.eos_token
+                                         for (pos_, seq) in zip(pos_lst, examples['text'])]
+            '''
+            parse_tree = Tree.fromstring(label_class_dict['tree'])
+            print(parse_tree)
+            parse_tree = remove_leaves(parse_tree)
+            prompt_strings =  parse_tree._pformat_flat("", "()", False) + tokenizer.bos_token
+            prompt_ids = tokenizer([prompt_strings], return_tensors='pt')
+            out_text = generate_samples(args, prompt_ids['input_ids'].cuda(), model, tokenizer)
+            result_dict[(label_class_dict['tree'],)] = out_text
+            print(len(out_text))
+        fout = open(args.output_text, 'w')
+        for k, word_lst in result_dict.items():
+            print({k: word_lst}, file=fout)
+        fout.close()
+        # # load trees.
+        # import benepar
+        # parser = benepar.Parser("benepar_en3")
+        # input_sentence1 = benepar.InputSentence(
+        #     words=toy1[1:-1],
+        # )
+        # parse_lst = list(parser.parse_sents([input_sentence1]))[0]
+        # print(parse_lst)
+        # parse_lst = remove_leaves(parse_lst)
+        # prompt_strings = parse_lst._pformat_flat("", "()", False) + tokenizer.bos_token
+        # print(prompt_strings)
+        # prompt_ids = tokenizer([prompt_strings], return_tensors='pt')
+        # print(prompt_ids['input_ids'].shape)
+        #
+        # generate_gpt2(args, prompt_ids['input_ids'].cuda())
+        # eval(args)
+    if args.mode == 'spans':
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model_name_or_path, # path to the AR model trained for LMing this task.
+        ).cuda()
+        model.eval()
+        if args.finetune == 'yes':
+            tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
+        else:
+            import benepar
+            parser = benepar.Parser("benepar_en3")
+            tree_vocab = parser._parser.config["label_vocab"]
+            model_path = 'predictability/diffusion_models_v6/diff_e2e-tgt_pad_rand16_transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart'
+            tokenizer2 = load_tokenizer('e2e-tgt', 'random', model_path)
+            tokenizer = {v: k for k, v in tokenizer2.items()}
+            print(len(tokenizer), len(tokenizer2), 'loaded vocabs')
+            print('update the vocab to include tree vocabs')
+            print(len(tokenizer))
+            for x in tree_vocab.keys():
+                tokenizer[x] = len(tokenizer)
+            print('update the vocab to include indices')
+            # tokenizer.add_tokens([str(xx) for xx in range(64)])
+            for x in range(64):
+                if str(x) not in tokenizer:
+                    tokenizer[str(x)] = len(tokenizer)
+            vocab_dict = tokenizer
+            rev_tokenizer = {v: k for k, v in vocab_dict.items()}
+        print(len(tokenizer))
+        control_label_lst = []
+        with open('diffusion_lm/improved-diffusion/control_gen/target_spans.json', 'r') as controlf:
+            for line in controlf:
+                control_label_lst.append(json.loads(line))
+        result_dict = {}
+        for span_info in control_label_lst:  # control_label_lst[:100]:
+            (a,b,c) = span_info['spans'][0]
+            if args.finetune == 'yes':
+                prompt_strings = f"{a}, {b}, {c}" + tokenizer.bos_token
+                print(prompt_strings)
+                prompt_ids = tokenizer([prompt_strings], return_tensors='pt')
+                out_text = generate_samples(args, prompt_ids['input_ids'].cuda(), model, tokenizer)
+            else:
+                prompt_ids = [vocab_dict.get(x, vocab_dict['UNK']) for x in f"{a} {b} {c}".split()] + [0]
+                print(prompt_ids)
+                prompt_ids = torch.LongTensor(prompt_ids).unsqueeze(0)
+                out_text = generate_samples_from_scratch(args, prompt_ids.cuda(), model, tokenizer, rev_tokenizer)
+            # str(label_class_dict['spans'][0]),
+            result_dict[str(span_info['spans'][0])] = out_text
+            print(len(out_text))
+        fout = open(args.output_text, 'w')
+        for k, word_lst in result_dict.items():
+            print({(k,): word_lst}, file=fout)
+        fout.close()
+    elif args.mode == 'pos':
+        import spacy_stanza
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model_name_or_path,  # path to the AR model trained for LMing this task.
+        ).cuda()
+        model.eval()
+        if args.finetune == 'yes':
+            tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
+        else:
+            pass
+        control_label_lst = []
+        with open('diffusion_lm/improved-diffusion/control_gen/target_pos.json', 'r') as controlf:
+            for line in controlf:
+                control_label_lst.append(json.loads(line))
+        print(control_label_lst[:5])
+        result_dict = {}
+        for label_class_dict in control_label_lst:  # control_label_lst[:100]:
+            '''
+                input_strings = [" ".join(pos_) + tokenizer.bos_token + " ".join(seq) + tokenizer.eos_token
+                                         for (pos_, seq) in zip(pos_lst, examples['text'])]
+            '''
+            gold_pos = label_class_dict['pos'][1:-1] # remove START, END.
+            words_ = label_class_dict['words_']
+            print(gold_pos, 'target POS tagging sequences', tokenizer.bos_token)
+            prompt_strings = " ".join(gold_pos) + tokenizer.bos_token
+            prompt_ids = tokenizer([prompt_strings], return_tensors='pt')
+            out_text = generate_samples(args, prompt_ids['input_ids'].cuda(), model, tokenizer )
+            result_dict[tuple(gold_pos)] = out_text
+            print(len(out_text))
+        fout = open(args.output_text, 'w')
+        for k, word_lst in result_dict.items():
+            print({k:word_lst}, file=fout)
+        fout.close()
+        # tagger = spacy_stanza.load_pipeline("en", processors={"tokenize": "spacy"})
+        # toy1 = 'START The Mill is a coffee shop with an expensive menu near The Sorrento . \n END'.split()
+        # toy1 = ['START', 'The', 'Vaults', 'pub', 'near', 'Café', 'Adriatic', 'has', 'a', '5', 'star', 'rating', '.',
+        #         'Prices', 'start', 'at', '£', '30', '.', '\n', 'END']
+        # sent_full = " ".join(toy1[1:-1])
+        # doc = tagger(sent_full)
+        # gold_pos = [token.pos_ for token in doc]
+        # print(gold_pos, 'target POS tagging sequences')
+        # prompt_strings = " ".join(gold_pos) + tokenizer.bos_token
+        # prompt_ids = tokenizer([prompt_strings], return_tensors='pt')
+        # generate_gpt2(args, prompt_ids['input_ids'].cuda())
+    elif args.mode == 'attribute':
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model_name_or_path,  # path to the AR model trained for LMing this task.
+        ).cuda()
+        model.eval()
+        if args.finetune == 'yes':
+            tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
+        else:
+            pass
+        control_label_lst = []
+        with open('diffusion_lm/improved-diffusion/control_gen/target_attribute.json', 'r') as controlf:
+            for line in controlf:
+                control_label_lst.append(json.loads(line))
+        print(control_label_lst[:5])
+        result_dict = {}
+        for label_class in control_label_lst:  # control_label_lst[:100]:
+            prompt_strings = " ".join(label_class) + tokenizer.bos_token
+            '''
+            input_strings = [
+                        " ".join(attributes) + tokenizer.bos_token + " ".join(words) + tokenizer.eos_token
+                        for (words, attributes) in examples['text']]
+            '''
+            print(label_class, 'target attribute sequences', tokenizer.bos_token)
+            prompt_ids = tokenizer([prompt_strings], return_tensors='pt')
+            out_text = generate_samples(args, prompt_ids['input_ids'].cuda(), model, tokenizer)
+            result_dict[tuple(label_class)] = out_text
+            print(len(out_text))
+        fout = open(args.output_text, 'w')
+        for k, word_lst in result_dict.items():
+            print({k: word_lst}, file=fout)
+        fout.close()
+    elif args.mode == 'control_len':
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model_name_or_path,  # path to the AR model trained for LMing this task.
+        ).cuda()
+        model.eval()
+        if args.finetune == 'yes':
+            tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
+        else:
+            pass
+        result_dict = {}
+        for label_class in range(10, 41):  # control_label_lst[:100]:
+            tgt_len = label_class-2
+            prompt_strings = f"{tgt_len}" + tokenizer.bos_token
+            print(label_class, 'target attribute sequences', tokenizer.bos_token)
+            prompt_ids = tokenizer([prompt_strings], return_tensors='pt')
+            out_text = generate_samples(args, prompt_ids['input_ids'].cuda(), model, tokenizer)
+            result_dict[tuple([label_class])] = out_text
+            print(len(out_text))
+        fout = open(args.output_text, 'w')
+        for k, word_lst in result_dict.items():
+            print({k: word_lst}, file=fout)
+        fout.close()
+        # generate_gpt2(args)
+def eval(args):
+    text_samples = []
+    if args.input_text.endswith('json'):
+        with open(args.input_text, 'r') as f:
+            for line in f:
+                text_samples.append(json.loads(line)[0].split(' '))
+    else:
+        with open(args.input_text, 'r') as f:
+            for line in f:
+                text_samples.append(line.strip().split())
+    # tokenize
+    # load tokenizer.
+    tokenizer = load_tokenizer(args.modality, args.experiment, os.path.split(args.model_path)[0])
+    # print(args.modality, tokenizer, args.experiment)
+    reverse_tokenizer = {v: k for k, v in tokenizer.items()}
+    agg_loss = []
+    for x in text_samples:
+        # print(x)
+        tokenized_x = [reverse_tokenizer[s] for s in x]
+        # print(tokenized_x)
+        tokenized_x = torch.LongTensor(tokenized_x).cuda()
+        labels = tokenized_x.clone()
+        labels[labels == reverse_tokenizer['PAD']] = -100
+        model_output = model(tokenized_x, labels=labels)
+        # print(model_output.loss)
+        agg_loss.append(model_output.loss.item())
+    print(f'\nthe mean loss is {torch.tensor(agg_loss).mean()} for {args.input_text}', )
+    print('-' * 50)
+    if 'infill' in args.input_text:
+        json_path = os.path.join(os.path.split(args.model_path)[0], 'infill_score_decode.json')
+    elif 'ema' in args.model_path:
+        json_path = os.path.join(os.path.split(args.model_path)[0], 'ema_score_decode.json')
+    else:
+        json_path = os.path.join(os.path.split(args.model_path)[0], 'score_decode.json')
+    print(f'written to {json_path}')
+    json_dict = {
+        'score_decode': torch.tensor(agg_loss).mean().item(),
+        'source_decode': args.input_text,
+    }
+    load_results(json_path, json_dict)
+def generate_samples(args, prompt, model, tokenizer):
+    if args.generation_mode == 'search':
+        sample_out = model.generate(prompt, do_sample=False, max_length=200, min_length=prompt.size(1) + 1, num_beams=4,
+                                    top_k=len(tokenizer), top_p=args.top_p, num_return_sequences=1,
+                                    pad_token_id=tokenizer.pad_token_id)
+    else:
+        sample_out = model.generate(prompt, do_sample=True, max_length=200, min_length=prompt.size(1)+1,
+                                    top_k=len(tokenizer), top_p=args.top_p, num_return_sequences=1,
+                                    pad_token_id=tokenizer.pad_token_id)
+    sample_out_lst = sample_out[:, prompt.size(1):]
+    # sample_out_lst.append(sample_out.cpu())
+    # sample_out_lst = torch.cat(sample_out_lst, dim=0)
+    text_out = []
+    for sample in sample_out_lst:
+        sample = sample.tolist()
+        words_sample = tokenizer.decode(sample, skip_special_tokens=True)
+        text_out.append(words_sample)
+    return text_out
+def generate_samples_from_scratch(args, prompt, model, tokenizer, rev_tokenizer):
+    print('generating from scratch')
+    if args.generation_mode == 'search':
+        sample_out = model.generate(prompt, do_sample=False, max_length=200, min_length=prompt.size(1) + 1, num_beams=4,
+                                    top_k=len(tokenizer), top_p=args.top_p, num_return_sequences=1,
+                                    pad_token_id=tokenizer['PAD'], eos_token_id=tokenizer['END'])
+    else:
+        sample_out = model.generate(prompt, do_sample=True, max_length=200, min_length=prompt.size(1) + 1,
+                                    top_k=len(tokenizer), top_p=args.top_p, num_return_sequences=50,
+                                    pad_token_id=tokenizer['PAD'], eos_token_id=tokenizer['END'])
+    sample_out_lst = sample_out[:, prompt.size(1):]
+    # sample_out_lst.append(sample_out.cpu())
+    # sample_out_lst = torch.cat(sample_out_lst, dim=0)
+    text_out = []
+    for sample in sample_out_lst:
+        sample = sample.tolist()
+        words_sample = " ".join([rev_tokenizer[x] for x in sample])
+        text_out.append(words_sample)
+    return text_out
+def generate_gpt2(args, prompt=None):
+    print(f'loading from {args.model_name_or_path}')
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_name_or_path,  # path to the AR model trained for LMing this task.
+    ).cuda()
+    # load tokenizer.
+    sample_out_lst = []
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
+    sample_out = model.generate(prompt, do_sample=True, max_length=200,
+                                top_k=len(tokenizer), top_p=args.top_p, num_return_sequences=50, pad_token_id=tokenizer.pad_token_id)
+    sample_out = sample_out[:, prompt.size(1):]
+    sample_out_lst.append(sample_out.cpu())
+    sample_out_lst = torch.cat(sample_out_lst, dim=0)
+    if args.output_text.endswith('json'):
+        with open(args.output_text, 'w') as f:
+            for sample in sample_out_lst:
+                sample = sample.tolist()
+                words_sample = tokenizer.decode(sample, skip_special_tokens=True)
+                print(json.dumps([words_sample]), file=f)
+    else:
+        with open(args.output_text, 'w') as f:
+            for sample in sample_out_lst:
+                sample = sample.tolist()
+                words_sample = tokenizer.decode(sample,  skip_special_tokens=True)
+                print(words_sample, file=f)
+    agg_loss = []
+    for tokenized_x in sample_out:
+        labels = tokenized_x.clone()
+        labels[labels == tokenizer.eos_token_id] = -100
+        model_output = model(tokenized_x, labels=labels)
+        agg_loss.append(model_output.loss.item())
+    print(f'\nthe mean loss is {torch.tensor(agg_loss).mean()}',)
+    print('-'*50)
+def generate(args):
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_name_or_path,  # path to the AR model trained for LMing this task.
+    ).cuda()
+    print(model.transformer.wte)
+    # print(model)
+    # load tokenizer.
+    tokenizer = load_tokenizer(args.modality, args.experiment, os.path.split(args.model_path)[0])
+    reverse_tokenizer = {v: k for k, v in tokenizer.items()}
+    print(len(tokenizer))
+    init_prompt = torch.LongTensor([reverse_tokenizer['START']]).view(1,1).expand(50, -1).to(model.device)
+    sample_out = model.generate(init_prompt, do_sample=True,  max_length=64,
+                                top_k=len(tokenizer), top_p=args.top_p)
+    print(sample_out.shape)
+    if args.output_text.endswith('json'):
+        with open(args.output_text, 'w') as f:
+            for sample in sample_out:
+                sample = sample.tolist()
+                words_sample = [tokenizer[s] for s in sample]
+                print(json.dumps([" ".join(words_sample)]), file=f)
+    else:
+        with open(args.output_text, 'w') as f:
+            for sample in sample_out:
+                sample = sample.tolist()
+                words_sample = [tokenizer[s] for s in sample]
+                print(" ".join(words_sample), file=f)
+    agg_loss = []
+    for tokenized_x in sample_out:
+        model_output = model(tokenized_x, labels=tokenized_x)
+        agg_loss.append(model_output.loss.item())
+    print(f'\nthe mean loss is {torch.tensor(agg_loss).mean()}',)
+    print('-'*50)
+    ##################
+    text_samples = []
+    if args.output_text.endswith('json'):
+        with open(args.output_text, 'r') as f:
+            for line in f:
+                text_samples.append(json.loads(line)[0].split(' '))
+    else:
+        with open(args.output_text, 'r') as f:
+            for line in f:
+                text_samples.append(line.strip().split())
+    agg_loss = []
+    for idx, x in enumerate(text_samples):
+        # print(x)
+        tokenized_x = [reverse_tokenizer[s] for s in x]
+        tokenized_x = torch.LongTensor(tokenized_x).cuda()
+        # print(tokenized_x)
+        # print(sample_out[idx])
+        # print((tokenized_x == sample_out[idx]).all())
+        model_output = model(tokenized_x, labels=tokenized_x)
+        # print(model_output.loss)
+        agg_loss.append(model_output.loss.item())
+    print(f'\nthe mean loss is {torch.tensor(agg_loss).mean()} for {args.input_text}', )
+def create_argparser():
+    defaults = dict(
+        clip_denoised=True,
+        num_samples=50,#10000,
+        batch_size=16,
+        use_ddim=False,
+        model_path="",
+        model_arch='conv-unet',
+        verbose='yes',
+        finetune='yes',
+        generation_mode='sample',
+    )
+    text_defaults = dict(modality='text',
+                         dataset_name='wikitext',
+                         input_text='',
+                         mode='eval',
+                         output_text='',
+                         dataset_config_name='wikitext-2-raw-v1',
+                         model_name_or_path='predictability/diff_models/compress_e=5_b=60_m=gpt2_wikitext-103-raw-v1_None',
+                         experiment='gpt2_pre_compress', model_arch='trans-unet',
+                         preprocessing_num_workers=1, top_p=1.0,)
+    defaults.update(model_and_diffusion_defaults())
+    defaults.update(text_defaults)
+    # defaults.update(model_and_diffusion_defaults())
+    parser = argparse.ArgumentParser()
+    add_dict_to_argparser(parser, defaults)
+    return parser
+# Script entry point: main() runs entirely with autograd disabled.
+if __name__ == '__main__':
+    with torch.no_grad():
+        main()

src/control_gen/eval_control.py ADDED Viewed

	@@ -0,0 +1,567 @@

+import torch, argparse, json
+import benepar, spacy_stanza
+import numpy as np
+import sys, os
+import csv
+from nltk.tree import Tree
+sys.path.insert(0, os.path.join(sys.path[0], '../scripts/'))
+from tree_helper import chart_from_tree, pad_charts, padded_chart_from_spans
+sys.path.insert(0, os.path.join(sys.path[0], '../../misc/self-attentive-parser/src/'))
+import evaluate
+from spacy.lang.en import English
+from collections import defaultdict
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from improved_diffusion.rounding import rounding_func, load_models, load_tokenizer
+# Module-level spaCy English pipeline; only its tokenizer is kept for later use.
+nlp = English()
+tokenizer_spacy = nlp.tokenizer
+def eval_ppl2(args, text_samples):
+    '''
+    Score generated samples with an autoregressive LM that uses the word-level
+    vocabulary returned by ``load_tokenizer``.
+    Every sample is wrapped in START/END, words missing from the vocabulary are
+    mapped to UNK, and the mean NLL plus the matching perplexity are printed.
+    When the model path contains 'r2l' the sample is reversed before encoding.
+    :param args: namespace with ``model_name_or_path``; ``model_path`` is overwritten here.
+    :param text_samples: dict mapping a control target to a list of token lists.
+    :return: None (results are printed).
+    '''
+    lm_path = args.model_name_or_path
+    print(f'loading from {lm_path}')
+    # path to the AR model trained for LMing this task.
+    model = AutoModelForCausalLM.from_pretrained(lm_path).cuda()
+    right_to_left = 'r2l' in lm_path
+    if right_to_left:
+        print('Use the right-to-left encoding.')
+    # The vocabulary is loaded from the directory of this hard-coded checkpoint path.
+    args.model_path = 'predictability/diffusion_models_v6/diff_e2e-tgt_pad_rand16_transformer_' \
+                      'lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart/ema_0.9999_200000.pt'
+    vocab = load_tokenizer('e2e-tgt', 'random', os.path.dirname(args.model_path))
+    # Inverted mapping: looked up by word, yields the value fed to the LM.
+    token_to_id = {token: idx for idx, token in vocab.items()}
+    per_target_nll = []
+    for samples in text_samples.values():
+        sample_losses = []
+        for words in samples:
+            if right_to_left:
+                sequence = ["START"] + list(reversed(words)) + ["END"]
+                ids = [token_to_id.get(tok, token_to_id['UNK']) for tok in sequence]
+            else:
+                ids = [token_to_id['START']]
+                ids += [token_to_id.get(tok, token_to_id['UNK']) for tok in words]
+                ids += [token_to_id['END']]
+            ids = torch.LongTensor(ids).cuda()
+            labels = ids.clone()
+            # PAD positions get the label -100 so they do not contribute to the loss.
+            labels[labels == token_to_id['PAD']] = -100
+            sample_losses.append(model(ids, labels=labels).loss.item())
+        # One score per control target: the mean loss over its samples.
+        per_target_nll.append(torch.tensor(sample_losses).mean())
+    mean_nll = np.array(per_target_nll).mean()
+    print(f'full NLL score is {mean_nll} for {len(per_target_nll)}')
+    print(f'full PPL score is {np.e ** mean_nll} for {len(per_target_nll)}')
+def eval_ppl(args, text_samples):
+    '''
+    Evaluate fluency with a causal LM (and its own tokenizer) loaded from
+    ``args.model_name_or_path``.
+    Words that are not in the diffusion vocabulary are replaced by 'UNK', the
+    sample is joined with spaces and wrapped in the LM's BOS/EOS tokens, and the
+    mean NLL plus the matching perplexity are printed.
+    :param args: namespace with ``model_name_or_path``; ``model_path`` is overwritten here.
+    :param text_samples: dict mapping a control target to a list of token lists.
+    :return: None (results are printed).
+    '''
+    lm_path = args.model_name_or_path
+    # load model
+    print(f'loading from {lm_path}')
+    # path to the AR model trained for LMing this task.
+    model = AutoModelForCausalLM.from_pretrained(lm_path).cuda()
+    # load tokenizer.
+    tokenizer = AutoTokenizer.from_pretrained(lm_path)
+    print('finished loading models.')
+    # The diffusion vocabulary is loaded from the directory of this hard-coded checkpoint path.
+    args.model_path = 'predictability/diffusion_models_v6/diff_e2e-tgt_pad_rand16_transformer_' \
+                      'lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart/ema_0.9999_200000.pt'
+    diff_vocab = load_tokenizer('e2e-tgt', 'random', os.path.dirname(args.model_path))
+    known_words = {word: idx for idx, word in diff_vocab.items()}
+    per_target_nll = []
+    for samples in text_samples.values():
+        sample_losses = []
+        for words in samples:
+            visible = [w if w in known_words else 'UNK' for w in words]
+            text = tokenizer.bos_token + " ".join(visible) + tokenizer.eos_token
+            input_ids = tokenizer(text, return_tensors='pt')['input_ids'].cuda()
+            # The LM is scored on its own input (labels are a copy of the ids).
+            output = model(input_ids, labels=input_ids.clone())
+            sample_losses.append(output.loss.item())
+        # One score per control target: the mean loss over its samples.
+        per_target_nll.append(torch.tensor(sample_losses).mean())
+    mean_nll = np.array(per_target_nll).mean()
+    print(f'full NLL score is {mean_nll} for {len(per_target_nll)}')
+    print(f'full PPL score is {np.e ** mean_nll} for {len(per_target_nll)}')
+def _strip_special_tokens(tokens, newline_to_period=False):
+    '''
+    Drop every PAD token plus a leading START and a trailing END marker.
+    Sentences that end up empty (blank lines, all-PAD samples, a bare
+    "START END") are returned as ``[]`` instead of raising IndexError.
+    :param tokens: list of string tokens.
+    :param newline_to_period: when True, a final newline token is replaced by '.'.
+    :return: new list with the cleaned tokens.
+    '''
+    cleaned = [tok for tok in tokens if tok != 'PAD']
+    if cleaned and cleaned[0] == 'START':
+        cleaned = cleaned[1:]
+    if cleaned and cleaned[-1] == 'END':
+        cleaned = cleaned[:-1]
+    if newline_to_period and cleaned and cleaned[-1] == '\n':
+        cleaned[-1] = '.'
+    return cleaned
+def read_files(args):
+    '''
+    Load generated samples from ``args.input_text`` and strip PAD/START/END.
+    :param args: namespace providing ``input_format`` ('file' or 'paired'),
+                 ``input_text`` (path to a json/log/plain-text file) and ``mode``.
+    :return: for 'file', a list of tokenized sentences; for 'paired', a dict
+             mapping each control target to a list of spaCy-tokenized samples.
+    :raises ValueError: if ``args.input_format`` is neither 'file' nor 'paired'.
+    '''
+    if args.input_format == 'file':
+        text_samples = []
+        with open(args.input_text, 'r') as f:
+            if args.input_text.endswith('json'):
+                # json.loads(line)[0] is taken as the generated text of that line.
+                for line in f:
+                    text_samples.append([x.text for x in tokenizer_spacy(json.loads(line)[0])])
+            else:
+                for line in f:
+                    text_samples.append(line.strip().split())
+        newline_to_period = args.mode in ['e2e-tgt-tree', 'e2e-tgt-tree-paired']
+        # Empty sentences are kept (as []) so the output stays aligned with the input lines.
+        return [_strip_special_tokens(sent, newline_to_period) for sent in text_samples]
+    if args.input_format == 'paired':
+        import ast
+        result_lst = defaultdict(list)
+        if args.input_text.endswith('json'):
+            with open(args.input_text, 'r') as f:
+                for line in f:
+                    if not line.strip():
+                        continue  # a blank line carries no samples
+                    try:
+                        json.loads(line)
+                    except ValueError:
+                        # Not valid JSON (json.JSONDecodeError is a ValueError):
+                        # the line is parsed as a Python dict literal instead.
+                        parsed = ast.literal_eval(line)
+                        if args.mode == 'e2e-tgt-spans-paired':
+                            # The first element of each key is itself the literal of the span.
+                            parsed = {tuple(ast.literal_eval(k[0])): v for k, v in parsed.items()}
+                        result_lst.update(parsed)
+                    # NOTE(review): a line that parses as JSON contributes nothing,
+                    # exactly as before this change -- confirm that is intended.
+        elif args.input_text.endswith('log'):
+            with open(args.input_text, 'r') as csvfile:
+                roc_reader = csv.reader(csvfile)
+                for idx, row in enumerate(roc_reader):
+                    if idx == 0:
+                        continue  # skip the header row
+                    if args.mode == 'e2e-tgt-spans-paired' or args.mode == 'e2e-tgt-length-paired':
+                        pos = tuple(ast.literal_eval(row[0]))
+                        if args.mode == 'e2e-tgt-length-paired':
+                            # because this count didn't account for START and END
+                            pos = (int(pos[0]) + 2,) + pos[1:]
+                    else:
+                        pos = tuple(row[0].split())
+                    # Column 0 is the control target, column 2 the generated text.
+                    result_lst[pos].append(row[2])
+        newline_to_period = args.mode == 'e2e-tgt-tree'
+        clean_result_lst = {}
+        for k, text_samples in result_lst.items():
+            text_samples2 = []
+            for sent in text_samples:
+                tempsent = _strip_special_tokens(sent.split(' '), newline_to_period)
+                # Re-tokenize the cleaned sentence with the spaCy tokenizer.
+                tempsent = [x.text for x in tokenizer_spacy(" ".join(tempsent))]
+                text_samples2.append(tempsent)
+            # remove start and end from the control target as well.
+            if k and k[0] == 'START' and k[-1] == 'END':
+                kk_ = k[1:-1]
+            else:
+                kk_ = k
+            clean_result_lst[kk_] = text_samples2
+        return clean_result_lst
+    raise ValueError(f"unsupported input_format {args.input_format!r}; expected 'file' or 'paired'")
+def eval_parse(parser, generated, tree_vocab):
+    '''
+    Parse pre-tokenized sentences and collect the spans of every parse.
+    :param parser: parser object exposing ``parse_sents``.
+    :param generated: list of token lists, one per sentence.
+    :param tree_vocab: label vocabulary forwarded to ``chart_from_tree``.
+    :return: (list of parses, list of span collections), aligned with ``generated``.
+    '''
+    sentences = [benepar.InputSentence(words=tokens) for tokens in generated]
+    parse_lst = list(parser.parse_sents(sentences))
+    # The parser must return exactly one parse per input sentence.
+    assert len(parse_lst) == len(generated)
+    spans_lst = []
+    for parse in parse_lst:
+        # Only the spans are needed; the chart is discarded.
+        _chart, spans = chart_from_tree(tree_vocab, parse, verbose=True)
+        spans_lst.append(spans)
+    return parse_lst, spans_lst
+def levenshteinDistance(s1, s2):
+    '''
+    Return the edit distance (insertions, deletions, substitutions) between two sequences.
+    Works on any indexable sequences whose items support ``==`` (strings, lists of tags, ...).
+    '''
+    # Keep the shorter sequence on the row axis so each row is as small as possible.
+    shorter, longer = (s2, s1) if len(s1) > len(s2) else (s1, s2)
+    prev_row = list(range(len(shorter) + 1))
+    for row_idx, longer_item in enumerate(longer, start=1):
+        cur_row = [row_idx]
+        for col_idx, shorter_item in enumerate(shorter):
+            if shorter_item == longer_item:
+                cur_row.append(prev_row[col_idx])
+            else:
+                cheapest = min(prev_row[col_idx], prev_row[col_idx + 1], cur_row[-1])
+                cur_row.append(1 + cheapest)
+        prev_row = cur_row
+    return prev_row[-1]
+def score_spans(gold_spans, generated_span):
+    '''
+    Return the fraction of gold spans that also occur among the generated spans.
+    :param gold_spans: either one span (a hashable tuple) or a list of spans.
+                       A list used to raise TypeError because it was wrapped,
+                       unhashable, into a one-element set.
+    :param generated_span: iterable of hashable spans produced for one sample.
+    :return: recall of the gold spans in [0, 1]; 0.0 when no gold span is given.
+    '''
+    print(gold_spans)
+    print(generated_span)
+    if isinstance(gold_spans, list):
+        gold_set = {tuple(span) for span in gold_spans}
+    else:
+        gold_set = {gold_spans}
+    intersection = gold_set.intersection(set(generated_span))
+    # An empty gold collection cannot be recalled; avoid dividing by zero.
+    score = len(intersection) / len(gold_set) if gold_set else 0.
+    print(intersection, score)
+    return score
+def score_tree(gold_tree, pred_trees):
+    '''
+    Compare predicted parse trees against one gold tree with EVALB.
+    The words and pre-terminal labels of all trees are overwritten in place
+    (leaf i becomes the integer i, its tag becomes 'NN') before scoring.
+    :param gold_tree: gold parse tree; it is paired with every prediction.
+    :param pred_trees: list of predicted parse trees.
+    :return: the object returned by ``evaluate.evalb``.
+    '''
+    def _anonymize_leaves(tree):
+        # Height-2 subtrees are the pre-terminals, i.e. one per word.
+        preterminals = tree.subtrees(lambda node: node.height() == 2)
+        for position, preterminal in enumerate(preterminals):
+            preterminal[0] = position
+            preterminal._label = 'NN'
+    _anonymize_leaves(gold_tree)
+    for predicted in pred_trees:
+        _anonymize_leaves(predicted)
+    references = [gold_tree] * len(pred_trees)
+    print(len(gold_tree.leaves()), [len(tree.leaves()) for tree in pred_trees])
+    dev_fscore = evaluate.evalb('diffusion_lm/misc/self-attentive-parser/EVALB',
+                                references, pred_trees)
+    print(dev_fscore)
+    return dev_fscore
+def score_pos(gold_pos, generated_pos):
+    '''Return 1 minus the edit distance between the tag sequences, normalized by the gold length.'''
+    return 1 - (levenshteinDistance(gold_pos, generated_pos) / len(gold_pos))
+def score_pos_em(gold_pos, generated_pos):
+    '''
+    Position-wise exact-match accuracy of the generated tags against the gold tags.
+    The generated list is truncated, or right-padded with 'PAD', to the gold length first.
+    '''
+    target_len = len(gold_pos)
+    shortfall = target_len - len(generated_pos)
+    if shortfall < 0:
+        generated_pos = generated_pos[:target_len]
+    elif shortfall > 0:
+        generated_pos = generated_pos + ['PAD'] * shortfall
+    matches = sum(1 for gold_tag, tag in zip(gold_pos, generated_pos) if gold_tag == tag)
+    return matches / target_len
+def score_attributes(gold_att, generated):
+    '''Return 1.0 when ``gold_att`` is contained in ``generated`` (``in`` test), else 0.0.'''
+    return 1. if gold_att in generated else 0.
+def eval_pos(tagger, generated_text):
+    '''
+    Tag every generated sentence and return one list of POS tags per sentence.
+    :param tagger: callable taking a string and returning tokens with a ``pos_`` attribute.
+    :param generated_text: list of token lists; each is joined with spaces before tagging.
+    '''
+    return [[token.pos_ for token in tagger(" ".join(sent))] for sent in generated_text]
+def eval_(args, text_samples):
+    '''
+    Print control-success metrics for generated samples, selected by ``args.mode``.
+    Handled modes: 'e2e-tgt-tree', 'e2e-tgt-pos', 'e2e-tgt-pos-paired',
+    'e2e-tgt-tree-paired', 'e2e-tgt-spans-paired', 'e2e-tgt-attribute-paired',
+    'e2e-tgt-length-paired' and 'e2e-tgt-attribute'; any other mode does nothing.
+    :param args: namespace with ``mode`` and, for the unpaired tree/pos modes,
+                 ``gold_ref`` ('full' or 'span').
+    :param text_samples: list of token lists for the unpaired modes, or a dict
+                 mapping the control target to a list of token lists for the
+                 '*-paired' modes. In the tree modes samples are truncated or
+                 padded in place to the gold length.
+    :return: None; every score is printed.
+    :raises ValueError: for an unknown ``args.gold_ref`` in the unpaired tree/pos modes.
+    '''
+    if args.mode == 'e2e-tgt-tree':
+        parser = benepar.Parser("benepar_en3")
+        tree_vocab = parser._parser.config["label_vocab"]
+        # Stays None when only a gold span (no full reference sentence) is available;
+        # previously this case hit a NameError on gold_parse.
+        gold_parse = None
+        if args.gold_ref == 'full':
+            toy1 = ['START', 'The', 'Vaults', 'pub', 'near', 'Café', 'Adriatic', 'has', 'a', '5', 'star', 'rating',
+                    '.', 'Prices', 'start', 'at', '£', '30', '.', 'END']
+            input_sentence1 = benepar.InputSentence(
+                words=toy1[1:-1],
+            )
+            gold_parse = list(parser.parse_sents([input_sentence1]))[0]
+            chart, gold_spans = chart_from_tree(tree_vocab, gold_parse, verbose=True)
+            print(len(toy1[1:-1]), len(list(gold_parse.leaves())))
+        elif args.gold_ref == 'span':
+            gold_spans = [(9, 13, 'ADJP')]
+        else:
+            raise ValueError(f'unsupported gold_ref: {args.gold_ref!r}')
+        print(text_samples[:1])
+        if gold_parse is not None:
+            # correct for length: every sample is cut or padded with '.' to the gold length.
+            target_len = len(gold_parse.leaves())
+            print(gold_parse.leaves(), 'target')
+            for i, x in enumerate(text_samples):
+                if len(x) == target_len:
+                    continue
+                elif len(x) > target_len:
+                    text_samples[i] = x[:target_len]
+                else:
+                    print('padded to same length', (target_len-len(x)))
+                    text_samples[i] = x + ['.'] * (target_len-len(x))
+        generated_parse, generated_span = eval_parse(parser, text_samples, tree_vocab)
+        if gold_parse is not None:
+            # score_tree prints the EVALB result itself.
+            score_tree(gold_parse, generated_parse)
+        print([len(x) for x in text_samples])
+        # gold_spans is a list here, so the recall is computed over the whole set
+        # (score_spans wraps its gold argument as one single span).
+        gold_span_set = {tuple(span) for span in gold_spans}
+        score_lst = []
+        for x in generated_span:
+            found = gold_span_set.intersection(tuple(span) for span in x)
+            score_lst.append(len(found) / len(gold_span_set) if gold_span_set else 0.)
+        print(np.array(score_lst).mean())
+    elif args.mode == 'e2e-tgt-pos':
+        tagger = spacy_stanza.load_pipeline("en", processors='tokenize,mwt,pos', ) #processors={"tokenize": "spacy",}
+        if args.gold_ref == 'full':
+            toy1 = ['START', 'The', 'Vaults', 'pub', 'near', 'Café', 'Adriatic', 'has', 'a', '5', 'star', 'rating', '.',
+                    'Prices', 'start', 'at', '£', '30', '.', '\n', 'END']
+            sent_full = " ".join(toy1[1:-1])
+            doc = tagger(sent_full)
+            gold_pos = [token.pos_ for token in doc]
+        elif args.gold_ref == 'span':
+            gold_pos = [(9, 'PROPN')]
+        else:
+            raise ValueError(f'unsupported gold_ref: {args.gold_ref!r}')
+        generated_pos = eval_pos(tagger, text_samples)
+        score_lst = []
+        score_lst2 = []
+        for x in generated_pos:
+            print(gold_pos)
+            print(x)
+            print()
+            score_lst.append(score_pos(gold_pos, x))
+            score_lst2.append(score_pos_em(gold_pos, x))
+        print(np.array(score_lst).mean())
+        print(np.array(score_lst2).mean())
+    elif args.mode == 'e2e-tgt-pos-paired':
+        nlp = spacy_stanza.load_pipeline("en", processors={"tokenize": "spacy"})
+        print(nlp)
+        full_score = []
+        for gold, full_word_lst in text_samples.items():
+            print(gold, len(full_word_lst), full_word_lst[:2])
+            sent_lst = [" ".join(seq) for seq in full_word_lst]
+            sent_full = " ".join(sent_lst)
+            try:
+                # Fast path: tag all samples of this target in one pipeline call,
+                # then cut the tag stream back into per-sample chunks.
+                doc = nlp(sent_full)
+                doc_token_pos = [(token.text, token.pos_,) for token in doc]
+                len_lst = [len(seq) for seq in full_word_lst]
+                print(sum(len_lst), len(doc_token_pos), 'should be equal!!! ')
+                if sum(len_lst) != len(doc_token_pos):
+                    # Explicit raise instead of assert so the fallback also runs under -O.
+                    raise ValueError('token count changed during tagging')
+                pos_lst = []
+                init_idx = 0
+                for len_temp in len_lst:
+                    pos_lst.append([x[1] for x in doc_token_pos[init_idx:init_idx + len_temp]])
+                    init_idx = init_idx + len_temp
+            except Exception:
+                print(f'stanza pipeline failed... for this {gold}')
+                # parse each sentence separately...
+                pos_lst = []
+                for single_sent in sent_lst:
+                    doc = nlp(single_sent)
+                    pos_lst.append([ token.pos_ for token in doc])
+            score_lst = []
+            score_lst2 = []
+            for x in pos_lst:
+                score_lst.append(score_pos(gold, x))
+                score_lst2.append(score_pos_em(gold, x))
+            score_ed = np.array(score_lst).mean()
+            score_em = np.array(score_lst2).mean()
+            print(len(score_lst), score_ed, score_em)
+            full_score.append(score_em)
+        full_score_em = np.array(full_score).mean()
+        print(full_score_em, f"\\pm {np.array(full_score).std()}", len(full_score))
+    elif args.mode == 'e2e-tgt-tree-paired':
+        parser = benepar.Parser("benepar_en3")
+        tree_vocab = parser._parser.config["label_vocab"]
+        full_score = []
+        for idx, (gold_parse, full_word_lst) in enumerate(text_samples.items()):
+            # to avoid evalb complain --> change \n to .
+            gold_parse_str = gold_parse[0]
+            gold_parse_str = gold_parse_str.replace('\n', '.')
+            gold_parse = Tree.fromstring(gold_parse_str)
+            target_len = len(gold_parse.leaves())
+            for i, x in enumerate(full_word_lst):
+                if len(x) == target_len:
+                    continue
+                elif len(x) > target_len:
+                    print('generated seq is longer than gold seq')
+                    full_word_lst[i] = x[:target_len]
+                else:
+                    print('padded to same length', (target_len - len(x)))
+                    full_word_lst[i] = x + ['.'] * (target_len - len(x))
+            generated_parse, generated_span = eval_parse(parser, full_word_lst, tree_vocab)
+            evalb_score = score_tree(gold_parse, generated_parse) # inputs are nltk.Tree
+            print(evalb_score.fscore)
+            full_score.append(evalb_score.fscore)
+        full_score_f1 = np.array(full_score).mean()
+        print(full_score_f1, f"\\pm {np.array(full_score).std()}", len(full_score))
+    elif args.mode == 'e2e-tgt-spans-paired':
+        parser = benepar.Parser("benepar_en3")
+        tree_vocab = parser._parser.config["label_vocab"]
+        full_score = []
+        for idx, (gold_spans, full_word_lst) in enumerate(text_samples.items()):
+            print(gold_spans, '11 gold')
+            generated_parse, generated_span = eval_parse(parser, full_word_lst, tree_vocab)
+            score_lst = []
+            for x in generated_span:
+                score_lst.append(score_spans(gold_spans, x))
+            print(score_lst)
+            score_lst_mean = np.array(score_lst).mean()
+            full_score.append(score_lst_mean)
+        full_score_span = np.array(full_score).mean()
+        print(full_score_span, f"\\pm {np.array(full_score).std()}", len(full_score))
+    elif args.mode == 'e2e-tgt-attribute-paired':
+        full_score = []
+        for idx, (attribute, full_word_lst) in enumerate(text_samples.items()):
+            # The target reads "<field> : <value>"; only the value has to appear in the text.
+            attribute = " ".join(attribute).split(':')[1].strip()
+            gold_attribute = attribute
+            score_lst = []
+            for i, x in enumerate(full_word_lst):
+                score_lst.append(score_attributes(gold_attribute, " ".join(x)))
+            score_lst_mean = np.array(score_lst).mean()
+            full_score.append(score_lst_mean)
+        full_score_mean = np.array(full_score).mean()
+        print(full_score_mean, f"\\pm {np.array(full_score).std()}", len(full_score))
+    elif args.mode == 'e2e-tgt-length-paired':
+        full_score = []
+        for idx, (attribute, full_word_lst) in enumerate(text_samples.items()):
+            tgt_len = int(attribute[0]) - 2 # remove START and END.
+            score_lst = []
+            for i, x in enumerate(full_word_lst):
+                # A sample only counts when its length matches the target exactly.
+                if tgt_len == len(x):
+                    score_lst.append(1.)
+                else:
+                    score_lst.append(0.)
+            score_lst_mean = np.array(score_lst).mean()
+            full_score.append(score_lst_mean)
+        full_score_mean = np.array(full_score).mean()
+        print(full_score_mean, f"\\pm {np.array(full_score).std()}", len(full_score))
+    elif args.mode == 'e2e-tgt-attribute':
+        # NOTE(review): the gold attribute is an empty placeholder string here.
+        gold_attribute = ""
+        score_lst = []
+        for x in text_samples:
+            score_lst.append(score_attributes(gold_attribute, x))
+        print(np.array(score_lst).mean())
+# Command-line entry point: read the generated samples, print the control
+# metrics for the selected mode, then print the LM-based NLL / perplexity.
+if __name__ == '__main__':
+    # 'diffusion_lm/improved_diffusion/out_gen/diff_e2e-tgt_pad_rand16_transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart.ema_0.9999_200000.pt.infill_control_tree_50x64x16_tree_partial-cat-lgv0.1.json'
+    parser = argparse.ArgumentParser(description='training args.')
+    # Path to the file with generated samples (json, log or plain text).
+    parser.add_argument('--input_text', type=str, default='diffusion_lm/improved_diffusion/out_gen/diff_e2e-tgt_pad_rand16_transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart.ema_0.9999_200000.pt.'
+                                                          'infill_control_tree_50x64x16_tree_partial-cat-lgv0.1.json',)
+    # NOTE(review): read_files only branches on 'file' and 'paired'; the default
+    # 'batch' matches neither, and the help text does not describe this option.
+    parser.add_argument('--input_format', type=str, default='batch', help='wp, wikitext')
+    # Selects the metric branch inside eval_ (e.g. 'e2e-tgt-tree', 'e2e-tgt-pos-paired').
+    parser.add_argument('--mode', type=str, default='e2e-tgt-tree', help='')
+    # 'full' or 'span': which hard-coded gold reference the unpaired tree/pos modes use.
+    parser.add_argument('--gold_ref', type=str, default='full', help='')
+    # Causal LM (and tokenizer) used by eval_ppl.
+    parser.add_argument('--model_name_or_path', type=str, default='predictability/diff_models/e2e-tgt_e=20_b=64_m=gpt2_wikitext-103-raw-v1_101_wp_finetune_UNK', help='')
+                        # default='predictability/diff_models/e2e-tgt_e=6_b=10_m=gpt2_wikitext-103-raw-v1_101_wp_pad', help='')
+    args = parser.parse_args()
+    text_samples = read_files(args)
+    eval_(args, text_samples)
+    # eval_ppl iterates text_samples.items(), i.e. it expects the dict form of the samples.
+    eval_ppl(args, text_samples)
+    # eval_ppl2(args, text_samples)

src/ev.py ADDED Viewed

	@@ -0,0 +1,117 @@

+import numpy as np
+import os.path as osp
+from nltk.translate.bleu_score import corpus_bleu
+from rdkit import RDLogger
+from Levenshtein import distance as lev
+from rdkit import Chem
+from rdkit.Chem import MACCSkeys
+from rdkit import DataStructs
+from rdkit.Chem import AllChem
+from rdkit import DataStructs
+RDLogger.DisableLog('rdApp.*')
+from fcd import get_fcd, load_ref_model, canonical_smiles
+import warnings
+import os
+warnings.filterwarnings('ignore')
+def get_smis(filepath):
+    """
+    Read "<generated> || <ground truth>" lines and return both columns.
+    Lines shorter than three characters are skipped. The generated side is
+    stripped and cleaned of the special markers listed below.
+    :param filepath: path of the text file to read.
+    :return: (ground-truth strings, generated strings), aligned by line.
+    """
+    print(filepath)
+    markers = ('[EOS]', '[SOS]', '[X]', '[XPara]', '[XRing]')
+    ground_truth, generated = [], []
+    with open(filepath) as handle:
+        for raw_line in handle.readlines():
+            if len(raw_line) < 3:
+                continue
+            output_part, truth_part = raw_line.split(' || ')
+            output_part = output_part.strip()
+            for marker in markers:
+                output_part = output_part.replace(marker, '')
+            ground_truth.append(truth_part.strip())
+            generated.append(output_part)
+    return ground_truth, generated
+def evaluate(gt_smis, op_smis):
+    """
+    Compute text-level metrics between ground-truth and generated SMILES.
+    :param gt_smis: list of ground-truth SMILES strings.
+    :param op_smis: list of generated SMILES strings, aligned with ``gt_smis``.
+    :return: (character-level BLEU, exact-match rate via InChI comparison,
+              mean Levenshtein distance, 1 - fraction of pairs that failed conversion).
+    """
+    pairs = list(zip(gt_smis, op_smis))
+    # BLEU score over single characters: each string is split into its characters.
+    references = [[list(gt)] for gt, _ in pairs]
+    hypotheses = [list(out) for _, out in pairs]
+    bleu_score = corpus_bleu(references, hypotheses)
+    levs = []
+    num_exact = 0
+    bad_mols = 0
+    for i, (gt, out) in enumerate(pairs):
+        try:
+            mol_out = Chem.MolFromSmiles(out)
+            mol_gt = Chem.MolFromSmiles(gt)
+            same_molecule = Chem.MolToInchi(mol_out) == Chem.MolToInchi(mol_gt)
+            if same_molecule:
+                num_exact += 1
+        except:
+            # Any failure while parsing/converting either side counts as a bad molecule.
+            bad_mols += 1
+        levs.append(lev(out, gt))
+    # Exact matching score (denominator is the index of the last pair plus one).
+    exact_match_score = num_exact / (i + 1)
+    # Levenshtein score
+    levenshtein_score = np.mean(levs)
+    validity_score = 1 - bad_mols / len(gt_smis)
+    return bleu_score, exact_match_score, levenshtein_score, validity_score
+def fevaluate(gt_smis, op_smis, morgan_r=2):
+    """
+    Compute fingerprint Tanimoto similarities between ground-truth and generated molecules.
+    :param gt_smis: list of ground-truth SMILES strings.
+    :param op_smis: list of generated SMILES strings, aligned with ``gt_smis``.
+    :param morgan_r: radius passed to the Morgan fingerprint.
+    :return: (validity, mean MACCS similarity, mean RDK similarity, mean Morgan similarity).
+    """
+    mol_pairs = []
+    bad_mols = 0
+    for gt_smi, ot_smi in zip(gt_smis, op_smis):
+        try:
+            pair = (Chem.MolFromSmiles(gt_smi), Chem.MolFromSmiles(ot_smi))
+            if pair[1] == None:
+                raise ValueError('Bad SMILES')
+            mol_pairs.append(pair)
+        except:
+            # Pairs whose generated SMILES cannot be parsed only count against validity.
+            bad_mols += 1
+    validity_score = len(mol_pairs) / (len(mol_pairs) + bad_mols)
+    tanimoto = DataStructs.TanimotoSimilarity
+    MACCS_sims, RDK_sims, morgan_sims = [], [], []
+    for gt_m, ot_m in mol_pairs:
+        MACCS_sims.append(DataStructs.FingerprintSimilarity(
+            MACCSkeys.GenMACCSKeys(gt_m), MACCSkeys.GenMACCSKeys(ot_m), metric=tanimoto))
+        RDK_sims.append(DataStructs.FingerprintSimilarity(
+            Chem.RDKFingerprint(gt_m), Chem.RDKFingerprint(ot_m), metric=tanimoto))
+        morgan_sims.append(DataStructs.TanimotoSimilarity(
+            AllChem.GetMorganFingerprint(gt_m, morgan_r), AllChem.GetMorganFingerprint(ot_m, morgan_r)))
+    return validity_score, np.mean(MACCS_sims), np.mean(RDK_sims), np.mean(morgan_sims)
+def fcdevaluate(qgt_smis, qop_smis):
+    """
+    Compute the FCD score between ground-truth and generated SMILES.
+    :param qgt_smis: list of ground-truth SMILES strings.
+    :param qop_smis: list of generated SMILES strings, aligned with ``qgt_smis``.
+    :return: the value returned by ``get_fcd`` for the canonicalized sets.
+    """
+    pairs = list(zip(qgt_smis, qop_smis))
+    gt_smis = [gt_smi for gt_smi, _ in pairs]
+    # An empty generation is replaced by the placeholder '[]'.
+    ot_smis = ['[]' if len(ot_smi) == 0 else ot_smi for _, ot_smi in pairs]
+    model = load_ref_model()
+    # Strings that cannot be canonicalized come back as None and are dropped.
+    canon_gt_smis = [smi for smi in canonical_smiles(gt_smis) if smi is not None]
+    canon_ot_smis = [smi for smi in canonical_smiles(ot_smis) if smi is not None]
+    return get_fcd(canon_gt_smis, canon_ot_smis, model)
+# Module-level script: score the pairs stored in 'output.txt' and print every metric.
+os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+gt, op = get_smis('output.txt')
+# The validity value returned by evaluate() is discarded; the one from fevaluate() is reported.
+bleu_score, exact_match_score, levenshtein_score, _ = evaluate(gt, op)
+validity_score, maccs_sims_score, rdk_sims_score, morgan_sims_score = fevaluate(gt, op)
+fcd_metric_score = fcdevaluate(gt, op)
+for metric_name, metric_value in (
+    ('BLEU', bleu_score),
+    ('Exact', exact_match_score),
+    ('Levenshtein', levenshtein_score),
+    ('MACCS FTS', maccs_sims_score),
+    ('RDK FTS', rdk_sims_score),
+    ('Morgan FTS', morgan_sims_score),
+    ('FCD Metric', fcd_metric_score),
+    ('Validity', validity_score),
+):
+    print(f'{metric_name}: {round(metric_value, 3)}')

src/evaluation/fcd_metric.py ADDED Viewed

	@@ -0,0 +1,54 @@

+'''
+Code from https://github.com/blender-nlp/MolT5
+```bibtex
+@article{edwards2022translation,
+  title={Translation between Molecules and Natural Language},
+  author={Edwards, Carl and Lai, Tuan and Ros, Kevin and Honke, Garrett and Ji, Heng},
+  journal={arXiv preprint arXiv:2204.11817},
+  year={2022}
+}
+```
+'''
+import argparse
+import csv
+import os.path as osp
+from rdkit import RDLogger
+RDLogger.DisableLog('rdApp.*')
+from fcd import get_fcd, load_ref_model, canonical_smiles
+def evaluate(input_file, verbose=False):
+    '''
+    Compute the FCD score for a tab-separated results file.
+    :param input_file: path to a TSV file with 'ground truth' and 'output' columns.
+    :param verbose: when True, print the score.
+    :return: the value returned by ``get_fcd`` for the canonicalized sets.
+    '''
+    gt_smis, ot_smis = [], []
+    with open(osp.join(input_file)) as f:
+        for row in csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE):
+            truth = row['ground truth']
+            generated = row['output']
+            gt_smis.append(truth)
+            # An empty generation is replaced by the placeholder '[]'.
+            ot_smis.append('[]' if len(generated) == 0 else generated)
+    model = load_ref_model()
+    # Strings that cannot be canonicalized come back as None and are dropped.
+    canon_gt_smis = [smi for smi in canonical_smiles(gt_smis) if smi is not None]
+    canon_ot_smis = [smi for smi in canonical_smiles(ot_smis) if smi is not None]
+    fcd_sim_score = get_fcd(canon_gt_smis, canon_ot_smis, model)
+    if verbose:
+        print('FCD Similarity:', fcd_sim_score)
+    return fcd_sim_score
+# Command-line entry point: score one results file and print the FCD value.
+if __name__ == "__main__":
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--input_file', type=str, default='caption2smiles_example.txt',
+                     help='path where test generations are saved')
+    cli_args = cli.parse_args()
+    evaluate(cli_args.input_file, verbose=True)

src/evaluation/fingerprint_metrics.py ADDED Viewed

	@@ -0,0 +1,81 @@

+'''
+Code from https://github.com/blender-nlp/MolT5
+```bibtex
+@article{edwards2022translation,
+  title={Translation between Molecules and Natural Language},
+  author={Edwards, Carl and Lai, Tuan and Ros, Kevin and Honke, Garrett and Ji, Heng},
+  journal={arXiv preprint arXiv:2204.11817},
+  year={2022}
+}
+```
+'''
+import argparse
+import csv
+import os.path as osp
+import numpy as np
+from rdkit import Chem
+from rdkit.Chem import MACCSkeys
+from rdkit import DataStructs
+from rdkit.Chem import AllChem
+from rdkit import RDLogger
+RDLogger.DisableLog('rdApp.*')
+def evaluate(input_file, morgan_r, verbose=False):
+    '''
+    Compute fingerprint Tanimoto similarities for a tab-separated results file.
+    :param input_file: path to a TSV file with 'description', 'ground truth' and 'output' columns.
+    :param morgan_r: radius passed to the Morgan fingerprint.
+    :param verbose: when True, print progress and the resulting scores.
+    :return: (validity, mean MACCS similarity, mean RDK similarity, mean Morgan similarity).
+    '''
+    triples = []
+    bad_mols = 0
+    with open(osp.join(input_file)) as f:
+        for row in csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE):
+            try:
+                truth = row['ground truth']
+                generated = row['output']
+                gt_m = Chem.MolFromSmiles(truth)
+                ot_m = Chem.MolFromSmiles(generated)
+                if ot_m == None:
+                    raise ValueError('Bad SMILES')
+                triples.append((row['description'], gt_m, ot_m))
+            except:
+                # Any failure for this row (unparsable output, missing column) counts against validity.
+                bad_mols += 1
+    validity_score = len(triples) / (len(triples) + bad_mols)
+    if verbose:
+        print('validity:', validity_score)
+    tanimoto = DataStructs.TanimotoSimilarity
+    MACCS_sims, RDK_sims, morgan_sims = [], [], []
+    for i, (_desc, gt_m, ot_m) in enumerate(triples):
+        if verbose and i % 100 == 0:
+            print(i, 'processed.')
+        MACCS_sims.append(DataStructs.FingerprintSimilarity(
+            MACCSkeys.GenMACCSKeys(gt_m), MACCSkeys.GenMACCSKeys(ot_m), metric=tanimoto))
+        RDK_sims.append(DataStructs.FingerprintSimilarity(
+            Chem.RDKFingerprint(gt_m), Chem.RDKFingerprint(ot_m), metric=tanimoto))
+        morgan_sims.append(DataStructs.TanimotoSimilarity(
+            AllChem.GetMorganFingerprint(gt_m, morgan_r), AllChem.GetMorganFingerprint(ot_m, morgan_r)))
+    maccs_sims_score = np.mean(MACCS_sims)
+    rdk_sims_score = np.mean(RDK_sims)
+    morgan_sims_score = np.mean(morgan_sims)
+    if verbose:
+        print('Average MACCS Similarity:', maccs_sims_score)
+        print('Average RDK Similarity:', rdk_sims_score)
+        print('Average Morgan Similarity:', morgan_sims_score)
+    return validity_score, maccs_sims_score, rdk_sims_score, morgan_sims_score
+# Command-line entry point: score one results file and print the fingerprint metrics.
+if __name__ == "__main__":
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--input_file', type=str, default='caption2smiles_example.txt',
+                     help='path where test generations are saved')
+    cli.add_argument('--morgan_r', type=int, default=2, help='morgan fingerprint radius')
+    cli_args = cli.parse_args()
+    evaluate(cli_args.input_file, cli_args.morgan_r, verbose=True)

src/evaluation/mol_translation_metrics.py ADDED Viewed

	@@ -0,0 +1,129 @@

+'''
+Code from https://github.com/blender-nlp/MolT5
+```bibtex
+@article{edwards2022translation,
+  title={Translation between Molecules and Natural Language},
+  author={Edwards, Carl and Lai, Tuan and Ros, Kevin and Honke, Garrett and Ji, Heng},
+  journal={arXiv preprint arXiv:2204.11817},
+  year={2022}
+}
+```
+'''
+import pickle
+import argparse
+import csv
+import os.path as osp
+import numpy as np
+#load metric stuff
+from nltk.translate.bleu_score import corpus_bleu
+#from nltk.translate.meteor_score import meteor_score
+from Levenshtein import distance as lev
+from rdkit import Chem
+from rdkit import RDLogger
+RDLogger.DisableLog('rdApp.*')
+def evaluate(input_fp, verbose=False):
+    outputs = []
+    with open(osp.join(input_fp)) as f:
+        reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
+        for n, line in enumerate(reader):
+            gt_smi = line['ground truth']
+            ot_smi = line['output']
+            outputs.append((line['description'], gt_smi, ot_smi))
+    bleu_scores = []
+    #meteor_scores = []
+    references = []
+    hypotheses = []
+    for i, (smi, gt, out) in enumerate(outputs):
+        if i % 100 == 0:
+            if verbose:
+                print(i, 'processed.')
+        gt_tokens = [c for c in gt]
+        out_tokens = [c for c in out]
+        references.append([gt_tokens])
+        hypotheses.append(out_tokens)
+        # mscore = meteor_score([gt], out)
+        # meteor_scores.append(mscore)
+    # BLEU score
+    bleu_score = corpus_bleu(references, hypotheses)
+    if verbose: print('BLEU score:', bleu_score)
+    # Meteor score
+    # _meteor_score = np.mean(meteor_scores)
+    # print('Average Meteor score:', _meteor_score)
+    rouge_scores = []
+    references = []
+    hypotheses = []
+    levs = []
+    num_exact = 0
+    bad_mols = 0
+    for i, (smi, gt, out) in enumerate(outputs):
+        hypotheses.append(out)
+        references.append(gt)
+        try:
+            m_out = Chem.MolFromSmiles(out)
+            m_gt = Chem.MolFromSmiles(gt)
+            if Chem.MolToInchi(m_out) == Chem.MolToInchi(m_gt): num_exact += 1
+            #if gt == out: num_exact += 1 #old version that didn't standardize strings
+        except:
+            bad_mols += 1
+        levs.append(lev(out, gt))
+    # Exact matching score
+    exact_match_score = num_exact/(i+1)
+    if verbose:
+        print('Exact Match:')
+        print(exact_match_score)
+    # Levenshtein score
+    levenshtein_score = np.mean(levs)
+    if verbose:
+        print('Levenshtein:')
+        print(levenshtein_score)
+    validity_score = 1 - bad_mols/len(outputs)
+    if verbose:
+        print('validity:', validity_score)
+    return bleu_score, exact_match_score, levenshtein_score, validity_score
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_file', type=str, default='caption2smiles_example.txt', help='path where test generations are saved')
+    args = parser.parse_args()
+    evaluate(args.input_file, verbose=True)

src/improved_diffusion/__init__.py ADDED Viewed

File without changes

src/improved_diffusion/dist_util.py ADDED Viewed

	@@ -0,0 +1,87 @@

+"""
+Helpers for distributed training.
+"""
+import io
+import os
+import socket
+import blobfile as bf
+from mpi4py import MPI
+import torch as th
+import torch.distributed as dist
+# Change this to reflect your cluster layout.
+# The GPU for a given rank is (rank % GPUS_PER_NODE); dev() below uses this
+# to pick the CUDA device index from the MPI rank.
+# NOTE(review): the trailing "8" presumably records the value used on the
+# original multi-GPU nodes -- confirm before changing.
+GPUS_PER_NODE = 1  # 8
+# NOTE(review): not referenced by any function visible in this part of the
+# module -- confirm it is still needed.
+SETUP_RETRY_COUNT = 3
+def setup_dist(rank, world_size, port="12145"):
+    """
+    Setup a distributed process group.
+    """
+    if dist.is_initialized():
+        return
+    # comm = MPI.COMM_WORLD
+    # backend = "gloo" if not th.cuda.is_available() else "nccl"
+    # if backend == "gloo":
+    #     hostname = "localhost"
+    # else:
+    #     hostname = socket.gethostbyname(socket.getfqdn())
+    # os.environ["MASTER_ADDR"] = comm.bcast(hostname, root=0)
+    # os.environ["RANK"] = str(comm.rank)
+    # os.environ["WORLD_SIZE"] = str(comm.size)
+    # port = comm.bcast(_find_free_port(), root=0)
+    # os.environ["MASTER_PORT"] = str(port)
+    # dist.init_process_group(backend=backend, init_method="env://")
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = port
+    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+def dev():
+    """
+    Get the device to use for torch.distributed.
+    """
+    if th.cuda.is_available():
+        return th.device(f"cuda:{MPI.COMM_WORLD.Get_rank() % GPUS_PER_NODE}")
+    return th.device("cpu")
+def load_state_dict(path, **kwargs):
+    """
+    Load a PyTorch file without redundant fetches across MPI ranks.
+    """
+    if MPI.COMM_WORLD.Get_rank() == 0:
+        with bf.BlobFile(path, "rb") as f:
+            data = f.read()
+    else:
+        data = None
+    data = MPI.COMM_WORLD.bcast(data)
+    return th.load(io.BytesIO(data), **kwargs)
+def sync_params(params):
+    """
+    Synchronize a sequence of Tensors across ranks from rank 0.
+    """
+    for p in params:
+        with th.no_grad():
+            dist.broadcast(p, 0)
+def _find_free_port():
+    try:
+        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        s.bind(("", 0))
+        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        return s.getsockname()[1]
+    finally:
+        s.close()

src/improved_diffusion/fp16_util.py ADDED Viewed

	@@ -0,0 +1,76 @@

+"""
+Helpers to train with 16-bit precision.
+"""
+import torch.nn as nn
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+def convert_module_to_f16(l):
+    """
+    Convert primitive modules to float16.
+    """
+    if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
+        l.weight.data = l.weight.data.half()
+        l.bias.data = l.bias.data.half()
+def convert_module_to_f32(l):
+    """
+    Convert primitive modules to float32, undoing convert_module_to_f16().
+    """
+    if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
+        l.weight.data = l.weight.data.float()
+        l.bias.data = l.bias.data.float()
+def make_master_params(model_params):
+    """
+    Copy model parameters into a (differently-shaped) list of full-precision
+    parameters.
+    """
+    master_params = _flatten_dense_tensors(
+        [param.detach().float() for param in model_params]
+    )
+    master_params = nn.Parameter(master_params)
+    master_params.requires_grad = True
+    return [master_params]
+def model_grads_to_master_grads(model_params, master_params):
+    """
+    Copy the gradients from the model parameters into the master parameters
+    from make_master_params().
+    """
+    master_params[0].grad = _flatten_dense_tensors(
+        [param.grad.data.detach().float() for param in model_params]
+    )
+def master_params_to_model_params(model_params, master_params):
+    """
+    Copy the master parameter data back into the model parameters.
+    """
+    # Without copying to a list, if a generator is passed, this will
+    # silently not copy any parameters.
+    model_params = list(model_params)
+    for param, master_param in zip(
+        model_params, unflatten_master_params(model_params, master_params)
+    ):
+        param.detach().copy_(master_param)
+def unflatten_master_params(model_params, master_params):
+    """
+    Unflatten the master parameters to look like model_params.
+    """
+    return _unflatten_dense_tensors(master_params[0].detach(), model_params)
+def zero_grad(model_params):
+    for param in model_params:
+        # Taken from https://pytorch.org/docs/stable/_modules/torch/optim/optimizer.html#Optimizer.add_param_group
+        if param.grad is not None:
+            param.grad.detach_()
+            param.grad.zero_()

src/improved_diffusion/gaussian_diffusion.py ADDED Viewed

	@@ -0,0 +1,1606 @@

+"""
+This code started out as a PyTorch port of Ho et al's diffusion models:
+https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py
+Docstrings have been added, as well as DDIM sampling and a new collection of beta schedules.
+"""
+import enum
+import math
+import torch
+import numpy as np
+from .nn import mean_flat
+from .losses import normal_kl, discretized_gaussian_log_likelihood
+def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
+    """
+    Get a pre-defined beta schedule for the given name.
+    The beta schedule library consists of beta schedules which remain similar
+    in the limit of num_diffusion_timesteps.
+    Beta schedules may be added, but should not be removed or changed once
+    they are committed to maintain backwards compatibility.
+    """
+    if schedule_name == "linear":
+        # Linear schedule from Ho et al, extended to work for any number of
+        # diffusion steps.
+        scale = 1000 / num_diffusion_timesteps
+        beta_start = scale * 0.0001
+        beta_end = scale * 0.02
+        return np.linspace(
+            beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
+        )
+    elif schedule_name == "cosine":
+        return betas_for_alpha_bar(
+            num_diffusion_timesteps,
+            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
+        )
+    elif schedule_name == "sqrt":
+        return betas_for_alpha_bar(
+            num_diffusion_timesteps,
+            lambda t: 1 - np.sqrt(t + 0.0001),
+        )
+    elif schedule_name == "trunc_cos":
+        return betas_for_alpha_bar2(
+            num_diffusion_timesteps,
+            lambda t: np.cos((t + 0.1) / 1.1 * np.pi / 2) ** 2,
+        )
+    elif schedule_name == "trunc_lin":
+        scale = 1000 / num_diffusion_timesteps
+        beta_start = scale * 0.0001 + 0.01
+        beta_end = scale * 0.02 + 0.01
+        return np.linspace(
+            beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
+        )
+    elif schedule_name == "pw_lin":
+        scale = 1000 / num_diffusion_timesteps
+        beta_start = scale * 0.0001 + 0.01
+        beta_mid = scale * 0.0001  # scale * 0.02
+        beta_end = scale * 0.02
+        first_part = np.linspace(beta_start, beta_mid, 10, dtype=np.float64)
+        second_part = np.linspace(
+            beta_mid, beta_end, num_diffusion_timesteps - 10, dtype=np.float64
+        )
+        return np.concatenate([first_part, second_part])
+    else:
+        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
+def betas_for_alpha_bar2(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function,
+    which defines the cumulative product of (1-beta) over time from t = [0,1].
+    :param num_diffusion_timesteps: the number of betas to produce.
+    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
+                      produces the cumulative product of (1-beta) up to that
+                      part of the diffusion process.
+    :param max_beta: the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+    """
+    betas = []
+    betas.append(min(1 - alpha_bar(0), max_beta))
+    for i in range(num_diffusion_timesteps - 1):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return np.array(betas)
+def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function,
+    which defines the cumulative product of (1-beta) over time from t = [0,1].
+    :param num_diffusion_timesteps: the number of betas to produce.
+    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
+                      produces the cumulative product of (1-beta) up to that
+                      part of the diffusion process.
+    :param max_beta: the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+    """
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return np.array(betas)
+class ModelMeanType(enum.Enum):
+    """
+    Which type of output the model predicts.
+    Member values come from enum.auto(), so they follow declaration order.
+    """
+    PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
+    START_X = enum.auto()  # the model predicts x_0
+    EPSILON = enum.auto()  # the model predicts epsilon
+class ModelVarType(enum.Enum):
+    """
+    What is used as the model's output variance.
+    The LEARNED_RANGE option has been added to allow the model to predict
+    values between FIXED_SMALL and FIXED_LARGE, making its job easier.
+    Member values come from enum.auto(), so they follow declaration order.
+    """
+    LEARNED = enum.auto()
+    FIXED_SMALL = enum.auto()
+    FIXED_LARGE = enum.auto()
+    LEARNED_RANGE = enum.auto()
+class LossType(enum.Enum):
+    MSE = enum.auto()  # use raw MSE loss (and KL when learning variances)
+    RESCALED_MSE = (
+        enum.auto()
+    )  # use raw MSE loss (with RESCALED_KL when learning variances)
+    KL = enum.auto()  # use the variational lower-bound
+    RESCALED_KL = enum.auto()  # like KL, but rescale to estimate the full VLB
+    E2E_KL = enum.auto()
+    E2E_MSE = enum.auto()
+    E2E_Simple_MSE = enum.auto()
+    E2E_Simple_KL = enum.auto()
+    def is_vb(self):
+        return self == LossType.KL or self == LossType.RESCALED_KL
+class GaussianDiffusion:
+    """
+    Utilities for training and sampling diffusion models.
+    Ported directly from here, and then adapted over time to further experimentation.
+    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
+    :param betas: a 1-D numpy array of betas for each diffusion timestep,
+                  starting at T and going to 1.
+    :param model_mean_type: a ModelMeanType determining what the model outputs.
+    :param model_var_type: a ModelVarType determining how variance is output.
+    :param loss_type: a LossType determining the loss function to use.
+    :param rescale_timesteps: if True, pass floating point timesteps into the
+                              model so that they are always scaled like in the
+                              original paper (0 to 1000).
+    """
+    def __init__(
+        self,
+        *,
+        betas,
+        model_mean_type,
+        model_var_type,
+        loss_type,
+        rescale_timesteps=False,
+        model_arch=None,
+        training_mode="emb",
+    ):
+        self.model_mean_type = model_mean_type
+        self.model_var_type = model_var_type
+        self.loss_type = loss_type
+        self.rescale_timesteps = rescale_timesteps
+        self.model_arch = model_arch
+        # Use float64 for accuracy.
+        betas = np.array(betas, dtype=np.float64)
+        self.betas = betas
+        assert len(betas.shape) == 1, "betas must be 1-D"
+        assert (betas > 0).all() and (betas <= 1).all()
+        self.num_timesteps = int(betas.shape[0])
+        alphas = 1.0 - betas
+        self.alphas_cumprod = np.cumprod(alphas, axis=0)
+        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
+        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
+        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
+        # calculations for diffusion q(x_t | x_{t-1}) and others
+        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
+        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
+        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
+        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
+        # calculations for posterior q(x_{t-1} | x_t, x_0)
+        self.posterior_variance = (
+            betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        )
+        # log calculation clipped because the posterior variance is 0 at the
+        # beginning of the diffusion chain.
+        self.posterior_log_variance_clipped = np.log(
+            np.append(self.posterior_variance[1], self.posterior_variance[1:])
+        )
+        self.posterior_mean_coef1 = (
+            betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        )
+        self.posterior_mean_coef2 = (
+            (1.0 - self.alphas_cumprod_prev)
+            * np.sqrt(alphas)
+            / (1.0 - self.alphas_cumprod)
+        )
+        self.training_mode = training_mode
+        self.mapping_func = None
+        #
+        # if training_mode == 'e2e':
+        #     self.training_losses = self.training_losses_e2e
+        # else:
+        #     self.training_losses = self.training_losses_emb
+        self.maxt = -1
+    def training_losses(self, model, *args, **kwargs):
+        return self.training_losses_e2e(model, *args, **kwargs)
+        # if self.training_mode == "e2e":
+        #     return self.training_losses_e2e(model, *args, **kwargs)
+        # elif self.training_mode == "e2e-simple":
+        #     return self.training_losses_e2e_simple(model, *args, **kwargs)
+        # else:
+        #     return self.training_losses_emb(model, *args, **kwargs)
+    def calc_bpd_loop(self, model, *args, **kwargs):
+        if self.training_mode == "e2e":
+            return self.calc_bpd_loop_e2e(model, *args, **kwargs)
+        else:
+            return self.calc_bpd_loop_emb(model, *args, **kwargs)
+    def q_mean_variance(self, x_start, t):
+        """
+        Get the distribution q(x_t | x_0).
+        :param x_start: the [N x C x ...] tensor of noiseless inputs.
+        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
+        """
+        mean = (
+            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+        )
+        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
+        log_variance = _extract_into_tensor(
+            self.log_one_minus_alphas_cumprod, t, x_start.shape
+        )
+        return mean, variance, log_variance
+    def q_sample(self, x_start, t, noise=None):
+        """
+        Diffuse the data for a given number of diffusion steps.
+        In other words, sample from q(x_t | x_0).
+        :param x_start: the initial data batch.
+        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+        :param noise: if specified, the split-out normal noise.
+        :return: A noisy version of x_start.
+        """
+        if noise is None:
+            noise = torch.randn_like(x_start)
+        assert noise.shape == x_start.shape
+        return (
+            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+            + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
+            * noise
+        )
+    def q_posterior_mean_variance(self, x_start, x_t, t):
+        """
+        Compute the mean and variance of the diffusion posterior:
+            q(x_{t-1} | x_t, x_0)
+        """
+        assert x_start.shape == x_t.shape
+        posterior_mean = (
+            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
+            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
+        )
+        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
+        posterior_log_variance_clipped = _extract_into_tensor(
+            self.posterior_log_variance_clipped, t, x_t.shape
+        )
+        assert (
+            posterior_mean.shape[0]
+            == posterior_variance.shape[0]
+            == posterior_log_variance_clipped.shape[0]
+            == x_start.shape[0]
+        )
+        return posterior_mean, posterior_variance, posterior_log_variance_clipped
+    def p_mean_variance(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        model_kwargs=None,
+        caption=None,
+    ):
+        """
+        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
+        the initial x, x_0.
+        :param model: the model, which takes a signal, a batch of timesteps,
+                      the caption state and the caption mask as input.
+        :param x: the [N x C x ...] tensor at time t.
+        :param t: a 1-D Tensor of timesteps.
+        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample. Applies before
+            clip_denoised.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :param caption: an indexable pair; caption[0] is passed to the model
+            as the caption state and caption[1] as the caption mask.
+            NOTE(review): the default of None fails at caption[0] with a
+            TypeError, so callers must always supply it.
+        :return: a dict with the following keys:
+                 - 'mean': the model mean output.
+                 - 'variance': the model variance output.
+                 - 'log_variance': the log of 'variance'.
+                 - 'pred_xstart': the prediction for x_0.
+        """
+        caption_state, caption_mask = caption[0], caption[1]
+        if model_kwargs is None:
+            model_kwargs = {}
+        # The two unet architectures read C from dimension 1; every other
+        # architecture reads it from the last dimension.
+        if self.model_arch == "conv-unet" or self.model_arch == "1d-unet":
+            B, C = x.shape[:2]
+        else:
+            B, C = x.size(0), x.size(-1)
+        assert t.shape == (B,)
+        model_output = model(
+            x, self._scale_timesteps(t), caption_state, caption_mask, **model_kwargs
+        )
+        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
+            # With a learned variance the model emits 2 * C values along the
+            # C dimension: the first half is the mean-related output, the
+            # second half the variance values.
+            if self.model_arch == "conv-unet":
+                assert model_output.shape == (B, C * 2, *x.shape[2:])
+                model_output, model_var_values = torch.split(model_output, C, dim=1)
+            elif self.model_arch == "1d-unet":
+                assert model_output.shape == (B, C * 2, *x.shape[2:])
+                model_output, model_var_values = torch.split(model_output, C, dim=1)
+            else:
+                assert model_output.shape == (B, x.size(1), C * 2)
+                model_output, model_var_values = torch.split(model_output, C, dim=-1)
+            if self.model_var_type == ModelVarType.LEARNED:
+                # The model output is used directly as the log-variance.
+                model_log_variance = model_var_values
+                model_variance = torch.exp(model_log_variance)
+            else:
+                min_log = _extract_into_tensor(
+                    self.posterior_log_variance_clipped, t, x.shape
+                )
+                max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
+                # The model_var_values is [-1, 1] for [min_var, max_var].
+                # Interpolate between the two bounds in log space.
+                frac = (model_var_values + 1) / 2
+                model_log_variance = frac * max_log + (1 - frac) * min_log
+                model_variance = torch.exp(model_log_variance)
+        else:
+            # Fixed variance: pick the (variance, log-variance) tables by
+            # type, then look up the entries for the timesteps in t.
+            model_variance, model_log_variance = {
+                # for fixedlarge, we set the initial (log-)variance like so
+                # to get a better decoder log likelihood.
+                ModelVarType.FIXED_LARGE: (
+                    np.append(self.posterior_variance[1], self.betas[1:]),
+                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
+                ),
+                ModelVarType.FIXED_SMALL: (
+                    self.posterior_variance,
+                    self.posterior_log_variance_clipped,
+                ),
+            }[self.model_var_type]
+            model_variance = _extract_into_tensor(model_variance, t, x.shape)
+            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
+        def process_xstart(x):
+            # Post-process an x_0 prediction: optional denoised_fn first,
+            # then optional clamping to [-1, 1].
+            if denoised_fn is not None:
+                x = denoised_fn(x, t)
+            if clip_denoised:
+                return x.clamp(-1, 1)
+            return x
+        if self.model_mean_type == ModelMeanType.PREVIOUS_X:
+            # The model output is the mean itself; x_0 is derived from it.
+            pred_xstart = process_xstart(
+                self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output)
+            )
+            model_mean = model_output
+        elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]:
+            if self.model_mean_type == ModelMeanType.START_X:
+                pred_xstart = process_xstart(model_output)
+            else:
+                pred_xstart = process_xstart(
+                    self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
+                )
+            # The mean is the posterior mean evaluated at the (processed)
+            # x_0 prediction.
+            model_mean, _, _ = self.q_posterior_mean_variance(
+                x_start=pred_xstart, x_t=x, t=t
+            )
+        else:
+            raise NotImplementedError(self.model_mean_type)
+        assert (
+            model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
+        )
+        return {
+            "mean": model_mean,
+            "variance": model_variance,
+            "log_variance": model_log_variance,
+            "pred_xstart": pred_xstart,
+        }
+    def _predict_xstart_from_eps(self, x_t, t, eps):
+        assert x_t.shape == eps.shape
+        return (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
+            - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
+        )
+    def _predict_xstart_from_xprev(self, x_t, t, xprev):
+        assert x_t.shape == xprev.shape
+        return (  # (xprev - coef2*x_t) / coef1
+            _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev
+            - _extract_into_tensor(
+                self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape
+            )
+            * x_t
+        )
+    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
+        return (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
+            - pred_xstart
+        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
+    def _scale_timesteps(self, t):
+        if self.rescale_timesteps:
+            return t.float() * (1000.0 / self.num_timesteps)
+        return t
+    def p_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        model_kwargs=None,
+        top_p=None,
+        caption=None,
+    ):
+        """
+        Sample x_{t-1} from the model at the given timestep.
+        :param model: the model to sample from.
+        :param x: the current tensor at x_{t-1}.
+        :param t: the value of t, starting at 0 for the first diffusion step.
+        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict containing the following keys:
+                 - 'sample': a random sample from the model.
+                 - 'pred_xstart': a prediction of x_0.
+        """
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+            caption=caption,
+        )
+        if top_p is not None and top_p > 0:
+            # print('top_p sampling')
+            noise = torch.randn_like(x)
+            replace_mask = torch.abs(noise) > top_p
+            while replace_mask.any():
+                noise[replace_mask] = torch.randn_like(noise[replace_mask])
+                replace_mask = torch.abs(noise) > top_p
+            assert (torch.abs(noise) <= top_p).all()
+        else:
+            noise = torch.randn_like(x)
+        nonzero_mask = (
+            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
+        )  # no noise when t == 0
+        sample = (
+            out["mean"] + nonzero_mask * torch.exp(0.5 * out["log_variance"]) * noise
+        )
+        return {
+            "sample": sample,
+            "pred_xstart": out["pred_xstart"],
+            "greedy_mean": out["mean"],
+            "out": out,
+        }
+    def p_debug_loop(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+    ):
+        final = None
+        for sample in self.p_debug_loop_progressive(
+            model,
+            shape,
+            noise=noise,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+            device=device,
+            progress=progress,
+        ):
+            final = sample
+        return final["sample"]
+    def p_debug_loop_progressive(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        custom_t_start=100,
+    ):
+        """
+        Generate samples from the model and yield intermediate samples from
+        each timestep of diffusion.
+        Arguments are the same as p_sample_loop().
+        Returns a generator over dicts, where each dict is the return value of
+        p_sample().
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape, (tuple, list))
+        if noise is not None:
+            img = noise
+        else:
+            img = torch.randn(*shape, device=device)
+        indices = list(range(custom_t_start))[::-1]
+        if progress:
+            # Lazy import so that we don't depend on tqdm.
+            from tqdm.auto import tqdm
+            indices = tqdm(indices)
+        for i in indices:
+            t = torch.tensor([i] * shape[0], device=device)
+            with torch.no_grad():
+                out = self.p_sample(
+                    model,
+                    img,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    model_kwargs=model_kwargs,
+                )
+                yield out
+                img = out["sample"]
+    def p_sample_loop(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        top_p=None,
+        caption=None,
+    ):
+        """
+        Generate samples from the model.
+        :param model: the model module.
+        :param shape: the shape of the samples, (N, C, H, W).
+        :param noise: if specified, the noise from the encoder to sample.
+                      Should be of the same shape as `shape`.
+        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :param device: if specified, the device to create the samples on.
+                       If not specified, use a model parameter's device.
+        :param progress: if True, show a tqdm progress bar.
+        :return: a non-differentiable batch of samples.
+        """
+        final = None
+        for sample in self.p_sample_loop_progressive(
+            model,
+            shape,
+            noise=noise,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+            device=device,
+            progress=progress,
+            top_p=top_p,
+            caption=caption,
+        ):
+            final = sample
+        return final["sample"]
+    def p_sample_loop_progressive(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        top_p=None,
+        caption=None,
+    ):
+        """
+        Generate samples from the model and yield intermediate samples from
+        each timestep of diffusion.
+        Arguments are the same as p_sample_loop().
+        Returns a generator over dicts, where each dict is the return value of
+        p_sample().
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape, (tuple, list))
+        if noise is not None:
+            img = noise.to(device)
+        else:
+            img = torch.randn(*shape, device=device)
+        indices = list(range(self.num_timesteps))[::-1]
+        # print(indices[-10:])
+        # indices = indices[:-1]+[1,1,1,1,1,1,1]*60+[0]
+        # print(indices[-10:])
+        if progress:
+            # Lazy import so that we don't depend on tqdm.
+            from tqdm.auto import tqdm
+            indices = tqdm(indices)
+        if caption is not None:
+            print("Text Guiding Generation ......")
+            caption = (
+                caption[0].to(img.device),
+                caption[1].to(img.device),
+            )  # (caption_state, caption_mask)
+        for i in indices:
+            t = torch.tensor([i] * shape[0], device=device)
+            with torch.no_grad():
+                out = self.p_sample(
+                    model,
+                    img,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    model_kwargs=model_kwargs,
+                    top_p=top_p,
+                    caption=caption,
+                )
+                yield out
+                img = out["sample"]
+    def p_sample_loop_langevin_progressive(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        langevin_func=None,
+        top_p=None,
+    ):
+        """
+        Generate samples from the model and yield intermediate samples from
+        each timestep of diffusion.
+        Arguments are the same as p_sample_loop().
+        Returns a generator over dicts, where each dict is the return value of
+        p_sample().
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape, (tuple, list))
+        if noise is not None:
+            img = noise
+        else:
+            img = torch.randn(*shape, device=device)
+        indices = list(range(self.num_timesteps))[::-1]
+        if progress:
+            # Lazy import so that we don't depend on tqdm.
+            from tqdm.auto import tqdm
+            indices = tqdm(indices)
+        for i in indices:
+            t = torch.tensor([i] * shape[0], device=device)
+            with torch.no_grad():
+                out = self.p_sample(
+                    model,
+                    img,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    model_kwargs=model_kwargs,
+                    top_p=top_p,
+                )
+                if langevin_func is not None:
+                    out["t"] = t
+                    out["img"] = img
+                    out = langevin_func(out)
+                yield out
+                img = out["sample"]
+    def p_sample_loop_progressive_infill(
+        self,
+        model,
+        shape,
+        partial_enc,
+        partial_mask,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        greedy=False,
+    ):
+        """
+        Reverse-process sampling with part of the tensor pinned to known
+        values, yielding the (patched) p_sample() output of every timestep.
+        Positions where `partial_mask` is False are overwritten with the
+        corresponding entries of `partial_enc` after every step; positions
+        where it is True are left to the sampler.
+        :param model: the model module.
+        :param shape: the shape of the samples; shape[0] is the batch size.
+        :param partial_enc: tensor holding the known values.
+        :param partial_mask: boolean tensor used to index the sample
+            (presumably the same shape as the sample -- TODO confirm).
+        :param noise: if specified, used as the starting tensor as-is; the
+            known values are NOT injected before the first step in that case.
+        :param clip_denoised: forwarded to p_sample().
+        :param denoised_fn: forwarded to p_sample().
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model.
+        :param device: if specified, the device to create the samples on.
+                       If not specified, use a model parameter's device.
+        :param progress: if True, show a tqdm progress bar.
+        :param greedy: if True, continue from out["greedy_mean"] instead of
+            out["sample"] at every step.
+        :return: a generator over dicts; each dict is the p_sample() output
+            with its "sample" entry replaced by the patched tensor.
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape, (tuple, list))
+        if noise is not None:
+            img = noise
+        else:
+            # Start from pure noise, but seed the known positions with
+            # partial_enc diffused to the last timestep.
+            t_batch = torch.tensor([self.num_timesteps - 1] * shape[0], device=device)
+            partial_enc_with_noise = self.q_sample(partial_enc, t_batch)
+            img = torch.randn(*shape, device=device)
+            img[~partial_mask] = partial_enc_with_noise[~partial_mask]
+        indices = list(range(self.num_timesteps))[::-1]
+        if progress:
+            # Lazy import so that we don't depend on tqdm.
+            from tqdm.auto import tqdm
+            indices = tqdm(indices)
+        for i in indices:
+            t = torch.tensor([i] * shape[0], device=device)
+            with torch.no_grad():
+                out = self.p_sample(
+                    model,
+                    img,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    model_kwargs=model_kwargs,
+                )
+                # NOTE(review): partial_enc_with_noise is computed here but not
+                # read afterwards (the clean partial_enc is written below).
+                # q_sample presumably draws random noise, so deleting this
+                # call would shift the RNG stream -- confirm before removing.
+                if i > 0:
+                    partial_enc_with_noise = self.q_sample(partial_enc, t - 1)
+                else:
+                    partial_enc_with_noise = partial_enc
+                if greedy:
+                    img = out["greedy_mean"]
+                    # In-place write: this also modifies out["greedy_mean"].
+                    img[~partial_mask] = partial_enc[~partial_mask]
+                    out["sample"] = img
+                else:
+                    img = out["sample"]
+                    # Pin the known positions to their clean (un-noised) values.
+                    img[~partial_mask] = partial_enc[~partial_mask]
+                    out["sample"] = img
+                yield out
+    def p_sample_loop_progressive_merge(
+        self,
+        model,
+        shape,
+        partial_enc,
+        partial_mask,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        greedy=False,
+    ):
+        """
+        Reverse-process sampling with part of the tensor pinned to known
+        values, yielding the (patched) p_sample() output of every timestep.
+        NOTE(review): the body is currently statement-for-statement the same
+        as p_sample_loop_progressive_infill(); any intended "merge" behaviour
+        is not implemented here.
+        Positions where `partial_mask` is False are overwritten with the
+        corresponding entries of `partial_enc` after every step.
+        :param model: the model module.
+        :param shape: the shape of the samples; shape[0] is the batch size.
+        :param partial_enc: tensor holding the known values.
+        :param partial_mask: boolean tensor used to index the sample
+            (presumably the same shape as the sample -- TODO confirm).
+        :param noise: if specified, used as the starting tensor as-is; the
+            known values are NOT injected before the first step in that case.
+        :param clip_denoised: forwarded to p_sample().
+        :param denoised_fn: forwarded to p_sample().
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model.
+        :param device: if specified, the device to create the samples on.
+                       If not specified, use a model parameter's device.
+        :param progress: if True, show a tqdm progress bar.
+        :param greedy: if True, continue from out["greedy_mean"] instead of
+            out["sample"] at every step.
+        :return: a generator over dicts; each dict is the p_sample() output
+            with its "sample" entry replaced by the patched tensor.
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape, (tuple, list))
+        if noise is not None:
+            img = noise
+        else:
+            # Start from pure noise, but seed the known positions with
+            # partial_enc diffused to the last timestep.
+            t_batch = torch.tensor([self.num_timesteps - 1] * shape[0], device=device)
+            partial_enc_with_noise = self.q_sample(partial_enc, t_batch)
+            img = torch.randn(*shape, device=device)
+            img[~partial_mask] = partial_enc_with_noise[~partial_mask]
+        indices = list(range(self.num_timesteps))[::-1]
+        if progress:
+            # Lazy import so that we don't depend on tqdm.
+            from tqdm.auto import tqdm
+            indices = tqdm(indices)
+        for i in indices:
+            t = torch.tensor([i] * shape[0], device=device)
+            with torch.no_grad():
+                out = self.p_sample(
+                    model,
+                    img,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    model_kwargs=model_kwargs,
+                )
+                # NOTE(review): partial_enc_with_noise is computed here but not
+                # read afterwards (the clean partial_enc is written below).
+                # q_sample presumably draws random noise, so deleting this
+                # call would shift the RNG stream -- confirm before removing.
+                if i > 0:
+                    partial_enc_with_noise = self.q_sample(partial_enc, t - 1)
+                else:
+                    partial_enc_with_noise = partial_enc
+                if greedy:
+                    img = out["greedy_mean"]
+                    # In-place write: this also modifies out["greedy_mean"].
+                    img[~partial_mask] = partial_enc[~partial_mask]
+                    out["sample"] = img
+                else:
+                    img = out["sample"]
+                    # Pin the known positions to their clean (un-noised) values.
+                    img[~partial_mask] = partial_enc[~partial_mask]
+                    out["sample"] = img
+                yield out
+    def ddim_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        model_kwargs=None,
+        eta=0.0,
+        langevin_fn=None,
+        caption=None,
+    ):
+        """
+        Sample x_{t-1} from the model using DDIM.
+        Same usage as p_sample().
+        :param model: the model module.
+        :param x: the current tensor x_t.
+        :param t: a batch of timestep indices.
+        :param clip_denoised: forwarded to p_mean_variance().
+        :param denoised_fn: forwarded to p_mean_variance().
+        :param model_kwargs: forwarded to p_mean_variance().
+        :param eta: scale of the injected noise; with the default 0.0 sigma is
+            zero and the step is deterministic.
+        :param langevin_fn: if truthy, called as
+            langevin_fn(sample, mean_pred, sigma, alphas_cumprod_prev[t[0]], t, x)
+            and its return value replaces the sample.
+        :param caption: forwarded to p_mean_variance().
+        :return: a dict with keys "sample" and "pred_xstart".
+        """
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+            caption=caption,
+        )
+        # Usually our model outputs epsilon, but we re-derive it
+        # in case we used x_start or x_prev prediction.
+        eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
+        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
+        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
+        sigma = (
+            eta
+            * torch.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
+            * torch.sqrt(1 - alpha_bar / alpha_bar_prev)
+        )
+        # Equation 12.
+        noise = torch.randn_like(x)
+        mean_pred = (
+            out["pred_xstart"] * torch.sqrt(alpha_bar_prev)
+            + torch.sqrt(1 - alpha_bar_prev - sigma**2) * eps
+        )
+        nonzero_mask = (
+            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
+        )  # no noise when t == 0
+        sample = mean_pred + nonzero_mask * sigma * noise
+        if langevin_fn:
+            # NOTE(review): leftover debug output; prints on every step when a
+            # langevin_fn is supplied.
+            print(t.shape)
+            # Only the first batch element's timestep (t[0]) is used to look
+            # up alphas_cumprod_prev for the hook.
+            sample = langevin_fn(
+                sample, mean_pred, sigma, self.alphas_cumprod_prev[t[0]], t, x
+            )
+        return {"sample": sample, "pred_xstart": out["pred_xstart"]}
+    def ddim_reverse_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        model_kwargs=None,
+        eta=0.0,
+    ):
+        """
+        Sample x_{t+1} from the model using DDIM reverse ODE.
+        """
+        assert eta == 0.0, "Reverse ODE only for deterministic path"
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+        )
+        # Usually our model outputs epsilon, but we re-derive it
+        # in case we used x_start or x_prev prediction.
+        eps = (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
+            - out["pred_xstart"]
+        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
+        alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
+        # Equation 12. reversed
+        mean_pred = (
+            out["pred_xstart"] * torch.sqrt(alpha_bar_next)
+            + torch.sqrt(1 - alpha_bar_next) * eps
+        )
+        return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
+    def ddim_sample_loop(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        eta=0.0,
+        top_p=-1.0,
+        langevin_fn=None,
+        caption=None,
+    ):
+        """
+        Generate samples from the model using DDIM.
+        Same usage as p_sample_loop().
+        """
+        final = None
+        for sample in self.ddim_sample_loop_progressive(
+            model,
+            shape,
+            noise=noise,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+            device=device,
+            progress=progress,
+            eta=eta,
+            langevin_fn=langevin_fn,
+            caption=caption,
+        ):
+            final = sample
+        return final["sample"]
+    def ddim_sample_loop_progressive(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        eta=0.0,
+        langevin_fn=None,
+        caption=None,
+    ):
+        """
+        Use DDIM to sample from the model and yield intermediate samples from
+        each timestep of DDIM.
+        Same usage as p_sample_loop_progressive().
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape, (tuple, list))
+        if noise is not None:
+            img = noise
+        else:
+            img = torch.randn(*shape, device=device)
+        indices = list(range(self.num_timesteps))[::-1]
+        if caption is not None:
+            print("Text Guiding Generation ......")
+            caption = (
+                caption[0].to(img.device),
+                caption[1].to(img.device),
+            )  # (caption_state, caption_mask)
+        if progress:
+            # Lazy import so that we don't depend on tqdm.
+            from tqdm.auto import tqdm
+            indices = tqdm(indices)
+        for i in indices:
+            t = torch.tensor([i] * shape[0], device=device)
+            with torch.no_grad():
+                out = self.ddim_sample(
+                    model,
+                    img,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    model_kwargs=model_kwargs,
+                    eta=eta,
+                    langevin_fn=langevin_fn,
+                    caption=caption,
+                )
+                yield out
+                img = out["sample"]
+    def _vb_terms_bpd(
+        self,
+        model,
+        x_start,
+        x_t,
+        t,
+        clip_denoised=True,
+        model_kwargs=None,
+        noise=None,
+        denoised_fn=None,
+    ):
+        """
+        Get a term for the variational lower-bound.
+        The resulting units are bits (rather than nats, as one might expect).
+        This allows for comparison to other papers.
+        If `model_kwargs` contains "input_ids", the t == 0 term is computed
+        with a mapping function (taken from model_kwargs["mapping_func"], or
+        self.mapping_func when absent) instead of the discretized Gaussian
+        log-likelihood.
+        :param model: the model module.
+        :param x_start: the clean inputs x_0.
+        :param x_t: the noised inputs at timestep t.
+        :param t: a batch of timestep indices.
+        :param clip_denoised: forwarded to p_mean_variance().
+        :param model_kwargs: extra keyword arguments for the model; see above
+            for the two keys that are intercepted.
+        :param noise: not used by this method.
+        :param denoised_fn: forwarded to p_mean_variance().
+        :return: a dict with the following keys:
+                 - 'output': a shape [N] tensor of NLLs or KLs.
+                 - 'pred_xstart': the x_0 predictions.
+        """
+        true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
+            x_start=x_start, x_t=x_t, t=t
+        )
+        if model_kwargs is not None and "input_ids" in model_kwargs:
+            # Temporarily remove these entries so that they are not forwarded
+            # through p_mean_variance(); they are put back further down.
+            input_ids = model_kwargs.pop("input_ids")
+            mapping_func = model_kwargs.pop("mapping_func", self.mapping_func)
+        else:
+            # mapping_func stays unbound here; it is only read when
+            # input_ids is not None.
+            input_ids = None
+        out = self.p_mean_variance(
+            model,
+            x_t,
+            t,
+            clip_denoised=clip_denoised,
+            model_kwargs=model_kwargs,
+            denoised_fn=denoised_fn,
+        )
+        kl = normal_kl(
+            true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
+        )
+        # Divide by ln(2) to convert nats to bits.
+        kl = mean_flat(kl) / np.log(2.0)
+        if input_ids is not None:
+            assert mapping_func is not None
+            # The mapping function is only evaluated when at least one batch
+            # element is at t == 0; otherwise a zero placeholder is used
+            # (it is discarded by the torch.where below).
+            if mapping_func is not None and torch.any(t == 0):
+                decoder_nll = mapping_func(out["mean"], input_ids) / out["mean"].size(
+                    -1
+                )
+            else:
+                decoder_nll = torch.zeros_like(x_start)
+            # Restore the caller's dict. Note that "mapping_func" is written
+            # back even if the caller did not supply it.
+            model_kwargs["input_ids"] = input_ids
+            model_kwargs["mapping_func"] = mapping_func
+        else:
+            decoder_nll = -discretized_gaussian_log_likelihood(
+                x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
+            )
+            assert decoder_nll.shape == x_start.shape
+        decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
+        # At the first timestep return the decoder NLL,
+        # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
+        output = torch.where((t == 0), decoder_nll, kl)
+        return {"output": output, "pred_xstart": out["pred_xstart"]}
+    def _vb_terms_bpd_e2e(
+        self,
+        model,
+        x_start,
+        x_t,
+        t,
+        input_ids,
+        get_logits,
+        x_start_mean,
+        x_start_log_var,
+        clip_denoised=True,
+        model_kwargs=None,
+        noise=None,
+        denoised_fn=None,
+    ):
+        """
+        Get a term for the variational lower-bound.
+        The resulting units are bits (rather than nats, as one might expect).
+        This allows for comparison to other papers.
+        :return: a dict with the following keys:
+                 - 'output': a shape [N] tensor of NLLs or KLs.
+                 - 'pred_xstart': the x_0 predictions.
+        """
+        # lambda *args, r=frozen_out: r,
+        true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
+            x_start=x_start, x_t=x_t, t=t
+        )
+        assert input_ids is not None
+        mapping_func = model_kwargs.pop("mapping_func", self.mapping_func)
+        # assert 'input_ids' in model_kwargs
+        # input_ids = model_kwargs.pop('input_ids')
+        out = self.p_mean_variance(
+            model,
+            x_t,
+            t,
+            clip_denoised=clip_denoised,
+            model_kwargs=model_kwargs,
+            denoised_fn=denoised_fn,
+        )
+        # print(true_log_variance_clipped[0], out["log_variance"][0], 'line1259')
+        kl = normal_kl(
+            true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
+        )
+        kl = mean_flat(kl) / np.log(2.0)
+        decoder_nll = self.token_discrete_loss(x_start, get_logits, input_ids)  # t=-1
+        decoder_nll = decoder_nll / out["mean"].size(-1)
+        decoder_nll = decoder_nll / np.log(2.0)
+        mask_1 = t == 0
+        if mask_1.any():
+            kl_T = normal_kl(
+                x_start_mean, x_start_log_var, out["mean"], out["log_variance"]
+            )
+            kl_T = mean_flat(kl_T) / np.log(2.0)
+            kl = torch.where(mask_1, kl_T, kl)
+        out_mean, out_variance, out_log_variance_clipped = self.q_mean_variance(
+            x_start, torch.LongTensor([self.num_timesteps - 1]).to(x_start.device)
+        )
+        kl_T = normal_kl(out_mean, out_log_variance_clipped, 0, 0)
+        kl_T = mean_flat(kl_T) / np.log(2.0)
+        # print(decoder_nll, )
+        # print()
+        # At the first timestep return the decoder NLL,
+        # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
+        # output =torch.where((t == 0), decoder_nll, kl)
+        output = kl + decoder_nll + kl_T
+        return {
+            "output": output,
+            "pred_xstart": out["pred_xstart"],
+            "kl": kl,
+            "decoder_nll": decoder_nll,
+            "kl_T": kl_T,
+        }
+    def get_x_start(self, x_start_mean, std):
+        """
+        Using the interpolating policy OR using the convolution policy...
+        :param x_start_mean:
+        :return:
+        """
+        noise = torch.randn_like(x_start_mean)
+        # print(std.shape, noise.shape, x_start_mean.shape)
+        assert noise.shape == x_start_mean.shape
+        # print(x_start_mean.device, noise.device)
+        return x_start_mean + std * noise
+    def token_discrete_loss(self, x_t, get_logits, input_ids):
+        if self.model_arch == "conv-unet" or self.model_arch == "1d-unet":
+            reshaped_x_t = x_t.view(x_t.size(0), x_t.size(1), -1).permute(0, 2, 1)
+        else:
+            # print(x_t.shape)
+            reshaped_x_t = x_t
+        # logits = get_logits(reshaped_x_t)  # bsz, seqlen, vocab
+        logits = get_logits(reshaped_x_t)
+        loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
+        decoder_nll = loss_fct(
+            logits.view(-1, logits.size(-1)), input_ids.view(-1)
+        ).view(input_ids.shape)
+        decoder_nll = decoder_nll.mean(dim=-1)
+        return decoder_nll
+    def x0_helper(self, model_output, x, t):
+        if self.model_mean_type == ModelMeanType.PREVIOUS_X:
+            pred_xstart = self._predict_xstart_from_xprev(
+                x_t=x, t=t, xprev=model_output
+            )
+            pred_prev = model_output
+        elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]:
+            if self.model_mean_type == ModelMeanType.START_X:
+                pred_xstart = model_output
+            else:
+                pred_xstart = self._predict_xstart_from_eps(
+                    x_t=x, t=t, eps=model_output
+                )
+            pred_prev, _, _ = self.q_posterior_mean_variance(
+                x_start=pred_xstart, x_t=x, t=t
+            )
+        else:
+            raise NotImplementedError(self.model_mean_type)
+        return {"pred_xprev": pred_prev, "pred_xstart": pred_xstart}
+    def training_losses_e2e(self, model, micro, t, noise=None):
+        """
+        The function `training_losses_e2e` calculates various loss terms for an end-to-end training
+        process in a machine learning model.
+        :param model: The `model` parameter in the `training_losses_e2e` function seems to be an
+        instance of a model used for training. It is likely a neural network model that is being trained
+        for a specific task, such as sequence generation or prediction. The model is used within the
+        function to make predictions
+        :param micro: The `micro` parameter in the `training_losses_e2e` function seems to be a tuple
+        containing the following elements:
+        :param t: The `t` parameter in the `training_losses_e2e` function seems to represent the time
+        step or timestep index. It is used to determine certain conditions within the function, such as
+        comparing it to a threshold value of 400 and scaling timesteps. The function performs various
+        calculations and computations based
+        :param noise: The `noise` parameter in the `training_losses_e2e` function is used to pass a
+        tensor representing random noise. If the `noise` parameter is not provided when calling the
+        function, it generates random noise using `torch.randn_like(mix_start)`. This noise is then used
+        in the
+        :return: The function `training_losses_e2e` returns a dictionary `terms` containing different
+        loss terms based on the specified loss type. The specific terms included in the dictionary
+        depend on the conditions and calculations performed within the function for the given loss type.
+        The function calculates and populates the `terms` dictionary with relevant loss values such as
+        mean squared error (mse), variational bound (vb), decoder negative
+        """
+        selfies_ids = micro[0]
+        caption_state = micro[1]
+        caption_mask = micro[2]
+        corrupted_selfies_ids = micro[3]
+        assert corrupted_selfies_ids.shape == selfies_ids.shape
+        #########################################
+        mix_ids = torch.where(
+            t.reshape(-1, 1) < 400, corrupted_selfies_ids, selfies_ids
+        )
+        if t.max() > self.maxt:
+            self.maxt = t.max()
+            # print("Recieving max t:{}".format(self.maxt))
+        ##########################################
+        # print(f"Model dir: {dir(model)}")
+        try:
+            x_start_mean = model.model.get_embeds(selfies_ids)
+            mix_start_mean = model.model.get_embeds(mix_ids)
+        except:
+            x_start_mean = model.model.module.get_embeds(selfies_ids)
+            mix_start_mean = model.model.module.get_embeds(mix_ids)
+        std = _extract_into_tensor(
+            self.sqrt_one_minus_alphas_cumprod,
+            torch.tensor([0]).to(x_start_mean.device),
+            x_start_mean.shape,
+        )
+        x_start = self.get_x_start(x_start_mean, std)
+        mix_start = self.get_x_start(mix_start_mean, std)
+        if noise is None:
+            noise = torch.randn_like(mix_start)
+        x_t = self.q_sample(mix_start, t, noise=noise)  # reparametrization trick.
+        try:
+            get_logits = model.model.get_logits
+        except:
+            get_logits = model.model.module.get_logits
+        terms = {}
+        if self.loss_type == LossType.E2E_KL:
+            pass
+        elif (
+            self.loss_type == LossType.E2E_MSE
+            or self.loss_type == LossType.E2E_RESCALED_MSE
+        ):
+            model_output = model(
+                x_t, self._scale_timesteps(t), caption_state, caption_mask
+            )
+            if self.model_var_type in [
+                ModelVarType.LEARNED,
+                ModelVarType.LEARNED_RANGE,
+            ]:
+                pass
+            target = {
+                # ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
+                #    x_start=x_start, x_t=x_t, t=t
+                # )[0],
+                ModelMeanType.START_X: x_start,
+                ModelMeanType.EPSILON: noise,
+            }[
+                self.model_mean_type
+            ]  # this is exactly x_start
+            # print(model_output.shape ,target.shape , x_start.shape)
+            assert model_output.shape == target.shape == x_start.shape
+            terms["mse"] = mean_flat((target - model_output) ** 2)
+            # print( terms["mse"])
+            model_out_x_start = self.x0_helper(model_output, x_t, t)[
+                "pred_xstart"
+            ]  # this is exactly model_output
+            t0_mask = t == 0
+            t0_loss = mean_flat((x_start_mean - model_out_x_start) ** 2)
+            # print(terms["mse"].shape, )
+            terms["mse"] = torch.where(t0_mask, t0_loss, terms["mse"])
+            # tT_mask = (t == self.num_timesteps - 1)
+            out_mean, _, _ = self.q_mean_variance(
+                x_start, torch.LongTensor([self.num_timesteps - 1]).to(x_start.device)
+            )
+            tT_loss = mean_flat(out_mean**2)
+            decoder_nll = self.token_discrete_loss(x_start, get_logits, selfies_ids)
+            if "vb" in terms:
+                terms["loss"] = terms["mse"] + terms["vb"]
+            else:
+                terms["loss"] = terms["mse"] + (decoder_nll + tT_loss)
+        else:
+            raise NotImplementedError(self.loss_type)
+        return terms
+    def _prior_bpd(self, x_start):
+        """
+        Get the prior KL term for the variational lower-bound, measured in
+        bits-per-dim.
+        This term can't be optimized, as it only depends on the encoder.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :return: a batch of [N] KL values (in bits), one per batch element.
+        """
+        batch_size = x_start.shape[0]
+        t = torch.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
+        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
+        kl_prior = normal_kl(
+            mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
+        )
+        return mean_flat(kl_prior) / np.log(2.0)
+    def calc_bpd_loop_e2e(
+        self, model, x_start, clip_denoised=True, model_kwargs=None, denoised_fn=None
+    ):
+        """
+        Evaluate the end-to-end variational bound over every timestep.
+        :param model: the model; must expose `get_embeds` and `get_logits`.
+        :param x_start: only its device and batch size are used; the actual
+            x_0 is re-sampled from the embeddings of the input ids.
+        :param clip_denoised: forwarded to _vb_terms_bpd_e2e().
+        :param model_kwargs: dict that must contain "input_ids". That entry is
+            popped from the caller's dict, so None is not accepted despite
+            being the default.
+        :param denoised_fn: forwarded to _vb_terms_bpd_e2e().
+        :return: a dict containing the following keys:
+                 - total_bpd: the sum over the columns of `vb`.
+                 - prior_bpd: the 'kl_T' term reported by the last step.
+                 - vb: stacked terms, in order: kl_T, the per-timestep KLs
+                   from the last timestep down to 0, then the decoder NLL.
+                 - xstart_mse: per-timestep MSE of the x_0 predictions.
+                 - mse: per-timestep MSE of the epsilon predictions.
+        """
+        device = x_start.device
+        batch_size = x_start.shape[0]
+        input_ids = model_kwargs.pop("input_ids").to(device)
+        x_start_mean = model.get_embeds(input_ids)
+        # Rearrange the embeddings into the layout each architecture expects.
+        if self.model_arch == "conv-unet":
+            # Tokens are laid out on a square grid, channels first.
+            seqlen = int(np.sqrt(input_ids.size(1)))
+            x_start_mean = x_start_mean.view(
+                x_start_mean.size(0), seqlen, seqlen, x_start_mean.size(-1)
+            ).permute(0, 3, 1, 2)
+        elif self.model_arch == "1d-unet":
+            x_start_mean = x_start_mean.permute(0, 2, 1)
+        # Standard deviation of the forward process at timestep 0.
+        std = _extract_into_tensor(
+            self.sqrt_one_minus_alphas_cumprod,
+            torch.tensor([0]).to(x_start_mean.device),
+            x_start_mean.shape,
+        )
+        x_start_log_var = 2 * torch.log(std)
+        # From here on x_start is a sample around the embedding mean.
+        x_start = self.get_x_start(x_start_mean, std)
+        get_logits = model.get_logits
+        vb = []
+        xstart_mse = []
+        mse = []
+        for t in list(range(self.num_timesteps))[::-1]:
+            t_batch = torch.tensor([t] * batch_size, device=device)
+            noise = torch.randn_like(x_start)
+            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
+            with torch.no_grad():
+                out = self._vb_terms_bpd_e2e(
+                    model,
+                    x_start=x_start,
+                    x_t=x_t,
+                    t=t_batch,
+                    input_ids=input_ids,
+                    get_logits=get_logits,
+                    x_start_mean=x_start_mean,
+                    x_start_log_var=x_start_log_var,
+                    clip_denoised=clip_denoised,
+                    model_kwargs=model_kwargs,
+                    noise=noise,
+                    denoised_fn=denoised_fn,
+                )
+            # The prior term is recorded once, on the first (largest-t) pass.
+            if t == self.num_timesteps - 1:
+                assert len(vb) == 0
+                vb.append(out["kl_T"])
+            vb.append(out["kl"])
+            xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
+            eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
+            mse.append(mean_flat((eps - noise) ** 2))
+        # `out` is the result of the final (t == 0) iteration here.
+        vb.append(out["decoder_nll"])
+        vb = torch.stack(vb, dim=1)
+        xstart_mse = torch.stack(xstart_mse, dim=1)
+        mse = torch.stack(mse, dim=1)
+        prior_bpd = out["kl_T"]
+        total_bpd = vb.sum(dim=1)
+        return {
+            "total_bpd": total_bpd,
+            "prior_bpd": prior_bpd,
+            "vb": vb,
+            "xstart_mse": xstart_mse,
+            "mse": mse,
+        }
+    def calc_bpd_loop_emb(
+        self, model, x_start, clip_denoised=True, model_kwargs=None, denoised_fn=None
+    ):
+        """
+        Compute the entire variational lower-bound, measured in bits-per-dim,
+        as well as other related quantities, starting directly from `x_start`.
+        Every timestep is visited once, from `num_timesteps - 1` down to 0. At
+        each step `x_start` is noised with `q_sample` and the per-step term is
+        evaluated by `_vb_terms_bpd` under `torch.no_grad()`.
+        :param model: the model to evaluate loss on.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :param clip_denoised: if True, clip denoised samples.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :param denoised_fn: forwarded unchanged to `_vb_terms_bpd`.
+        :return: a dict containing the following keys:
+                 - total_bpd: the total variational lower-bound, per batch element.
+                 - prior_bpd: the prior term in the lower-bound.
+                 - vb: an [N x T] tensor of terms in the lower-bound.
+                 - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
+                 - mse: an [N x T] tensor of epsilon MSEs for each timestep.
+                 The T axis follows the loop order: column 0 holds timestep
+                 `num_timesteps - 1` and the last column holds timestep 0.
+        """
+        device = x_start.device
+        batch_size = x_start.shape[0]
+        vb = []
+        xstart_mse = []
+        mse = []
+        # Walk the chain backwards, from the noisiest timestep to t = 0.
+        for t in list(range(self.num_timesteps))[::-1]:
+            t_batch = torch.tensor([t] * batch_size, device=device)
+            # Fresh noise per step; the same sample is reused below as the
+            # target for the epsilon MSE.
+            noise = torch.randn_like(x_start)
+            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
+            # Calculate VLB term at the current timestep
+            with torch.no_grad():
+                out = self._vb_terms_bpd(
+                    model,
+                    x_start=x_start,
+                    x_t=x_t,
+                    t=t_batch,
+                    clip_denoised=clip_denoised,
+                    model_kwargs=model_kwargs,
+                    noise=noise,
+                    denoised_fn=denoised_fn,
+                )
+            vb.append(out["output"])
+            xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
+            # Convert the predicted x_0 back into the noise it implies so it
+            # can be compared with the noise that was actually injected.
+            eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
+            mse.append(mean_flat((eps - noise) ** 2))
+        # Stack the per-timestep lists along a new axis 1.
+        vb = torch.stack(vb, dim=1)
+        xstart_mse = torch.stack(xstart_mse, dim=1)
+        mse = torch.stack(mse, dim=1)
+        prior_bpd = self._prior_bpd(x_start)
+        total_bpd = vb.sum(dim=1) + prior_bpd
+        return {
+            "total_bpd": total_bpd,
+            "prior_bpd": prior_bpd,
+            "vb": vb,
+            "xstart_mse": xstart_mse,
+            "mse": mse,
+        }
+def _extract_into_tensor(arr, timesteps, broadcast_shape):
+    """
+    Extract values from a 1-D numpy array for a batch of indices.
+    :param arr: the 1-D numpy array.
+    :param timesteps: a tensor of indices into the array to extract.
+    :param broadcast_shape: a larger shape of K dimensions with the batch
+                            dimension equal to the length of timesteps.
+    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
+    """
+    res = torch.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
+    while len(res.shape) < len(broadcast_shape):
+        res = res[..., None]
+    return res.expand(broadcast_shape)

src/improved_diffusion/image_datasets.py ADDED Viewed

	@@ -0,0 +1,120 @@

+from PIL import Image
+import blobfile as bf
+from mpi4py import MPI
+import numpy as np
+from torch.utils.data import DataLoader, Dataset
+def load_data(
+    *, data_dir, batch_size, image_size, class_cond=False, deterministic=False, permutation=None
+):
+    """
+    For a dataset, create a generator over (images, kwargs) pairs.
+    Each images is an NCHW float tensor, and the kwargs dict contains zero or
+    more keys, each of which map to a batched Tensor of their own.
+    The kwargs dict can be used for class labels, in which case the key is "y"
+    and the values are integer tensors of class labels.
+    :param data_dir: a dataset directory.
+    :param batch_size: the batch size of each returned pair.
+    :param image_size: the size to which images are resized.
+    :param class_cond: if True, include a "y" key in returned dicts for class
+                       label. If classes are not available and this is true, an
+                       exception will be raised.
+    :param deterministic: if True, yield results in a deterministic order.
+    """
+    if not data_dir:
+        raise ValueError("unspecified data directory")
+    all_files = _list_image_files_recursively(data_dir)
+    classes = None
+    if class_cond:
+        # Assume classes are the first part of the filename,
+        # before an underscore.
+        class_names = [bf.basename(path).split("_")[0] for path in all_files]
+        sorted_classes = {x: i for i, x in enumerate(sorted(set(class_names)))}
+        classes = [sorted_classes[x] for x in class_names]
+    dataset = ImageDataset(
+        image_size,
+        all_files,
+        classes=classes,
+        shard=MPI.COMM_WORLD.Get_rank(),
+        num_shards=MPI.COMM_WORLD.Get_size(),
+        permutation=permutation,
+    )
+    if deterministic:
+        loader = DataLoader(
+            dataset, batch_size=batch_size, shuffle=False, num_workers=1, drop_last=True
+        )
+    else:
+        loader = DataLoader(
+            dataset, batch_size=batch_size, shuffle=True, num_workers=1, drop_last=True
+        )
+    while True:
+        yield from loader
+def _list_image_files_recursively(data_dir):
+    results = []
+    for entry in sorted(bf.listdir(data_dir)):
+        full_path = bf.join(data_dir, entry)
+        ext = entry.split(".")[-1]
+        if "." in entry and ext.lower() in ["jpg", "jpeg", "png", "gif"]:
+            results.append(full_path)
+        elif bf.isdir(full_path):
+            results.extend(_list_image_files_recursively(full_path))
+    return results
+class ImageDataset(Dataset):
+    """
+    Dataset over a list of image paths that yields square, centre-cropped
+    images as channel-first float arrays together with a dict of extras.
+    Only every `num_shards`-th path starting at index `shard` is kept, so
+    different shards see disjoint subsets of the files.
+    """
+    def __init__(self, resolution, image_paths, classes=None, shard=0, num_shards=1, permutation=None):
+        super().__init__()
+        self.resolution = resolution
+        # Strided slice: this shard's share of the files (and of the labels).
+        self.local_images = image_paths[shard:][::num_shards]
+        self.local_classes = None if classes is None else classes[shard:][::num_shards]
+        # Optional lookup applied to raw pixel values in __getitem__.
+        # NOTE(review): presumably an array indexable by 0..255 -- confirm.
+        self.permutation = permutation
+    def __len__(self):
+        return len(self.local_images)
+    def __getitem__(self, idx):
+        """
+        Load, resize and crop image `idx`.
+        :return: a tuple (array, out_dict); the array is transposed to
+                 channel-first order and out_dict holds "y" when class labels
+                 were supplied.
+        """
+        path = self.local_images[idx]
+        with bf.BlobFile(path, "rb") as f:
+            pil_image = Image.open(f)
+            # Force the pixel data to be read while the file is still open.
+            pil_image.load()
+        # We are not on a new enough PIL to support the `reducing_gap`
+        # argument, which uses BOX downsampling at powers of two first.
+        # Thus, we do it by hand to improve downsample quality.
+        while min(*pil_image.size) >= 2 * self.resolution:
+            pil_image = pil_image.resize(
+                tuple(x // 2 for x in pil_image.size), resample=Image.BOX
+            )
+        # Final resize so that the shorter side equals the target resolution.
+        scale = self.resolution / min(*pil_image.size)
+        pil_image = pil_image.resize(
+            tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
+        )
+        arr = np.array(pil_image.convert("RGB"))
+        # Centre crop to resolution x resolution.
+        crop_y = (arr.shape[0] - self.resolution) // 2
+        crop_x = (arr.shape[1] - self.resolution) // 2
+        arr = arr[crop_y : crop_y + self.resolution, crop_x : crop_x + self.resolution]
+        if self.permutation is not None: # pixel value permutation.
+            # Each pixel value is used as an index into the permutation.
+            arr = self.permutation[arr]
+        # Map 0..255 to [-1, 1] (assumes 8-bit pixel values -- TODO confirm
+        # for permuted inputs).
+        arr = arr.astype(np.float32) / 127.5 - 1
+        out_dict = {}
+        if self.local_classes is not None:
+            out_dict["y"] = np.array(self.local_classes[idx], dtype=np.int64)
+        return np.transpose(arr, [2, 0, 1]), out_dict

src/improved_diffusion/logger.py ADDED Viewed

	@@ -0,0 +1,498 @@

+"""
+Logger copied from OpenAI baselines to avoid extra RL-based dependencies:
+https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/logger.py
+"""
+import os
+import sys
+import shutil
+import os.path as osp
+import json
+import time
+import datetime
+import tempfile
+import warnings
+from collections import defaultdict
+from contextlib import contextmanager
+import wandb
+# Log levels. Logger.log emits a message when the logger's own level is less
+# than or equal to the level of the message.
+DEBUG = 10
+INFO = 20
+WARN = 30
+ERROR = 40
+DISABLED = 50
+class KVWriter(object):
+    def writekvs(self, kvs):
+        raise NotImplementedError
+class SeqWriter(object):
+    def writeseq(self, seq):
+        raise NotImplementedError
+class HumanOutputFormat(KVWriter, SeqWriter):
+    def __init__(self, filename_or_file):
+        if isinstance(filename_or_file, str):
+            self.file = open(filename_or_file, "wt")
+            self.own_file = True
+        else:
+            assert hasattr(filename_or_file, "read"), (
+                "expected file or str, got %s" % filename_or_file
+            )
+            self.file = filename_or_file
+            self.own_file = False
+    def writekvs(self, kvs):
+        # Create strings for printing
+        key2str = {}
+        for (key, val) in sorted(kvs.items()):
+            if hasattr(val, "__float__"):
+                valstr = "%-8.3g" % val
+            else:
+                valstr = str(val)
+            key2str[self._truncate(key)] = self._truncate(valstr)
+        # Find max widths
+        if len(key2str) == 0:
+            print("WARNING: tried to write empty key-value dict")
+            return
+        else:
+            keywidth = max(map(len, key2str.keys()))
+            valwidth = max(map(len, key2str.values()))
+        # Write out the data
+        dashes = "-" * (keywidth + valwidth + 7)
+        lines = [dashes]
+        for (key, val) in sorted(key2str.items(), key=lambda kv: kv[0].lower()):
+            lines.append(
+                "| %s%s | %s%s |"
+                % (key, " " * (keywidth - len(key)), val, " " * (valwidth - len(val)))
+            )
+        lines.append(dashes)
+        self.file.write("\n".join(lines) + "\n")
+        # Flush the output to the file
+        self.file.flush()
+    def _truncate(self, s):
+        maxlen = 30
+        return s[: maxlen - 3] + "..." if len(s) > maxlen else s
+    def writeseq(self, seq):
+        seq = list(seq)
+        for (i, elem) in enumerate(seq):
+            self.file.write(elem)
+            if i < len(seq) - 1:  # add space unless this is the last one
+                self.file.write(" ")
+        self.file.write("\n")
+        self.file.flush()
+    def close(self):
+        if self.own_file:
+            self.file.close()
+class JSONOutputFormat(KVWriter):
+    def __init__(self, filename):
+        self.file = open(filename, "wt")
+    def writekvs(self, kvs):
+        for k, v in sorted(kvs.items()):
+            if hasattr(v, "dtype"):
+                kvs[k] = float(v)
+        self.file.write(json.dumps(kvs) + "\n")
+        self.file.flush()
+    def close(self):
+        self.file.close()
+class CSVOutputFormat(KVWriter):
+    """
+    Writes dumps as CSV rows, widening the header whenever new keys appear.
+    Values are written with str() and are not quoted or escaped.
+    """
+    def __init__(self, filename):
+        # "w+t": truncate, and allow reading back so the file can be
+        # rewritten when the set of columns grows.
+        self.file = open(filename, "w+t")
+        self.keys = []
+        self.sep = ","
+    def writekvs(self, kvs):
+        # Add our current row to the history
+        extra_keys = list(kvs.keys() - self.keys)
+        extra_keys.sort()
+        if extra_keys:
+            # New columns: re-read what was written so far, then rewrite the
+            # header and pad every earlier row with empty cells.
+            self.keys.extend(extra_keys)
+            self.file.seek(0)
+            lines = self.file.readlines()
+            self.file.seek(0)
+            for (i, k) in enumerate(self.keys):
+                if i > 0:
+                    self.file.write(",")
+                self.file.write(k)
+            self.file.write("\n")
+            # lines[0] is the old header, already replaced above.
+            for line in lines[1:]:
+                # line[:-1] drops the trailing newline before padding.
+                self.file.write(line[:-1])
+                self.file.write(self.sep * len(extra_keys))
+                self.file.write("\n")
+        # Append the current row; keys missing from kvs (or None) stay empty.
+        for (i, k) in enumerate(self.keys):
+            if i > 0:
+                self.file.write(",")
+            v = kvs.get(k)
+            if v is not None:
+                self.file.write(str(v))
+        self.file.write("\n")
+        self.file.flush()
+    def close(self):
+        self.file.close()
+class TensorBoardOutputFormat(KVWriter):
+    """
+    Dumps key/value pairs into TensorBoard's numeric format.
+    TensorFlow is imported inside __init__, so it is only needed when this
+    format is actually constructed.
+    NOTE(review): `tf.Summary` and `pywrap_tensorflow.EventsWriter` look like
+    TensorFlow 1.x era APIs -- confirm against the installed version.
+    """
+    def __init__(self, dir):
+        os.makedirs(dir, exist_ok=True)
+        self.dir = dir
+        # Step counter attached to each event; advanced once per writekvs.
+        self.step = 1
+        prefix = "events"
+        path = osp.join(osp.abspath(dir), prefix)
+        import tensorflow as tf
+        from tensorflow.python import pywrap_tensorflow
+        from tensorflow.core.util import event_pb2
+        from tensorflow.python.util import compat
+        self.tf = tf
+        self.event_pb2 = event_pb2
+        self.pywrap_tensorflow = pywrap_tensorflow
+        self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path))
+    def writekvs(self, kvs):
+        # Every value is converted with float() and stored as a simple_value.
+        def summary_val(k, v):
+            kwargs = {"tag": k, "simple_value": float(v)}
+            return self.tf.Summary.Value(**kwargs)
+        summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()])
+        event = self.event_pb2.Event(wall_time=time.time(), summary=summary)
+        event.step = (
+            self.step
+        )  # is there any reason why you'd want to specify the step?
+        self.writer.WriteEvent(event)
+        self.writer.Flush()
+        self.step += 1
+    def close(self):
+        # Guarded so that a second close() is a no-op.
+        if self.writer:
+            self.writer.Close()
+            self.writer = None
+def make_output_format(format, ev_dir, log_suffix=""):
+    os.makedirs(ev_dir, exist_ok=True)
+    if format == "stdout":
+        return HumanOutputFormat(sys.stdout)
+    elif format == "log":
+        return HumanOutputFormat(osp.join(ev_dir, "log%s.txt" % log_suffix))
+    elif format == "json":
+        return JSONOutputFormat(osp.join(ev_dir, "progress%s.json" % log_suffix))
+    elif format == "csv":
+        return CSVOutputFormat(osp.join(ev_dir, "progress%s.csv" % log_suffix))
+    elif format == "tensorboard":
+        return TensorBoardOutputFormat(osp.join(ev_dir, "tb%s" % log_suffix))
+    else:
+        raise ValueError("Unknown format specified: %s" % (format,))
+# ================================================================
+# API
+# ================================================================
+def logkv(key, val):
+    """
+    Log a value of some diagnostic
+    Call this once for each diagnostic quantity, each iteration
+    If called many times, last value will be used.
+    """
+    get_current().logkv(key, val)
+def logkv_mean(key, val):
+    """
+    The same as logkv(), but if called many times, values averaged.
+    """
+    get_current().logkv_mean(key, val)
+def logkvs(d):
+    """
+    Log a dictionary of key-value pairs
+    """
+    for (k, v) in d.items():
+        logkv(k, v)
+def dumpkvs():
+    """
+    Write all of the diagnostics from the current iteration
+    """
+    return get_current().dumpkvs()
+def getkvs():
+    return get_current().name2val
+def log(*args, level=INFO):
+    """
+    Write the sequence of args, with no separators, to the console and output files (if you've configured an output file).
+    """
+    get_current().log(*args, level=level)
+def debug(*args):
+    log(*args, level=DEBUG)
+def info(*args):
+    log(*args, level=INFO)
+def warn(*args):
+    log(*args, level=WARN)
+def error(*args):
+    log(*args, level=ERROR)
+def set_level(level):
+    """
+    Set logging threshold on current logger.
+    """
+    get_current().set_level(level)
+def set_comm(comm):
+    get_current().set_comm(comm)
+def get_dir():
+    """
+    Get directory that log files are being written to.
+    will be None if there is no output directory (i.e., if you didn't call start)
+    """
+    return get_current().get_dir()
+# Alternative names bound to the same functions as logkv / dumpkvs.
+record_tabular = logkv
+dump_tabular = dumpkvs
+@contextmanager
+def profile_kv(scopename):
+    logkey = "wait_" + scopename
+    tstart = time.time()
+    try:
+        yield
+    finally:
+        get_current().name2val[logkey] += time.time() - tstart
+def profile(n):
+    """
+    Usage:
+    @profile("my_func")
+    def my_func(): code
+    """
+    def decorator_with_name(func):
+        def func_wrapper(*args, **kwargs):
+            with profile_kv(n):
+                return func(*args, **kwargs)
+        return func_wrapper
+    return decorator_with_name
+# ================================================================
+# Backend
+# ================================================================
+def get_current():
+    if Logger.CURRENT is None:
+        _configure_default_logger()
+    return Logger.CURRENT
+class Logger(object):
+    DEFAULT = None  # A logger with no output files. (See right below class definition)
+    # So that you can still log to the terminal without setting up any output files
+    CURRENT = None  # Current logger being used by the free functions above
+    def __init__(self, dir, output_formats, comm=None):
+        self.name2val = defaultdict(float)  # values this iteration
+        self.name2cnt = defaultdict(int)
+        self.level = INFO
+        self.dir = dir
+        self.output_formats = output_formats
+        self.comm = comm
+    # Logging API, forwarded
+    # ----------------------------------------
+    def logkv(self, key, val):
+        self.name2val[key] = val
+    def logkv_mean(self, key, val):
+        oldval, cnt = self.name2val[key], self.name2cnt[key]
+        self.name2val[key] = oldval * cnt / (cnt + 1) + val / (cnt + 1)
+        self.name2cnt[key] = cnt + 1
+    def dumpkvs(self, prefix=None):
+        if self.comm is None:
+            d = self.name2val
+        else:
+            d = mpi_weighted_mean(
+                self.comm,
+                {
+                    name: (val, self.name2cnt.get(name, 1))
+                    for (name, val) in self.name2val.items()
+                },
+            )
+            if self.comm.rank != 0:
+                d["dummy"] = 1  # so we don't get a warning about empty dict
+        # LISA
+        wandb.log({**d})
+        out = d.copy()  # Return the dict for unit testing purposes
+        for fmt in self.output_formats:
+            if isinstance(fmt, KVWriter):
+                fmt.writekvs(d)
+        self.name2val.clear()
+        self.name2cnt.clear()
+        return out
+    def log(self, *args, level=INFO):
+        if self.level <= level:
+            self._do_log(args)
+    # Configuration
+    # ----------------------------------------
+    def set_level(self, level):
+        self.level = level
+    def set_comm(self, comm):
+        self.comm = comm
+    def get_dir(self):
+        return self.dir
+    def close(self):
+        for fmt in self.output_formats:
+            fmt.close()
+    # Misc
+    # ----------------------------------------
+    def _do_log(self, args):
+        for fmt in self.output_formats:
+            if isinstance(fmt, SeqWriter):
+                fmt.writeseq(map(str, args))
+def get_rank_without_mpi_import():
+    # check environment variables here instead of importing mpi4py
+    # to avoid calling MPI_Init() when this module is imported
+    for varname in ["PMI_RANK", "OMPI_COMM_WORLD_RANK"]:
+        if varname in os.environ:
+            return int(os.environ[varname])
+    return 0
+def mpi_weighted_mean(comm, local_name2valcount):
+    """
+    Copied from: https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/common/mpi_util.py#L110
+    Perform a weighted average over dicts that are each on a different node
+    Input: local_name2valcount: dict mapping key -> (value, count)
+    Returns: key -> mean
+    """
+    all_name2valcount = comm.gather(local_name2valcount)
+    if comm.rank == 0:
+        name2sum = defaultdict(float)
+        name2count = defaultdict(float)
+        for n2vc in all_name2valcount:
+            for (name, (val, count)) in n2vc.items():
+                try:
+                    val = float(val)
+                except ValueError:
+                    if comm.rank == 0:
+                        warnings.warn(
+                            "WARNING: tried to compute mean on non-float {}={}".format(
+                                name, val
+                            )
+                        )
+                else:
+                    name2sum[name] += val * count
+                    name2count[name] += count
+        return {name: name2sum[name] / name2count[name] for name in name2sum}
+    else:
+        return {}
+def configure(dir=None, format_strs=None, comm=None, log_suffix=""):
+    """
+    If comm is provided, average all numerical stats across that comm
+    """
+    if dir is None:
+        dir = os.getenv("OPENAI_LOGDIR")
+    if dir is None:
+        dir = osp.join(
+            tempfile.gettempdir(),
+            datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"),
+        )
+    assert isinstance(dir, str)
+    dir = os.path.expanduser(dir)
+    os.makedirs(os.path.expanduser(dir), exist_ok=True)
+    rank = get_rank_without_mpi_import()
+    if rank > 0:
+        log_suffix = log_suffix + "-rank%03i" % rank
+    if format_strs is None:
+        if rank == 0:
+            format_strs = os.getenv("OPENAI_LOG_FORMAT", "stdout,log,csv").split(",")
+        else:
+            format_strs = os.getenv("OPENAI_LOG_FORMAT_MPI", "log").split(",")
+    format_strs = filter(None, format_strs)
+    output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs]
+    Logger.CURRENT = Logger(dir=dir, output_formats=output_formats, comm=comm)
+    if output_formats:
+        log("Logging to %s" % dir)
+def _configure_default_logger():
+    configure()
+    Logger.DEFAULT = Logger.CURRENT
+def reset():
+    if Logger.CURRENT is not Logger.DEFAULT:
+        Logger.CURRENT.close()
+        Logger.CURRENT = Logger.DEFAULT
+        log("Reset logger")
+@contextmanager
+def scoped_configure(dir=None, format_strs=None, comm=None):
+    prevlogger = Logger.CURRENT
+    configure(dir=dir, format_strs=format_strs, comm=comm)
+    try:
+        yield
+    finally:
+        Logger.CURRENT.close()
+        Logger.CURRENT = prevlogger

src/improved_diffusion/losses.py ADDED Viewed

	@@ -0,0 +1,119 @@

+"""
+Helpers for various likelihood-based losses. These are ported from the original
+Ho et al. diffusion models codebase:
+https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py
+"""
+import numpy as np
+import torch as th
+def normal_kl(mean1, logvar1, mean2, logvar2):
+    """
+    Compute the KL divergence between two gaussians.
+    Shapes are automatically broadcasted, so batches can be compared to
+    scalars, among other use cases.
+    """
+    tensor = None
+    for obj in (mean1, logvar1, mean2, logvar2):
+        if isinstance(obj, th.Tensor):
+            tensor = obj
+            break
+    assert tensor is not None, "at least one argument must be a Tensor"
+    # Force variances to be Tensors. Broadcasting helps convert scalars to
+    # Tensors, but it does not work for th.exp().
+    logvar1, logvar2 = [
+        x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
+        for x in (logvar1, logvar2)
+    ]
+    # print(logvar2.shape)
+    # temp1 = 0.5 * (-1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2))
+    # print(f'const = {temp1.mean()}, coef={(th.exp(-logvar2) * 0.5).mean()}, mse={((mean1 - mean2) ** 2).mean().item()}')
+    return 0.5 * (
+        -1.0
+        + logvar2
+        - logvar1
+        + th.exp(logvar1 - logvar2)
+        + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
+    )
+def approx_standard_normal_cdf(x):
+    """
+    A fast approximation of the cumulative distribution function of the
+    standard normal.
+    """
+    return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))
+def discretized_gaussian_log_likelihood(x, *, means, log_scales):
+    """
+    Compute the log-likelihood of a Gaussian distribution discretizing to a
+    given image.
+    :param x: the target images. It is assumed that this was uint8 values,
+              rescaled to the range [-1, 1].
+    :param means: the Gaussian mean Tensor.
+    :param log_scales: the Gaussian log stddev Tensor.
+    :return: a tensor like x of log probabilities (in nats).
+    """
+    assert x.shape == means.shape == log_scales.shape
+    centered_x = x - means
+    inv_stdv = th.exp(-log_scales)
+    plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
+    cdf_plus = approx_standard_normal_cdf(plus_in)
+    min_in = inv_stdv * (centered_x - 1.0 / 255.0)
+    cdf_min = approx_standard_normal_cdf(min_in)
+    log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
+    log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
+    cdf_delta = cdf_plus - cdf_min
+    log_probs = th.where(
+        x < -0.999,
+        log_cdf_plus,
+        th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
+    )
+    assert log_probs.shape == x.shape
+    return log_probs
+def gaussian_density(x, *, means, log_scales):
+    from torch.distributions import Normal
+    normal_dist = Normal(means, log_scales.exp())
+    logp = normal_dist.log_prob(x)
+    return logp
+def discretized_text_log_likelihood(x, *, means, log_scales):
+    """
+    Compute the log-likelihood of a Gaussian distribution discretizing to a
+    given image.
+    :param x: the target images. It is assumed that this was uint8 values,
+              rescaled to the range [-1, 1].
+    :param means: the Gaussian mean Tensor.
+    :param log_scales: the Gaussian log stddev Tensor.
+    :return: a tensor like x of log probabilities (in nats).
+    """
+    print(x.shape, means.shape)
+    # assert x.shape == means.shape == log_scales.shape
+    print(x, means)
+    centered_x = x - means
+    inv_stdv = th.exp(-log_scales)
+    plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
+    cdf_plus = approx_standard_normal_cdf(plus_in)
+    min_in = inv_stdv * (centered_x - 1.0 / 255.0)
+    cdf_min = approx_standard_normal_cdf(min_in)
+    log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
+    log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
+    cdf_delta = cdf_plus - cdf_min
+    log_probs = th.where(
+        x < -0.999,
+        log_cdf_plus,
+        th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
+    )
+    assert log_probs.shape == x.shape
+    return log_probs

src/improved_diffusion/nn.py ADDED Viewed

	@@ -0,0 +1,170 @@

+"""
+Various utilities for neural networks.
+"""
+import math
+import torch as th
+import torch.nn as nn
+# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
+class SiLU(nn.Module):
+    def forward(self, x):
+        return x * th.sigmoid(x)
+class GroupNorm32(nn.GroupNorm):
+    def forward(self, x):
+        return super().forward(x.float()).type(x.dtype)
+def conv_nd(dims, *args, **kwargs):
+    """
+    Create a 1D, 2D, or 3D convolution module.
+    """
+    if dims == 1:
+        return nn.Conv1d(*args, **kwargs)
+    elif dims == 2:
+        return nn.Conv2d(*args, **kwargs)
+    elif dims == 3:
+        return nn.Conv3d(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+def linear(*args, **kwargs):
+    """
+    Create a linear module.
+    """
+    return nn.Linear(*args, **kwargs)
+def avg_pool_nd(dims, *args, **kwargs):
+    """
+    Create a 1D, 2D, or 3D average pooling module.
+    """
+    if dims == 1:
+        return nn.AvgPool1d(*args, **kwargs)
+    elif dims == 2:
+        return nn.AvgPool2d(*args, **kwargs)
+    elif dims == 3:
+        return nn.AvgPool3d(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+def update_ema(target_params, source_params, rate=0.99):
+    """
+    Update target parameters to be closer to those of source parameters using
+    an exponential moving average.
+    :param target_params: the target parameter sequence.
+    :param source_params: the source parameter sequence.
+    :param rate: the EMA rate (closer to 1 means slower).
+    """
+    for targ, src in zip(target_params, source_params):
+        targ.detach().mul_(rate).add_(src, alpha=1 - rate)
+def zero_module(module):
+    """
+    Zero out the parameters of a module and return it.
+    """
+    for p in module.parameters():
+        p.detach().zero_()
+    return module
+def scale_module(module, scale):
+    """
+    Scale the parameters of a module and return it.
+    """
+    for p in module.parameters():
+        p.detach().mul_(scale)
+    return module
+def mean_flat(tensor):
+    """
+    Take the mean over all non-batch dimensions.
+    """
+    return tensor.mean(dim=list(range(1, len(tensor.shape))))
+def normalization(channels):
+    """
+    Make a standard normalization layer.
+    :param channels: number of input channels.
+    :return: an nn.Module for normalization.
+    """
+    return GroupNorm32(32, channels)
def timestep_embedding(timesteps, dim, max_period=10000):
    """
    Build sinusoidal embeddings for a batch of timesteps.

    The first dim // 2 columns hold cosines and the next dim // 2 hold sines;
    an odd dim gets one trailing zero column.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    half = dim // 2
    # Frequencies decay geometrically from 1 towards 1 / max_period.
    exponents = (
        -math.log(max_period) * th.arange(start=0, end=half, dtype=th.float32) / half
    )
    freqs = th.exp(exponents).to(device=timesteps.device)
    angles = timesteps[:, None].float() * freqs[None]
    embedding = th.cat([th.cos(angles), th.sin(angles)], dim=-1)
    if dim % 2 != 0:
        # Pad with a zero column so the output width is exactly dim.
        zero_column = th.zeros_like(embedding[:, :1])
        embedding = th.cat([embedding, zero_column], dim=-1)
    return embedding
def checkpoint(func, inputs, params, flag):
    """
    Call a function, optionally through CheckpointFunction so intermediate
    activations are not cached and are recomputed in the backward pass
    (less memory, more compute).

    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    :return: whatever `func` returns.
    """
    if not flag:
        return func(*inputs)
    # CheckpointFunction takes inputs and params as one flat argument list,
    # plus the number of inputs so it can split them apart again.
    flat_args = tuple(inputs) + tuple(params)
    return CheckpointFunction.apply(func, len(inputs), *flat_args)
class CheckpointFunction(th.autograd.Function):
    """
    Autograd function that runs `run_function` without recording a graph in
    the forward pass and re-runs it with gradients enabled in the backward
    pass to obtain the gradients.
    """

    @staticmethod
    def forward(ctx, run_function, length, *args):
        """
        :param run_function: the callable to evaluate on the input tensors.
        :param length: how many leading entries of `args` are inputs; the
                       remaining entries are parameters.
        :param args: input tensors followed by parameters.
        :return: the output of `run_function` computed under no_grad.
        """
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])
        with th.no_grad():
            return ctx.run_function(*ctx.input_tensors)

    @staticmethod
    def backward(ctx, *output_grads):
        """
        Recompute the forward pass with gradients enabled and return the
        gradients for inputs and parameters; the two leading None entries
        correspond to `run_function` and `length`.
        """
        detached_inputs = []
        for tensor in ctx.input_tensors:
            detached_inputs.append(tensor.detach().requires_grad_(True))
        ctx.input_tensors = detached_inputs
        with th.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            input_views = [tensor.view_as(tensor) for tensor in ctx.input_tensors]
            recomputed = ctx.run_function(*input_views)
        grad_targets = ctx.input_tensors + ctx.input_params
        grads = th.autograd.grad(
            recomputed,
            grad_targets,
            output_grads,
            allow_unused=True,
        )
        # Drop the saved references on ctx.
        del ctx.input_tensors
        del ctx.input_params
        del recomputed
        return (None, None) + grads

src/improved_diffusion/resample.py ADDED Viewed

	@@ -0,0 +1,154 @@

+from abc import ABC, abstractmethod
+import numpy as np
+import torch as th
+import torch.distributed as dist
def create_named_schedule_sampler(name, diffusion):
    """
    Look up a pre-defined ScheduleSampler by name and instantiate it.

    :param name: the name of the sampler ("uniform" or "loss-second-moment").
    :param diffusion: the diffusion object to sample for.
    :return: a UniformSampler or LossSecondMomentResampler.
    :raises NotImplementedError: if the name is not recognised.
    """
    if name == "uniform":
        return UniformSampler(diffusion)
    if name == "loss-second-moment":
        return LossSecondMomentResampler(diffusion)
    raise NotImplementedError(f"unknown schedule sampler: {name}")
class ScheduleSampler(ABC):
    """
    A distribution over timesteps in the diffusion process, intended to reduce
    variance of the objective.

    The default sample() performs unbiased importance sampling, so the mean
    of the objective is unchanged. Subclasses may override sample() to
    reweight the resampled terms differently and thereby change the
    objective.
    """

    @abstractmethod
    def weights(self):
        """
        Get a numpy array of weights, one per diffusion step.

        The weights needn't be normalized, but must be positive.
        """

    def sample(self, batch_size, device):
        """
        Importance-sample timesteps for a batch.

        :param batch_size: the number of timesteps.
        :param device: the torch device to save to.
        :return: a tuple (timesteps, weights):
                 - timesteps: a tensor of timestep indices.
                 - weights: a tensor of weights to scale the resulting losses.
        """
        raw_weights = self.weights()
        probs = raw_weights / np.sum(raw_weights)
        num_steps = len(probs)
        picked = np.random.choice(num_steps, size=(batch_size,), p=probs)
        # Inverse-probability weights, scaled so uniform sampling gives 1.
        importance = 1 / (num_steps * probs[picked])
        timesteps = th.from_numpy(picked).long().to(device)
        loss_weights = th.from_numpy(importance).float().to(device)
        return timesteps, loss_weights
class UniformSampler(ScheduleSampler):
    """
    Sampler that gives every diffusion step the same weight.
    """

    def __init__(self, diffusion):
        """
        :param diffusion: the diffusion object; its num_timesteps attribute
                          sets the length of the weight array.
        """
        self.diffusion = diffusion
        step_count = diffusion.num_timesteps
        self._weights = np.ones([step_count])

    def weights(self):
        """
        Return the constant all-ones weight array built in __init__.
        """
        return self._weights
class LossAwareSampler(ScheduleSampler):
    """
    Base class for samplers whose weights are updated from observed losses.
    """

    def update_with_local_losses(self, local_ts, local_losses):
        """
        Update the reweighting using losses from a model.

        Call this method from each rank with a batch of timesteps and the
        corresponding losses for each of those timesteps.
        This method will perform synchronization to make sure all of the ranks
        maintain the exact same reweighting.

        :param local_ts: an integer Tensor of timesteps.
        :param local_losses: a 1D Tensor of losses.
        """
        device = local_ts.device
        world_size = dist.get_world_size()

        # First exchange the per-rank batch sizes.
        size_slots = [
            th.tensor([0], dtype=th.int32, device=device) for _ in range(world_size)
        ]
        local_size = th.tensor([len(local_ts)], dtype=th.int32, device=device)
        dist.all_gather(size_slots, local_size)
        batch_sizes = [slot.item() for slot in size_slots]

        # Receive buffers are sized to the largest batch on any rank.
        # NOTE(review): local_ts / local_losses themselves are not padded
        # before all_gather; this presumably relies on every rank having the
        # same batch size -- confirm.
        max_bs = max(batch_sizes)
        timestep_batches = [th.zeros(max_bs).to(local_ts) for _ in batch_sizes]
        loss_batches = [th.zeros(max_bs).to(local_losses) for _ in batch_sizes]
        dist.all_gather(timestep_batches, local_ts)
        dist.all_gather(loss_batches, local_losses)

        # Keep only the first bs entries of each rank's buffer.
        timesteps = []
        for gathered, bs in zip(timestep_batches, batch_sizes):
            timesteps.extend(entry.item() for entry in gathered[:bs])
        losses = []
        for gathered, bs in zip(loss_batches, batch_sizes):
            losses.extend(entry.item() for entry in gathered[:bs])
        self.update_with_all_losses(timesteps, losses)

    @abstractmethod
    def update_with_all_losses(self, ts, losses):
        """
        Update the reweighting using losses from a model.

        Sub-classes should override this method to update the reweighting
        using losses from the model.
        This method directly updates the reweighting without synchronizing
        between workers. It is called by update_with_local_losses from all
        ranks with identical arguments. Thus, it should have deterministic
        behavior to maintain state across workers.

        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """
class LossSecondMomentResampler(LossAwareSampler):
    """
    Sampler that weights each timestep by the root-mean-square of its most
    recent losses, mixed with a small uniform component.
    """

    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
        """
        :param diffusion: the diffusion object; its num_timesteps attribute
                          sets the number of tracked timesteps.
        :param history_per_term: how many recent losses are kept per timestep.
        :param uniform_prob: probability mass spread uniformly over all
                             timesteps when computing the weights.
        """
        self.diffusion = diffusion
        self.history_per_term = history_per_term
        self.uniform_prob = uniform_prob
        self._loss_history = np.zeros(
            [diffusion.num_timesteps, history_per_term], dtype=np.float64
        )
        # `np.int` was only an alias of the builtin `int` and was removed in
        # NumPy 1.24; the builtin keeps the same dtype and works everywhere.
        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=int)

    def weights(self):
        """
        Return one weight per timestep.

        Until every timestep has a full loss history the weights are all
        ones; afterwards they are the normalized RMS of the stored losses,
        scaled by (1 - uniform_prob) plus uniform_prob / num_timesteps.
        """
        if not self._warmed_up():
            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
        weights /= np.sum(weights)
        weights *= 1 - self.uniform_prob
        weights += self.uniform_prob / len(weights)
        return weights

    def update_with_all_losses(self, ts, losses):
        """
        Record new losses in the per-timestep history.

        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """
        for t, loss in zip(ts, losses):
            if self._loss_counts[t] == self.history_per_term:
                # Shift out the oldest loss term.
                self._loss_history[t, :-1] = self._loss_history[t, 1:]
                self._loss_history[t, -1] = loss
            else:
                self._loss_history[t, self._loss_counts[t]] = loss
                self._loss_counts[t] += 1

    def _warmed_up(self):
        """
        Return True once every timestep has history_per_term recorded losses.
        """
        return (self._loss_counts == self.history_per_term).all()

src/improved_diffusion/respace.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import numpy as np
+import torch as th
+from .gaussian_diffusion import GaussianDiffusion
def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.

    For example, if there's 300 timesteps and the section counts are [10,15,20]
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.

    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.

    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    :raises ValueError: if a "ddimN" count cannot be reached with an integer
                        stride, or a section holds fewer steps than requested.
    """
    # String inputs are documented above; without this parsing a string would
    # be iterated character by character and fail with a TypeError below.
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim"):])
            for stride in range(1, num_timesteps):
                if len(range(0, num_timesteps, stride)) == desired_count:
                    return set(range(0, num_timesteps, stride))
            raise ValueError(
                f"cannot create exactly {desired_count} steps with an integer stride"
            )
        section_counts = [int(x) for x in section_counts.split(",")]
    size_per = num_timesteps // len(section_counts)
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        # The first `extra` sections absorb the remainder, one step each.
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(
                f"cannot divide section of {size} steps into {section_count}"
            )
        if section_count <= 1:
            frac_stride = 1
        else:
            # Fractional stride so the section's first and last steps are kept.
            frac_stride = (size - 1) / (section_count - 1)
        cur_idx = 0.0
        taken_steps = []
        for _ in range(section_count):
            taken_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        all_steps += taken_steps
        start_idx += size
    return set(all_steps)
class SpacedDiffusion(GaussianDiffusion):
    """
    A diffusion process which can skip steps in a base diffusion process.

    :param use_timesteps: a collection (sequence or set) of timesteps from the
                          original diffusion process to retain.
    :param kwargs: the kwargs to create the base diffusion process.
    """

    def __init__(self, use_timesteps, **kwargs):
        self.use_timesteps = set(use_timesteps)
        self.timestep_map = []
        self.original_num_steps = len(kwargs["betas"])

        # A full-length base process supplies the cumulative alphas.
        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
        kept_betas = []
        prev_alpha_cumprod = 1.0
        for step, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
            if step not in self.use_timesteps:
                continue
            # Beta that takes the cumulative alpha from the previous kept
            # step straight to this one.
            kept_betas.append(1 - alpha_cumprod / prev_alpha_cumprod)
            prev_alpha_cumprod = alpha_cumprod
            self.timestep_map.append(step)
        kwargs["betas"] = np.array(kept_betas)
        super().__init__(**kwargs)

    def p_mean_variance(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        """Call the parent p_mean_variance with the model wrapped."""
        wrapped = self._wrap_model(model)
        return super().p_mean_variance(wrapped, *args, **kwargs)

    def training_losses(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        """Call the parent training_losses with the model wrapped."""
        wrapped = self._wrap_model(model)
        return super().training_losses(wrapped, *args, **kwargs)

    def _wrap_model(self, model):
        """Wrap the model in _WrappedModel unless it is already wrapped."""
        if not isinstance(model, _WrappedModel):
            model = _WrappedModel(
                model,
                self.timestep_map,
                self.rescale_timesteps,
                self.original_num_steps,
            )
        return model

    def _scale_timesteps(self, t):
        # Scaling is done by the wrapped model.
        return t
+class _WrappedModel:
+    def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps):
+        self.model = model
+        self.timestep_map = timestep_map
+        self.rescale_timesteps = rescale_timesteps
+        self.original_num_steps = original_num_steps
+    def __call__(self, x, ts, *args,**kwargs):
+        # print(ts)
+        map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
+        new_ts = map_tensor[ts]
+        # print(new_ts)
+        if self.rescale_timesteps:
+            new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
+        # temp = self.model(x, new_ts, **kwargs)
+        # print(temp.shape)
+        # return temp
+        # print(new_ts)
+        return self.model(x, new_ts,*args, **kwargs)

src/improved_diffusion/rounding.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import torch
+# bert results
+from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, default_data_collator
+import sys, yaml, os
+# print( os.path.join(sys.path[0], '../../transformers/examples/pytorch/language-modeling'))
+# sys.path.insert(0, 'diffusion_lm/transformers/examples/pytorch/language-modeling')
+# sys.path.insert(0, os.path.join(sys.path[0], '../../transformers/examples/pytorch/language-modeling'))
+# from custom_trainer import GPT2LMHeadModelCompress, BERTModelCompress, AutoEncoderWithNoise
def load_models(modality, mode, model_name_or_path, emb_dim, file, extra_args=None):
    """
    Load the saved embedding table and the id -> token vocabulary.

    `vocab.json` and `random_emb.torch` are read from the directory `file`.
    If a file is missing there, the hard-coded training directory that this
    function used to read unconditionally is tried as a fallback.

    :param modality: dataset modality; 'synth' and 'book' are not supported.
    :param mode: embedding mode; must be one of 'random', 'random1',
                 'random_up_proj', 'glove'.
    :param model_name_or_path: unused.
    :param emb_dim: embedding dimension of the torch.nn.Embedding to build.
    :param file: directory containing vocab.json and random_emb.torch.
    :param extra_args: optional object; use_bert_tokenizer == 'yes' selects
                       the (unsupported) BERT tokenizer branch.
    :return: a tuple (model, tokenizer) where model is a torch.nn.Embedding
             with the saved weights and tokenizer maps token id -> token.
    :raises ValueError: if mode is not supported.
    :raises NotImplementedError: for the disabled 'synth' / BERT branches.
    """
    import json

    # These branches previously fell through to `return model, tokenizer`
    # with unbound names; fail with an explicit message instead.
    if mode not in ['random', 'random1', 'random_up_proj', 'glove']:
        raise ValueError('unsupported rounding mode: {}'.format(mode))
    if modality == 'synth':
        raise NotImplementedError('loading synthetic-data models is disabled')
    if modality == 'book' or (extra_args is not None and extra_args.use_bert_tokenizer == 'yes'):
        raise NotImplementedError('loading with the BERT tokenizer is disabled')

    # NOTE(review): machine-specific directory; kept only as a fallback so
    # setups that relied on the old hard-coded paths keep working.
    legacy_dir = (
        '/data0/gonghaisong/Diffusion-LM/improved-diffusion/diffusion_models/'
        'diff_e2e-tgt_block_rand16_transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart_e2e'
    )

    def _resolve(name):
        # Prefer the caller's directory, then the legacy one. If neither has
        # the file, return the caller's path so the error names it.
        preferred = '{}/{}'.format(file, name)
        if os.path.exists(preferred):
            return preferred
        legacy = '{}/{}'.format(legacy_dir, name)
        if os.path.exists(legacy):
            return legacy
        return preferred

    path_save_tokenizer = _resolve('vocab.json')
    print(f'loading from {path_save_tokenizer}')
    with open(path_save_tokenizer, 'r') as f:
        vocab = json.load(f)
    print(len(vocab))
    # vocab.json maps token -> id; invert it for decoding.
    tokenizer = {v: k for k, v in vocab.items()}

    model = torch.nn.Embedding(len(tokenizer), emb_dim)
    path_save = _resolve('random_emb.torch')
    model.load_state_dict(torch.load(path_save))
    return model, tokenizer
def load_tokenizer(modality, mode, model_name_or_path):
    """
    Load the tokenizer that matches a modality.

    :param modality: 'synth' builds an id -> token dict from the synthetic
                     dataset vocabulary, 'book' loads the 'bert-base-uncased'
                     tokenizer, anything else reads
                     `<model_name_or_path>/vocab.json`.
    :param mode: embedding mode; must be one of 'random', 'random_up_proj',
                 'glove'.
    :param model_name_or_path: model directory (also used to pick the
                               synthetic config when modality is 'synth').
    :return: an id -> token dict, or a transformers tokenizer for 'book'.
    :raises ValueError: if mode is not supported (previously an
                        UnboundLocalError on return).
    """
    if mode not in ['random', 'random_up_proj', 'glove']:
        raise ValueError('unsupported rounding mode: {}'.format(mode))

    if modality == 'synth':
        print(model_name_or_path, 'deciding what to load::: ')
        if 'synth128' in model_name_or_path:
            config = 'diffusion_lm/synthetic_data/configs/emnlp2020/experiments/difflm_seed0_m3_k128_trainc20000.yaml'
        else:
            config = 'diffusion_lm/synthetic_data/configs/emnlp2020/experiments/difflm_seed0_m3_k32_trainc20000.yaml'
        import sys
        sys.path.insert(0, 'diffusion_lm/synthetic_data/rnns-stacks')
        from dataset import Dataset as SynthDataset
        # yaml.load() without a Loader raises TypeError on PyYAML >= 6;
        # safe_load reads plain-data configs, and `with` closes the file.
        with open(config) as config_file:
            args_synth = yaml.safe_load(config_file)
        dataset = SynthDataset(args_synth)
        return {v: k for k, v in dataset.vocab.items()}

    if modality == 'book':
        return AutoTokenizer.from_pretrained('bert-base-uncased')

    import json
    path_save_tokenizer = '{}/vocab.json'.format(model_name_or_path)
    with open(path_save_tokenizer, 'r') as f:
        vocab = json.load(f)
    # vocab.json maps token -> id; invert it for decoding.
    return {v: k for k, v in vocab.items()}
def rounding_func(mode, text_emb_lst, model, tokenizer, emb_scale_factor=1.0):
    """
    Decode embeddings to text by rounding each vector to the vocabulary
    entry with the smallest L2 distance.

    :param mode: embedding mode; decoding only happens for 'random',
                 'random_up_proj' and 'glove', otherwise the result is empty.
    :param text_emb_lst: iterable of embedding arrays/tensors, one per
                         sequence; inputs with more than two dimensions are
                         flattened to (-1, dim).
    :param model: module whose `weight` holds the vocabulary embeddings.
    :param tokenizer: mapping from token id to token string.
    :param emb_scale_factor: unused; kept for interface compatibility.
    :return: a list with one space-joined decoded string per input.
    """
    decoded_out_lst = []
    if mode not in ['random', 'random_up_proj', 'glove']:
        return decoded_out_lst

    down_proj_emb = model.weight  # vocabulary embeddings
    # Only the nearest neighbour is used below. The neighbour count stays at
    # 6 where possible but is clamped so vocabularies with fewer than six
    # entries no longer make torch.topk raise.
    num_neighbors = min(6, down_proj_emb.size(0))

    def get_knn(vocab_emb, text_emb):
        # Negative L2 distance of shape (vocab, positions); larger = closer.
        adjacency = -torch.norm(
            vocab_emb.unsqueeze(1) - text_emb.unsqueeze(0), dim=-1
        )
        topk_out = torch.topk(adjacency, k=num_neighbors, dim=0)
        return topk_out.values, topk_out.indices

    for text_emb in text_emb_lst:
        if not torch.is_tensor(text_emb):
            text_emb = torch.tensor(text_emb)
        if len(text_emb.shape) > 2:
            text_emb = text_emb.view(-1, text_emb.size(-1))
        _, indices = get_knn(down_proj_emb, text_emb.to(down_proj_emb.device))
        # Row 0 of the top-k result is the closest token for each position.
        decoded_out = " ".join([tokenizer[i] for i in indices[0].tolist()])
        decoded_out_lst.append(decoded_out)
    return decoded_out_lst

src/improved_diffusion/script_util.py ADDED Viewed

	@@ -0,0 +1,201 @@

+import argparse
+from . import gaussian_diffusion as gd
+from .respace import SpacedDiffusion, space_timesteps
+# from .unet import SuperResModel
+NUM_CLASSES = 1000
def model_and_diffusion_defaults():
    """
    Return a fresh dict of default model and diffusion hyper-parameters.
    """
    defaults = {
        "image_size": 64,
        "num_channels": 128,
        "num_res_blocks": 2,
        "num_heads": 4,
        "num_heads_upsample": -1,
        "attention_resolutions": "16,8",
        "dropout": 0.0,
        "learn_sigma": False,
        "class_cond": False,
        "diffusion_steps": 1000,
        "noise_schedule": "linear",
        "timestep_respacing": "",
        "use_kl": False,
        "predict_xstart": False,
        "rescale_timesteps": True,
        "rescale_learned_sigmas": True,
        "use_checkpoint": False,
        "use_scale_shift_norm": True,
        "model_arch": "trans-unet",
        "in_channel": 8,
        "out_channel": 8,
        "training_mode": "emb",
        "vocab_size": 66,
        "config_name": "QizhiPei/biot5-base-text2mol",
        "experiment_mode": "lm",
        "logits_mode": 1,
    }
    return defaults
+# def sr_model_and_diffusion_defaults():
+#     res = model_and_diffusion_defaults()
+#     res["large_size"] = 256
+#     res["small_size"] = 64
+#     arg_names = inspect.getfullargspec(sr_create_model_and_diffusion)[0]
+#     for k in res.copy().keys():
+#         if k not in arg_names:
+#             del res[k]
+#     return res
+# def sr_create_model_and_diffusion(
+#     large_size,
+#     small_size,
+#     class_cond,
+#     learn_sigma,
+#     num_channels,
+#     num_res_blocks,
+#     num_heads,
+#     num_heads_upsample,
+#     attention_resolutions,
+#     dropout,
+#     diffusion_steps,
+#     noise_schedule,
+#     timestep_respacing,
+#     use_kl,
+#     predict_xstart,
+#     rescale_timesteps,
+#     rescale_learned_sigmas,
+#     use_checkpoint,
+#     use_scale_shift_norm,
+# ):
+#     model = sr_create_model(
+#         large_size,
+#         small_size,
+#         num_channels,
+#         num_res_blocks,
+#         learn_sigma=learn_sigma,
+#         class_cond=class_cond,
+#         use_checkpoint=use_checkpoint,
+#         attention_resolutions=attention_resolutions,
+#         num_heads=num_heads,
+#         num_heads_upsample=num_heads_upsample,
+#         use_scale_shift_norm=use_scale_shift_norm,
+#         dropout=dropout,
+#     )
+#     diffusion = create_gaussian_diffusion(
+#         steps=diffusion_steps,
+#         learn_sigma=learn_sigma,
+#         noise_schedule=noise_schedule,
+#         use_kl=use_kl,
+#         predict_xstart=predict_xstart,
+#         rescale_timesteps=rescale_timesteps,
+#         rescale_learned_sigmas=rescale_learned_sigmas,
+#         timestep_respacing=timestep_respacing,
+#     )
+#     return model, diffusion
+# def sr_create_model(
+#     large_size,
+#     small_size,
+#     num_channels,
+#     num_res_blocks,
+#     learn_sigma,
+#     class_cond,
+#     use_checkpoint,
+#     attention_resolutions,
+#     num_heads,
+#     num_heads_upsample,
+#     use_scale_shift_norm,
+#     dropout,
+# ):
+#     _ = small_size  # hack to prevent unused variable
+#     if large_size == 256:
+#         channel_mult = (1, 1, 2, 2, 4, 4)
+#     elif large_size == 64:
+#         channel_mult = (1, 2, 3, 4)
+#     else:
+#         raise ValueError(f"unsupported large size: {large_size}")
+#     attention_ds = []
+#     for res in attention_resolutions.split(","):
+#         attention_ds.append(large_size // int(res))
+#     return SuperResModel(
+#         in_channels=3,
+#         model_channels=num_channels,
+#         out_channels=(3 if not learn_sigma else 6),
+#         num_res_blocks=num_res_blocks,
+#         attention_resolutions=tuple(attention_ds),
+#         dropout=dropout,
+#         channel_mult=channel_mult,
+#         num_classes=(NUM_CLASSES if class_cond else None),
+#         use_checkpoint=use_checkpoint,
+#         num_heads=num_heads,
+#         num_heads_upsample=num_heads_upsample,
+#         use_scale_shift_norm=use_scale_shift_norm,
+#     )
def create_gaussian_diffusion(
    *,
    steps=1000,
    learn_sigma=False,
    noise_schedule="linear",  # sqrt
    use_kl=False,
    predict_xstart=False,  # True
    rescale_timesteps=False,  # True
    rescale_learned_sigmas=False,  # True
    timestep_respacing="",
    model_arch="conv-unet",  # transformer
    training_mode="emb",  # e2e
):
    """
    Build the SpacedDiffusion used by this project.

    Only `learn_sigma` affects the result. Every other keyword is accepted
    but ignored: the process is fixed to 2000 un-respaced steps on the
    "sqrt" beta schedule, START_X mean prediction, E2E_MSE loss, rescaled
    timesteps, the "transformer" architecture and "e2e" training mode.

    :param learn_sigma: if true use ModelVarType.LEARNED_RANGE, otherwise
                        ModelVarType.FIXED_LARGE.
    :return: a SpacedDiffusion instance.
    """
    if learn_sigma:
        var_type = gd.ModelVarType.LEARNED_RANGE
    else:
        var_type = gd.ModelVarType.FIXED_LARGE
    return SpacedDiffusion(
        use_timesteps=space_timesteps(2000, [2000]),
        betas=gd.get_named_beta_schedule("sqrt", 2000),
        model_mean_type=gd.ModelMeanType.START_X,
        model_var_type=var_type,
        loss_type=gd.LossType.E2E_MSE,
        rescale_timesteps=True,
        model_arch="transformer",
        training_mode="e2e",
    )
def add_dict_to_argparser(parser, default_dict):
    """
    Register one `--<key>` option on the parser for every entry of a dict.

    The option default is the dict value. The type converter is `str` for
    None values, `str2bool` for booleans, and the value's own type otherwise.

    :param parser: the argparse parser to extend.
    :param default_dict: mapping from option name to default value.
    """
    for name, default in default_dict.items():
        if default is None:
            converter = str
        elif isinstance(default, bool):
            converter = str2bool
        else:
            converter = type(default)
        parser.add_argument(f"--{name}", default=default, type=converter)
def args_to_dict(args, keys):
    """
    Collect the named attributes of an object into a dict.

    :param args: object to read attributes from (e.g. an argparse namespace).
    :param keys: iterable of attribute names.
    :return: a dict mapping each key to getattr(args, key).
    """
    selected = {}
    for key in keys:
        selected[key] = getattr(args, key)
    return selected
def str2bool(v):
    """
    Convert a command-line string to a bool (case-insensitive).

    Booleans are returned unchanged.
    https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse

    :param v: a bool, or a string such as "yes"/"no", "true"/"false",
              "t"/"f", "y"/"n", "1"/"0".
    :return: the parsed boolean.
    :raises argparse.ArgumentTypeError: if the string is not recognised.
    """
    if isinstance(v, bool):
        return v
    token = v.lower()
    if token in ("yes", "true", "t", "y", "1"):
        return True
    if token in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected")

src/improved_diffusion/test_util.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import torch as th
+import numpy as np
def compute_logp(args, model, x, input_ids):
    """
    Per-token cross-entropy of `input_ids` under logits derived from the
    squared distance between each vector in `x` and every embedding.

    The logit for a word is -||x - emb||^2 / (2 * sigma^2) with sigma = 0.1.

    :param args: object with a `model_arch` attribute; for '1d-unet' the
                 last two dimensions of `x` are swapped first.
    :param model: module whose `weight` is the (vocab, dim) embedding table.
    :param x: tensor of shape (bsz, seqlen, dim) after the optional permute.
    :param input_ids: token ids with bsz * seqlen elements.
    :return: a (bsz, seqlen) tensor of unreduced cross-entropy losses.
    """
    sigma = 0.1
    vocab_emb = model.weight
    if args.model_arch == '1d-unet':
        x = x.permute(0, 2, 1)
    bsz, seqlen, dim = x.shape

    positions = x.reshape(-1, x.size(-1)).unsqueeze(0)  # 1, bsz*seqlen, dim
    vocab_rows = vocab_emb.unsqueeze(1)  # vocab, 1, dim
    sq_diff = (positions - vocab_rows) ** 2  # vocab, bsz*seqlen, dim
    logits = -sq_diff.sum(dim=-1) / (2 * sigma ** 2)  # vocab, bsz*seqlen
    logits = logits.permute((1, 0))  # bsz*seqlen, vocab

    token_loss = th.nn.functional.cross_entropy(
        logits, input_ids.view(-1), reduction='none'
    )
    return token_loss.view(bsz, seqlen)
def get_weights(model, args):
    """
    Return a frozen embedding module to use for rounding.

    If `model` has a `transformer` attribute, its word embeddings
    (`transformer.wte`) are passed through `model.down_proj`, scaled by
    `args.emb_scale_factor` and stored in a new torch embedding. If `model`
    already exposes `weight`, it is used as is.

    :param model: a model with `transformer`/`down_proj`, or a module with a
                  `weight` attribute.
    :param args: object with `emb_scale_factor` (read only in the
                 `transformer` case).
    :return: the module, with `weight.requires_grad` set to False.
    :raises NotImplementedError: if `model` has neither attribute.
    """
    if hasattr(model, 'transformer'):
        input_embs = model.transformer.wte  # input_embs
        down_proj = model.down_proj
        down_proj_emb = down_proj(input_embs.weight)
        print(down_proj_emb.shape)
        model = th.nn.Embedding(down_proj_emb.size(0), down_proj_emb.size(1))
        print(args.emb_scale_factor)
        model.weight.data = down_proj_emb * args.emb_scale_factor
    elif hasattr(model, 'weight'):
        pass
    else:
        # `assert NotImplementedError` was always true, so unsupported models
        # only failed afterwards with an unrelated AttributeError.
        raise NotImplementedError(
            "model must expose either 'transformer' or 'weight'"
        )
    model.weight.requires_grad = False
    return model
def denoised_fn_round(args, model, text_emb, t):
    """
    Snap each embedding vector to its nearest row of `model` (L2 distance).

    Rounding is skipped when the first timestep in `t` is above 350; in that
    case `text_emb` is returned unchanged.

    :param args: unused.
    :param model: embedding table tensor of shape (vocab, dim); it is indexed
                  directly, not through a `.weight` attribute.
    :param text_emb: embeddings to round; inputs with more than two
                     dimensions are flattened to (-1, dim) for the search.
    :param t: indexable timesteps; only t[0] is inspected.
    :return: a tensor with the shape and device of `text_emb` whose vectors
             are rows of `model`.
    """
    thresh_t = 350
    if t[0] > thresh_t:
        return text_emb

    vocab_emb = model
    old_shape = text_emb.shape
    old_device = text_emb.device

    if len(text_emb.shape) > 2:
        text_emb = text_emb.reshape(-1, text_emb.size(-1))
    queries = text_emb.to(vocab_emb.device)

    # Squared L2 distance via ||e||^2 + ||q||^2 - 2 e.q, which avoids
    # building a (vocab, positions, dim) tensor.
    vocab_sq = (vocab_emb ** 2).sum(-1).view(-1, 1)  # vocab, 1
    queries_t = th.transpose(queries.view(-1, queries.size(-1)), 0, 1)  # d, n
    query_sq = (queries ** 2).sum(-1).view(-1, 1)  # n, 1
    sq_dist = vocab_sq + query_sq.transpose(0, 1) - 2.0 * th.mm(vocab_emb, queries_t)
    sq_dist = th.clamp(sq_dist, 0.0, np.inf)

    nearest = th.topk(-sq_dist, k=1, dim=0)
    rounded_tokens = nearest.indices[0]
    return model[rounded_tokens].view(old_shape).to(old_device)
def load_results(json_path, load_dict):
    """
    Write a dict to a JSON file with 2-space indentation.

    Despite its name this function writes: the file at `json_path` is
    opened in write mode and overwritten.

    :param json_path: destination file path.
    :param load_dict: JSON-serialisable object to dump.
    """
    import json
    with open(json_path, 'w') as out_file:
        json.dump(load_dict, out_file, indent=2)

src/improved_diffusion/text_datasets.py ADDED Viewed

	@@ -0,0 +1,948 @@

+# from PIL import Image
+# import blobfile as bf
+from mpi4py import MPI
+import numpy as np
+from torch.utils.data import DataLoader, Dataset
+from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, default_data_collator, PreTrainedTokenizerFast, \
+    PreTrainedTokenizer
+# from datasets import load_dataset
+import sys, os
+import torch
+# sys.path.insert(0, os.path.join(sys.path[0], '../../transformers/examples/pytorch/language-modeling'))
+# from custom_trainer import GPT2LMHeadModelCompress, BERTModelCompress, AutoEncoderWithNoise
+from collections import Counter, defaultdict
+from functools import partial
+from itertools import chain
def load_data_text(
    *, data_dir, batch_size, image_size, class_cond=False, deterministic=False, data_args=None,
        task_mode='roc', model=None, padding_mode='block', split='train', load_vocab=None,
):
    """
    Build the text corpus for `task_mode` and yield batches from it forever.

    This is a generator function: nothing is loaded (and no error is raised)
    until the first batch is requested.

    :param data_dir: not used by this function; kept for call compatibility.
    :param batch_size: the batch size handed to the DataLoader.
    :param image_size: forwarded to get_corpus_rocstory and to TextDataset
                       (as its `resolution`).
    :param class_cond: not used by this function; kept for call compatibility.
    :param deterministic: if True, iterate the dataset in a fixed order
                          (no shuffling); otherwise shuffle every epoch.
    :param data_args: namespace of run options; `modality`, `cache_mode` and
                      `model_arch` are read here, the rest is forwarded.
    :param task_mode: which corpus to load. Only 'e2e-tgt' is enabled; the
                      loaders for the other modes are disabled in this file.
    :param model: embedding module forwarded to get_corpus_rocstory.
    :param padding_mode: forwarded to get_corpus_rocstory.
    :param split: forwarded to get_corpus_rocstory.
    :param load_vocab: forwarded to get_corpus_rocstory.
    :raises ValueError: if `task_mode` is not supported, or if the dataset is
                        too small to produce a single full batch.
    :raises NotImplementedError: for the streaming (cache_mode == 'no')
                                 modalities, whose dataset class is disabled.
    """
    print('hello loading text data. ')

    # Every other task mode used to fall through with `training_data` unbound
    # and crash later with an UnboundLocalError; fail with a clear message.
    if task_mode != 'e2e-tgt':
        raise ValueError(
            f"unsupported task_mode {task_mode!r}: only 'e2e-tgt' is enabled in this build"
        )

    # The no-cache dataset class is disabled, so this combination could never
    # produce a dataset. Check before paying for the corpus load.
    streaming_modalities = ['roc-aug', 'roc', 'book', 'yelp', 'commonGen', 'commonGen-aug']
    if data_args.modality in streaming_modalities and data_args.cache_mode == 'no':
        raise NotImplementedError(
            f"modality {data_args.modality!r} with cache_mode='no' is not available: "
            "the no-cache dataset is disabled"
        )

    print('hello loading e2e-tgt. ')
    training_data, model = get_corpus_rocstory(data_args, model, image_size,
                                               padding_mode=padding_mode, split=split,
                                               load_vocab=load_vocab)

    dataset = TextDataset(
        training_data,
        image_size,
        data_args,
        model_arch=data_args.model_arch,
    )

    # deterministic=True previously left `data_loader` undefined; the only
    # difference between the two loaders is whether the data is shuffled.
    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        drop_last=True,
        shuffle=not deterministic,
        num_workers=1,
    )

    # With drop_last=True a dataset smaller than one batch yields nothing, and
    # the endless loop below would then spin without ever producing a batch.
    if len(data_loader) == 0:
        raise ValueError(
            f"dataset of size {len(dataset)} yields no full batch of size {batch_size}"
        )

    while True:
        yield from data_loader
def helper_tokenize_encode_cond(sentence_lst, vocab_dict, model, seqlen, data_args):
    """
    Tokenise (source, target) pairs, pad them, and embed the targets.

    :param sentence_lst: iterable of (src_tokens, tgt_tokens) pairs.
    :param vocab_dict: token -> id mapping; must contain 'UNK' and 'PAD'.
                       Ids 0 and 1 are used as start/end markers around
                       every target sequence.
    :param model: embedding module. For experiments starting with 'random' it
                  is called directly on the id tensor; for
                  'gpt2_pre_compress' its `transformer.wte` and `down_proj`
                  are used.
    :param seqlen: padded length of the target sequences; source sequences
                   are padded to min(seqlen, longest source).
    :param data_args: namespace providing `experiment` (and
                      `emb_scale_factor` for 'gpt2_pre_compress').
    :return: list of dicts with keys 'input_ids', 'hidden_states',
             'src_ids' and 'src_mask'; empty list for empty input.
    :raises ValueError: if `data_args.experiment` is not supported.
    """
    # An empty corpus used to crash in max() below; there is nothing to encode.
    if not sentence_lst:
        return []

    experiment = data_args.experiment
    is_random = experiment.startswith('random')
    # Previously an unknown experiment hit an unbound `hidden_state` inside
    # the encoding loop; reject it before doing any work.
    if not is_random and experiment != 'gpt2_pre_compress':
        raise ValueError(
            f"unsupported experiment {experiment!r}: expected 'random*' or 'gpt2_pre_compress'"
        )

    unk_id = vocab_dict['UNK']
    pad_id = vocab_dict['PAD']
    result_train_lst = []
    with torch.no_grad():
        word_ids = []
        src_token_ids = []
        for (src_ids, input_ids) in sentence_lst:
            # Targets are wrapped in START (0) / END (1); sources are not.
            word_ids.append([0] + [vocab_dict.get(x, unk_id) for x in input_ids] + [1])
            src_token_ids.append([vocab_dict.get(x, unk_id) for x in src_ids])
        print(word_ids[:2])
        print('padding mode is pad')
        word_ids = _collate_batch_helper(word_ids, pad_id, seqlen)
        max_src_length = max(len(xx) for xx in src_token_ids)
        print(max_src_length, seqlen)
        max_src_length = min(seqlen, max_src_length)
        src_token_ids, src_masks = _collate_batch_helper(src_token_ids,
                                                         pad_id,
                                                         max_src_length,
                                                         return_mask=True)
        for input_ids, src_ids, src_mask in zip(word_ids, src_token_ids, src_masks):
            if is_random:
                hidden_state = model(torch.tensor(input_ids))
            else:  # 'gpt2_pre_compress'
                input_ids2 = torch.tensor(input_ids).to(model.device)
                input_embs = model.transformer.wte(input_ids2)
                hidden_state = model.down_proj(input_embs)
                hidden_state = hidden_state * data_args.emb_scale_factor
            result_train_lst.append({'input_ids': input_ids,
                                     'hidden_states': hidden_state.cpu().tolist(),
                                     'src_ids': src_ids,
                                     'src_mask': src_mask,
                                     })
    return result_train_lst
def helper_tokenize_stream(sentence_lst, vocab_dict, model, seqlen, data_args, padding_mode, ):
    """
    Tokenise `sentence_lst` through a `datasets.Dataset` and return it as a
    `DatasetDict` with a single 'train' split.

    Resident memory is printed after every stage.

    :param sentence_lst: list of token sequences.
    :param vocab_dict: either a token -> id dict (containing 'UNK', and 'PAD'
                       when padding) or a PreTrainedTokenizerFast.
    :param model: not used by this function.
    :param seqlen: block size in 'block' mode, padded length otherwise.
    :param data_args: namespace; `preprocessing_num_workers` and
                      `overwrite_cache` are read in 'block' mode.
    :param padding_mode: 'block' concatenates all ids and cuts them into
                         chunks of `seqlen`; any other value pads each
                         sequence via _collate_batch_helper.
    :return: datasets.DatasetDict whose 'train' entry is the processed data.
    """
    import psutil

    def report_ram():
        # Process.memory_info is expressed in bytes, so convert to megabytes
        print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

    report_ram()
    from datasets import Dataset as Dataset2
    raw_datasets = Dataset2.from_dict({'text': sentence_lst})
    print(raw_datasets)
    report_ram()

    def tokenize_function(examples):
        if isinstance(vocab_dict, dict):
            # Plain vocabulary: wrap every sequence in START (0) / END (1).
            encoded = []
            for seq in examples['text']:
                encoded.append([0] + [vocab_dict.get(tok, vocab_dict['UNK']) for tok in seq] + [1])
        elif isinstance(vocab_dict, PreTrainedTokenizerFast):
            # Tokenizer object: it expects strings, so re-join the tokens.
            examples['text'] = [" ".join(seq) for seq in examples['text']]
            encoded = vocab_dict(examples['text'], add_special_tokens=True)['input_ids']
        return {'input_ids': encoded}

    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=4,
        remove_columns=['text'],
        load_from_cache_file=True,
        desc="Running tokenizer on dataset",
    )
    print(tokenized_datasets)
    report_ram()

    if padding_mode == 'block':
        block_size = seqlen

        def group_texts(examples):
            flattened = {}
            for key in examples.keys():
                flattened[key] = list(chain.from_iterable(examples[key]))
            first_key = list(examples.keys())[0]
            total_length = len(flattened[first_key])
            # Drop the trailing remainder, unless the batch is shorter than one block.
            if total_length >= block_size:
                total_length = (total_length // block_size) * block_size
            starts = range(0, total_length, block_size)
            result = {}
            for key, ids in flattened.items():
                result[key] = [ids[start: start + block_size] for start in starts]
            result["labels"] = result["input_ids"].copy()
            return result

        lm_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
            desc=f"Grouping texts in chunks of {block_size}",
        )
    else:
        def pad_function(group_lst):
            if isinstance(vocab_dict, dict):
                pad_id = vocab_dict['PAD']
            else:
                pad_id = vocab_dict.pad_token_id
            group_lst['input_ids'] = _collate_batch_helper(group_lst['input_ids'], pad_id, seqlen)
            return group_lst

        report_ram()
        lm_datasets = tokenized_datasets.map(
            pad_function,
            batched=True,
            num_proc=1,
            desc="padding",
        )

    print(lm_datasets, 'padded dataset')
    report_ram()
    import datasets
    raw_datasets = datasets.DatasetDict()
    raw_datasets['train'] = lm_datasets
    report_ram()
    return raw_datasets
def helper_tokenize_encode(sentence_lst, vocab_dict, model, seqlen, data_args, padding_mode, ):
    """
    Tokenise sentences, block or pad them to `seqlen`, and embed them.

    :param sentence_lst: list of token sequences.
    :param vocab_dict: token -> id mapping; must contain 'UNK' (and 'PAD' in
                       'pad' mode). Ids 0 and 1 are used as start/end markers
                       around every sequence.
    :param model: embedding module. Called directly on the id tensor for
                  'random*' and 'glove' experiments; for 'gpt2_pre_compress'
                  its `transformer.wte` and `down_proj` are used.
    :param seqlen: block size ('block' mode) or padded length ('pad' mode).
    :param data_args: namespace providing `experiment` (and
                      `emb_scale_factor` for 'gpt2_pre_compress').
    :param padding_mode: 'block' concatenates all ids and cuts them into
                         chunks of `seqlen`, dropping the remainder; 'pad'
                         pads each sequence via _collate_batch_helper. Any
                         other value leaves the sequences at their own length.
    :return: list of {'input_ids': ..., 'hidden_states': ...} dicts; empty
             list for empty input.
    :raises ValueError: if `data_args.experiment` is not supported.
    """
    # Empty input used to raise IndexError in 'block' mode; nothing to encode.
    if not sentence_lst:
        return []

    experiment = data_args.experiment
    use_gpt2 = experiment == 'gpt2_pre_compress'
    # Previously an unknown experiment hit an unbound `hidden_state` inside
    # the encoding loop; reject it before doing any work.
    if not (use_gpt2 or experiment == 'glove' or experiment.startswith('random')):
        raise ValueError(
            f"unsupported experiment {experiment!r}: "
            "expected 'random*', 'gpt2_pre_compress' or 'glove'"
        )

    unk_id = vocab_dict['UNK']
    result_train_lst = []
    with torch.no_grad():
        word_ids = [[0] + [vocab_dict.get(x, unk_id) for x in sentence] + [1]
                    for sentence in sentence_lst]
        print(word_ids[:2])
        if padding_mode == 'block':
            print('padding mode is block')
            # chain.from_iterable is linear; sum(lists, []) re-copies the
            # accumulated list on every step.
            flat_ids = list(chain.from_iterable(word_ids))
            block_size = seqlen
            total_length = (len(flat_ids) // block_size) * block_size
            # Split by chunks of max_len.
            word_ids = [flat_ids[i: i + block_size] for i in range(0, total_length, block_size)]
        elif padding_mode == 'pad':
            print('padding mode is pad')
            word_ids = _collate_batch_helper(word_ids, vocab_dict['PAD'], seqlen)

        for input_ids in word_ids:
            if use_gpt2:
                input_ids2 = torch.tensor(input_ids).to(model.device)
                input_embs = model.transformer.wte(input_ids2)
                hidden_state = model.down_proj(input_embs)
                hidden_state = hidden_state * data_args.emb_scale_factor
            else:  # 'random*' and 'glove' both call the embedding directly
                hidden_state = model(torch.tensor(input_ids))
            result_train_lst.append({'input_ids': input_ids,
                                     'hidden_states': hidden_state.cpu().tolist()})
    return result_train_lst
def load_glove_model(File):
    """
    Read a GloVe text file into a dict of word -> float64 tensor.

    Each non-blank line is split on whitespace: the first field is the word,
    the remaining fields are parsed as the embedding values.

    :param File: path of the GloVe text file.
    :return: dict mapping each word to a 1-D torch.float64 tensor.
    """
    print("Loading Glove Model")
    glove_model = {}
    # Read as UTF-8 explicitly so non-ASCII words do not depend on the
    # platform's default encoding.
    with open(File, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Blank lines (e.g. a trailing newline) used to raise IndexError.
                continue
            word, *values = split_line
            glove_model[word] = torch.tensor(np.array(values, dtype=np.float64))
    print(f"{len(glove_model)} words loaded!")
    return glove_model
def load_glove(vocab, glove_path='predictability/glove/glove.6B.50d.txt', dim=50):
    """
    Build an embedding layer for `vocab` initialised from GloVe vectors.

    Words missing from the GloVe file receive a random normal vector. Rows
    are laid out in the iteration order of `vocab`; this only lines up with
    the ids if `vocab` was built in id order -- NOTE(review): confirm for
    vocabularies loaded from elsewhere.

    :param vocab: dict mapping word -> id.
    :param glove_path: GloVe text file to read (defaults to the path that was
                       previously hard-coded).
    :param dim: embedding width; must match the vectors in `glove_path`.
    :return: torch.nn.Embedding whose weight holds the collected vectors.
    """
    model = torch.nn.Embedding(len(vocab), dim)
    glove_model = load_glove_model(glove_path)
    array_lst = []
    count_ = 0
    for word in vocab:
        if word in glove_model:
            # GloVe vectors are loaded as float64; cast so the weight matrix
            # has one dtype (float32, like the random rows and like a fresh
            # nn.Embedding) whether or not any word was found.
            array_lst.append(glove_model[word].to(torch.float32))
        else:
            count_ += 1
            array_lst.append(torch.randn(dim))
    # count_ is the number of words NOT found in GloVe (randomly initialised).
    print(f'{count_} out of {len(vocab)} is initialized. ')
    array_lst = torch.stack(array_lst)
    print(torch.norm(array_lst, dim=-1).mean())
    model.weight.data = array_lst
    return model
def get_corpus_rocstory(data_args, model, image_size, padding_mode='block',
                        split='train', load_vocab=None):
    """
    Load a tokenised corpus for `data_args.modality`, build or reuse a
    vocabulary, optionally create a random embedding, and encode the corpus.

    Despite the name, the ROCStories ('roc', 'roc-aug') and 'simple-wiki'
    loaders are disabled here. The branches that still read data are
    'e2e-tgt', 'yelp', 'commonGen' and 'commonGen-aug' (experiment_mode 'lm')
    and 'e2e' (experiment_mode 'conditional_gen').

    :param data_args: namespace of run options. Attributes read here:
                      experiment_mode, modality, experiment, cache_mode,
                      in_channel, checkpoint_path, and the per-dataset
                      locations e2e_train, yelp_train, commonGen_train,
                      roc_train, debug_path.
    :param model: embedding module, or None. When None and
                  data_args.experiment == 'random', a new
                  torch.nn.Embedding(len(vocab), data_args.in_channel) is
                  created and its state dict is saved to disk.
    :param image_size: the sequence length handed to the tokenise helpers is
                       image_size ** 2.
    :param padding_mode: forwarded to helper_tokenize_stream /
                         helper_tokenize_encode.
    :param split: 'train', 'valid' or 'test' ('e2e-tgt' also accepts 'debug').
    :param load_vocab: an existing vocabulary (dict or
                       PreTrainedTokenizerFast). When None, a vocabulary is
                       built from the tokens that occur more than 10 times
                       and written to disk.
    :return: (dataset, model). `dataset` is the DatasetDict from
             helper_tokenize_stream for the no-cache 'lm' modalities,
             otherwise {'train': <list of encoded examples>}.
    """
    import csv, torch, json
    from spacy.lang.en import English
    if data_args.experiment_mode == 'lm':
        if data_args.modality == 'roc':
            # ROCStories loader is disabled: this branch leaves `sentence_lst` undefined.
            pass
        # NOTE(review): this is `if`, not `elif`, so the chain below is also
        # evaluated for 'roc' (no branch matches it).
        if data_args.modality == 'roc-aug':
            # Augmented ROCStories loader is disabled: `sentence_lst` stays undefined.
            pass
        elif data_args.modality == 'simple-wiki':
            # Simple-Wikipedia loader is disabled: `sentence_lst` stays undefined.
            pass
        elif data_args.modality == 'e2e-tgt':
            print('loading dataset from simple e2e dataset')
            sentence_lst = []
            nlp = English()
            tokenizer = nlp.tokenizer
            # NOTE(review): the train/valid/test paths below are absolute
            # paths on the original author's machine; the data_args.e2e_train
            # based paths are commented out or immediately overwritten.
            if split == 'train':
                print('loading form the TRAIN set')
                path = '/data0/gonghaisong/Diffusion-LM/datasets/e2e_data/src1_train.txt'
                # path = f'../{data_args.e2e_train}/src1_train.txt'
            elif split == 'valid':
                print('loading form the VALID set')
                path = f'../{data_args.e2e_train}/src1_valid.txt'  # dead: overwritten on the next line
                path = '/data0/gonghaisong/Diffusion-LM/datasets/e2e_data/src1_valid.txt'
            elif split == 'test':
                print('loading form the TEST set')
                path = f'../{data_args.e2e_train}/src1_test.txt'  # dead: overwritten on the next line
                path = '/data0/gonghaisong/Diffusion-LM/datasets/e2e_data/src1_test.txt'
            elif split == 'debug':
                print('loading form the DEBUG set')
                path = data_args.debug_path
                import json
                # Each line is a JSON array whose first element is a
                # space-separated sentence.
                with open(path, 'r') as ff:
                    for line in ff:
                        sentence_lst.append(json.loads(line)[0].split(' '))
                # The debug set is duplicated once.
                sentence_lst = sentence_lst + sentence_lst
            if split in ['train', 'valid', 'test']:
                with open(path, 'r') as ff:
                    for row in ff:
                        # Rows are "source||target"; only the target side is
                        # kept. The row is not stripped before tokenising.
                        word_lst = row.split('||')[1]
                        word_lst = [x.text for x in tokenizer(word_lst)]
                        sentence_lst.append(word_lst)
            print(sentence_lst[:2])
        elif data_args.modality == 'yelp':
            print('loading dataset from simple YelpNLG dataset')
            sentence_lst = []
            nlp = English()
            tokenizer = nlp.tokenizer
            if split == 'train':
                print('loading form the TRAIN set')
                path = f'{data_args.yelp_train}/yelpnlg-train.csv'
            elif split == 'valid':
                print('loading form the VALID set')
                path = f'{data_args.yelp_train}/yelpnlg-dev.csv'
            elif split == 'test':
                print('loading form the TEST set')
                path = f'{data_args.yelp_train}/yelpnlg-test.csv'
            if split in ['train', 'valid', 'test']:
                with open(path, 'r') as csvfile:
                    yelp_reader = csv.reader(csvfile) #delimiter=' ', quotechar='|')
                    for row in yelp_reader:
                        # The text is taken from the second CSV column.
                        sentences = row[1]
                        word_lst = [x.text for x in tokenizer(sentences)]
                        sentence_lst.append(word_lst)
                # Drop the first row (presumably the CSV header -- TODO confirm).
                sentence_lst = sentence_lst[1:]
            print(sentence_lst[:2])
        elif data_args.modality == 'commonGen':
            # NOTE(review): the message says YelpNLG but this branch reads CommonGen files.
            print('loading dataset from simple YelpNLG dataset')
            sentence_lst = []
            nlp = English()
            tokenizer = nlp.tokenizer
            if split == 'train':
                print('loading form the TRAIN set')
                path = f'{data_args.commonGen_train}/commongen.train.jsonl'
            elif split == 'valid':
                print('loading form the VALID set')
                path = f'{data_args.commonGen_train}/commongen.dev.jsonl'
            elif split == 'test':
                print('loading form the TEST set')
                path = f'{data_args.commonGen_train}/commongen.test.jsonl'
            if split in ['train', 'valid', 'test']:
                with open(path, 'r') as ff:
                    for line in ff:
                        line = json.loads(line)
                        # Every entry of the record's 'scene' list becomes one sentence.
                        for sentences in line['scene']:
                            word_lst = [x.text for x in tokenizer(sentences)]
                            sentence_lst.append(word_lst)
            print(sentence_lst[:2])
        elif data_args.modality == 'commonGen-aug':
            # NOTE(review): the message says YelpNLG but this branch reads CommonGen files.
            print('loading dataset from simple YelpNLG dataset')
            sentence_lst = []
            nlp = English()
            tokenizer = nlp.tokenizer
            # `path_lst` holds extra augmentation files; it is only non-empty
            # for the train split.
            if split == 'train':
                print('loading form the TRAIN set')
                path = f'{data_args.commonGen_train}/commongen.train.jsonl'
                path_lst = [f'{data_args.roc_train}/roc_train.json']
                path_lst.append('diffusion_lm/improved-diffusion/diff_models/rocstories_gptj.txt')
            elif split == 'valid':
                print('loading form the VALID set')
                path = f'{data_args.commonGen_train}/commongen.dev.jsonl'
                path_lst = []
            elif split == 'test':
                print('loading form the TEST set')
                path = f'{data_args.commonGen_train}/commongen.test.jsonl'
                path_lst = []
            if split in ['train', 'valid', 'test']:
                with open(path, 'r') as ff:
                    for line in ff:
                        line = json.loads(line)
                        for sentences in line['scene']:
                            word_lst = [x.text for x in tokenizer(sentences)]
                            sentence_lst.append(word_lst)
            print(sentence_lst[:2])
            import itertools
            for path in path_lst:
                if path.endswith('txt'):
                    # Plain-text file: one story per line.
                    with open(path, 'r') as roc_reader:
                        for row in roc_reader:
                            sentences = row.strip()
                            word_lst = [x.text for x in tokenizer(sentences)]
                            # Split the token list into sentences at '.'
                            # tokens, keeping the '.' with its sentence;
                            # whatever follows the last '.' is dropped by
                            # spl[:-1].
                            spl = [[]]
                            for x, y in itertools.groupby(word_lst, lambda z: z == '.'):
                                spl[-1].extend(y)
                                if x: spl.append([])
                            sentence_lst.extend(spl[:-1])
                else:
                    # JSON-lines file: the story is the first element of each row.
                    with open(path, 'r') as roc_reader:
                        for row in roc_reader:
                            sentences = json.loads(row)[0].strip()
                            word_lst = [x.text for x in tokenizer(sentences)]
                            # Same '.'-based sentence split as above.
                            spl = [[]]
                            for x, y in itertools.groupby(word_lst, lambda z: z == '.'):
                                spl[-1].extend(y)
                                if x: spl.append([])
                            sentence_lst.extend(spl[:-1])
            print(sentence_lst[-2:])
        # Count token frequencies for the vocabulary built further down.
        # NOTE(review): for the disabled modalities `sentence_lst` is unbound
        # at this point.
        if load_vocab is None:
            counter = Counter()
            for input_ids in sentence_lst:
                counter.update(input_ids)
    if data_args.experiment_mode == 'conditional_gen':
        if data_args.modality == 'e2e':
            print('loading dataset from simple e2e dataset')
            sentence_lst = []
            nlp = English()
            tokenizer = nlp.tokenizer
            if split == 'train':
                path = f'{data_args.e2e_train}/src1_train.txt'
                with open(path, 'r') as ff:
                    for row in ff:
                        # Rows are "source||target"; both sides are kept as a pair.
                        src_lst, word_lst = row.split('||')
                        word_lst = [x.text for x in tokenizer(word_lst)]
                        src_lst = [x.text for x in tokenizer(src_lst)]
                        sentence_lst.append((src_lst, word_lst))
            elif split == 'valid':
                path = f'{data_args.e2e_train}/src1_valid.txt'
                sentence_lst = read_e2e_files(path, data_args, tokenizer)
            print(sentence_lst[:2])
        # Count token frequencies over both the source and the target side.
        if load_vocab is None:
            counter = Counter()
            for (src_ids, input_ids) in sentence_lst:
                counter.update(input_ids)
                counter.update(src_ids)
    if load_vocab is None:
        # Ids 0-3 are reserved; every token seen more than 10 times gets the
        # next free id.
        vocab_dict = {'START': 0, 'END': 1, 'UNK':2, 'PAD':3}
        for k, v in counter.items():
            if v > 10:
                vocab_dict[k] = len(vocab_dict)
        print(len(counter), len(vocab_dict))
        # NOTE(review): absolute path on the original author's machine.
        path_save_vocab = '/data0/gonghaisong/Diffusion-LM/improved-diffusion/diffusion_models/diff_e2e-tgt_block_rand16_transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart_e2e/vocab.json'
        print(f'save the vocab to {path_save_vocab}')
        with open(path_save_vocab, 'w') as f:
            json.dump(vocab_dict, f)
    else:
        vocab_dict = load_vocab
        # NOTE(review): absolute path on the original author's machine.
        path_save_vocab = '/data0/gonghaisong/Diffusion-LM/improved-diffusion/diffusion_models/diff_e2e-tgt_block_rand16_transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart_e2e/vocab.json'
        # The supplied vocabulary is only written out if no file exists yet.
        if not os.path.exists(path_save_vocab):
            print(f'save the vocab to {path_save_vocab}')
            if isinstance(vocab_dict, dict):
                with open(path_save_vocab, 'w') as f:
                    json.dump(vocab_dict, f)
                assert vocab_dict['START'] == 0
            elif isinstance(vocab_dict, PreTrainedTokenizerFast):
                # Tokenizer objects are saved under checkpoint_path, not path_save_vocab.
                vocab_dict.save_pretrained(data_args.checkpoint_path)
            else:
                assert False, "invalid type of vocab_dict"
    if model is None and data_args.experiment == 'random':
        model = torch.nn.Embedding(len(vocab_dict), data_args.in_channel)
        print('initializing the random embeddings', model)
        torch.nn.init.normal_(model.weight)
        # NOTE(review): the message below names data_args.checkpoint_path,
        # but the file is written to this absolute path.
        path_save = '/data0/gonghaisong/Diffusion-LM/improved-diffusion/diffusion_models/diff_e2e-tgt_block_rand16_transformer_lr0.0001_0.0_2000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart_e2e/random_emb.torch'
        print(f'save the random encoder to {data_args.checkpoint_path}/random_emb.torch')
        torch.save(model.state_dict(), path_save)
    # path_save = f'{data_args.checkpoint_path}/random_emb.torch'
    # if not os.path.exists(path_save) and data_args.experiment == 'random':
    #     torch.save(model.state_dict(), path_save)
    # No-cache 'lm' modalities return the DatasetDict from
    # helper_tokenize_stream; the other paths return a list of encoded
    # examples wrapped in {'train': ...}.
    if data_args.experiment_mode == 'lm' and data_args.modality in ['roc-aug', 'roc', 'yelp', 'commonGen', 'commonGen-aug'] \
            and data_args.cache_mode=='no':
        train_dataset = helper_tokenize_stream(sentence_lst, vocab_dict, model, image_size**2, data_args, padding_mode)
        return train_dataset, model
    elif data_args.experiment_mode == 'lm':
        result_train_lst = helper_tokenize_encode(sentence_lst, vocab_dict, model, image_size**2, data_args, padding_mode)
    elif data_args.experiment_mode == 'conditional_gen':
        result_train_lst = helper_tokenize_encode_cond(sentence_lst, vocab_dict, model, image_size ** 2, data_args)
    return {'train': result_train_lst}, model
def write_e2e_corr(prompt_lst, file_dict, corr_path):
    """
    Write the reference texts of every prompt to `corr_path`.

    For each prompt, its references are written one per line (tokens joined
    by single spaces), followed by one empty line. The number of prompts is
    printed first.

    :param prompt_lst: prompts, in output order; each must be a key of `file_dict`.
    :param file_dict: maps a prompt to its list of token sequences.
    :param corr_path: output file path (overwritten).
    """
    print(len(prompt_lst))
    with open(corr_path, 'w') as f:
        for prompt in prompt_lst:
            for tokens in file_dict[prompt]:
                f.write(" ".join(tokens) + "\n")
            # An empty line closes each prompt's group of references.
            f.write("\n")
def write_e2e_src(prompt_lst, corr_path):
    """
    Write every prompt to `corr_path`, one per line, tokens joined by spaces.

    :param prompt_lst: iterable of token sequences.
    :param corr_path: output file path (overwritten).
    """
    with open(corr_path, 'w') as f:
        for tokens in prompt_lst:
            f.write(" ".join(tokens) + "\n")
    return None
def read_e2e_files(path, args, tokenizer):
    """
    Read an E2E "source||target" file, group targets by source, and dump the
    gold references and the sources next to each other in `args.out_dir`.

    :param path: input file; every line must contain exactly one '||'.
    :param args: namespace providing `out_dir` and `split`, which form the
                 output names '1_<split>_gold' and '1_<split>_src'.
    :param tokenizer: callable returning objects with a `.text` attribute.
    :return: list of (source_tokens, first_target_tokens) tuples, one per
             distinct source, in order of first appearance.
    """
    grouped = {}
    with open(path, 'r') as f:
        for raw_line in f:
            src_text, tgt_text = raw_line.strip().split('||')
            tgt = tuple(tok.text for tok in tokenizer(tgt_text))
            src = tuple(tok.text for tok in tokenizer(src_text))
            grouped.setdefault(src, []).append(tgt)

    prompts = list(grouped)
    prefix = '1'

    gold_dir = os.path.join(args.out_dir, '{}_{}_{}'.format(prefix, args.split, 'gold'))
    print("gold dir", gold_dir)
    write_e2e_corr(prompts, grouped, gold_dir)

    src_dir = os.path.join(args.out_dir, '{}_{}_{}'.format(prefix, args.split, 'src'))
    write_e2e_src(prompts, src_dir)

    # Only the first reference of each source is returned to the caller.
    return [(src, grouped[src][0]) for src in prompts]
def get_corpus_book(data_args, tokenizer, model, image_size, padding_mode='block', split='train',):
    """
    Load BookCorpus, tokenise it, and cut it into blocks of image_size ** 2 ids.

    If the dataset has no 'validation' split, the first 1% of 'train' becomes
    the validation split and the remaining 99% the training split.

    :param data_args: namespace providing `preprocessing_num_workers`, and,
                      when `model` is None, `training_mode`, `in_channel`
                      and `checkpoint_path`.
    :param tokenizer: callable tokenizer applied to the 'text' column with
                      add_special_tokens=False; len(tokenizer) sizes the
                      embedding.
    :param model: embedding module, or None to create a random one (its state
                  dict is saved to '<checkpoint_path>/random_emb.torch').
    :param image_size: block length is image_size ** 2.
    :param padding_mode: must be 'block'.
    :param split: 'train' returns the datasets as they are; any other value
                  replaces the 'train' entry with the validation split.
    :return: (lm_datasets, model).
    """
    max_length = image_size ** 2
    assert padding_mode == 'block'

    # `load_dataset` is not imported at module level (that import is
    # commented out), so calling this function used to raise NameError.
    from datasets import load_dataset

    raw_datasets = load_dataset('bookcorpus')
    if "validation" not in raw_datasets.keys():
        raw_datasets["validation"] = load_dataset(
            'bookcorpus',
            split="train[:1%]",
        )
        raw_datasets["train"] = load_dataset(
            'bookcorpus',
            split="train[1%:]",
        )
    print(raw_datasets)
    column_names = raw_datasets["train"].column_names

    def tokenize_function(examples):
        return tokenizer(examples['text'], add_special_tokens=False)

    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=True,
    )
    print(tokenized_datasets)

    block_size = max_length

    # Main data processing function that will concatenate all texts from our
    # dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # Drop the trailing remainder, unless the batch is shorter than one block.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        return {
            k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }

    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=4,
        load_from_cache_file=True,
        desc=f"Grouping texts in chunks of {block_size}",
    )
    print(lm_datasets)

    if model is None:
        if data_args.training_mode.startswith('e2e'):
            print('since its e2e, initialize a dummy embedding' )
            model = torch.nn.Embedding(len(tokenizer), 1)
        else:
            model = torch.nn.Embedding(len(tokenizer), data_args.in_channel)
        print('initializing the random embeddings', model)
        torch.nn.init.normal_(model.weight)
        path_save = f'{data_args.checkpoint_path}/random_emb.torch'
        print(f'save the random encoder to {data_args.checkpoint_path}/random_emb.torch')
        torch.save(model.state_dict(), path_save)

    if split != 'train':
        # Callers read the 'train' entry, so expose the validation data under it.
        lm_datasets['train'] = lm_datasets['validation']
    return lm_datasets, model
+class TextDataset(Dataset):
+    """Map-style dataset that serves pre-computed hidden states with their token ids.
+
+    Rows are read from ``text_datasets['train']``. Each row must provide
+    ``hidden_states`` and ``input_ids``; when
+    ``data_args.experiment_mode == 'conditional_gen'`` it must also provide
+    ``src_ids`` and ``src_mask``.
+
+    Sample construction for the ``'conv-unet'`` and ``'1d-unet'`` architectures
+    is disabled in this file, so indexing such a dataset raises
+    ``NotImplementedError`` instead of silently yielding ``None``.
+    """
+    # Architectures whose sample-building code path is disabled.
+    _UNSUPPORTED_ARCHS = ('conv-unet', '1d-unet')
+    def __init__(self, text_datasets, resolution, data_args, model_arch='conv-unet',
+                 classes=None, shard=0, num_shards=1, eigen_transform=None,
+                 mapping_func=None, model_emb=None):
+        """Store the configuration; ``classes``, ``shard`` and ``num_shards`` are accepted but unused."""
+        super().__init__()
+        self.resolution = resolution
+        self.text_datasets = text_datasets
+        self.length = len(self.text_datasets['train'])
+        self.model_arch = model_arch
+        self.data_args = data_args
+        print(self.resolution)
+        self.eigen_transform = eigen_transform
+        self.mapping_func = mapping_func
+        self.model_emb = model_emb
+    def __len__(self):
+        """Return the number of rows in ``text_datasets['train']`` at construction time."""
+        return self.length
+    def __getitem__(self, idx):
+        """Return ``(arr, out_dict)`` for row ``idx``.
+
+        ``arr`` is the row's ``hidden_states`` as a float32 array, optionally
+        projected with ``eigen_transform`` (``mean`` is subtracted, then the
+        result is multiplied by ``map``) and perturbed with Gaussian noise
+        scaled by ``data_args.noise_level``. ``out_dict`` carries ``input_ids``
+        and, in ``conditional_gen`` mode, ``src_ids`` and ``src_mask``.
+
+        Raises:
+            NotImplementedError: if ``model_arch`` is ``'conv-unet'`` or ``'1d-unet'``.
+        """
+        if self.model_arch in self._UNSUPPORTED_ARCHS:
+            raise NotImplementedError(
+                f"TextDataset does not build samples for model_arch={self.model_arch!r}"
+            )
+        # Fetch the row once instead of re-indexing the dataset for every field.
+        row = self.text_datasets['train'][idx]
+        arr = np.array(row['hidden_states'], dtype=np.float32)
+        if self.eigen_transform is not None:
+            old_shape = arr.shape
+            arr = arr.reshape(1, -1) - self.eigen_transform['mean']
+            arr = arr @ self.eigen_transform['map']
+            arr = arr.reshape(old_shape)
+        if hasattr(self.data_args, 'noise_level') and self.data_args.noise_level > 0:
+            arr = arr + self.data_args.noise_level * np.random.randn(*arr.shape).astype(arr.dtype)
+        out_dict = {'input_ids': np.array(row['input_ids'])}
+        if self.data_args.experiment_mode == 'conditional_gen':
+            out_dict['src_ids'] = np.array(row['src_ids'])
+            out_dict['src_mask'] = np.array(row['src_mask'])
+        return arr, out_dict
+class TextDataset_NoCache(Dataset):
+    """Map-style dataset that embeds ``input_ids`` on the fly with ``model_emb``.
+
+    Rows are read from ``text_datasets['train']``. The embedding strategy is
+    chosen by ``data_args.experiment``:
+
+    * names starting with ``'random'`` call ``model_emb`` directly on the ids;
+    * ``'gpt2_pre_compress'`` applies ``model_emb.transformer.wte`` followed by
+      ``model_emb.down_proj`` and scales by ``data_args.emb_scale_factor``.
+    """
+    def __init__(self, text_datasets, resolution, data_args, model_arch='conv-unet',
+                 classes=None, shard=0, num_shards=1, eigen_transform=None,
+                 mapping_func=None, model_emb=None):
+        """Store the configuration; ``classes``, ``shard`` and ``num_shards`` are accepted but unused."""
+        super().__init__()
+        self.resolution = resolution
+        self.text_datasets = text_datasets
+        self.length = len(self.text_datasets['train'])
+        self.model_arch = model_arch
+        self.data_args = data_args
+        print(self.resolution)
+        self.eigen_transform = eigen_transform
+        self.mapping_func = mapping_func
+        self.model_emb = model_emb
+    def __len__(self):
+        """Return the number of rows in ``text_datasets['train']`` at construction time."""
+        return self.length
+    def _embed(self, input_ids):
+        """Return the hidden state tensor for ``input_ids`` per ``data_args.experiment``."""
+        model = self.model_emb
+        experiment = self.data_args.experiment
+        if experiment.startswith('random'):
+            return model(torch.tensor(input_ids))
+        if experiment == 'gpt2_pre_compress':
+            ids = torch.tensor(input_ids).to(model.device)
+            hidden_state = model.down_proj(model.transformer.wte(ids))
+            # The scale factor lives on the stored args; a bare `data_args`
+            # name is not defined inside this method.
+            return hidden_state * self.data_args.emb_scale_factor
+        raise ValueError(f"unsupported experiment for on-the-fly embedding: {experiment!r}")
+    def _project_and_perturb(self, arr):
+        """Apply the optional eigen projection, then the optional Gaussian noise."""
+        if self.eigen_transform is not None:
+            old_shape = arr.shape
+            arr = arr.reshape(1, -1) - self.eigen_transform['mean']
+            arr = arr @ self.eigen_transform['map']
+            arr = arr.reshape(old_shape)
+        if hasattr(self.data_args, 'noise_level') and self.data_args.noise_level > 0:
+            arr = arr + self.data_args.noise_level * np.random.randn(*arr.shape).astype(arr.dtype)
+        return arr
+    def __getitem__(self, idx):
+        """Return ``(arr, out_dict)`` for row ``idx``.
+
+        ``arr`` is the float32 embedding of the row's ``input_ids`` after the
+        optional projection and noise. For ``'conv-unet'`` it is reshaped to
+        ``(resolution, resolution, -1)`` and transposed with axes ``[2, 0, 1]``;
+        for ``'1d-unet'`` it is transposed with axes ``[1, 0]``; otherwise it is
+        returned as embedded. ``out_dict`` always carries ``input_ids``; for
+        the remaining architectures in ``conditional_gen`` mode it also carries
+        ``src_ids`` and ``src_mask``.
+
+        Raises:
+            ValueError: if ``data_args.experiment`` selects no embedding strategy.
+        """
+        row = self.text_datasets['train'][idx]
+        with torch.no_grad():
+            hidden_state = self._embed(row['input_ids'])
+        # Move to host memory first: NumPy cannot read device tensors.
+        arr = np.array(hidden_state.detach().cpu(), dtype=np.float32)
+        out_dict = {'input_ids': np.array(row['input_ids'])}
+        if self.model_arch == 'conv-unet':
+            arr = arr.reshape(self.resolution, self.resolution, -1)
+            arr = self._project_and_perturb(arr)
+            return np.transpose(arr, [2, 0, 1]), out_dict
+        if self.model_arch == '1d-unet':
+            arr = self._project_and_perturb(arr)
+            return np.transpose(arr, [1, 0]), out_dict
+        arr = self._project_and_perturb(arr)
+        if self.data_args.experiment_mode == 'conditional_gen':
+            out_dict['src_ids'] = np.array(row['src_ids'])
+            out_dict['src_mask'] = np.array(row['src_mask'])
+        return arr, out_dict
+def _collate_batch_helper(examples, pad_token_id, max_length, return_mask=False):
+    """Pad or truncate every example to ``max_length`` and return nested lists.
+
+    Args:
+        examples: sequence of token-id sequences.
+        pad_token_id: value written into positions past an example's end.
+        max_length: length of every returned row.
+        return_mask: when true, also return a mask holding 1 at real token
+            positions and 0 at padded positions.
+
+    Returns:
+        ``result`` (list of rows), or ``(result, mask)`` when ``return_mask`` is set.
+    """
+    pad = int(pad_token_id)
+    result = []
+    mask_ = []
+    for example in examples:
+        curr_len = min(len(example), max_length)
+        pad_len = max_length - curr_len
+        result.append(list(example[:curr_len]) + [pad] * pad_len)
+        # Padded positions must be 0 regardless of the pad token id; seeding
+        # the mask with pad_token_id marks padding as valid when the id != 0.
+        mask_.append([1] * curr_len + [0] * pad_len)
+    if return_mask:
+        return result, mask_
+    return result
+def _torch_collate_batch(examples, pad_token_id, max_length):
+    """Collate ``examples`` into one ``(len(examples), max_length)`` tensor.
+
+    Each example fills its row from the left; remaining positions hold
+    ``pad_token_id``. Examples longer than ``max_length`` are truncated, which
+    matches ``_collate_batch_helper``. Lists, tuples and NumPy arrays are
+    converted to ``torch.long`` tensors first; an empty batch yields an empty
+    ``(0, max_length)`` long tensor.
+    """
+    import numpy as np
+    import torch
+    if len(examples) == 0:
+        return torch.empty((0, max_length), dtype=torch.long)
+    # Tensorize if necessary.
+    if isinstance(examples[0], (list, tuple, np.ndarray)):
+        examples = [torch.tensor(e, dtype=torch.long) for e in examples]
+    result = examples[0].new_full([len(examples), max_length], pad_token_id)
+    for i, example in enumerate(examples):
+        # Clamp the copy so an over-long example cannot overflow its row.
+        keep = min(example.shape[0], max_length)
+        result[i, :keep] = example[:keep]
+    return result

src/improved_diffusion/train_util.py ADDED Viewed

	@@ -0,0 +1,445 @@

+import os
+import copy
+import functools
+import blobfile as bf
+import torch
+import torch.distributed as dist
+from torch.nn.parallel.distributed import DistributedDataParallel as DDP
+from torch.optim import AdamW
+from . import dist_util, logger
+from .fp16_util import (
+    make_master_params,
+    master_params_to_model_params,
+    model_grads_to_master_grads,
+    unflatten_master_params,
+    zero_grad,
+)
+from .nn import update_ema
+from .resample import LossAwareSampler, UniformSampler
+import wandb
+from tqdm import tqdm
+INITIAL_LOG_LOSS_SCALE = 20.0
+class TrainLoop:
+    def __init__(
+        self,
+        *,
+        model,
+        diffusion,
+        data,
+        batch_size,
+        microbatch,
+        lr,
+        ema_rate,
+        log_interval,
+        save_interval,
+        resume_checkpoint,
+        use_fp16=False,
+        fp16_scale_growth=1e-3,
+        schedule_sampler=None,
+        weight_decay=0.0,
+        lr_anneal_steps=0,
+        checkpoint_path="",
+        gradient_clipping=-1.0,
+        eval_data=None,
+        eval_interval=-1,
+    ):
+        """Store the training configuration and build the optimizer and EMA copies.
+
+        Notes grounded in this constructor:
+          * ``lr`` is multiplied by the distributed world size.
+          * ``microbatch <= 0`` means "use ``batch_size``".
+          * ``ema_rate`` is either one float or a comma-separated string of floats.
+          * ``schedule_sampler`` defaults to ``UniformSampler(diffusion)``.
+          * The model is not wrapped in ``DistributedDataParallel``;
+            ``ddp_model`` is the plain model.
+          * Resuming is not wired up here: ``resume_step`` starts at 0 and the
+            EMA parameters are always fresh copies of the model parameters.
+        """
+        print('Initiating train loop')
+        rank = dist.get_rank()
+        world_size = dist.get_world_size()
+        self.rank = rank
+        self.world_size = world_size
+        self.diffusion = diffusion
+        self.data = data
+        self.eval_data = eval_data
+        self.batch_size = batch_size
+        self.microbatch = microbatch if microbatch > 0 else batch_size
+        self.lr = lr * world_size
+        self.ema_rate = (
+            [ema_rate]
+            if isinstance(ema_rate, float)
+            else [float(x) for x in ema_rate.split(",")]
+        )
+        self.log_interval = log_interval
+        self.eval_interval = eval_interval
+        self.save_interval = save_interval
+        self.resume_checkpoint = resume_checkpoint
+        self.use_fp16 = use_fp16
+        self.fp16_scale_growth = fp16_scale_growth
+        self.schedule_sampler = schedule_sampler or UniformSampler(diffusion)
+        self.weight_decay = weight_decay
+        self.lr_anneal_steps = lr_anneal_steps
+        self.gradient_clipping = gradient_clipping
+        self.step = 0
+        self.resume_step = 0
+        self.global_batch = self.batch_size * world_size
+        self.lg_loss_scale = INITIAL_LOG_LOSS_SCALE
+        self.sync_cuda = torch.cuda.is_available()
+        self.checkpoint_path = checkpoint_path
+        if torch.cuda.is_available():  # DEBUG **
+            self.model = model.to(rank)
+            # Kept as True for compatibility with existing readers of this
+            # flag, although the DDP wrapper itself is disabled.
+            self.use_ddp = True
+        else:
+            # Moving to an integer device index requires CUDA, so stay on the
+            # CPU and make sure `use_ddp` is always defined.
+            self.model = model.to("cpu")
+            self.use_ddp = False
+        self.ddp_model = self.model
+        self.model_params = list(self.ddp_model.parameters())
+        self.master_params = self.model_params
+        self.opt = AdamW(self.master_params, lr=self.lr, weight_decay=self.weight_decay)
+        # `resume_step` is 0 at this point, so EMA state always starts as a
+        # copy of the current parameters, one copy per EMA rate.
+        self.ema_params = [
+            copy.deepcopy(self.master_params) for _ in range(len(self.ema_rate))
+        ]
+        print('Finish initiating train loop')
+    def _load_and_sync_parameters(self):
+        """Load model weights from the resume checkpoint on rank 0, then sync parameters.
+
+        The checkpoint is ``find_resume_checkpoint()`` or, failing that,
+        ``self.resume_checkpoint``. When one is available, ``resume_step`` is
+        parsed from its filename on every rank.
+        """
+        checkpoint = find_resume_checkpoint() or self.resume_checkpoint
+        if checkpoint:
+            self.resume_step = parse_resume_step_from_filename(checkpoint)
+            on_primary_rank = dist.get_rank() == 0
+            if on_primary_rank:
+                print(f"loading model from checkpoint: {checkpoint}...")
+                weights = dist_util.load_state_dict(
+                    checkpoint, map_location=dist_util.dev()
+                )
+                self.model.load_state_dict(weights)
+        dist_util.sync_params(self.model.parameters())
+    def _load_ema_parameters(self, rate):
+        """Return EMA parameters for ``rate``, loaded from a checkpoint when one exists.
+
+        Starts from a deep copy of the master parameters. If
+        ``find_ema_checkpoint`` locates a file, rank 0 replaces the copy with
+        its contents. The result is passed through ``dist_util.sync_params``
+        before being returned.
+        """
+        ema_params = copy.deepcopy(self.master_params)
+        main_checkpoint = find_resume_checkpoint() or self.resume_checkpoint
+        ema_checkpoint = find_ema_checkpoint(main_checkpoint, self.resume_step, rate)
+        if ema_checkpoint and dist.get_rank() == 0:
+            logger.log(f"loading EMA from checkpoint: {ema_checkpoint}...")
+            loaded = dist_util.load_state_dict(
+                ema_checkpoint, map_location=dist_util.dev()
+            )
+            ema_params = self._state_dict_to_master_params(loaded)
+        dist_util.sync_params(ema_params)
+        return ema_params
+    def _load_optimizer_state(self):
+        """Restore optimizer state from ``opt<resume_step>.pt`` next to the main checkpoint.
+
+        Does nothing when that file does not exist.
+        """
+        main_checkpoint = find_resume_checkpoint() or self.resume_checkpoint
+        checkpoint_dir = bf.dirname(main_checkpoint)
+        opt_checkpoint = bf.join(checkpoint_dir, f"opt{self.resume_step:06}.pt")
+        if not bf.exists(opt_checkpoint):
+            return
+        logger.log(f"loading optimizer state from checkpoint: {opt_checkpoint}")
+        restored = dist_util.load_state_dict(
+            opt_checkpoint, map_location=dist_util.dev()
+        )
+        self.opt.load_state_dict(restored)
+    def _setup_fp16(self):
+        """Build master parameters with ``make_master_params`` and convert the model to fp16."""
+        fp16_masters = make_master_params(self.model_params)
+        self.master_params = fp16_masters
+        self.model.convert_to_fp16()
+    def run_loop(self):
+        """Run training steps until the step budget is exhausted, saving periodically.
+
+        The budget is ``lr_anneal_steps // world_size``; a zero
+        ``lr_anneal_steps`` means "run until the data iterator stops". A
+        checkpoint is written every ``save_interval`` steps (never at step 0)
+        and once more after the loop unless the last completed step was
+        already saved. When the ``DIFFUSION_TRAINING_TEST`` environment
+        variable is set, the loop returns right after the first periodic save.
+        """
+        total_steps = self.lr_anneal_steps // self.world_size
+        # With no step budget the length is unknown; tqdm expects total=None
+        # for that case rather than 0.
+        pbar = tqdm(total=total_steps if self.lr_anneal_steps else None)
+        print('Start running train loop')
+        try:
+            while (
+                not self.lr_anneal_steps
+                or self.step + self.resume_step < total_steps
+            ):
+                pbar.set_description(f"Step: {self.step + self.resume_step}")
+                batch = next(self.data)
+                self.run_step(batch, cond=None)
+                # A non-positive interval means "no evaluation"; without this
+                # guard the default of -1 matches every step and 0 would
+                # divide by zero.
+                if (
+                    self.eval_data is not None
+                    and self.eval_interval > 0
+                    and self.step % self.eval_interval == 0
+                ):
+                    print("eval on validation set")
+                if self.step % self.save_interval == 0 and self.step != 0:
+                    self.save()
+                    # Run for a finite amount of time in integration tests.
+                    if os.environ.get("DIFFUSION_TRAINING_TEST", "") and self.step > 0:
+                        return
+                self.step += 1
+                pbar.update(1)
+        finally:
+            # Release the progress bar on every exit path, including errors.
+            pbar.close()
+        # Save the last checkpoint if it wasn't already saved.
+        if (self.step - 1) % self.save_interval != 0:
+            self.save()
+    def run_step(self, batch, cond):
+        """Run one forward/backward pass, one optimizer update, then log the step."""
+        self.forward_backward(batch, cond)
+        optimize = self.optimize_fp16 if self.use_fp16 else self.optimize_normal
+        optimize()
+        self.log_step()
+    def forward_only(self, batch, cond):
+        """Compute the training losses on ``batch`` without gradients, one microbatch at a time.
+
+        ``batch`` is sliced along its first dimension in steps of
+        ``self.microbatch``; every value in ``cond`` is sliced the same way.
+        Losses are weighted by the schedule sampler's weights and handed to
+        ``log_loss_dict`` under ``eval_``-prefixed keys.
+        """
+        with torch.no_grad():
+            zero_grad(self.model_params)
+            for i in range(0, batch.shape[0], self.microbatch):
+                micro = batch[i : i + self.microbatch].to(dist_util.dev())
+                micro_cond = {
+                    k: v[i : i + self.microbatch].to(dist_util.dev())
+                    for k, v in cond.items()
+                }
+                last_batch = (i + self.microbatch) >= batch.shape[0]
+                t, weights = self.schedule_sampler.sample(
+                    micro.shape[0], dist_util.dev()
+                )
+                compute_losses = functools.partial(
+                    self.diffusion.training_losses,
+                    self.ddp_model,
+                    micro,
+                    t,
+                    micro_cond,
+                )
+                # `no_sync` only exists on a DDP wrapper, and `use_ddp` may be
+                # unset; fall back to a plain call in both situations.
+                defer_sync = (
+                    not last_batch
+                    and getattr(self, "use_ddp", False)
+                    and hasattr(self.ddp_model, "no_sync")
+                )
+                if defer_sync:
+                    with self.ddp_model.no_sync():
+                        losses = compute_losses()
+                else:
+                    losses = compute_losses()
+                log_loss_dict(
+                    self.diffusion,
+                    t,
+                    {f"eval_{k}": v * weights for k, v in losses.items()},
+                )
+    def forward_backward(self, batch, cond):
+        """Accumulate gradients for ``batch``, processing it in microbatches.
+
+        ``batch`` is indexed as ``(selfies_ids, caption_state, caption_mask,
+        corrupted_selfies_ids)``; every element is sliced along its first
+        dimension in steps of ``self.microbatch`` and moved to ``self.rank``.
+        ``cond`` is accepted for interface compatibility and is not used.
+
+        On logging steps rank 0 prints the loss and sends it to ``wandb``.
+        """
+        self.opt.zero_grad()
+        parts = (batch[0], batch[1], batch[2], batch[3])
+        batch_size = parts[0].shape[0]
+        for i in range(0, batch_size, self.microbatch):
+            # Slice every tensor so each iteration sees its own microbatch;
+            # feeding the whole batch each time would accumulate duplicated
+            # gradients whenever microbatch < batch size.
+            micro = tuple(
+                part[i : i + self.microbatch].to(self.rank) for part in parts
+            )
+            last_batch = (i + self.microbatch) >= batch_size
+            t, weights = self.schedule_sampler.sample(micro[0].shape[0], self.rank)
+            compute_losses = functools.partial(
+                self.diffusion.training_losses,
+                self.ddp_model,
+                micro,
+                t,
+                None,
+            )
+            # `no_sync` is only available on a DDP wrapper, which is disabled
+            # in this trainer; `use_ddp` may also be unset on some paths.
+            defer_sync = (
+                not last_batch
+                and getattr(self, "use_ddp", False)
+                and hasattr(self.ddp_model, "no_sync")
+            )
+            if defer_sync:
+                with self.ddp_model.no_sync():
+                    losses = compute_losses()
+            else:
+                losses = compute_losses()
+            if isinstance(self.schedule_sampler, LossAwareSampler):
+                self.schedule_sampler.update_with_local_losses(
+                    t, losses["loss"].detach()
+                )
+            loss = (losses["loss"] * weights).mean()
+            if self.step % self.log_interval == 0 and self.rank == 0:
+                print("rank0: ", self.step, loss.item())
+                wandb.log({"loss": loss.item()})
+            if self.use_fp16:
+                # Scale the loss before backprop; `optimize_fp16` divides the
+                # gradient by the same factor. Skipping backward here would
+                # leave every gradient unset.
+                # NOTE(review): the rest of the fp16 path is not exercised in
+                # this file - confirm before enabling use_fp16.
+                loss_scale = 2 ** self.lg_loss_scale
+                (loss * loss_scale).backward()
+            else:
+                loss.backward()
+    def optimize_fp16(self):
+        """Apply one fp16 optimizer update, or lower the loss scale on non-finite gradients.
+
+        If any model gradient contains a non-finite value, ``lg_loss_scale`` is
+        decreased by 1 and the update is skipped. Otherwise gradients are
+        copied to the master parameters, the first master gradient is divided
+        by ``2 ** lg_loss_scale``, the optimizer steps, the EMA copies are
+        updated and ``lg_loss_scale`` grows by ``fp16_scale_growth``.
+        """
+        grads_are_finite = all(
+            torch.isfinite(p.grad).all() for p in self.model_params
+        )
+        if not grads_are_finite:
+            self.lg_loss_scale -= 1
+            logger.log(f"Found NaN, decreased lg_loss_scale to {self.lg_loss_scale}")
+            return
+        model_grads_to_master_grads(self.model_params, self.master_params)
+        inverse_scale = 1.0 / (2**self.lg_loss_scale)
+        self.master_params[0].grad.mul_(inverse_scale)
+        self._log_grad_norm()
+        self._anneal_lr()
+        self.opt.step()
+        for ema_rate, ema_params in zip(self.ema_rate, self.ema_params):
+            update_ema(ema_params, self.master_params, rate=ema_rate)
+        master_params_to_model_params(self.model_params, self.master_params)
+        self.lg_loss_scale += self.fp16_scale_growth
+    def grad_clip(self):
+        """Clip gradients to ``self.gradient_clipping``.
+
+        An optimizer that exposes ``clip_grad_norm`` does the clipping itself;
+        otherwise ``torch.nn.utils.clip_grad_norm_`` is applied to the model's
+        parameters.
+        """
+        norm_limit = self.gradient_clipping
+        if hasattr(self.opt, "clip_grad_norm"):
+            # Some optimizers (like the sharded optimizer) clip in their own way.
+            self.opt.clip_grad_norm(norm_limit)
+            return
+        torch.nn.utils.clip_grad_norm_(self.model.parameters(), norm_limit)
+    def optimize_normal(self):
+        """Clip gradients if enabled, anneal the learning rate, step the optimizer, update EMAs."""
+        clipping_enabled = self.gradient_clipping > 0
+        if clipping_enabled:
+            self.grad_clip()
+        self._anneal_lr()
+        self.opt.step()
+        for ema_rate, ema_params in zip(self.ema_rate, self.ema_params):
+            update_ema(ema_params, self.master_params, rate=ema_rate)
+    def _log_grad_norm(self):
+        """Accumulate the squared gradient norm of the master parameters.
+
+        The logging call that consumed the value is disabled, so the sum is
+        computed and then discarded.
+        """
+        squared_total = 0.0
+        for param in self.master_params:
+            squared = param.grad**2
+            squared_total = squared_total + squared.sum().item()
+    def _anneal_lr(self):
+        """Linearly decay the learning rate of every param group.
+
+        The rate is ``self.lr * (1 - (step + resume_step) / lr_anneal_steps)``;
+        nothing changes when ``lr_anneal_steps`` is zero.
+        """
+        if self.lr_anneal_steps:
+            progress = (self.step + self.resume_step) / self.lr_anneal_steps
+            annealed = self.lr * (1 - progress)
+            for group in self.opt.param_groups:
+                group["lr"] = annealed
+    def log_step(self):
+        """Record the global step, plus the fp16 loss scale when fp16 is enabled."""
+        global_step = self.step + self.resume_step
+        logger.logkv("step", global_step)
+        if not self.use_fp16:
+            return
+        logger.logkv("lg_loss_scale", self.lg_loss_scale)
+    def save(self):
+        """Write the model checkpoint and one checkpoint per EMA rate, then barrier.
+
+        Files go to ``self.checkpoint_path`` and are written by rank 0 only.
+        The step in each filename is ``(step + resume_step) * world_size``.
+        """
+        global_step = (self.step + self.resume_step) * self.world_size
+        def save_checkpoint(rate, params):
+            # The state dict is built on every rank; only rank 0 writes it.
+            state_dict = self._master_params_to_state_dict(params)
+            if dist.get_rank() != 0:
+                return
+            print(f"saving model {rate}...")
+            if rate:
+                filename = f"PLAIN_ema_{rate}_{global_step:06d}.pt"
+            else:
+                filename = f"PLAIN_model{global_step:06d}.pt"
+            target = bf.join(self.checkpoint_path, filename)
+            with bf.BlobFile(target, "wb") as f:  # DEBUG **
+                torch.save(state_dict, f)
+        save_checkpoint(0, self.master_params)
+        for ema_rate, ema_params in zip(self.ema_rate, self.ema_params):
+            save_checkpoint(ema_rate, ema_params)
+        dist.barrier()
+    def _master_params_to_state_dict(self, master_params):
+        """Return the model's state dict with parameter entries replaced by ``master_params``.
+
+        ``master_params`` is matched to ``model.named_parameters()`` by
+        position. In fp16 mode it is first passed through
+        ``unflatten_master_params``.
+        """
+        if self.use_fp16:
+            master_params = unflatten_master_params(
+                list(self.model.parameters()), master_params  # DEBUG **
+            )
+        state_dict = self.model.state_dict()
+        param_names = [name for name, _ in self.model.named_parameters()]
+        for position, name in enumerate(param_names):
+            assert name in state_dict
+            state_dict[name] = master_params[position]
+        return state_dict
+    def _state_dict_to_master_params(self, state_dict):
+        """Return the ``state_dict`` entries ordered like ``model.named_parameters()``.
+
+        In fp16 mode the list is passed through ``make_master_params``.
+        """
+        ordered = []
+        for name, _param in self.model.named_parameters():
+            ordered.append(state_dict[name])
+        return make_master_params(ordered) if self.use_fp16 else ordered
+def parse_resume_step_from_filename(filename):
+    """
+    Extract the step count from a name shaped like path/to/modelNNNNNN.pt.
+
+    The digits are taken between the last occurrence of "model" and the first
+    "." after it. Returns 0 when "model" is absent or that text is not an
+    integer.
+    """
+    _prefix, marker, tail = filename.rpartition("model")
+    if not marker:
+        return 0
+    digits = tail.partition(".")[0]
+    try:
+        step = int(digits)
+    except ValueError:
+        step = 0
+    return step
+def get_blob_logdir():
+    """Return ``DIFFUSION_BLOB_LOGDIR`` from the environment, else the logger's directory."""
+    fallback_dir = logger.get_dir()
+    return os.environ.get("DIFFUSION_BLOB_LOGDIR", fallback_dir)
+def find_resume_checkpoint():
+    """Return the checkpoint to resume from; this implementation always returns ``None``.
+
+    Override it to discover the latest checkpoint automatically, for example
+    on blob storage.
+    """
+    discovered = None
+    return discovered
+def find_ema_checkpoint(main_checkpoint, step, rate):
+    """Return the path of the EMA checkpoint stored next to ``main_checkpoint``, or ``None``.
+
+    Two filename schemes are tried in order: the plain ``ema_<rate>_<step>.pt``
+    form, then the ``PLAIN_ema_<rate>_<step>.pt`` form that ``TrainLoop.save``
+    writes. ``step`` is zero-padded to six digits.
+
+    Args:
+        main_checkpoint: path of the main model checkpoint; ``None`` yields ``None``.
+        step: step number encoded in the filename.
+        rate: EMA rate encoded in the filename.
+    """
+    if main_checkpoint is None:
+        return None
+    directory = bf.dirname(main_checkpoint)
+    # The plain name is tried first so a previously discoverable file still wins.
+    for prefix in ("ema", "PLAIN_ema"):
+        path = bf.join(directory, f"{prefix}_{rate}_{(step):06d}.pt")
+        if bf.exists(path):
+            return path
+    return None
+def log_loss_dict(diffusion, ts, losses):
+    """Accept a loss dictionary and do nothing; loss logging is disabled.
+
+    The signature is kept so existing call sites continue to work. The body
+    that used to follow an unconditional ``return`` was unreachable and has
+    been removed; it logged the mean of every loss plus per-timestep-quartile
+    means through ``logger.logkv_mean``.
+    """
+    return

src/improved_diffusion/transformer_model.py ADDED Viewed

	@@ -0,0 +1,118 @@

+import numpy as np
+import torch
+import torch.nn as nn
+from transformers import AutoConfig, T5EncoderModel
+from .nn import SiLU, linear, timestep_embedding
+class TransformerNetModel(nn.Module):
+    """Transformer that maps embedded sequences, timesteps and caption states to ``in_channels`` features.
+
+    Args:
+        in_channels: width of the token embeddings and of the model output.
+        model_channels: width of the sinusoidal timestep embedding.
+        dropout: dropout probability applied to the summed input embeddings
+            and to the projected caption states.
+        config_name: name passed to ``AutoConfig.from_pretrained``.
+        vocab_size: vocabulary size; required.
+        hidden_size: width used inside the transformer stack.
+        num_attention_heads: written to the config before the stack is built.
+        num_hidden_layers: written to the config before the stack is built.
+
+    Raises:
+        ValueError: if ``vocab_size`` is ``None``.
+    """
+    def __init__(
+        self,
+        in_channels=32,
+        model_channels=128,
+        dropout=0.1,
+        config_name="QizhiPei/biot5-base-text2mol",
+        vocab_size=None,  # 821
+        hidden_size=768,
+        num_attention_heads=12,
+        num_hidden_layers=12,
+    ):
+        super().__init__()
+        if vocab_size is None:
+            # Fail before fetching the config instead of deep inside nn.Embedding.
+            raise ValueError("vocab_size must be provided")
+        config = AutoConfig.from_pretrained(config_name)
+        config.is_decoder = True
+        config.add_cross_attention = True
+        config.hidden_dropout_prob = dropout
+        config.num_attention_heads = num_attention_heads
+        config.num_hidden_layers = num_hidden_layers
+        config.max_position_embeddings = 512
+        config.layer_norm_eps = 1e-12
+        config.vocab_size = vocab_size
+        config.d_model = hidden_size
+        self.hidden_size = hidden_size
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.word_embedding = nn.Embedding(vocab_size, self.in_channels)
+        self.lm_head = nn.Linear(self.in_channels, vocab_size)
+        # Tie the output projection to the input embedding table.
+        self.lm_head.weight = self.word_embedding.weight
+        # NOTE(review): the caption projection hard-codes an input width of
+        # 768 - confirm it matches the caption encoder in use.
+        self.caption_down_proj = nn.Sequential(
+            linear(768, self.hidden_size),
+            SiLU(),
+            linear(self.hidden_size, self.hidden_size),
+        )
+        time_embed_dim = model_channels * 4  # 512
+        self.time_embed = nn.Sequential(
+            linear(self.model_channels, time_embed_dim),
+            SiLU(),
+            linear(time_embed_dim, self.hidden_size),
+        )
+        self.input_up_proj = nn.Sequential(
+            nn.Linear(self.in_channels, self.hidden_size),
+            nn.Tanh(),
+            nn.Linear(self.hidden_size, self.hidden_size),
+        )
+        self.input_transformers = T5EncoderModel(config)
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))
+        )
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, self.hidden_size
+        )
+        self.LayerNorm = nn.LayerNorm(self.hidden_size, eps=config.layer_norm_eps)
+        # Honour the `dropout` argument; it used to be overwritten by a fixed 0.1.
+        self.dropout = nn.Dropout(dropout)
+        self.output_down_proj = nn.Sequential(
+            nn.Linear(self.hidden_size, self.hidden_size),
+            nn.Tanh(),
+            nn.Linear(self.hidden_size, self.in_channels),
+        )
+    def get_embeds(self, input_ids):
+        """Look up the word embeddings for ``input_ids``."""
+        return self.word_embedding(input_ids)
+    def get_embeds_with_deep(self, input_ids):
+        """Concatenate word and "deep" embeddings for an ``(atom, deep)`` id pair.
+
+        Raises:
+            AttributeError: if the model has no ``deep_embedding`` module; the
+                constructor does not create one.
+        """
+        if not hasattr(self, "deep_embedding"):
+            raise AttributeError(
+                "get_embeds_with_deep requires a `deep_embedding` module, "
+                "which this model does not define"
+            )
+        atom, deep = input_ids
+        atom = self.word_embedding(atom)
+        deep = self.deep_embedding(deep)
+        return torch.concat([atom, deep], dim=-1)
+    def get_logits(self, hidden_repr):
+        """Project ``hidden_repr`` to vocabulary logits with the tied ``lm_head``."""
+        return self.lm_head(hidden_repr)
+    def forward(self, x, timesteps, caption_state, caption_mask, y=None):
+        """Run the network and return a tensor with ``x``'s dtype.
+
+        ``x`` is up-projected and summed with position and timestep
+        embeddings. The sum and the projected ``caption_state`` each pass
+        through the shared LayerNorm and dropout. The transformer stack then
+        receives the captions as ``encoder_hidden_states`` with
+        ``caption_mask`` as their attention mask, and its last hidden state is
+        down-projected. ``y`` is accepted but unused.
+        """
+        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
+        emb_x = self.input_up_proj(x)
+        seq_length = x.size(1)
+        position_ids = self.position_ids[:, :seq_length]
+        emb_inputs = (
+            self.position_embeddings(position_ids)
+            + emb_x
+            + emb.unsqueeze(1).expand(-1, seq_length, -1)
+        )
+        emb_inputs = self.dropout(self.LayerNorm(emb_inputs))
+        caption_state = self.dropout(
+            self.LayerNorm(self.caption_down_proj(caption_state))
+        )
+        input_trans_hidden_states = self.input_transformers.encoder(
+            inputs_embeds=emb_inputs,
+            encoder_hidden_states=caption_state,
+            encoder_attention_mask=caption_mask,
+        ).last_hidden_state
+        h = self.output_down_proj(input_trans_hidden_states)
+        h = h.type(x.dtype)
+        return h

src/improved_diffusion/transformer_utils.py ADDED Viewed

	@@ -0,0 +1,450 @@

+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BERT model."""
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Tuple
+import torch
+import torch.utils.checkpoint
+from packaging import version
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.file_utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    replace_return_docstrings,
+)
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    NextSentencePredictorOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from transformers.modeling_utils import (
+    PreTrainedModel,
+    apply_chunking_to_forward,
+    find_pruneable_heads_and_indices,
+    prune_linear_layer,
+)
+from transformers.utils import logging
+from transformers.models.bert.configuration_bert import BertConfig
+# Module-level logger, obtained through the transformers logging helper.
+logger = logging.get_logger(__name__)
+# Documentation identifiers kept from the upstream BERT module.
+# NOTE(review): no use of these three names is visible in this part of the file - confirm before removing.
+_CHECKPOINT_FOR_DOC = "bert-base-uncased"
+_CONFIG_FOR_DOC = "BertConfig"
+_TOKENIZER_FOR_DOC = "BertTokenizer"
+# Names of published BERT checkpoints.
+BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "bert-base-uncased",
+    "bert-large-uncased",
+    "bert-base-cased",
+    "bert-large-cased",
+    "bert-base-multilingual-uncased",
+    "bert-base-multilingual-cased",
+    "bert-base-chinese",
+    "bert-base-german-cased",
+    "bert-large-uncased-whole-word-masking",
+    "bert-large-cased-whole-word-masking",
+    "bert-large-uncased-whole-word-masking-finetuned-squad",
+    "bert-large-cased-whole-word-masking-finetuned-squad",
+    "bert-base-cased-finetuned-mrpc",
+    "bert-base-german-dbmdz-cased",
+    "bert-base-german-dbmdz-uncased",
+    "cl-tohoku/bert-base-japanese",
+    "cl-tohoku/bert-base-japanese-whole-word-masking",
+    "cl-tohoku/bert-base-japanese-char",
+    "cl-tohoku/bert-base-japanese-char-whole-word-masking",
+    "TurkuNLP/bert-base-finnish-cased-v1",
+    "TurkuNLP/bert-base-finnish-uncased-v1",
+    "wietsedv/bert-base-dutch-cased",
+    # See all BERT models at https://huggingface.co/models?filter=bert
+]
+def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
+    """Load tf checkpoints in a pytorch model."""
+    try:
+        import re
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+    for name, array in zip(names, arrays):
+        name = name.split("/")
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(
+            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+            for n in name
+        ):
+            logger.info(f"Skipping {'/'.join(name)}")
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+                scope_names = re.split(r"_(\d+)", m_name)
+            else:
+                scope_names = [m_name]
+            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+                pointer = getattr(pointer, "bias")
+            elif scope_names[0] == "output_weights":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "squad":
+                pointer = getattr(pointer, "classifier")
+            else:
+                try:
+                    pointer = getattr(pointer, scope_names[0])
+                except AttributeError:
+                    logger.info(f"Skipping {'/'.join(name)}")
+                    continue
+            if len(scope_names) >= 2:
+                num = int(scope_names[1])
+                pointer = pointer[num]
+        if m_name[-11:] == "_embeddings":
+            pointer = getattr(pointer, "weight")
+        elif m_name == "kernel":
+            array = np.transpose(array)
+        try:
+            if pointer.shape != array.shape:
+                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info(f"Initialize PyTorch weight {name}")
+        pointer.data = torch.from_numpy(array)
+    return model
+class BertEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        if version.parse(torch.__version__) > version.parse("1.6.0"):
+            self.register_buffer(
+                "token_type_ids",
+                torch.zeros(self.position_ids.size(), dtype=torch.long),
+                persistent=False,
+            )
+    def forward(
+        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
+    ):
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+        seq_length = input_shape[1]
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.position_embedding_type == "absolute":
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+class BertSelfAttention(nn.Module):
+    def __init__(self, config, hidden_size, num_attention_heads, attention_head_size,  position_embedding_type=None):
+        super().__init__()
+        # hidden_size, num_attention_heads, attention_probs_dropout_prob
+        # if hidden_size % num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+        #     raise ValueError(
+        #         f"The hidden size ({hidden_size}) is not a multiple of the number of attention "
+        #         f"heads ({num_attention_heads})"
+        #     )
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_size = attention_head_size
+        # self.attention_head_size = int(hidden_size / num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.query = nn.Linear(hidden_size, self.all_head_size)
+        self.key = nn.Linear(hidden_size, self.all_head_size)
+        self.value = nn.Linear(hidden_size, self.all_head_size)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, "position_embedding_type", "absolute"
+        )
+        # print(self.position_embedding_type,  config.max_position_embeddings)
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+        self.is_decoder = config.is_decoder
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_x_shape)
+        # print(x.shape)
+        return x.permute(0, 2, 1, 3)
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        mixed_query_layer = self.query(hidden_states)
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_value[0]
+            value_layer = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+        else:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_layer, value_layer)
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            seq_length = hidden_states.size()[1]
+            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+            attention_scores = attention_scores + attention_mask
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+        # outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+        #
+        # if self.is_decoder:
+        #     outputs = outputs + (past_key_value,)
+        return context_layer
+class BertOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+class BertSelfOutput(nn.Module):
+    def __init__(self, config, hidden_size, input_hidden_size):
+        super().__init__()
+        self.dense = nn.Linear(hidden_size, hidden_size)
+        if input_hidden_size != hidden_size:
+            self.rescale=True
+            self.dense2 = nn.Linear(input_hidden_size, hidden_size)
+        else:
+            self.rescale = False
+        self.LayerNorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        if self.rescale:
+            input_tensor2 = self.dense2(input_tensor)
+        else:
+            input_tensor2 = input_tensor
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor2)
+        return hidden_states
+def trans_nd(config, hidden_size, num_attention_heads, attention_head_size):
+    return BertSelfAttention(config, hidden_size, num_attention_heads, attention_head_size,
+                             position_embedding_type=None)
+def layer_norm(hidden_size, ):
+    # print(f'layer norm, {hidden_size}')
+    return nn.LayerNorm(hidden_size)
+class BertAttention(nn.Module):
+    def __init__(self, config, hidden_size, num_attention_heads, attention_head_size,
+                 position_embedding_type=None):
+        super().__init__()
+        self.self = BertSelfAttention(config, hidden_size, num_attention_heads, attention_head_size,
+                                      position_embedding_type=position_embedding_type)
+        self.output = BertSelfOutput(config, num_attention_heads * attention_head_size, hidden_size)
+        self.pruned_heads = set()
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+    ):
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            past_key_value,
+            output_attentions,
+        )
+        attention_output = self.output(self_outputs, hidden_states)
+        # print(self_outputs.shape, attention_output.shape, 'output of BertAttention')
+        # outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return attention_output

src/scripts/__init__.py ADDED Viewed

File without changes

src/scripts/batch_decode.py ADDED Viewed

	@@ -0,0 +1,149 @@

+import os, sys, glob
+# full_lst = glob.glob('diff_models_synth128*')
+# full_lst = glob.glob('diff_models_synth32*')
+# full_lst = glob.glob('diff_models_synth32_3_rand16*')
+# full_lst = glob.glob('diff_models_synth_rand_16_trans_lr_1e-5_long_Lsimple')
+full_lst = glob.glob(sys.argv[1])
+top_p = -1.0 if len(sys.argv) < 2 else sys.argv[2]
+print(f'top_p = {top_p}')
+pattern_ = 'model' if len(sys.argv) < 3 else sys.argv[3]
+print(f'pattern_ = {pattern_}', sys.argv[3])
+# print(full_lst)
+output_lst = []
+for lst in full_lst:
+    print(lst)
+    try:
+        tgt = sorted(glob.glob(f"{lst}/{pattern_}*pt"))[-1]
+        lst = os.path.split(lst)[1]
+        print(lst)
+        num = 1
+    except:
+        continue
+    model_arch_ = lst.split('_')[5-num]
+    model_arch = 'conv-unet' if 'conv-unet' in lst else 'transformer'
+    mode =  'image' if ('conv' in model_arch ) else 'text' #or '1d-unet' in model_arch_
+    print(mode, model_arch_)
+    dim_ =lst.split('_')[4-num]
+    # diffusion_steps= 4000
+    # noise_schedule = 'cosine'
+    # dim = dim_.split('rand')[1]
+    if 'synth' in lst:
+        modality = 'synth'
+    elif 'pos' in lst:
+        modality = 'pos'
+    elif 'image' in lst:
+        modality = 'image'
+    elif 'roc' in lst:
+        modality = 'roc'
+    elif 'e2e-tgt' in lst:
+        modality = 'e2e-tgt'
+    elif 'simple-wiki' in lst:
+        modality = 'simple-wiki'
+    elif 'book' in lst:
+        modality = 'book'
+    elif 'yelp' in lst:
+        modality = 'yelp'
+    elif 'commonGen' in lst:
+        modality = 'commonGen'
+    elif 'e2e' in lst:
+        modality = 'e2e'
+    if 'synth32' in lst:
+        kk = 32
+    elif 'synth128' in lst:
+        kk = 128
+    try:
+        diffusion_steps = int(lst.split('_')[7-num])
+        print(diffusion_steps)
+    except:
+        diffusion_steps = 4000
+    try:
+        noise_schedule = lst.split('_')[8-num]
+        assert  noise_schedule in ['cosine', 'linear']
+        print(noise_schedule)
+    except:
+        noise_schedule = 'cosine'
+    try:
+        dim = int(dim_.split('rand')[1])
+    except:
+        dim =lst.split('_')[4-num]
+    try:
+        print(len(lst.split('_')))
+        num_channels =  int(lst.split('_')[-1].split('h')[1])
+    except:
+        num_channels = 128
+    print(tgt, model_arch, dim, num_channels)
+    # out_dir = 'diffusion_lm/improved_diffusion/out_gen_large_nucleus'
+    # num_samples = 512
+    # out_dir = 'diffusion_lm/improved_diffusion/out_gen_v2_nucleus'
+    out_dir = 'generation_outputs'
+    num_samples = 50
+    if modality == 'e2e':
+        num_samples = 547
+    COMMAND = f'python scripts/{mode}_sample.py ' \
+    f'--model_path {tgt} --batch_size 50 --num_samples {num_samples} --top_p {top_p} ' \
+    f'--out_dir {out_dir} '
+    print(COMMAND)
+    # os.system(COMMAND)
+    # shape_str = "x".join([str(x) for x in arr.shape])
+    model_base_name = os.path.basename(os.path.split(tgt)[0]) + f'.{os.path.split(tgt)[1]}'
+    if modality == 'e2e-tgt' or modality == 'e2e':
+        out_path2 = os.path.join(out_dir, f"{model_base_name}.samples_{top_p}.json")
+    else:
+        out_path2 =  os.path.join(out_dir, f"{model_base_name}.samples_{top_p}.txt")
+    output_cands = glob.glob(out_path2)
+    print(out_path2, output_cands)
+    if len(output_cands) > 0:
+        out_path2 = glob.glob(out_path2)[0]
+    else:
+        os.system(COMMAND)
+        out_path2 = glob.glob(out_path2)[0]
+    output_lst.append(out_path2)
+    if modality == 'pos':
+        model_name_path = 'predictability/diff_models/pos_e=15_b=20_m=gpt2_wikitext-103-raw-v1_s=102'
+    elif modality == 'synth':
+        if kk == 128:
+            model_name_path = 'predictability/diff_models/synth_e=15_b=10_m=gpt2_wikitext-103-raw-v1_None'
+        else:
+            model_name_path = 'predictability/diff_models/synth_e=15_b=20_m=gpt2_wikitext-103-raw-v1_None'
+    elif modality == 'e2e-tgt':
+        model_name_path = "predictability/diff_models/e2e-tgt_e=15_b=20_m=gpt2_wikitext-103-raw-v1_101_None"
+    elif modality == 'roc':
+        model_name_path = "predictability/diff_models/roc_e=6_b=10_m=gpt2_wikitext-103-raw-v1_101_wp_pad_v1"
+    elif modality == 'e2e':
+        COMMAND1 = f"python diffusion_lm/e2e_data/mbr.py {out_path2}"
+        os.system(COMMAND1)
+        COMMAND2 = f"python e2e-metrics/measure_scores.py " \
+                   f"diffusion_lm/improved_diffusion/out_gen_v2_dropout2/1_valid_gold  " \
+                   f"{out_path2}.clean -p  -t -H > {os.path.join(os.path.split(tgt)[0], 'e2e_valid_eval.txt')}"
+        print(COMMAND2)
+        os.system(COMMAND2)
+        continue
+    else:
+        print('not trained a AR model yet... only look at the output plz.')
+        continue
+    COMMAND = f"python scripts/ppl_under_ar.py " \
+              f"--model_path {tgt} " \
+              f"--modality {modality}  --experiment random " \
+              f"--model_name_or_path {model_name_path} " \
+              f"--input_text {out_path2}  --mode eval"
+    print(COMMAND)
+    print()
+    os.system(COMMAND)
+print('output lists:')
+print("\n".join(output_lst))

src/scripts/batch_nll.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import os, sys, glob
+full_lst = glob.glob(sys.argv[1])
+pattern_ = 'model' if len(sys.argv) < 2 else sys.argv[2]
+clamp = 'clamp' if len(sys.argv) <= 3 else sys.argv[3]
+print(f'pattern_ = {pattern_}', sys.argv[2])
+for lst in full_lst:
+    print(lst)
+    try:
+        tgt = sorted(glob.glob(f"{lst}/{pattern_}*pt"))[-1]
+        lst = os.path.split(lst)[1]
+        print(lst)
+        num = 1
+    except:
+        continue
+    COMMAND = f'python scripts/nll.py --clip_denoised False ' \
+        f'--model_path {tgt} ' \
+        f'--out_dir diffusion_lm/improved_diffusion/scores_eval2_valid_None ' \
+              f'--num_samples 64 --split valid --clamp {clamp}'
+    print(COMMAND)
+    os.system(COMMAND)
+    COMMAND = f'python scripts/nll.py --clip_denoised False ' \
+              f'--model_path {tgt} ' \
+              f'--out_dir diffusion_lm/improved_diffusion/scores_eval2_valid_None ' \
+              f'--num_samples 64 --split train --clamp {clamp}'
+    print(COMMAND)
+    os.system(COMMAND)

src/scripts/infill_util.py ADDED Viewed

	@@ -0,0 +1,355 @@

+import torch as th
+def get_score(input_embs, label_ids, model_control, t=None):
+    label_ids2 = label_ids.clone()
+    label_ids2[:, :65] = -100
+    # print(label_ids2[:, 65:])
+    # print(final.shape, tgt_embs.shape)
+    # input_embs = th.cat([final, tgt_embs], dim=1)
+    model_out = model_control(input_embs=input_embs,
+                              labels=label_ids2, t=t)
+    print(model_out.loss, 'final end')
+    loss_fn = th.nn.CrossEntropyLoss(reduction='none')
+    shifted_logits = model_out.logits[:, :-1].contiguous()
+    shifted_labels = label_ids2[:, 1:].contiguous()
+    loss = loss_fn(shifted_logits.view(-1, shifted_logits.size(-1)), shifted_labels.view(-1)).reshape(
+        shifted_labels.shape)
+    return loss.sum(dim=-1).tolist()
+def langevin_fn3(debug_lst, model_control, model3, label_ids, step_size, sample, mean, sigma,
+                 alpha, t, prev_sample):  # current best.
+    if t[0].item() < 10:
+        K = 0
+    else:
+        K = 3
+    # K = 3
+    if t[0].item() > 0:
+        tt = t[0].item() - 1
+    else:
+        tt = 200
+    label_ids = label_ids.cuda()
+    tgt_embs = model3(label_ids[:, sample.size(1):])
+    label_ids2 = label_ids.clone()
+    label_ids2[:, :65] = -100
+    input_embs_param = th.nn.Parameter(sample)
+    if False:
+        input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
+        debug_lst.append(get_score(input_embs, label_ids2, model_control, t=tt))
+    with th.enable_grad():
+        for i in range(K):
+            optimizer = th.optim.Adagrad([input_embs_param], lr=step_size)
+            optimizer.zero_grad()
+            input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
+            model_out = model_control(input_embs=input_embs,
+                                      labels=label_ids2, t=tt)
+            coef = 0.01
+            # coef=1.
+            if sigma.mean() == 0:
+                logp_term = coef * ((mean - input_embs_param) ** 2 / 1.).mean(dim=0).sum()
+            else:
+                logp_term = coef * ((mean - input_embs_param) ** 2 / sigma).mean(dim=0).sum()
+            # print(model_out.loss, f'start_{i}', logp_term.item(), t[0].item(), sigma.mean().item())
+            loss = model_out.loss + logp_term
+            loss.backward()
+            optimizer.step()
+            epsilon = th.randn_like(input_embs_param.data)
+            input_embs_param = th.nn.Parameter((input_embs_param.data + 0.0 * sigma.mean().item() * epsilon).detach())
+            # input_embs_param = th.nn.Parameter((input_embs_param.data +
+            #                                    np.sqrt(2*sigma.mean().item()) * epsilon).detach())
+    # input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
+    # model_out = model_control(input_embs=input_embs,
+    #                           labels=label_ids2,
+    #                           t=tt)
+    # print(model_out.loss, 'end')
+    return input_embs_param.data
+def langevin_fn4(debug_lst, model_control, model3, label_ids, step_size, sample, mean, sigma,
+                 alpha, t, prev_sample): # current best.
+    if t[0].item() < 10:
+        K = 0
+    else:
+        K = 3
+    if t[0].item() >0:
+        tt =t[0].item() - 1
+    else:
+        tt = 200
+    label_ids = label_ids.cuda()
+    input_embs_param = th.nn.Parameter(sample)
+    if False:
+        input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
+        debug_lst.append(get_score(input_embs, label_ids2, model_control, t=tt))
+    with th.enable_grad():
+        for i in range(K):
+            optimizer = th.optim.Adagrad([input_embs_param], lr=step_size)
+            optimizer.zero_grad()
+            # print(input_embs_param.shape, label_ids.shape)
+            model_out = model_control(input_embs=input_embs_param, pos_ids=label_ids, t=tt)
+            coef = 0.0001 # prev default.
+            # coef = 0.001
+            # coef = 0.0005
+            # coef=1.
+            if sigma.mean() == 0:
+                logp_term = coef * ((mean - input_embs_param) ** 2 / 1.).mean(dim=0).sum()
+            else:
+                logp_term = coef * ((mean - input_embs_param)**2 / sigma).mean(dim=0).sum()
+            print(model_out.loss, f'start_{i}', logp_term.item(),
+                  t[0].item(), sigma.mean().item())
+            loss = model_out.loss + logp_term
+            loss.backward()
+            optimizer.step()
+            epsilon = th.randn_like(input_embs_param.data)
+            input_embs_param = th.nn.Parameter((input_embs_param.data + 0.0*sigma.mean().item() * epsilon).detach())
+            # input_embs_param = th.nn.Parameter((input_embs_param.data +
+            #                                    np.sqrt(2*sigma.mean().item()) * epsilon).detach())
+    model_out = model_control(input_embs=input_embs_param, pos_ids=label_ids, t=tt)
+    print(model_out.loss, 'end')
+    return input_embs_param.data
+def langevin_fn_length(coeff, diffusion, partial_mask, diff_model, tgt_embs, step_size, sample, mean, sigma,
+                 alpha, t, prev_sample): # current best.
+    """Refine ``sample`` by gradient steps that pull the predicted x_start toward ``tgt_embs``.
+    ``sample`` is wrapped in a ``Parameter`` and updated with K Adagrad steps (K is 0 when
+    ``t[0] < 10``, otherwise 3). Each step minimises ``coeff`` times the squared distance to
+    ``mean`` (divided by ``sigma`` unless ``sigma.mean()`` is 0) plus the squared error between
+    ``diffusion.p_mean_variance(...)['pred_xstart']`` and ``tgt_embs`` at the positions where
+    ``partial_mask`` is False. ``alpha`` and ``prev_sample`` are accepted but never read here.
+    Returns ``input_embs_param.data``.
+    """
+    # Number of refinement steps: none for the last few timesteps.
+    if t[0].item() < 10:
+        K = 0
+    else:
+        K = 3
+    # NOTE(review): ``tt`` is only referenced by the dead ``if False`` block and by commented-out code.
+    if t[0].item() >0:
+        tt =t[0].item() - 1
+    else:
+        tt = 200
+    input_embs_param = th.nn.Parameter(sample)
+    # Dead debug code: never executes, and the names debug_lst / label_ids2 / model_control
+    # are not parameters or locals of this function.
+    if False:
+        input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
+        debug_lst.append(get_score(input_embs, label_ids2, model_control, t=tt))
+    with th.enable_grad():
+        for i in range(K):
+            # A fresh optimizer per step because the Parameter is re-created at the end of each iteration.
+            optimizer = th.optim.Adagrad([input_embs_param], lr=step_size)
+            optimizer.zero_grad()
+            print(t.shape)
+            # print(input_embs_param.shape, label_ids.shape)
+            out = diffusion.p_mean_variance(
+                diff_model,
+                input_embs_param,
+                t,
+                clip_denoised=False,
+                denoised_fn=None,
+                model_kwargs={},
+            )
+            # model_out = model_control(input_embs=input_embs_param, pos_ids=label_ids, t=tt)
+            coef = coeff
+            # coef = 0.0001 # prev default.
+            # coef = 0.001
+            # coef = 0.0005
+            # coef=1.
+            if sigma.mean() == 0:
+                # Zero variance: use a unit denominator instead of dividing by sigma.
+                logp_term = coef * ((mean - input_embs_param) ** 2 / 1.).mean(dim=0).sum()
+                infill_loss = (out['pred_xstart'][~partial_mask] - tgt_embs[~partial_mask]) ** 2
+                infill_loss = infill_loss.mean(dim=0).sum()
+            else:
+                logp_term = coef * ((mean - input_embs_param)**2 / sigma).mean(dim=0).sum()
+                # print(out['pred_xstart'].shape, tgt_embs.shape)
+                # print(partial_mask[0])
+                # Boolean indexing flattens the selection; the view regroups it per batch element
+                # (assumes every row of partial_mask selects the same number of positions - TODO confirm).
+                infill_loss = ((out['pred_xstart'][~partial_mask] - tgt_embs[~partial_mask]) ** 2).view(tgt_embs.size(0), -1, tgt_embs.size(-1) )
+                # print(infill_loss.shape, ((mean - input_embs_param)**2).shape )
+                infill_loss = (infill_loss/sigma.mean()).mean(dim=0).sum()
+            print(infill_loss, f'start_{i}', logp_term.item(),
+                  t[0].item(), sigma.mean().item())
+            loss = logp_term + infill_loss
+            loss.backward()
+            optimizer.step()
+            epsilon = th.randn_like(input_embs_param.data)
+            # The noise term is multiplied by 0.0, so this only re-wraps the updated values in a new Parameter.
+            input_embs_param = th.nn.Parameter((input_embs_param.data + 0.0*sigma.mean().item() * epsilon).detach())
+            # input_embs_param = th.nn.Parameter((input_embs_param.data +
+            #                                    np.sqrt(2*sigma.mean().item()) * epsilon).detach())
+    # model_out = model_control(input_embs=input_embs_param, pos_ids=label_ids, t=tt)
+    # print(model_out.loss, 'end')
+    return input_embs_param.data
+def langevin_fn_tree(coeff, model_control, model3, label_ids, step_size, sample, mean, sigma,
+                 alpha, t, prev_sample): # current best.
+    """Refine ``sample`` with gradient steps guided by ``model_control`` on a parse-chart target.
+    ``sample`` is wrapped in a ``Parameter`` and updated with K Adagrad steps (K is 0 when
+    ``t[0] < 10``, otherwise 3). Each step minimises ``model_control(...).loss`` plus ``coeff``
+    times the squared distance to ``mean`` (divided by ``sigma`` unless ``sigma.mean()`` is 0).
+    ``model3``, ``alpha`` and ``prev_sample`` are accepted but never read here.
+    Returns ``input_embs_param.data``.
+    """
+    # Number of refinement steps: none for the last few timesteps.
+    if t[0].item() < 10:
+        K = 0
+    else:
+        K = 3
+    # Timestep handed to the control model: t - 1, or 200 when t is 0.
+    if t[0].item() >0:
+        tt =t[0].item() - 1
+    else:
+        tt = 200
+    label_ids = label_ids.cuda()
+    input_embs_param = th.nn.Parameter(sample)
+    with th.enable_grad():
+        for i in range(K):
+            # A fresh optimizer per step because the Parameter is re-created at the end of each iteration.
+            optimizer = th.optim.Adagrad([input_embs_param], lr=step_size)
+            optimizer.zero_grad()
+            # print(input_embs_param.shape, label_ids.shape)
+            model_out = model_control(input_embs=input_embs_param, parse_chart=label_ids, t=tt)
+            # coef = 0.0001
+            # coef = 0.001
+            # coef = 0.01
+            # coef = 0.1 # good for partial.
+            # coef=0.001 # also good for full (more fluent).
+            # coef=0.0001
+            # coef=0.0005 # good for full.
+            coef = coeff
+            # coef = 0.5
+            # coef=1.
+            if sigma.mean() == 0:
+                # Zero variance: use a unit denominator instead of dividing by sigma.
+                logp_term = coef * ((mean - input_embs_param) ** 2 / 1.).mean(dim=0).sum()
+            else:
+                logp_term = coef * ((mean - input_embs_param)**2 / sigma).mean(dim=0).sum()
+            # print(model_out.loss, f'start_{i}', logp_term.item(),
+            #       t[0].item(), sigma.mean().item())
+            loss = model_out.loss + logp_term
+            loss.backward()
+            optimizer.step()
+            epsilon = th.randn_like(input_embs_param.data)
+            # The noise term is multiplied by 0.0, so this only re-wraps the updated values in a new Parameter.
+            input_embs_param = th.nn.Parameter((input_embs_param.data + 0.0*sigma.mean().item() * epsilon).detach())
+            # input_embs_param = th.nn.Parameter((input_embs_param.data +
+            #                                    np.sqrt(2*sigma.mean().item()) * epsilon).detach())
+    # COMMENT OUT
+    # model_out = model_control(input_embs=input_embs_param, parse_chart=label_ids, t=tt)
+    # print(model_out.loss, 'end')
+    return input_embs_param.data
+def langevin_fn1(debug_lst, model_control, model3, label_ids, step_size, sample, mean, sigma,
+                 alpha, t, prev_sample):  # current best.
+    """Refine ``sample`` with a manual gradient step on the control model's loss.
+    The target embeddings ``model3(label_ids[:, sample.size(1):])`` are concatenated after the
+    sample along dim 1 and scored by ``model_control``. K steps are taken (K is 0 when
+    ``t[0] < 10``, otherwise 1); each subtracts ``3 * sigma.mean() * grad`` from the sample.
+    A score from ``get_score`` is appended to ``debug_lst`` before the update.
+    ``mean``, ``alpha`` and ``prev_sample`` are accepted but never read here.
+    Returns ``input_embs_param.data``.
+    """
+    if t[0].item() < 10:
+        K = 0
+    else:
+        K = 1
+    # K = 3
+    # Timestep handed to the control model: t - 1, or 200 when t is 0.
+    if t[0].item() > 0:
+        tt = t[0].item() - 1
+    else:
+        tt = 200
+    label_ids = label_ids.cuda()
+    tgt_embs = model3(label_ids[:, sample.size(1):])
+    label_ids2 = label_ids.clone()
+    # NOTE(review): 65 is hard-coded; presumably the length of the sample prefix, and -100
+    # presumably marks positions the loss should ignore - confirm against model_control.
+    label_ids2[:, :65] = -100
+    input_embs_param = th.nn.Parameter(sample)
+    if True:
+        # Record the score of the unrefined sample for debugging.
+        input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
+        debug_lst.append(get_score(input_embs, label_ids2, model_control, t=tt))
+    with th.enable_grad():
+        for i in range(K):
+            # The optimizer is only used for zero_grad(); optimizer.step() is commented out below,
+            # so step_size has no effect on the update in this function.
+            optimizer = th.optim.Adagrad([input_embs_param], lr=step_size)
+            optimizer.zero_grad()
+            input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
+            model_out = model_control(input_embs=input_embs,
+                                      labels=label_ids2, t=tt)
+            # coef = 0.0
+            # if sigma.mean() == 0:
+            #     logp_term = coef * ((mean - input_embs_param) ** 2 / 1.).mean(dim=0).sum()
+            # else:
+            #     logp_term = coef * ((mean - input_embs_param) ** 2 / sigma).mean(dim=0).sum()
+            print(model_out.loss, f'start_{i}', t[0].item(), sigma.mean().item())
+            coef = 3.
+            loss = model_out.loss # + logp_term
+            loss.backward()
+            # print(input_embs_param.grad.shape, )
+            # Manual gradient step scaled by the mean variance.
+            input_embs_param.data = input_embs_param.data - coef * sigma.mean().item() * input_embs_param.grad
+            # optimizer.step()
+            # epsilon = th.randn_like(input_embs_param.data)
+            # input_embs_param = th.nn.Parameter((input_embs_param.data + 0.0 * sigma.mean().item() * epsilon).detach())
+            # input_embs_param = th.nn.Parameter((input_embs_param.data +
+            #                                    np.sqrt(2*sigma.mean().item()) * epsilon).detach())
+    # Re-evaluate the control loss after refinement; it is only printed.
+    input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
+    model_out = model_control(input_embs=input_embs,
+                              labels=label_ids2,
+                              t=tt)
+    print(model_out.loss, 'end')
+    # if True:
+    #     debug_lst.append(get_score(input_embs, label_ids2, model_control, t=tt))
+    return input_embs_param.data
+def langevin_fn3_compose(debug_lst, model_control, model3, label_ids_lst, step_size, sample, mean, sigma,
+                 alpha, t, prev_sample):  # current best.
+    """Refine ``sample`` against several control targets at once.
+    For every entry of ``label_ids_lst`` the target embeddings
+    ``model3(label_ids[:, sample.size(1):])`` are concatenated after the sample and scored by
+    ``model_control``; the per-target losses are summed. K Adagrad steps are taken (K is 0 when
+    ``t[0] < 10``, otherwise 3) on that sum plus 0.01 times the squared distance to ``mean``
+    (divided by ``sigma`` unless ``sigma.mean()`` is 0). The list of per-target scores of the
+    unrefined sample is appended to ``debug_lst``.
+    ``alpha`` and ``prev_sample`` are accepted but never read here.
+    Returns ``input_embs_param.data``.
+    """
+    if t[0].item() < 10:
+        K = 0
+    else:
+        K = 3
+    # K = 3
+    # Timestep handed to the control model: t - 1, or 200 when t is 0.
+    if t[0].item() > 0:
+        tt = t[0].item() - 1
+    else:
+        tt = 200
+    tgt_embs_lst = [model3(label_ids[:, sample.size(1):]) for label_ids in label_ids_lst]
+    label_ids2_lst = []
+    for label_ids in label_ids_lst:
+        label_ids2 = label_ids.clone()
+        # NOTE(review): 65 is hard-coded; presumably the length of the sample prefix, and -100
+        # presumably marks positions the loss should ignore - confirm against model_control.
+        label_ids2[:, :65] = -100
+        label_ids2_lst.append(label_ids2)
+    input_embs_param = th.nn.Parameter(sample)
+    if True:
+        # Record one score per target for the unrefined sample.
+        part_score = []
+        for (tgt_embs,label_ids2) in zip(tgt_embs_lst, label_ids2_lst):
+            input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
+            score_ = get_score(input_embs, label_ids2, model_control, t=tt)
+            part_score.append(score_)
+        debug_lst.append(part_score)
+    with th.enable_grad():
+        for i in range(K):
+            # A fresh optimizer per step because the Parameter is re-created at the end of each iteration.
+            optimizer = th.optim.Adagrad([input_embs_param], lr=step_size)
+            optimizer.zero_grad()
+            cum_loss = 0
+            for (tgt_embs, label_ids2) in zip(tgt_embs_lst, label_ids2_lst):
+                input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
+                model_out = model_control(input_embs=input_embs,
+                                          labels=label_ids2, t=tt)
+                cum_loss += model_out.loss
+            coef = 0.01
+            if sigma.mean() == 0:
+                # Zero variance: use a unit denominator instead of dividing by sigma.
+                logp_term = coef * ((mean - input_embs_param) ** 2 / 1.).mean(dim=0).sum()
+            else:
+                logp_term = coef * ((mean - input_embs_param) ** 2 / sigma).mean(dim=0).sum()
+            print(cum_loss, f'start_{i}', logp_term.item(), t[0].item(), sigma.mean().item())
+            loss = cum_loss + logp_term
+            loss.backward()
+            optimizer.step()
+            epsilon = th.randn_like(input_embs_param.data)
+            # The noise term is multiplied by 0.0, so this only re-wraps the updated values in a new Parameter.
+            input_embs_param = th.nn.Parameter((input_embs_param.data + 0.0 * sigma.mean().item() * epsilon).detach())
+    # NOTE(review): the scores computed below are neither returned nor stored; this loop only
+    # costs extra get_score calls unless get_score has side effects not visible here.
+    part_score = []
+    for (tgt_embs, label_ids2) in zip(tgt_embs_lst, label_ids2_lst):
+        input_embs = th.cat([input_embs_param, tgt_embs], dim=1)
+        score_ = get_score(input_embs, label_ids2, model_control, t=tt)
+        part_score.append(score_)
+    return input_embs_param.data

src/scripts/mydatasets.py ADDED Viewed

	@@ -0,0 +1,326 @@

+import os
+import glob
+import torch
+import random
+import selfies as sf
+from rdkit import Chem
+from datasets import load_dataset
+from transformers import T5EncoderModel
+from torch.utils.data import DistributedSampler, DataLoader, Dataset
+def get_dataloader(dataset, batchsize, rank, world_size):
+    sampler = DistributedSampler(
+        dataset, num_replicas=world_size, rank=rank, shuffle=True
+    )
+    def collate(batch):
+        selfies_ids = [i["selfies_ids"] for i in batch]
+        caption_state = [i["caption_state"] for i in batch]
+        caption_mask = [i["caption_mask"] for i in batch]
+        corrupted_selfies_ids = [i["corrupted_selfies_ids"] for i in batch]
+        return (
+            torch.concat(selfies_ids, dim=0),
+            torch.concat(caption_state, dim=0),
+            torch.concat(caption_mask, dim=0),
+            torch.concat(corrupted_selfies_ids, dim=0),
+        )
+    dataloader = DataLoader(
+        dataset,
+        batch_size=batchsize,
+        shuffle=False,
+        collate_fn=collate,
+        sampler=sampler,
+    )
+    def cycle():
+        ec = 0
+        while True:
+            dataloader.sampler.set_epoch(ec)
+            for i in dataloader:
+                yield i
+            ec += 1
+    return iter(cycle())
+class Lang2molDataset_train(Dataset):
+    def __init__(
+        self,
+        dir,
+        tokenizer,
+        split,
+        dataset_name,
+        pre=None,
+        prob=0,
+        load_state=True,
+        corrupt_prob=0.4,
+        token_max_length=256,
+    ):
+        super().__init__()
+        self.dir = dir
+        self.tokenizer = tokenizer
+        self.split = split
+        self.pre = pre
+        self.prob = prob
+        self.corrupt_prob = corrupt_prob
+        self.token_max_length = token_max_length
+        self.dataset_name = dataset_name
+        self.ori_data = self.create_data()
+        self.load_state = load_state
+        self.model = T5EncoderModel.from_pretrained("QizhiPei/biot5-base-text2mol")
+        self.model.to("cuda")
+        self.model.eval()
+    def create_data(self):
+        try:
+            dataset = load_dataset(
+                self.dataset_name,
+                token=True,
+                split=self.split,
+            ).sort("id")
+        except:
+            dataset = load_dataset(
+                self.dataset_name,
+                use_auth_token=True,
+                split=self.split,
+            ).sort("id")
+        return [
+            (int(sample_id), sample_selfies, sample_caption, sample_smiles)
+            for (sample_id, sample_selfies, sample_caption, sample_smiles) in zip(
+                dataset["id"],
+                dataset["selfies"],
+                dataset["caption"],
+                dataset["smiles"],
+            )
+        ]
+    def __len__(self):
+        return len(self.ori_data)
+    def permute(self, selfies):
+        if random.random() < self.prob:
+            return changeorder(selfies, shuffle=True)
+        else:
+            return selfies
+    def __getitem__(self, idx):
+        data = self.ori_data[idx]
+        sample = {
+            "id": data[0],
+            "selfies": self.permute(data[1]),
+            "caption": data[2],
+            "smiles": data[3],
+        }
+        # Molecules
+        output_molecule = self.tokenizer(
+            sample["selfies"],
+            max_length=self.token_max_length,
+            truncation=True,
+            padding="max_length",
+            add_special_tokens=True,
+            return_tensors="pt",
+            return_attention_mask=True,
+        )
+        sample["selfies_ids"] = output_molecule["input_ids"]
+        sample["corrupted_selfies_ids"] = sample["selfies_ids"]
+        # Captions
+        output_caption = self.tokenizer(
+            sample["caption"],
+            max_length=self.token_max_length,
+            truncation=True,
+            padding="max_length",
+            add_special_tokens=True,
+            return_tensors="pt",
+            return_attention_mask=True,
+        )
+        sample["caption_state"] = self.model(
+            input_ids=output_caption["input_ids"].to("cuda"),
+            attention_mask=output_caption["attention_mask"].to("cuda"),
+        ).last_hidden_state
+        sample["caption_mask"] = output_caption["attention_mask"]
+        return sample
+class Lang2molDataset_eval(Dataset):
+    def __init__(
+        self,
+        dir,
+        tokenizer,
+        split,
+        dataset_name,
+        pre=None,
+        prob=0,
+        load_state=True,
+        corrupt_prob=0.4,
+        token_max_length=512,
+    ):
+        super().__init__()
+        self.dir = dir
+        self.tokenizer = tokenizer
+        self.split = split
+        self.pre = pre
+        self.prob = prob
+        self.corrupt_prob = corrupt_prob
+        self.token_max_length = token_max_length
+        self.dataset_name = dataset_name
+        self.ori_data = self.create_data()
+        self.load_state = load_state
+        self.model = T5EncoderModel.from_pretrained("QizhiPei/biot5-base-text2mol")
+        self.model.to("cuda")
+        self.model.eval()
+    def create_data(self):
+        try:
+            dataset = load_dataset(
+                self.dataset_name,
+                token=True,
+                split=self.split,
+            ).sort("id")
+        except:
+            dataset = load_dataset(
+                self.dataset_name,
+                use_auth_token=True,
+                split=self.split,
+            ).sort("id")
+        return [
+            (int(sample_id), sample_selfies, sample_caption, sample_smiles)
+            for (sample_id, sample_selfies, sample_caption, sample_smiles) in zip(
+                dataset["id"],
+                dataset["selfies"],
+                dataset["caption"],
+                dataset["smiles"],
+            )
+        ]
+    def __len__(self):
+        return len(self.ori_data)
+    def permute(self, selfies):
+        if random.random() < self.prob:
+            return changeorder(selfies, shuffle=True)
+        else:
+            return selfies
+    def __getitem__(self, idx):
+        data = self.ori_data[idx]
+        sample = {
+            "id": data[0],
+            "selfies": self.permute(data[1]),
+            "caption": data[2],
+            "smiles": data[3],
+        }
+        output_caption = self.tokenizer(
+            sample["caption"],
+            max_length=self.token_max_length,
+            truncation=True,
+            padding="max_length",
+            add_special_tokens=True,
+            return_tensors="pt",
+            return_attention_mask=True,
+        )
+        sample["caption_state"] = self.model(
+            input_ids=output_caption["input_ids"].to("cuda"),
+            attention_mask=output_caption["attention_mask"].to("cuda"),
+        ).last_hidden_state
+        sample["caption_mask"] = output_caption["attention_mask"]
+        return sample
+class Lang2molDataset_submission(Dataset):
+    def __init__(
+        self,
+        dir,
+        tokenizer,
+        split,
+        dataset_name,
+        pre=None,
+        prob=0,
+        load_state=True,
+        corrupt_prob=0.4,
+        token_max_length=256,
+    ):
+        super().__init__()
+        self.dir = dir
+        self.tokenizer = tokenizer
+        self.split = split
+        self.pre = pre
+        self.prob = prob
+        self.corrupt_prob = corrupt_prob
+        self.token_max_length = token_max_length
+        self.dataset_name = dataset_name
+        self.ori_data = self.create_data()
+        self.load_state = load_state
+        self.model = T5EncoderModel.from_pretrained("QizhiPei/biot5-base-text2mol")
+        self.model.to("cuda")
+        self.model.eval()
+    def create_data(self):
+        try:
+            dataset = load_dataset(
+                self.dataset_name,
+                token=True,
+                split=self.split,
+            )
+        except:
+            dataset = load_dataset(
+                self.dataset_name,
+                use_auth_token=True,
+                split=self.split,
+            )
+        return [sample_caption for sample_caption in dataset["caption"]]
+    def __len__(self):
+        return len(self.ori_data)
+    def permute(self, selfies):
+        if random.random() < self.prob:
+            return changeorder(selfies, shuffle=True)
+        else:
+            return selfies
+    def __getitem__(self, idx):
+        sample = {"caption": self.ori_data[idx]}
+        # Captions
+        output_caption = self.tokenizer(
+            sample["caption"],
+            max_length=self.token_max_length,
+            truncation=True,
+            padding="max_length",
+            add_special_tokens=True,
+            return_tensors="pt",
+            return_attention_mask=True,
+        )
+        sample["caption_state"] = self.model(
+            input_ids=output_caption["input_ids"].to("cuda"),
+            attention_mask=output_caption["attention_mask"].to("cuda"),
+        ).last_hidden_state
+        sample["caption_mask"] = output_caption["attention_mask"]
+        return sample
+def changeorder(selfies, shuffle):
+    smiles = sf.encoder(selfies)
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        return selfies
+    Chem.Kekulize(mol)
+    atom_indices = [atom.GetIdx() for atom in mol.GetAtoms()]
+    if shuffle:
+        random.shuffle(atom_indices)
+    reordered_mol = Chem.RenumberAtoms(mol, atom_indices)
+    new_smiles = Chem.MolToSmiles(reordered_mol, kekuleSmiles=True)
+    new_selfies = sf.decoder(new_smiles)
+    return new_selfies

src/scripts/mytokenizers.py ADDED Viewed

	@@ -0,0 +1,249 @@

+import os
+import torch
+import random
+import selfies as sf
+from transformers import AutoTokenizer
+################################
+def getrandomnumber(numbers, k, weights=None):
+    if k == 1:
+        return random.choices(numbers, weights=weights, k=k)[0]
+    else:
+        return random.choices(numbers, weights=weights, k=k)
+# simple smiles tokenizer
+# treat every character as a token
+def build_simple_smiles_vocab(dir):
+    assert dir is not None, "dir and smiles_vocab can not be None at the same time."
+    if not os.path.exists(os.path.join(dir, "simple_smiles_tokenizer_vocab.txt")):
+        # print('Generating Vocabulary for {} ...'.format(dir))
+        dirs = list(
+            os.path.join(dir, i) for i in ["train.txt", "validation.txt", "test.txt"]
+        )
+        smiles = []
+        for idir in dirs:
+            with open(idir, "r") as f:
+                for i, line in enumerate(f):
+                    if i == 0:
+                        continue
+                    line = line.split("\t")
+                    assert len(line) == 3, "Dataset format error."
+                    if line[1] != "*":
+                        smiles.append(line[1].strip())
+        char_set = set()
+        for smi in smiles:
+            for c in smi:
+                char_set.add(c)
+        vocabstring = "".join(char_set)
+        with open(os.path.join(dir, "simple_smiles_tokenizer_vocab.txt"), "w") as f:
+            f.write(os.path.join(vocabstring))
+        return vocabstring
+    else:
+        print("Reading in Vocabulary...")
+        with open(os.path.join(dir, "simple_smiles_tokenizer_vocab.txt"), "r") as f:
+            vocabstring = f.readline().strip()
+        return vocabstring
+class Tokenizer:
+    def __init__(
+        self,
+        pretrained_name="QizhiPei/biot5-base-text2mol",
+        selfies_dict_path=os.path.join("dataset", "selfies_dict.txt"),
+    ):
+        self.tokenizer = self.get_tokenizer(pretrained_name, selfies_dict_path)
+    def get_tokenizer(self, pretrained_name, selfies_dict_path):
+        tokenizer = AutoTokenizer.from_pretrained(pretrained_name, use_fast=True)
+        tokenizer.model_max_length = int(1e9)
+        amino_acids = [
+            "A",
+            "C",
+            "D",
+            "E",
+            "F",
+            "G",
+            "H",
+            "I",
+            "K",
+            "L",
+            "M",
+            "N",
+            "P",
+            "Q",
+            "R",
+            "S",
+            "T",
+            "V",
+            "W",
+            "Y",
+        ]
+        prefixed_amino_acids = [f"<p>{aa}" for aa in amino_acids]
+        tokenizer.add_tokens(prefixed_amino_acids)
+        selfies_dict_list = [line.strip() for line in open(selfies_dict_path)]
+        tokenizer.add_tokens(selfies_dict_list)
+        special_tokens_dict = {
+            "additional_special_tokens": [
+                "<bom>",
+                "<eom>",
+                "<bop>",
+                "<eop>",
+                "MOLECULE NAME",
+                "DESCRIPTION",
+                "PROTEIN NAME",
+                "FUNCTION",
+                "SUBCELLULAR LOCATION",
+                "PROTEIN FAMILIES",
+            ]
+        }
+        tokenizer.add_special_tokens(special_tokens_dict)
+        return tokenizer
+    def __call__(self, *args, **kwds):
+        return self.tokenizer(*args, **kwds)
+    def __len__(self):
+        return len(self.tokenizer)
+    def corrupt(self, selfies_list: list):
+        tensors = []
+        if type(selfies_list) is str:
+            selfies_list = [selfies_list]
+        for selfies in selfies_list:
+            tensors.append(self.corrupt_one(selfies))
+        return torch.concat(tensors, dim=0)
+    # TODO: rewrite this for selfies
+    def corrupt_one(self, selfies):
+        smi = sf.decoder(selfies)
+        # res = [self.toktoid[i] for i in self.rg.findall(smi)]
+        res = [i for i in self.rg.findall(smi)]
+        total_length = len(res) + 2
+        if total_length > self.max_len:
+            return self.encode_one(smi)
+        ######################## start corruption ###########################
+        r = random.random()
+        if r < 0.3:
+            pa, ring = True, True
+        elif r < 0.65:
+            pa, ring = True, False
+        else:
+            pa, ring = False, True
+        #########################
+        max_ring_num = 1
+        ringpos = []
+        papos = []
+        for pos, at in enumerate(res):
+            if at == "(" or at == ")":
+                papos.append(pos)
+            elif at.isnumeric():
+                max_ring_num = max(max_ring_num, int(at))
+                ringpos.append(pos)
+        # ( & ) remove
+        r = random.random()
+        if r < 0.3:
+            remove, padd = True, True
+        elif r < 0.65:
+            remove, padd = True, False
+        else:
+            remove, padd = False, True
+        if pa and len(papos) > 0:
+            if remove:
+                # remove pa
+                n_remove = getrandomnumber(
+                    [1, 2, 3, 4], 1, weights=[0.6, 0.2, 0.1, 0.1]
+                )
+                p_remove = set(random.choices(papos, weights=None, k=n_remove))
+                total_length -= len(p_remove)
+                for p in p_remove:
+                    res[p] = None
+                    # print('debug pa delete {}'.format(p))
+        # Ring remove
+        r = random.random()
+        if r < 0.3:
+            remove, radd = True, True
+        elif r < 0.65:
+            remove, radd = True, False
+        else:
+            remove, radd = False, True
+        if ring and len(ringpos) > 0:
+            if remove:
+                # remove ring
+                n_remove = getrandomnumber(
+                    [1, 2, 3, 4], 1, weights=[0.7, 0.2, 0.05, 0.05]
+                )
+                p_remove = set(random.choices(ringpos, weights=None, k=n_remove))
+                total_length -= len(p_remove)
+                for p in p_remove:
+                    res[p] = None
+                    # print('debug ring delete {}'.format(p))
+        # ring add & ( ) add
+        if pa:
+            if padd:
+                n_add = getrandomnumber([1, 2, 3], 1, weights=[0.8, 0.2, 0.1])
+                n_add = min(self.max_len - total_length, n_add)
+                for _ in range(n_add):
+                    sele = random.randrange(len(res) + 1)
+                    res.insert(sele, "(" if random.random() < 0.5 else ")")
+                    # print('debug pa add {}'.format(sele))
+                    total_length += 1
+        if ring:
+            if radd:
+                n_add = getrandomnumber([1, 2, 3], 1, weights=[0.8, 0.2, 0.1])
+                n_add = min(self.max_len - total_length, n_add)
+                for _ in range(n_add):
+                    sele = random.randrange(len(res) + 1)
+                    res.insert(sele, str(random.randrange(1, max_ring_num + 1)))
+                    # print('debug ring add {}'.format(sele))
+                    total_length += 1
+        ########################## end corruption ###############################
+        # print('test:',res)
+        # print('test:',''.join([i for i in res if i is not None]))
+        res = [self.toktoid[i] for i in res if i is not None]
+        res = [1] + res + [2]
+        if len(res) < self.max_len:
+            res += [0] * (self.max_len - len(res))
+        else:
+            res = res[: self.max_len]
+            res[-1] = 2
+        return torch.LongTensor([res])
+    def decode_one(self, sample):
+        return self.tokenizer.decode(sample)
+    def decode(self, sample_list):
+        if len(sample_list.shape)==1:
+            return [self.decode_one(sample_list)]
+        return [self.decode_one(sample) for sample in sample_list]
+if __name__ == "__main__":
+    import selfies as sf
+    tokenizer = Tokenizer(
+        selfies_dict_path=r"D:\molecule\mol-lang-bridge\dataset\selfies_dict.txt"
+    )
+    smiles = [
+        "[210Po]",
+        "C[C@H]1C(=O)[C@H]([C@H]([C@H](O1)OP(=O)(O)OP(=O)(O)OC[C@@H]2[C@H](C[C@@H](O2)N3C=C(C(=O)NC3=O)C)O)O)O",
+        "C(O)P(=O)(O)[O-]",
+        "CCCCCCCCCCCC(=O)OC(=O)CCCCCCCCCCC",
+        "C[C@]12CC[C@H](C[C@H]1CC[C@@H]3[C@@H]2CC[C@]4([C@H]3CCC4=O)C)O[C@H]5[C@@H]([C@H]([C@@H]([C@H](O5)C(=O)O)O)O)O",
+    ]
+    selfies = [sf.encoder(smiles_ele) for smiles_ele in smiles]
+    output = tokenizer(
+        selfies,
+        max_length=512,
+        truncation=True,
+        padding="max_length",
+        add_special_tokens=True,
+        return_tensors="pt",
+        return_attention_mask=True,
+    )
+    print(output["input_ids"])

src/scripts/nll.py ADDED Viewed

	@@ -0,0 +1,241 @@

+"""
+Approximate the bits/dimension for an image model.
+"""
+import argparse
+import os, json
+import torch as th
+import numpy as np
+import torch.distributed as dist
+from improved_diffusion import dist_util, logger
+from improved_diffusion.image_datasets import load_data
+from improved_diffusion.text_datasets import load_data_text, load_synthetic_data
+from improved_diffusion.script_util import (
+    model_and_diffusion_defaults,
+    create_model_and_diffusion,
+    add_dict_to_argparser,
+    args_to_dict,
+)
+from functools import partial
+from transformers import set_seed
+from improved_diffusion.test_util import get_weights, denoised_fn_round, compute_logp, load_results
+def main():
+    """Load a trained diffusion model plus its data loader and run the bits-per-dim evaluation.
+    Command-line arguments are merged with the ``training_args.json`` stored next to the
+    checkpoint, the model weights are loaded, a data iterator is built for the selected
+    ``args.modality`` and everything is handed to ``run_bpd_evaluation``.
+    """
+    set_seed(42)
+    # create_argparser is not defined in the part of the file visible here.
+    args = create_argparser().parse_args()
+    # load configurations.
+    config_path = os.path.join(os.path.split(args.model_path)[0], "training_args.json")
+    print(config_path)
+    # sys.setdefaultencoding('utf-8')
+    with open(config_path, 'rb', ) as f:
+        training_args = json.load(f)
+    # Keep the command-line batch size and data_dir; every other training-time setting
+    # overrides the command-line value in the update below.
+    training_args['batch_size'] = args.batch_size
+    print(args.data_dir)
+    del training_args['data_dir']
+    # print(args.__dict__, training_args)
+    args.__dict__.update(training_args)
+    print(args.__dict__['batch_size'], training_args['batch_size'], args.clip_denoised, args.batch_size)
+    print(args.data_dir)
+    # if args.noise_level > 0.0: flag_noise=True #DEBUG
+    args.noise_level = 0.0
+    args.roc_train = 'diffusion_lm/ROCstory'
+    # 'roc-aug' is evaluated as plain 'roc'.
+    if args.modality == 'roc-aug':
+        args.modality = 'roc'
+    # DEBUG
+    dist_util.setup_dist()
+    logger.configure()
+    logger.log("creating model and diffusion...")
+    model, diffusion = create_model_and_diffusion(
+        **args_to_dict(args, model_and_diffusion_defaults().keys())
+    )
+    model.load_state_dict(th.load(args.model_path))
+    # model.load_state_dict(
+    #     dist_util.load_state_dict(args.model_path, map_location="cpu")
+    # )
+    # diffusion.rescale_timesteps = False # IMPORTANT DEBUG -->  REMOVE
+    model.to(dist_util.dev())
+    model.eval() # DEBUG
+    logger.log("creating data loader...")
+    # NOTE(review): the 'image' and 'permuted_image' branches never assign model2, yet model2 is
+    # passed to run_bpd_evaluation at the end of this function; those two modalities would
+    # raise UnboundLocalError there.
+    if args.modality == 'image':
+        data = load_data(
+            data_dir=args.data_dir,
+            batch_size=args.batch_size,
+            image_size=args.image_size,
+            class_cond=args.class_cond,
+            deterministic=True,
+        )
+    elif args.modality == 'permuted_image':
+        # perm = np.arange(args.image_size * args.image_size)
+        # np.random.shuffle(perm)
+        model_path_base = os.path.split(args.model_path)[0]
+        # The message says "to" but the permutation is read from this file, not written.
+        print(f'load permutation to {model_path_base}/permutation.json')
+        with open(f'{model_path_base}/permutation.json', 'r') as f:
+            perm = json.load(f)
+        perm = np.array(perm)
+        data = load_data(
+            data_dir=args.data_dir,
+            batch_size=args.batch_size,
+            image_size=args.image_size,
+            class_cond=args.class_cond,
+            permutation=perm
+        )
+    elif args.modality == 'synth':
+        from improved_diffusion.rounding import load_models
+        model2, tokenizer = load_models(args.modality, args.experiment, args.model_name_or_path, args.in_channel,
+                    os.path.split(args.model_path)[0])
+        data = load_synthetic_data(
+            data_dir=args.data_dir,
+            batch_size=args.batch_size,
+            image_size=args.image_size,
+            class_cond=args.class_cond,
+            data_args=args,
+            model=model2,
+            split='train',
+            # split='valid',
+            deterministic=True
+        )
+    elif args.modality == 'pos':
+        from improved_diffusion.rounding import load_models
+        model2, tokenizer = load_models(args.modality, args.experiment, args.model_name_or_path, args.in_channel,
+                                        os.path.split(args.model_path)[0])
+        data = load_synthetic_data(
+            data_dir=args.data_dir,
+            batch_size=args.batch_size,
+            image_size=args.image_size,
+            class_cond=args.class_cond,
+            data_args=args,
+            model=model2,
+            pos=True,
+            deterministic = True
+        )
+    else:
+        # Every other modality is loaded as text.
+        from improved_diffusion.rounding import load_models
+        model2, tokenizer = load_models(args.modality, args.experiment, args.model_name_or_path, args.in_channel,
+                                        os.path.split(args.model_path)[0])
+        # print(tokenizer)
+        # rev_tokenizer = {k:int(v)  for k, v in tokenizer.items()}
+        # Invert the mapping returned by load_models before handing it to load_data_text.
+        rev_tokenizer = {v:k  for k, v in tokenizer.items()}
+        if args.training_mode == 'e2e':
+            print('e2e, load the right model embeddings', '*'*80)
+            # Replace model2's weights with a CPU copy of the diffusion model's word embeddings.
+            model2.weight = th.nn.Parameter(model.word_embedding.weight.clone().cpu())
+        # print(rev_tokenizer)
+        data = load_data_text(
+            data_dir=args.data_dir,
+            batch_size=args.batch_size,
+            image_size=args.image_size,
+            class_cond=args.class_cond,
+            data_args=args,
+            model=model2,
+            deterministic=True,
+            task_mode=args.modality,
+            padding_mode=args.padding_mode,  # block, pad
+            split=args.split,
+            load_vocab=rev_tokenizer,
+        )
+    logger.log("evaluating...")
+    run_bpd_evaluation(model, diffusion, data, args.num_samples, args.clip_denoised, args, model2)
+def run_bpd_evaluation(model, diffusion, data, num_samples, clip_denoised, args, model2):
+    all_bpd = []
+    all_metrics = {"vb": [], "mse": [], "xstart_mse": []}
+    num_complete = 0
+    model3 = get_weights(model2, args)
+    while num_complete < num_samples:
+        batch, model_kwargs = next(data)
+        batch = batch.to(dist_util.dev())
+        model_kwargs = {k: v.to(dist_util.dev()) for k, v in model_kwargs.items()}
+        model_kwargs['mapping_func'] = partial(compute_logp, args, model3.cuda())
+        minibatch_metrics = diffusion.calc_bpd_loop(
+            model, batch, clip_denoised=clip_denoised, model_kwargs=model_kwargs,
+            # denoised_fn=None,
+            denoised_fn=partial(denoised_fn_round, args, model3.cuda()) if args.clamp == 'clamp' else None,
+        )
+        for key, term_list in all_metrics.items():
+            terms = minibatch_metrics[key].mean(dim=0) / dist.get_world_size()
+            dist.all_reduce(terms)
+            term_list.append(terms.detach().cpu().numpy())
+        total_bpd = minibatch_metrics["total_bpd"]
+        total_bpd = total_bpd.mean() / dist.get_world_size()
+        dist.all_reduce(total_bpd)
+        all_bpd.append(total_bpd.item())
+        num_complete += dist.get_world_size() * batch.shape[0]
+        logger.log(f"done {num_complete} samples on {args.split}: bpd={np.mean(all_bpd)}, "
+                   f"per token={np.mean(all_bpd) * args.in_channel} ", args.model_path)
+        temp_cat = np.mean(np.stack(all_metrics['vb']), axis=0)
+        if len(temp_cat) % 8 == 0:
+            print([y.sum() for y in np.split(np.mean(np.stack(all_metrics['vb']), axis=0), 8)])
+        else:
+            print(temp_cat[0].sum())
+            print([y.sum() for y in np.split(temp_cat[1:-1], 8)])
+            print(temp_cat[-1].sum())
+        vb_temp = np.mean(np.stack(all_metrics['vb']), axis=0)
+        print(vb_temp.shape, vb_temp.sum())
+        print(vb_temp[-10:])
+    if dist.get_rank() == 0:
+        for name, terms in all_metrics.items():
+            model_base_name = os.path.basename(
+                os.path.split(args.model_path)[0]) + f'.{os.path.split(args.model_path)[1]}'
+            # args.out_dir = os.path.join(args.out_dir, f"{model_base_name}.samples_{shape_str}.txt")
+            out_path = os.path.join(args.out_dir, f"{model_base_name}.{name}_{args.split}_{args.clamp}_terms.npz")
+            logger.log(f"saving {name} terms to {out_path}")
+            np.savez(out_path, np.mean(np.stack(terms), axis=0))
+    dist.barrier()
+    logger.log("evaluation complete")
+    if 'ema' in args.model_path:
+        json_path = os.path.join(os.path.split(args.model_path)[0], f'ema_score_{args.split}_nll.json')
+    elif args.clamp == 'noclamp':
+        json_path = os.path.join(os.path.split(args.model_path)[0], f'score_{args.split}_nll_noclamp.json')
+    else:
+        json_path = os.path.join(os.path.split(args.model_path)[0], f'score_{args.split}_nll.json')
+    print(f'written to {json_path}')
+    temp_cat = np.mean(np.stack(all_metrics['vb']), axis=0)
+    if len(temp_cat) % 8 == 0:
+        temp_cat = temp_cat
+    else:
+        temp_cat = temp_cat[1:-1]
+    json_dict = {
+        f'score_{args.split}_ppl_token': np.mean(all_bpd) * args.in_channel,
+        f'score_{args.split}_ppl_dim': np.mean(all_bpd),
+        f'break_down_{args.split}_dim' : [y.sum().item() for y in np.split(temp_cat, 8)],
+        f'last_10_{args.split}_dim': vb_temp[-10:].tolist(),
+        'source_file': out_path,
+        'num_samples':num_samples,
+    }
+    load_results(json_path, json_dict)
+def create_argparser():
+    defaults = dict(
+        data_dir="", clip_denoised=False, num_samples=128, batch_size=64, model_path="",
+        out_dir="diffusion_lm/improved_diffusion/scores",
+        emb_scale_factor=1.0, split='train', debug_path='', clamp='clamp',
+    )
+    defaults.update(model_and_diffusion_defaults())
+    parser = argparse.ArgumentParser()
+    add_dict_to_argparser(parser, defaults)
+    return parser
+# Run the evaluation entry point only when this file is executed as a script.
+if __name__ == "__main__":
+    main()

src/scripts/tree_helper.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import torch
+import spacy, nltk
+from nltk.tree import Tree
+import numpy as np
+def collapse_unary_strip_pos(tree, strip_top=True):
+    """Collapse unary chains and strip part of speech tags."""
+    def strip_pos(tree):
+        if len(tree) == 1 and isinstance(tree[0], str):
+            return tree[0]
+        else:
+            return nltk.tree.Tree(tree.label(), [strip_pos(child) for child in tree])
+    collapsed_tree = strip_pos(tree)
+    collapsed_tree.collapse_unary(collapsePOS=True, joinChar="::")
+    if collapsed_tree.label() in ("TOP", "ROOT", "S1", "VROOT"):
+        if strip_top:
+            if len(collapsed_tree) == 1:
+                collapsed_tree = collapsed_tree[0]
+            else:
+                collapsed_tree.set_label("")
+        elif len(collapsed_tree) == 1:
+            collapsed_tree[0].set_label(
+                collapsed_tree.label() + "::" + collapsed_tree[0].label())
+            collapsed_tree = collapsed_tree[0]
+    return collapsed_tree
+def _get_labeled_spans(tree, spans_out, start):
+    if isinstance(tree, str):
+        return start + 1
+    assert len(tree) > 1 or isinstance(
+        tree[0], str
+    ), "Must call collapse_unary_strip_pos first"
+    end = start
+    for child in tree:
+        end = _get_labeled_spans(child, spans_out, end)
+    # Spans are returned as closed intervals on both ends
+    spans_out.append((start, end - 1, tree.label()))
+    return end
+def get_labeled_spans(tree):
+    """Converts a tree into a list of labeled spans.
+    Args:
+        tree: an nltk.tree.Tree object
+    Returns:
+        A list of (span_start, span_end, span_label) tuples. The start and end
+        indices indicate the first and last words of the span (a closed
+        interval). Unary chains are collapsed, so e.g. a (S (VP ...)) will
+        result in a single span labeled "S+VP".
+    """
+    tree = collapse_unary_strip_pos(tree)
+    spans_out = []
+    _get_labeled_spans(tree, spans_out, start=0)
+    return spans_out
+def padded_chart_from_spans(label_vocab, spans, ):
+    num_words = 64
+    chart = np.full((num_words, num_words), -100, dtype=int)
+    # chart = np.tril(chart, -1)
+    # Now all invalid entries are filled with -100, and valid entries with 0
+    for start, end, label in spans:
+        if label in label_vocab:
+            chart[start, end] = label_vocab[label]
+    return chart
+def chart_from_tree(label_vocab, tree, verbose=False):
+    spans = get_labeled_spans(tree)
+    num_words = len(tree.leaves())
+    chart = np.full((num_words, num_words), -100, dtype=int)
+    chart = np.tril(chart, -1)
+    # Now all invalid entries are filled with -100, and valid entries with 0
+    # print(tree)
+    for start, end, label in spans:
+        # Previously unseen unary chains can occur in the dev/test sets.
+        # For now, we ignore them and don't mark the corresponding chart
+        # entry as a constituent.
+        # print(start, end, label)
+        if label in label_vocab:
+            chart[start, end] = label_vocab[label]
+    if not verbose:
+        return chart
+    else:
+        return chart, spans
+def pad_charts(charts, padding_value=-100):
+    """
+    Our input text format contains START and END, but the parse charts doesn't.
+    NEED TO: update the charts, so that we include these two, and set their span label to 0.
+    :param charts:
+    :param padding_value:
+    :return:
+    """
+    max_len = 64
+    padded_charts = torch.full(
+        (len(charts), max_len, max_len),
+        padding_value,
+    )
+    padded_charts = np.tril(padded_charts, -1)
+    # print(padded_charts[-2:], padded_charts.shape)
+    # print(padded_charts[1])
+    for i, chart in enumerate(charts):
+        # print(chart, len(chart), len(chart[0]))
+        chart_size = len(chart)
+        padded_charts[i, 1:chart_size+1, 1:chart_size+1] = chart
+    # print(padded_charts[-2:], padded_charts.shape)
+    return padded_charts

train.py ADDED Viewed

	@@ -0,0 +1,177 @@

+import os
+import argparse
+from transformers import set_seed
+from src.scripts.mytokenizers import Tokenizer
+from src.improved_diffusion import gaussian_diffusion as gd
+from src.improved_diffusion.respace import SpacedDiffusion
+from src.improved_diffusion import dist_util
+from src.improved_diffusion.transformer_model import TransformerNetModel
+from src.improved_diffusion.resample import create_named_schedule_sampler
+from src.improved_diffusion.script_util import model_and_diffusion_defaults
+from src.improved_diffusion.script_util import add_dict_to_argparser
+from src.improved_diffusion.train_util import TrainLoop
+import torch.distributed as dist
+import wandb
+from src.scripts.mydatasets import get_dataloader, Lang2molDataset_train
+import warnings
+import torch.multiprocessing as mp
+def main_worker(rank, world_size):
+    args = create_argparser().parse_args()
+    set_seed(42)
+    wandb.login(key=args.wandb_token)
+    wandb.init(
+        project="ACL_Lang2Mol",
+        config=args.__dict__,
+    )
+    dist_util.setup_dist(rank, world_size)
+    tokenizer = Tokenizer()
+    model = TransformerNetModel(
+        in_channels=args.model_in_channels,
+        model_channels=args.model_model_channels,
+        dropout=args.model_dropout,
+        vocab_size=len(tokenizer),
+        hidden_size=args.model_hidden_size,
+        num_attention_heads=args.model_num_attention_heads,
+        num_hidden_layers=args.model_num_hidden_layers,
+    )
+    if args.model_path != "":
+        model.load_state_dict(
+            dist_util.load_state_dict(args.model_path, map_location="cpu")
+        )
+    model.train()
+    print("Total params:", sum(p.numel() for p in model.parameters()))
+    print(
+        "Total trainable params:",
+        sum(p.numel() for p in model.parameters() if p.requires_grad),
+    )
+    print("Tokenizer vocab length:", len(tokenizer))
+    diffusion = SpacedDiffusion(
+        use_timesteps=[i for i in range(args.diffusion_steps)],
+        betas=gd.get_named_beta_schedule("sqrt", args.diffusion_steps),
+        model_mean_type=(gd.ModelMeanType.START_X),
+        model_var_type=((gd.ModelVarType.FIXED_LARGE)),
+        loss_type=gd.LossType.E2E_MSE,
+        rescale_timesteps=True,
+        model_arch="transformer",
+        training_mode="e2e",
+    )
+    schedule_sampler = create_named_schedule_sampler("uniform", diffusion)
+    print("Loading data...")
+    train_dataset = Lang2molDataset_train(
+        dir=args.dataset_path,
+        tokenizer=tokenizer,
+        split="train",
+        corrupt_prob=0.0,
+        token_max_length=512,
+        dataset_name=args.dataset_name,
+    )
+    dataloader = get_dataloader(train_dataset, args.batch_size, rank, world_size)
+    print("Finish loading data")
+    TrainLoop(
+        model=model,
+        diffusion=diffusion,
+        data=dataloader,
+        batch_size=args.batch_size,
+        microbatch=args.microbatch,
+        lr=args.lr,
+        ema_rate=args.ema_rate,
+        log_interval=args.log_interval,
+        save_interval=args.save_interval,
+        resume_checkpoint=args.resume_checkpoint,
+        use_fp16=args.use_fp16,
+        fp16_scale_growth=args.fp16_scale_growth,
+        schedule_sampler=schedule_sampler,
+        weight_decay=args.weight_decay,
+        lr_anneal_steps=args.lr_anneal_steps,
+        checkpoint_path=args.checkpoint_path,
+        gradient_clipping=args.gradient_clipping,
+        eval_data=None,
+        eval_interval=args.eval_interval,
+    ).run_loop()
+    dist.destroy_process_group()
+def create_argparser():
+    defaults = dict()
+    text_defaults = dict(
+        wandb_token="",
+        batch_size=16,
+        cache_mode="no",
+        checkpoint_path="checkpoints",
+        class_cond=False,
+        config="ll",
+        config_name="QizhiPei/biot5-base-text2mol",
+        dataset_path="dataset",
+        diffusion_steps=2000,
+        dropout=0.01,
+        e2e_train="",
+        ema_rate="0.9999",
+        emb_scale_factor=1.0,
+        eval_interval=2000,
+        experiment="random",
+        experiment_mode="lm",
+        fp16_scale_growth=0.001,
+        gradient_clipping=2.4,
+        image_size=8,
+        in_channel=16,
+        learn_sigma=False,
+        log_interval=1000,
+        logits_mode=1,
+        lr=0.00005,
+        lr_anneal_steps=500000,
+        microbatch=-1,
+        modality="e2e-tgt",
+        model_arch="transformer",
+        noise_level=0.0,
+        noise_schedule="sqrt",
+        num_channels=128,
+        num_heads=4,
+        num_heads_upsample=-1,
+        num_res_blocks=2,
+        out_channel=16,
+        padding_mode="pad",
+        predict_xstart=True,
+        preprocessing_num_workers=1,
+        rescale_learned_sigmas=True,
+        rescale_timesteps=True,
+        resume_checkpoint="",
+        save_interval=50000,
+        schedule_sampler="uniform",
+        seed=42,
+        timestep_respacing="",
+        training_mode="e2e",
+        use_bert_tokenizer="no",
+        use_checkpoint=False,
+        use_fp16=False,
+        use_kl=False,
+        use_scale_shift_norm=True,
+        weight_decay=0.0,
+        model_in_channels=32,
+        model_model_channels=128,
+        model_dropout=0.01,
+        model_hidden_size=1024,
+        model_num_attention_heads=16,
+        model_num_hidden_layers=12,
+        dataset_name="",
+        model_path="",
+    )
+    defaults.update(model_and_diffusion_defaults())
+    defaults.update(text_defaults)
+    parser = argparse.ArgumentParser()
+    add_dict_to_argparser(parser, defaults)
+    return parser
+# Script entry point: launch the training worker(s) in spawned processes.
+if __name__ == "__main__":
+    # A single process is spawned; raise world_size to start more workers.
+    world_size = 1
+    # mp.spawn calls main_worker(i, world_size), with i the process index (used as rank).
+    mp.spawn(main_worker, args=(world_size,), nprocs=world_size, join=True)