BioMike committed
Commit
7476d14
1 Parent(s): 3b0c756

Upload 23 files

app.py ADDED
@@ -0,0 +1,10 @@
import gradio as gr
from interfaces import smiles2iupac, iupac2smiles, iupac2style, landing


demo = gr.TabbedInterface([landing, smiles2iupac, iupac2smiles, iupac2style],
                          ["Introduction", "SMILES-to-IUPAC", "IUPAC-to-SMILES", "IUPAC style prediction"],
                          title="ChemConverters 🧪🔬🧬👨🏻‍🔬",
                          theme=gr.themes.Base())

demo.launch(share=True)
article.html ADDED
@@ -0,0 +1,44 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ChemConverters App Description</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 30px;
            line-height: 4;
        }
        .link-button {
            display: inline-block;
            margin: 50px 50px;
            padding: 50px;
            background-color: #007bff;
            color: white;
            text-decoration: none;
            border-radius: 50px;
            font-weight: bold;
        }
        .link-button:hover {
            background-color: #0056b3;
        }
    </style>
</head>
<body>
    <p>With ChemConverters, you can effortlessly:</p>
    <ul>
        <li>Convert SMILES strings to IUPAC names and vice versa 🔄</li>
        <li>Choose your preferred IUPAC naming style: BASE, SYSTEMATIC, or TRADITIONAL 📚</li>
        <li>Validate chemical naming with molecular fingerprint similarity for accuracy checks ✔️</li>
    </ul>
    <p>Developed by the brilliant minds at Knowledgator, this app showcases the abilities of our chemical transformer models. Whether you're working on a research project, studying for an exam, or just exploring the chemical universe, ChemConverters is your go-to tool. 🛠️</p>
    <p>Remember, chemistry is not just about reactions; it's about connections. Let's build those connections together! 💫</p>
    <!-- Links Section -->
    <div>
        <a href="https://www.knowledgator.com/" class="link-button" target="_blank">🔗 Visit our Website 🔗</a>
        <a href="https://www.linkedin.com/company/knowledgator/" class="link-button" target="_blank">💼 Follow on LinkedIn 💼</a>
        <a href="https://huggingface.co/knowledgator/" class="link-button" target="_blank">🤗 Hugging Face Profile 🤗</a>
    </div>
</body>
</html>
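Note on the fingerprint-based validation mentioned above: the helper the app actually calls (`validate_smiles2iupac` in a `utils` module) is not shown in this commit view. The snippet below is a minimal, hypothetical sketch of such a check using RDKit (listed in requirements.txt), comparing Morgan fingerprints of the input molecule and the molecule parsed back from the predicted name via Tanimoto similarity; the function name and parameters here are illustrative assumptions, not the app's implementation.

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

def fingerprint_similarity(smiles_in: str, smiles_back: str) -> float:
    """Tanimoto similarity between Morgan fingerprints of two SMILES strings."""
    mol_a = Chem.MolFromSmiles(smiles_in)
    mol_b = Chem.MolFromSmiles(smiles_back)
    if mol_a is None or mol_b is None:
        return 0.0  # unparsable structure -> treat as no match
    fp_a = AllChem.GetMorganFingerprintAsBitVect(mol_a, radius=2, nBits=2048)
    fp_b = AllChem.GetMorganFingerprintAsBitVect(mol_b, radius=2, nBits=2048)
    return DataStructs.TanimotoSimilarity(fp_a, fp_b)

print(fingerprint_similarity("CCO", "OCC"))  # 1.0 - same molecule written as two different SMILES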
interfaces/__init__.py ADDED
@@ -0,0 +1,4 @@
from .smiles2iupac import smiles2iupac
from .iupac2smiles import iupac2smiles
from .iupac2style import iupac2style
from .landing import landing
interfaces/iupac2smiles.py ADDED
@@ -0,0 +1,27 @@
import gradio as gr
from utils import ChemicalConverter, validate_smiles2iupac, plot_mol

def convert(chemical_name, plot):
    # Initialize the ChemicalConverter
    converter = ChemicalConverter(mode="IUPAC2SMILES")
    plot_image = None
    # The model prepends a 6-character style token (e.g. "<SYST>"); strip it from the output
    converted_name = converter.convert(chemical_name)[6:]
    if plot:
        plot_image = plot_mol(converted_name)
    return converted_name, plot_image


iupac2smiles = gr.Interface(
    fn=convert,
    allow_flagging='auto',
    inputs=[
        gr.Textbox(label="Enter your IUPAC name", placeholder="Enter IUPAC name here"),
        gr.Checkbox(label="Plot molecule", value=True)
    ],
    outputs=[gr.Text(label="Converted Name"),
             gr.Image(type='pil', label="Molecule Plot", height=170, width=890)],
    examples=[
        ["ethanol", True]
    ],
)
interfaces/iupac2style.py ADDED
@@ -0,0 +1,22 @@
import gradio as gr
from utils import ChemicalConverter, validate_smiles2iupac, plot_mol

def convert(chemical_name):
    # Initialize the ChemicalConverter
    converter = ChemicalConverter(mode="IUPAC2SMILES")
    # The first 6 characters of the output are the style token predicted by the model
    converted_name = converter.convert(chemical_name)[:6]
    styles = {"<SYST>": "SYSTEMATIC", "<TRAD>": "TRADITIONAL", "<BASE>": "BASE"}
    return styles.get(converted_name, "")


iupac2style = gr.Interface(
    fn=convert,
    allow_flagging='auto',
    inputs=[
        gr.Textbox(label="Enter your IUPAC name", placeholder="Enter IUPAC name here"),
    ],
    outputs=[gr.Text(label="IUPAC style")],
    examples=[
        ["propan-2-yl 2-[4-(4-chlorophenyl)carbonylphenoxy]-2-methyl-propanoate"]
    ],
)
interfaces/landing.py ADDED
@@ -0,0 +1,6 @@
import gradio as gr

with open('materials/introduction.html', 'r', encoding='utf-8') as file:
    html_description = file.read()

landing = gr.HTML(html_description)
interfaces/smiles2iupac.py ADDED
@@ -0,0 +1,36 @@
import gradio as gr
from utils import ChemicalConverter, validate_smiles2iupac, plot_mol

def convert(chemical_name, style, validate, plot):
    # Initialize the ChemicalConverter
    converter = ChemicalConverter(mode="SMILES2IUPAC")
    validation_score = ""
    plot_image = None
    # Build the style control token from the first four letters of the chosen style,
    # e.g. "SYSTEMATIC" -> "<SYST>", and prepend it to the SMILES input
    style_prefix = "<" + style[:4] + ">"
    converted_name = converter.convert(style_prefix + chemical_name)
    if validate:
        validation_score = validate_smiles2iupac(chemical_name, converted_name)
    if plot:
        plot_image = plot_mol(chemical_name)
    return converted_name, validation_score, plot_image

smiles2iupac = gr.Interface(
    fn=convert,
    allow_flagging='auto',
    inputs=[
        gr.Textbox(label="Enter your SMILES string", placeholder="Enter your SMILES string here"),
        gr.Radio(
            choices=["BASE", "SYSTEMATIC", "TRADITIONAL"],
            label="Choose desired IUPAC style",
        ),
        gr.Checkbox(label="Validate with molecular similarity", value=False),
        gr.Checkbox(label="Plot molecule", value=True)
    ],
    outputs=[gr.Text(label="Converted Name"),
             gr.Text(label="Input-Target similarity score"),
             gr.Image(type='pil', label="Molecule Plot", height=170, width=890)],
    examples=[
        ["CCO", "BASE", True, True]
    ],
)
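The style control tokens used above are the same ones that interfaces/iupac2style.py maps back to style names. A small illustration of how they are formed (not part of the commit):

# Each IUPAC style is encoded as a token built from its first four letters
for style in ["BASE", "SYSTEMATIC", "TRADITIONAL"]:
    print("<" + style[:4] + ">")
# -> <BASE>, <SYST>, <TRAD>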
materials/introduction.html ADDED
@@ -0,0 +1,67 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ChemConverters App Description</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 10px;
            line-height: 1.6;
        }
        .link-button {
            display: inline-flex;
            align-items: center;
            justify-content: center;
            margin: 10px;
            padding: 10px;
            background-color: white;
            border: 1px solid grey; /* Added border to make the button visible against white background */
            color: #007bff; /* Text color changed to make it visible against white background */
            text-decoration: none;
            border-radius: 10px;
            text-align: center;
            vertical-align: middle;
            box-sizing: border-box;
        }
        .link-button:hover {
            background-color: #c0dcfc;
        }
        .link-button img {
            height: 30px;
            width: auto;
            display: block;
        }
        .links-container {
            text-align: center; /* Center the container's content */
            margin: auto; /* Auto margins for horizontal centering if necessary */
            display: flex; /* Use flexbox */
            justify-content: center; /* Center flex items horizontally */
            flex-wrap: wrap; /* Allow items to wrap */
        }
    </style>
</head>
<body>
    <h2>Welcome to ChemConverters! 🧪🔬</h2>
    <h3>With ChemConverters, you can effortlessly:</h3>
    <ol>
        <li>Convert SMILES strings to IUPAC names and vice versa 🔄</li>
        <li>Choose your preferred IUPAC naming style: BASE, SYSTEMATIC, or TRADITIONAL 📚</li>
        <li>Validate chemical naming with molecular fingerprint similarity for accuracy checks ✔️</li>
    </ol>
    <h3>What is ChemConverters?</h3>
    <p>ChemConverters serves as a foundational showcase of our technological capabilities within the chemical domain. The models deployed in this application represent our entry-level offerings, designed to provide a glimpse into the potential applications of our advanced solutions. For access to our comprehensive suite of larger and more precise models, we invite interested parties to engage directly with us. Developed by the brilliant minds at Knowledgator, this app showcases the abilities of our chemical transformer models. Whether you're working on a research project, studying for an exam, or just exploring the chemical universe, ChemConverters is your go-to tool. 🛠</p>
    <h3>Model Availability</h3>
    <p>All models used in the application are available on <a href="https://huggingface.co/knowledgator/" target="_blank">our Hugging Face page</a>. For translating from SMILES to IUPAC, the <a href="https://huggingface.co/knowledgator/SMILES2IUPAC-canonical-base" target="_blank">knowledgator/SMILES2IUPAC-canonical-base</a> model was used. To translate from IUPAC to SMILES, the <a href="https://huggingface.co/knowledgator/IUPAC2SMILES-canonical-base" target="_blank">knowledgator/IUPAC2SMILES-canonical-base</a> model was used.</p>
    <h3>Citation</h3>
    <p>Coming soon</p>
    <h3>Remember, chemistry is not just about reactions; it's about connections. Let's build those connections together! 💫</h3>
    <!-- Links Section -->
    <div class="links-container">
        <a href="https://www.knowledgator.com/" class="link-button" target="_blank"><img src="https://assets-global.website-files.com/65902be8ba48a05dfdb73331/6590476fcc8e8f35b2332781_Group%201000002504%20(1).png" alt="Visit our website"></a>
        <a href="https://www.linkedin.com/company/knowledgator/" class="link-button" target="_blank"><img src="https://www.edigitalagency.com.au/wp-content/uploads/Linkedin-logo-png.png" alt="Follow on LinkedIn"></a>
        <a href="https://huggingface.co/knowledgator/" class="link-button" target="_blank"><img src="https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo-with-title.png" alt="Hugging Face Profile"></a>
    </div>
</body>
</html>
modeling/__init__.py ADDED
@@ -0,0 +1,2 @@
from .model import MT5ForConditionalGeneration
from .config import MT5Config
modeling/config.py ADDED
@@ -0,0 +1,133 @@
from transformers import PretrainedConfig

class MT5Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MT5Model`] or a [`TFMT5Model`]. It is used to
    instantiate a mT5 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the mT5
    [google/mt5-small](https://huggingface.co/google/mt5-small) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Arguments:
        encoder_vocab_size (`int`, *optional*, defaults to 250112):
            Vocabulary size of the encoder. Defines the number of different tokens that can be represented by the
            `input_ids` passed to the encoder.
        decoder_vocab_size (`int`, *optional*, defaults to 250112):
            Vocabulary size of the decoder. Defines the number of different tokens that can be represented by the
            `decoder_input_ids` and produced by the language modeling head.
        shared_embedding (`bool`, *optional*, defaults to `False`):
            Whether the encoder and decoder share a single embedding matrix.
        d_model (`int`, *optional*, defaults to 256):
            Size of the encoder layers and the pooler layer.
        d_kv (`int`, *optional*, defaults to 64):
            Size of the key, query, value projections per attention head. It is usually expected that
            `d_kv == d_model // num_heads`, but in the mt5-small architecture `d_kv` is not equal to
            `d_model // num_heads`; the `inner_dim` of the projection layer is defined as `num_heads * d_kv`.
        d_ff (`int`, *optional*, defaults to 512):
            Size of the intermediate feed forward layer in each `T5Block`.
        num_layers (`int`, *optional*, defaults to 4):
            Number of hidden layers in the Transformer encoder.
        num_decoder_layers (`int`, *optional*):
            Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
        num_heads (`int`, *optional*, defaults to 3):
            Number of attention heads for each attention layer in the Transformer encoder.
        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
            The number of buckets to use for each attention layer.
        relative_attention_max_distance (`int`, *optional*, defaults to 128):
            The maximum distance of the longer sequences for the bucket separation.
        dropout_rate (`float`, *optional*, defaults to 0.1):
            The ratio for all dropout layers.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        feed_forward_proj (`string`, *optional*, defaults to `"gated-gelu"`):
            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
    """

    model_type = "mt5"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        encoder_vocab_size=250112,
        decoder_vocab_size=250112,
        shared_embedding=False,
        d_model=256,
        d_kv=64,
        d_ff=512,
        num_layers=4,
        num_decoder_layers=None,
        num_heads=3,
        relative_attention_num_buckets=32,
        relative_attention_max_distance=128,
        dropout_rate=0.1,
        layer_norm_epsilon=1e-6,
        initializer_factor=1.0,
        feed_forward_proj="gated-gelu",
        is_encoder_decoder=True,
        use_cache=True,
        tokenizer_class="ChemTokenizers.SMILES_IUPAC_FAST.FastTokenizer",
        tie_word_embeddings=False,
        pad_token_id=0,
        eos_token_id=1,
        decoder_start_token_id=2,
        classifier_dropout=0.0,
        **kwargs,
    ):
        super().__init__(
            is_encoder_decoder=is_encoder_decoder,
            tokenizer_class=tokenizer_class,
            tie_word_embeddings=tie_word_embeddings,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            decoder_start_token_id=decoder_start_token_id,
            **kwargs,
        )
        self.encoder_vocab_size = encoder_vocab_size
        self.decoder_vocab_size = decoder_vocab_size
        self.shared_embedding = shared_embedding
        self.d_model = d_model
        self.d_kv = d_kv
        self.d_ff = d_ff
        self.num_layers = num_layers
        self.num_decoder_layers = (
            num_decoder_layers if num_decoder_layers is not None else self.num_layers
        )  # default = symmetry
        self.num_heads = num_heads
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.relative_attention_max_distance = relative_attention_max_distance
        self.dropout_rate = dropout_rate
        self.classifier_dropout = classifier_dropout
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_factor = initializer_factor
        self.feed_forward_proj = feed_forward_proj
        self.use_cache = use_cache

        act_info = self.feed_forward_proj.split("-")
        self.dense_act_fn = act_info[-1]
        self.is_gated_act = act_info[0] == "gated"

        if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2:
            raise ValueError(
                f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. "
                "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. "
                "'gated-gelu' or 'relu'"
            )

        # for backwards compatibility
        if feed_forward_proj == "gated-gelu":
            self.dense_act_fn = "gelu_new"

    @property
    def hidden_size(self):
        return self.d_model

    @property
    def num_attention_heads(self):
        return self.num_heads

    @property
    def num_hidden_layers(self):
        return self.num_layers
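A quick usage sketch (an assumption that this package is importable as `modeling`, not part of the commit): instantiating the config with the vocabulary sizes used by the bundled SMILES2IUPAC checkpoint shows how `feed_forward_proj="gated-gelu"` is split into a gated flag plus the `gelu_new` activation, and how the standard `hidden_size`/`num_attention_heads`/`num_hidden_layers` properties map onto the MT5 field names.

from modeling.config import MT5Config

# Vocabulary sizes taken from models/SMILES2IUPAC/config.json in this commit
cfg = MT5Config(encoder_vocab_size=137, decoder_vocab_size=822)

print(cfg.is_gated_act, cfg.dense_act_fn)             # True gelu_new
print(cfg.hidden_size, cfg.num_attention_heads)       # 256 3
print(cfg.num_hidden_layers, cfg.num_decoder_layers)  # 4 4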
modeling/docstrings.py ADDED
@@ -0,0 +1,217 @@
PARALLELIZE_DOCSTRING = r"""
    This is an experimental feature and is subject to change at a moment's notice.

    Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
    it will evenly distribute blocks across all devices.

    Args:
        device_map (`Dict[int, list]`, optional, defaults to None):
            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
            automatically mapped to the first device (for esoteric reasons). That means that the first device should
            have fewer attention modules mapped to it than other devices. For reference, the mt5 models have the
            following number of attention modules:

                - mt5-small: 6
                - mt5-base: 12
                - mt5-large: 24
                - mt5-xl: 24
                - mt5-xxl: 24

    Example:

    ```python
    # Here is an example of a device map on a machine with 4 GPUs using mt5-xl, which has a total of 24 attention modules:
    model = MT5ForConditionalGeneration.from_pretrained("mt5-xl")
    device_map = {
        0: [0, 1, 2],
        1: [3, 4, 5, 6, 7, 8, 9],
        2: [10, 11, 12, 13, 14, 15, 16],
        3: [17, 18, 19, 20, 21, 22, 23],
    }
    model.parallelize(device_map)
    ```
"""
DEPARALLELIZE_DOCSTRING = r"""
    Moves the model to cpu from a model parallel state.

    Example:

    ```python
    # On a 4 GPU machine with mt5-xl:
    model = MT5ForConditionalGeneration.from_pretrained("mt5-xl")
    device_map = {
        0: [0, 1, 2],
        1: [3, 4, 5, 6, 7, 8, 9],
        2: [10, 11, 12, 13, 14, 15, 16],
        3: [17, 18, 19, 20, 21, 22, 23],
    }
    model.parallelize(device_map)  # Splits the model across several devices
    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
    ```
"""

__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
"""

MT5_START_DOCSTRING = r"""

    The MT5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text
    Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan
    Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a
    text-to-text denoising generative setting.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`MT5Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

MT5_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. MT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look at [MT5 Training](./mt5#training).
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            MT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [MT5
            Training](./mt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
            Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at
            the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
            input (see `past_key_values`). This is useful if you want more control over how to convert
            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.

            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
            of `inputs_embeds`.

        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

MT5_ENCODER_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. MT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            To know more on how to prepare `input_ids` for pretraining take a look at [MT5 Training](./mt5#training).
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
"""
modeling/model.py ADDED
@@ -0,0 +1,612 @@
import copy
import math
import warnings
from typing import List, Optional, Tuple, Union

from transformers import MT5PreTrainedModel
from transformers.models.mt5 import MT5Stack
from transformers.modeling_outputs import Seq2SeqModelOutput, Seq2SeqLMOutput, BaseModelOutput
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)

from transformers.utils.model_parallel_utils import assert_device_map, get_device_map

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from .config import MT5Config
from .docstrings import (
    PARALLELIZE_DOCSTRING,
    DEPARALLELIZE_DOCSTRING,
    __HEAD_MASK_WARNING_MSG,
    MT5_START_DOCSTRING,
    MT5_INPUTS_DOCSTRING,
)


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "MT5Config"
_CHECKPOINT_FOR_DOC = "mt5-small"


class MT5Model(MT5PreTrainedModel):
    r"""
    Examples:

    ```python
    >>> from transformers import MT5Model, AutoTokenizer

    >>> model = MT5Model.from_pretrained("google/mt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, return_tensors="pt")
    >>> labels = tokenizer(text_target=summary, return_tensors="pt")

    >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
    >>> hidden_states = outputs.last_hidden_state
    ```"""

    model_type = "mt5"
    config_class = MT5Config
    _keys_to_ignore_on_load_missing = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
    _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    # Copied from transformers.models.t5.modeling_t5.T5Model.__init__ with T5->MT5
    def __init__(self, config: MT5Config):
        super().__init__(config)
        self.encoder_embedding = nn.Embedding(config.encoder_vocab_size, config.d_model)
        if config.shared_embedding:
            # Reuse the encoder embedding for the decoder. The attribute name keeps the spelling
            # used by the released checkpoints so the state dict keys stay compatible.
            self.decoder_emebedding = self.encoder_embedding
        else:
            self.decoder_emebedding = nn.Embedding(config.decoder_vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = MT5Stack(encoder_config, self.encoder_embedding)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = MT5Stack(decoder_config, self.decoder_emebedding)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    # Copied from transformers.models.t5.modeling_t5.T5Model.parallelize
    def parallelize(self, device_map=None):
        warnings.warn(
            "`T5Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model"
            " with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'encoder.block.0':"
            " 0, 'encoder.block.1': 1, ...}",
            FutureWarning,
        )
        self.device_map = (
            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.decoder.parallelize(self.device_map)
        self.model_parallel = True

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    # Copied from transformers.models.t5.modeling_t5.T5Model.deparallelize
    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.encoder.deparallelize()
        self.decoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        self.decoder = self.decoder.to("cpu")
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()

    # Copied from transformers.models.t5.modeling_t5.T5Model.get_input_embeddings
    def get_input_embeddings(self):
        return self.encoder_embedding

    # Copied from transformers.models.t5.modeling_t5.T5Model.set_input_embeddings
    def set_input_embeddings(self, new_embeddings):
        self.encoder_embedding = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    # Copied from transformers.models.t5.modeling_t5.T5Model.get_encoder
    def get_encoder(self):
        return self.encoder

    # Copied from transformers.models.t5.modeling_t5.T5Model.get_decoder
    def get_decoder(self):
        return self.decoder

    # Copied from transformers.models.t5.modeling_t5.T5Model._prune_heads
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
    # Copied from transformers.models.t5.modeling_t5.T5Model.forward with T5->MT5, t5->mt5
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("mt5-small")
        >>> model = MT5Model.from_pretrained("mt5-small")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for MT5Model.
        >>> # This is not needed for torch's MT5ForConditionalGeneration as it does this internally using labels arg.
        >>> decoder_input_ids = model._shift_right(decoder_input_ids)

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@add_start_docstrings("""MT5 Model with a `language modeling` head on top.""", MT5_START_DOCSTRING)
class MT5ForConditionalGeneration(MT5PreTrainedModel):
    r"""
    Examples:

    ```python
    >>> from transformers import MT5ForConditionalGeneration, AutoTokenizer

    >>> model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
    ```"""

    model_type = "mt5"
    config_class = MT5Config
    _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5
    def __init__(self, config: MT5Config):
        super().__init__(config)
        self.model_dim = config.d_model

        self.encoder_embedding = nn.Embedding(config.encoder_vocab_size, config.d_model)
        if config.shared_embedding:
            # Reuse the encoder embedding for the decoder. The attribute name keeps the spelling
            # used by the released checkpoints so the state dict keys stay compatible.
            self.decoder_emebedding = self.encoder_embedding
        else:
            self.decoder_emebedding = nn.Embedding(config.decoder_vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = MT5Stack(encoder_config, self.encoder_embedding)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = MT5Stack(decoder_config, self.decoder_emebedding)

        self.lm_head = nn.Linear(config.d_model, config.decoder_vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.parallelize
    def parallelize(self, device_map=None):
        warnings.warn(
            "`T5ForConditionalGeneration.parallelize` is deprecated and will be removed in v5 of Transformers, you"
            " should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also"
            " provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance"
            " {'encoder.block.0': 0, 'encoder.block.1': 1, ...}",
            FutureWarning,
        )
        self.device_map = (
            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.decoder.parallelize(self.device_map)
        self.lm_head = self.lm_head.to(self.decoder.first_device)
        self.model_parallel = True

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.deparallelize
    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.encoder.deparallelize()
        self.decoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        self.decoder = self.decoder.to("cpu")
        self.lm_head = self.lm_head.to("cpu")
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_input_embeddings
    def get_input_embeddings(self):
        return self.encoder_embedding

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_input_embeddings
    def set_input_embeddings(self, new_embeddings):
        self.encoder_embedding = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_output_embeddings
    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_output_embeddings
    def get_output_embeddings(self):
        return self.lm_head

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_encoder
    def get_encoder(self):
        return self.encoder

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_decoder
    def get_decoder(self):
        return self.decoder

    @add_start_docstrings_to_model_forward(MT5_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward with T5->MT5, t5->mt5
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("mt5-small")
        >>> model = MT5ForConditionalGeneration.from_pretrained("mt5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer(
        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        >>> # studies have shown that owning a dog is good for you.
        ```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            # Convert encoder inputs in embeddings if needed
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.encoder.first_device)
            self.lm_head = self.lm_head.to(self.encoder.first_device)
            sequence_output = sequence_output.to(self.lm_head.weight.device)

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            # move labels to correct device to enable PP
            labels = labels.to(lm_logits.device)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
            # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation
    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        decoder_attention_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        # cut decoder_input_ids if past_key_values is used
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        return {
            "decoder_input_ids": input_ids,
            "past_key_values": past_key_values,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "decoder_attention_mask": decoder_attention_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_decoder_input_ids_from_labels
    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration._reorder_cache
    def _reorder_cache(self, past_key_values, beam_idx):
        # if decoder past is not included in output
        # speedy decoding is disabled and no need to reorder
        if past_key_values is None:
            logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
            return past_key_values

        reordered_decoder_past = ()
        for layer_past_states in past_key_values:
            # get the correct batch idx from layer past batch dim
            # batch dim of `past` is at 2nd position
            reordered_layer_past_states = ()
            for layer_past_state in layer_past_states:
                # need to set correct `past` for each of the four key / value states
                reordered_layer_past_states = reordered_layer_past_states + (
                    layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
                )

            if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
                raise ValueError(
                    f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
                )
            if len(reordered_layer_past_states) != len(layer_past_states):
                raise ValueError(
                    f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of layer_past_states {len(layer_past_states)} mismatched"
                )

            reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
        return reordered_decoder_past
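A minimal smoke-test sketch for the class above (an assumption, not part of the commit; it presupposes that `MT5Stack` is importable from the installed transformers version exactly as done in model.py): build a randomly initialised model with the asymmetric encoder/decoder vocabularies from models/SMILES2IUPAC/config.json and run one forward pass with labels.

import torch
from modeling import MT5Config, MT5ForConditionalGeneration

# Vocabulary sizes mirror models/SMILES2IUPAC/config.json (137 SMILES tokens in, 822 IUPAC tokens out)
config = MT5Config(encoder_vocab_size=137, decoder_vocab_size=822)
model = MT5ForConditionalGeneration(config)

input_ids = torch.randint(0, config.encoder_vocab_size, (1, 16))
labels = torch.randint(0, config.decoder_vocab_size, (1, 8))

out = model(input_ids=input_ids, labels=labels)
print(out.logits.shape)  # torch.Size([1, 8, 822]) - projected onto the decoder vocabulary, not the encoder one
print(float(out.loss))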
models/IUPAC2SMILES/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "architectures": [
+     "MT5ForConditionalGeneration"
+   ],
+   "classifier_dropout": 0.0,
+   "d_ff": 512,
+   "d_kv": 64,
+   "d_model": 256,
+   "decoder_start_token_id": 2,
+   "decoder_vocab_size": 137,
+   "dense_act_fn": "gelu_new",
+   "dropout_rate": 0.1,
+   "encoder_vocab_size": 822,
+   "eos_token_id": 1,
+   "feed_forward_proj": "gated-gelu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "is_gated_act": true,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "mt5",
+   "num_decoder_layers": 4,
+   "num_heads": 3,
+   "num_layers": 4,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "shared_embedding": false,
+   "tie_word_embeddings": false,
+   "tokenizer_class": "T5Tokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.37.1",
+   "use_cache": true
+ }
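For orientation: this IUPAC2SMILES config declares 822 encoder tokens (IUPAC names in) and 137 decoder tokens (SMILES out); the SMILES2IUPAC config further down mirrors the two numbers. A small, assumption-marked sketch for reading those fields back once the repository is checked out (the models/IUPAC2SMILES path is taken from the layout above; transformers keeps unknown config keys such as encoder_vocab_size as plain attributes):

from transformers import AutoConfig

# Local path assumed from the repository layout; adjust if loading from the Hub.
cfg = AutoConfig.from_pretrained("models/IUPAC2SMILES")
print(cfg.model_type)                                  # mt5
print(cfg.encoder_vocab_size, cfg.decoder_vocab_size)  # 822 137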
models/IUPAC2SMILES/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "decoder_start_token_id": 2,
+   "eos_token_id": 1,
+   "pad_token_id": 0,
+   "transformers_version": "4.37.1"
+ }
models/IUPAC2SMILES/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d1f38994ec986388a2f099652139d6a05b5981fb57bdf62361d4614f84ca07ed
+ size 23177168
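This is a Git LFS pointer rather than the weights themselves; the real model.safetensors is identified by the SHA-256 and byte size recorded above. After fetching the blob (for example with git lfs pull), a quick integrity check is straightforward. A sketch, assuming the file lands at the path shown in this diff:

import hashlib
import os

path = "models/IUPAC2SMILES/model.safetensors"  # assumed local path after fetching the LFS blob

sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

print(os.path.getsize(path) == 23177168)
print(sha256.hexdigest() == "d1f38994ec986388a2f099652139d6a05b5981fb57bdf62361d4614f84ca07ed")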
models/SMILES2IUPAC/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "architectures": [
+     "MT5ForConditionalGeneration"
+   ],
+   "classifier_dropout": 0.0,
+   "d_ff": 512,
+   "d_kv": 64,
+   "d_model": 256,
+   "decoder_start_token_id": 2,
+   "decoder_vocab_size": 822,
+   "dense_act_fn": "gelu_new",
+   "dropout_rate": 0.1,
+   "encoder_vocab_size": 137,
+   "eos_token_id": 1,
+   "feed_forward_proj": "gated-gelu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "is_gated_act": true,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "mt5",
+   "num_decoder_layers": 4,
+   "num_heads": 3,
+   "num_layers": 4,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "shared_embedding": false,
+   "tie_word_embeddings": false,
+   "tokenizer_class": "T5Tokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.37.1",
+   "use_cache": true
+ }
models/SMILES2IUPAC/generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "decoder_start_token_id": 2,
+   "eos_token_id": 1,
+   "pad_token_id": 0,
+   "transformers_version": "4.37.1"
+ }
models/SMILES2IUPAC/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4307a50d6b192a06bb81552d7cd6bcf6ac7ea6bb21d72ca4755e28d7d28655d2
+ size 23878608
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ transformers
+ torch
+ rdkit
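The dependencies are unpinned, so any recent releases should resolve; the configs above were produced with transformers 4.37.1. A purely illustrative smoke test that the three packages import:

import importlib

for pkg in ("transformers", "torch", "rdkit"):
    module = importlib.import_module(pkg)
    print(pkg, getattr(module, "__version__", "unknown"))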
test.py ADDED
@@ -0,0 +1,12 @@
+ from gradio_client import Client
+
+ client = Client("https://knowledgator-chemicalconverters.hf.space/--replicas/ucig0/")
+ result = client.predict(
+     "CCO",  # str in 'Enter your chemical name' Textbox component
+     "SMILES2IUPAC",  # Literal['SMILES2IUPAC', 'IUPAC2SMILES', 'IUPAC style prediction'] in 'Choose method to convert chemical names' Radio component
+     "BASE",  # Literal['BASE', 'SYSTEMATIC', 'TRADITIONAL'] in 'If SMILES to IUPAC, choose desired IUPAC style' Radio component
+     True,  # bool in 'Validate with molecular similarity' Checkbox component
+     True,  # bool in 'Plot molecule' Checkbox component
+     api_name="/predict"
+ )
+ print(result)
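The hard-coded --replicas URL above is tied to a specific deployment and tends to go stale; gradio_client also accepts a Space id, which is usually the more durable handle. A hedged variant (the Space id is inferred from the URL, and the shape of result depends on the Space's output components, so both are assumptions):

from gradio_client import Client

# Space id inferred from the replica URL; replace it if the Space is renamed.
client = Client("knowledgator/chemicalconverters")
result = client.predict("CCO", "SMILES2IUPAC", "BASE", True, True, api_name="/predict")
print(result)  # typically one entry per output component of the interface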
utils/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .main_model import ChemicalConverter
+ from .rdkit_utils import validate_smiles2iupac, plot_mol
utils/main_model.py ADDED
@@ -0,0 +1,47 @@
+ from modeling import MT5ForConditionalGeneration
+ from transformers import AutoTokenizer
+ import os
+
+
+ class ChemicalConverter:
+     def __init__(self, mode: str):
+         self.mode = mode
+         model_directory = os.path.abspath("models")
+         model_path = os.path.join(model_directory, mode)
+         if not os.path.exists(model_path):
+             raise ValueError(f"Model path does not exist: {model_path}")
+         self.model = MT5ForConditionalGeneration.from_pretrained(model_path)
+         self.smiles_tokenizer = AutoTokenizer.from_pretrained("BioMike/smiles")
+         self.iupac_tokenizer = AutoTokenizer.from_pretrained("BioMike/iupac")
+         self.smiles_max_len = 128
+         self.iupac_max_len = 156
+
+     def convert(self, input):
+         if self.mode == "SMILES2IUPAC":
+             tokenizer = self.smiles_tokenizer
+             reverse_tokenizer = self.iupac_tokenizer
+             max_length = self.smiles_max_len
+         else:
+             tokenizer = self.iupac_tokenizer
+             reverse_tokenizer = self.smiles_tokenizer
+             max_length = self.iupac_max_len
+
+         encoding = tokenizer(input,
+                              return_tensors='pt',
+                              padding="max_length",
+                              truncation=True,
+                              max_length=max_length)
+         # Move the input tensors to the model's device (GPU if available, otherwise CPU)
+         encoding = {key: value.to(self.model.device) for key, value in encoding.items()}
+
+         # Generate names
+         output = self.model.generate(input_ids=encoding['input_ids'],
+                                      attention_mask=encoding['attention_mask'],
+                                      max_new_tokens=156,
+                                      num_beams=1,
+                                      num_return_sequences=1)
+
+         # Decode names
+         output = [reverse_tokenizer.decode(ids, skip_special_tokens=True) for ids in output]
+
+         return output[0]
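A minimal usage sketch for ChemicalConverter, assumed to run from the repository root so that the relative models/ directory and the local modeling module resolve. The [6:] slice mirrors what utils/rdkit_utils.py does with the converter's raw output; the exact SMILES returned depends on the model.

from utils import ChemicalConverter

# IUPAC name in, SMILES out; the leading six characters of the raw output are
# stripped, as utils/rdkit_utils.py does before parsing the result with RDKit.
converter = ChemicalConverter(mode="IUPAC2SMILES")
raw = converter.convert("ethanol")
print(raw[6:])  # expected to be a SMILES string such as CCO (model-dependent)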
utils/rdkit_utils.py ADDED
@@ -0,0 +1,39 @@
+ from rdkit import DataStructs, Chem
+ from rdkit.Chem import AllChem
+ from rdkit.Chem import Draw
+ from PIL import Image
+ import io
+ from .main_model import ChemicalConverter
+
+ def validate_smiles2iupac(input_smiles, predicted_iupac):
+     converter = ChemicalConverter(mode="IUPAC2SMILES")
+     predicted_smiles = converter.convert(predicted_iupac)
+
+     ms = [Chem.MolFromSmiles(input_smiles), Chem.MolFromSmiles(predicted_smiles[6:])]
+
+     if None in ms:
+         return None
+
+     fpgen = AllChem.GetRDKitFPGenerator()
+     fps = [fpgen.GetFingerprint(x) for x in ms]
+
+     return DataStructs.TanimotoSimilarity(fps[0], fps[1])
+
+ def plot_mol(smiles):
+     # Convert the SMILES string to an RDKit molecule object
+     mol = Chem.MolFromSmiles(smiles)
+
+     # Draw the molecule with RDKit at its original intended size
+     img = Draw.MolToImage(mol, size=(185, 185))
+
+     # Create a new, blank image at the final size (890x185 pixels) with a white background
+     final_img = Image.new('RGB', (890, 185), 'white')
+
+     # Calculate the position to paste the original image onto the blank image to keep it centered
+     left = (890 - 185) // 2
+     top = (185 - 185) // 2  # zero in this case, but included for clarity
+
+     # Paste the original image onto the blank image
+     final_img.paste(img, (left, top))
+
+     return final_img
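To make the validation path concrete: validate_smiles2iupac converts the predicted IUPAC name back to SMILES, fingerprints both molecules, and returns their Tanimoto similarity (None if either SMILES fails to parse). The round trip needs the models, but the similarity step can be illustrated with RDKit alone; the molecules below are illustrative.

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

# Two SMILES spellings of ethanol, plus methanol for contrast.
mols = [Chem.MolFromSmiles(s) for s in ("CCO", "OCC", "CO")]

fpgen = AllChem.GetRDKitFPGenerator()
fps = [fpgen.GetFingerprint(m) for m in mols]

print(DataStructs.TanimotoSimilarity(fps[0], fps[1]))  # 1.0, same molecule
print(DataStructs.TanimotoSimilarity(fps[0], fps[2]))  # below 1.0, related but different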