xiaotianhan committed on
Commit 779abe8
1 Parent(s): b706adf

Upload folder using huggingface_hub

LICENSE ADDED
@@ -0,0 +1,352 @@
1
+ Creative Commons Attribution-NonCommercial 4.0 International
2
+
3
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
4
+ does not provide legal services or legal advice. Distribution of
5
+ Creative Commons public licenses does not create a lawyer-client or
6
+ other relationship. Creative Commons makes its licenses and related
7
+ information available on an "as-is" basis. Creative Commons gives no
8
+ warranties regarding its licenses, any material licensed under their
9
+ terms and conditions, or any related information. Creative Commons
10
+ disclaims all liability for damages resulting from their use to the
11
+ fullest extent possible.
12
+
13
+ Using Creative Commons Public Licenses
14
+
15
+ Creative Commons public licenses provide a standard set of terms and
16
+ conditions that creators and other rights holders may use to share
17
+ original works of authorship and other material subject to copyright and
18
+ certain other rights specified in the public license below. The
19
+ following considerations are for informational purposes only, are not
20
+ exhaustive, and do not form part of our licenses.
21
+
22
+ - Considerations for licensors: Our public licenses are intended for
23
+ use by those authorized to give the public permission to use
24
+ material in ways otherwise restricted by copyright and certain other
25
+ rights. Our licenses are irrevocable. Licensors should read and
26
+ understand the terms and conditions of the license they choose
27
+ before applying it. Licensors should also secure all rights
28
+ necessary before applying our licenses so that the public can reuse
29
+ the material as expected. Licensors should clearly mark any material
30
+ not subject to the license. This includes other CC-licensed
31
+ material, or material used under an exception or limitation to
32
+ copyright. More considerations for licensors :
33
+ wiki.creativecommons.org/Considerations_for_licensors
34
+
35
+ - Considerations for the public: By using one of our public licenses,
36
+ a licensor grants the public permission to use the licensed material
37
+ under specified terms and conditions. If the licensor's permission
38
+ is not necessary for any reason–for example, because of any
39
+ applicable exception or limitation to copyright–then that use is not
40
+ regulated by the license. Our licenses grant only permissions under
41
+ copyright and certain other rights that a licensor has authority to
42
+ grant. Use of the licensed material may still be restricted for
43
+ other reasons, including because others have copyright or other
44
+ rights in the material. A licensor may make special requests, such
45
+ as asking that all changes be marked or described. Although not
46
+ required by our licenses, you are encouraged to respect those
47
+ requests where reasonable. More considerations for the public :
48
+ wiki.creativecommons.org/Considerations_for_licensees
49
+
50
+ Creative Commons Attribution-NonCommercial 4.0 International Public
51
+ License
52
+
53
+ By exercising the Licensed Rights (defined below), You accept and agree
54
+ to be bound by the terms and conditions of this Creative Commons
55
+ Attribution-NonCommercial 4.0 International Public License ("Public
56
+ License"). To the extent this Public License may be interpreted as a
57
+ contract, You are granted the Licensed Rights in consideration of Your
58
+ acceptance of these terms and conditions, and the Licensor grants You
59
+ such rights in consideration of benefits the Licensor receives from
60
+ making the Licensed Material available under these terms and conditions.
61
+
62
+ - Section 1 – Definitions.
63
+
64
+ - a. Adapted Material means material subject to Copyright and
65
+ Similar Rights that is derived from or based upon the Licensed
66
+ Material and in which the Licensed Material is translated,
67
+ altered, arranged, transformed, or otherwise modified in a
68
+ manner requiring permission under the Copyright and Similar
69
+ Rights held by the Licensor. For purposes of this Public
70
+ License, where the Licensed Material is a musical work,
71
+ performance, or sound recording, Adapted Material is always
72
+ produced where the Licensed Material is synched in timed
73
+ relation with a moving image.
74
+ - b. Adapter's License means the license You apply to Your
75
+ Copyright and Similar Rights in Your contributions to Adapted
76
+ Material in accordance with the terms and conditions of this
77
+ Public License.
78
+ - c. Copyright and Similar Rights means copyright and/or similar
79
+ rights closely related to copyright including, without
80
+ limitation, performance, broadcast, sound recording, and Sui
81
+ Generis Database Rights, without regard to how the rights are
82
+ labeled or categorized. For purposes of this Public License, the
83
+ rights specified in Section 2(b)(1)-(2) are not Copyright and
84
+ Similar Rights.
85
+ - d. Effective Technological Measures means those measures that,
86
+ in the absence of proper authority, may not be circumvented
87
+ under laws fulfilling obligations under Article 11 of the WIPO
88
+ Copyright Treaty adopted on December 20, 1996, and/or similar
89
+ international agreements.
90
+ - e. Exceptions and Limitations means fair use, fair dealing,
91
+ and/or any other exception or limitation to Copyright and
92
+ Similar Rights that applies to Your use of the Licensed
93
+ Material.
94
+ - f. Licensed Material means the artistic or literary work,
95
+ database, or other material to which the Licensor applied this
96
+ Public License.
97
+ - g. Licensed Rights means the rights granted to You subject to
98
+ the terms and conditions of this Public License, which are
99
+ limited to all Copyright and Similar Rights that apply to Your
100
+ use of the Licensed Material and that the Licensor has authority
101
+ to license.
102
+ - h. Licensor means the individual(s) or entity(ies) granting
103
+ rights under this Public License.
104
+ - i. NonCommercial means not primarily intended for or directed
105
+ towards commercial advantage or monetary compensation. For
106
+ purposes of this Public License, the exchange of the Licensed
107
+ Material for other material subject to Copyright and Similar
108
+ Rights by digital file-sharing or similar means is NonCommercial
109
+ provided there is no payment of monetary compensation in
110
+ connection with the exchange.
111
+ - j. Share means to provide material to the public by any means or
112
+ process that requires permission under the Licensed Rights, such
113
+ as reproduction, public display, public performance,
114
+ distribution, dissemination, communication, or importation, and
115
+ to make material available to the public including in ways that
116
+ members of the public may access the material from a place and
117
+ at a time individually chosen by them.
118
+ - k. Sui Generis Database Rights means rights other than copyright
119
+ resulting from Directive 96/9/EC of the European Parliament and
120
+ of the Council of 11 March 1996 on the legal protection of
121
+ databases, as amended and/or succeeded, as well as other
122
+ essentially equivalent rights anywhere in the world.
123
+ - l. You means the individual or entity exercising the Licensed
124
+ Rights under this Public License. Your has a corresponding
125
+ meaning.
126
+
127
+ - Section 2 – Scope.
128
+
129
+ - a. License grant.
130
+ - 1. Subject to the terms and conditions of this Public
131
+ License, the Licensor hereby grants You a worldwide,
132
+ royalty-free, non-sublicensable, non-exclusive, irrevocable
133
+ license to exercise the Licensed Rights in the Licensed
134
+ Material to:
135
+ - A. reproduce and Share the Licensed Material, in whole
136
+ or in part, for NonCommercial purposes only; and
137
+ - B. produce, reproduce, and Share Adapted Material for
138
+ NonCommercial purposes only.
139
+ - 2. Exceptions and Limitations. For the avoidance of doubt,
140
+ where Exceptions and Limitations apply to Your use, this
141
+ Public License does not apply, and You do not need to comply
142
+ with its terms and conditions.
143
+ - 3. Term. The term of this Public License is specified in
144
+ Section 6(a).
145
+ - 4. Media and formats; technical modifications allowed. The
146
+ Licensor authorizes You to exercise the Licensed Rights in
147
+ all media and formats whether now known or hereafter
148
+ created, and to make technical modifications necessary to do
149
+ so. The Licensor waives and/or agrees not to assert any
150
+ right or authority to forbid You from making technical
151
+ modifications necessary to exercise the Licensed Rights,
152
+ including technical modifications necessary to circumvent
153
+ Effective Technological Measures. For purposes of this
154
+ Public License, simply making modifications authorized by
155
+ this Section 2(a)(4) never produces Adapted Material.
156
+ - 5. Downstream recipients.
157
+ - A. Offer from the Licensor – Licensed Material. Every
158
+ recipient of the Licensed Material automatically
159
+ receives an offer from the Licensor to exercise the
160
+ Licensed Rights under the terms and conditions of this
161
+ Public License.
162
+ - B. No downstream restrictions. You may not offer or
163
+ impose any additional or different terms or conditions
164
+ on, or apply any Effective Technological Measures to,
165
+ the Licensed Material if doing so restricts exercise of
166
+ the Licensed Rights by any recipient of the Licensed
167
+ Material.
168
+ - 6. No endorsement. Nothing in this Public License
169
+ constitutes or may be construed as permission to assert or
170
+ imply that You are, or that Your use of the Licensed
171
+ Material is, connected with, or sponsored, endorsed, or
172
+ granted official status by, the Licensor or others
173
+ designated to receive attribution as provided in Section
174
+ 3(a)(1)(A)(i).
175
+ - b. Other rights.
176
+ - 1. Moral rights, such as the right of integrity, are not
177
+ licensed under this Public License, nor are publicity,
178
+ privacy, and/or other similar personality rights; however,
179
+ to the extent possible, the Licensor waives and/or agrees
180
+ not to assert any such rights held by the Licensor to the
181
+ limited extent necessary to allow You to exercise the
182
+ Licensed Rights, but not otherwise.
183
+ - 2. Patent and trademark rights are not licensed under this
184
+ Public License.
185
+ - 3. To the extent possible, the Licensor waives any right to
186
+ collect royalties from You for the exercise of the Licensed
187
+ Rights, whether directly or through a collecting society
188
+ under any voluntary or waivable statutory or compulsory
189
+ licensing scheme. In all other cases the Licensor expressly
190
+ reserves any right to collect such royalties, including when
191
+ the Licensed Material is used other than for NonCommercial
192
+ purposes.
193
+
194
+ - Section 3 – License Conditions.
195
+
196
+ Your exercise of the Licensed Rights is expressly made subject to
197
+ the following conditions.
198
+
199
+ - a. Attribution.
200
+ - 1. If You Share the Licensed Material (including in modified
201
+ form), You must:
202
+ - A. retain the following if it is supplied by the
203
+ Licensor with the Licensed Material:
204
+ - i. identification of the creator(s) of the Licensed
205
+ Material and any others designated to receive
206
+ attribution, in any reasonable manner requested by
207
+ the Licensor (including by pseudonym if designated);
208
+ - ii. a copyright notice;
209
+ - iii. a notice that refers to this Public License;
210
+ - iv. a notice that refers to the disclaimer of
211
+ warranties;
212
+ - v. a URI or hyperlink to the Licensed Material to
213
+ the extent reasonably practicable;
214
+ - B. indicate if You modified the Licensed Material and
215
+ retain an indication of any previous modifications; and
216
+ - C. indicate the Licensed Material is licensed under this
217
+ Public License, and include the text of, or the URI or
218
+ hyperlink to, this Public License.
219
+ - 2. You may satisfy the conditions in Section 3(a)(1) in any
220
+ reasonable manner based on the medium, means, and context in
221
+ which You Share the Licensed Material. For example, it may
222
+ be reasonable to satisfy the conditions by providing a URI
223
+ or hyperlink to a resource that includes the required
224
+ information.
225
+ - 3. If requested by the Licensor, You must remove any of the
226
+ information required by Section 3(a)(1)(A) to the extent
227
+ reasonably practicable.
228
+ - 4. If You Share Adapted Material You produce, the Adapter's
229
+ License You apply must not prevent recipients of the Adapted
230
+ Material from complying with this Public License.
231
+
232
+ - Section 4 – Sui Generis Database Rights.
233
+
234
+ Where the Licensed Rights include Sui Generis Database Rights that
235
+ apply to Your use of the Licensed Material:
236
+
237
+ - a. for the avoidance of doubt, Section 2(a)(1) grants You the
238
+ right to extract, reuse, reproduce, and Share all or a
239
+ substantial portion of the contents of the database for
240
+ NonCommercial purposes only;
241
+ - b. if You include all or a substantial portion of the database
242
+ contents in a database in which You have Sui Generis Database
243
+ Rights, then the database in which You have Sui Generis Database
244
+ Rights (but not its individual contents) is Adapted Material;
245
+ and
246
+ - c. You must comply with the conditions in Section 3(a) if You
247
+ Share all or a substantial portion of the contents of the
248
+ database.
249
+
250
+ For the avoidance of doubt, this Section 4 supplements and does not
251
+ replace Your obligations under this Public License where the
252
+ Licensed Rights include other Copyright and Similar Rights.
253
+
254
+ - Section 5 – Disclaimer of Warranties and Limitation of Liability.
255
+
256
+ - a. Unless otherwise separately undertaken by the Licensor, to
257
+ the extent possible, the Licensor offers the Licensed Material
258
+ as-is and as-available, and makes no representations or
259
+ warranties of any kind concerning the Licensed Material, whether
260
+ express, implied, statutory, or other. This includes, without
261
+ limitation, warranties of title, merchantability, fitness for a
262
+ particular purpose, non-infringement, absence of latent or other
263
+ defects, accuracy, or the presence or absence of errors, whether
264
+ or not known or discoverable. Where disclaimers of warranties
265
+ are not allowed in full or in part, this disclaimer may not
266
+ apply to You.
267
+ - b. To the extent possible, in no event will the Licensor be
268
+ liable to You on any legal theory (including, without
269
+ limitation, negligence) or otherwise for any direct, special,
270
+ indirect, incidental, consequential, punitive, exemplary, or
271
+ other losses, costs, expenses, or damages arising out of this
272
+ Public License or use of the Licensed Material, even if the
273
+ Licensor has been advised of the possibility of such losses,
274
+ costs, expenses, or damages. Where a limitation of liability is
275
+ not allowed in full or in part, this limitation may not apply to
276
+ You.
277
+ - c. The disclaimer of warranties and limitation of liability
278
+ provided above shall be interpreted in a manner that, to the
279
+ extent possible, most closely approximates an absolute
280
+ disclaimer and waiver of all liability.
281
+
282
+ - Section 6 – Term and Termination.
283
+
284
+ - a. This Public License applies for the term of the Copyright and
285
+ Similar Rights licensed here. However, if You fail to comply
286
+ with this Public License, then Your rights under this Public
287
+ License terminate automatically.
288
+ - b. Where Your right to use the Licensed Material has terminated
289
+ under Section 6(a), it reinstates:
290
+
291
+ - 1. automatically as of the date the violation is cured,
292
+ provided it is cured within 30 days of Your discovery of the
293
+ violation; or
294
+ - 2. upon express reinstatement by the Licensor.
295
+
296
+ For the avoidance of doubt, this Section 6(b) does not affect
297
+ any right the Licensor may have to seek remedies for Your
298
+ violations of this Public License.
299
+
300
+ - c. For the avoidance of doubt, the Licensor may also offer the
301
+ Licensed Material under separate terms or conditions or stop
302
+ distributing the Licensed Material at any time; however, doing
303
+ so will not terminate this Public License.
304
+ - d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
305
+ License.
306
+
307
+ - Section 7 – Other Terms and Conditions.
308
+
309
+ - a. The Licensor shall not be bound by any additional or
310
+ different terms or conditions communicated by You unless
311
+ expressly agreed.
312
+ - b. Any arrangements, understandings, or agreements regarding the
313
+ Licensed Material not stated herein are separate from and
314
+ independent of the terms and conditions of this Public License.
315
+
316
+ - Section 8 – Interpretation.
317
+
318
+ - a. For the avoidance of doubt, this Public License does not, and
319
+ shall not be interpreted to, reduce, limit, restrict, or impose
320
+ conditions on any use of the Licensed Material that could
321
+ lawfully be made without permission under this Public License.
322
+ - b. To the extent possible, if any provision of this Public
323
+ License is deemed unenforceable, it shall be automatically
324
+ reformed to the minimum extent necessary to make it enforceable.
325
+ If the provision cannot be reformed, it shall be severed from
326
+ this Public License without affecting the enforceability of the
327
+ remaining terms and conditions.
328
+ - c. No term or condition of this Public License will be waived
329
+ and no failure to comply consented to unless expressly agreed to
330
+ by the Licensor.
331
+ - d. Nothing in this Public License constitutes or may be
332
+ interpreted as a limitation upon, or waiver of, any privileges
333
+ and immunities that apply to the Licensor or You, including from
334
+ the legal processes of any jurisdiction or authority.
335
+
336
+ Creative Commons is not a party to its public licenses. Notwithstanding,
337
+ Creative Commons may elect to apply one of its public licenses to
338
+ material it publishes and in those instances will be considered the
339
+ "Licensor." The text of the Creative Commons public licenses is
340
+ dedicated to the public domain under the CC0 Public Domain Dedication.
341
+ Except for the limited purpose of indicating that material is shared
342
+ under a Creative Commons public license or as otherwise permitted by the
343
+ Creative Commons policies published at creativecommons.org/policies,
344
+ Creative Commons does not authorize the use of the trademark "Creative
345
+ Commons" or any other trademark or logo of Creative Commons without its
346
+ prior written consent including, without limitation, in connection with
347
+ any unauthorized modifications to any of its public licenses or any
348
+ other arrangements, understandings, or agreements concerning use of
349
+ licensed material. For the avoidance of doubt, this paragraph does not
350
+ form part of the public licenses.
351
+
352
+ Creative Commons may be contacted at creativecommons.org.
README.md ADDED
@@ -0,0 +1,260 @@
1
+ ---
2
+ language: en
3
+ tags:
4
+ - multimodal
5
+ - text
6
+ - image
7
+ - image-to-text
8
+ license: cc-by-nc-4.0
9
+ datasets:
10
+ - HuggingFaceM4/OBELICS
11
+ - laion/laion2B-en
12
+ - coyo-700m
13
+ - mmc4
14
+ pipeline_tag: text-generation
15
+ inference: true
16
+ ---
17
+
18
+ <h1 align="center">
19
+ <br>
20
+ <img src="assets/infimm-logo.webp" alt="InfiMM" width="200">
21
+ </h1>
22
+
23
+ # InfiMM
24
+
25
+ InfiMM, inspired by the Flamingo architecture, sets itself apart with unique training data and diverse large language models (LLMs). This approach allows InfiMM to maintain the core strengths of Flamingo while offering enhanced capabilities. As the premier open-sourced variant in this domain, InfiMM excels in accessibility and adaptability, driven by community collaboration. It's more than an emulation of Flamingo; it's an innovation in visual language processing.
26
+
27
+ Our model is another attempt to reproduce the results reported in DeepMind's paper "Flamingo: a Visual Language Model for Few-Shot Learning".
28
+ Compared with previous open-source attempts ([OpenFlamingo](https://github.com/mlfoundations/open_flamingo) and [IDEFICS](https://huggingface.co/blog/idefics)), InfiMM offers more flexible models, allowing for a wide range of applications.
29
+ In particular, InfiMM integrates the latest LLMs into the VLM domain and reveals the impact of LLMs with different scales and architectures.
30
+
31
+ Please note that InfiMM is currently in its beta stage, and we are continuously working on improving it.
32
+
33
+ ## Model Details
34
+
35
+ - **Developed by**: Institute of Automation, Chinese Academy of Sciences and ByteDance
36
+ - **Model Type**: Visual Language Model (VLM)
37
+ - **Language**: English
38
+ - **LLMs**: [Zephyr](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta), [LLaMA2-13B](https://ai.meta.com/llama/), [Vicuna-13B](https://huggingface.co/lmsys/vicuna-13b-v1.5)
39
+ - **Vision Model**: [EVA CLIP](https://huggingface.co/QuanSun/EVA-CLIP)
40
+ - **Language(s) (NLP):** en
41
+ - **License:** see [License section](#license)
42
+ <!---
43
+ - **Parent Models:** [QuanSun/EVA-CLIP](https://huggingface.co/QuanSun/EVA-CLIP/blob/main/EVA02_CLIP_L_336_psz14_s6B.pt) and [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)
44
+ -->
45
+
46
+ ## Model Family
47
+
48
+ Our model family consists of several variants. Please see the details below.
49
+ | Model | LLM | Vision Encoder | IFT |
50
+ | ---------------------- | -------------- | -------------- | --- |
51
+ | InfiMM-Zephyr | Zephyr-7B-beta | ViT-L-336 | No |
52
+ | InfiMM-Llama-13B | Llama2-13B | ViT-G-224 | No |
53
+ | InfiMM-Vicuna-13B | Vicuna-13B | ViT-E-224 | No |
54
+ | InfiMM-Zephyr-Chat | Zephyr-7B-beta | ViT-L-336 | Yes |
55
+ | InfiMM-Llama-13B-Chat | Llama2-13B | ViT-G-224 | Yes |
56
+ | InfiMM-Vicuna-13B-Chat | Vicuna-13B | ViT-E-224 | Yes |
57
+
58
+ <!-- InfiMM-Zephyr-Chat is a lightweight, open-source reproduction of Flamingo-style multimodal large language models with chat capability that takes sequences of interleaved images and text as input and generates text outputs, with only 9B parameters.
59
+ -->
60
+
61
+ ## Demo
62
+
63
+ The demo will be released soon.
64
+
65
+ Our model adopts the Flamingo architecture, leveraging EVA CLIP as the visual encoder and employing LLaMA2, Vicuna, and Zephyr as language models. The visual and language modalities are connected through a Cross Attention module.
66
+
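+ To make the wiring concrete, here is a minimal, illustrative sketch of a Flamingo-style gated cross-attention block. It is not the InfiMM implementation, and the class and parameter names are invented for illustration: resampled visual tokens are injected into the language model's hidden states through attention and feed-forward layers whose tanh gates start at zero, so training begins from the frozen LLM's original behavior.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+
+ class GatedCrossAttentionBlock(nn.Module):
+     """Illustrative Flamingo-style block: text hidden states attend to visual tokens."""
+
+     def __init__(self, dim, num_heads=8):
+         super().__init__()
+         self.norm_attn = nn.LayerNorm(dim)
+         self.norm_ffn = nn.LayerNorm(dim)
+         self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+         self.ffn = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))
+         # tanh(0) = 0, so at initialization the block is a no-op on the frozen LLM
+         self.attn_gate = nn.Parameter(torch.zeros(1))
+         self.ffn_gate = nn.Parameter(torch.zeros(1))
+
+     def forward(self, text, vision):
+         # text: (B, T, dim) LLM hidden states; vision: (B, V, dim) resampled visual tokens
+         attn_out, _ = self.attn(self.norm_attn(text), vision, vision)
+         text = text + torch.tanh(self.attn_gate) * attn_out
+         text = text + torch.tanh(self.ffn_gate) * self.ffn(self.norm_ffn(text))
+         return text
+ ```
+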
67
+ ## Quickstart
68
+
69
+ Use the code below to get started with the base model:
70
+ ```python
71
+ import torch
72
+ from transformers import AutoModelForCausalLM, AutoProcessor
73
+
74
+
75
+ processor = AutoProcessor.from_pretrained("InfiMM/infimm-zephyr", trust_remote_code=True)
76
+
77
+ prompts = [
78
+ {
79
+ "role": "user",
80
+ "content": [
81
+ {"image": "assets/infimm-logo.webp"},
82
+ "Please explain this image to me.",
83
+ ],
84
+ }
85
+ ]
86
+ inputs = processor(prompts)
87
+
88
+ # use bf16
89
+ model = AutoModelForCausalLM.from_pretrained(
90
+ "InfiMM/infimm-zephyr",
91
+ local_files_only=True,
92
+ torch_dtype=torch.bfloat16,
93
+ trust_remote_code=True,
94
+ ).eval()
95
+
96
+
97
+ inputs = inputs.to(model.device)
98
+ inputs["batch_images"] = inputs["batch_images"].to(torch.bfloat16)
99
+ generated_ids = model.generate(
100
+ **inputs,
101
+ min_generation_length=0,
102
+ max_generation_length=256,
103
+ )
104
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
105
+ print(generated_text)
106
+ ```
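+
+ Few-shot prompting follows the same prompt structure. The sketch below is an assumption-laden example rather than documented usage: it presumes the processor accepts several `{"image": ...}` entries and text segments interleaved within one user turn (as the interleaved pretraining format suggests), and the example image paths are placeholders.
+
+ ```python
+ # Hypothetical few-shot prompt: in-context (image, caption) pairs before the query image.
+ few_shot_prompts = [
+     {
+         "role": "user",
+         "content": [
+             {"image": "assets/example_dog.jpg"},  # placeholder path
+             "This is a photo of a dog playing in the snow.",
+             {"image": "assets/example_cat.jpg"},  # placeholder path
+             "This is a photo of a cat sleeping on a sofa.",
+             {"image": "assets/infimm-logo.webp"},
+             "This is a photo of",
+         ],
+     }
+ ]
+ inputs = processor(few_shot_prompts)
+ # Then move inputs to the model's device/dtype and call model.generate as above.
+ ```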
107
+
108
+ ## Training Details
109
+
110
+ We employed three stages to train our model: pretraining (PT), multi-task training (MTT), and instruction fine-tuning (IFT). Refer to the table below for the detailed configuration of each stage. Because of significant noise in the pretraining data, we aimed to enhance the model's accuracy by incorporating higher-quality data. In the multi-task training (MTT) phase, we utilized substantial training data from diverse datasets. However, as the answers in these datasets mainly consist of single words or phrases, the model's conversational ability was limited. Therefore, in the third stage, we introduced a considerable amount of image-text dialogue data (llava665k) for instruction fine-tuning.
111
+
112
+ ### Pretraining (PT)
113
+
114
+ We follow similar training procedures used in [IDEFICS](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct/blob/main/README.md).
115
+
116
+ The model is trained on a mixture of image-text pairs and unstructured multimodal web documents. All data are from public sources. Because many image URLs have expired, we were only able to download a subset of the samples. After filtering out low-quality data, the resulting data we used are:
117
+
118
+ | Data Source | Type of Data | Number of Tokens in Source | Number of Images in Source | Number of Samples | Epochs |
119
+ | ---------------------------------------------------------------- | ------------------------------------- | -------------------------- | -------------------------- | ----------------- | ------ |
120
+ | [OBELICS](https://huggingface.co/datasets/HuggingFaceM4/OBELICS) | Unstructured Multimodal Web Documents | - | - | 101M | 1 |
121
+ | [MMC4](https://github.com/allenai/mmc4) | Unstructured Multimodal Web Documents | - | - | 53M | 1 |
122
+ | [LAION](https://huggingface.co/datasets/laion/laion2B-en) | Image-Text Pairs | - | 115M | 115M | 1 |
123
+ | [COYO](https://github.com/kakaobrain/coyo-dataset) | Image-Text Pairs | - | 238M | 238M | 1 |
124
+ | [LAION-COCO](https://laion.ai/blog/laion-coco/) | Image-Text Pairs | - | 140M | 140M | 1 |
125
+ | [PMD\*](https://huggingface.co/datasets/facebook/pmd) | Image-Text Pairs | - | 20M | 20M | 1 |
126
+
127
+ \*PMD is only used in models with 13B LLMs, not the 7B Zephyr model.
128
+
129
+ During pretraining on interleaved image-text samples, we apply masked cross-attention. However, we did not strictly follow Flamingo, which alternates attention between an image and its previous or following text with a probability of 0.5.
130
+
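+ As a rough illustration of what masked cross-attention means here, the sketch below assigns each position in an interleaved sequence to the most recent image, using the `<image>` token id from `added_tokens.json`. It is a simplified, hypothetical helper, not the training code; among other things, the real masking also has to handle `<|endofchunk|>` boundaries and the previous-or-following-text choice described above.
+
+ ```python
+ import torch
+
+ IMAGE_TOKEN_ID = 32001  # "<image>" in added_tokens.json
+
+
+ def media_assignment(input_ids):
+     """Return, for every position, the 1-based index of the most recent <image>
+     token (0 = no image seen yet), i.e. which image that token may cross-attend to."""
+     is_image = (input_ids == IMAGE_TOKEN_ID).long()
+     return torch.cumsum(is_image, dim=-1)
+
+
+ # toy sequence: [<image>, text, text, <image>, text]
+ ids = torch.tensor([[32001, 5, 6, 32001, 7]])
+ print(media_assignment(ids))  # tensor([[1, 1, 1, 2, 2]])
+ ```
+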
131
+ We use the following hyperparameters:
132
+ | Categories | Parameters | Value |
133
+ | ------------------------ | -------------------------- | -------------------- |
134
+ | Perceiver Resampler | Number of Layers | 6 |
135
+ | | Number of Latents | 64 |
136
+ | | Number of Heads | 16 |
137
+ | | Resampler Head Dimension | 96 |
138
+ | Training | Sequence Length | 384 (13B) / 792 (7B) |
139
+ | | Effective Batch Size | 40\*128 |
140
+ | | Max Images per Sample | 6 |
141
+ | | Weight Decay | 0.1 |
142
+ | | Optimizer | Adam(0.9, 0.999) |
143
+ | | Gradient Accumulation Step | 2 |
144
+ | Learning Rate | Initial Max | 1e-4 |
145
+ | | Decay Schedule | Constant |
146
+ | | Warmup Step rate | 0.005 |
147
+ | Large-scale Optimization | Gradient Checkpointing | False |
148
+ | | Precision | bf16 |
149
+ | | ZeRO Optimization | Stage 2 |
150
+
151
+ ### Multi-Task Training (MTT)
152
+
153
+ Here we use mix_cap_vqa to represent the mixed training set built from COCO Caption, TextCaps, VizWiz Caption, VQAv2, OKVQA, VizWiz VQA, TextVQA, OCRVQA, STVQA, DocVQA, GQA and ScienceQA-image. For captioning data, we add a prefix such as "Please describe the image." For QA data, we add "Answer the question using a single word or phrase." Specifically, for VizWiz VQA we use "When the provided information is insufficient, respond with 'Unanswerable'. Answer the question using a single word or phrase.", while for ScienceQA-image we use "Answer with the option's letter from the given choices directly."
154
+
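+ The sketch below shows one way the task-specific instructions above could be attached to a sample; the helper function and task labels are hypothetical, and only the quoted instruction strings come from the description above.
+
+ ```python
+ CAPTION_PREFIX = "Please describe the image."
+ VQA_SUFFIX = "Answer the question using a single word or phrase."
+ VIZWIZ_SUFFIX = (
+     "When the provided information is insufficient, respond with 'Unanswerable'. "
+     "Answer the question using a single word or phrase."
+ )
+ SCIENCEQA_SUFFIX = "Answer with the option's letter from the given choices directly."
+
+
+ def build_prompt(task, question=""):
+     # Hypothetical helper: map a task type to the instruction format described above.
+     if task == "caption":
+         return CAPTION_PREFIX
+     if task == "vizwiz_vqa":
+         return f"{question} {VIZWIZ_SUFFIX}"
+     if task == "scienceqa_image":
+         return f"{question} {SCIENCEQA_SUFFIX}"
+     return f"{question} {VQA_SUFFIX}"  # other VQA-style datasets
+ ```
+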
155
+ ### Instruction Fine-Tuning (IFT)
156
+
157
+ For the instruction fine-tuning stage, we use the recently released [LLaVA-MIX-665k](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/tree/main).
158
+
159
+ We use the following hyperparameters:
160
+ | Categories | Parameters | Value |
161
+ | ------------------------ | -------------------------- | -------------------- |
162
+ | Perceiver Resampler | Number of Layers | 6 |
163
+ | | Number of Latents | 64 |
164
+ | | Number of Heads | 16 |
165
+ | | Resampler Head Dimension | 96 |
166
+ | Training | Sequence Length | 384 (13B) / 792 (7B) |
167
+ | | Effective Batch Size | 64 |
168
+ | | Max Images per Sample | 6 |
169
+ | | Weight Decay | 0.1 |
170
+ | | Optimizer | Adam(0.9, 0.999) |
171
+ | | Gradient Accumulation Step | 2 |
172
+ | Learning Rate | Initial Max | 1e-5 |
173
+ | | Decay Schedule | Constant |
174
+ | | Warmup Step rate | 0.005 |
175
+ | Large-scale Optimization | Gradient Checkpointing | False |
176
+ | | Precision | bf16 |
177
+ | | ZeRO Optimization | Stage 2 |
178
+
179
+ During IFT, as in pretraining, we keep the ViT and the LLM frozen for the chat-based LLMs (Vicuna and Zephyr). For the Llama model, we keep the LLM trainable during the IFT stage. We also apply a chat template to process the training samples.
180
+
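+ As an example of what applying a chat template can look like, the sketch below reuses Zephyr's default template via the Hugging Face `apply_chat_template` API. The exact template and sample construction used for InfiMM's IFT data are not specified here, and the messages are made up for illustration.
+
+ ```python
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+
+ # A toy single-turn IFT sample with an <image> placeholder in the user message.
+ messages = [
+     {"role": "user", "content": "<image>\nPlease explain this image to me."},
+     {"role": "assistant", "content": "The image shows the InfiMM logo."},
+ ]
+
+ # Render the sample with the LLM's own chat template (no tokenization, just text).
+ text = tokenizer.apply_chat_template(messages, tokenize=False)
+ print(text)
+ ```
+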
181
+ ## Evaluation
182
+
183
+ ### PreTraining Evaluation
184
+
185
+ We evaluate the pretrained models on the following downstream tasks: Image Captioning and VQA. We also compare our results with [IDEFICS](https://huggingface.co/blog/idefics).
186
+
187
+ | Model | Shots | COCO CIDEr | Flickr30K CIDEr | VQA v2 Acc | TextVQA Acc | OK-VQA Acc |
188
+ | ----------------- | ----- | ---------- | --------------- | ---------- | ----------- | ---------- |
189
+ | IDEFICS-9B | 0 | 46 | 27.3 | 50.9 | 25.9 | 38.4 |
190
+ | | 4 | 93 | 59.7 | 55.4 | 27.6 | 45.5 |
191
+ | IDEFICS-80B | 0 | 91.8 | 53.7 | 60 | 30.9 | 45.2 |
192
+ | | 4 | 110.3 | 73.7 | 64.6 | 34.4 | 52.4 |
193
+ | InfiMM-Zephyr-7B | 0 | 78.8 | 60.7 | 33.7 | 15.2 | 17.1 |
194
+ | | 4 | 108.6 | 71.9 | 59.1 | 34.3 | 50.5 |
195
+ | InfiMM-Llama2-13B | 0 | 85.4 | 54.6 | 51.6 | 24.2 | 26.4 |
196
+ | | 4 | 125.2 | 87.1 | 66.1 | 38.2 | 55.5 |
197
+ | InfiMM-Vicuna13B | 0 | 69.6 | 49.6 | 60.4 | 32.8 | 49.2 |
198
+ | | 4 | 118.1 | 81.4 | 64.2 | 38.4 | 53.7 |
199
+
200
+ ### IFT Evaluation
201
+
202
+ In our analysis, we concentrate on two primary benchmarks for evaluating MLLMs: 1) Multi-choice Question Answering (QA) and 2) Open-ended Evaluation. We've observed that the evaluation metrics for tasks like Visual Question Answering (VQA) and Text-VQA are overly sensitive to exact answer matches. This approach can be misleading, particularly when models provide synonymous but technically accurate responses. Therefore, these metrics have been omitted from our comparison for a more precise assessment. The evaluation results are shown in the table below.
203
+
204
+ | Model | ScienceQA-Img | MME | MM-VET | InfiMM-Eval | MMbench | MMMU-Val | MMMU-Test |
205
+ | ------------------- | ------------- | --------------------- | ------ | ------------ | ------- | -------- | --------- |
206
+ | Otter-9B | - | 1292/306 | 24.6 | 32.2 | - | 22.69 | - |
207
+ | IDEFICS-9B-Instruct | 60.6 | -/- | - | - | - | 24.53 | - |
208
+ | InfiMM-Zephyr-7B | 71.1 | P: 1406<br>C:327 | 32.8 | 36.0 | 59.7 | 39.4 | 35.5 |
209
+ | InfiMM-Llama-13b | 73.0 | P: 1444.5<br>C: 337.6 | 39.2 | 0.4559/0.414 | 66.4 | 39.1 | 35.2 |
210
+ | InfiMM-Vicuna-13B | 74.0 | P: 1461.2<br>C: 323.5 | 36.0 | 40.0 | 66.7 | 37.6 | 34.6 |
211
+
212
+ <!--
213
+ | Model | TextVQA (no ocr) | OK-VQA | VQAv2 | ScienceQA-Img | GQA | MME | MM-VET | MMMU | InfiMM-Eval | MMbench |
214
+ | ----------------- | ---------------- | ------ | ----- | ------------- | ---- | --------------------- | ------ | ---- | ------------ | ------- |
215
+ | InfiMM-Zephyr-7B | 36.7 | 55.4 | / | 71.1 | | P: 1406<br>C:327 | 32.8 | 39.4 | 36.0 | 59.7 |
216
+ | InfiMM-Llama-13b | 44.6 | 62.3 | 78.5 | 73.0 | 61.2 | P: 1444.5<br>C: 337.6 | 39.2 | 39.1 | 0.4559/0.414 | 66.4 |
217
+ | InfiMM-Vicuna-13B | 41.7 | 58.5 | 73.0 | 74.0 | 58.5 | P: 1461.2<br>C: 323.5 | 36.0 | 37.6 | 40.0 | 66.7 |
218
+
219
+ We select checkpoint after 1 epoch instruction fine-tuning.
220
+
221
+ | Model | <nobr>ScienceQA <br>acc.</nobr> | <nobr>MME <br>P/C</nobr> | <nobr>MM-Vet</nobr> | <nobr>InfiMM-Eval</nobr> | <nobr>MMMU (val)</nobr> |
222
+ | :------------------ | ------------------------------: | -----------------------: | ------------------: | -----------------------: | ----------------------: |
223
+ | Otter-9B | - | 1292/306 | 24.6 | 22.69 | 32.2 |
224
+ | IDEFICS-9B-Instruct | 60.6 | -/- | - | 24.53 | - |
225
+ | InfiMM-Zephyr-Chat | 71.14 | 1406/327 | 33.3 | 35.97 | 39.4 |
226
+ -->
227
+
228
+ <details>
229
+ <summary>Leaderboard Details</summary>
230
+
231
+ <img src="assets/infimm-zephyr-mmmu-val.jpeg" style="zoom:40%;" />
232
+ <br>MMMU-Val split results<br>
233
+ <img src="assets/infimm-zephyr-mmmu-test.jpeg" style="zoom:40%;" />
234
+ <br>MMMU-Test split results<br>
235
+
236
+ </details>
237
+
238
+ ## Citation
239
+
240
+ @misc{infimm-v1,
241
+ title={InfiMM: },
242
+ author={InfiMM Team},
243
+ year={2024}
244
+ }
245
+
246
+ ## License
247
+
248
+ <a href="https://creativecommons.org/licenses/by-nc/4.0/deed.en">
249
+ <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/d/d3/Cc_by-nc_icon.svg/600px-Cc_by-nc_icon.svg.png" width="160">
250
+ </a>
251
+
252
+ This project is licensed under the **CC BY-NC 4.0** license.
253
+
254
+ The copyright of the images belongs to the original authors.
255
+
256
+ See [LICENSE](LICENSE) for more information.
257
+
258
+ ## Contact Us
259
+
260
+ Please feel free to contact us via email at [infimmbytedance@gmail.com](mailto:infimmbytedance@gmail.com) if you have any questions.
added_tokens.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "<image>": 32001,
3
+ "<|endofchunk|>": 32000
4
+ }
assets/infimm-logo.webp ADDED
assets/infimm-zephyr-mmmu-test.jpeg ADDED
assets/infimm-zephyr-mmmu-val.jpeg ADDED
config.json ADDED
@@ -0,0 +1,66 @@
1
+ {
2
+ "_name_or_path": "./",
3
+ "architectures": [
4
+ "InfiMMZephyrModel"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_infimm_zephyr.InfiMMConfig",
8
+ "AutoModelForCausalLM": "modeling_infimm_zephyr.InfiMMZephyrModel"
9
+ },
10
+ "model_type": "infimm-zephyr",
11
+ "seq_length": 1024,
12
+ "tokenizer_type": "LlamaTokenizer",
13
+ "torch_dtype": "bfloat16",
14
+ "transformers_version": "4.35.2",
15
+ "use_cache": true,
16
+ "use_flash_attn": false,
17
+ "cross_attn_every_n_layers": 2,
18
+ "use_grad_checkpoint": false,
19
+ "freeze_llm": true,
20
+ "image_token_id": 32001,
21
+ "eoc_token_id": 32000,
22
+ "visual": {
23
+ "image_size": 336,
24
+ "layers": 24,
25
+ "width": 1024,
26
+ "head_width": 64,
27
+ "patch_size": 14,
28
+ "mlp_ratio": 2.6667,
29
+ "eva_model_name": "eva-clip-l-14-336",
30
+ "drop_path_rate": 0.0,
31
+ "xattn": false,
32
+ "fusedLN": true,
33
+ "rope": true,
34
+ "pt_hw_seq_len": 16,
35
+ "intp_freq": true,
36
+ "naiveswiglu": true,
37
+ "subln": true,
38
+ "embed_dim": 768
39
+ },
40
+ "language": {
41
+ "_name_or_path": "HuggingFaceH4/zephyr-7b-beta",
42
+ "architectures": [
43
+ "MistralForCausalLM"
44
+ ],
45
+ "bos_token_id": 1,
46
+ "eos_token_id": 2,
47
+ "hidden_act": "silu",
48
+ "hidden_size": 4096,
49
+ "initializer_range": 0.02,
50
+ "intermediate_size": 14336,
51
+ "max_position_embeddings": 32768,
52
+ "model_type": "mistral",
53
+ "num_attention_heads": 32,
54
+ "num_hidden_layers": 32,
55
+ "num_key_value_heads": 8,
56
+ "pad_token_id": 2,
57
+ "rms_norm_eps": 1e-05,
58
+ "rope_theta": 10000.0,
59
+ "sliding_window": 4096,
60
+ "tie_word_embeddings": false,
61
+ "torch_dtype": "bfloat16",
62
+ "transformers_version": "4.35.0",
63
+ "use_cache": true,
64
+ "vocab_size": 32002
65
+ }
66
+ }
configuration_infimm_zephyr.py ADDED
@@ -0,0 +1,42 @@
1
+ # This source code is licensed under the license found in the
2
+ # LICENSE file in the root directory of this source tree.
3
+
4
+ from transformers import PretrainedConfig
5
+
6
+
7
+ class InfiMMConfig(PretrainedConfig):
8
+ model_type = "infimm"
9
+
10
+ def __init__(
11
+ self,
12
+ model_type="infimm-zephyr",
13
+ seq_length=1024,
14
+ tokenizer_type="ZephyrTokenizer",
15
+ torch_dtype="bfloat16",
16
+ transformers_version="4.35.2",
17
+ use_cache=True,
18
+ use_flash_attn=False,
19
+ cross_attn_every_n_layers=2,
20
+ use_grad_checkpoint=False,
21
+ freeze_llm=True,
22
+ visual=None,
23
+ language=None,
24
+ image_token_id=None,
25
+ eoc_token_id=None,
26
+ **kwargs,
27
+ ):
28
+ self.model_type = model_type
29
+ self.seq_length = seq_length
30
+ self.tokenizer_type = tokenizer_type
31
+ self.torch_dtype = torch_dtype
32
+ self.transformers_version = transformers_version
33
+ self.use_cache = use_cache
34
+ self.use_flash_attn = use_flash_attn
35
+ self.cross_attn_every_n_layers = cross_attn_every_n_layers
36
+ self.use_grad_checkpoint = use_grad_checkpoint
37
+ self.freeze_llm = freeze_llm
38
+ self.visual = visual
39
+ self.language = language
40
+ self.image_token_id = image_token_id
41
+ self.eoc_token_id = eoc_token_id
42
+ super().__init__(**kwargs)
convert_infi_zephyr_tokenizer_to_hf.py ADDED
@@ -0,0 +1,29 @@
1
+ import argparse
2
+
3
+ from open_flamingo.eval.models.mistral_model import EvalModel
4
+ from open_flamingo.train.distributed import init_distributed_device, world_info_from_env
5
+
6
+ parser = argparse.ArgumentParser()
7
+
8
+ parser.add_argument(
9
+ "--model",
10
+ type=str,
11
+ help="Model name. Currently only `OpenFlamingo` is supported.",
12
+ default="open_flamingo",
13
+ )
14
+
15
+
16
+ def main():
17
+ model_args = {
18
+ "config_yaml": "configs/mlm_multi_source_v1_zephyr_ift_zero2.yaml",
19
+ "checkpoint_path": "cruise_logs/zephyr_freeze_ift/mp_rank_00_model_states.pt",
20
+ "precision": "bf16",
21
+ }
22
+ eval_model = EvalModel(model_args)
23
+
24
+ tokenizer = eval_model.tokenizer
25
+ # tokenizer.save_pretrained('hf_weights')
26
+
27
+
28
+ if __name__ == "__main__":
29
+ main()
convert_infi_zephyr_weights_to_hf.py ADDED
@@ -0,0 +1,6 @@
1
+ import torch
2
+
3
+ state_dict = torch.load(
4
+ "cruise_logs/zephyr_freeze_ift/mp_rank_00_model_states.pt", map_location="cpu"
5
+ )
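+ # DeepSpeed checkpoints nest the weights under the "module" key and prefix parameter
+ # names with "module."; stripping that prefix yields Hugging Face-style state_dict keys.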
6
+ state_dict = {k.replace("module.", ""): v for k, v in state_dict["module"].items()}
eva_vit.py ADDED
@@ -0,0 +1,948 @@
1
+ # --------------------------------------------------------
2
+ # Adapted from https://github.com/baaivision/EVA/blob/master/EVA-CLIP/rei/eva_clip/eva_vit_model.py
3
+ # --------------------------------------------------------
4
+ import logging
5
+ import math
6
+ import os
7
+ from dataclasses import dataclass
8
+ from functools import partial
9
+ from math import pi
10
+ from typing import Optional, Tuple, Union
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ from einops import rearrange, repeat
15
+ from timm.models.layers import drop_path, to_2tuple, trunc_normal_
16
+
17
+ if os.getenv("ENV_TYPE") == "deepspeed":
18
+ try:
19
+ from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint
20
+ except ImportError:
21
+ from torch.utils.checkpoint import checkpoint
22
+ else:
23
+ from torch.utils.checkpoint import checkpoint
24
+
25
+ try:
26
+ import xformers.ops as xops
27
+ except ImportError:
28
+ xops = None
29
+ print("Please 'pip install xformers'")
30
+
31
+
32
+ class PatchDropout(nn.Module):
33
+ """
34
+ https://arxiv.org/abs/2212.00794
35
+ """
36
+
37
+ def __init__(self, prob, exclude_first_token=True):
38
+ super().__init__()
39
+ assert 0 <= prob < 1.0
40
+ self.prob = prob
41
+ self.exclude_first_token = exclude_first_token # exclude CLS token
42
+
43
+ def forward(self, x):
44
+ if not self.training or self.prob == 0.0:
45
+ return x
46
+
47
+ if self.exclude_first_token:
48
+ cls_tokens, x = x[:, :1], x[:, 1:]
49
+ else:
50
+ cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
51
+
52
+ batch = x.size()[0]
53
+ num_tokens = x.size()[1]
54
+
55
+ batch_indices = torch.arange(batch)
56
+ batch_indices = batch_indices[..., None]
57
+
58
+ keep_prob = 1 - self.prob
59
+ num_patches_keep = max(1, int(num_tokens * keep_prob))
60
+
61
+ rand = torch.randn(batch, num_tokens)
62
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
63
+
64
+ x = x[batch_indices, patch_indices_keep]
65
+
66
+ if self.exclude_first_token:
67
+ x = torch.cat((cls_tokens, x), dim=1)
68
+
69
+ if self.training and os.getenv("RoPE") == "1":
70
+ return x, patch_indices_keep
71
+
72
+ return x
73
+
74
+
75
+ class DropPath(nn.Module):
76
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
77
+
78
+ def __init__(self, drop_prob=None):
79
+ super(DropPath, self).__init__()
80
+ self.drop_prob = drop_prob
81
+
82
+ def forward(self, x):
83
+ return drop_path(x, self.drop_prob, self.training)
84
+
85
+ def extra_repr(self) -> str:
86
+ return "p={}".format(self.drop_prob)
87
+
88
+
89
+ class Mlp(nn.Module):
90
+ def __init__(
91
+ self,
92
+ in_features,
93
+ hidden_features=None,
94
+ out_features=None,
95
+ act_layer=nn.GELU,
96
+ norm_layer=nn.LayerNorm,
97
+ drop=0.0,
98
+ subln=False,
99
+ ):
100
+ super().__init__()
101
+ out_features = out_features or in_features
102
+ hidden_features = hidden_features or in_features
103
+
104
+ self.fc1 = nn.Linear(in_features, hidden_features)
105
+ self.act = act_layer()
106
+
107
+ self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
108
+
109
+ self.fc2 = nn.Linear(hidden_features, out_features)
110
+ self.drop = nn.Dropout(drop)
111
+
112
+ def forward(self, x):
113
+ x = self.fc1(x)
114
+ x = self.act(x)
115
+ # x = self.drop(x)
116
+ # dropout after fc1 is commented out to match the original BERT implementation
117
+ x = self.ffn_ln(x)
118
+
119
+ x = self.fc2(x)
120
+ x = self.drop(x)
121
+ return x
122
+
123
+
124
+ class SwiGLU(nn.Module):
125
+ def __init__(
126
+ self,
127
+ in_features,
128
+ hidden_features=None,
129
+ out_features=None,
130
+ act_layer=nn.SiLU,
131
+ drop=0.0,
132
+ norm_layer=nn.LayerNorm,
133
+ subln=False,
134
+ ):
135
+ super().__init__()
136
+ out_features = out_features or in_features
137
+ hidden_features = hidden_features or in_features
138
+
139
+ self.w1 = nn.Linear(in_features, hidden_features)
140
+ self.w2 = nn.Linear(in_features, hidden_features)
141
+
142
+ self.act = act_layer()
143
+ self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
144
+
145
+ self.w3 = nn.Linear(hidden_features, out_features)
146
+
147
+ self.drop = nn.Dropout(drop)
148
+
149
+ def forward(self, x):
150
+ x1 = self.w1(x)
151
+ x2 = self.w2(x)
152
+ hidden = self.act(x1) * x2
153
+ x = self.ffn_ln(hidden)
154
+ x = self.w3(x)
155
+ x = self.drop(x)
156
+ return x
157
+
158
+
159
+ class Attention(nn.Module):
160
+ def __init__(
161
+ self,
162
+ dim,
163
+ num_heads=8,
164
+ qkv_bias=False,
165
+ qk_scale=None,
166
+ attn_drop=0.0,
167
+ proj_drop=0.0,
168
+ window_size=None,
169
+ attn_head_dim=None,
170
+ xattn=False,
171
+ rope=None,
172
+ subln=False,
173
+ norm_layer=nn.LayerNorm,
174
+ ):
175
+ super().__init__()
176
+ self.num_heads = num_heads
177
+ head_dim = dim // num_heads
178
+ if attn_head_dim is not None:
179
+ head_dim = attn_head_dim
180
+ all_head_dim = head_dim * self.num_heads
181
+ self.scale = qk_scale or head_dim**-0.5
182
+
183
+ self.subln = subln
184
+ if self.subln:
185
+ self.q_proj = nn.Linear(dim, all_head_dim, bias=False)
186
+ self.k_proj = nn.Linear(dim, all_head_dim, bias=False)
187
+ self.v_proj = nn.Linear(dim, all_head_dim, bias=False)
188
+
189
+ else:
190
+ self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
191
+
192
+ if qkv_bias:
193
+ self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
194
+ self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
195
+ else:
196
+ self.q_bias = None
197
+ self.v_bias = None
198
+
199
+ if window_size:
200
+ self.window_size = window_size
201
+ self.num_relative_distance = (2 * window_size[0] - 1) * (
202
+ 2 * window_size[1] - 1
203
+ ) + 3
204
+ self.relative_position_bias_table = nn.Parameter(
205
+ torch.zeros(self.num_relative_distance, num_heads)
206
+ ) # 2*Wh-1 * 2*Ww-1, nH
207
+ # cls to token & token 2 cls & cls to cls
208
+
209
+ # get pair-wise relative position index for each token inside the window
210
+ coords_h = torch.arange(window_size[0])
211
+ coords_w = torch.arange(window_size[1])
212
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
213
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
214
+ relative_coords = (
215
+ coords_flatten[:, :, None] - coords_flatten[:, None, :]
216
+ ) # 2, Wh*Ww, Wh*Ww
217
+ relative_coords = relative_coords.permute(
218
+ 1, 2, 0
219
+ ).contiguous() # Wh*Ww, Wh*Ww, 2
220
+ relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
221
+ relative_coords[:, :, 1] += window_size[1] - 1
222
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
223
+ relative_position_index = torch.zeros(
224
+ size=(window_size[0] * window_size[1] + 1,) * 2,
225
+ dtype=relative_coords.dtype,
226
+ )
227
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
228
+ relative_position_index[0, 0:] = self.num_relative_distance - 3
229
+ relative_position_index[0:, 0] = self.num_relative_distance - 2
230
+ relative_position_index[0, 0] = self.num_relative_distance - 1
231
+
232
+ self.register_buffer("relative_position_index", relative_position_index)
233
+ else:
234
+ self.window_size = None
235
+ self.relative_position_bias_table = None
236
+ self.relative_position_index = None
237
+
238
+ self.attn_drop = nn.Dropout(attn_drop)
239
+ self.inner_attn_ln = norm_layer(all_head_dim) if subln else nn.Identity()
240
+ # self.proj = nn.Linear(all_head_dim, all_head_dim)
241
+ self.proj = nn.Linear(all_head_dim, dim)
242
+ self.proj_drop = nn.Dropout(proj_drop)
243
+ self.xattn = xattn
244
+ self.xattn_drop = attn_drop
245
+
246
+ self.rope = rope
247
+
248
+ def forward(self, x, rel_pos_bias=None, attn_mask=None):
249
+ B, N, C = x.shape
250
+ if self.subln:
251
+ q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
252
+ k = F.linear(input=x, weight=self.k_proj.weight, bias=None)
253
+ v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)
254
+
255
+ q = q.reshape(B, N, self.num_heads, -1).permute(
256
+ 0, 2, 1, 3
257
+ ) # B, num_heads, N, C
258
+ k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
259
+ v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
260
+ else:
261
+ qkv_bias = None
262
+ if self.q_bias is not None:
263
+ qkv_bias = torch.cat(
264
+ (
265
+ self.q_bias,
266
+ torch.zeros_like(self.v_bias, requires_grad=False),
267
+ self.v_bias,
268
+ )
269
+ )
270
+
271
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
272
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(
273
+ 2, 0, 3, 1, 4
274
+ ) # 3, B, num_heads, N, C
275
+ q, k, v = qkv[0], qkv[1], qkv[2]
276
+
277
+ if self.rope:
278
+ # slightly faster impl: apply RoPE to patch tokens only, keeping the CLS token unrotated
279
+ q_t = q[:, :, 1:, :]
280
+ ro_q_t = self.rope(q_t)
281
+ q = torch.cat((q[:, :, :1, :], ro_q_t), -2).type_as(v)
282
+
283
+ k_t = k[:, :, 1:, :]
284
+ ro_k_t = self.rope(k_t)
285
+ k = torch.cat((k[:, :, :1, :], ro_k_t), -2).type_as(v)
286
+
287
+ if self.xattn:
288
+ q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C
289
+ k = k.permute(0, 2, 1, 3)
290
+ v = v.permute(0, 2, 1, 3)
291
+
292
+ x = xops.memory_efficient_attention(
293
+ q,
294
+ k,
295
+ v,
296
+ p=self.xattn_drop,
297
+ scale=self.scale,
298
+ )
299
+ x = x.reshape(B, N, -1)
300
+ x = self.inner_attn_ln(x)
301
+ x = self.proj(x)
302
+ x = self.proj_drop(x)
303
+ else:
304
+ q = q * self.scale
305
+ attn = q @ k.transpose(-2, -1)
306
+
307
+ if self.relative_position_bias_table is not None:
308
+ relative_position_bias = self.relative_position_bias_table[
309
+ self.relative_position_index.view(-1)
310
+ ].view(
311
+ self.window_size[0] * self.window_size[1] + 1,
312
+ self.window_size[0] * self.window_size[1] + 1,
313
+ -1,
314
+ ) # Wh*Ww,Wh*Ww,nH
315
+ relative_position_bias = relative_position_bias.permute(
316
+ 2, 0, 1
317
+ ).contiguous() # nH, Wh*Ww, Wh*Ww
318
+ attn = attn + relative_position_bias.unsqueeze(0).type_as(attn)
319
+
320
+ if rel_pos_bias is not None:
321
+ attn = attn + rel_pos_bias.type_as(attn)
322
+
323
+ if attn_mask is not None:
324
+ attn_mask = attn_mask.bool()
325
+ attn = attn.masked_fill(~attn_mask[:, None, None, :], float("-inf"))
326
+
327
+ attn = attn.softmax(dim=-1)
328
+ attn = self.attn_drop(attn)
329
+
330
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
331
+ x = self.inner_attn_ln(x)
332
+ x = self.proj(x)
333
+ x = self.proj_drop(x)
334
+ return x
335
+
336
+
337
+ class Block(nn.Module):
338
+ def __init__(
339
+ self,
340
+ dim,
341
+ num_heads,
342
+ mlp_ratio=4.0,
343
+ qkv_bias=False,
344
+ qk_scale=None,
345
+ drop=0.0,
346
+ attn_drop=0.0,
347
+ drop_path=0.0,
348
+ init_values=None,
349
+ act_layer=nn.GELU,
350
+ norm_layer=nn.LayerNorm,
351
+ window_size=None,
352
+ attn_head_dim=None,
353
+ xattn=False,
354
+ rope=None,
355
+ postnorm=False,
356
+ subln=False,
357
+ naiveswiglu=False,
358
+ ):
359
+ super().__init__()
360
+ self.norm1 = norm_layer(dim)
361
+ self.attn = Attention(
362
+ dim,
363
+ num_heads=num_heads,
364
+ qkv_bias=qkv_bias,
365
+ qk_scale=qk_scale,
366
+ attn_drop=attn_drop,
367
+ proj_drop=drop,
368
+ window_size=window_size,
369
+ attn_head_dim=attn_head_dim,
370
+ xattn=xattn,
371
+ rope=rope,
372
+ subln=subln,
373
+ norm_layer=norm_layer,
374
+ )
375
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
376
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
377
+ self.norm2 = norm_layer(dim)
378
+ mlp_hidden_dim = int(dim * mlp_ratio)
379
+
380
+ if naiveswiglu:
381
+ self.mlp = SwiGLU(
382
+ in_features=dim,
383
+ hidden_features=mlp_hidden_dim,
384
+ subln=subln,
385
+ norm_layer=norm_layer,
386
+ )
387
+ else:
388
+ self.mlp = Mlp(
389
+ in_features=dim,
390
+ hidden_features=mlp_hidden_dim,
391
+ act_layer=act_layer,
392
+ subln=subln,
393
+ drop=drop,
394
+ )
395
+
396
+ if init_values is not None and init_values > 0:
397
+ self.gamma_1 = nn.Parameter(
398
+ init_values * torch.ones((dim)), requires_grad=True
399
+ )
400
+ self.gamma_2 = nn.Parameter(
401
+ init_values * torch.ones((dim)), requires_grad=True
402
+ )
403
+ else:
404
+ self.gamma_1, self.gamma_2 = None, None
405
+
406
+ self.postnorm = postnorm
407
+
408
+ def forward(self, x, rel_pos_bias=None, attn_mask=None):
409
+ if self.gamma_1 is None:
410
+ if self.postnorm:
411
+ x = x + self.drop_path(
412
+ self.norm1(
413
+ self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)
414
+ )
415
+ )
416
+ x = x + self.drop_path(self.norm2(self.mlp(x)))
417
+ else:
418
+ x = x + self.drop_path(
419
+ self.attn(
420
+ self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask
421
+ )
422
+ )
423
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
424
+ else:
425
+ if self.postnorm:
426
+ x = x + self.drop_path(
427
+ self.gamma_1
428
+ * self.norm1(
429
+ self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)
430
+ )
431
+ )
432
+ x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x)))
433
+ else:
434
+ x = x + self.drop_path(
435
+ self.gamma_1
436
+ * self.attn(
437
+ self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask
438
+ )
439
+ )
440
+ x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
441
+ return x
442
+
443
+
444
+ class PatchEmbed(nn.Module):
445
+ """Image to Patch Embedding"""
446
+
447
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
448
+ super().__init__()
449
+ img_size = to_2tuple(img_size)
450
+ patch_size = to_2tuple(patch_size)
451
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
452
+ self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
453
+ self.img_size = img_size
454
+ self.patch_size = patch_size
455
+ self.num_patches = num_patches
456
+
457
+ self.proj = nn.Conv2d(
458
+ in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
459
+ )
460
+
461
+ def forward(self, x, **kwargs):
462
+ B, C, H, W = x.shape
463
+ # FIXME look at relaxing size constraints
464
+ assert H == self.img_size[0] and W == self.img_size[1], (
465
+ f"Input image size ({H}*{W}) doesn't match model"
466
+ f" ({self.img_size[0]}*{self.img_size[1]})."
467
+ )
468
+ x = self.proj(x).flatten(2).transpose(1, 2)
469
+ return x
470
+
471
+
472
+ class RelativePositionBias(nn.Module):
473
+ def __init__(self, window_size, num_heads):
474
+ super().__init__()
475
+ self.window_size = window_size
476
+ self.num_relative_distance = (2 * window_size[0] - 1) * (
477
+ 2 * window_size[1] - 1
478
+ ) + 3
479
+ self.relative_position_bias_table = nn.Parameter(
480
+ torch.zeros(self.num_relative_distance, num_heads)
481
+ ) # 2*Wh-1 * 2*Ww-1, nH
482
+ # cls to token & token 2 cls & cls to cls
483
+
484
+ # get pair-wise relative position index for each token inside the window
485
+ coords_h = torch.arange(window_size[0])
486
+ coords_w = torch.arange(window_size[1])
487
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
488
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
489
+ relative_coords = (
490
+ coords_flatten[:, :, None] - coords_flatten[:, None, :]
491
+ ) # 2, Wh*Ww, Wh*Ww
492
+ relative_coords = relative_coords.permute(
493
+ 1, 2, 0
494
+ ).contiguous() # Wh*Ww, Wh*Ww, 2
495
+ relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
496
+ relative_coords[:, :, 1] += window_size[1] - 1
497
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
498
+ relative_position_index = torch.zeros(
499
+ size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype
500
+ )
501
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
502
+ relative_position_index[0, 0:] = self.num_relative_distance - 3
503
+ relative_position_index[0:, 0] = self.num_relative_distance - 2
504
+ relative_position_index[0, 0] = self.num_relative_distance - 1
505
+
506
+ self.register_buffer("relative_position_index", relative_position_index)
507
+
508
+ def forward(self):
509
+ relative_position_bias = self.relative_position_bias_table[
510
+ self.relative_position_index.view(-1)
511
+ ].view(
512
+ self.window_size[0] * self.window_size[1] + 1,
513
+ self.window_size[0] * self.window_size[1] + 1,
514
+ -1,
515
+ ) # Wh*Ww,Wh*Ww,nH
516
+ return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
517
+
518
+
519
+ class EVAVisionTransformer(nn.Module):
520
+ """Vision Transformer with support for patch or hybrid CNN input stage"""
521
+
522
+ def __init__(
523
+ self,
524
+ img_size=224,
525
+ patch_size=16,
526
+ in_chans=3,
527
+ num_classes=1000,
528
+ embed_dim=768,
529
+ depth=12,
530
+ num_heads=12,
531
+ mlp_ratio=4.0,
532
+ qkv_bias=False,
533
+ qk_scale=None,
534
+ drop_rate=0.0,
535
+ attn_drop_rate=0.0,
536
+ drop_path_rate=0.0,
537
+ norm_layer=nn.LayerNorm,
538
+ init_values=None,
539
+ patch_dropout=0.0,
540
+ use_abs_pos_emb=True,
541
+ use_rel_pos_bias=False,
542
+ use_shared_rel_pos_bias=False,
543
+ rope=False,
544
+ use_mean_pooling=True,
545
+ init_scale=0.001,
546
+ grad_checkpointing=False,
547
+ xattn=False,
548
+ postnorm=False,
549
+ pt_hw_seq_len=16,
550
+ intp_freq=False,
551
+ naiveswiglu=False,
552
+ subln=False,
553
+ ):
554
+ super().__init__()
555
+ self.image_size = img_size
556
+ self.num_classes = num_classes
557
+ self.num_features = (
558
+ self.embed_dim
559
+ ) = embed_dim # num_features for consistency with other models
560
+
561
+ self.patch_embed = PatchEmbed(
562
+ img_size=img_size,
563
+ patch_size=patch_size,
564
+ in_chans=in_chans,
565
+ embed_dim=embed_dim,
566
+ )
567
+ num_patches = self.patch_embed.num_patches
568
+
569
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
570
+ # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
571
+ if use_abs_pos_emb:
572
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
573
+ else:
574
+ self.pos_embed = None
575
+ self.pos_drop = nn.Dropout(p=drop_rate)
576
+
577
+ if use_shared_rel_pos_bias:
578
+ self.rel_pos_bias = RelativePositionBias(
579
+ window_size=self.patch_embed.patch_shape, num_heads=num_heads
580
+ )
581
+ else:
582
+ self.rel_pos_bias = None
583
+
584
+ if rope:
585
+ half_head_dim = embed_dim // num_heads // 2
586
+ hw_seq_len = img_size // patch_size
587
+ self.rope = VisionRotaryEmbeddingFast(
588
+ dim=half_head_dim,
589
+ pt_seq_len=pt_hw_seq_len,
590
+ ft_seq_len=hw_seq_len if intp_freq else None,
591
+ # patch_dropout=patch_dropout
592
+ )
593
+ else:
594
+ self.rope = None
595
+
596
+ self.naiveswiglu = naiveswiglu
597
+
598
+ dpr = [
599
+ x.item() for x in torch.linspace(0, drop_path_rate, depth)
600
+ ] # stochastic depth decay rule
601
+ self.use_rel_pos_bias = use_rel_pos_bias
602
+ self.blocks = nn.ModuleList(
603
+ [
604
+ Block(
605
+ dim=embed_dim,
606
+ num_heads=num_heads,
607
+ mlp_ratio=mlp_ratio,
608
+ qkv_bias=qkv_bias,
609
+ qk_scale=qk_scale,
610
+ drop=drop_rate,
611
+ attn_drop=attn_drop_rate,
612
+ drop_path=dpr[i],
613
+ norm_layer=norm_layer,
614
+ init_values=init_values,
615
+ window_size=(
616
+ self.patch_embed.patch_shape if use_rel_pos_bias else None
617
+ ),
618
+ xattn=xattn,
619
+ rope=self.rope,
620
+ postnorm=postnorm,
621
+ subln=subln,
622
+ naiveswiglu=naiveswiglu,
623
+ )
624
+ for i in range(depth)
625
+ ]
626
+ )
627
+ self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
628
+ self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
629
+ self.head = (
630
+ nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
631
+ )
632
+
633
+ if self.pos_embed is not None:
634
+ trunc_normal_(self.pos_embed, std=0.02)
635
+
636
+ trunc_normal_(self.cls_token, std=0.02)
637
+ # trunc_normal_(self.mask_token, std=.02)
638
+
639
+ self.apply(self._init_weights)
640
+ self.fix_init_weight()
641
+
642
+ if isinstance(self.head, nn.Linear):
643
+ trunc_normal_(self.head.weight, std=0.02)
644
+ self.head.weight.data.mul_(init_scale)
645
+ self.head.bias.data.mul_(init_scale)
646
+
647
+ # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
648
+ self.patch_dropout = (
649
+ PatchDropout(patch_dropout) if patch_dropout > 0.0 else nn.Identity()
650
+ )
651
+
652
+ self.grad_checkpointing = grad_checkpointing
653
+
654
+ def fix_init_weight(self):
655
+ def rescale(param, layer_id):
656
+ param.div_(math.sqrt(2.0 * layer_id))
657
+
658
+ for layer_id, layer in enumerate(self.blocks):
659
+ rescale(layer.attn.proj.weight.data, layer_id + 1)
660
+ if self.naiveswiglu:
661
+ rescale(layer.mlp.w3.weight.data, layer_id + 1)
662
+ else:
663
+ rescale(layer.mlp.fc2.weight.data, layer_id + 1)
664
+
665
+ def get_cast_dtype(self) -> torch.dtype:
666
+ return self.blocks[0].mlp.fc2.weight.dtype
667
+
668
+ def _init_weights(self, m):
669
+ if isinstance(m, nn.Linear):
670
+ trunc_normal_(m.weight, std=0.02)
671
+ if m.bias is not None:
672
+ nn.init.constant_(m.bias, 0)
673
+ elif isinstance(m, nn.LayerNorm):
674
+ nn.init.constant_(m.bias, 0)
675
+ nn.init.constant_(m.weight, 1.0)
676
+
677
+ def get_num_layers(self):
678
+ return len(self.blocks)
679
+
680
+ def lock(self, unlocked_groups=0, freeze_bn_stats=False):
681
+ assert (
682
+ unlocked_groups == 0
683
+ ), "partial locking not currently supported for this model"
684
+ for param in self.parameters():
685
+ param.requires_grad = False
686
+
687
+ @torch.jit.ignore
688
+ def set_grad_checkpointing(self, enable=True):
689
+ self.grad_checkpointing = enable
690
+
691
+ @torch.jit.ignore
692
+ def no_weight_decay(self):
693
+ return {"pos_embed", "cls_token"}
694
+
695
+ def get_classifier(self):
696
+ return self.head
697
+
698
+ def reset_classifier(self, num_classes, global_pool=""):
699
+ self.num_classes = num_classes
700
+ self.head = (
701
+ nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
702
+ )
703
+
704
+ def forward_features(self, x, return_all_features=False, return_all_layers=False):
705
+ x = self.patch_embed(x)
706
+ batch_size, seq_len, _ = x.size()
707
+
708
+ cls_tokens = self.cls_token.expand(
709
+ batch_size, -1, -1
710
+ ) # stole cls_tokens impl from Phil Wang, thanks
711
+ x = torch.cat((cls_tokens, x), dim=1)
712
+ if self.pos_embed is not None:
713
+ x = x + self.pos_embed
714
+ x = self.pos_drop(x)
715
+
716
+ # a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in
717
+ if os.getenv("RoPE") == "1":
718
+ if self.training and not isinstance(self.patch_dropout, nn.Identity):
719
+ x, patch_indices_keep = self.patch_dropout(x)
720
+ self.rope.forward = partial(
721
+ self.rope.forward, patch_indices_keep=patch_indices_keep
722
+ )
723
+ else:
724
+ self.rope.forward = partial(self.rope.forward, patch_indices_keep=None)
725
+ x = self.patch_dropout(x)
726
+ else:
727
+ x = self.patch_dropout(x)
728
+
729
+ rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
730
+
731
+ all_x = []
732
+ for blk in self.blocks:
733
+ if self.grad_checkpointing:
734
+ x = checkpoint(blk, x, (rel_pos_bias,))
735
+ else:
736
+ x = blk(x, rel_pos_bias=rel_pos_bias)
737
+
738
+ if return_all_layers:
739
+ all_x.append(x)
740
+
741
+ if not return_all_features:
742
+ x = self.norm(x)
743
+ if self.fc_norm is not None:
744
+ return self.fc_norm(x.mean(1))
745
+ else:
746
+ return x[:, 0]
747
+ return x if not return_all_layers else all_x
748
+
749
+ def forward(self, x, return_all_features=False, return_all_layers=False):
750
+ if return_all_features:
751
+ return self.forward_features(x, return_all_features, return_all_layers)
752
+ x = self.forward_features(x)
753
+ x = self.head(x)
754
+ return x
755
+
756
+
757
+ @dataclass
758
+ class CLIPVisionCfg:
759
+ layers: Union[Tuple[int, int, int, int], int] = 12
760
+ width: int = 768
761
+ head_width: int = 64
762
+ mlp_ratio: float = 4.0
763
+ patch_size: int = 16
764
+ image_size: Union[Tuple[int, int], int] = 224
765
+ ls_init_value: Optional[float] = None # layer scale initial value
766
+ patch_dropout: float = 0.0 # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
767
+ global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
768
+ drop_path_rate: Optional[float] = None # drop path rate
769
+ timm_model_name: str = (
770
+ None # a valid model name overrides layers, width, patch_size
771
+ )
772
+ timm_model_pretrained: bool = (
773
+ False # use (imagenet) pretrained weights for named model
774
+ )
775
+ timm_pool: str = ( # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
776
+ "avg"
777
+ )
778
+ timm_proj: str = ( # linear projection for timm model output ('linear', 'mlp', '')
779
+ "linear"
780
+ )
781
+ timm_proj_bias: bool = False # enable bias final projection
782
+ eva_model_name: str = (
783
+ None # a valid eva model name overrides layers, width, patch_size
784
+ )
785
+ qkv_bias: bool = True
786
+ fusedLN: bool = False
787
+ embed_dim: int = 1024
788
+ xattn: bool = False
789
+ postnorm: bool = False
790
+ rope: bool = False
791
+ pt_hw_seq_len: int = 16 # 224/14
792
+ intp_freq: bool = False
793
+ naiveswiglu: bool = False
794
+ subln: bool = False
795
+
796
+
797
+ def broadcat(tensors, dim=-1):
798
+ num_tensors = len(tensors)
799
+ shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
800
+ assert len(shape_lens) == 1, "tensors must all have the same number of dimensions"
801
+ shape_len = list(shape_lens)[0]
802
+ dim = (dim + shape_len) if dim < 0 else dim
803
+ dims = list(zip(*map(lambda t: list(t.shape), tensors)))
804
+ expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
805
+ assert all(
806
+ [*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]
807
+ ), "invalid dimensions for broadcastable concatenation"
808
+ max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
809
+ expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
810
+ expanded_dims.insert(dim, (dim, dims[dim]))
811
+ expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
812
+ tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
813
+ return torch.cat(tensors, dim=dim)
814
+
815
+
816
+ def rotate_half(x):
817
+ x = rearrange(x, "... (d r) -> ... d r", r=2)
818
+ x1, x2 = x.unbind(dim=-1)
819
+ x = torch.stack((-x2, x1), dim=-1)
820
+ return rearrange(x, "... d r -> ... (d r)")
821
+
822
+
823
+ class VisionRotaryEmbedding(nn.Module):
824
+ def __init__(
825
+ self,
826
+ dim,
827
+ pt_seq_len,
828
+ ft_seq_len=None,
829
+ custom_freqs=None,
830
+ freqs_for="lang",
831
+ theta=10000,
832
+ max_freq=10,
833
+ num_freqs=1,
834
+ ):
835
+ super().__init__()
836
+ if custom_freqs:
837
+ freqs = custom_freqs
838
+ elif freqs_for == "lang":
839
+ freqs = 1.0 / (
840
+ theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
841
+ )
842
+ elif freqs_for == "pixel":
843
+ freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi
844
+ elif freqs_for == "constant":
845
+ freqs = torch.ones(num_freqs).float()
846
+ else:
847
+ raise ValueError(f"unknown modality {freqs_for}")
848
+
849
+ if ft_seq_len is None:
850
+ ft_seq_len = pt_seq_len
851
+ t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len
852
+
853
+ freqs_h = torch.einsum("..., f -> ... f", t, freqs)
854
+ freqs_h = repeat(freqs_h, "... n -> ... (n r)", r=2)
855
+
856
+ freqs_w = torch.einsum("..., f -> ... f", t, freqs)
857
+ freqs_w = repeat(freqs_w, "... n -> ... (n r)", r=2)
858
+
859
+ freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim=-1)
860
+
861
+ self.register_buffer("freqs_cos", freqs.cos())
862
+ self.register_buffer("freqs_sin", freqs.sin())
863
+
864
+ logging.info(f"Shape of rope freq: {self.freqs_cos.shape}")
865
+
866
+ def forward(self, t, start_index=0):
867
+ rot_dim = self.freqs_cos.shape[-1]
868
+ end_index = start_index + rot_dim
869
+ assert rot_dim <= t.shape[-1], (
870
+ f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in"
871
+ f" all the positions {rot_dim}"
872
+ )
873
+ t_left, t, t_right = (
874
+ t[..., :start_index],
875
+ t[..., start_index:end_index],
876
+ t[..., end_index:],
877
+ )
878
+ t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin)
879
+
880
+ return torch.cat((t_left, t, t_right), dim=-1)
881
+
882
+
883
+ class VisionRotaryEmbeddingFast(nn.Module):
884
+ def __init__(
885
+ self,
886
+ dim,
887
+ pt_seq_len,
888
+ ft_seq_len=None,
889
+ custom_freqs=None,
890
+ freqs_for="lang",
891
+ theta=10000,
892
+ max_freq=10,
893
+ num_freqs=1,
894
+ patch_dropout=0.0,
895
+ ):
896
+ super().__init__()
897
+ if custom_freqs:
898
+ freqs = custom_freqs
899
+ elif freqs_for == "lang":
900
+ freqs = 1.0 / (
901
+ theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
902
+ )
903
+ elif freqs_for == "pixel":
904
+ freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi
905
+ elif freqs_for == "constant":
906
+ freqs = torch.ones(num_freqs).float()
907
+ else:
908
+ raise ValueError(f"unknown modality {freqs_for}")
909
+
910
+ if ft_seq_len is None:
911
+ ft_seq_len = pt_seq_len
912
+ t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len
913
+
914
+ freqs = torch.einsum("..., f -> ... f", t, freqs)
915
+ freqs = repeat(freqs, "... n -> ... (n r)", r=2)
916
+ freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim=-1)
917
+
918
+ freqs_cos = freqs.cos().view(-1, freqs.shape[-1])
919
+ freqs_sin = freqs.sin().view(-1, freqs.shape[-1])
920
+
921
+ self.patch_dropout = patch_dropout
922
+
923
+ self.register_buffer("freqs_cos", freqs_cos)
924
+ self.register_buffer("freqs_sin", freqs_sin)
925
+
926
+ logging.info(f"Shape of rope freq: {self.freqs_cos.shape}")
927
+
928
+ def forward(self, t, patch_indices_keep=None):
929
+ if patch_indices_keep is not None:
930
+ batch = t.size()[0]
931
+ batch_indices = torch.arange(batch)
932
+ batch_indices = batch_indices[..., None]
933
+
934
+ freqs_cos = repeat(
935
+ self.freqs_cos, "i j -> n i m j", n=t.shape[0], m=t.shape[1]
936
+ )
937
+ freqs_sin = repeat(
938
+ self.freqs_sin, "i j -> n i m j", n=t.shape[0], m=t.shape[1]
939
+ )
940
+
941
+ freqs_cos = freqs_cos[batch_indices, patch_indices_keep]
942
+ freqs_cos = rearrange(freqs_cos, "n i m j -> n m i j")
943
+ freqs_sin = freqs_sin[batch_indices, patch_indices_keep]
944
+ freqs_sin = rearrange(freqs_sin, "n i m j -> n m i j")
945
+
946
+ return t * freqs_cos + rotate_half(t) * freqs_sin
947
+
948
+ return t * self.freqs_cos + rotate_half(t) * self.freqs_sin
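A minimal sanity check, not part of the committed file: assuming the VisionRotaryEmbeddingFast class above is in scope, the sketch below applies the fast 2-D rotary embedding to dummy per-head patch features. The sizes are made up for the demo; the only constraints are that the last dimension equals 2 * dim and that the token axis covers an ft_seq_len x ft_seq_len patch grid.

import torch

rope = VisionRotaryEmbeddingFast(dim=32, pt_seq_len=16, ft_seq_len=24)
tokens = torch.randn(2, 12, 24 * 24, 64)   # (batch, heads, patches, head_dim = 2 * dim)
rotated = rope(tokens)
assert rotated.shape == tokens.shape
# Each adjacent feature pair is rotated by a position-dependent angle, so token norms are preserved.
print(torch.allclose(tokens.norm(dim=-1), rotated.norm(dim=-1), atol=1e-4))

Setting ft_seq_len different from pt_seq_len rescales the position grid relative to pre-training, which is what the intp_freq option in EVAVisionTransformer relies on.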
flamingo.py ADDED
@@ -0,0 +1,261 @@
1
+ import inspect
2
+ import torch
3
+ from einops import rearrange
4
+ from torch import nn
5
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
6
+
7
+ from .helpers import PerceiverResampler
8
+
9
+
10
+ def unwrap_fsdp(m):
11
+ if isinstance(m, FSDP):
12
+ return unwrap_fsdp(m.module)
13
+ return m
14
+
15
+
16
+ def accepts_parameter(func, parameter_name):
17
+ signature = inspect.signature(func)
18
+ return parameter_name in signature.parameters
19
+
20
+
21
+ class Flamingo(nn.Module):
22
+ def __init__(
23
+ self,
24
+ vision_encoder: nn.Module,
25
+ lang_encoder: nn.Module,
26
+ eoc_token_id: int,
27
+ media_token_id: int,
28
+ vis_dim: int,
29
+ cross_attn_every_n_layers: int = 1,
30
+ gradient_checkpointing: bool = False,
31
+ enable_init_network_params: bool = False,
32
+ initializer_range: float = 0.02,
33
+ ):
34
+ """
35
+ Args:
36
+ vision_encoder (nn.Module): HF CLIPModel
37
+ lang_encoder (nn.Module): HF causal language model
38
+ eoc_token_id (int): Token id for <|endofchunk|>
39
+ media_token_id (int): Token id for <image>
40
+ vis_dim (int): Dimension of the visual features.
41
+ Visual features are projected to match this shape along the last dimension.
42
+ cross_attn_every_n_layers (int, optional): How often to apply cross attention after each transformer layer. Defaults to 1.
43
+ """
44
+ super().__init__()
45
+ self.eoc_token_id = eoc_token_id
46
+ self.media_token_id = media_token_id
47
+ self.vis_dim = vis_dim
48
+ if hasattr(lang_encoder.config, "d_model"):
49
+ self.lang_dim = lang_encoder.config.d_model # mpt uses d_model
50
+ else:
51
+ self.lang_dim = lang_encoder.config.hidden_size
52
+
53
+ self.vision_encoder = (
54
+ vision_encoder.visual
55
+ if hasattr(vision_encoder, "visual")
56
+ else vision_encoder
57
+ )
58
+ self.perceiver = PerceiverResampler(
59
+ dim=self.vis_dim,
60
+ enable_init_network_params=enable_init_network_params,
61
+ initializer_range=initializer_range,
62
+ gradient_checkpointing=gradient_checkpointing,
63
+ )
64
+ self.lang_encoder = lang_encoder
65
+ self.lang_encoder.init_flamingo(
66
+ media_token_id=media_token_id,
67
+ lang_hidden_size=self.lang_dim,
68
+ vis_hidden_size=self.vis_dim,
69
+ cross_attn_every_n_layers=cross_attn_every_n_layers,
70
+ gradient_checkpointing=gradient_checkpointing,
71
+ enable_init_network_params=enable_init_network_params,
72
+ initializer_range=initializer_range,
73
+ )
74
+ self._use_gradient_checkpointing = gradient_checkpointing
75
+ self.perceiver._use_gradient_checkpointing = gradient_checkpointing
76
+
77
+ def forward(
78
+ self,
79
+ vision_x: torch.Tensor,
80
+ lang_x: torch.Tensor,
81
+ attention_mask: torch.Tensor = None,
82
+ labels: torch.Tensor = None,
83
+ clear_conditioned_layers: bool = True,
84
+ past_key_values=None,
85
+ use_cache: bool = False,
86
+ ):
87
+ """
88
+ Forward pass of Flamingo.
89
+
90
+ Args:
91
+ vision_x (torch.Tensor): Vision input
92
+ shape (B, T_img, F, C, H, W) with F=1
93
+ lang_x (torch.Tensor): Language input ids
94
+ shape (B, T_txt)
95
+ attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
96
+ labels (torch.Tensor, optional): Labels. Defaults to None.
97
+ clear_conditioned_layers: if True, clear the conditioned layers
98
+ once the forward pass is completed. Set this to False if the
99
+ same set of images will be reused in another subsequent
100
+ forward pass.
101
+ past_key_values: pre-computed values to pass to language model.
102
+ See past_key_values documentation in Hugging Face
103
+ CausalLM models.
104
+ use_cache: whether to use cached key values. See use_cache
105
+ documentation in Hugging Face CausalLM models.
106
+ """
107
+ assert (
108
+ self.lang_encoder.initialized_flamingo
109
+ ), "Flamingo layers are not initialized. Please call `init_flamingo` first."
110
+
111
+ assert (
112
+ self.lang_encoder._use_cached_vision_x or vision_x is not None
113
+ ), "Must provide either vision_x or have precached media using cache_media()."
114
+
115
+ if self.lang_encoder._use_cached_vision_x:
116
+ # Case: use cached; vision_x should be cached and other
117
+ # vision-related inputs should not be provided.
118
+ assert vision_x is None, (
119
+ "Expect vision_x to be None when media has been cached using"
120
+ " cache_media(). Try uncache_media() first."
121
+ )
122
+ assert self.lang_encoder.is_conditioned()
123
+
124
+ else:
125
+ # Case: do not use caching (i.e. this is a standard forward pass);
126
+ self._encode_vision_x(vision_x=vision_x)
127
+ self._condition_media_locations(input_ids=lang_x)
128
+
129
+ output = self.lang_encoder(
130
+ input_ids=lang_x,
131
+ attention_mask=attention_mask,
132
+ labels=labels,
133
+ past_key_values=past_key_values,
134
+ use_cache=use_cache,
135
+ )
136
+
137
+ if clear_conditioned_layers:
138
+ self.lang_encoder.clear_conditioned_layers()
139
+
140
+ return output
141
+
142
+ def generate(
143
+ self,
144
+ vision_x: torch.Tensor,
145
+ lang_x: torch.Tensor,
146
+ attention_mask: torch.Tensor = None,
147
+ **kwargs,
148
+ ):
149
+ """
150
+ Generate text conditioned on vision and language inputs.
151
+
152
+ Args:
153
+ vision_x (torch.Tensor): Vision input
154
+ shape (B, T_img, F, C, H, W)
155
+ images in the same chunk are collated along T_img, and frames are collated along F
156
+ currently only F=1 is supported (single-frame videos)
157
+ lang_x (torch.Tensor): Language input
158
+ shape (B, T_txt)
159
+ **kwargs: see generate documentation in Hugging Face CausalLM models. Some notable kwargs:
160
+ max_length (int, optional): Maximum length of the output. Defaults to None.
161
+ attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
162
+ num_beams (int, optional): Number of beams. Defaults to 1.
163
+ max_new_tokens (int, optional): Maximum new tokens. Defaults to None.
164
+ temperature (float, optional): Temperature. Defaults to 1.0.
165
+ top_k (int, optional): Top k. Defaults to 50.
166
+ top_p (float, optional): Top p. Defaults to 1.0.
167
+ no_repeat_ngram_size (int, optional): No repeat ngram size. Defaults to 0.
168
+ length_penalty (float, optional): Length penalty. Defaults to 1.0.
169
+ num_return_sequences (int, optional): Number of return sequences. Defaults to 1.
170
+ do_sample (bool, optional): Do sample. Defaults to False.
171
+ early_stopping (bool, optional): Early stopping. Defaults to False.
172
+ Returns:
173
+ torch.Tensor: lang_x with generated tokens appended to it
174
+ """
175
+ num_beams = kwargs.pop("num_beams", 1)
176
+ if num_beams > 1:
177
+ vision_x = vision_x.repeat_interleave(num_beams, dim=0)
178
+
179
+ self.lang_encoder._use_cached_vision_x = True
180
+ self._encode_vision_x(vision_x=vision_x)
181
+
182
+ # eos_token_id = kwargs.pop("eos_token_id", self.eoc_token_id)
183
+ output = self.lang_encoder.generate(
184
+ input_ids=lang_x,
185
+ attention_mask=attention_mask,
186
+ # eos_token_id=eos_token_id,
187
+ num_beams=num_beams,
188
+ **kwargs,
189
+ )
190
+
191
+ self.lang_encoder.clear_conditioned_layers()
192
+ self.lang_encoder._use_cached_vision_x = False
193
+ return output
194
+
195
+ def _encode_vision_x(self, vision_x: torch.Tensor):
196
+ """
197
+ Compute media tokens from vision input by passing it through vision encoder and conditioning language model.
198
+ Args:
199
+ vision_x (torch.Tensor): Vision input
200
+ shape (B, T_img, F, C, H, W)
201
+ Images in the same chunk are collated along T_img, and frames are collated along F
202
+ Currently only F=1 is supported (single-frame videos)
203
+
204
+ rearrange code based on https://github.com/dhansmair/flamingo-mini
205
+ """
206
+
207
+ assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
208
+ b, T, F = vision_x.shape[:3]
209
+ assert F == 1, "Only single frame supported"
210
+
211
+ vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
212
+
213
+ with torch.no_grad():
214
+ module_to_inspect = unwrap_fsdp(self.vision_encoder)
215
+ if accepts_parameter(module_to_inspect.forward, "return_all_features"):
216
+ vision_x = self.vision_encoder(vision_x, return_all_features=True)
217
+ else:
218
+ vision_x = self.vision_encoder(vision_x)[1]
219
+
220
+ vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
221
+ vision_x = self.perceiver(vision_x)
222
+
223
+ for layer in self.lang_encoder._get_decoder_layers():
224
+ layer.condition_vis_x(vision_x)
225
+
226
+ def _condition_media_locations(self, input_ids: torch.Tensor):
227
+ """
228
+ Compute the media token locations from lang_x and condition the language model on these.
229
+ Args:
230
+ input_ids (torch.Tensor): Language input
231
+ shape (B, T_txt)
232
+ """
233
+ media_locations = input_ids == self.media_token_id
234
+
235
+ for layer in self.lang_encoder._get_decoder_layers():
236
+ layer.condition_media_locations(media_locations)
237
+
238
+ def cache_media(self, input_ids: torch.Tensor, vision_x: torch.Tensor):
239
+ """
240
+ Pre-cache a prompt/sequence of images / text for log-likelihood evaluations.
241
+ All subsequent calls to forward() will generate attending to the LAST
242
+ image in vision_x.
243
+ This is not meant to be used to cache things for generate().
244
+ Args:
245
+ input_ids (torch.Tensor): Language input
246
+ shape (B, T_txt)
247
+ vision_x (torch.Tensor): Vision input
248
+ shape (B, T_img, F, C, H, W)
249
+ Images in the same chunk are collated along T_img, and frames are collated along F
250
+ Currently only F=1 is supported (single-frame videos)
251
+ """
252
+ self._encode_vision_x(vision_x=vision_x)
253
+ self._condition_media_locations(input_ids=input_ids)
254
+ self.lang_encoder._use_cached_vision_x = True
255
+
256
+ def uncache_media(self):
257
+ """
258
+ Clear all conditioning.
259
+ """
260
+ self.lang_encoder.clear_conditioned_layers()
261
+ self.lang_encoder._use_cached_vision_x = False
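A minimal sketch, not part of the committed file, of the two pieces of bookkeeping that _encode_vision_x and _condition_media_locations above perform before the language model runs: flattening the (B, T_img, F, C, H, W) vision batch for the vision encoder, and deriving the boolean media mask from the <image> token id. The token ids below are made-up values for illustration.

import torch
from einops import rearrange

vision_x = torch.randn(1, 2, 1, 3, 224, 224)          # (B, T_img, F, C, H, W) with F = 1
print(rearrange(vision_x, "b T F c h w -> (b T F) c h w").shape)  # torch.Size([2, 3, 224, 224])

media_token_id = 9                                     # hypothetical id for <image>
input_ids = torch.tensor([[1, 9, 17, 23, 9, 42]])      # (B, T_txt)
print(input_ids == media_token_id)                     # mask passed to condition_media_locations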
flamingo_lm.py ADDED
@@ -0,0 +1,256 @@
1
+ import functools
2
+ import torch.nn as nn
3
+ from torch.utils.checkpoint import checkpoint
4
+
5
+ from transformers.models.mistral.modeling_mistral import MistralDecoderLayer
6
+ from transformers.utils import logging
7
+
8
+ from .helpers import GatedCrossAttentionBlock
9
+ from .utils import getattr_recursive, setattr_recursive
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+
14
+ class FlamingoLayer(nn.Module):
15
+ """
16
+ FlamingoLayer is a wrapper around the GatedCrossAttentionBlock and DecoderLayer.
17
+ """
18
+
19
+ def __init__(
20
+ self, gated_cross_attn_layer, decoder_layer, gradient_checkpointing=False
21
+ ):
22
+ super().__init__()
23
+ self.gated_cross_attn_layer = gated_cross_attn_layer
24
+ self.decoder_layer = decoder_layer
25
+ self.vis_x = None
26
+ self.media_locations = None
27
+ if self.gated_cross_attn_layer is not None:
28
+ self.gated_cross_attn_layer._use_gradient_checkpointing = (
29
+ gradient_checkpointing
30
+ )
31
+ self.decoder_layer._use_gradient_checkpointing = gradient_checkpointing
32
+ self._use_gradient_checkpointing = gradient_checkpointing
33
+ if self._use_gradient_checkpointing:
34
+ self.gradient_checkpointing_enable()
35
+
36
+ def is_conditioned(self) -> bool:
37
+ """Check whether the layer is conditioned."""
38
+ return self.vis_x is not None and self.media_locations is not None
39
+
40
+ # Used this great idea from this implementation of Flamingo (https://github.com/dhansmair/flamingo-mini/)
41
+ def condition_vis_x(self, vis_x):
42
+ self.vis_x = vis_x
43
+
44
+ def condition_media_locations(self, media_locations):
45
+ self.media_locations = media_locations
46
+
47
+ def condition_use_cached_media(self, use_cached_media):
48
+ self.use_cached_media = use_cached_media
49
+
50
+ def forward(
51
+ self,
52
+ lang_x,
53
+ attention_mask=None,
54
+ **decoder_layer_kwargs,
55
+ ):
56
+ # Cross attention
57
+ if self.gated_cross_attn_layer is not None:
58
+ if self.vis_x is None:
59
+ raise ValueError("vis_x must be conditioned before forward pass")
60
+
61
+ if self.media_locations is None:
62
+ raise ValueError(
63
+ "media_locations must be conditioned before forward pass"
64
+ )
65
+
66
+ lang_x = self.gated_cross_attn_layer(
67
+ lang_x,
68
+ self.vis_x,
69
+ media_locations=self.media_locations,
70
+ use_cached_media=self.use_cached_media,
71
+ )
72
+
73
+ # Normal decoder layer
74
+ if (
75
+ self._use_gradient_checkpointing
76
+ and self.training
77
+ and isinstance(self.decoder_layer, MistralDecoderLayer)
78
+ ):
79
+ if (
80
+ "use_cache" in decoder_layer_kwargs
81
+ and decoder_layer_kwargs["use_cache"] is True
82
+ ):
83
+ logger.warning_once(
84
+ "`use_cache=True` is incompatible with gradient checkpointing."
85
+ " Setting `use_cache=False`..."
86
+ )
87
+ decoder_layer_kwargs["use_cache"] = False
88
+ # lang_x = self._gradient_checkpointing_func(
89
+ # self.decoder_layer.__call__,
90
+ # lang_x, attention_mask=attention_mask, **decoder_layer_kwargs
91
+ # )
92
+
93
+ # Only work for Mistral
94
+ lang_x = self._gradient_checkpointing_func(
95
+ self.decoder_layer.__call__,
96
+ lang_x,
97
+ attention_mask,
98
+ decoder_layer_kwargs["position_ids"],
99
+ decoder_layer_kwargs["past_key_value"],
100
+ decoder_layer_kwargs["output_attentions"],
101
+ decoder_layer_kwargs["use_cache"],
102
+ )
103
+ else:
104
+ lang_x = self.decoder_layer(
105
+ lang_x, attention_mask=attention_mask, **decoder_layer_kwargs
106
+ )
107
+ return lang_x
108
+
109
+ def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
110
+ """
111
+ Activates gradient checkpointing for the current model.
112
+
113
+ Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
114
+ activations".
115
+
116
+ We pass the `__call__` method of the modules instead of `forward` because `__call__` attaches all the hooks of
117
+ the module. https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
118
+
119
+ Args:
120
+ gradient_checkpointing_kwargs (dict, *optional*):
121
+ Additional keyword arguments passed along to the `torch.utils.checkpoint.checkpoint` function.
122
+ """
123
+ if gradient_checkpointing_kwargs is None:
124
+ gradient_checkpointing_kwargs = {}
125
+
126
+ gradient_checkpointing_func = functools.partial(
127
+ checkpoint, **gradient_checkpointing_kwargs
128
+ )
129
+
130
+ self._gradient_checkpointing_func = gradient_checkpointing_func
131
+
132
+ if getattr(self, "_hf_peft_config_loaded", False):
133
+ # When using PEFT + gradient checkpointing + Trainer we need to make sure the input has requires_grad=True
134
+ # we do it also on PEFT: https://github.com/huggingface/peft/blob/85013987aa82aa1af3da1236b6902556ce3e483e/src/peft/peft_model.py#L334
135
+ # When training with PEFT, only LoRA layers will have requires grad set to True, but the output of frozen layers need to propagate
136
+ # the gradients to make sure the gradient flows.
137
+ self.enable_input_require_grads()
138
+
139
+
140
+ class FlamingoLMMixin(nn.Module):
141
+ """
142
+ Mixin to add cross-attention layers to a language model.
143
+ """
144
+
145
+ def set_decoder_layers_attr_name(self, decoder_layers_attr_name):
146
+ self.decoder_layers_attr_name = decoder_layers_attr_name
147
+
148
+ def _get_decoder_layers(self):
149
+ return getattr_recursive(self, self.decoder_layers_attr_name)
150
+
151
+ def _set_decoder_layers(self, value):
152
+ setattr_recursive(self, self.decoder_layers_attr_name, value)
153
+
154
+ def init_flamingo(
155
+ self,
156
+ media_token_id,
157
+ lang_hidden_size,
158
+ vis_hidden_size,
159
+ cross_attn_every_n_layers,
160
+ *,
161
+ enable_init_network_params=False,
162
+ initializer_range=0.02,
163
+ gradient_checkpointing=False,
164
+ ):
165
+ """
166
+ Initialize Flamingo by adding a new gated cross attn to the decoder. Store the media token id for computing the media locations.
167
+ """
168
+ self.old_decoder_blocks = self._get_decoder_layers()
169
+ self.gated_cross_attn_layers = nn.ModuleList(
170
+ [
171
+ (
172
+ GatedCrossAttentionBlock(
173
+ dim=lang_hidden_size,
174
+ dim_visual=vis_hidden_size,
175
+ ff_mult=4,
176
+ enable_init_network_params=enable_init_network_params,
177
+ initializer_range=initializer_range,
178
+ gradient_checkpointing=gradient_checkpointing,
179
+ )
180
+ if (layer_idx + 1) % cross_attn_every_n_layers == 0
181
+ else None
182
+ )
183
+ for layer_idx, _ in enumerate(self._get_decoder_layers())
184
+ ]
185
+ )
186
+ self.init_flamingo_layers(gradient_checkpointing)
187
+ self.media_token_id = media_token_id
188
+ self.initialized_flamingo = True
189
+ self._use_cached_vision_x = False
190
+ self.gradient_checkpointing = gradient_checkpointing
191
+
192
+ def init_flamingo_layers(self, gradient_checkpointing):
193
+ """
194
+ Re-initializes the FlamingoLayers.
195
+ Propagates any changes made to self.gated_cross_attn_layers or self.old_decoder_blocks
196
+ """
197
+ self._set_decoder_layers(
198
+ nn.ModuleList(
199
+ [
200
+ FlamingoLayer(
201
+ gated_cross_attn_layer, decoder_layer, gradient_checkpointing
202
+ )
203
+ for gated_cross_attn_layer, decoder_layer in zip(
204
+ self.gated_cross_attn_layers, self.old_decoder_blocks
205
+ )
206
+ ]
207
+ )
208
+ )
209
+
210
+ def forward(self, input_ids, attention_mask, **kwargs):
211
+ """Condition the Flamingo layers on the media locations before forward()"""
212
+ if not self.initialized_flamingo:
213
+ raise ValueError(
214
+ "Flamingo layers are not initialized. Please call `init_flamingo`"
215
+ " first."
216
+ )
217
+
218
+ media_locations = input_ids == self.media_token_id
219
+
220
+ # if there are media already cached and we're generating and there are no media tokens in the input,
221
+ # we'll assume that ALL input tokens should attend to the last previous media that is cached.
222
+ # this is especially important for HF generate() compatibility, since generate() calls forward()
223
+ # repeatedly one token at a time (with no media tokens).
224
+ # without this check, the model would not attend to any images when generating (after the first token)
225
+ use_cached_media_locations = (
226
+ self._use_cached_vision_x
227
+ and self.is_conditioned()
228
+ and not media_locations.any()
229
+ )
230
+
231
+ for layer in self._get_decoder_layers():
232
+ if not use_cached_media_locations:
233
+ layer.condition_media_locations(media_locations)
234
+ layer.condition_use_cached_media(use_cached_media_locations)
235
+
236
+ # package arguments for the other parent's forward. since we don't know the order of the arguments,
237
+ # make them all kwargs
238
+ kwargs["input_ids"] = input_ids
239
+ kwargs["attention_mask"] = attention_mask
240
+
241
+ # Mistral also needs 'use_cache' set to False when gradient checkpointing is enabled
242
+ if self.gradient_checkpointing and isinstance(
243
+ self.old_decoder_blocks[0], MistralDecoderLayer
244
+ ):
245
+ kwargs["use_cache"] = False
246
+ return super().forward(**kwargs) # Call the other parent's forward method
247
+
248
+ def is_conditioned(self) -> bool:
249
+ """Check whether all decoder layers are already conditioned."""
250
+ return all(l.is_conditioned() for l in self._get_decoder_layers())
251
+
252
+ def clear_conditioned_layers(self):
253
+ for layer in self._get_decoder_layers():
254
+ layer.condition_vis_x(None)
255
+ layer.condition_media_locations(None)
256
+ layer.condition_use_cached_media(None)
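As a small illustration of the placement rule used by init_flamingo above (not part of the committed file): gated cross-attention is attached to a decoder layer whenever (layer_idx + 1) % cross_attn_every_n_layers == 0, so a hypothetical 32-layer decoder with cross_attn_every_n_layers=4 gets new blocks at 0-indexed layers 3, 7, 11, and so on.

num_decoder_layers = 32          # illustrative depth, not the shipped model's
cross_attn_every_n_layers = 4
xattn_layers = [
    idx for idx in range(num_decoder_layers)
    if (idx + 1) % cross_attn_every_n_layers == 0
]
print(xattn_layers)              # [3, 7, 11, 15, 19, 23, 27, 31]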
generation_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "do_sample": true,
3
+ "max_new_tokens": 512,
4
+ "top_k": 0,
5
+ "top_p": 0.5,
6
+ "transformers_version": "4.31.0"
7
+ }
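A minimal sketch of how these defaults get picked up, assuming the directory containing this generation_config.json is the current working directory; transformers reads the file as the model's default generation settings.

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("./")
print(gen_cfg.do_sample, gen_cfg.top_p, gen_cfg.max_new_tokens)   # True 0.5 512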
helpers.py ADDED
@@ -0,0 +1,410 @@
1
+ """
2
+ Based on: https://github.com/lucidrains/flamingo-pytorch
3
+ """
4
+
5
+ import torch
6
+ from einops import rearrange, repeat
7
+ from torch import einsum, nn
8
+
9
+ from einops_exts import rearrange_many
10
+
11
+ try:
12
+ from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint
13
+ except Exception:  # fall back to torch's checkpoint if deepspeed is unavailable
14
+ from torch.utils.checkpoint import checkpoint
15
+
16
+
17
+ def exists(val):
18
+ return val is not None
19
+
20
+
21
+ def FeedForward(
22
+ dim,
23
+ mult=4,
24
+ enable_init_network_params=False,
25
+ initializer_range=0.02,
26
+ ):
27
+ inner_dim = int(dim * mult)
28
+ net = nn.Sequential(
29
+ nn.LayerNorm(dim),
30
+ nn.Linear(dim, inner_dim, bias=False),
31
+ nn.GELU(),
32
+ nn.Linear(inner_dim, dim, bias=False),
33
+ )
34
+
35
+ if enable_init_network_params:
36
+ # then start the initialization
37
+ net[0].weight.data.normal_(mean=0.0, std=initializer_range)
38
+ net[0].bias.data.zero_()
39
+ net[1].weight.data.normal_(mean=0.0, std=initializer_range)
40
+ net[3].weight.data.normal_(mean=0.0, std=initializer_range)
41
+ return net
42
+
43
+
44
+ class PerceiverAttention(nn.Module):
45
+ def __init__(
46
+ self,
47
+ *,
48
+ dim,
49
+ dim_head=64,
50
+ heads=8,
51
+ enable_init_network_params=False,
52
+ initializer_range=0.02,
53
+ ):
54
+ super().__init__()
55
+
56
+ self.scale = dim_head**-0.5
57
+ self.heads = heads
58
+ self.initializer_range = initializer_range
59
+
60
+ inner_dim = dim_head * heads
61
+
62
+ self.norm_media = nn.LayerNorm(dim)
63
+ self.norm_latents = nn.LayerNorm(dim)
64
+
65
+ self.to_q = nn.Linear(dim, inner_dim, bias=False)
66
+ self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
67
+ self.to_out = nn.Linear(inner_dim, dim, bias=False)
68
+
69
+ if enable_init_network_params:
70
+ self.apply(self._init_weights)
71
+
72
+ def _init_weights(self, module):
73
+ if isinstance(module, nn.Linear):
74
+ # Slightly different from the TF version which uses truncated_normal for initialization
75
+ # cf https://github.com/pytorch/pytorch/pull/5617
76
+ module.weight.data.normal_(mean=0.0, std=self.initializer_range)
77
+ if module.bias is not None:
78
+ module.bias.data.zero_()
79
+
80
+ elif isinstance(module, nn.LayerNorm):
81
+ module.bias.data.zero_()
82
+ module.weight.data.fill_(1.0)
83
+
84
+ def forward(self, x, latents):
85
+ """
86
+ Args:
87
+ x (torch.Tensor): image features
88
+ shape (b, T, n1, D)
89
+ latent (torch.Tensor): latent features
90
+ shape (b, T, n2, D)
91
+ """
92
+ x = self.norm_media(x)
93
+ latents = self.norm_latents(latents.contiguous())
94
+
95
+ h = self.heads
96
+
97
+ q = self.to_q(latents)
98
+ kv_input = torch.cat((x, latents), dim=-2)
99
+ k, v = self.to_kv(kv_input).chunk(2, dim=-1)
100
+
101
+ q, k, v = rearrange_many((q, k, v), "b t n (h d) -> b h t n d", h=h)
102
+ q = q * self.scale
103
+ # attention
104
+ sim = einsum("... i d, ... j d -> ... i j", q, k)
105
+ sim = sim - sim.amax(dim=-1, keepdim=True).detach()
106
+ attn = sim.softmax(dim=-1)
107
+
108
+ out = einsum("... i j, ... j d -> ... i d", attn, v)
109
+ out = rearrange(out, "b h t n d -> b t n (h d)", h=h)
110
+ return self.to_out(out)
111
+
112
+
113
+ class PerceiverResampler(nn.Module):
114
+ def __init__(
115
+ self,
116
+ *,
117
+ dim,
118
+ depth=6,
119
+ dim_head=64,
120
+ heads=8,
121
+ num_latents=64,
122
+ max_num_media=None,
123
+ max_num_frames=None,
124
+ ff_mult=4,
125
+ enable_init_network_params=False,
126
+ initializer_range=0.02,
127
+ gradient_checkpointing=False,
128
+ ):
129
+ super().__init__()
130
+
131
+ self.gradient_checkpointing = gradient_checkpointing
132
+ self.initializer_range = initializer_range
133
+
134
+ self.latents = nn.Parameter(torch.randn(num_latents, dim))
135
+ self.frame_embs = (
136
+ nn.Parameter(torch.randn(max_num_frames, dim))
137
+ if exists(max_num_frames)
138
+ else None
139
+ )
140
+ self.media_time_embs = (
141
+ nn.Parameter(torch.randn(max_num_media, 1, dim))
142
+ if exists(max_num_media)
143
+ else None
144
+ )
145
+
146
+ self.layers = nn.ModuleList([])
147
+
148
+ for _ in range(depth):
149
+ self.layers.append(
150
+ nn.ModuleList(
151
+ [
152
+ PerceiverAttention(
153
+ dim=dim,
154
+ dim_head=dim_head,
155
+ heads=heads,
156
+ enable_init_network_params=enable_init_network_params,
157
+ initializer_range=initializer_range,
158
+ ),
159
+ FeedForward(
160
+ dim=dim,
161
+ mult=ff_mult,
162
+ enable_init_network_params=enable_init_network_params,
163
+ initializer_range=initializer_range,
164
+ ),
165
+ ]
166
+ )
167
+ )
168
+ # Should this norm layer also change?
169
+ self.norm = nn.LayerNorm(dim)
170
+ if enable_init_network_params:
171
+ self.apply(self._init_weights)
172
+
173
+ def _init_weights(self, module):
174
+ if isinstance(module, nn.Linear):
175
+ # Slightly different from the TF version which uses truncated_normal for initialization
176
+ # cf https://github.com/pytorch/pytorch/pull/5617
177
+ module.weight.data.normal_(mean=0.0, std=self.initializer_range)
178
+ if module.bias is not None:
179
+ module.bias.data.zero_()
180
+
181
+ elif isinstance(module, nn.LayerNorm):
182
+ module.bias.data.zero_()
183
+ module.weight.data.fill_(1.0)
184
+
185
+ elif isinstance(module, nn.Parameter):
186
+ module.data.normal_(mean=0.0, std=self.initializer_range)
187
+
188
+ def forward(self, x):
189
+ """
190
+ Args:
191
+ x (torch.Tensor): image features
192
+ shape (b, T, F, v, D)
193
+ Returns:
194
+ shape (b, T, n, D) where n is self.num_latents
195
+ """
196
+
197
+ b, T, F, v = x.shape[:4]
198
+
199
+ # frame and media time embeddings
200
+ if exists(self.frame_embs):
201
+ frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v)
202
+ x = x + frame_embs
203
+ x = rearrange(
204
+ x, "b T F v d -> b T (F v) d"
205
+ ) # flatten the frame and spatial dimensions
206
+ if exists(self.media_time_embs):
207
+ x = x + self.media_time_embs[:T]
208
+
209
+ # blocks
210
+ latents = repeat(self.latents, "n d -> b T n d", b=b, T=T)
211
+ for attn, ff in self.layers:
212
+ if self.gradient_checkpointing and latents.requires_grad:
213
+ latents = checkpoint(attn, x, (latents)) + latents
214
+ latents = checkpoint(ff, latents) + latents
215
+ else:
216
+ latents = attn(x, latents) + latents
217
+ latents = ff(latents) + latents
218
+
219
+ return self.norm(latents)
220
+
221
+
222
+ # gated cross attention
223
+ class MaskedCrossAttention(nn.Module):
224
+ def __init__(
225
+ self,
226
+ *,
227
+ dim,
228
+ dim_visual,
229
+ dim_head=64,
230
+ heads=8,
231
+ only_attend_immediate_media=True,
232
+ enable_init_network_params=False,
233
+ initializer_range=0.02,
234
+ ):
235
+ super().__init__()
236
+ self.scale = dim_head**-0.5
237
+ self.heads = heads
238
+ self.initializer_range = initializer_range
239
+ inner_dim = dim_head * heads
240
+
241
+ self.norm = nn.LayerNorm(dim)
242
+
243
+ self.to_q = nn.Linear(dim, inner_dim, bias=False)
244
+ self.to_kv = nn.Linear(dim_visual, inner_dim * 2, bias=False)
245
+ self.to_out = nn.Linear(inner_dim, dim, bias=False)
246
+
247
+ # whether for text to only attend to immediate preceding image, or all previous images
248
+ self.only_attend_immediate_media = only_attend_immediate_media
249
+
250
+ if enable_init_network_params:
251
+ self.apply(self._init_weights)
252
+
253
+ def _init_weights(self, module):
254
+ if isinstance(module, nn.Linear):
255
+ # Slightly different from the TF version which uses truncated_normal for initialization
256
+ # cf https://github.com/pytorch/pytorch/pull/5617
257
+ module.weight.data.normal_(mean=0.0, std=self.initializer_range)
258
+ if module.bias is not None:
259
+ module.bias.data.zero_()
260
+
261
+ elif isinstance(module, nn.LayerNorm):
262
+ module.bias.data.zero_()
263
+ module.weight.data.fill_(1.0)
264
+
265
+ def forward(self, x, media, media_locations=None, use_cached_media=False):
266
+ """
267
+ Args:
268
+ x (torch.Tensor): text features
269
+ shape (B, T_txt, D_txt)
270
+ media (torch.Tensor): image features
271
+ shape (B, T_img, n, D_img) where n is the dim of the latents
272
+ media_locations: boolean mask identifying the media tokens in x
273
+ shape (B, T_txt)
274
+ use_cached_media: bool
275
+ If true, treat all of x as if they occur after the last media
276
+ registered in media_locations. T_txt does not need to exactly
277
+ equal media_locations.shape[1] in this case
278
+ """
279
+
280
+ if not use_cached_media:
281
+ assert media_locations.shape[1] == x.shape[1], (
282
+ f"media_location.shape is {media_locations.shape} but x.shape is"
283
+ f" {x.shape}"
284
+ )
285
+
286
+ T_txt = x.shape[1]
287
+ _, T_img, n = media.shape[:3]
288
+ h = self.heads
289
+
290
+ x = self.norm(x.contiguous())
291
+ q = self.to_q(x)
292
+ media = rearrange(media, "b t n d -> b (t n) d")
293
+
294
+ k, v = self.to_kv(media).chunk(2, dim=-1)
295
+
296
+ if exists(media_locations):
297
+ media_time = torch.arange(T_img, device=x.device) + 1
298
+
299
+ if use_cached_media:
300
+ # text time is set to the last cached media location
301
+ text_time = repeat(
302
+ torch.count_nonzero(media_locations, dim=1),
303
+ "b -> b i",
304
+ i=T_txt,
305
+ )
306
+ else:
307
+ # at each boolean of True, increment the time counter (relative to media time)
308
+ text_time = media_locations.cumsum(dim=-1)
309
+
310
+ # text time must equal media time if only attending to most immediate image
311
+ # otherwise, as long as text time is greater than media time (if attending to all previous images / media)
312
+ mask_op = torch.eq if self.only_attend_immediate_media else torch.ge
313
+ text_to_media_mask = mask_op(
314
+ rearrange(text_time, "b i -> b 1 i 1"),
315
+ repeat(media_time, "j -> 1 1 1 (j n)", n=n),
316
+ )
317
+
318
+ if self.only_attend_immediate_media:
319
+ # any text without a preceding media needs to have attention zeroed out
320
+ text_without_media_mask = text_time == 0
321
+ text_without_media_mask = rearrange(
322
+ text_without_media_mask, "b i -> b 1 i 1"
323
+ )
324
+
325
+ q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=h)
326
+ q = q * self.scale
327
+ sim = einsum("... i d, ... j d -> ... i j", q, k)
328
+
329
+ if exists(media_locations):
330
+ sim = sim.masked_fill(~text_to_media_mask, -torch.finfo(sim.dtype).max)
331
+
332
+ sim = sim - sim.amax(dim=-1, keepdim=True).detach()
333
+ attn = sim.softmax(dim=-1)
334
+
335
+ if exists(media_locations) and self.only_attend_immediate_media:
336
+ # any text without a preceding media needs to have attention zeroed out
337
+ attn = attn.masked_fill(text_without_media_mask, 0.0)
338
+
339
+ out = einsum("... i j, ... j d -> ... i d", attn, v)
340
+ out = rearrange(out, "b h n d -> b n (h d)")
341
+ return self.to_out(out)
342
+
343
+
344
+ class GatedCrossAttentionBlock(nn.Module):
345
+ def __init__(
346
+ self,
347
+ *,
348
+ dim,
349
+ dim_visual,
350
+ dim_head=64,
351
+ heads=8,
352
+ ff_mult=4,
353
+ only_attend_immediate_media=True,
354
+ enable_init_network_params=False,
355
+ initializer_range=0.02,
356
+ gradient_checkpointing=False,
357
+ ):
358
+ super().__init__()
359
+ self.attn = MaskedCrossAttention(
360
+ dim=dim,
361
+ dim_visual=dim_visual,
362
+ dim_head=dim_head,
363
+ heads=heads,
364
+ only_attend_immediate_media=only_attend_immediate_media,
365
+ enable_init_network_params=enable_init_network_params,
366
+ initializer_range=initializer_range,
367
+ )
368
+ self.attn_gate = nn.Parameter(torch.tensor([0.0]))
369
+ self.ff = FeedForward(dim, mult=ff_mult)
370
+ self.ff_gate = nn.Parameter(torch.tensor([0.0]))
371
+ self.gradient_checkpointing = gradient_checkpointing
372
+
373
+ def forward(
374
+ self,
375
+ x,
376
+ media,
377
+ media_locations=None,
378
+ use_cached_media=False,
379
+ ):
380
+ if exists(media_locations):
381
+ flag = torch.sum(media_locations, dim=-1)
382
+ flag = torch.where(flag > 0.0, 1.0, 0.0)
383
+ flag = flag.unsqueeze(1).unsqueeze(1).to(torch.bfloat16)
384
+ else:
385
+ flag = 1.0
386
+
387
+ if self.gradient_checkpointing and media.requires_grad:
388
+ x = (
389
+ flag
390
+ * checkpoint(self.attn, x, media, media_locations, use_cached_media)
391
+ * self.attn_gate.tanh()
392
+ + x
393
+ )
394
+ x = flag * checkpoint(self.ff, x) * self.ff_gate.tanh() + x
395
+
396
+ else:
397
+ x = (
398
+ flag
399
+ * self.attn(
400
+ x,
401
+ media,
402
+ media_locations=media_locations,
403
+ use_cached_media=use_cached_media,
404
+ )
405
+ * self.attn_gate.tanh()
406
+ + x
407
+ )
408
+ x = flag * self.ff(x) * self.ff_gate.tanh() + x
409
+
410
+ return x
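A minimal sketch, not part of the committed file, of the time alignment computed inside MaskedCrossAttention.forward above: every text position is tagged with the index of the most recent <image> token, and with only_attend_immediate_media=True (torch.eq) it may attend only to that image's latents; the per-latent repetition of the mask is omitted here for brevity.

import torch

media_locations = torch.tensor([[True, False, False, True, False]])   # (B, T_txt)
media_time = torch.arange(2) + 1                     # images are numbered 1..T_img
text_time = media_locations.cumsum(dim=-1)           # tensor([[1, 1, 1, 2, 2]])
text_to_media_mask = text_time[..., None] == media_time   # (B, T_txt, T_img)
print(text_to_media_mask[0].int())
# tensor([[1, 0],
#         [1, 0],
#         [1, 0],
#         [0, 1],
#         [0, 1]], dtype=torch.int32)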
modeling_infimm_zephyr.py ADDED
@@ -0,0 +1,138 @@
1
+ import importlib
2
+ import math
3
+ from functools import partial
4
+ from typing import TYPE_CHECKING, Any, Callable, Generator, List, Optional, Tuple, Union
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torch.utils.checkpoint
8
+ from torch.cuda.amp import autocast
9
+
10
+ from transformers import GenerationConfig, PreTrainedTokenizer, StoppingCriteriaList
11
+ from transformers.generation.logits_process import LogitsProcessorList
12
+
13
+ if TYPE_CHECKING:
14
+ from transformers.generation.streamers import BaseStreamer
15
+
16
+ from transformers.generation.utils import GenerateOutput
17
+ from transformers.modeling_outputs import (
18
+ BaseModelOutputWithPast,
19
+ CausalLMOutputWithPast,
20
+ )
21
+ from transformers.modeling_utils import PreTrainedModel
22
+ from transformers.models.mistral.configuration_mistral import MistralConfig
23
+ from transformers.models.mistral.modeling_mistral import MistralForCausalLM
24
+ from transformers.utils import logging
25
+
26
+ try:
27
+ from einops import rearrange
28
+ except ImportError:
29
+ rearrange = None
30
+ from torch import nn
31
+
32
+ from .configuration_infimm_zephyr import InfiMMConfig
33
+ from .eva_vit import CLIPVisionCfg, EVAVisionTransformer
34
+ from .flamingo import Flamingo
35
+ from .flamingo_lm import FlamingoLMMixin
36
+ from .helpers import PerceiverResampler
37
+ from .utils import _infer_decoder_layers_attr_name, extend_instance
38
+
39
+ SUPPORT_CUDA = torch.cuda.is_available()
40
+ SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
41
+ SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
42
+
43
+
44
+ class InfiMMPreTrainedModel(PreTrainedModel):
45
+ config_class = InfiMMConfig
46
+ base_model_prefix = "transformer"
47
+ is_parallelizable = False
48
+ supports_gradient_checkpointing = True
49
+
50
+ def __init__(self, *inputs, **kwargs):
51
+ super().__init__(*inputs, **kwargs)
52
+
53
+
54
+ class InfiMMZephyrModel(InfiMMPreTrainedModel):
55
+ def __init__(self, config):
56
+ super().__init__(config)
57
+
58
+ self.vision_config = config.visual
59
+ vision_encoder = self.build_vision_encoder()
60
+ self.language_config = config.language
61
+ language_encoder = self.build_language_encoder()
62
+
63
+ self.model = self.build_flamingo(vision_encoder, language_encoder)
64
+
65
+ def build_vision_encoder(self):
66
+ vision_cfg = CLIPVisionCfg(**self.vision_config)
67
+
68
+ vision_encoder = EVAVisionTransformer(
69
+ img_size=vision_cfg.image_size,
70
+ patch_size=vision_cfg.patch_size,
71
+ num_classes=vision_cfg.embed_dim,
72
+ use_mean_pooling=vision_cfg.global_average_pool, # False
73
+ init_values=vision_cfg.ls_init_value,
74
+ patch_dropout=vision_cfg.patch_dropout,
75
+ embed_dim=vision_cfg.width,
76
+ depth=vision_cfg.layers,
77
+ num_heads=vision_cfg.width // vision_cfg.head_width,
78
+ mlp_ratio=vision_cfg.mlp_ratio,
79
+ qkv_bias=vision_cfg.qkv_bias,
80
+ drop_path_rate=vision_cfg.drop_path_rate,
81
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
82
+ xattn=vision_cfg.xattn,
83
+ rope=vision_cfg.rope,
84
+ postnorm=vision_cfg.postnorm,
85
+ pt_hw_seq_len=vision_cfg.pt_hw_seq_len, # 224/14
86
+ intp_freq=vision_cfg.intp_freq,
87
+ naiveswiglu=vision_cfg.naiveswiglu,
88
+ subln=vision_cfg.subln,
89
+ )
90
+
91
+ return vision_encoder
92
+
93
+ def build_language_encoder(self):
94
+ mistral_config = MistralConfig(**self.language_config)
95
+ lang_encoder = MistralForCausalLM(mistral_config)
96
+ return lang_encoder
97
+
98
+ def build_flamingo(self, vision_encoder, lang_encoder):
99
+ extend_instance(lang_encoder, FlamingoLMMixin)
100
+
101
+ decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
102
+ lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
103
+ # lang_encoder.resize_token_embeddings(self.config.)
104
+
105
+ model = Flamingo(
106
+ vision_encoder,
107
+ lang_encoder,
108
+ self.config.eoc_token_id,
109
+ self.config.image_token_id,
110
+ vis_dim=self.vision_config["width"],
111
+ cross_attn_every_n_layers=self.config.cross_attn_every_n_layers,
112
+ gradient_checkpointing=self.config.use_grad_checkpoint,
113
+ )
114
+
115
+ return model
116
+
117
+ def generate(
118
+ self,
119
+ input_ids,
120
+ attention_mask,
121
+ batch_images,
122
+ min_generation_length: int,
123
+ max_generation_length: int,
124
+ **kwargs,
125
+ ):
126
+ with torch.inference_mode():
127
+ outputs = self.model.generate(
128
+ batch_images,
129
+ input_ids,
130
+ attention_mask,
131
+ min_new_tokens=min_generation_length,
132
+ max_new_tokens=max_generation_length,
133
+ **kwargs,
134
+ )
135
+
136
+ # Extract only the newly generated tokens
137
+ outputs = outputs[:, len(input_ids[0]) :]
138
+ return outputs
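For orientation (illustrative only; the numbers below are placeholders rather than the shipped configuration): build_vision_encoder above derives the ViT hyperparameters from the visual config in exactly this way, with the head count computed as width // head_width and the patch grid fixed by image_size and patch_size.

width, head_width = 1792, 112        # hypothetical EVA-style values
image_size, patch_size = 336, 14
num_heads = width // head_width                  # 16, what num_heads=width // head_width yields
num_patches = (image_size // patch_size) ** 2    # 576 patch tokens per image
print(num_heads, num_patches)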
preprocessor_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_name_or_path": "./",
3
+ "auto_map": {
4
+ "AutoProcessor": "processing_infimm_zephyr.InfiMMZephyrProcessor"
5
+ },
6
+ "image_size": 336
7
+ }
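A minimal usage sketch, assuming this repository has been downloaded locally to "./": the "auto_map" entry above is what routes AutoProcessor to the custom InfiMMZephyrProcessor defined in processing_infimm_zephyr.py, which is why trust_remote_code=True is required.

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("./", trust_remote_code=True)
print(type(processor).__name__)     # InfiMMZephyrProcessor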
processing_infimm_zephyr.py ADDED
@@ -0,0 +1,345 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for InfiMMZephyr.
17
+ """
18
+
19
+ import random
20
+ from typing import List, Optional, Tuple, Union
21
+ import torch
22
+ import torchvision.transforms.functional as F
23
+ from PIL import Image
24
+ from torchvision.transforms import (
25
+ CenterCrop,
26
+ Compose,
27
+ InterpolationMode,
28
+ Normalize,
29
+ Resize,
30
+ ToTensor,
31
+ )
32
+
33
+ from transformers import AutoTokenizer
34
+ from transformers.image_processing_utils import ImageProcessingMixin
35
+ from transformers.processing_utils import ProcessorMixin
36
+ from transformers.tokenization_utils_base import BatchEncoding
37
+
38
+ IMAGE_TOKEN = "<image>"
39
+ END_OF_CHUNK_TOKEN = "<|endofchunk|>"
40
+
41
+ OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
42
+ OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
43
+
44
+
45
+ def _convert_to_rgb(image):
46
+ return image.convert("RGB")
47
+
48
+
49
+ class ResizeKeepRatio:
50
+ """Resize and Keep Ratio
51
+
52
+ Copy & paste from `timm`
53
+ """
54
+
55
+ def __init__(
56
+ self,
57
+ size,
58
+ longest=0.0,
59
+ interpolation=InterpolationMode.BICUBIC,
60
+ random_scale_prob=0.0,
61
+ random_scale_range=(0.85, 1.05),
62
+ random_aspect_prob=0.0,
63
+ random_aspect_range=(0.9, 1.11),
64
+ ):
65
+ if isinstance(size, (list, tuple)):
66
+ self.size = tuple(size)
67
+ else:
68
+ self.size = (size, size)
69
+ self.interpolation = interpolation
70
+ self.longest = float(longest) # [0, 1] where 0 == shortest edge, 1 == longest
71
+ self.random_scale_prob = random_scale_prob
72
+ self.random_scale_range = random_scale_range
73
+ self.random_aspect_prob = random_aspect_prob
74
+ self.random_aspect_range = random_aspect_range
75
+
76
+ @staticmethod
77
+ def get_params(
78
+ img,
79
+ target_size,
80
+ longest,
81
+ random_scale_prob=0.0,
82
+ random_scale_range=(0.85, 1.05),
83
+ random_aspect_prob=0.0,
84
+ random_aspect_range=(0.9, 1.11),
85
+ ):
86
+ """Get parameters"""
87
+ source_size = img.size[::-1] # h, w
88
+ h, w = source_size
89
+ target_h, target_w = target_size
90
+ ratio_h = h / target_h
91
+ ratio_w = w / target_w
92
+ ratio = max(ratio_h, ratio_w) * longest + min(ratio_h, ratio_w) * (
93
+ 1.0 - longest
94
+ )
95
+ if random_scale_prob > 0 and random.random() < random_scale_prob:
96
+ ratio_factor = random.uniform(random_scale_range[0], random_scale_range[1])
97
+ ratio_factor = (ratio_factor, ratio_factor)
98
+ else:
99
+ ratio_factor = (1.0, 1.0)
100
+ if random_aspect_prob > 0 and random.random() < random_aspect_prob:
101
+ aspect_factor = random.uniform(
102
+ random_aspect_range[0], random_aspect_range[1]
103
+ )
104
+ ratio_factor = (
105
+ ratio_factor[0] / aspect_factor,
106
+ ratio_factor[1] * aspect_factor,
107
+ )
108
+ size = [round(x * f / ratio) for x, f in zip(source_size, ratio_factor)]
109
+ return size
110
+
111
+ def __call__(self, img):
112
+ """
113
+ Args:
114
+ img (PIL Image): Image to be cropped and resized.
115
+
116
+ Returns:
117
+ PIL Image: Resized, padded to at least target size, possibly cropped to exactly target size
118
+ """
119
+ size = self.get_params(
120
+ img,
121
+ self.size,
122
+ self.longest,
123
+ self.random_scale_prob,
124
+ self.random_scale_range,
125
+ self.random_aspect_prob,
126
+ self.random_aspect_range,
127
+ )
128
+ img = F.resize(img, size, self.interpolation)
129
+ return img
130
+
131
+ def __repr__(self):
132
+ format_string = self.__class__.__name__ + "(size={0}".format(self.size)
133
+ format_string += f", interpolation={self.interpolation})"
134
+ format_string += f", longest={self.longest:.3f})"
135
+ return format_string
136
+
137
+
138
+ def image_transform(
139
+ image_size: Union[int, Tuple[int, int]],
140
+ mean: Optional[Tuple[float, ...]] = None,
141
+ std: Optional[Tuple[float, ...]] = None,
142
+ resize_mode: Optional[str] = None,
143
+ interpolation: Optional[str] = None,
144
+ ):
145
+ mean = mean or OPENAI_DATASET_MEAN
146
+ if not isinstance(mean, (list, tuple)):
147
+ mean = (mean,) * 3
148
+
149
+ std = std or OPENAI_DATASET_STD
150
+ if not isinstance(std, (list, tuple)):
151
+ std = (std,) * 3
152
+
153
+ interpolation = interpolation or "bicubic"
154
+ assert interpolation in ["bicubic", "bilinear", "random"]
155
+ # NOTE random is ignored for interpolation_mode, so defaults to BICUBIC for inference if set
156
+ interpolation_mode = (
157
+ InterpolationMode.BILINEAR
158
+ if interpolation == "bilinear"
159
+ else InterpolationMode.BICUBIC
160
+ )
161
+
162
+ resize_mode = resize_mode or "shortest"
163
+ assert resize_mode in ("shortest", "longest", "squash")
164
+
165
+ normalize = Normalize(mean=mean, std=std)
166
+
167
+ assert resize_mode == "shortest"
168
+ if not isinstance(image_size, (tuple, list)):
169
+ image_size = (image_size, image_size)
170
+ if image_size[0] == image_size[1]:
171
+ # simple case, use torchvision built-in Resize w/ shortest edge mode (scalar size arg)
172
+ transforms = [Resize(image_size[0], interpolation=interpolation_mode)]
173
+ else:
174
+ # resize shortest edge to matching target dim for non-square target
175
+ transforms = [ResizeKeepRatio(image_size)]
176
+ transforms += [CenterCrop(image_size)]
177
+
178
+ transforms.extend(
179
+ [
180
+ _convert_to_rgb,
181
+ ToTensor(),
182
+ normalize,
183
+ ]
184
+ )
185
+ return Compose(transforms)
186
+
187
+
188
+ class EVAClipImageProcessor(ImageProcessingMixin):
189
+ def __init__(self, **kwargs) -> None:
190
+ super().__init__(**kwargs)
191
+ self.processor = image_transform(image_size=336)
192
+
193
+ def _prepare_images(self, batch: List[List[Image.Image]]) -> torch.Tensor:
194
+ """
195
+ Convert images to tensors, reshape them, and stack them.
196
+ Args:
197
+ batch: A list of lists of images.
198
+ Returns:
199
+ preprocessed images (tensors) or None
200
+ shape (B, T_img, F, C, H, W)
201
+ None if no images in batch
202
+ """
203
+ images_per_example = max(len(x) for x in batch)
204
+ batch_images = None
205
+ for iexample, example in enumerate(batch):
206
+ for iimage, image in enumerate(example):
207
+ preprocessed = self.processor(image)
208
+ if batch_images is None:
209
+ batch_images = torch.zeros(
210
+ (len(batch), images_per_example, 1) + preprocessed.shape,
211
+ dtype=preprocessed.dtype,
212
+ )
213
+ batch_images[iexample, iimage, 0] = preprocessed
214
+ return batch_images
215
+
216
+ def preprocess(self, imgpaths=None):
217
+ if imgpaths is None or len(imgpaths) == 0:
218
+ images = [(Image.new("RGB", (336, 336), color="black"))]
219
+ else:
220
+ images = [Image.open(fp) for fp in imgpaths]
221
+ return self._prepare_images([images])
222
+
223
+
224
+ class InfiMMZephyrProcessor(ProcessorMixin):
225
+ r"""
226
+ Constructs a InfiMMZephyr processor which wraps a tokenizer and an image processor into a single processor.
227
+
228
+ Args:
229
+ image_processor (`EVAClipImageProcessor`):
230
+ An instance of [`EVAClipImageProcessor`]. The image processor is a required input.
231
+ tokenizer (`LlamaTokenizer`):
232
+ An instance of [`LlamaTokenizer`]. The tokenizer is a required input.
233
+ image_size (`int`, *optional*, defaults to 336): Image size (assuming a square image)
234
+ """
235
+
236
+ attributes = ["tokenizer"]
237
+ tokenizer_class = "LlamaTokenizer"
238
+
239
+ def __init__(self, tokenizer=None, **kwargs):
240
+ self.image_processor = EVAClipImageProcessor()
241
+ if tokenizer is None:
242
+ tokenizer = AutoTokenizer.from_pretrained("infimm-zephyr", verbose=False)
243
+
244
+ super().__init__(tokenizer, tokenizer)
245
+
246
+ def _prepare_text(
247
+ self,
248
+ batch: List[List[str]],
249
+ padding="longest",
250
+ truncation=True,
251
+ max_length=2048,
252
+ ):
253
+ """
254
+ Tokenize the text and stack them.
255
+ Args:
256
+ batch: A list of lists of strings.
257
+ Returns:
258
+ input_ids (tensor)
259
+ shape (B, T_txt)
260
+ attention_mask (tensor)
261
+ shape (B, T_txt)
262
+ """
263
+ encodings = self.tokenizer(
264
+ batch,
265
+ padding=padding,
266
+ truncation=truncation,
267
+ return_tensors="pt",
268
+ max_length=max_length,
269
+ )
270
+ input_ids, attention_mask = encodings["input_ids"], encodings["attention_mask"]
271
+ return input_ids, attention_mask
272
+
273
+ def __call__(
274
+ self,
275
+ prompts,
276
+ ) -> BatchEncoding:
277
+ """This method takes batched or non-batched prompts made of text and images and converts them into prompts that
278
+ the model was trained on and prepares the image pixel values for the model to process.
279
+ """
280
+ image_paths = self._extract_image_paths(prompts)
281
+ images = self.image_processor.preprocess(image_paths)
282
+ prompts = self._replace_with_media_tokens(prompts)
283
+ final_prompt = self.apply_chat_template(prompts)
284
+ input_ids, attention_mask = self._prepare_text([final_prompt])
285
+ return BatchEncoding(
286
+ data={
287
+ "input_ids": input_ids,
288
+ "attention_mask": attention_mask,
289
+ "batch_images": images,
290
+ }
291
+ )
292
+
293
+ def _extract_image_paths(self, prompts):
294
+ image_paths = []
295
+ for round in prompts:
296
+ if round["role"] != "user":
297
+ continue
298
+ for piece in round["content"]:
299
+ if isinstance(piece, dict):
300
+ image_paths.append(piece["image"])
301
+ return image_paths
302
+
303
+ def _replace_with_media_tokens(self, prompts):
304
+ new_prompts = []
305
+ for round in prompts:
306
+ if round["role"] != "user":
307
+ new_prompts.append(round)
308
+ new_content = []
309
+ for piece in round["content"]:
310
+ if isinstance(piece, dict):
311
+ new_content.append(f"{END_OF_CHUNK_TOKEN}{IMAGE_TOKEN}")
312
+ else:
313
+ new_content.append(piece)
314
+ new_prompts.append({"role": "user", "content": "".join(new_content)})
315
+ return new_prompts
316
+
317
+ def apply_chat_template(self, messages, task="generation"):
318
+ if messages[0]["role"] != "system":
319
+ messages.insert(0, {"role": "system", "content": ""})
320
+ prompt = self.tokenizer.apply_chat_template(
321
+ messages,
322
+ tokenize=False,
323
+ add_generation_prompt=True if task == "generation" else False,
324
+ )
325
+ return prompt
326
+
327
+ def batch_decode(self, *args, **kwargs):
328
+ """
329
+ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
330
+ refer to the docstring of this method for more information.
331
+ """
332
+ return self.tokenizer.batch_decode(*args, **kwargs)
333
+
334
+ def decode(self, *args, **kwargs):
335
+ """
336
+ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
337
+ the docstring of this method for more information.
338
+ """
339
+ return self.tokenizer.decode(*args, **kwargs)
340
+
341
+ @property
342
+ def model_input_names(self):
343
+ tokenizer_input_names = self.tokenizer.model_input_names
344
+ image_processor_input_names = self.image_processor.model_input_names
345
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
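Illustrative sketch (not part of the commit) of the prompt structure the processor's __call__ consumes: a chat-style list of rounds where user content mixes {"image": path} dicts and plain strings. It assumes processing_infimm_zephyr.py is importable, that a local "infimm-zephyr" tokenizer directory exists (the default the constructor falls back to), and that the image path is a placeholder.

from processing_infimm_zephyr import InfiMMZephyrProcessor

processor = InfiMMZephyrProcessor()  # falls back to AutoTokenizer.from_pretrained("infimm-zephyr")

prompts = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"image": "assets/example.jpg"},  # rendered as "<|endofchunk|><image>" in the text
            "What is unusual about this picture?",
        ],
    },
]
batch = processor(prompts)
print(batch["input_ids"].shape)       # (1, T_txt)
print(batch["attention_mask"].shape)  # (1, T_txt)
print(batch["batch_images"].shape)    # (1, 1, 1, 3, 336, 336)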
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4561b0cf112593ac6fd3f4dd6705cac172fbbc9876ff798e58dc303cc941c8b7
+ size 19682192822
special_tokens_map.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "additional_special_tokens": [
+     {
+       "content": "<|endofchunk|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<image>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     }
+   ],
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
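The two added special tokens here, <|endofchunk|> and <image>, presumably correspond to the eoc_token_id and image_token_id the modeling code reads from its config. A quick illustrative check (the local path is a placeholder; ids 32000 and 32001 come from tokenizer_config.json below):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./")  # placeholder: this repo's local checkout
print(tok.convert_tokens_to_ids("<|endofchunk|>"))  # 32000 per tokenizer_config.json
print(tok.convert_tokens_to_ids("<image>"))         # 32001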
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,62 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32000": {
+       "content": "<|endofchunk|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32001": {
+       "content": "<image>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|endofchunk|>",
+     "<image>"
+   ],
+   "bos_token": "<s>",
+   "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": true,
+   "model_max_length": 2048,
+   "pad_token": "</s>",
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "truncation_side": "left",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": true,
+   "verbose": false
+ }
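The chat_template above renders conversations in the Zephyr chat style: each turn is wrapped in a <|system|>, <|user|>, or <|assistant|> header and closed by the EOS token, with a trailing <|assistant|> header when a generation prompt is requested. A hedged sketch of rendering it, assuming this repo's tokenizer files are available in the current directory:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./")  # placeholder: this repo's local checkout
messages = [
    {"role": "system", "content": ""},
    {"role": "user", "content": "<|endofchunk|><image>Please describe the image."},
]
rendered = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(rendered)
# Expected to look roughly like:
# <|system|>
# </s>
# <|user|>
# <|endofchunk|><image>Please describe the image.</s>
# <|assistant|>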
utils.py ADDED
@@ -0,0 +1,48 @@
+ def extend_instance(obj, mixin):
+     """Apply mixins to a class instance after creation"""
+     base_cls = obj.__class__
+     base_cls_name = obj.__class__.__name__
+     obj.__class__ = type(
+         base_cls_name, (mixin, base_cls), {}
+     )  # mixin needs to go first for our forward() logic to work
+
+
+ def getattr_recursive(obj, att):
+     """
+     Return nested attribute of obj
+     Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c
+     """
+     if att == "":
+         return obj
+     i = att.find(".")
+     if i < 0:
+         return getattr(obj, att)
+     else:
+         return getattr_recursive(getattr(obj, att[:i]), att[i + 1 :])
+
+
+ def setattr_recursive(obj, att, val):
+     """
+     Set nested attribute of obj
+     Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val
+     """
+     if "." in att:
+         obj = getattr_recursive(obj, ".".join(att.split(".")[:-1]))
+     setattr(obj, att.split(".")[-1], val)
+
+
+ def _infer_decoder_layers_attr_name(model):
+     for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
+         if k.lower() in model.__class__.__name__.lower():
+             return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]
+
+     raise ValueError(
+         "We require the attribute name for the nn.ModuleList in the decoder storing"
+         " the transformer block layers. Please supply this string manually."
+     )
+
+
+ __KNOWN_DECODER_LAYERS_ATTR_NAMES = {
+     "llama": "model.layers",
+     "mistral": "model.layers",
+ }
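A small, self-contained illustration of how these helpers behave, using toy classes rather than anything from the repo (it only assumes utils.py is importable):

from utils import extend_instance, getattr_recursive, setattr_recursive

class Greeter:
    def greet(self):
        return "hello"

class LoudMixin:
    def greet(self):
        # the mixin sits first in the MRO, so super() reaches the original class
        return super().greet().upper()

g = Greeter()
extend_instance(g, LoudMixin)  # g.__class__ becomes type("Greeter", (LoudMixin, Greeter), {})
assert g.greet() == "HELLO"

class Inner:
    pass

class Outer:
    pass

o = Outer()
o.inner = Inner()
o.inner.value = 3
assert getattr_recursive(o, "inner.value") == 3
setattr_recursive(o, "inner.value", 7)
assert o.inner.value == 7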