microsoft
/

Phi-3-vision-128k-instruct

@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import torch
 from torch import nn
@@ -191,7 +192,15 @@ class Phi3ImageEmbedding(nn.Module):
         # positions for image tokens
         positions = torch.nonzero((input_ids < 0) & (input_ids > -MAX_INPUT_ID), as_tuple=True)
         has_image = len(positions[0].tolist()) > 0
-        input_ids = input_ids.clamp_min(0).clamp_max(self.vocab_size).detach()
         hidden_states = self.wte(input_ids)
         if has_image:

 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import warnings
 import torch
 from torch import nn
         # positions for image tokens
         positions = torch.nonzero((input_ids < 0) & (input_ids > -MAX_INPUT_ID), as_tuple=True)
         has_image = len(positions[0].tolist()) > 0
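+        # Previously this was an out-of-place clamp that left the caller's tensor untouched:
+        #   input_ids = input_ids.clamp_min(0).clamp_max(self.vocab_size).detach()
+        # It is now done in place, so the negative image-placeholder ids are overwritten
+        # in the tensor the caller passed in (hence the warning below).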
+        input_ids.clamp_min_(0).clamp_max_(self.vocab_size)
+        warnings.warn(
+            "Phi-3-V modifies `input_ids` in-place and the tokens indicating images will be "
+            "removed after model forward. If your workflow requires multiple forward passes on "
+            "the same `input_ids`, please make a copy of `input_ids` before passing it to the "
+            "model."
+        )
         hidden_states = self.wte(input_ids)
         if has_image: