huggingface · akshayballal95 · Oct 21, 2024 · Oct 21, 2024 · Dec 8, 2024 · Dec 8, 2024
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
@@ -81,6 +81,7 @@
 from .constants import ONNX_DECODER_MERGED_NAME, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME
 from .model_patcher import (
     CLIPModelPatcher,
+    ColPaliModelPatcher,
     FalconModelPatcher,
     MistralModelPatcher,
     MusicgenModelPatcher,
@@ -2474,3 +2475,86 @@ class EncoderDecoderOnnxConfig(EncoderDecoderBaseOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig
 
     DEFAULT_ONNX_OPSET = 14  # uses SDPA in Transformers, hence opset>=14.
+
+
+class PaliGemmaOnnxConfig(GemmaOnnxConfig):
+    """ONNX export configuration for PaliGemma (text + vision inputs).
+
+    For the "feature-extraction" task the exported inputs are ``input_ids``,
+    ``attention_mask`` and ``pixel_values`` and the model is wrapped in
+    ``ColPaliModelPatcher``; other tasks use the parent configuration's patcher.
+    """
+
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyVisionInputGenerator)
+    NORMALIZED_CONFIG_CLASS = NormalizedTextAndVisionConfig.with_args(
+        text_config="text_config", vision_config="vision_config"
+    )
+    # Number of image placeholder tokens prepended to the dummy ``input_ids``.
+    # NOTE(review): presumably 448px images / 14px patches -> 32 * 32 = 1024;
+    # confirm against the checkpoint's vision config.
+    NUM_IMAGE_TOKENS = 1024
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        """Return the dynamic axes of each exported input for the current task."""
+        if self.task == "feature-extraction":
+            return {
+                "input_ids": {0: "batch_size", 1: "sequence_length"},
+                "attention_mask": {0: "batch_size", 1: "sequence_length"},
+                "pixel_values": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"},
+            }
+        if self.task == "text-generation":
+            return {
+                "input_ids": {0: "batch_size", 1: "sequence_length"},
+                "attention_mask": {0: "batch_size", 1: "sequence_length"},
+            }
+        # Other tasks used to fall through and return None, contradicting the
+        # annotated return type; defer to the parent configuration instead.
+        return super().inputs
+
+    def generate_dummy_inputs(self, framework: str = "pt", **kwargs) -> Dict[str, Any]:
+        """Generate dummy inputs, prepending image tokens for feature extraction.
+
+        Args:
+            framework: Framework the dummy tensors are created for.
+            **kwargs: Forwarded unchanged to the parent implementation.
+
+        Returns:
+            The parent's dummy inputs. For PyTorch "feature-extraction" exports,
+            ``input_ids`` is prefixed with ``NUM_IMAGE_TOKENS`` copies of the
+            image token index and ``attention_mask`` is regenerated to match.
+        """
+        dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs)
+        if framework != "pt" or self.task != "feature-extraction":
+            return dummy_inputs
+
+        generator = self.DUMMY_INPUT_GENERATOR_CLASSES[0](self.task, self._normalized_config)
+        image_prefix = generator.constant_tensor(
+            shape=[dummy_inputs["input_ids"].shape[0], self.NUM_IMAGE_TOKENS],
+            value=self._normalized_config.image_token_index,
+            framework=framework,
+        )
+        dummy_inputs["input_ids"] = generator.concat_inputs([image_prefix, dummy_inputs["input_ids"]], dim=1)
+        # Size the mask from the concatenated ids rather than the generator's own
+        # batch_size/sequence_length, which are not built from ``kwargs`` and can
+        # therefore disagree with the tensors the parent produced.
+        batch_size, total_length = dummy_inputs["input_ids"].shape
+        dummy_inputs["attention_mask"] = generator.random_mask_tensor(
+            shape=[batch_size, total_length],
+            padding_side=generator.padding_side,
+            framework=framework,
+            dtype="int64",
+        )
+        return dummy_inputs
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        """Return the patcher applied to ``model`` during export.
+
+        "feature-extraction" uses ``ColPaliModelPatcher``; every other task uses
+        the parent configuration's patcher.
+        """
+        if self.task == "feature-extraction":
+            return ColPaliModelPatcher(self, model, model_kwargs=model_kwargs)
+        return super().patch_model_for_export(model, model_kwargs=model_kwargs)
diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py
@@ -509,6 +509,38 @@ def patched_forward(*args, **kwargs):
         self.patched_forward = patched_forward
 
 
+class ColPaliModelPatcher(ModelPatcher):
+    """Patcher whose forward passes ``input_ids``, ``pixel_values`` and ``attention_mask`` by keyword."""
+
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        """Install a wrapper that forwards its arguments to ``orig_forward`` by keyword.
+
+        Args:
+            config: Export configuration handed to ``ModelPatcher``.
+            model: Model handed to ``ModelPatcher``.
+            model_kwargs: Optional extra keyword arguments handed to ``ModelPatcher``.
+        """
+        super().__init__(config, model, model_kwargs)
+
+        def patched_forward(input_ids=None, pixel_values=None, attention_mask=None, **kwargs):
+            # Re-issue every argument by keyword so the original forward receives
+            # the three named inputs by name, followed by any extra kwargs.
+            forward_kwargs = {
+                "input_ids": input_ids,
+                "pixel_values": pixel_values,
+                "attention_mask": attention_mask,
+                **kwargs,
+            }
+            return self.orig_forward(**forward_kwargs)
+
+        self.patched_forward = patched_forward
+
+
 class SAMModelPatcher(ModelPatcher):
     def __init__(
         self,
@@ -1154,3 +1172,18 @@ def __exit__(self, exc_type, exc_value, traceback):
             from transformers.models.clip.modeling_clip import CLIPSdpaAttention
 
             CLIPSdpaAttention.forward = self.original_sdpa_forward
+# NOTE(review): duplicate of the ColPaliModelPatcher added near line 509 of this file; this later copy wins. Drop one.
+class ColPaliModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        super().__init__(config, model, model_kwargs)
+        def patched_forward(input_ids=None, pixel_values=None, attention_mask=None, **kwargs):
+            outputs = self.orig_forward(  # pass-through: every argument is re-issued by keyword
+                input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **kwargs
+            )
+            return outputs
+        self.patched_forward = patched_forward  # replaces the wrapper set by ModelPatcher.__init__, if any
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
@@ -949,6 +949,10 @@ class TasksManager:
             "text-generation-with-past",
             onnx="GraniteOnnxConfig",
         ),
+        "paligemma": supported_tasks_mapping(
+            "feature-extraction",
+            onnx="PaliGemmaOnnxConfig",
+        ),
         "pegasus": supported_tasks_mapping(
             "feature-extraction",
             "feature-extraction-with-past",