huggingface · jla524 · Dec 27, 2024 · Dec 27, 2024 · Dec 28, 2024
diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
@@ -291,8 +291,8 @@ def _preprocess(
         patches = np.array(processed_images)
         if data_format == ChannelDimension.LAST:
             patches = patches.transpose(0, 3, 1, 2)
-        if patches.shape[0] == 1:
-            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
+        if patches.shape[0] % 2 == 1:
+            patches = np.concatenate([patches, patches[-1][np.newaxis]], axis=0)
         channel = patches.shape[1]
         grid_t = patches.shape[0] // self.temporal_patch_size
         grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size

diff --git a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
@@ -247,3 +247,14 @@ def test_nested_input(self):
         # Image processor should return same pixel values, independently of ipnut format
         self.assertTrue((encoded_images_nested == encoded_images).all())
         self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all())
+
+    def test_odd_number_of_frames(self):
+        # Odd frame counts are expected to be padded up to the next even count before patching.
+        processor = self.image_processing_class(**self.image_processor_dict)
+        rows_by_frame_count = {1: 324, 3: 648, 5: 972, 7: 1296, 9: 1620}
+
+        for frame_count, expected_rows in rows_by_frame_count.items():
+            frames = np.random.randint(0, 255, size=(frame_count, 256, 256, 3))
+            pixel_values = processor(frames, return_tensors="pt").pixel_values
+            # Compare the full 2-D shape of the encoded output: (rows, 1176).
+            self.assertEqual(tuple(pixel_values.shape), (expected_rows, 1176))