Optimize TPU Flash Attention (400x speed-up on 32k long context) #845

Draft · wants to merge 1 commit into base: main
24 changes: 19 additions & 5 deletions axlearn/common/flash_attention/layer.py
@@ -20,6 +20,7 @@
make_segment_mask,
)
from axlearn.common.config import config_class
from axlearn.common.flash_attention import tpu_attention
from axlearn.common.flash_attention.utils import (
MultiHeadAttentionImpl,
flash_attention_implementation,
@@ -169,10 +170,6 @@ def _compute_attention(
cfg = self.config
backend = self._backend()

# Repeats key/value heads dim if necessary.
k_proj = self._repeat_kv_heads(k_proj)
v_proj = self._repeat_kv_heads(v_proj)

batch, target_len, num_heads, _ = q_proj.shape
_, source_len, _, _ = k_proj.shape

@@ -228,7 +225,18 @@ def _compute_attention(
f"{k_proj.shape[1]} for correctly supported GPU flash attention usage."
)

if backend == "tpu":
if backend == "cpu" and not tpu_attention.check_tpu_splash_attention(
query=q_proj,
key=k_proj,
has_mask=bool(cfg.mask),
segment_ids=segment_ids,
has_bias=(attention_logit_biases is not None),
):
backend = "xla"

if backend in ("tpu", "cpu"):
# Splash attention needs to know sliding_window_size.
mask_fn = cfg.mask
assert q_proj.shape[1] % cfg.tpu_block_size == 0, (
f"Target seq len {q_proj.shape[1]} must be "
f"divisible by block size {cfg.tpu_block_size}."
@@ -263,6 +271,12 @@
q_proj = self.scale_query(q_proj)
k_proj = self.scale_key(k_proj)

# TODO(dhwang2): Splash attention supports GQA natively, so stop repeating KV heads once proper sharding is in place.
# https://github.com/jax-ml/jax/blob/7b9914d711593dca8725d46aa1dadb2194284519/jax/experimental/pallas/ops/tpu/splash_attention/splash_attention_kernel.py#L934
# Repeats key/value heads dim if necessary.
k_proj = self._repeat_kv_heads(k_proj)
v_proj = self._repeat_kv_heads(v_proj)

# Constrain input to conform to partitioned MHA expectations.
q_proj = with_sharding_constraint(q_proj, cfg.mha_dim_to_partition_spec["btnh"])
k_proj = with_sharding_constraint(k_proj, cfg.mha_dim_to_partition_spec["bsnh"])
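The gate added above routes attention to the XLA reference path whenever splash attention cannot handle the requested configuration; the real criteria live in `tpu_attention.check_tpu_splash_attention`. The sketch below is a simplified, hypothetical approximation of such a capability check (the bias, block-size, and head-dimension rules are assumptions for illustration, not the PR's actual logic):

```python
import jax.numpy as jnp


def can_use_splash_attention(
    query: jnp.ndarray,
    key: jnp.ndarray,
    *,
    has_bias: bool,
    block_size: int = 512,
) -> bool:
    """Hypothetical capability check for the splash-attention kernel.

    The real check is tpu_attention.check_tpu_splash_attention in axlearn;
    the rules below are illustrative assumptions only.
    """
    _, target_len, _, head_dim = query.shape
    _, source_len, _, _ = key.shape
    if has_bias:
        # Assumed: arbitrary dense logit biases cannot be expressed as a splash mask.
        return False
    if target_len % block_size != 0 or source_len % block_size != 0:
        # Assumed: sequence lengths must tile evenly into kernel blocks.
        return False
    if head_dim % 128 != 0:
        # Assumed: per-head dimension must be lane-aligned for the TPU kernel.
        return False
    return True


# Simplified usage: fall back to the XLA reference implementation when the
# optimized kernel does not apply.
backend = "tpu"
q = jnp.zeros((2, 1024, 8, 128), dtype=jnp.bfloat16)
k = jnp.zeros((2, 1024, 8, 128), dtype=jnp.bfloat16)
if not can_use_splash_attention(q, k, has_bias=False):
    backend = "xla"
```

Whatever the precise rules, the effect in the diff is the same: configurations the kernel cannot handle are rerouted to backend "xla" instead of failing inside the splash kernel.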
10 changes: 9 additions & 1 deletion axlearn/common/flash_attention/layer_test.py
@@ -16,7 +16,7 @@
import jax
import jax.numpy as jnp
import pytest
from absl.testing import parameterized
from absl.testing import absltest, parameterized
from jax.experimental import mesh_utils
from jax.sharding import Mesh

@@ -91,6 +91,7 @@ def _prepare_layers(
sliding_window_size,
inference=False,
set_layer_bias_recursively=False,
tpu_block_size=512,
):
hidden_dim = num_heads * per_head_dim
kwargs = dict(
@@ -110,6 +111,7 @@
.set(
mha_dim_to_partition_spec=default_mha_dim_to_partition_spec(mesh_axis_names),
output_dim_to_partition_spec=default_output_dim_to_partition_spec(mesh_axis_names),
tpu_block_size=tpu_block_size,
)
)
if inference:
@@ -378,7 +380,9 @@ def test_forward(
mesh_axis_names=mesh_axis_names,
causal=causal,
sliding_window_size=sliding_window_size,
tpu_block_size=128,
)

# pylint: disable-next=protected-access
if test_layer._backend() == "gpu" and query_len_multiplier != 1:
pytest.skip(
@@ -734,3 +738,7 @@ def test_extend_step(
atol=2e-2,
)
jax.clear_backends()


if __name__ == "__main__":
absltest.main()
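The `_repeat_kv_heads` calls relocated in layer.py expand grouped key/value heads so they match the query head count before the attention kernel runs; the TODO in that file notes that splash attention can consume grouped KV directly, which would make this copy unnecessary. A minimal sketch of that kind of head repetition, assuming a [batch, seq_len, num_kv_heads, per_head_dim] layout (an illustration of the idea, not axlearn's actual implementation):

```python
import jax.numpy as jnp


def repeat_kv_heads(kv: jnp.ndarray, num_q_heads: int) -> jnp.ndarray:
    """Repeat KV heads along the head axis to match the query head count.

    Sketch only: assumes kv has shape [batch, seq_len, num_kv_heads, per_head_dim]
    and that num_q_heads is a multiple of num_kv_heads.
    """
    num_kv_heads = kv.shape[2]
    if num_kv_heads == num_q_heads:
        return kv
    assert num_q_heads % num_kv_heads == 0, (num_q_heads, num_kv_heads)
    return jnp.repeat(kv, num_q_heads // num_kv_heads, axis=2)


# Example: 2 KV heads expanded to match 8 query heads.
k = jnp.zeros((4, 1024, 2, 128))
print(repeat_kv_heads(k, num_q_heads=8).shape)  # (4, 1024, 8, 128)
```

Skipping this expansion, as the TODO suggests, avoids materializing num_q_heads // num_kv_heads copies of K and V, which matters most at long source lengths such as the 32k context in the PR title.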