Flash paged decoding #325

Open · wants to merge 1 commit into main
24 changes: 21 additions & 3 deletions iree/turbine/kernel/compiler/kernel_codegen.py
@@ -129,9 +129,27 @@ def sym_to_dim_asm(s: IndexSymbol) -> str:
else:
# Unranked. Not well supported, but for completeness.
spec_asm = element_type_asm
strides = strides_from_symbolic_shape(
idx_context, kb_t.symbolic_shape, allow_mixed_shapes=True
)
# If strides have been specified in the type, that implies they are not
# consistent with the dimensions of the tensor, so we default to dynamic
# dims for all shapes.
ref_type = self.reference[1].type
if ref_type.physical_layout:
# Strides are always present in the physical layout.
strides = [
idx_context.get_static_value(s)
for s in ref_type.physical_layout["stride"]
]
# Shapes are not always present in the physical layout.
if ref_type.physical_layout.get("shape", None):
shape_asm = "x".join(
sym_to_dim_asm(s) for s in ref_type.physical_layout["shape"]
)
spec_asm = f"{shape_asm}x{element_type_asm}"
else:
strides = strides_from_symbolic_shape(
idx_context, kb_t.symbolic_shape, allow_mixed_shapes=True
)

if strides is None:
memref_asm = f"memref<{spec_asm}>"
elif _is_symbolic(strides):
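To make the new branch above easier to follow, here is a minimal stand-alone restatement of its effect in plain Python. The function name, the resolve callback (a stand-in for idx_context.get_static_value), and the literal values are illustrative only, not part of this PR:

def spec_from_layout(prev_spec_asm, element_type_asm, physical_layout, resolve):
    # Strides are always present in the layout dict; resolve() returns the
    # static value of a symbol, or None when it is dynamic.
    strides = [resolve(s) for s in physical_layout["stride"]]
    # The shape part of the memref spec is rebuilt only if the layout also
    # names a shape; otherwise the previously computed spec is kept.
    spec_asm = prev_spec_asm
    if physical_layout.get("shape", None):
        shape_asm = "x".join(
            "?" if resolve(s) is None else str(resolve(s))
            for s in physical_layout["shape"]
        )
        spec_asm = f"{shape_asm}x{element_type_asm}"
    return spec_asm, strides

# spec_from_layout("?x?xf16", "f16",
#                  {"stride": ("N", 1), "shape": ("B", "N")},
#                  {"B": None, "N": 128, 1: 1}.get)
# -> ("?x128xf16", [128, 1])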
4 changes: 4 additions & 0 deletions iree/turbine/kernel/lang/kernel_buffer.py
@@ -64,20 +64,23 @@ def new_subtype(
address_space: AddressSpace | NotSetType = NotSet,
symbolic_shape: tuple[IndexExpr, ...] | NotSetType = NotSet,
dtype: DataType | NotSetType = NotSet,
physical_layout: dict[str, IndexExpr] | NotSetType = NotSet,
usage: KernelBufferUsage | NotSetType = NotSet,
) -> Type[SubtypeT]:
init_address_space = (
address_space if address_space else AddressSpace.GLOBAL_MEMORY
)
init_symbolic_shape = symbolic_shape if symbolic_shape is not NotSet else cls.symbolic_shape # type: ignore
init_dtype = dtype if dtype is not NotSet else cls.dtype # type: ignore
init_physical_layout = physical_layout if physical_layout else None # type: ignore
init_usage = usage if usage is not NotSet else cls.usage # type: ignore

class SubType(cls):
address_space = init_address_space
symbolic_shape = init_symbolic_shape
rank = len(init_symbolic_shape) # type: ignore
dtype = init_dtype
physical_layout = init_physical_layout
usage = init_usage

if name is not NotSet:
@@ -104,6 +107,7 @@ class KernelBuffer(metaclass=KernelBufferMeta):
symbolic_shape: ClassVar[tuple[IndexExpr, ...]]
rank: ClassVar[int]
dtype: ClassVar[DataType]
stride: ClassVar[tuple[IndexExpr, ...]]

def __init__(self, tensor: torch.Tensor):
assert isinstance(tensor, torch.Tensor), f"Expected Tensor but got {tensor}"
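With the extra parameter above, a layout dict passed to new_subtype simply becomes a class attribute on the generated subtype. A hedged sketch of that plumbing, assuming new_subtype is reachable from Memory the same way __class_getitem__ in wave_types.py reaches it; the symbol names and the PagedKV type name are made up for illustration:

import iree.turbine.kernel.lang as tkl

B, N = tkl.sym.B, tkl.sym.N
# Sketch under the assumptions stated above -- not code from this PR.
PagedKV = tkl.Memory.new_subtype(
    address_space=tkl.AddressSpace.GLOBAL_MEMORY,
    symbolic_shape=(B, N),
    dtype=tkl.f16,
    physical_layout={"stride": (N, 1)},
)
assert PagedKV.physical_layout == {"stride": (N, 1)}
assert PagedKV.rank == 2  # rank still follows the symbolic shape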
16 changes: 12 additions & 4 deletions iree/turbine/kernel/lang/wave_types.py
@@ -14,7 +14,7 @@
from .._support.dtype import DataType
from .._support.indexing import IndexExpr, IndexSymbol, index_symbol

from sympy import Symbol
from sympy import Symbol, Integer
from sympy.core.expr import Expr
from typing_extensions import Self

@@ -41,6 +41,7 @@ class Memory(metaclass=KernelBufferMeta):
symbolic_shape: ClassVar[tuple[IndexExpr, ...]]
rank: ClassVar[int]
dtype: ClassVar[DataType]
physical_layout: ClassVar[Optional[dict[str, IndexExpr]]]
usage: ClassVar[Optional[KernelBufferUsage]]

def __init__(self) -> None:
@@ -55,9 +56,15 @@ def __class_getitem__(

shift = 0
usage = KernelBufferUsage.NONE
if isinstance(shape_and_dtype[-1], KernelBufferUsage):
shift = 1
usage = shape_and_dtype[-1]
last_dim = -1
if isinstance(shape_and_dtype[last_dim], KernelBufferUsage):
shift += 1
usage = shape_and_dtype[last_dim]
last_dim -= 1
physical_layout = None
if isinstance(shape_and_dtype[last_dim], dict):
shift += 1
physical_layout = shape_and_dtype[last_dim]
shape = shape_and_dtype[: -2 - shift]
addressSpace = shape_and_dtype[-2 - shift]
dtype = shape_and_dtype[-1 - shift]
@@ -85,6 +92,7 @@ def __class_getitem__(
address_space=addressSpace,
symbolic_shape=shape,
dtype=dtype,
physical_layout=physical_layout,
usage=usage,
)

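Because the subscript parser above now peels an optional trailing dict (before checking for an optional usage flag), a physical layout can be attached directly in the type expression. A hedged usage sketch; the module alias, symbol names, and the INPUT usage flag follow common wave examples and are assumptions rather than code from this PR:

import iree.turbine.kernel.lang as tkl
from iree.turbine.kernel.lang.kernel_buffer import KernelBufferUsage

B, N = tkl.sym.B, tkl.sym.N
ADDRESS_SPACE = tkl.sym.ADDRESS_SPACE  # bound to a real address space later

# Subscript order implied by the parsing above:
#   *shape, address_space, dtype, [physical_layout dict], [usage flag]
PagedMem = tkl.Memory[
    B, N, ADDRESS_SPACE, tkl.f16, {"stride": (N, 1)}, KernelBufferUsage.INPUT
]
assert PagedMem.physical_layout == {"stride": (N, 1)}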
29 changes: 26 additions & 3 deletions iree/turbine/kernel/ops/wave_ops.py
@@ -70,6 +70,7 @@ def read(
elements_per_thread: Optional[IndexExpr | int] = None,
mapping: Optional[IndexMapping] = None,
mapping_dynamic_vals: "Register" | tuple["Register", ...] = (),
drop_dims: Optional[IndexExpr] = (),
) -> "Register":
...

@@ -461,7 +462,7 @@ def node_args(self) -> dict[int, Any]:
for i, arg in enumerate(self.fx_node.args):
if isinstance(arg, fx.Node):
custom_args[i] = get_custom(arg)
if isinstance(arg, list) and all(isinstance(x, fx.Node) for x in arg):
if isinstance(arg, Sequence) and all(isinstance(x, fx.Node) for x in arg):
custom_args[i] = [get_custom(x) for x in arg]
return custom_args

@@ -965,6 +966,22 @@ class Read(CustomOp):
mapping_dynamic_vals: tuple[fx.Node, ...] = ()
_write_dependency: Optional[list[fx.Node]] = None

"""
Note on drop_dims.
Consider the following loop:

for b in range(B):
for k1 in range(K1):
for k2 in range(K2):
out[b, k1, k2] = in[b, 0, k1, k2]

This is a slice where the output is a 3D tensor and the input is a 4D tensor.
The index mapping does not allow rank-reducing operations, since every symbol in the output must be
bound to an index variable. So we introduce a drop_dims field to specify which dimensions are dropped
after the mapping.

"""

@property
def indexing_dims(self) -> list[IndexSymbol]:
if self.mapping is not None:
@@ -1013,7 +1030,10 @@ def transform_index_backwards(
iters = self.mapping.iters
mapping = self.mapping.dynamic_val_mappings[i]
subs = {v: k for k, v in zip(iters, mapping.keys())}
return {k: v.apply_expr(subs[k], mapping[k]) for k, v in index.items()}
return {
k: v.apply_expr(subs[k], mapping[k]) if k in mapping else v
for k, v in index.items()
}

return index
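The only behavioural change in this hunk (and in the identical hunk for the write path below) is the `if k in mapping else v` guard: index entries whose dimension the dynamic-value mapping does not cover are now passed through unchanged instead of raising a KeyError on subs[k]. A toy restatement with plain strings standing in for index sequences and expressions:

index = {"B": "b_seq", "K2": "k2_seq"}   # per-dimension index sequences
mapping = {"K2": "k2_expr"}              # dynamic-val mapping omits "B"
subs = {"K2": "k2_iter"}

transformed = {
    k: f"apply_expr({subs[k]}, {mapping[k]})" if k in mapping else v
    for k, v in index.items()
}
assert transformed == {"B": "b_seq", "K2": "apply_expr(k2_iter, k2_expr)"}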

@@ -1253,7 +1273,10 @@ def transform_index_backwards(
iters = self.mapping.iters
mapping = self.mapping.dynamic_val_mappings[i]
subs = {v: k for k, v in zip(iters, mapping.keys())}
return {k: v.apply_expr(subs[k], mapping[k]) for k, v in index.items()}
return {
k: v.apply_expr(subs[k], mapping[k]) if k in mapping else v
for k, v in index.items()
}

return index

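Relating back to the drop_dims note added to the Read class above: the operation being enabled is a rank-reducing slice, which the index mapping alone cannot express. A plain NumPy restatement of the loop from that note (the concrete sizes are arbitrary):

import numpy as np

B, K1, K2 = 2, 3, 4
inp = np.arange(B * 1 * K1 * K2).reshape(B, 1, K1, K2)

# out[b, k1, k2] = in[b, 0, k1, k2]: the singleton dimension is indexed at a
# fixed 0 and dropped from the result, which is what drop_dims records.
out = inp[:, 0, :, :]
assert out.shape == (B, K1, K2)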