llvm · rengolin · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/5_Matrix_scalar_multiplication.o b/5_Matrix_scalar_multiplication.o
diff --git a/examples/KernelBench/level1.yaml b/examples/KernelBench/level1.yaml
@@ -41,10 +41,10 @@
   pipeline: matmul
 
 - kernel: level1/7_Matmul_with_small_K_dimension_.py
-  input_shapes: [4096x64, 64x4096]
+  input_shapes: [1024x64, 64x1024]
   initializations: [rnd, rnd]
-  output_shape: 4096x4096
-  gflops: (4096 * 64 * 4096 * 2) / 1e9
+  output_shape: 1024x1024
+  gflops: (1024 * 64 * 1024 * 2) / 1e9
   pipeline: matmul
 
 - kernel: level1/8_Matmul_with_irregular_shapes_.py
@@ -123,16 +123,17 @@
   pipeline: matmul
 
 - kernel: level1/19_ReLU.py
-  input_shapes: [4096x8192]
+  input_shapes: [1024x1024]
   initializations: [rnd]
-  output_shape: 4096x8192
-  gflops: (4096 * 8192) / 1e9
+  output_shape: 1024x1024
+  gflops: (1024 * 1024) / 1e9
 
 - kernel: level1/20_LeakyReLU.py
-  input_shapes: [4096x8192]
+  input_shapes: [1024x1024]
   initializations: [rnd]
-  output_shape: 4096x8192
-  gflops: (4096 * 8192) / 1e9
+  output_shape: 1024x1024
+  gflops: (1024 * 1024) / 1e9
+  pipeline: element_wise
 
 - kernel: level1/21_Sigmoid.py
   input_shapes: [4096x8192]

diff --git a/examples/KernelBench/schedules/x86_64/element_wise/common.yaml b/examples/KernelBench/schedules/x86_64/element_wise/common.yaml
@@ -1,6 +1,8 @@
 # This is an optimizing pipeline for kernel_bench element-wise kernels.
 # Assumption: M, N, K % 32 = 0
 Pipeline:
+  # Fuse element-wise ops so as much as possible ends up in one linalg.generic
+  - pass: "linalg-fuse-elementwise-ops"
   # Register tiling and unroll to fill the pipeline
   - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.generic
                                        tile_sizes=[1,$register_tile]

diff --git a/examples/KernelBench/schedules/x86_64/pack_and_tile.yaml b/examples/KernelBench/schedules/x86_64/pack_and_tile.yaml
@@ -2,6 +2,7 @@
 # This is a good default to most CPU based pipelines.
 Pipeline:
   ## Packing & Cache tiling (CPU generic)
+  - pass: "linalg-fuse-elementwise-ops"
   - schedule: "packing.py[gen=block_pack_matmuls]{block_factors=[32,32,32] rhs_transpose_outer_block=True rhs_transpose_inner_block=False}"
   - schedule: "x86/pack_lowering.py[gen=lower_packs_unpacks]{tile_size=32}"
   - pass: "linalg-morph-ops{named-to-category generic-to-category}"