From 75a889e4f7c60f883c4085e4212ff8d5981cb2d3 Mon Sep 17 00:00:00 2001
From: Renato Golin <rengolin@systemcall.eu>
Date: Mon, 15 Jun 2026 16:24:55 +0100
Subject: [PATCH 1/2] [KB] Reduce sizes to make it easier to debug on CPUs

---
 examples/KernelBench/level1.yaml | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/KernelBench/level1.yaml b/examples/KernelBench/level1.yaml
index a6ccfd0f..8f86f5c6 100644
--- a/examples/KernelBench/level1.yaml
+++ b/examples/KernelBench/level1.yaml
@@ -41,10 +41,10 @@
   pipeline: matmul
 
 - kernel: level1/7_Matmul_with_small_K_dimension_.py
-  input_shapes: [4096x64, 64x4096]
+  input_shapes: [1024x64, 64x1024]
   initializations: [rnd, rnd]
-  output_shape: 4096x4096
-  gflops: (4096 * 64 * 4096 * 2) / 1e9
+  output_shape: 1024x1024
+  gflops: (1024 * 64 * 1024 * 2) / 1e9
   pipeline: matmul
 
 - kernel: level1/8_Matmul_with_irregular_shapes_.py
@@ -123,16 +123,16 @@
   pipeline: matmul
 
 - kernel: level1/19_ReLU.py
-  input_shapes: [4096x8192]
+  input_shapes: [1024x1024]
   initializations: [rnd]
-  output_shape: 4096x8192
-  gflops: (4096 * 8192) / 1e9
+  output_shape: 1024x1024
+  gflops: (1024 * 1024) / 1e9
 
 - kernel: level1/20_LeakyReLU.py
-  input_shapes: [4096x8192]
+  input_shapes: [1024x1024]
   initializations: [rnd]
-  output_shape: 4096x8192
-  gflops: (4096 * 8192) / 1e9
+  output_shape: 1024x1024
+  gflops: (1024 * 1024) / 1e9
 
 - kernel: level1/21_Sigmoid.py
   input_shapes: [4096x8192]

From 7f6f4f8265a388ee24050d13416dd4bca3e4d682 Mon Sep 17 00:00:00 2001
From: Renato Golin <rengolin@systemcall.eu>
Date: Mon, 15 Jun 2026 19:26:14 +0100
Subject: [PATCH 2/2] [KB] Linalg generic fusion to help some bad lowering from
 torch-mlir

---
 5_Matrix_scalar_multiplication.o                 | Bin 3296 -> 0 bytes
 examples/KernelBench/level1.yaml                 |   1 +
 .../schedules/x86_64/element_wise/common.yaml    |   2 ++
 .../schedules/x86_64/pack_and_tile.yaml          |   1 +
 4 files changed, 4 insertions(+)
 delete mode 100644 5_Matrix_scalar_multiplication.o

diff --git a/5_Matrix_scalar_multiplication.o b/5_Matrix_scalar_multiplication.o
deleted file mode 100644
index b5ceb8a42cecb16157df1d46badbf4a33b116b35..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3296
zcmbuBTTc@~6vwAfXv@WRK*~)h>Uu+2K@cw}^r6CN2E3$-O-!I$+dyoAlqEs}8iJZM
z`@%QyYv3CJG~yHT1vEYyA1KCXe5o@#XK5)Z591`eGxMKwesgwq=44*pqWwiS8?uTG
z^`dOgQW0t?&$j($(~lg?k{AKy9x<p=*Rk#P`H{A}KsxPs<J-bqo!Jgzo0gs-$sLc`
zUDwhHlHB!Z=>=0u>m>A1OUFrQOWS=xlAGpxTl@KoG<;&7^8{CTO;3u809b4S(~JSA
zc@o;>rlIXZ4C=K3#14=W8Hk90;!=P(WFW;5i4!0$8Ho4{ipu~}E(0lnNGbqQDFYE}
zLGclQxMd&?h(v6XAjUEfv7i)JK_t~O5En#p6d*M+5b;+i7CSeHYh@ti5Q%k;>SZ9}
z*HYX7pVKG<sf0+50i;O=BK8Nx%>eNggskP9;#D&ftcpw!@VZUs0B+$4h9!V(Vqs1!
z^91qn{h3AN6f05erI~#RhI+-eqxb|sP8N*qkPL%OXNnbooRWcvy-V?FfSi$mR6`_Z
z0dh_TA`Sq>tpGVM1F3;XE&!xW24dxZ6(AR7Ahi%lJ3u;QAmS)ed<h_(GLU+RqzfRI
zWgy}_Q``-ZD>9Hqh~z3juE{_|QlR)cKzd{#O%TZqfZUXUh|EH9FF^VVLJr9=KC-xm
zoCuM{zwFk7+^<`IXEN{qJYSEweE}vf{TL(0kdqh#?0MiMDXMtLJL_^pQXfbvsF2KO
zva~U_9$Oan-1e@Y8pE^1Xs1T~ZKI5=WPPYYXzGdL$!rO)SWI>MUYX;<-@)7&V%S3r
zcH?ppqly?)PHNm|;vMG!RB#Vv&2gaaXNn!AGx-A@OsGP|d-8X-6uc<x;rb5rRZ~T%
z8uS*I5V}Bb27MItKF}S|=XcB^(0J8TKz~y}M`|(}536xKI;$hqe2*mJ<H?akczgnh
zrg%rjW8p|t=<39z9#sd19=1*E;m9*Y>7&8GcsL%7=!26_XW~($PU}fMtg(77!R!Am
zj~O7lnpxUPnP`t?C9{=(_pW3r*Glv3{CRz=YquIub3gF+<PQ0v^A^_E8rx^YMDEL(
ze)i;+DTtqd$mikT3EKnJf!@vP{BPnzU-IS!^8>b{Y#?`^Sax5*e;EWm2e-?g{~yZd
Bi7WsB

diff --git a/examples/KernelBench/level1.yaml b/examples/KernelBench/level1.yaml
index 8f86f5c6..f7fb6e12 100644
--- a/examples/KernelBench/level1.yaml
+++ b/examples/KernelBench/level1.yaml
@@ -133,6 +133,7 @@
   initializations: [rnd]
   output_shape: 1024x1024
   gflops: (1024 * 1024) / 1e9
+  pipeline: element_wise
 
 - kernel: level1/21_Sigmoid.py
   input_shapes: [4096x8192]
diff --git a/examples/KernelBench/schedules/x86_64/element_wise/common.yaml b/examples/KernelBench/schedules/x86_64/element_wise/common.yaml
index 939ac8e0..b7bb5f21 100644
--- a/examples/KernelBench/schedules/x86_64/element_wise/common.yaml
+++ b/examples/KernelBench/schedules/x86_64/element_wise/common.yaml
@@ -1,6 +1,8 @@
 # This is an optimizing pipeline for kernel_bench element-wise kernels.
 # Assumption: M, N, K % 32 = 0
 Pipeline:
+  # Fuse element-wise ops into as few linalg.generic ops as possible
+  - pass: "linalg-fuse-elementwise-ops"
   # Register tiling and unroll to fill the pipeline
   - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.generic
                                        tile_sizes=[1,$register_tile]
diff --git a/examples/KernelBench/schedules/x86_64/pack_and_tile.yaml b/examples/KernelBench/schedules/x86_64/pack_and_tile.yaml
index aa166837..25d28da3 100644
--- a/examples/KernelBench/schedules/x86_64/pack_and_tile.yaml
+++ b/examples/KernelBench/schedules/x86_64/pack_and_tile.yaml
@@ -2,6 +2,7 @@
 # This is a good default to most CPU based pipelines.
 Pipeline:
   ## Packing & Cache tiling (CPU generic)
+  - pass: "linalg-fuse-elementwise-ops"
   - schedule: "packing.py[gen=block_pack_matmuls]{block_factors=[32,32,32] rhs_transpose_outer_block=True rhs_transpose_inner_block=False}"
   - schedule: "x86/pack_lowering.py[gen=lower_packs_unpacks]{tile_size=32}"
   - pass: "linalg-morph-ops{named-to-category generic-to-category}"