From 75a889e4f7c60f883c4085e4212ff8d5981cb2d3 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Mon, 15 Jun 2026 16:24:55 +0100 Subject: [PATCH 1/2] [KB] Reduce sizes to make it easier to debug on CPUs --- examples/KernelBench/level1.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/KernelBench/level1.yaml b/examples/KernelBench/level1.yaml index a6ccfd0f..8f86f5c6 100644 --- a/examples/KernelBench/level1.yaml +++ b/examples/KernelBench/level1.yaml @@ -41,10 +41,10 @@ pipeline: matmul - kernel: level1/7_Matmul_with_small_K_dimension_.py - input_shapes: [4096x64, 64x4096] + input_shapes: [1024x64, 64x1024] initializations: [rnd, rnd] - output_shape: 4096x4096 - gflops: (4096 * 64 * 4096 * 2) / 1e9 + output_shape: 1024x1024 + gflops: (1024 * 64 * 1024 * 2) / 1e9 pipeline: matmul - kernel: level1/8_Matmul_with_irregular_shapes_.py @@ -123,16 +123,16 @@ pipeline: matmul - kernel: level1/19_ReLU.py - input_shapes: [4096x8192] + input_shapes: [1024x1024] initializations: [rnd] - output_shape: 4096x8192 - gflops: (4096 * 8192) / 1e9 + output_shape: 1024x1024 + gflops: (1024 * 1024) / 1e9 - kernel: level1/20_LeakyReLU.py - input_shapes: [4096x8192] + input_shapes: [1024x1024] initializations: [rnd] - output_shape: 4096x8192 - gflops: (4096 * 8192) / 1e9 + output_shape: 1024x1024 + gflops: (1024 * 1024) / 1e9 - kernel: level1/21_Sigmoid.py input_shapes: [4096x8192] From 7f6f4f8265a388ee24050d13416dd4bca3e4d682 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Mon, 15 Jun 2026 19:26:14 +0100 Subject: [PATCH 2/2] [KB] Linalg generic fusion to help some bad lowering from torch-mlir --- 5_Matrix_scalar_multiplication.o | Bin 3296 -> 0 bytes examples/KernelBench/level1.yaml | 1 + .../schedules/x86_64/element_wise/common.yaml | 2 ++ .../schedules/x86_64/pack_and_tile.yaml | 1 + 4 files changed, 4 insertions(+) delete mode 100644 5_Matrix_scalar_multiplication.o diff --git a/5_Matrix_scalar_multiplication.o 
b/5_Matrix_scalar_multiplication.o deleted file mode 100644 index b5ceb8a42cecb16157df1d46badbf4a33b116b35..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3296 zcmbuBTTc@~6vwAfXv@WRK*~)h>Uu+2K@cw}^r6CN2E3$-O-!I$+dyoAlqEs}8iJZM z`@%QyYv3CJG~yHT1vEYyA1KCXe5o@#XK5)Z591`eGxMKwesgwq=44*pqWwiS8?uTG z^`dOgQW0t?&$j($(~lg?k{AKy9x=0u>m>A1OUFrQOWS=xlAGpxTl@KoG<;&7^8{CTO;3u809b4S(~JSA zc@o;>rlIXZ4C=K3#14=W8Hk90;!=P(WFW;5i4!0$8Ho4{ipu~}E(0lnNGbqQDFYE} zLGclQxMd&?h(v6XAjUEfv7i)JK_t~O5En#p6d*M+5b;+i7CSeHYh@ti5Q%k;>SZ9} z*HYX7pVKGeNggskP9;#D&ftcpw!@VZUs0B+$4h9!V(Vqs1! z^91qn{h3AN6f05erI~#RhI+-eqxb|sP8N*qkPL%OXNnbooRWcvy-V?FfSi$mR6`_Z z0dh_TA`Sq>tpGVM1F3;XE&!xW24dxZ6(AR7Ahi%lJ3u;QAmS)ed9Hqh~z3juE{_|QlR)cKzd{#O%TZqfZUXUh|EH9FF^VVLJr9=KC-xm zoCuM{zwFk7+^<`IXEN{qJYSEweE}vf{TL(0kdqh#?0MiMDXMtLJL_^pQXfbvsF2KO zva~U_9$Oan-1e@Y8pE^1Xs1T~ZKI5=WPPYYXzGdL$!rO)SWI>MUYX;<-@)7&V%S3r zcH?ppqly?)PHNm|;vMG!RB#Vv&2gaaXNn!AGx-A@OsGP|d-8X-6uc7&8GcsL%7=!26_XW~($PU}fMtg(77!R!Am zj~O7lnpxUPnP`t?C9{=(_pW3r*Glv3{CRz=YquIub3gF+b{Y#?`^Sax5*e;EWm2e-?g{~yZd Bi7WsB diff --git a/examples/KernelBench/level1.yaml b/examples/KernelBench/level1.yaml index 8f86f5c6..f7fb6e12 100644 --- a/examples/KernelBench/level1.yaml +++ b/examples/KernelBench/level1.yaml @@ -133,6 +133,7 @@ initializations: [rnd] output_shape: 1024x1024 gflops: (1024 * 1024) / 1e9 + pipeline: element_wise - kernel: level1/21_Sigmoid.py input_shapes: [4096x8192] diff --git a/examples/KernelBench/schedules/x86_64/element_wise/common.yaml b/examples/KernelBench/schedules/x86_64/element_wise/common.yaml index 939ac8e0..b7bb5f21 100644 --- a/examples/KernelBench/schedules/x86_64/element_wise/common.yaml +++ b/examples/KernelBench/schedules/x86_64/element_wise/common.yaml @@ -1,6 +1,8 @@ # This is an optimizing pipeline for kernel_bench element-wise kernels. 
# Assumption: M, N, K % 32 = 0 Pipeline: + # Fuse element-wise ops into as few linalg.generic ops as possible + - pass: "linalg-fuse-elementwise-ops" # Register tiling and unroll to fill the pipeline - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.generic tile_sizes=[1,$register_tile] diff --git a/examples/KernelBench/schedules/x86_64/pack_and_tile.yaml b/examples/KernelBench/schedules/x86_64/pack_and_tile.yaml index aa166837..25d28da3 100644 --- a/examples/KernelBench/schedules/x86_64/pack_and_tile.yaml +++ b/examples/KernelBench/schedules/x86_64/pack_and_tile.yaml @@ -2,6 +2,7 @@ # This is a good default to most CPU based pipelines. Pipeline: ## Packing & Cache tiling (CPU generic) + - pass: "linalg-fuse-elementwise-ops" - schedule: "packing.py[gen=block_pack_matmuls]{block_factors=[32,32,32] rhs_transpose_outer_block=True rhs_transpose_inner_block=False}" - schedule: "x86/pack_lowering.py[gen=lower_packs_unpacks]{tile_size=32}" - pass: "linalg-morph-ops{named-to-category generic-to-category}"