diff --git a/5_Matrix_scalar_multiplication.o b/5_Matrix_scalar_multiplication.o deleted file mode 100644 index b5ceb8a4..00000000 Binary files a/5_Matrix_scalar_multiplication.o and /dev/null differ diff --git a/examples/KernelBench/level1.yaml b/examples/KernelBench/level1.yaml index a6ccfd0f..f7fb6e12 100644 --- a/examples/KernelBench/level1.yaml +++ b/examples/KernelBench/level1.yaml @@ -41,10 +41,10 @@ pipeline: matmul - kernel: level1/7_Matmul_with_small_K_dimension_.py - input_shapes: [4096x64, 64x4096] + input_shapes: [1024x64, 64x1024] initializations: [rnd, rnd] - output_shape: 4096x4096 - gflops: (4096 * 64 * 4096 * 2) / 1e9 + output_shape: 1024x1024 + gflops: (1024 * 64 * 1024 * 2) / 1e9 pipeline: matmul - kernel: level1/8_Matmul_with_irregular_shapes_.py @@ -123,16 +123,17 @@ pipeline: matmul - kernel: level1/19_ReLU.py - input_shapes: [4096x8192] + input_shapes: [1024x1024] initializations: [rnd] - output_shape: 4096x8192 - gflops: (4096 * 8192) / 1e9 + output_shape: 1024x1024 + gflops: (1024 * 1024) / 1e9 - kernel: level1/20_LeakyReLU.py - input_shapes: [4096x8192] + input_shapes: [1024x1024] initializations: [rnd] - output_shape: 4096x8192 - gflops: (4096 * 8192) / 1e9 + output_shape: 1024x1024 + gflops: (1024 * 1024) / 1e9 + pipeline: element_wise - kernel: level1/21_Sigmoid.py input_shapes: [4096x8192] diff --git a/examples/KernelBench/schedules/x86_64/element_wise/common.yaml b/examples/KernelBench/schedules/x86_64/element_wise/common.yaml index 939ac8e0..b7bb5f21 100644 --- a/examples/KernelBench/schedules/x86_64/element_wise/common.yaml +++ b/examples/KernelBench/schedules/x86_64/element_wise/common.yaml @@ -1,6 +1,8 @@ # This is an optimizing pipeline for kernel_bench element-wise kernels. 
# Assumption: M, N, K % 32 = 0 Pipeline: + # Try to fuse as many element-wise ops as possible into one linalg.generic + - pass: "linalg-fuse-elementwise-ops" # Register tiling and unroll to fill the pipeline - schedule: "tiling.py[gen=tile_ops]{target_op=linalg.generic tile_sizes=[1,$register_tile] diff --git a/examples/KernelBench/schedules/x86_64/pack_and_tile.yaml b/examples/KernelBench/schedules/x86_64/pack_and_tile.yaml index aa166837..25d28da3 100644 --- a/examples/KernelBench/schedules/x86_64/pack_and_tile.yaml +++ b/examples/KernelBench/schedules/x86_64/pack_and_tile.yaml @@ -2,6 +2,7 @@ # This is a good default to most CPU based pipelines. Pipeline: ## Packing & Cache tiling (CPU generic) + - pass: "linalg-fuse-elementwise-ops" - schedule: "packing.py[gen=block_pack_matmuls]{block_factors=[32,32,32] rhs_transpose_outer_block=True rhs_transpose_inner_block=False}" - schedule: "x86/pack_lowering.py[gen=lower_packs_unpacks]{tile_size=32}" - pass: "linalg-morph-ops{named-to-category generic-to-category}"