From 508f8349b2d89ede64dffa7423d6473c95855bed Mon Sep 17 00:00:00 2001 From: Andrew Sweet Date: Sun, 19 Apr 2026 16:58:07 +0000 Subject: [PATCH 1/6] Add SIMD backend with AVX support for CPU BLAS operations --- CMakeLists.txt | 27 ++ benchmarks/python/blas/bench_gemm.py | 50 ++- benchmarks/python/blas/bench_gemv.py | 82 +++- .../blas/results/unknown_float16_gemv.pdf | Bin 0 -> 17529 bytes .../blas/results/unknown_float16_gemv_t.pdf | Bin 0 -> 17427 bytes .../blas/results/unknown_float32_gemv.pdf | Bin 0 -> 17563 bytes .../blas/results/unknown_float32_gemv_t.pdf | Bin 0 -> 17266 bytes mlx/backend/cpu/gemms/aligned_buffer.h | 65 +++ mlx/backend/cpu/gemms/avx_simd_gemm.h | 397 ++++++++++++++++++ mlx/backend/cpu/gemms/avx_simd_gemv.h | 219 ++++++++++ mlx/backend/cpu/gemms/simd_bf16.cpp | 5 + mlx/backend/cpu/gemms/simd_fp16.cpp | 5 + mlx/backend/cpu/simd/avx_simd.h | 393 +++++++++++++++++ mlx/event.h | 1 + results/unknown_float16_gemv.pdf | Bin 0 -> 17615 bytes results/unknown_float16_gemv_t.pdf | Bin 0 -> 17413 bytes results/unknown_float32_gemv.pdf | Bin 0 -> 17819 bytes results/unknown_float32_gemv_t.pdf | Bin 0 -> 17773 bytes 18 files changed, 1205 insertions(+), 39 deletions(-) create mode 100644 benchmarks/python/blas/results/unknown_float16_gemv.pdf create mode 100644 benchmarks/python/blas/results/unknown_float16_gemv_t.pdf create mode 100644 benchmarks/python/blas/results/unknown_float32_gemv.pdf create mode 100644 benchmarks/python/blas/results/unknown_float32_gemv_t.pdf create mode 100644 mlx/backend/cpu/gemms/aligned_buffer.h create mode 100644 mlx/backend/cpu/gemms/avx_simd_gemm.h create mode 100644 mlx/backend/cpu/gemms/avx_simd_gemv.h create mode 100644 mlx/backend/cpu/simd/avx_simd.h create mode 100644 results/unknown_float16_gemv.pdf create mode 100644 results/unknown_float16_gemv_t.pdf create mode 100644 results/unknown_float32_gemv.pdf create mode 100644 results/unknown_float32_gemv_t.pdf diff --git a/CMakeLists.txt b/CMakeLists.txt index a2395d02f6..2dcb89e57f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,6 +49,33 @@ option(USE_ASAN "Enable AddressSanitizer (ASan)" OFF) option(USE_UBSAN "Enable UndefinedBehaviorSanitizer (UBSan)" OFF) option(USE_TSAN "Enable ThreadSanitizer (TSan)" OFF) +# ----------------------------- x86 SIMD Detection ----------------------------- +if(MLX_BUILD_CPU AND (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i[3-9]86")) + include(CheckCXXCompilerFlag) + + check_cxx_compiler_flag("-mavx2" HAS_AVX2) + check_cxx_compiler_flag("-mfma" HAS_FMA) + check_cxx_compiler_flag("-mf16c" HAS_F16C) + + if(HAS_AVX2 AND HAS_FMA AND HAS_F16C) + message(STATUS "Compiler supports AVX2/FMA/F16C - enabling AVX SIMD backend") + add_compile_options(-mavx2 -mfma -mf16c) + add_compile_definitions(MLX_USE_AVX) + else() + message(STATUS + "Missing required x86 SIMD support - using base SIMD backend") + if(NOT HAS_AVX2) + message(STATUS " Missing: AVX2") + endif() + if(NOT HAS_FMA) + message(STATUS " Missing: FMA") + endif() + if(NOT HAS_F16C) + message(STATUS " Missing: F16C") + endif() + endif() +endif() + # --------------------- Processor tests ------------------------- message( STATUS diff --git a/benchmarks/python/blas/bench_gemm.py b/benchmarks/python/blas/bench_gemm.py index ee358a95d8..3629c0ea39 100644 --- a/benchmarks/python/blas/bench_gemm.py +++ b/benchmarks/python/blas/bench_gemm.py @@ -10,18 +10,32 @@ import numpy as np import torch -device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"]) -device_name = device_name.decode("utf-8").strip("\n") - -N_warmup = 8 -N_iter_bench = 80 +try: + device_name = subprocess.check_output( + ["sysctl", "-n", "machdep.cpu.brand_string"], stderr=subprocess.DEVNULL + ).decode("utf-8").strip() +except (subprocess.CalledProcessError, FileNotFoundError): + device_name = "unknown" + +if torch.backends.mps.is_available(): + torch_device = "mps" + torch_sync = torch.mps.synchronize +elif torch.cuda.is_available(): + torch_device = "cuda" + torch_sync = torch.cuda.synchronize +else: + torch_device = "cpu" + torch_sync = lambda: None + +N_warmup = 2 +N_iter_bench = 10 N_iter_func = 5 def bench(f, a, b): for i in range(N_warmup): f(a, b) - torch.mps.synchronize() + torch_sync() s = time.perf_counter_ns() for i in range(N_iter_bench): @@ -72,7 +86,7 @@ def gemm_nn_torch(a, b): for i in range(N_iter_func): y = a @ b ys.append(y) - torch.mps.synchronize() + torch_sync() return ys @@ -82,7 +96,7 @@ def gemm_nt_torch(a, b): for i in range(N_iter_func): y = a @ b.transpose(-1, -2) ys.append(y) - torch.mps.synchronize() + torch_sync() return ys @@ -92,7 +106,7 @@ def gemm_tn_torch(a, b): for i in range(N_iter_func): y = a.transpose(-1, -2) @ b ys.append(y) - torch.mps.synchronize() + torch_sync() return ys @@ -102,7 +116,7 @@ def gemm_tt_torch(a, b): for i in range(N_iter_func): y = a.transpose(-1, -2) @ b.transpose(-1, -2) ys.append(y) - torch.mps.synchronize() + torch_sync() return ys @@ -116,10 +130,10 @@ def bench_shape(B, M, N, K, np_dtype, transpose="nn"): a_mx = mx.array(a_np) b_mx = mx.array(b_np) - a_pt = torch.from_numpy(a_np).to("mps") - b_pt = torch.from_numpy(b_np).to("mps") + a_pt = torch.from_numpy(a_np).to(torch_device) + b_pt = torch.from_numpy(b_np).to(torch_device) - torch.mps.synchronize() + torch_sync() f_mx = { "nn": gemm_nn_mlx, @@ -165,12 +179,12 @@ def get_gflop_count(B, M, N, K): transposes = ("nn", "nt", "tn") shapes = ( (16, 234, 768, 3072), - (1, 64, 64, 25344), - (16, 1024, 1024, 1024), + # (1, 64, 64, 25344), + # (16, 1024, 1024, 1024), (1, 1024, 1024, 2048), - (4, 1024, 1024, 4096), - (4, 1024, 4096, 1024), - (1, 4096, 4096, 4096), + # (4, 1024, 1024, 4096), + # (4, 1024, 4096, 1024), + # (1, 4096, 4096, 4096), ) for dtype in dtypes: diff --git a/benchmarks/python/blas/bench_gemv.py b/benchmarks/python/blas/bench_gemv.py index 3cfc5eba41..4457a50fbb 100644 --- a/benchmarks/python/blas/bench_gemv.py +++ b/benchmarks/python/blas/bench_gemv.py @@ -14,21 +14,40 @@ if not os.path.isdir(results_dir): os.mkdir(results_dir) -device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"]) -device_name = device_name.decode("utf-8").strip("\n") - -N_warmup = 5 -N_iter_bench = 50 -N_iter_func = 20 - -out_vec_sizes = [128, 512, 2048, 4096] -in_vec_sizes = [128, 512, 2048, 4096] - -benchmark_vector_lens = [] -benchmark_vector_lens += [(i + 1) * 4096 for i in range(8)][::2] -benchmark_vector_lens += [(i + 1) * 4095 for i in range(8)][::2] -benchmark_vector_lens += [(i + 1) * 4097 for i in range(8)][::2] -benchmark_vector_lens += [64, 128, 512, 1024, 2048, 11008, 32000] +try: + device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"]) + device_name = device_name.decode("utf-8").strip("\n") +except (subprocess.CalledProcessError, FileNotFoundError): + device_name = "unknown" + +if torch.backends.mps.is_available(): + torch_device = "mps" + torch_sync = torch.mps.synchronize +elif torch.cuda.is_available(): + torch_device = "cuda" + torch_sync = torch.cuda.synchronize +else: + torch_device = "cpu" + torch_sync = lambda: None + +# N_warmup = 5 +# N_iter_bench = 50 +# N_iter_func = 20 +N_warmup = 2 +N_iter_bench = 10 +N_iter_func = 5 + +# out_vec_sizes = [128, 512, 2048, 4096] +# in_vec_sizes = [128, 512, 2048, 4096] +out_vec_sizes = [512, 2048] +in_vec_sizes = [512, 2048] + +benchmark_vector_lens = [128, 1024, 4096, 11008] +# benchmark_vector_lens = [] +# benchmark_vector_lens += [(i + 1) * 4096 for i in range(8)][::2] +# benchmark_vector_lens += [(i + 1) * 4095 for i in range(8)][::2] +# benchmark_vector_lens += [(i + 1) * 4097 for i in range(8)][::2] +# benchmark_vector_lens += [64, 128, 512, 1024, 2048, 11008, 32000] benchmark_vector_lens.sort() @@ -36,7 +55,7 @@ def bench(f, m, v): for i in range(N_warmup): f(m, v) - torch.mps.synchronize() + torch_sync() s = time.perf_counter_ns() for i in range(N_iter_bench): @@ -69,7 +88,7 @@ def gemv_torch(m, v): for i in range(N_iter_func): y = m @ v ys.append(y) - torch.mps.synchronize() + torch_sync() return ys @@ -79,7 +98,7 @@ def gemv_t_torch(m, v): for i in range(N_iter_func): y = v @ m ys.append(y) - torch.mps.synchronize() + torch_sync() return ys @@ -91,10 +110,10 @@ def bench_lens(in_vec_len, out_vec_len, np_dtype, transpose=False): vec_npy = np.random.normal(0.0, 2.0 / in_vec_len, shape_vec).astype(np_dtype) mat_mlx = mx.array(mat_npy) vec_mlx = mx.array(vec_npy) - mat_trc = torch.from_numpy(mat_npy).to("mps") - vec_trc = torch.from_numpy(vec_npy).to("mps") + mat_trc = torch.from_numpy(mat_npy).to(torch_device) + vec_trc = torch.from_numpy(vec_npy).to(torch_device) - torch.mps.synchronize() + torch_sync() time_torch = ( bench(gemv_t_torch, mat_trc, vec_trc) @@ -151,6 +170,13 @@ def bench_with_in_len(ax, in_vec_len, out_vector_lens, dtype, transpose): mlx_gflops.append(gflop_count / time_mlx) pyt_gflops.append(gflop_count / time_torch) + print( + f" in={in_vec_len:5d}, out={out_vec_len:5d}, " + f"mlx={gbyte_size/time_mlx:7.2f} GB/s, " + f"torch={gbyte_size/time_torch:7.2f} GB/s, " + f"diff={gbyte_size/time_mlx/(gbyte_size/time_torch) - 1:+.1%}" + ) + if transpose: title = f"gemv_t ([1, {in_vec_len}] [{in_vec_len}, out_vec_len]) | {dtype}" else: @@ -182,6 +208,13 @@ def bench_with_out_len(ax, out_vec_len, in_vector_lens, dtype, transpose): mlx_gflops.append(gflop_count / time_mlx) pyt_gflops.append(gflop_count / time_torch) + print( + f" in={in_vec_len:5d}, out={out_vec_len:5d}, " + f"mlx={gbyte_size/time_mlx:7.2f} GB/s, " + f"torch={gbyte_size/time_torch:7.2f} GB/s, " + f"diff={gbyte_size/time_mlx/(gbyte_size/time_torch) - 1:+.1%}" + ) + if transpose: title = f"([1, in_vec_len] [in_vec_len, {out_vec_len}])" else: @@ -196,15 +229,22 @@ def bench_with_out_len(ax, out_vec_len, in_vector_lens, dtype, transpose): for transpose in (False, True): for dtype in ("float32", "float16", "complex64"): + op_name = "gemv_t" if transpose else "gemv" + print(f"\n{'='*60}") + print(f"{op_name} | {dtype} | device: {torch_device}") + print(f"{'='*60}") + fig, axs = plt.subplots( len(in_vec_sizes), 2, figsize=(8.5, 11), layout="constrained" ) + print(f"--- sweep out_vec_len (fixed in_vec_len) ---") for i, in_vec_len in enumerate(in_vec_sizes): bench_with_in_len( axs[i][0], in_vec_len, benchmark_vector_lens, dtype, transpose ) + print(f"--- sweep in_vec_len (fixed out_vec_len) ---") for i, out_vec_len in enumerate(out_vec_sizes): bench_with_out_len( axs[i][1], out_vec_len, benchmark_vector_lens, dtype, transpose diff --git a/benchmarks/python/blas/results/unknown_float16_gemv.pdf b/benchmarks/python/blas/results/unknown_float16_gemv.pdf new file mode 100644 index 0000000000000000000000000000000000000000..fee838a0be7f8f30a01565f3ad2a618f57005ff7 GIT binary patch literal 17529 zcmbun2Rzl^{|8>7xEa|g<66nSn`>on*(00l8fC>LY1t!W6KNQgiX^g<>=A_$DIyfI zN`p%L&-<#+r;qOM|MmF)ugB?q-mmu=uk$*u^LjncdB0x?8>wqbBBd}e;i4hOYEC34Paik9w2_k=(I1Wi1x(;dN-(0g3sg|{j|!ST-Xu7h zTtM2&(D^WtK!VGx-)aPrv`k4(BqAKMPB3yJk%)fYa4d8SlQwmCa`EtXgX4ZZ_46T^ z5=n3y(5<>AV1*b&f=lao0Txt$UDbYFb>Mb?u!H=201)30_kKjc-8%cyCPaUq06zj4 z55&I-T-t!>;^Cy~69jrhfIl1(1((62;C3)+H83bp(;tpsFR9|~?E~(RP_aJ+0X_aL zIentH8_69G8X-5X>EQ{+3YXUO1OiYe5`0{U5X=4~KcbTtEGVbN+RlethUMgChX$>^ z+WYT*JLU87)Nb1O5Iup>Z7(!;&2}90ONlSz`SJGjih1|p=*PVhZh=Jnc9%oM`8(Na zfyYWoA2V+yKAgW@cV<=p=eOuLd8_3o4p@EJTlac@{^_Ox5#n?IlMlG)PuFne37>q( z`|!Hza%T09)lNh$uT)(_&(iLXCwJTS-9FD9zUUI5tIo{i8%6rKtwu>%RNWw|mM-3a z&wKQ~j{AX*y}PJ%E~>DaqEO3M2(&Gx=v&HO{Un)>ALx@$?^%s|nDG7OsK=#p!{o4I zT;&E&ac>=;&$|W%;%!$Riyi5EJ9AW0YE7}$$gs=wi>}>i`IRqz%ig=Q=v&-vblWSO z;5|~|Y31GfU*NY7y?YC+@x8;UMj*&hVn={Vyj{5;WXM+%$oIPL$MpC2Ur(PL>(eTs zGHzXMC>6Q5&=B_#@0Ig~W<35Nn{KQ8dlXEOr9m#?Z9_0lh`P3RhNK_#cDMUkDg6DE z{RFD79MyV)0unA6L$@PCEZwPdG9B*bsvIQ@HOjd?+xc@V3u**)R7&wE8_iY_M}JHxUn zl9hz6R@OMRW_wSuc@*uIGO@gcc=1Ccy&GN#&-ZP1oIOv#RI{~7OmJp8NAlc*ZC4lq z^TrPw?$6FjVq-sex#HSx`S~8csD(bYv9aROM@pzR6#?4BiE~;XnFzJKqW3s6+dk8X zzQ8&iZ<8-W3`MlawuH6j(Z#*7RXA(I5O*m1MzP1_fu|;tOS(a?AGprAmJW_3+g!Nk zS`_rCSL6cn(y3dnZJKil3Wkj0-i05YXpILf7syig@rKD*%J7x;It)GMmr`}$zxYi2 zOMY2C7}yUcwY0xp~@gpOPPk)}Z8$4Y`x`q_5Nixfsuutj$ji6-IPOq=kW?X>blnEvJ~y@Eceg68nx(;V2^rJ zS162XIHG_mkbIr1!5sS_b@sb>iZ3Hxg>!&~c^Q_f9!g_ZA)54QxbdjEi@e2bsQD+s zo3qA=QePx5&`eL;&F|MvZVHI$<*a+AO7&=fIG(IOE_vbO$z4~>PKgn2Pv|E+c`B4R z9P*yabS!lbOwXL{oJ9y<+ShOadDFvj?FIcDp}9?sxCVOzFUR;v`YV{^xi{Puhv(kn zK3loPibw?%%CyOu_4i)K)XXhw2Ju~XGB~jyJrFk{R+oEUEB1l(sOEod}=4lth4iWy?@mr%xzpiJZX2K z#*FY?jk=27e7`T(ja{!$J*k+g_)l>$AEwWBY#z2|qk6<88Pj9z>I_`G{ZRboBY&e{UGEmhBo>(d}!~LM0l@_;KBM+f{Z15uF{<}Iaqz2`8#hh!$}+L?IUXW zjpCS|J9%Sy^ctwi6XosnH7`{2oh&>c3mt0f)XiPe{KaRb24fVRn{Lt}nE+ez*md)0 zs1U!KL8ZWwf%QaA@92KXI66E2WPYJChJLq^;>@_gekWWTId^8RtqmdNX8rRNkQ4tpq= z>q@wM;PdDZQ&KE?$-(F**P^`=`|3x(sMJg1D0*!#U*{E9n+vt7U>c^S&`btl4U<$a1Ke?4C;TJBuAZ1xnPP=K_%Y%k+(>gIB7R?Qa}t}%lSuI0>_ zuYydrVs}}K#LtS1u^+dQcyfKEc4u^Hz_qi7zV%UAS3BsHx9)dkIZ}EjBQaOKntpCl zB|ww;;R&}@CD>OdI=cZ^Hm1@$97AX3cZ&7w$&;Dt{l=@UDI1DX-g=3J&z`vFz*pj| zL$d38p3HX4Hc#rpphgdvSye5ng|2+yLuLQMaRD={+LTb``}Ec$OP)N|4E8sg#a~#C zB}|{FU}S%HTZq3$Dz5dTqD?f+%`z#;J+ZlK9=fryTvwkjN=DHZw~mFwhP^-U^PXaO z%+L>8^0Kd;i#WV{zx8|K^*TG^pr`KCQ@6nMddpcc`PVP*M{K|76}+vO>5*#c*G=er%TaNCT6so&0rttKRPpxSv&iuwy`m3$n56VX z&zH^|7^7+yoXW+ukHR;hn}Ysr%{BpC)|%U znPKCJAJd#Aa*?YstipcQ$qyA)lIoSh^E)cP5l3$7XqDYOz-)mCF2|Z2aa+7G_U&K= znn*ZWV19V0>n+Yxim|fNtE|_ApZCn_&2*iU(vdQEbp%y;=OafakD|hl?a@kd7H8;% z9qip1Kr39rKcw%`$DbeFkQ$V7r6*=v2FkcG^>k{P`BjlZoznf=k`E)}zv@c$6}F7d zVS=&`hFK^Dgw|BU)GbuNkt6cul-!06E2_VsO-MX^L@l;&Hu;OXLFYsE0xaroXtKf zv%(irNv#X*M}7h7Upo^r`i2lWN>d%Hr0?GVuhVEf+%p*_k5=#Ii+0ABwexLwht zI++Sd-&qOEsCO_?tgAlixOADfUQr&2)MMVFvgB=y-+RDIxHEC(1<_4sB4g=_*J4fH z`|UR_!2LUpcCb5VELO1yaYz#P^}F2+Up&fin7!@;E1LBgzfLoYpGl%XQDRB>z!&Zz z2bt-WJzRVwPorSHlv`KKn(F+@S>!X`+l9}DU%T-1J^I=0*^sLrql)?p4xZup@=#Es zvEa3~#twdthtE2B?n)GJ_r0nYRYdf`vMX%|bPAm1hhabKdwR@de z+v4l)RzpGZe{KqY?S)Vnng1H~gPj?}7A-q3lc~9xsftQ*E=?Bbg;H*evA(FAoo+WLWT3GMt zf-uXQOIDwlkBL9Fkqaz+xYSVoE?MlBs=~wz?m=zq0rlRGisi5U)ku%vp-M9RJJdbS z*+&*U{Mr=Yg}HwZS0-{ZT}R5`?Vg2gUSbv~n9R-ivp>uAT{$`Pc=>`y@ztcGDgv06t0Qi; zZIeN3Tha|}_fe1^g}o#W`M<@7#$qiIMt91$uw1*Q345oR0TYd~RaS~g?gkqF_FY|nGl7COxEodl9`V1KP(0VkP>-NhThMQy zePg=qhnYE3$Lc`T$>0VioAxR?4tl=)myzjIK4GOI0giU|RP3r8TrLx@MXT{#|R z@4LbJROsky^;~%a-HS}+&t8p2PEviD=9#6y;h%f`-&<~(&9IoH$I=9+S6j+a)!MQa z@~QmUblA`to6am!I2)l;THTT~n;zZPnwGHSPB4s+w0%q3bI_D4G48&r-IY!!)H9#j z?CN?mr{w3edEa-}=6))2jV>`qBO_$8SoS!I9rI)Q^5weh+^du%PfMqsi+c*n5M9YD zin~nxd;DXPR|FmWjyUyXz2oGRyv0>a)bE{jy6GuN|2{$G*rP9f!M8slm=``#ji@NO zP#7|FqOf67aJc_HDQGmt3}MuR-Nxb?weLTgLBra)wX93OD@1}|YATemH3^5phCkcI zb!DsXcJ(vcE%$6OIl@*?ffMMQX9FjjQ4h(4Qb;|hPZ&1Pj_!zGip^u|SoJ~#HeEhg z($*A1m(yn8@mVJmqt!dm?AH6rQU7qV%TybFEa@(bX0`2`=T8eG!r*b#bqtoDv(6B@ zbJDL5A87xv_ZbBap_8T!9HJ4MutJI{dfvdSt1T%-O6{uuK6vRja;Iy}J3-)DEp!0YWC{C5^uZn ze;O_E?w4|`HI>tFb~)%*uL=(?6e|q&Eb&3-9}E-L!wd z!J$Ec8A|>cjr!l*VE^>b{qwaVEmxwyt__$xVxv_SuBZ}rCJ1hmMlR!HhL%&X`TI`0 z`B|<=7)EL}7LV$%g=W674-D(v+e$pZh~}%!w0Kg`no8fUfm2jqM!gW;C5O1bJ>Z4w z%h)>pH)Vn*k2N!%4X}KEDn|Xtv|?mBjK%BLcmE$nJH39qWIY&o;=RsW)|knRqI%*u zduof-c17cAJ>;pR_nx^TxwZQ*hkGg=j#1Ct?=~9S(6GOUakhHSZOokC-ER207okv7 zx9!+HdM~ZWWk$j@2{W59$VfFdgJKW2PW53J!Tw>hw#UsgpG|x5KU}Tb$+R z)UNMoSLe^%DMafG9P)PMJ$bS_Ztsg0r|R175m~sS6#hOzG9g!tSt7<0vcA{bIZOU# z2u;B`<7EH$1fp@}K)(?(kUC+&@xo{>H>4 zw&UvMv_}WL9;LzLTzjb{SgJ=|SPf)5@9UVvbETo)^Ii)SI8~(3o-a4$V(_F$Fl+Fc z;j64=Zr@6Wmo5j>+eo}!%Q@vw!tv}iOXIP)9R>+6r)HN}J;bCK! zH%q458LknRwQoCCqpbxfNRWb8N8_>odpZ?O^t@Y?XB;o3_}3jvOHRg zJ5d+K)C(Wq?9E+P&4}EQbnrv&&c;*t)p6;NhJiZ_Rc{ngDtCn5iK@1G8*^o|s4zQC zCbJh77+pTpIAT0MvxK5K>=S==CxQDjT{(@amv-~E_sUz;XVc7zlg_ybu|8xUc5~)- zob0{3^){E8dd7*_wwN1F3gc_lo0SX=zVFY_m$Yd0Hsx|yNsI40Num+Efmuvj>@hFv z*=ZGfoaHNiMSFR&-;+N|&bxmx_AMW}1;z5jQ4jB!n>nsE`sz>s^KG8$swbpSzVv=y z{qmX9?OzO6Nw2IQ2h{q}<|)55Q2B75@wR4Zb2CcmVuEOJp2VF!*|nPJ2B%%j{Yy>l zRC|8<)$YPvKO#_WC2--MvwFf=Pk+S+)cyJkE%^Q>)<(~Ti~YHG(P%OLu$MJahi8-n z%j!P)R$M-Sp33v0FsBso5fZ%#7if^E!bpXrrCK;1d082*9Mo-L&h%={LGMN2vz@#h zA)dzDLxrP*qW8Y(eZ@4Qfn#?Y&v7sjb>=^B=zAGq%AFv3Y>|){ay0F*#C7)|d%Y(H zO`MX)SS7C(XQp!bRLy5DX{CE5GiY>6?o=J{O!dsPuP2D8>cV&M2)Y|K6S!$DO1CJOZ@ziuBDXAIft#&dYBlsi~ zmy`MZWq$5Goi}*@P;g4e;M(~~0aN2;lR_zHRlAtI-MP#k6myTT94dYA5igOSTmX>)+m>#d# z=1`Cz1yF?F#4&?LLIZO>oR)gcOoa{xdug~Y6*%35U8s|elZ^8bG&?JfYx2>g@3UvO z+M!DrGD=i_@WxNFV~B>ySLDpg9F}m$bDm>mxxVW^F@Iz36 z0+$q^6B@IL-KPdHM(Rk~R}W2nL$%*zCVeWu387}23;NF=kJ34M26dWVb9r>9T3AqM zAkFZUfLhVD6^x&dz_n!^I-0=r#I_@PMG3RXg>08!ejG3Tta)va-TKRX@72e7#*ZJh z#!D~XO{%8S9xL9$Xz$Wgd@H{&V{Td@?5mrG zpJsetoU9ywiuF@`*x1u6yw>BkGNof)f~D1#BlCQ3a-?m)&T#6@Ml@5H9SS%SxtX&n zjik1tG+O`q8ZL;am}}W zM-ghDTfs%9sOXbpZ}k>@4{YJO`FY~G^X>dRTWL7~madfXTgrkb6;&IEu#-9 z7tRTbaSJ&Q_uAD0=TI-rise%0$=RCR3GFI`UvYF$;D-VXl-A7UO zA{fywuym9=E@;gH(Yu}ZR`3;m+3=c%;x~?uYDlaLrsL0Bb6DsKgLDltYIX+qmDlyv zxsTqTd8hY0_Q2zA&WlXPzhC+SZ%l3lq51kiLCO zwHt#7g>ETfKkO!es7A7$A2pm-ed$t|V93gZLaHtiS-pm1c;bQ!%q;vFp(}WeMPice zIO0^t;$xSJ>0@{c`Mor38vD<8m`{~%yCrYhoOLAj@C<=CQT0rId&}M@cK91ENAu$Y z%mT!G^}Ex*7!Q97Jd}1yi;)O>QRL69@pvFM59M)k82+|5Rl?KM|)qLm%4u*!1%R%k21g;AE(y&v9n$!Cmdi{JVL=0x#OG zjrD(fuy!;`zWW~6ZSIA!W9Rz=+Z>!z*!{Q04T%;`9+9#h;F5ctif)^@_9N;^wPWV} zaQ%gE2l~6euTt@fx_+l1N(vwliQ9yBX(Y+BQ1R2Noj8~)xGYGe)a{BA#Vz>|UU-_+ zKeK{|-)9Z$dl?ox6&76~JWR7tOzlQknZ|izSth6hLM#f#-P24?RkwJglRmoR{Ka<` zv+}qy2B+n7UD+ka{(3HN!tdJrlW%IiUmU(XbRNcEhz;#{#6$b|z3I?5oP1tQO~-0M z+fdQKY4KTeCyzsVophBK@4G6~4VmI!-cmfk2b23sVXPFu5em5pwNW&xPM`)S?sk!T ztm=d3Zn$y8>0Mj*>gvB(ZQ||~b z^}s%!ihaknH2gfHObwIijz7HbHur@(Qb z0`Awww41&mi#1MwUe{pz+HJqY%l2`5m6|@Pu_dQ4zQe9r{ao~CC^1bGN%r=s8?(>3 z!AQl-oswNEXlGR_#lq}?RWazUx0VdKUunO(w_LD>UioWm^U-XRbGZ+X_+IUnek)@7 zf`W)Bz^%>L8jbk-(D|DBiEED3r}uqTGEQrJ{g7douy>Wnu%aywU6!afx=C&OC3=Y{ zyQ86>GY~s%wT#YPoXvN7UiY3l;mb9XPYXL=_pJS4!X;&rOPRMP61;;Ab<)XZ0c+VAEQXc&>9I&IvsndN*dm;_~s@NB_g z)fR+=FHR>kTE#-gQpqMqpZ^|>z4mfHo5EfLm#|B1F+5eN9aB^bCN`x3Q(NbDnG4yK z(GyCGYkU=*sIe` z2?C;HUvw)omwf}=K5pCPBrJ~Mz4d2&p?C96Zo8E@uuHi6v6wKzXG+nBhThnu7 zNltQ zK}2`!LBoS}mgpINJUiVDXKTv_54J34x>qa?rpBh(!Cb397Uw_8pI+T6;eXPf!n9C8 zQE2eC>3_a%#exk64cK6Gi&iOL{b2ay)6JW4b>Osv^#Qv+!f{+w`p{Fox<27oSDPkm z_?CoARbMC0Wp|mhEOC^vbxW3Lbu(8=!l;c|6|J(Q*#-K#KTpXO`s!W|wWO_N+gls1 zvnBlS@($jsXt>d&a_fC(lnGUs&u4Xp5J8j!) zU9+E?PdPU7MvKNqF{ zi>>duRCd*=wT;*M%O@tY>5~)=$$`oK>yR9~iRFQYPBpiv+UW_>5a6$d+D-Udk~-V_ zsJpt}hS9zAJ+wRA{H-sSvd8&6?%VXYFA8rkdH7!n%LAp%RKL> zO#W6dEF~m`-VBmT)MKRP2cR@3?_7NwANcxVI?{h_F65#&D?u^REVMc-S5fK+)kStb zey)m-l{YfZX&h2A3pTM}>KKhKYVG77x+3u*Oh+t=-ze_84lD z-HFRbBz`UTWao9rvk?eLIS}3~COtG>!_$7X|NiT|xVke1Tak1WMneHBp#j|A#B#xV z1FSUj6Ltz7;{9r25U!oP@{r-iAREh-9W z=BYgyPv-Bq+MLets80-v$Pw?CY_p;<**h#dcXlcFxa8gKF%Pmb&cb*sVmVH$e7vHS zaO&%BGb?!S9Jh{de5_G&NMPtoZF>9(i9ln%&*=R~)~T0!HHK6|g;UPsTWUz3N0xg^ zudKd^mw6~BHf>{li11Wt?_64|1Zq0i>NDw6e_QxY%tGGE%?|J}o}rWi_E}2G=1NVRHjN z27l&#ar8W0S8D$qdeb6P^+-S-_}gP`LX@DDxFA*GEXQ(&iYGw zIdmJy9_g92Dq5UK^p;}a7Xm@BaRhq6VjthR9NbA4yT ze?E#^ieT!0WvjAA>ovzcHxERMhl?|8Nl#)~puiFZxF)-a*GO2f;|W2~tDi7@!#wtI za0fr^#lsdIU&6?6s=djW@>@CPbJs#eRK8@{!w09gT4;vXLdJh-7@+I?;AmI1X0 z`rMZQ?!xS6!9CXdy;3<|`<~j8!!t*LK?=YHh1`VDu*P^sutT3+VqaAjikA*D%-jEX z_2J|AUJKp^+o`e~yXn=|jNPHzw_3<@4@?bwNLMu6*GpnY_1(gD?xtha5K#Db?%H$K8Eec_ z6iZ(#bJ1a{TKS)dgIl(+9_`%ba#=U6Y@*qx^mEyE%kQgOFKd}){4F0Kihn)&dhifA zB1l@*$)ETuXJBlsZXvEtJnUo0qLt4_w;E8w`zf%m*e-a`z*Gyeo)X0{?mhg1kTA8v3e&NNxo>b|d(B z`gp^kFM?FzGGNQ91(yMNeGuOSmj&G$!DTTpX*0MiC~5`Ag96TePJ|;w5}74DkmaxY zb-tiikOAH1=OoU0h!lglI7+}`}kihdJaA{Atv=?03 z8}JTCf+>>*?1E_qBMyK|2g0R~0oenpGnjBY7>I(Bw)AizxdYikIOorRD)Myy{)XdU zb^1dg$g}r<%s!Y}PbW8jApcG2fr>u?3J=4h!04bJpc_dP^uA8b$ya+_zzCSM)$c3_ zJ_BKK9t0I{H%}s9T-uaG^s<0522Mfb5rF|=P=7|I{l;-Na2x_enEn4xu>Y@ZpaIUt zfCeQEZ`A(4wXlP4-m;$v&m&=hF!vKWv+LwH%Z4ESmf2UZFebS?uW0O3)1EDR(#4rn+Q3mO4SR|W&h z$^r%uAP-%k=V*{X*q;oj54DL0J_9GaPJ^!G!JuVe&=q0=8WfapK;fXeSTOMQHb7a> zFF6lR2~aY!s0S`c+FsM%~s6!UYZ|0yYS%09OFwhMG5)s5UG&JZ65_vwL?$#3w`)khr zX=*(K7+h}x1x5_2VIi&n=TLP>4SrA8?<<%x*l$M2Ent8tfg&Oy_8~p^r+ugtL?kN{ zXbHOirxl1P1Oi5W_NxIr_~hw#f)rt02OvH8Zz9VY=mn$`ee0pFo%FwNCzk+yeya#NH2g(`!oI4iQiX9C}7@U zZpp>{;XsMjbHJs7Z_WScjDwz3vfpQs|YIf+b?5(F#%jP9H4k|{Q#gt zzjFUNQDDu;4+DX*ThASX1J6xvitO^S>%IRQVGXtX+YM@h#kbz(zt5rGem_@1{s~F` ziMf_mQvrpf2~Kb%0u&{8;Njz~4h6OESC>N}P&n{G9U1|?s6%3{MgNdVkRQ<%23ZpraD2ahz^`G! zN^ph$LxaF2^n-i{Y^td zYaaUj!yf{T2KH?O4S_-cZQf801)%1CX-Etb`2GzvH2&|gq0s=WZYYOELAC+<{mBbt zdp6K;z~5}7LHJ+;4TFTt=YQ*BP{1c`q+ucJxS<^GZ&_hv|1O6`0eHNjE*1^h!vD61 z#Q+etk%of|{f2UQ$bA2ohC={9yOD3pw(+mIL?R&Iys@tAU;F_H{9O)-{#);n7_dv*&>j*8FvtcP3h~!k1$!w77(l;& z)&l4j>;N~?p#RQdBMtx8Izpm>6gHLvdi6(pBtIt)Pof|BzX&n)2qpr%2A4MR@qvsn k*|q6-yZQi#MZN((z@OygMzfTiHsM1|h=i!`QMf*_Z5O8%x&4khGGDkex`0h_aWGCE1tA5|NZ_ zMUkyVyZ1h$e!tTE-=EL>j?eA6&w1{0?z!il^S$?ZPNK#dTGA*PEKIcU8N8qhhJqvE ze$EGAii&WA`C)$|9HH(+cJlUfha-%g+=&5jG$>#SS5}4*eO;k~a=%s3^7AFbF_Z!b zDDq#cJ)>>n5=C^b|Mqu*j0kD6PZjT`NDC~Eev7i;pFP+>kh~Nd`j|jF(Z=U zwxC-LEx-yfm<&hg`T!QxeqPmoUUlL2zp;b*djJsM5ceb^;BJ+Dgeft=FOcK{#sl$h z3P%_aT|J%D{DMJ`NbrY8q2aOwG~6DBPzQqoH3Q&;)sm{dzJA~i1r_^U5YXdalG7*p zx|2QNpb<*rTAtoutZ;;uHxPgZ(Z$b|2(cVMCJ~){V8I#9Hp9OBvK+_eOQyB?baq+9 zH+R%FA3<%N-FQ($g{PENr00R*&4lRN0#n32$JvkMxRpI`yv%ETZo|Wu7Yr^tCpOC; zdo_zl%{*Rt_uG%pw-&qiu(*F1Zay2@vVUQxe&nH39oX5^+(XY>vn{jeWzfPo4y|gwkl-^BCwBaZvQ#+k(|gfA)Ns=oDovECBxa2& z$83u>cD;!D5-RoNDYx+ClSvJx39LSLBu~wol|fs%Wiz|z#9%wTunkpt7k+ht6ky4Cx=xq=tnSZ=tCT~N#6l(u_wBi}zRe#q;Hik%<1Ut3l4S#j^BZ8h=XksXX2 zdZ=TYN?N9)o7JQ)K1jc?Goo=O-qtK$=X$3}M3$(@m~y1~oIw-oLld>L;qP@K;ue*6 z2C*{Hm(0_03Ug*<#fp8(IF=1BJ5q84yYcYklO&hM?CfJ4p>0pX$)aq`*R-=0whB43 za{kCo(L62dp&_-SLD+e*t?ZuN(ACg~-r0wuW7E%seZ|Hp>VK?v8nN@UFD`D|i#sJqCgkbjGMB?lGS?T|lD6XIeXe)M+a48H zW(Pgyt2bPH3sXpcP*o9#R|*2DYsknNMiGn zDtBMq-Cmwt9+SFg^KCdcpNj^B@pvNcoQUt>d*>d1YmSOrxlZ*pR{A=xE7=&xgH~o2 z>zM^wqa9fDL!}Lmx56~%%%uI!nvUu-U$P*dAKQBs=YV^pi17+IV)EvP`=P*%4r}4cs?XjGQ+r>S{qR81Mvc@(9FHsMa`A9x%gW;NiPtY>r>-vc zAI(vlydKOV>=VuH8`i&IVv>EFxte+T?BVkh5RlF!;Yp0*$S@`wq&{c69%F3Ff>y+HAF)lJE+ zOP%e*U*G7f+ObuLRHa0>e09y4RGmb<3f_My(lsi&BaO#adxFoEZpaoX=k_XUT0d5b zsHCfG=l+A#mzpNICtTS1(uc(2dYkCXv4XD~4|l3Q-Qp2EI}gSLI4_riOi$>621 z+y#?%cN>3`Phwk>40y)}uX@i4ie7l6ytV%cZ*N}dsOh4#Oc>`SZ;k8qFCToj8y~1~ zw0s)DRX%CDxKE}>&N=~cQ#%2}GL+n-{PC`e^#^e=|HB$*H=5a?VK0PFJ?almrJdV7 zsTtcjaE>YdO8AqPqG{*+5<)*QTF^aCdDy@26(5F^>9Ly*|4}`yF=XX**Fi=9Ko#)QoOl+0$D+!?=$W+((e7B`>3ma&mFr%q7xXG z5v|C(oOn4|rNVQ&#;*|V8^Xdj#a=Pyan+r!RD`RyXx!PH36tbK_;h%Ch-TS~JvE%T zY8X3fv{hNs@It9W>D98Pn`|XlSWvHrr^{RRXxy5Ya9i%rn?88E^WKQrD<1mec9qW@ zvNXSrY`KFy(iA-^R;8}_)v##Z@Uw1KEaFY-rD#9dw&S^TO@nk6xmDV8%}a5cWQ4iX zn!YBT;>nrx8kV%@zL8UPCfR(jbf7y>F1OrT_MX{e`OUeH%iKHOwMSgC9ou;0PQ_JE z^Y%d2-S>*ew%-=?y?o^{;W%Tv!_EPI)yK0kg>@0|;)dan0nt5E*?Tl9X5?=B4~8Cy zEEZ8Rtdh-AJ|CQ!$sq5Jl=}Md;Ak|yrF3_P9}r>^NGtaM7(a`%16i+0$E&(nJ|-wYp{Ok*YaEv!l+r@5f_ilPo13 z=9y(0o{P05ZdpAtQ3LOkPBzz{DcM%|KxKc+TWJE8uC*xBqPO_rn^INvkIu)Y_f{9@ zYB!szU(+e-xG^fjaMLOyO<=DPzi*z+y`(F0`T5H!8mGV33WrzJE6}$O1=b>sd!EeN-?F~0 zql&1W+RmQ!TzSts!CoRUV=$u6ASmdA@bgJQlcdw#HhUF&(k-qo2@|pe$tW|MTh>OA zGqr*R8k`$N&l_0fs+_*k?c`LJJ({a;MxUPz+a{G8$Ik8L)!cEJl=H!InM)yx`TVZ) zPX}&i`mXS)v4$uGtHy-#y~-Z8f9e}~`QAC73-HGehL5pY$l!E4gU;~|_lxY9b-pUU z72~j1GvZoCRD!9TyQMAXdGl_Wid`>m-`j>gABgIaq=<~k!*^znU z(|qN~mzqYe9X5v@I3}B|RWw&Uj=NmIQ_@?o<*|NI*Rr~Y96#!vPEX9stlqh5jAGv4E;g6$Px?eu>4O_+bc7B95FVyfVYa-!7Ig~K5EZ)oS)BvSnwrI zM|3wmUd`|W@ET@r1zkVxz43H_q-5PI zv47vHO5DC&IGvPr!^)$))!FGLbdz|)dRKWzcA(|__37ml zlinJhGuqfSjT4k~Xfqj1d2_@xyhb=0ji^xMs?`iHL|A?f$0UbvK9|kOPtwzIPUF#+ zto6-r8JE*%<9K8K>}~Y3_=G)QQfrc7hM!4li*s9>lg~9&%-XB{@mK?hbZApG&>Ytkv3IK;dIa*GR@N36HLGBRI7Qp*o1)*)q8uX6F zV*zCU59qFBS{YA^V(F-WrPS>=RjRE#=(wCISf`|bLg{UxQC;>mA@J?-5$%Zm_JruJ zJDIY4#b>E1`^~277vTZz2iv)wQ?BtOqiVe8mIJI<;^#FJ60w;#EPe`|kgJc>j zTo_w&YVZU9v;DHKzisE;LH0Hd(Mu?}V%}IwD(6s0d1HTS_SChDkKSPVif2#eE}kju z&oMk9@Zr9QR71{)jwXW;&;7m*ftyk}{Qbj~$7#px--cNk6)Dbc_8hSY(c|uX)k&w% z>!VP3^0u{0gE`Xz&qJYpn~2V>@XD4hs(#WT{TsjaMmgL#wv#l*jG0Pq&%_vAJ1lg*;bSFuF&ml$vzA!TsDnkyQAbG68nb_d2(-u6XLur zoq8^9&$)%{jQ^&z*({(ZAR_*o$bOQSQ%~Bfty`rFc#DYoy|Yd?yro&*#Hb#6@S#7X z_#JY~{5#qgs>-fZh727lteF%%{=X*$gTb03jeBq#Io!^O{h=B3oE;l(b?SGXlyWgM z+nKT<4v)s2>f6kFWrP1FjT4(Jx6_zr6v{KV@8|FTO<3f|prZlgBYsgW;}rh2%Y=dzCu zb!ypLi#NXscayJ}HX&?fz#H7iERf25E)&c)D0bQQ|9(CNh%dkDQ7E%6U>z?$CnHu?M4jX=%LV+0RnxA3920)4wcc z9qZ%{=+L+nY1V_o;?0pau@<0vW6=FZSg2K{9kPrc+0n_-1e}?=^Xqh8HDbIKe1de> zann>?XU8f-f{{GQR2ZUyG4Oe;$~t5pTY7_T$Q+=>slC7129)MV(zlHX=}; z2ocXdCE%`#9d{>Lev>jft=&*Gs>>CcI_3}**1^|8+{1?1QIl%XmD7^Q(x!=5Qrv=m zBDz@~S-mOniQCi2TA{I9BBl?uQu+ot-anG0dti3w#p^H*pMoy|UkjOizCPtN3_9{g z_a$e?)6K?37v4JP+H*g2jbC%W&|4t`}f6f)mJMPO7M^19UNr#jQc zYiVtQT~o0*C$v?kuBS~yD3y69qcd>Gn^d-E%P|y-o!w2_xykc%K0_J&O^kF*mgE+x z2ye*x=Cy4t`I{jO73YkX`|k1J6=0anuFOK?o@c8;)c=Kb?CN|vBG%`A(k1!X~e?LB@ur? ztpaJc`4`2s=@Q2J$M0Zv_G}5&WN6Tn`^4{GvHz*7VR9>3u=7(!dDkfdch&MlB%Z+_=IQk8GN-2`!n9hz z=F;&%j^yXtrnYgUx}V^E;kxpf;m27UVJZ@&;?*$(+<#A}lBu3=vkK$z6qntVM++Cb z51a0LUh~z@OWpLkWKATh1$pd2T;)qWdya#r+bbN3Z56K_m9e!+I~-9R6(g#A=UM8U z&>efSOqra3_7XGtlB7od!yCO>pVU%LGsGD#WHC1!Bm9^^oU9)#WxYG5gjOxx`AR~q z#n*&4okMkt(^NcnevWabb;ApjxtV1&{QkJO4%|{K zPllB*tu&mj;3E`f9WKxyR+Wtw#Yj7U_;jWUTqU^MVjuhP z%6`2kL4C}E?I*oWHie3w4L-{^);rApLKDyJK9RBCRKi*4f{}kF(u_Yw;?R;y?8$>k z2c+^mf*tg_3>vpeAL5kGElN$~^}9Qlx~!e-6VIyIEzPVp=$-Hx9m;TY+Xd~w3D4Zt z=O4XGK7OO=^2X{?84wjXwXS6jHg)G{!KQAC`-_T@Lt;&ydYbyw%HRXG(*!!El7z&R z3s%hRsPG+9N}u9|0%X7Q9nT)Bwhl>XA6mIEC2VH$$uwWaSM?jhjrP# zMS|4Tw18TM9jEWoVX`*39MeTt=D$i7;xuFlZAb~eVI9W|nlbfT>fnrYE9R<9FxXQgu|(i>V>Y9YI!>|8h0`Go&z?Ec~>p7xk$r1Tx~jMv>K@hpeq@Cuu|SX}<$ zUYASD%r0L=6sd4Y1v+7{>)3s20%N3sVjRA2<{zpvmKyi2{04-YZ7=FSemKhH=pEc) zcFpxcX{D&h&cP(3W5Vi%*S=v%JB6=((q*C#N{(&y(kqOajnC)Ge7ZPM_Fn7S5Vy^T zx!&A|*(MJkv=Q1iXI7fvR$BJ>N6!AF;CaP-Nb7*$U3wD9x zu-#!!rCu_p-V)LMD__p?*v%hXc5an9$x}F0ThB(5pT}8F-^TlN4lm(Bv#%;jx8dqi zh;LA~_tkx`Kl9UI*pQ^U2vJ*s+TIbGlw%i zd;W^eY5sb0EgMzN-5cK)vLO286~eMOB7K1=u*wxt({TCet0}ca#zRFkY!0rCMFm&$ zQ{KK-4EyM=Nz#h$kCKxYO0aoE2pfNNMbKu#PPS~^N2ILM^2OYau?&RW$C<5qv*Art zW`_!nM6KtnN;9tYAU)XB%-`53iaZJXP}us8edb5WnfHCFA=}hgoQ)bv?JN&EksD(x zq%o`~w0Y-RzMx%d9$Uf1r)gQ@BMW+Sz63S%-grOx*tz&>wjDxVn4>dcqCiFDsQk^= z#5;E|-^mU;W`*WCS0p8;e4RGP+?Z4OY`SByKX%CEO?pfV|;Q_l{w@s9DklO2!m1;6u zby%7nY`Dd8vRpJHEW-Wd1-Q@VW;l;VSz08ohCs%K_zq}SAv(;{PK6&TFi>tCy9%XF z0>?=tBYn?hvu`5dZNke3`J;kYERemM1Pem02+5tQsxKOId{9N^oHv_z+>*h;lpnm; zAf<{qq`$njzt&^4dghhh(Fp-nrCKVI+5>b^ac-Aggd{Aos z_wc#`?YcRhGDI?y~&(qF$j`Gd*x&q3Bn$F$jquqTB9{NAGvd>XfjYh5gwV_S}d8;P4~ zH@}4sEij1397`_P?SIMpW3L{PjqbY|=Y`2Al^K@MmWO8VCpl&(CW59y9sLe((HYv@ z7B93-AS~#T-OBjD=X)y$&nR@?<}K!*A3tCVf|0TBB#U7-VuFHKVL-W?(*$5-WSI)Nf&t1B4@%Q$@Exd zvu84S(E}f_o4JTnz>O^==_>D)&GOtMZ?h*!*XFuDtNVZ0|KYy#bHc^Q(Dnxcj1S+K zJ^PGT$gZkt|B=)BtZ?x7?%91#p4NIDOcj@^-Bg&KnGv2ADDBw+lmAF%tW>}e8nq6! zQ8KQKp#vxG_NTX7)rGvh?#>gXcWvdp&gYN9y*!$xlDM=42}EF$sI}RM{Lz#qw!>#$ z`nuvpB{dO=rJ~C{u*GAMueg?%q5?H2xo^coR+t1=mmEuJ`4gYG&T>66tJ4nO$HMXFd-K1MbjOgBBB zb>GWBw;S#;SO(bdrTn#Pf9j&#SxJ}R3eHH_S6-7M;RSNyq>od8ps zgfFI1ebZ$YsWbKmL*J($neDWV&tICo>h!qw4PDHKYo_n!*^@5M7LK2p*|1SdGst7knQ3_Jg4HL z)%6``FSv|}ni~}~73fXpSIVO%Ywdy)bbdyRhZLj@$KLR<1J)XlK9Ph~=;1F?bNd<)Qq# zOe?FZ$7@<@Z-PcWPlAAPo=b$nE**Uxmjsw+1zk{$mVgI zMvQ;Vx5(F<%)c4Eu}fMoI8~B>`m)WuM&VSn?k5M$XEvy}KQxeKydRnTG{xw{Bg?K2 z)pk3c>0E{l%C){@)en=Mp)yb^NCBL||M!kw+EkC!3}+b*fZ1Qcd}5eXd`+$jzPEBG zAyHF{&G9|z<(bI6N3G!%R|lFZVmeh$kCha3p6=EeZj+Dbh>NkIi9!?2yic9DTb;Nktx5xhZX5|V%W1s6^T|ZiigI&+ zG@bNOt)7$nCo@aVkA6a)-)=Ly2=9I*5DJOABkZ_jC8JFRmfahcbJ6|dy9cZ$KrV>1LvlC(~5 z(u>I?zI_MtE_dQCMpa;R-DYZa%F-GR+sobh!D8li`>=R7DZBqV<9wOBUQZ99`0~9J zYY*~8&YGO??iNEM!&*zsj1YmF>AJIxWxXd?nlsZd$9^*tv-A+&A77W|78Q>F*dP^f zG~n-P0Y>WYW8rnE9|nUrgN}taf){INsJuRr>I{_UwwMMpP|0_Tjw8t&en^ zkU&^J(yQ$k9nNi>wB4~RTBbG<`!>DPw0W857FV})iFWsv3TYUf38#`(8iHH6zx(~P ze7^tQ%uq|l3NF5yQ@S*#4t!z|%*DWsr&L<1ozbSWVLP7$cphYoB#A|y$f>#fLcAy^ z>%Ee7TE~k-z1$uF1H-qMc1#@};dCYuwTw@0)gP5IidpZOMEqU(kEWmNPj=plE`32szS$G;_`xE2Zy$ zLThcJIFOVyyGOW z)kU~mN|$1B@es=kIUiZqa~XV`*U~Cz^Wh!4`Rk)p-WK4fKrh@n7Apoi-lb8q*At<4 zfe$}xGZku%>uBqz>!g@Of9q|h_PzAyRq?!#&0ow?jHUS+&PL^3362UFS`QYY*yEFp zTHU)Q>0>LB{{>425DColPOx+tAC53LN# zQj+nay~Mpki1*H7#r2f)n%2tZA*L4W?W1Q4TRMcEU6EP{)0I3UWE^#2d%QH!x8me5 zp7rL^apWiFh_qCD4{TP(6|zReK_51B!{tXsEyr z^m-hZw23Z>R+X{3d-wZrx4aiFsk_fkt_(6hVA#YOr9!qDzY?7Zm$IyR8DHSh_yU6p zdqIEp`0ha;t4S4exUP#aE-Pp;pPm`!~

yts`{$#>Jd2I#rJ7 zVyzfFiR)`FB9?q+Urat~HbB~Ef4700SI?|f;nHN1p#4D^@vVqL{`Vj7v|CL)3zt-M z>c0E3e+aml_##+us*9&@;>g4Hr3Q1y=8J>AD)`6aFKQwtzDAc!O@01GyWi0^f{F~O zVH??XsJcO%E}J^oWGrViY|RhH)J-Ygzft8ef4@kbAu#mYB-~$7vFG4U8M5ni^!eQv z`6udb!Cl6CTd_6#7Qk6t9rA z`L_8i>BhNR&oA!w=YIdKSAFg+{cRD&7gUB#4f3pG)EFFiSz_FC+kEwHB#!PT>qC8R zQXBW?=|1K*))e%Q2cL7eX~C|E zz54BQ4p*2PFfG>U5>{3;ykc&zjD;?o;g@kS3%IgDMYH9aqynzK|Z9T|S9CBmT!|G6dfK%^(1Ue|c&& zY8^tunGo2(Zg_T?`-jTTXhg74_O6FN?mvv~wGgbgJ5!z{l(uA=m@o9wOgSV?CZp=v zVCMnsR7$yYz{$IJY7j*Ub&+lnr|LZ~+})6_xIuleH{V2kt6)!muGy~AXLQQvDty_W zyW9>3kXKb*K&>a0OTM7@*)+l^O8q&~k9_-lYnz=XW)?|}k z+520YEWiBNkg08+^0$0|DE^htYOD?=00*Jw6hQo$GcYmHu-L6ZJm6#zXy)V_009e9 z09oC`i3BPm^qqd@(Ks9o0oBnU2Dp$s{mFhLI0|wdrf`H=pfi~g2?X^4Jcpr^4~VBx zTif(%eYMpX8fg>~je<*K!Qll;794P+(7@-|gWf?G22Ny>CneBE28o355BU2h2L;>! z=8Qr74~VgWOce+Y()0x(Gf;HUzfnoQNHBzkr<)rQ#P&eJJhpJ?6?TBXlM9G50^{?C zf_}V-Zsea?lBc@|8IA?Z-o?+y#|aMhc?f?Z$d9(<^OL9_UkzYl!X%1#;=6Kpn#@8g9oaJmjewE!-1Jta*P=xY} zIp|8!AE+lRbc2LM1hEYb4Z4CvnGdMD)da)+oU?zLTFn3kSDQeC5rb+th%3N3R2@=- zU(@yL3Z@M9ixEl-SYS$^KtzarNDuyLA1VbADar&|g6{um1!4+`gi)UTY=8hhGy0Vv zMOf7VNDux@6j=kkfOO)wM9~LGCw?W0uKd&qNFRPr5MPvg&<3O%zY;WZJQ4@y5bz4= z0F^|L19JlD1#oS@r(Zhp>k0`4%sY&7h4Q~8%3QBLhsykuAiaQQ6}tZW+4t{QSPi%; ziY)}@8wio&`}E-`2qu{V`vRwY4GVI{a4@qJnmHT|Z5sj23JxY2Zb0RK z<{)nch7ZWFRR;#fPATIKbZIr`0S88cLi2=!S)=5hh`W~|a61OQZ(a`SJT^Dj37K@csl7FS#U_c_$_FR!VJ`W>nK6Zwo#R|SO-E>7S~ z2=c(H=>J&^c=>;eVg6?^qznW;oB)23111Z$5aA4A$z1^S`Fs13y*-`bK^Pen zQU)aj_aKw~1LP41pMRdo_>tTtU_3=(|QhQZlL{3erN644C?j0+kD{NT?YaC11YFx=q(&>+kS{ouYo zX(*f=0I+Lmcu0fRl*0oTxRxddEz~vTEqgRa(|ZtsNpa9;V_W#`?Fsh7QAR%OT$B! zeN8zCxvrt%kpQx+rJ*5M{byY~4l=}l(qw_&{zXGW7=2ASNNd;9WC0XdLz6@Ots8O> z^jTXDOZbZyIXq-^q2E7i0VxMT#x*nov?*CjgYf5C8Xj_O(C^>&2(o|48A3U0c>$XC z*Ic5I7|8Jd(H;tk1JHIYjR1C7YiKCcU-AQz`inml3JDF1nfnP&(Q*g81E)Xxtx k&=hy4>+1$w+v*MQ3jt&&5}9HaQFwrKVWOg%Mq04{1MuiMSpWb4 literal 0 HcmV?d00001 diff --git a/benchmarks/python/blas/results/unknown_float32_gemv.pdf b/benchmarks/python/blas/results/unknown_float32_gemv.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e9a134209de4cb289d7cb7616a2c75d472f40eed GIT binary patch literal 17563 zcmbun2|Sh0_Xl3ta_wu8<=T_w?w4!J9wBS8mhD=~?vk`wB0_c|C8EeuN<^|`--)PD ziK38_B8mSzSA9NxbbtS^*Z+ULre~h>%rkT5%$alEGtZpFj5M`nP_kH?jZz3F_VNbGm_i}+FjO<;AesDA>U;tPlf9aD=W0U_t%YRpZxH7jF9pJE*@q0PziR?@I*St+S6XA^Lgw`#OUD zK>VA)5e7sjH+yxjK+qx*{NYh(xEuiuw}l}zK&QZ)esIEiNi|PTFK~x~iv1}F=<#pK z=@UI&NUm^D3Au7@H+RriI6~VU2tbqQ=;cI&SoS0N674-;ftgKK!xwkSv7es1J=L;P z$2IX_Q%6nHNfhhMw#!bzOm@4}kct6`_ggH6OOJdTU%ZpY9jz==j-79+(|n>Et<)r5 z_+g>1p?0KtZFc$H`Iflmy=(GHMOh`Um_1kc&my!HS1;DGT;C-)KG$UT`TbMY%$Dhn z_c-Uc;{j{S(?}&jfiuy1uiji}X;7^cAq>hWS1DH_F7 zl_85sOQWI)RYp;RZ3g<5@gYsr37s|R2eV~)YO?|Z&*|)CTx&WzyA<2xI>XfRI$ScO zJhr(>)U)~wPN|AlV6`i@`{JRVoA)0qFMe`bX;;~q6?^Cv)7eM7HG<;e8s*JzrQ2|f zoG*y1*Tu(-9Q{3jAJE6DIQ zW6n*uzu`OZhRys$pdoE^I{~kR9eF!;1=C&GnxHN}Ta)WAEW~qRW_Ot;=ZJ(bF?xS% z0g@0Ed5=Ln=rp|xV(IRvcDqxfu+n?RZ_9DV_tw@{o>^S(>OpvU&NbgOJdK|)3Ft3f z6dw&4@_IneR%k2BDe}^~JiV&#t4^eUM5{eQPL#7+O?aP3Z?e?L?BdW=++JFDqX!(}`TFGE=Et ze*10N`PA~$@qH|>R8IHm%aw4c9+~HHIgioanbf9``njuX>GtPa;`S;D48KwK-m_2MAy9j?EZm83rd9Ttat-JG-8)0nwnoR0 z98G*DjN&=a_S&DNh=bRy#Nyns=#th}teIz9T*KnHn0-pD>WLjYQ+BdZi4-{G&)dV& z65508XAvpKl7rXz_@5v6JYyv)htokUxCs|M_*&_H3aj)*?kUfwS)M?jFCUb@ zeNQ^QifCJ#HqJbYU&X4_PY+!`iB~y|T=hD&I{y_h*E)Ug(HAOy9n=yLk@a1&k_ZJHNTMJ{_^D$PHwPyJjjJt9N9f&fw^*UBu zJ=1R{S$u1_B8s_s$%!_Tb|5NU*`A|oF=j5u4sp!&p7OD|lHp8bOK98Uk0)C=7WS^L zCcX4jhxTWxQZ=R9GokL6znzN7g=-%QXQ?)l!^%6ICW&`#-ChUpm_J_4X`} zUl-4CkKz^do+?`UfeJU;>7%~gC~W$ToMPcZ$4tq#?dP}BE4(m|c6~5BZ_>8PX=Ea6 zL`<&d(aN&6yfLOiKd_I)kci-Nj!6{}`cfXMlf7#jj>lDK4v;c?zl^m$q+y|1jMDV- z4!Slb%i%1akhgX@)EJs%LU~*b_`7K^OjMp%cCxiSsEtkXp_zrsMWiEN1w8YK+ufOp z;hH_B^jl=uG|ZX`zKdsmRb2Y8y7*EvBjd346?%Ls<7-nx4#InkZea8|X$E1s!03JY z;@3i=`1n2yq`xSd5B^kdtv_uo40~lKyt_-y;T@~r%#Cxa7j$0FFSHO3BMX+elk5z1 z%&74Sul>Rui0smh6yg=IWFF z7BFMXWS|wN$JUf(q-hxBSA2qC>3#Lpn**U7Q6qLqe%xXqrGxpj{g{GBHQJKt2h`gJ zr=&g3xoWta55S$ji}<`v^1=s{n1PGlVnOnPR>G+1{=!yKn7V4hyoamS+1GdA@9<^X z21C8x)v^9N=QUA}Ud9MaBT#RwvX(zsWLce?C(L@DJbk?=uIG+e$#KJCOxCRup&|*U zM^fW?5r$P&XGS`v`!(;Y9uqtC_?Vf0Tk8++vOCPFj99M|?YBOKcJB}*-Q0G++We0B zGg!%!Lp|nNcXMv|m%}s}Jks4Wl2412(LD64Swddfc7MS9&fp7JiQ4q@56eX*pKlrs zPwi%5+g&MhcbiD)9heuxllk32YfXrp2L~_h_buF=Qf;nwRjle@OqQ0v;ME~EHL14l zuxHA}ig3CgNyr_%g~C4fe?G69e3o_e08Ogx?tM0K%`1(o6%!AJa!HR2agVm0?62sa zr2pc@lXW3<+euFD@=1C=yse>udbTZh?Bt1UQ3f5R3B9gyjgLPmg*4rGRbaWhtRu%? z(a_H4)^kg~+k91rU%WSWA+=u0=WrWy>6(ftjELwHcMoEY>PhLRV|JA_C~vJO9=-Ki zg~5gD0}Y*8VOmhflj92f5}5(H34`7TKhZL2_Z;iyL0uV&HUts>{t*v%ZRV7~W&7gBQMs zMjkz)FwztBE&alZe~te@Df`qK51~A^ZO47Scpa;`=Ne|s(tO?buljRTz7#N@JdrnM zb2!k-@5N!u7I}q_5oqnOrY}9Q!ac8)`UAeD_UFcRT-J+qaW&?+A{95p(tPhR@3`gI z$sd=tS4!n%&vSkLf}(bEn>Z|a+UbIm5duC)nxISzq{*v ztyeZ|(5I#XUNU(1jHprlsz^w)C%82)zbl^ID;wQ>47#~`S2(**AS%4?l$Tl&Be4ro zsN#-O+3*taFZ8}kq-Y*VaZTxg$Ksd!WRhayiSStYlQng=5ve$WnzrnSLbw;JPp?>D zQc|Pq$;YP}jsyfoF95G^h|kMv30FT zm;1QSZMk;6?`MJ>A3d--wlqHO#XCYIw0jVAn>6a?a+h1aRqXnF*kD4tR;tBv>}jU? zym{FECMgSs6HJ1f|Rc6+&bN7@6n5$Xqz2b~SJIV2q&N$g~7q>wskAXirNo@|e3*l0`H;B~YiVh(hVHZBbkFG$!oY z>mk+=xAKhWqC1hgc+F$JD(3@j!DYB#n@<~;L+bnkc2RB_qWGZ8t+06?rlVXnpExia z{X)8p?7k*(!bNen!-l@{jU17CzrxESNOCs{(n~D7X4X*STgI-K`oT7AChW%LCm%5V zB{RW!pCgI}au1y4|5_^|{W$lfjuxE|cWr+M|6S=^zJcM&GgM=?v!ND-#Y!`*ZZFM) z^td|TbyDl|cqkSH-?wypY)1cyyGv-mDzbBjLUH8xiIKi~?7&|6DfyS}mcma{pVP2FALcjY!6aKe$%vwxkv{ zoiMzh#xlEhTa2G`ww_f)&*sg?MosR+@%z zcP(9ZE6$7eQ4_{C<-KyLZkY^R+md2vBThkn6jqyf)c+PA28TO@G`dr^h5g12ZP+{Q zRG4IhjjBpyN(a%^joJBauqAi1Dy)`+0&kcV z1myo_Liu6`Qyp@<#+-ir_A%3KKgOEv2J}p&&?=!{Y7{Rl~-t~guk8b zVJa?lZXTzJVZ|NM`<=OKGsFuxo{0Lq)XY{i*nfqktbh1*_$1ZW_xv*yIQ(-r;P;|j zZZj;#>#;Y$88sF&)%R^#3;t5p|2}l&taV2^DU8#x1EG0{G?Q|^r8zlv(bduLoQ%y| z67K<1p1A0T&bHS&?9u&R)fts_X7&kBXL6QzR%d@Haz20iAO>|#E}fm%ZqIRFmakuL z%Fhlb#=9S~@4dpCdk@)}u%gUr>euTRnXn>q#P^teZ~D6(J7fxZii!GtGxoRKWf(ui zsvU3tIuKOy1<5w|h3b`>iW7w{LwgPz1_h7*--Cj|V9k(5y|``c&Jp7O(F_`nj;;4P z^*e*59ZgL|Q@6(B(YUaFR-S8Hy=gVi(jMa7VsebLjshppPS6HUHlrRgv89lDP@goc z-~O5|W-%&m7I-mdP?e%BJ)O(ihNQo`zJd42i zN_=!}emBtiCVus0EVW)-iL20;*S9?fW$mg>6|@|j4)_+iMISt{XvEoH+3<*w^{cy< zhNNG@(+@`)x*3k8x^oBaU#1bbJ}R!qZ2o*W|G-?a((~R$L8~E>AO+b&8-E*QkK2r% z;_<+OA+hK~y*M=fAQFu=Lds!`bx;$bD8G8?JMPRoY=q9yC-#ICc8;~b-cPmVm;%dr z3hhHXjT_pR!)#)nl!FG1?jTKjaag<=@-EgKG;ai&-v$e@sI){3o|7XrGqH2W3?unRmX{YuNu`fK?L)~s#{_1@wyGP-&-;W{& zj~_2M4g{S1p!=31ax%53jyTSh)MP=cY+R{_IvxMPJzFBXdUsZsyUNi>&9vPvucPYg zclR>SRL;7*IVj|6J9^W@F<*OsOMfksfkEQ-@0#>3<=Btht7m&2)eQY$IUr=lMnPZ{ z9P*~r>6bcFCunQa!fvRVUl7`%Q`g(7DU`+_y4?XdWLj0;9C^AD@$*ttowRP@`E;f5 z53w?_*?ZWeBi$kEd$V=N?Y|kqP;kz8`TsqD7Bx&XbK7m7lKNGXpUe57iofe8V^F8T?GMzZ#$79$zzg@%`a~+AOTf!GVU@l5t z-E%T4kXw4T6>lw(s5}85Gck#3%gah`-|Nwy3{!CKqn2i`eC@r3R6x7-9;kl&-)FB)0gsd9$)!Z&z9XMyshGJiM7&=Q?k}p=_eu| zM#qY&l#ir+6e_nR$(G9V@0({pU)iIX-*u}mdr3VtoG$*rr)-ADrwOa$h~WC6J4_G8 zl+kK;MBhoOH+vfMWU#BT*-s{L<>wk@Sw4Pc{Be2_O>@*MCXd09Z-KsyM%_cFaoY#g zEt)gQX2tOrT|_x*xkgVVw%wG-FA8eyX>^_ zeXoZt{n<+RD<5=|LIhI=z4gnc&$NCuTqO-#b@^BOZqHGDYoPY&A#;g#X=5W=%E+-i&H(OuaoT+u&kDD;4ig-)dIu&11r47Q&bBJ7~sUaQ9PwL_MfK*F+d>;CSpl zcV#g9E(WtlDD*{D#L;QhfO|DxyvwupVy1FDD2youe1yVm!UY<{sWDTbwo}cW2+vZ5 zs|NO%A7mL`JEHe2pr1jYE!f?dHbm@v;Q3u+eZwrTwD4Rmg3cv!F7pkas1aSoZh;8wX7<1IH~%kM!Lr2+E6=l zIDwk}c4AWMB?|@?)Hy+E<)s86Ke?Z~&g49QXc?5)_I&Nqq_C;+l1aX-gSu_xuAXeR zPs-WH*ey#ReI`g>Pxq^#6AXVqjmh5Xcv=@-ng1?Dh~ogGKOeJI?&Al?92(wO^(n=U zc>d6KCW@b9igQvQ*%=%OpWb$Nr*twN&w;k=rkZ7HW3sXvD%fQl9V(SPNKeW!Z@ks{ z`rf<2067hr`PccUKQFtQr`d(~>3y{bJQA*QsUSVzW1@P{n1P~jrvNM$ttcaP|77C_ z&yeN49VVZCwk|icoJ~Bv8?D* zc9YB>&(ScHpU@VhNfAHHWkI*!al|N2_0gEGOxp+zi?_tt7rkbgsr|>@5_Atrj^whR zSYvh(kIz5R=Xhn2!SRQP5(O?PKqm}#6RS@xV2m_T+lOmSy+d@y(&E3A-GWfF^=191 zU9aiw+ygsIZ#cEzsT30t9ZEJlEv!*=V+HFgDtu!}m!2jdC9dU|UQz5!LOy5Ki_hbw z3)(lHb6I`;*q7IpW8Br=N@!)xzRFc~_v8Tc^G`uGI#qMYT|#^vYepf`#8F?En3{F( z@fo&PEc_**Qla*Dj>(42643)|%jdam=1wmoo@?<8R=UWDQ3-I)Aw>q>S^04t=J1crBsbARaaV{2GqYq zwdr_szoWhQ$Xlr?-CIw-d7J8mM{jlpeG+^24soa?GUF4yf0YxW`tj8l?`4uA6*3s6v-@~HHZP+ctDjoHC8nqt6QT|cy29BJas6!o@0Yh5N7X8 z94}NAIi+y7C8@j|^OJN~a7Jj3V{Ol#Nl*K&vbR2}em4>PJP`NX_(Mi^+#ac80!LpQ zI%mtae^^EKCi=kWBPy%0M=6EMFXr|N1Srf?=qv@CDz^!kia}REC?30&J%SsocAaU- z`ye$1QI+blJ9HQu+qd3h3@#JP42^UNz6AGRZGv-amZnGXX!2)nP3VAD6=K8OZ4~&S z00ZSWv8qt+B(Mh~x6|}qHC+)o*DAc|!xtU6W{&Km6(|h4CL|wLRbM=2*Iq^9m@^%J z+MLNwpC7p2Ahn7iXrQcSpvLv}!|8W=PowsB_3XI9@{pfr^l#R%6ae97cHAh6DwWk- zc89(+UFWvGRBxE@)%q0LEtM`z5)_)Hg#B=v0HRt6dcM@~?V5{MLq&pDCX|x)6H%3G zc&2VAd_Y?M&vW}lZm>&Fa-Kk*Zkz9NDt~{RV6M1}hEr?zrM81prP~S>O&imXMID`X zBu+f&SEOy))on{CaPqkx<8S7_$6LQA<*V`N_W;Y})BBi-uxCYneD1H?JsNgOXkRY= z$h;T@Hx=I?pv$1+JH~ITOA&K6ZcC@cw$_Ypug5;c`?+ zTRZ>ut`DXo-|>n$RaI@Pxh*3_LuaIB4%)j}>UGdpTzTlMNJkc)LF>{Kdd;sx~oj2ic$4j zRHILZ$;Is2W8Qf^h_@1^&nSq90^Hh+tC+`WF%25DR~e-vY<)r&QjrWc`;0DLnYnKNwB`eK?AIG6U*=enug(;`iJ0EH zO`X@70&5g-3<|fIeOE&Qa|4ob_$TZPJlNl&{AH5xRQ&4M;;C!=)NI!RTkl?Xq+v#d z=(g}hrIiV;U}IIK!!ksI)SHmf-gw=R^J?b0hg7UH^@Z-!9M)MHthq?Wu81L0s>?A|0a=_hlW_o8C&pvGnBo*^i!c z0jIaxH6h6#f&Tm52ulF~p>Uf3K)ARLCeS^Nxr|T->fWBaN`@D%ohPjw6T3;ncnR4i z@tiv>Ax1;rZswBXn3$Piv5KfjMU^grwZKS@+R#;>5tEs#_d4Z$4N<8gkWf58b0xlY z)Y1d1$?XM1EM5c!nC@KJ;in;1T=cmg|EnNa&Z3YE~Y30*AexL?1#RzNqnFX#O> zEMFD3L3~op{I!XaPfQ|6~y{wio(erJBGYO@1=eGN-hAgO5yP-M;wxdm^{n;w6v%lAt^Kinns_@VOiM9Tq=ghV-$-`}&;gy(fzp>r@=GtZ40eGFGHtt>*e_HJC4fnLsz915xRUoW%Ew`;x z9QGzI6MU>p^8v#HHHR?MLIf`Q0tc%@^=_Q$4)nw9M@)@PGlFEPYIZ|@@i^qAMBVTqPO&zkw z?$8Xpb0CI(d^?W2{Wx;gILWrb`(tX?@$G%CtGl?{UpsOo2JgFB%Fg5_Ul&<-^{{-? z;V&En=DJA7D;d&^j;`W2gDytZ^`90|Cfen86L48lYzWcT^^SD;SOvlEQ2TNyX_gQwsr5nwfL4u0?Rt2mL@lhA$S6;H2o8H8XoL9Jog-~ldw|DRPdaW%7N-#m)@jN0dG`(jUj+i7M2@0;b*^^S=$N(c@Jd7;BdI4K=qEVzK#jpCSk zu}f=2EkrEw5}~PzwD4-FxAfZTvlzKrg+1@Btt=g%sO*|eZk9&B53*PweHm;CW5CYk ztV}*-WXBR?6PxLZUl1a4(v8gnD>Rarx=<}bUzo2p@X+sHR^qIU6XW1dZhghflzsMy z!1b6+{L5|ca`t*~(J{Zd*KoKpglqqfV~LAf3-qT#-ml&Me7|OBNRWRvxS7J>P(W$u z&0qx?V_jb=we1gkq!!LO-+bkmCUt&dZD@Nt9W7I|D#_~2wU{ip^r7mv35Bi=uP~_4 zS2X9(NDX;dOsJZ{bsddx*#Vz}76hL8T%zwx8oa~kT*R^xBRMkFG=}1FsTd*-u+G@t zR!&!IE3#=6}s2TY-ZPVdB{^0|MbnP>d5gQF}Eitzpqdov9pe(AVW%U zQf?D=$RJ*qSpzJ57Be63$Uldvn^da3RpmNYTdYCnAF?t5_f}Hs^%0dNIZee}l)B6} zUUv`f_@+;Ot?hwIBT8*FN5pyhcdyWe9pYFoK>+jFIyI+hlj?d(ozf;Dw;VcWRx z=3gw67J4caN8_r>oR28y)J4r~Z@RPfQ1H*HmJq$i-w*dsr0o%Nb-Ba{)H4reV3_ApUZ@? z^7V|(b`H_31{za(`;c5e);)gY(;a@sEO8WCrvL)wHu1#)b03_TG(zIR5ff)^*bVV_ ze;hG!f;j`bZ>W!iINw$OlD0(0@JKVN=Ee^9U0ZjU7Hm)@thWb)5aV2}cEL8CSy zG@LPk8LZG}7P(ecMPm?whB>>tR%^Rr`pgCDZ6eB&h0^CulJbS#nyLh)%Vt)M40RsG zPNtU0_ys>GuSOIn)_P5t@smpV9OQ`pN?%#S^JakO*DFyF5II% zw7j}CYoA%_-|_*X_}8PahZm7!h7juZe#Bon17l-Nb16;YQG0WLQ+rQ8$c_2>ku+TG zeZk8JefwW|G!6$tK<{W0{TzMWyh&cZa1>;0K{Su4zXORJyacrY-1-4~4-f*RzOm}{ z_tn?Keq>NcGzu<*1sf_TIk4%DLIXfx3t9(F7}%41-N>;)vPdL^O2FSgIVeU5FlPip zmOxk#G-p9Pla?ol@__=H{tcx1O@bjb-JG3?ARGybKeC2HpVj@m?HxgA6zHEf6sP1) zbSC}E`ntKelHgci>>Ry3JnZ4nmYtuwy`L)_5k&O$f`bqh&^6L=FBpR4>PrN9XAs#0 z{&fogc|X84bgF_#t_9k5bM$ie@`OVtg4E%1V9B}m{3GD`#?%U}2Fd_kWe{XLzCzJ88gzC<_yFlP@ZfGOz+0;IspL>R&u zjsU@9fL%900?&`Z5$>on}Wb=C!pgdr?`XF+%x2%K|sRP%IkCj!P1rX-?=Ig~N54pbO?+7Gj{Da zjcnls!0ILBxpd)DD1u)-N9Z|J z9s>>_lCKy!sQmw1g8hDu0cD{C)$u#wFesoYQ0L{bAc2Yq@^CqDY7Y-)3Jx?b2PFXE z(F7bgxr>A2frjI7pb{{3<*=ZvJYWC`^3WA}jsXb-2Fii=p*jh`XW-@6Y0#D28H^kZ zx-=I4s?9I4p0`fOU{E`0<@ewIpO38(qKT^z70 z7!XVdJ;Q;kJP;OiB_}jg4o?;WbcNa{4;myuzylB{3~Cbx-XRO+H*?UHtUpjoSm*`` zi3nmF>Kb$fi98-qbL$C)`!#0&RJEP~46av!20aF^;UKO6=g{kr8vGuv-&ZhXu-}Z3 zYrq0i0>w;1>_d9+PyJ9Sh)7l@P!n|jPc0BrNF>{kT@aOCNCf)rt02OvH8Zz9VY z=mn$`ej0jNo8o$uu)K8d~!LngtvTGRVPzr3Pk}OoL1<81nV}ANC$i_GTMw zKKfVFj&L-&Y1l8H0t_R$f^|PdK7Rz7BLXE_r>~nxAZKzZXQ1-Ga*(G2-3MggEgrj)6(Fo^uE4wVv~UgYVGfiag=Kwvx+v!Lh$uf?Or= zq-5&9eir+yH8SOwyCpXZ1C9x}TXJzfI8dVX9B`@N?9zIk?1;&`B%rJ7PMo}J0q#g( z7D1(c`(@lOCV;Dk0~Akw-yi7EuiU>*6j(Fz!vJ9H)^o?Cp(H@S1JQ6|;{eKn%ZvEe4nE$gFQWioR z_5di!gBt&Ea}amGxcQewsLy{9q3-`hgoOILGwVKoJun@i=jH^#DKL2<@d4zf;pGo} zG`T;&;y8de`}buH5Pc1S#@ztjdAoa&+}#}D0T@{nQWhl*cO{X${S*)gkAI%YdilCY z!Vm!FIr%$+Xzo8>^LBEEJ3uhW)9H5=peVTkH!n|3DC~W=rUDv?#)AWO7$i7Rhm^CD z{6i*zzC>pjWKCee@%{P%zlH-d!5RJ!4FZ?Y5AOMkh5}{^bm_k|oIC`SHqsz1-#{Y( zd$@rHj6cw{|CR$h1BLq!4cy>2)`vu5pgI5Fa$qxkBhN@A`mZ*?tC0Em&$}or8Zu)W zXvn|&20#?hiT{?vp~2zH|I$Dk0QGL9$pM_Ofrdpw1`_)HBWENQ1J~%MCK@5gv@v5nljHaCS&2chRj@&CUb<$V=5#mLrEn=WuC_IO0o$%W62196fzp;j$)5{+9=|eY{CtCh3@y97#ktX1&40kwhZ;d&9BNPnfK^o1?S4w<{d?}qO-fBx=%3RhyZ_aNEBQSkAmC7WHrE`z>@)R{5qwYx3>@Yg@nj{RRnbVr{)Za z-mWA!IOv4jxwg9}7%N;>+Y=~2lSuG!CPE?yko<{`Ua;WY7Ml_8opNl)XRp20*`mAq z#_8eXaZ5t*$F#VLP$`EU+Ii{e&O{y$#m`+QR`@>~%Egs&)w*i?Gqva7T~aRxi$8kz zYW}|6$<@X8U)$m)zh5{=ntVO-u&sCD{`h3k@OPgt%MZ@3Zr&qzuC?m$Il0KOs;1c_ zh6@X;OS1M$w-wizjrJ6@Xbo5F&m}FCS2?E*4;04mTczq-I#mgC<(ea{c=O)5$Ij5_ zkAm5xOOXf{jqcWUJvY}YVz5}e8E2a9_hJU`^!>qi?vo!@Z4*|d#CIs+yI7k8O`ckL zM0~jRJg`r;cK&|$^S7&W(+GjgwV4B-`xdO%YVNPx`}Tpjq#<|xU5cfs2)9v>dc7By z;O-rP&IvD$2ycB55*F)N5>CwNITbo$$dvrRxy(2#?RxTQ`y#fKg0~mr&@!733ckAQ zuTl_YbV8W>etT35XVGwxNkDS&*)2R5i~ zg4@xtU6~;4n&T&={Cex-0H+qN663Px&(~Jp^-O*!F`nI1SAF+htw5nJef;*0{BOC; zx-CaP&OMbBK)l1Bqv3Q9ZWr;Udt5M{MDpb=ljGbEz~nG}BJIg3`VU$bS9mKkI3+_gnq)og%8B$S6HHWZq%QZUs}E z9dA*f``+^*&*ad>BVE%`77uGk^Co>EF|PZyv-4wWlG|XDEbRJR_D3pD?%hdqm8G#~ zv9`}g#j*B2W^AFi|HG=t>T1#0L8fKP;P3BC*AA|I{`|e~Le|6)`L#0vdU6lXWsTWC ze8QR`DBmfzV@o%0HWxR+UjxIQ#x1iO7Qbh6?~8sJ!Aqx<+Jx0Z#wM=r5cJIRJEYz2 zuEkq18^T*lU*jSFAS+Z-?eOLSzDLbD&exloo2K-;{L&cDwol0D_zE4DRAoK8y~wcP z`*Lk3abdn9p8w2&18c`tz7p6BPK_GN+Z}y#r|RX2>?!4K-J_eWrd!OqyI^gJe!k^g zf%C23+s4*J4iBff%l-V4+{U`+c|B3;j#!dzn++r{ZC)w911bY(ANF zx$G)O)2hY9v3n0Vwmj}F%xT^qquJV1WUpXfYHg==Cw0&e&KPV|r%mgnNvg4`9C+Q> zcNWhbb}dPbHg2BA@$glvCh?xA7u!-Mgg)W8Gd)mKlE-VT0vaT`b`Bbr>K%)U?@}Dj z9}3x3;o=9oTZ!~eJ9Pi~t@NCBCe|aY23h_!8om;vP27nzg16>PXhuRqXaa?CbsDC} z^YkSQO%&fq^oW*cGaQRz6x}Hzpg)on^VTHaBJ0bsSK6IXjxCxmBn|5tcuw~U^=XZ$ z& z)~JqQ@{#d`K>_P8(vk+eCd|Rrl*b$EdyE zBZo{_&RiG0IM=gD&!rJ3c+1?Rc9(K-(JFEIMyloFyI?D2e3t~T(7_Hd4kH3wUF}Fh zZG^Shl{}O@?p$Q_rl>P)#YtwHr4@9Tv5Z0u`V11RVI~&d!A1h?dRkGD5;-%c*z)=< zHggu}=ZRS1hcMCx^Ith`9J*8ac`B~le?|rN`2?f=FrSQJ#R}^K{m9+7(ql|Jce747 zzSq}W4xD(nz#EOf|47vKd81+b#4*9Rf)uQwo%KP^V=+r2tsD-|qiP-ZACsLrkziZC zuN-S6i>%vZ+cX?K9Q|qdnT(oRs5A_H*@ODjzOx1?+uTyMh<;x1iuXQ@Y;0c1Sx0Pl zSe5eHrbMZpl{#eZK^<3ihO5)ZLh6m*lvJ9^ZccSCD?^8oa^1*+Uae@$14r!pog9=1 zyk1Y=g~mpNj!4BDoEGP${@zJ1c`=sm^~1=lk66ex!ts;kd5-LD8Qa}$b(>0TEN%vA zU#yqJ@Eg4bsUQKpHLIZv#LnT$|`29m#_n5`9uFL(Y* zyd7;+wTX&g;3KtU!J)kEP5Vt9b-QKoO*<|#f+We8`6GYA;ur6-GA5V*S=Z`N>gDo}IFl5K3p_7+_y z8Ll^TdZVQ)Cw3M3^-LYNc&yAB5;~eH&C=P2O`RXKw(&eA)IGb}m3%j`bGt~0ZoJjr z3l}^r@rfX!TbMr|iSZP!*FPG|&&+slUxowo-CYBnxq67h@RC+M@O00Gr_*^YBPb+g&bapdXLT*5|zBy(XjOv8!7* zd)DejeURi1c{(hk8@Ast$vK+)Sm|A?^kU^pldYY65^ddesIY6>kSZsKOjK1|A7A9V zI-bKM?{nKl=p(0Sl%rnu7OqSI)u%I{Z2~ zoigU`uNBS7*r z2{A}1$+Kv>=YNw;G2^{`#8gDl*$3~@gXL49mp(<84i)Y_$+vi4SgNt`xvmzyAlLoD zF1{O5g*%5vs!vdl+fRpE8J8(dZF7Ha8DhZM{id77kjG20H1w7=q0xfzBUg{$kWEZC zw?bLW^2D=&2F$)Ayh2T8Q4!P17qND4dD#q_mi1a`-rWEAwY1&qNjPSwj41&t;RCFc1?tjW4d9Ruz}snnPyF{1Gi0v z)la=@(mud=Hf%QHQSKMsikEwyY8R&Ov9N`+BTaXaOyzi&bmi(`11Ac@_g`JInrA&M z*<-5^RB?Z);pUrUi4t|CiN`yi=-Lcx4t!F+`8+^_)Cmt$krSlXbU$+-s__2jra&)D z?JZoT*wqM~qR2VDO5U77ahG<_;x;b{OB77*YW(SM7TPu}bGTkqXzHs-l#Ovvsc6B< zkXU9CH@jb>jG`i!6Z@7Or)~Ah(RQjTG3i}IH+N>2*P+&2t*UgoJh_gW z3LA0)976o>pzb`_)Wx~2`)?so(9Z6URe?wRw-Cx_y0+FM7&K-L8yLpTH-EF($I`Jn z9DOXLfyK7{4kH(nz{RIg>C`^q6=H!7_6Mjr)wy__Cq@*x<*Ms!S__@OIruL8*-6{3 zY*GXVp-WbCKWQrcR9kCW!jc=oI8xf~HHmkxIZt9-t&4qLmm_M>r!J?u-oi2Y;Z(ub z?REL{r7owg?L#9Y<+9m$9V8C>vn(!NmY*I;N%Gw9*nf_fA$-0Zs`)9GNR$Yl;c%TX{PrHYKJ=)heFEd5v;TG)MIKY&J>0WZKeO16ddlq zCk2hhSRhRLv76akqD6nR3|jUss>*J|?ocU$x%sXPsw5l=8!@0htHQQR4=N7xeK~9*t6qqprVD$owKc^s=C&ERFX&}r zbOwf-T?a-S3=btcziq?6OuE6QU2Ql1@Ii58I6RK#-q!sK*(V8ox#^cj_q2c8F-So| z=!oSH3DJlRcp>E!18?BgHI`JOWVY3ReR6)jaCc4C^OGZDBJWP`e8Q!5D5J*E+MGWL zlNx#Q-ACVwb90KGeJ(C9Yzi`CU#Cs>b2`lw9DCBOZEr2s@;=f<3M zPR@J%OWou4?OihA7_4r(%d~CLQ%gfUAo=0@gH63#JTg4Fg7v@B@?RVkHDI=UGE%&E zwoK_s|B`^sFiC)d?xFLnKXi}Xh@Ikaz=I($sQvv|6mA~^g)u?Mp-pv>6XD2!2C3_w z%-nW@k&G#QVI|$;oiFsMH+d+qoT9)VIzan_zZ`l4>*O|IXmTB4-jBuLED$#^mVmtp zV80m_W>sy6sMv|<>Sk*O!A!&HT@H^rG1&_K%Kwf7-DG`t*SbLbv0UjC1fqa3aQF=i zgaP`11i~Rqe+cxCJ`DLHiF8ied2-+t-Jz{vBI8U+Jb2&1b=Us&3YQiIVJO9CH0r;l z!TuVbAI;Q>wd9@pyf$ps$-$s1a{G>m6G3>hEb_yQNuKxR1q>Q6<8p~ej zafD@#9|#KX+R;kf!;BWF%e3q*Y)xfq*TN|)v7#P}Y*Rqg(gi+tc^Z39aJ*93tVcUz zaF}i3fdoyb`R%cH;cQ+dUjx3CZt?o|lznf|(f4|<*<&U%O6!TQI8$4!=#)*X4Uor^ z-h1YY<=5@L5aFqEC`L1Lx9f}8hKAk!%v06Vt}piqy4jCj_97H(>$eTw-)dx(!mzB# z=z1G7!?k*{@9w?fZ!CKSEm$cij6y)(kU9M@XPRVfZ8}(ys^w`xZr%F+c1^*|ExQ<; zKtQHbn>FBd%}N!?B79#u?V0e_z$osciVDiz}idEd+J+}HjV2u&e4iQ$Rc=w1D+VLD96 zv}>?(bh+05R1sq^0S zHWGjLm)x7Z5qQp;rB|^ydZUD=Z>N^n-6dqrYWZx=od{%0AKgB=oh{S#B+r=h+A{s> zDH|aQ8l(`_(Rl2CPp7h(fp?24!^k9uUEYI_XZwzr?HjH8=HsDZc15Bt7TJn8-kDVW z+Q6R8FRG*JK)J0_(J>iYo9rVowQ&g|Dz~3y&IsPNC&^UE^XbfOL7kJ(Ebh5FkpD$J zBZ@w0@5lTtjmPn;uVh0ThOckEGp>wMyT0p+TFgY0Fk`bDo1MY4JnHNVF1HFmq{h{rf8Ww^+p}vsZ3yQ7R!2s`&pZQAX#fWtvYdK%_onBIyK>8zJ!Tq?aNNfFXsM~4n)kbZ-8Rf+522e@ zLT7I|X(pWZ3{bvH^T=?v1^=jtz0q^_+@t&(Xtac2_|ux`LmyOwD(}tv-oCI0{kFi1 z!kkjTM@aMrLZDHi8Z$MLfqM2x)CEpsn(OLc3_Rtvkhs`&(OQqp(>?rGA z>glZ=TWuVMuk0|53ztlL#7HWzY`)t4qVmn7AUO@`xfjL9KYewx%yftvFj%w-J{YBv zbtOA!CPh7D+(=QVn;%w)Qk0gwb+q}tci7iGU1lG@cPxG3yT;P9VwX!ngA_m!egnr0 zS_uuT^>7B7H48OH80@LBXetQ03ENP|9447(B5AkWKB6r^n=X2Q(~4f7@XRDp_3pU8 zbjLGV7GJTGPx~!$GX@X4C+qDKe^$tLWR2NXG^zN=0O8!y7Q#1SB??kffKF)426mrX zz!_;G8Ak4#`-bU`XC}?xyb7Ua+p~rbdtNX)cm{Ww7ddxcuND#BHJoOATu7s|Xa(cH zOQ`6J9wTi~dSaW0L21HNaxuq+r=MO`ENB-!;j~$t8MxF_VA|8!j&I+Vf1b1E#?c|> zCm%!XbZchQdIWcNt(k;L5l8)DB5Jn%ho@M_Soq4rCBq%Bd&opg6H!BJUr%w_%^qKJ zYLf}&DxJL7z`UvWGW$*1cAlp*IB~a4`)cC!8ZXX;cn1}DUflO?dFLiHGs3?kZwRyS_>-*}ewyt(#R(-+j3~iuIg0^_-|cAr0R8%#wuQSE1aEXA4Cz(Dt7wX^tRb<_n!kG3n#;ym9IS7YQL`mv=;pnldu}gkKfEz`7`vyZ zkNX@;Eg#S5-=bkC0K$!&xKUJ9#x_eCZbK>h?#%-kzA&Nl4e4~7s$I8=QD936`(ZZ# zM75F){Au6}noH-yg+o^+lv4GH$m%uR)?Q~^P-gM>NPXcVHmOODBZ%W2b3M+t-yOzV zD(;}=(Au5VvF~lg<`PBo=4_AHLmvpli93UebS*o2?eSNf{Vv7_S_De?8uq0xnvO09 zS*IP>VJ5;Jmj>+ge9`IEv|UX5Y}pL+QY_q9%v`5s8vf)Xy;#EW^b$$mbDp0E3=qsT z-__Z(CgM~-Fom`Dm@iDQO}%;*G#TdLbA(m*$+q@n!R>tELFeq&UOrmByXF_I*msMk zeCO=T!l+73FUa0XDtJrggU^pLR`=23W_if)@I`WD?=?T}d;VK}?I=TYC+Rcd~5 zm#-95NdW{RaT~BMtt3S@YC$HAqkHp(zX(&S^tqtKaZ5gg$DU^OgI4f}TK4dvr{S@0 z!%y878Ks>qqj4pyyu*28*(PWLLoEwmxuuyMuWoTqCw+3m1xRiwW9M^W4oSPnlebNQ z^Z9hagn!XY@3MyP;=#rHPNVps0gP_(w7 zyZl*bfJ@6v0-K#8E*qF8Vr~9h;aEm9^O5M+-p)7?2`$;w>mp12uusQh-*7CAKFp}p zz+}4NPi#R;T;0JpXn@P{m0+*Qd)9>hy48zBZF!2n!WQ0N=rDX6?%&3;ov9(4Jx+*8 z--vFl+&}Tcfw&ziO`kM4l2e%9VArg^&joC~W}YaP?Co>!<$>HQ%+#zqQ*!Er?X4mg+!U^1QvDp;?3ZJ9CjY*N@1;K3*J9?6DX547+}en*(TcBy z&et@L7CF$I5dExTn%4OI{?=_G-gm@CmF@T#v&Fs9O&WCPnWUoa{lXS95L@hYOwOE} zy6E`u-g}ya#Uiu$S(dc(Q>8DXKTvH}=XIeV8U-AK#BSu=)fms*gkT!^4m$x44Ya!b zJXPpz((1{ww|RUtta-uhH!c!rnUP_7Z98K#Zwjnn5>%xka)d+FTM$ydIK8k_YL%Y)^UGv2d7$Eh4z5agFYzrQ!aPyj$k>;?c3HnD3f7@o## zPWTp@{=OSZ#;5a6k=8s!F4Hn)Av(mKa784?YZy9AWf8_jER4%kb_rM2=;60rF_EJ& zb~9u`=N1~gNPl;as8khrE9~eNg}qG7<@cAMc6^^xPV(Qe&+qd2ZtU5(Dzu)< zhkLpe*^Ni+cP0jdYg1t}wE%b+7T>d;8HJ z1o51VSDbA2H@I_TJ28&19W*yJ&k5mK{WdrAN%6!gl~lm701DGW0Y#y~+ou0~-HHVp z3|g?k=o7!Adg-Ha@BG!PareM!2m4)4LxjUCaarpJ2KRRWiLQ0%LJA#6WG6~~Uc2)#`ahrZDBUqZu8 zCRJN&ols`f;kzCOxcf20`isV%EUY^}CRSFMzo2ZL-8GhKaH*fqXzyF9md+NvJbS(6 zyzDG}&6La!$1BtqqU%$XRCJIXp;Ne6uL4@pN&ZVB-OU^d*V{JNx#T?D_tv42|CD(A z{w$i~3(nNH2~7H--bDqhO^2TIW+x~|S%g)G=PS#2P@m%z5ahZ2sp?9`87*rS zix4wQmW~&vN?W@GpXEt?4A+y07Bq>=;!Tz&dRK*x;I`hlk#2tJ%=Ng7h{Vq~JvsRg z7T5{}rtFDmmXLk+s)n!q(xckv1#$OI7E&P@DU5~!Rzd@~zk%g~_Xb$$!*|$mc&PWt z>=U?d^2+_KSDtWC`;pAj6m2Ymx=uw|+?paS-%NkXCO2`I7}d~L$Zy3doUd7rYH5DUo6$RS-Np7q zhI@TtaAdCJBk49PTC*La^3$i6@{dU0po_ViopBn*XBo?NLhVzYPQvle+byi%1JgV8 zeB)zHl0$>Sp6W8;k4goZ3M`;^BiY|R-J$hNEleaO3*S;hS{VD%Uy--^I9~3)g2X#p z8*9P?l^xS*tx~9WAyx~d`A2OLTQIW)E0YhI*f7L|lvet(r}*fCY*WkNDvi{wJ;*k} zdFJy?JdFBZl{oGvim>ygwU04x%|CgN|6+VD?)m071$(?W>6u?vHXW!Acwo;fJ3MdV=5v(9>s^?Fw#!%ZQxe)1cd5n-Ld1_*9n4y!N zZfl$>$>wEV`~|qw{<_!6C2mb)Xk_>p?Wq%z!(LVssuplPf(bT1=u^l7|6{)_#_rTd z*O^>OSytl3pS^7vNAkE<4HJj9P1#>l&Q|LvwQCk(=Qe!kDD29-CmSVmGeB7mKB;Y$ z>6qkg$%|$?gNf1=jJkTZu$M-SExuek0q4*0?UHOFDSBT*F>)$E+UsDSk-SI$lvU~6 zL>j-npNtr{Z0XL0MI1G^se9?1s&4&vZwT#29v#9gRhSEbj&qcIJBG{_EyC` zd^uJZ^XgmtwaLll73zZywlNfRNC{5LZNLv1CFwD1fQ`>mZXHgIkx7qt; z8uWo-D-&>EC8d79T{0x+xAA8r&+dFxUkN9?9FSk@xTDgHR2wajw#p1@d(zDo(1=2A zzOCKyM5&;jrOl3k%{f!xUOimMK5^M{-a2)muS#(=vF4`BLFIz_*eQmV>s0$gzt^;d z88j{*7@WwI5FxOinhSd`WQJ4B-*(Gl%Kz%=%F(luzMKmy0~#~aw6}zn#wZM%0?gcy z_r-!2I41qKEY{zUVrgz{?J?vuW2$~JWyiq2>p~N)Iiq7(VZb(t0P@!z0j4a86u46W zfpQyo0wK|l(?1%=#z(y*p@X0St_TH;()-4!nzY+SIrXZ3zh{D38YgI)B^ zG{NjSv(#e2*XAlA*)q8`&xX4XVJ0(fN(Y4Axm_n)mQo+<5);wjo^^*RM~O;fc%ax+ zgPXs9=#u&F>(6LZPFH!ej1q2zW!@H?jDN>^m zh`pOOvHNvxcD|sWRyon^Q?XD_U(cE$r^3jpI|dMc)Qn6` zH7zAIiH9651I-=11E4VGA3)M@bMyxf%NjcVsH3o0m@M>+CNY5E@9s4 z%mbZB_9Dm0OdvDQDAfs2k3`13VL6s;pnTou3!XA*6L>!WL<%LFn5BQx2q=+ z2rg?*B6?Xu6(h%B@`%8IFsNTsvVP(?TR08@Qmg*|3+(^34>Z8p7%+ht90D$nmWSie zI2a!62IPQ^pg%++x`zc9C{owgP+s&^9rU6_EQjY4;bJ|pj1Fee8>*|=^vtk8p*~4dV+rc(+eaN0s$l6 z`_Ta&eDd_OK!&hx1CSm3UdXxzb^+PMFNJImkWKt7WLx=R6OcXpS|GW|zd;|6ZTu|I z$Z-fPm_r~dWCIimULMQ|WEUW${aSw7#Lp|F6fo~F@)fH8Qpj_?ejlRwr$BZA%_?;L z_Y?GA!Js+_QDk2T+&54nIp!I{kr4U-(a#7@{xTQTOyFQ<$!!*J6tw3BZB}qF$)E-U zo*KAWavS7o!IZDpfARMyay0wH=c9hGCcsf-*03Kj1vo}>2kUW){QVJNM+8Q+-oEZ4 zfu6}!F2Lk})SyTOh7X!Sw19#a7(1E971+{x%?%Em1i8%}4rYy9^8kx#z2*tbYrW$Ssh5V^@+kwZRqo%_G3#Zb>b!=N@;eCvJwdmrNV^S&DLS4i?tvazg&8X%G-IKq)o zM3PlQ{ZBFw*8e3#|4%Z641_iu0Z@_$J^m8rAnATc^RI}|nEz^ohW}S1q|~2-S&#Ya z`PQ-q?#>XL0*ez{ivKK`!aFj)ZeoC66UN%_~ue4Sn3P7qA;cK+D~ASE+!_wm+* zvblF_DxeT39QdFPjR0TNA@Mfizvv{`pXdUEya@~hz8`-;T*HEu-~#`r4FZ?YAGr6A zHY5m#VDkTH!&0EbLgxDi9S#J8KilLW_5DEy`NiMcz%Lw_#^2}=2sE_je{Vx0pr!JA z8v=#+Qx*go0r`^O=#U^}{xvqV+}~qEBO%xI`?DB4K()WO0UjW>{Mm*<{7okZxxL@| zL!jlMMfZCf2HG$D*#>0={n-Wx|7ychpu31Gj1o)ca&o&g~{QsasLmuLfHXOi1 zf40g0t$Qr~FWW%iPykH)@hnab^1Q$Ez~LbW`Fk5s-(R|t!$3a$4?6k3Z3Bq+m+s}k ziuv9NP4}U3>tdu!9+x~jJ09^2QjkMYybcN literal 0 HcmV?d00001 diff --git a/mlx/backend/cpu/gemms/aligned_buffer.h b/mlx/backend/cpu/gemms/aligned_buffer.h new file mode 100644 index 0000000000..08e5aeadd5 --- /dev/null +++ b/mlx/backend/cpu/gemms/aligned_buffer.h @@ -0,0 +1,65 @@ +// Copyright © 2025 Apple Inc. +#pragma once + +#include +#include + +namespace mlx::core { + +// 32-byte aligned buffer with grow-only reallocation (for thread_local reuse). +template +class aligned_unique_ptr { + private: + T* ptr_; + size_t size_; + + public: + aligned_unique_ptr() : ptr_(nullptr), size_(0) {} + + explicit aligned_unique_ptr(size_t size) : size_(size) { + ptr_ = static_cast(aligned_alloc(32, size * sizeof(T))); + if (!ptr_) + throw std::bad_alloc(); + } + + ~aligned_unique_ptr() { + if (ptr_) + free(ptr_); + } + + aligned_unique_ptr(aligned_unique_ptr&& other) noexcept + : ptr_(other.ptr_), size_(other.size_) { + other.ptr_ = nullptr; + other.size_ = 0; + } + + aligned_unique_ptr& operator=(aligned_unique_ptr&& other) noexcept { + if (this != &other) { + if (ptr_) + free(ptr_); + ptr_ = other.ptr_; + size_ = other.size_; + other.ptr_ = nullptr; + other.size_ = 0; + } + return *this; + } + + aligned_unique_ptr(const aligned_unique_ptr&) = delete; + aligned_unique_ptr& operator=(const aligned_unique_ptr&) = delete; + + T* get() const { return ptr_; } + + void reset(size_t new_size) { + if (new_size > size_) { + if (ptr_) + free(ptr_); + ptr_ = static_cast(aligned_alloc(32, new_size * sizeof(T))); + if (!ptr_) + throw std::bad_alloc(); + size_ = new_size; + } + } +}; + +} // namespace mlx::core diff --git a/mlx/backend/cpu/gemms/avx_simd_gemm.h b/mlx/backend/cpu/gemms/avx_simd_gemm.h new file mode 100644 index 0000000000..69d4129613 --- /dev/null +++ b/mlx/backend/cpu/gemms/avx_simd_gemm.h @@ -0,0 +1,397 @@ +// Copyright © 2025 Apple Inc. +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "mlx/backend/cpu/gemms/aligned_buffer.h" +#include "mlx/backend/cpu/gemms/avx_simd_gemv.h" +#include "mlx/backend/cpu/simd/avx_simd.h" + +namespace mlx::core { + +template +inline void pack_transpose_8x8( + const T* src, float* dst, int src_stride, int dst_stride) { + simd::transpose_8x8_block(src, dst, src_stride, dst_stride); +} + +// Pack A block (m_block x k_block) into A_packed (MC x KC float, column-major). +template +static void pack_A_block( + const T* A, float* A_packed, + int M, int K, int ldA, + int M_offset, int K_offset, + int m_block, int k_block, bool a_trans) +{ + static_assert(std::is_same_v || std::is_same_v, + "T must be float16 or bfloat16"); + constexpr int simd_width = 8; + + // Zero-fill only the portions we access (edge tiles) + if (m_block < MC || k_block < KC) { + for (int k = 0; k < k_block; ++k) { + std::fill(A_packed + k * MC, A_packed + k * MC + m_block, 0.0f); + } + } + + if (!a_trans) { + // A is row-major (M x K). Pack with 8x8 transpose blocks. + for (int k = 0; k < k_block; k += 8) { + int k_chunk = std::min(8, k_block - k); + + if (k_chunk == 8) { + for (int i = 0; i < m_block; i += 8) { + int m_chunk = std::min(8, m_block - i); + + if (m_chunk == 8) { + const T* a_block_start = A + (M_offset + i) * ldA + K_offset + k; + pack_transpose_8x8(a_block_start, A_packed + k * MC + i, ldA, MC); + } else { + for (int ii = 0; ii < m_chunk; ++ii) { + const T* a_src_row_ptr = A + (M_offset + i + ii) * ldA + K_offset + k; + for (int kk = 0; kk < k_chunk; ++kk) { + A_packed[(k + kk) * MC + (i + ii)] = static_cast(a_src_row_ptr[kk]); + } + } + } + } + } else { + for (int i = 0; i < m_block; ++i) { + const T* a_src_row_ptr = A + (M_offset + i) * ldA + K_offset + k; + for (int kk = 0; kk < k_chunk; ++kk) { + A_packed[(k + kk) * MC + i] = static_cast(a_src_row_ptr[kk]); + } + } + } + } + } else { + // A is transposed (K x M row-major). Contiguous copy with SIMD convert. + for (int k = 0; k < k_block; ++k) { + const T* a_src_row_ptr = A + (K_offset + k) * ldA + M_offset; + float* a_dst_col_ptr = A_packed + k * MC; + int i = 0; + for (; i + simd_width <= m_block; i += simd_width) { + simd::float8 a_vec = simd::load_convert_to_float(a_src_row_ptr + i); + simd::store(a_dst_col_ptr + i, a_vec); + } + for (; i < m_block; ++i) { + a_dst_col_ptr[i] = static_cast(a_src_row_ptr[i]); + } + } + } +} + +// Pack B block (k_block x n_block) into B_packed (KC x NC float, row-major). +template +static void pack_B_block( + const T* B, float* B_packed, + int K, int N, int ldB, + int K_offset, int N_offset, + int k_block, int n_block, bool b_trans) +{ + static_assert(std::is_same_v || std::is_same_v, + "T must be float16 or bfloat16"); + constexpr int simd_width = 8; + + if (k_block < KC || n_block < NC) { + for (int k = 0; k < k_block; ++k) { + std::fill(B_packed + k * NC, B_packed + k * NC + n_block, 0.0f); + } + } + + if (!b_trans) { + // B is row-major (K x N). Contiguous copy with SIMD convert. + for (int k = 0; k < k_block; ++k) { + const T* b_src_row_ptr = B + (K_offset + k) * ldB + N_offset; + float* b_dst_row_ptr = B_packed + k * NC; + int j = 0; + for (; j + simd_width <= n_block; j += simd_width) { + simd::float8 b_vec = simd::load_convert_to_float(b_src_row_ptr + j); + simd::store(b_dst_row_ptr + j, b_vec); + } + for (; j < n_block; ++j) { + b_dst_row_ptr[j] = static_cast(b_src_row_ptr[j]); + } + } + } else { + // B is transposed (N x K row-major). Pack with 8x8 transpose blocks. + for (int k = 0; k < k_block; k += 8) { + int k_chunk = std::min(8, k_block - k); + + if (k_chunk == 8) { + for (int j = 0; j < n_block; j += 8) { + int n_chunk = std::min(8, n_block - j); + + if (n_chunk == 8) { + const T* b_block_start = B + (N_offset + j) * ldB + K_offset + k; + float tmp_transpose[64]; + pack_transpose_8x8(b_block_start, tmp_transpose, ldB, 8); + for (int kk = 0; kk < 8; ++kk) { + for (int jj = 0; jj < 8; ++jj) { + B_packed[(k + kk) * NC + (j + jj)] = tmp_transpose[kk * 8 + jj]; + } + } + } else { + for (int kk = 0; kk < k_chunk; ++kk) { + float* b_dst_row_ptr = B_packed + (k + kk) * NC + j; + for (int jj = 0; jj < n_chunk; ++jj) { + b_dst_row_ptr[jj] = static_cast(B[(N_offset + j + jj) * ldB + (K_offset + k + kk)]); + } + } + } + } + } else { + for (int kk = 0; kk < k_chunk; ++kk) { + float* b_dst_row_ptr = B_packed + (k + kk) * NC; + for (int j = 0; j < n_block; ++j) { + b_dst_row_ptr[j] = static_cast(B[(N_offset + j) * ldB + (K_offset + k + kk)]); + } + } + } + } + } +} + +/** + * Optimized single-threaded matrix multiplication using AVX/FMA with + * float32 accumulation. Inputs/outputs are float16 or bfloat16. + * + * Uses jc→pc→ic loop order (classic Goto) so B is packed once per (jc,pc) + * and reused across all ic blocks. An M×NC_BLOCK fp32 accumulator keeps + * partial C sums in fp32 across K-panels, eliminating fp16 round-trips. + */ +template +void simd_gemm_optimized_higher_precision( + const T* a, const T* b, T* c, + bool a_trans, bool b_trans, + int M, int N, int K, + int ldA, int ldB, int ldC, + float alpha, float beta) +{ + static_assert(std::is_same_v || std::is_same_v, + "GEMM kernel requires float16_t or bfloat16_t."); + + // --- Blocking Parameters --- + constexpr int MR = 6; + constexpr int NR = 16; + static_assert(NR % 8 == 0, "NR must be multiple of float SIMD width (8)"); + + constexpr int KC_BLOCK = 256; + constexpr int MC_BLOCK = 96; + constexpr int NC_BLOCK = 256; + + static_assert(MC_BLOCK % MR == 0, "MC_BLOCK must be a multiple of MR"); + static_assert(NC_BLOCK % NR == 0, "NC_BLOCK must be a multiple of NR"); + + // Thread-local buffers (grow-only, reused across calls) + thread_local aligned_unique_ptr A_packed_buf(MC_BLOCK * KC_BLOCK); + thread_local aligned_unique_ptr B_packed_buf(KC_BLOCK * NC_BLOCK); + thread_local aligned_unique_ptr C_acc_buf(1); + + A_packed_buf.reset(MC_BLOCK * KC_BLOCK); + B_packed_buf.reset(KC_BLOCK * NC_BLOCK); + C_acc_buf.reset(M * NC_BLOCK); + + float* A_packed = A_packed_buf.get(); + float* B_packed = B_packed_buf.get(); + float* C_acc = C_acc_buf.get(); + + // Scalar fallback for edge tiles (m_micro < MR or n_micro < NR) + auto compute_block_scalar_partial = []( + + const float* A_panel, + const float* B_panel, + float* C_sub, + int ldc_acc, + int m_micro, int n_micro, int k_block, + int a_stride, + int b_stride) + { + for (int i = 0; i < m_micro; ++i) { + for (int j = 0; j < n_micro; ++j) { + float acc = C_sub[i * ldc_acc + j]; + for (int k = 0; k < k_block; ++k) { + acc += A_panel[i + k * a_stride] * B_panel[k * b_stride + j]; + } + C_sub[i * ldc_acc + j] = acc; + } + } + }; + + constexpr int sw = 8; + + for (int jc = 0; jc < N; jc += NC_BLOCK) { + int nc = std::min(NC_BLOCK, N - jc); + + for (int pc = 0; pc < K; pc += KC_BLOCK) { + int kc = std::min(KC_BLOCK, K - pc); + bool first_k = (pc == 0); + bool last_k = (pc + kc >= K); + + pack_B_block( + b, B_packed, K, N, ldB, pc, jc, kc, nc, b_trans); + + for (int ic = 0; ic < M; ic += MC_BLOCK) { + int mc = std::min(MC_BLOCK, M - ic); + + pack_A_block( + a, A_packed, M, K, ldA, ic, pc, mc, kc, a_trans); + + // Initialize C_acc on first K-panel + if (first_k) { + if (beta != 0.0f) { + simd::float8 beta_vec(beta); + for (int i = 0; i < mc; ++i) { + const T* c_row = c + (ic + i) * ldC + jc; + float* acc_row = C_acc + (ic + i) * NC_BLOCK; + int j = 0; + for (; j + sw <= nc; j += sw) { + simd::float8 cv = simd::load_convert_to_float(c_row + j); + simd::store(acc_row + j, beta_vec * cv); + } + for (; j < nc; ++j) { + acc_row[j] = beta * static_cast(c_row[j]); + } + } + } else { + for (int i = 0; i < mc; ++i) { + std::memset(C_acc + (ic + i) * NC_BLOCK, 0, nc * sizeof(float)); + } + } + } + + // Microkernel loop + for (int ir = 0; ir < mc; ir += MR) { + int m_micro = std::min(MR, mc - ir); + + for (int jr = 0; jr < nc; jr += NR) { + int n_micro = std::min(NR, nc - jr); + + const float* a_ptr = A_packed + ir; + const float* b_ptr = B_packed + jr; + float* c_ptr = C_acc + (ic + ir) * NC_BLOCK + jr; + + // Prefetch next C_acc tile into L2 + if (jr + NR < nc) { + for (int pi = 0; pi < MR && ir + pi < mc; ++pi) + _mm_prefetch(reinterpret_cast( + C_acc + (ic + ir + pi) * NC_BLOCK + jr + NR), _MM_HINT_T1); + } else if (ir + MR < mc) { + for (int pi = 0; pi < MR && ir + MR + pi < mc; ++pi) + _mm_prefetch(reinterpret_cast( + C_acc + (ic + ir + MR + pi) * NC_BLOCK), _MM_HINT_T1); + } + + if (m_micro == MR && n_micro == NR) { + simd::micro_kernel_6x16( + a_ptr, b_ptr, c_ptr, + NC_BLOCK, kc, + MC_BLOCK, NC_BLOCK); + } else { + compute_block_scalar_partial( + a_ptr, b_ptr, c_ptr, + NC_BLOCK, + m_micro, n_micro, kc, + MC_BLOCK, NC_BLOCK); + } + } + } + + // Write C_acc back to output on last K-panel + if (last_k) { + bool apply_alpha = (alpha != 1.0f); + simd::float8 alpha_vec(alpha); + + for (int i = 0; i < mc; ++i) { + T* c_row = c + (ic + i) * ldC + jc; + float* acc_row = C_acc + (ic + i) * NC_BLOCK; + int j = 0; + for (; j + sw <= nc; j += sw) { + simd::float8 acc = simd::load(acc_row + j); + if (apply_alpha) acc = alpha_vec * acc; + simd::store_convert_from_float(c_row + j, acc); + } + for (; j < nc; ++j) { + float val = acc_row[j]; + if (apply_alpha) val *= alpha; + c_row[j] = static_cast(val); + } + } + } + } // ic + } // pc + } // jc +} + +// Public interface: validates dimensions and dispatches to the blocked kernel. +template +void simd_gemm( + const T* a, + const T* b, + T* c, + bool a_trans, + bool b_trans, + size_t M_s, + size_t N_s, + size_t K_s, + float alpha = 1.0f, + float beta = 0.0f) +{ + static_assert(std::is_same_v || std::is_same_v, + "simd_gemm requires T = float16_t or bfloat16_t."); + static_assert(std::is_same_v, + "simd_gemm requires AccT = float."); + + if (M_s > static_cast(std::numeric_limits::max()) || + N_s > static_cast(std::numeric_limits::max()) || + K_s > static_cast(std::numeric_limits::max())) { + throw std::overflow_error("Matrix dimensions exceed int limits."); + } + int M = static_cast(M_s); + int N = static_cast(N_s); + int K = static_cast(K_s); + + if (M <= 0 || N <= 0) return; + + int ldA = (!a_trans) ? K : M; + int ldB = (!b_trans) ? N : K; + int ldC = N; + + // K=0: C = beta * C + if (K <= 0) { + if (beta == 0.0f) { + for (int i = 0; i < M; ++i) { + T zero_val = static_cast(0.0f); + std::fill(c + i * ldC, c + i * ldC + N, zero_val); + } + } else if (beta != 1.0f) { + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + float c_old_f = static_cast(c[i * ldC + j]); + c[i * ldC + j] = static_cast(beta * c_old_f); + } + } + } + return; + } + + // Dispatch to GEMV for M=1 or N=1 (avoids blocked GEMM overhead) + if (M == 1 || N == 1) { + simd_gemv(a, b, c, a_trans, b_trans, M, N, K, ldA, ldB, ldC, alpha, beta); + return; + } + + simd_gemm_optimized_higher_precision( + a, b, c, + a_trans, b_trans, + M, N, K, + ldA, ldB, ldC, + alpha, beta); +} + +} // namespace mlx::core \ No newline at end of file diff --git a/mlx/backend/cpu/gemms/avx_simd_gemv.h b/mlx/backend/cpu/gemms/avx_simd_gemv.h new file mode 100644 index 0000000000..89eedd335b --- /dev/null +++ b/mlx/backend/cpu/gemms/avx_simd_gemv.h @@ -0,0 +1,219 @@ +// Copyright © 2025 Apple Inc. +#pragma once + +#include +#include +#include + +#include "mlx/backend/cpu/gemms/aligned_buffer.h" +#include "mlx/backend/cpu/simd/avx_simd.h" + +namespace mlx::core { + +// Block size for output dimension in outer-product GEMV. +// 4096 floats = 16KB of fp32 accumulator, fits comfortably in L1 cache +// alongside the B row data and vector operand. +constexpr int GEMV_NC_BLOCK = 4096; + +// -------------------------------------------------------------------------- +// Outer-product GEMV core. +// acc[0:width] += sum_k vec[k] * mat[k * mat_stride + 0 : width] +// +// vec: K contiguous T elements (the "vector" operand) +// mat: K rows of `width` T elements, row stride = mat_stride +// acc: `width` fp32 elements (caller-initialized) +// +// Blocks along the output dimension so the accumulator fits in L1. +// -------------------------------------------------------------------------- +template +static void gemv_outer_product( + const T* vec, + const T* mat, + float* acc, + int K, int width, int mat_stride) +{ + constexpr int sw = 8; + + for (int jc = 0; jc < width; jc += GEMV_NC_BLOCK) { + int nc = std::min(GEMV_NC_BLOCK, width - jc); + float* acc_block = acc + jc; + + for (int k = 0; k < K; k++) { + float v = static_cast(vec[k]); + simd::float8 v_bcast(v); + const T* mat_row = mat + k * mat_stride + jc; + + // Prefetch start of next row for this block + if (k + 1 < K) { + _mm_prefetch( + reinterpret_cast(mat + (k + 1) * mat_stride + jc), + _MM_HINT_T0); + } + + int j = 0; + for (; j + sw <= nc; j += sw) { + simd::float8 m = simd::load_convert_to_float(mat_row + j); + simd::float8 c = simd::load(acc_block + j); + simd::store(acc_block + j, + simd::fma(v_bcast, m, c)); + } + for (; j < nc; j++) { + acc_block[j] += v * static_cast(mat_row[j]); + } + } + } +} + +// -------------------------------------------------------------------------- +// Dot-product GEMV core. +// acc[i] += dot(mat[i * mat_stride : +K], vec[0:K]) for i = 0..n_outputs-1 +// +// Processes 4 rows at once to amortize vec loads across rows. +// -------------------------------------------------------------------------- +template +static void gemv_dot_product( + const T* mat, + const T* vec, + float* acc, + int n_outputs, int K, int mat_stride) +{ + constexpr int sw = 8; + constexpr int UNROLL = 4; + + int i = 0; + for (; i + UNROLL <= n_outputs; i += UNROLL) { + simd::float8 s0, s1, s2, s3; + + const T* r0 = mat + (i + 0) * mat_stride; + const T* r1 = mat + (i + 1) * mat_stride; + const T* r2 = mat + (i + 2) * mat_stride; + const T* r3 = mat + (i + 3) * mat_stride; + + int k = 0; + for (; k + sw <= K; k += sw) { + simd::float8 v = simd::load_convert_to_float(vec + k); + s0 = simd::fma(simd::load_convert_to_float(r0 + k), v, s0); + s1 = simd::fma(simd::load_convert_to_float(r1 + k), v, s1); + s2 = simd::fma(simd::load_convert_to_float(r2 + k), v, s2); + s3 = simd::fma(simd::load_convert_to_float(r3 + k), v, s3); + } + + float d0 = simd::sum(s0); + float d1 = simd::sum(s1); + float d2 = simd::sum(s2); + float d3 = simd::sum(s3); + + for (; k < K; k++) { + float vk = static_cast(vec[k]); + d0 += vk * static_cast(r0[k]); + d1 += vk * static_cast(r1[k]); + d2 += vk * static_cast(r2[k]); + d3 += vk * static_cast(r3[k]); + } + + acc[i + 0] += d0; + acc[i + 1] += d1; + acc[i + 2] += d2; + acc[i + 3] += d3; + } + + for (; i < n_outputs; i++) { + simd::float8 s; + const T* row = mat + i * mat_stride; + + int k = 0; + for (; k + sw <= K; k += sw) { + simd::float8 v = simd::load_convert_to_float(vec + k); + s = simd::fma(simd::load_convert_to_float(row + k), v, s); + } + + float d = simd::sum(s); + for (; k < K; k++) { + d += static_cast(vec[k]) * static_cast(row[k]); + } + acc[i] += d; + } +} + +// -------------------------------------------------------------------------- +// Public GEMV interface. +// Handles M=1 and N=1 with all transpose combinations. +// C = alpha * op(A) * op(B) + beta * C +// +// Dispatch logic: +// M=1, B not transposed → outer product (SIMD along N, stream B rows) +// M=1, B transposed → dot product (SIMD along K, one dot per j) +// N=1, A not transposed → dot product (SIMD along K, one dot per i) +// N=1, A transposed → outer product (SIMD along M, stream A cols) +// -------------------------------------------------------------------------- +template +void simd_gemv( + const T* a, const T* b, T* c, + bool a_trans, bool b_trans, + int M, int N, int K, + int ldA, int ldB, int ldC, + float alpha, float beta) +{ + int out_len = (M == 1) ? N : M; + + // fp32 accumulator (thread-local, grow-only) + thread_local aligned_unique_ptr acc_buf(1); + acc_buf.reset(out_len); + float* acc = acc_buf.get(); + + // Initialize accumulator: acc = beta * C + // When M=1, C is 1×N contiguous. When N=1, C is M×1 contiguous (ldC=1). + constexpr int sw = 8; + + if (beta != 0.0f) { + simd::float8 beta_vec(beta); + int j = 0; + for (; j + sw <= out_len; j += sw) { + simd::float8 cv = simd::load_convert_to_float(c + j); + simd::store(acc + j, beta_vec * cv); + } + for (; j < out_len; j++) { + acc[j] = beta * static_cast(c[j]); + } + } else { + std::memset(acc, 0, out_len * sizeof(float)); + } + + // Accumulate: acc += op(A) * op(B) + if (M == 1) { + // A is always contiguous for M=1: a[k] for k=0..K-1 + if (!b_trans) { + // B is row-major K×N, stride ldB → outer product along N + gemv_outer_product(a, b, acc, K, N, ldB); + } else { + // B stored as N×K, stride ldB → dot product per output j + gemv_dot_product(b, a, acc, N, K, ldB); + } + } else { + // N=1: B is always contiguous: b[k] for k=0..K-1 + if (!a_trans) { + // A is row-major M×K, stride ldA → dot product per output i + gemv_dot_product(a, b, acc, M, K, ldA); + } else { + // A stored as K×M, stride ldA → outer product along M + gemv_outer_product(b, a, acc, K, M, ldA); + } + } + + // Write back: C = alpha * acc (convert fp32 → T) + bool apply_alpha = (alpha != 1.0f); + simd::float8 alpha_vec(alpha); + int j = 0; + for (; j + sw <= out_len; j += sw) { + simd::float8 val = simd::load(acc + j); + if (apply_alpha) val = alpha_vec * val; + simd::store_convert_from_float(c + j, val); + } + for (; j < out_len; j++) { + float val = acc[j]; + if (apply_alpha) val *= alpha; + c[j] = static_cast(val); + } +} + +} // namespace mlx::core diff --git a/mlx/backend/cpu/gemms/simd_bf16.cpp b/mlx/backend/cpu/gemms/simd_bf16.cpp index 58f5964b6e..b841ffe450 100644 --- a/mlx/backend/cpu/gemms/simd_bf16.cpp +++ b/mlx/backend/cpu/gemms/simd_bf16.cpp @@ -2,7 +2,12 @@ #include "mlx/backend/common/utils.h" #include "mlx/backend/cpu/gemm.h" + +#ifdef MLX_USE_AVX +#include "mlx/backend/cpu/gemms/avx_simd_gemm.h" +#else #include "mlx/backend/cpu/gemms/simd_gemm.h" +#endif namespace mlx::core { diff --git a/mlx/backend/cpu/gemms/simd_fp16.cpp b/mlx/backend/cpu/gemms/simd_fp16.cpp index 93467da868..5e298a3a94 100644 --- a/mlx/backend/cpu/gemms/simd_fp16.cpp +++ b/mlx/backend/cpu/gemms/simd_fp16.cpp @@ -2,7 +2,12 @@ #include "mlx/backend/common/utils.h" #include "mlx/backend/cpu/gemm.h" + +#ifdef MLX_USE_AVX +#include "mlx/backend/cpu/gemms/avx_simd_gemm.h" +#else #include "mlx/backend/cpu/gemms/simd_gemm.h" +#endif namespace mlx::core { diff --git a/mlx/backend/cpu/simd/avx_simd.h b/mlx/backend/cpu/simd/avx_simd.h new file mode 100644 index 0000000000..cb3d684530 --- /dev/null +++ b/mlx/backend/cpu/simd/avx_simd.h @@ -0,0 +1,393 @@ +// Copyright © 2025 Apple Inc. +#pragma once + +#include +#include +#include +#include + +#include "mlx/backend/cpu/simd/base_simd.h" + +namespace mlx::core::simd { + +// Forward declarations +template struct Simd; +template inline Simd load(const T* ptr); +template inline void store(T* ptr, Simd x); +template inline Simd broadcast(const T* ptr); +template inline Simd fma(Simd a, Simd b, Simd c); + + +// Simd — wraps __m256 for AVX operations. +using float8 = Simd; + +template <> +struct Simd { + static constexpr int size = 8; + __m256 value; + + Simd() : value(_mm256_setzero_ps()) {} + Simd(float v) : value(_mm256_set1_ps(v)) {} + explicit Simd(__m256 v) : value(v) {} + Simd(const Simd& other) = default; + Simd& operator=(const Simd& other) = default; + operator __m256() const { return value; } +}; + +// --- Load/Store (float) --- +template <> inline float8 load(const float* x) { + return float8(_mm256_loadu_ps(x)); +} +template <> inline void store(float* dst, float8 x) { + _mm256_storeu_ps(dst, x.value); +} +template <> inline float8 broadcast(const float* x) { + return float8(_mm256_broadcast_ss(x)); +} + +// --- Arithmetic --- +inline float8 operator+(float8 a, float8 b) { return float8(_mm256_add_ps(a, b)); } +inline float8 operator-(float8 a, float8 b) { return float8(_mm256_sub_ps(a, b)); } +inline float8 operator*(float8 a, float8 b) { return float8(_mm256_mul_ps(a, b)); } +inline float8 operator/(float8 a, float8 b) { return float8(_mm256_div_ps(a, b)); } + +// --- FMA --- +template <> inline float8 fma(float8 a, float8 b, float8 c) { +#ifdef __AVX2__ + return float8(_mm256_fmadd_ps(a, b, c)); +#else + return float8(_mm256_add_ps(_mm256_mul_ps(a, b), c)); +#endif +} + +// --- Horizontal Sum --- +inline float sum(float8 x) { + __m256 val = x.value; + __m128 vlow = _mm256_castps256_ps128(val); + __m128 vhigh = _mm256_extractf128_ps(val, 1); // high 128 + vlow = _mm_add_ps(vlow, vhigh); // add the low 128 + __m128 shuf = _mm_movehdup_ps(vlow); // broadcast elements 3,1 to 2,0 + __m128 sums = _mm_add_ps(vlow, shuf); + shuf = _mm_movehl_ps(shuf, sums); // high half -> low half + sums = _mm_add_ss(sums, shuf); + return _mm_cvtss_f32(sums); +} + +// 8x8 block transpose with fp16/bf16 → fp32 conversion. +// Loads 8 rows of 8 half-precision values, converts and transposes to fp32. +template +inline void transpose_8x8_block(const T* src, float* dst, int src_stride, int dst_stride) { + static_assert(std::is_same_v || std::is_same_v, + "transpose_8x8_block requires float16_t or bfloat16_t input"); + + if constexpr (std::is_same_v) { +#ifdef __F16C__ + // Load 8 rows of 8 float16 values, convert to fp32 + __m128i row0 = _mm_loadu_si128(reinterpret_cast(src)); + __m128i row1 = _mm_loadu_si128(reinterpret_cast(src + src_stride)); + __m128i row2 = _mm_loadu_si128(reinterpret_cast(src + 2 * src_stride)); + __m128i row3 = _mm_loadu_si128(reinterpret_cast(src + 3 * src_stride)); + __m128i row4 = _mm_loadu_si128(reinterpret_cast(src + 4 * src_stride)); + __m128i row5 = _mm_loadu_si128(reinterpret_cast(src + 5 * src_stride)); + __m128i row6 = _mm_loadu_si128(reinterpret_cast(src + 6 * src_stride)); + __m128i row7 = _mm_loadu_si128(reinterpret_cast(src + 7 * src_stride)); + + // Convert to fp32 (vcvtph2ps: 1/cycle throughput, 3 cycle latency) + __m256 frow0 = _mm256_cvtph_ps(row0); + __m256 frow1 = _mm256_cvtph_ps(row1); + __m256 frow2 = _mm256_cvtph_ps(row2); + __m256 frow3 = _mm256_cvtph_ps(row3); + __m256 frow4 = _mm256_cvtph_ps(row4); + __m256 frow5 = _mm256_cvtph_ps(row5); + __m256 frow6 = _mm256_cvtph_ps(row6); + __m256 frow7 = _mm256_cvtph_ps(row7); + + // Transpose via unpack / shuffle / permute + __m256 t0 = _mm256_unpacklo_ps(frow0, frow1); + __m256 t1 = _mm256_unpackhi_ps(frow0, frow1); + __m256 t2 = _mm256_unpacklo_ps(frow2, frow3); + __m256 t3 = _mm256_unpackhi_ps(frow2, frow3); + __m256 t4 = _mm256_unpacklo_ps(frow4, frow5); + __m256 t5 = _mm256_unpackhi_ps(frow4, frow5); + __m256 t6 = _mm256_unpacklo_ps(frow6, frow7); + __m256 t7 = _mm256_unpackhi_ps(frow6, frow7); + + __m256 tt0 = _mm256_shuffle_ps(t0, t2, 0x44); + __m256 tt1 = _mm256_shuffle_ps(t0, t2, 0xEE); + __m256 tt2 = _mm256_shuffle_ps(t1, t3, 0x44); + __m256 tt3 = _mm256_shuffle_ps(t1, t3, 0xEE); + __m256 tt4 = _mm256_shuffle_ps(t4, t6, 0x44); + __m256 tt5 = _mm256_shuffle_ps(t4, t6, 0xEE); + __m256 tt6 = _mm256_shuffle_ps(t5, t7, 0x44); + __m256 tt7 = _mm256_shuffle_ps(t5, t7, 0xEE); + + __m256 r0 = _mm256_permute2f128_ps(tt0, tt4, 0x20); + __m256 r1 = _mm256_permute2f128_ps(tt1, tt5, 0x20); + __m256 r2 = _mm256_permute2f128_ps(tt2, tt6, 0x20); + __m256 r3 = _mm256_permute2f128_ps(tt3, tt7, 0x20); + __m256 r4 = _mm256_permute2f128_ps(tt0, tt4, 0x31); + __m256 r5 = _mm256_permute2f128_ps(tt1, tt5, 0x31); + __m256 r6 = _mm256_permute2f128_ps(tt2, tt6, 0x31); + __m256 r7 = _mm256_permute2f128_ps(tt3, tt7, 0x31); + + _mm256_storeu_ps(dst + 0*dst_stride, r0); + _mm256_storeu_ps(dst + 1*dst_stride, r1); + _mm256_storeu_ps(dst + 2*dst_stride, r2); + _mm256_storeu_ps(dst + 3*dst_stride, r3); + _mm256_storeu_ps(dst + 4*dst_stride, r4); + _mm256_storeu_ps(dst + 5*dst_stride, r5); + _mm256_storeu_ps(dst + 6*dst_stride, r6); + _mm256_storeu_ps(dst + 7*dst_stride, r7); +#else + // Fallback without F16C + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + dst[j * dst_stride + i] = static_cast(src[i * src_stride + j]); + } + } +#endif + } else { // bfloat16_t +#ifdef __AVX2__ + // bf16 → fp32: zero-extend to 32-bit, shift left 16 + __m256 rows[8]; + for (int i = 0; i < 8; i++) { + __m128i bf16_vals_u16 = _mm_loadu_si128( + reinterpret_cast(src + i * src_stride)); + __m256i bf16_vals_u32 = _mm256_cvtepu16_epi32(bf16_vals_u16); + __m256i fp32_bits = _mm256_slli_epi32(bf16_vals_u32, 16); + rows[i] = _mm256_castsi256_ps(fp32_bits); + } + + // Transpose the 8 rows using AVX shuffles + __m256 t0 = _mm256_unpacklo_ps(rows[0], rows[1]); + __m256 t1 = _mm256_unpackhi_ps(rows[0], rows[1]); + __m256 t2 = _mm256_unpacklo_ps(rows[2], rows[3]); + __m256 t3 = _mm256_unpackhi_ps(rows[2], rows[3]); + __m256 t4 = _mm256_unpacklo_ps(rows[4], rows[5]); + __m256 t5 = _mm256_unpackhi_ps(rows[4], rows[5]); + __m256 t6 = _mm256_unpacklo_ps(rows[6], rows[7]); + __m256 t7 = _mm256_unpackhi_ps(rows[6], rows[7]); + + __m256 tt0 = _mm256_shuffle_ps(t0, t2, 0x44); + __m256 tt1 = _mm256_shuffle_ps(t0, t2, 0xEE); + __m256 tt2 = _mm256_shuffle_ps(t1, t3, 0x44); + __m256 tt3 = _mm256_shuffle_ps(t1, t3, 0xEE); + __m256 tt4 = _mm256_shuffle_ps(t4, t6, 0x44); + __m256 tt5 = _mm256_shuffle_ps(t4, t6, 0xEE); + __m256 tt6 = _mm256_shuffle_ps(t5, t7, 0x44); + __m256 tt7 = _mm256_shuffle_ps(t5, t7, 0xEE); + + __m256 r0 = _mm256_permute2f128_ps(tt0, tt4, 0x20); + __m256 r1 = _mm256_permute2f128_ps(tt1, tt5, 0x20); + __m256 r2 = _mm256_permute2f128_ps(tt2, tt6, 0x20); + __m256 r3 = _mm256_permute2f128_ps(tt3, tt7, 0x20); + __m256 r4 = _mm256_permute2f128_ps(tt0, tt4, 0x31); + __m256 r5 = _mm256_permute2f128_ps(tt1, tt5, 0x31); + __m256 r6 = _mm256_permute2f128_ps(tt2, tt6, 0x31); + __m256 r7 = _mm256_permute2f128_ps(tt3, tt7, 0x31); + + _mm256_storeu_ps(dst + 0*dst_stride, r0); + _mm256_storeu_ps(dst + 1*dst_stride, r1); + _mm256_storeu_ps(dst + 2*dst_stride, r2); + _mm256_storeu_ps(dst + 3*dst_stride, r3); + _mm256_storeu_ps(dst + 4*dst_stride, r4); + _mm256_storeu_ps(dst + 5*dst_stride, r5); + _mm256_storeu_ps(dst + 6*dst_stride, r6); + _mm256_storeu_ps(dst + 7*dst_stride, r7); +#else + // Scalar fallback + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + dst[j * dst_stride + i] = static_cast(src[i * src_stride + j]); + } + } +#endif + } +} + +// ========================================================================== +// Conversion and Combined Operations (T -> float -> T) +// T = float16_t or bfloat16_t +// ========================================================================== + +// Load 8 half-precision values, convert to float8. +template +inline float8 load_convert_to_float(const T* src) { + static_assert(std::is_same_v || std::is_same_v, + "load_convert_to_float requires float16_t or bfloat16_t input for this specialization."); + static_assert(sizeof(T) == 2, "Input type T must be 2 bytes."); + + if constexpr (std::is_same_v) { +#ifdef __F16C__ + __m128i f16_vals = _mm_loadu_si128(reinterpret_cast(src)); + return float8(_mm256_cvtph_ps(f16_vals)); +#else + float buffer[8]; + for (int i = 0; i < 8; ++i) buffer[i] = static_cast(src[i]); + return load(buffer); +#endif + } else { // bfloat16_t +#ifdef __AVX2__ + // bf16 → fp32: zero-extend to 32-bit then shift left 16 + __m128i bf16_vals_u16 = _mm_loadu_si128(reinterpret_cast(src)); + __m256i bf16_vals_u32 = _mm256_cvtepu16_epi32(bf16_vals_u16); + __m256i fp32_bits = _mm256_slli_epi32(bf16_vals_u32, 16); + return float8(_mm256_castsi256_ps(fp32_bits)); +#else + // Scalar fallback + float buffer[8]; + for (int i = 0; i < 8; ++i) { + uint32_t val_int = static_cast(reinterpret_cast(src)[i]) << 16; + std::memcpy(&buffer[i], &val_int, sizeof(float)); + } + return load(buffer); +#endif + } +} + +// fp32 → bf16 with round-to-nearest-even. +#ifdef __AVX2__ +inline __m128i convert_float_to_bfloat16_rne_avx2(__m256 src) { + __m256i val_int = _mm256_castps_si256(src); + __m256i bias = _mm256_set1_epi32(0x7FFF); + __m256i rounded_val = _mm256_add_epi32(val_int, bias); + __m256i bf16_bits_32 = _mm256_srli_epi32(rounded_val, 16); + __m128i bf16_bits_low = _mm256_castsi256_si128(bf16_bits_32); + __m128i bf16_bits_high = _mm256_extracti128_si256(bf16_bits_32, 1); + // Use signed pack to preserve negative values + return _mm_packs_epi32(bf16_bits_low, bf16_bits_high); +} +#endif + +// Store float8, converting back to 8 half-precision values. +template +inline void store_convert_from_float(T* dst, float8 src) { + static_assert(std::is_same_v || std::is_same_v, + "store_convert_from_float requires float16_t or bfloat16_t output for this specialization."); + static_assert(sizeof(T) == 2, "Output type T must be 2 bytes."); + + if constexpr (std::is_same_v) { +#ifdef __F16C__ + __m128i f16_result = _mm256_cvtps_ph( + src.value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), f16_result); +#else + float buffer[8]; + store(buffer, src); + for(int i=0; i<8; ++i) dst[i] = static_cast(buffer[i]); +#endif + } else { // bfloat16_t +#ifdef __AVX2__ + __m128i bf16_result = convert_float_to_bfloat16_rne_avx2(src.value); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), bf16_result); +#else + // Scalar fallback with RNE + float buffer[8]; + store(buffer, src); + alignas(16) uint16_t bf16_bits_arr[8]; + for (int i = 0; i < 8; ++i) { + uint32_t val_int; + std::memcpy(&val_int, &buffer[i], sizeof(float)); + + // Handle NaN + if ((val_int & 0x7F800000) == 0x7F800000 && (val_int & 0x007FFFFF) != 0) { + bf16_bits_arr[i] = 0x7FC0 | static_cast((val_int >> 16) & 0x003F); + } else { + uint32_t rounding_bias = ((val_int >> 16) & 1) + 0x7FFF; + val_int += rounding_bias; + bf16_bits_arr[i] = static_cast(val_int >> 16); + } + } + std::memcpy(dst, bf16_bits_arr, 8 * sizeof(uint16_t)); +#endif + } +} + +// 6×16 AVX2 microkernel: C[6][16] += A[6][kc] * B[kc][16] +// Uses 12 YMM accumulators + 2 B loads + 1 A broadcast = 15 registers. +template +inline void micro_kernel_6x16( + const float* __restrict A_panel, + const float* __restrict B_panel, + float* __restrict C_block, + int ldc, + int kc, + int a_stride, + int b_stride) +{ + static_assert(MR == 6, "This kernel requires MR=6"); + static_assert(NR == 16, "This kernel requires NR=16"); + + // 12 accumulators + 2 B loads + 1 A broadcast = 15 YMM registers + __m256 c00 = _mm256_loadu_ps(C_block + 0*ldc); + __m256 c01 = _mm256_loadu_ps(C_block + 0*ldc + 8); + __m256 c10 = _mm256_loadu_ps(C_block + 1*ldc); + __m256 c11 = _mm256_loadu_ps(C_block + 1*ldc + 8); + __m256 c20 = _mm256_loadu_ps(C_block + 2*ldc); + __m256 c21 = _mm256_loadu_ps(C_block + 2*ldc + 8); + __m256 c30 = _mm256_loadu_ps(C_block + 3*ldc); + __m256 c31 = _mm256_loadu_ps(C_block + 3*ldc + 8); + __m256 c40 = _mm256_loadu_ps(C_block + 4*ldc); + __m256 c41 = _mm256_loadu_ps(C_block + 4*ldc + 8); + __m256 c50 = _mm256_loadu_ps(C_block + 5*ldc); + __m256 c51 = _mm256_loadu_ps(C_block + 5*ldc + 8); + + // Prefetch B and A data 8 iterations ahead into L1 + constexpr int PF_DIST = 8; + + for (int k = 0; k < kc; ++k) { + const float* b_ptr = B_panel + k * b_stride; + const float* a_ptr = A_panel + k * a_stride; + + // Prefetch next B and A rows into L1 + if (k + PF_DIST < kc) { + _mm_prefetch(reinterpret_cast(B_panel + (k + PF_DIST) * b_stride), _MM_HINT_T0); + _mm_prefetch(reinterpret_cast(B_panel + (k + PF_DIST) * b_stride + 8), _MM_HINT_T0); + _mm_prefetch(reinterpret_cast(A_panel + (k + PF_DIST) * a_stride), _MM_HINT_T0); + } + + __m256 b0 = _mm256_loadu_ps(b_ptr); + __m256 b1 = _mm256_loadu_ps(b_ptr + 8); + + __m256 a; + a = _mm256_broadcast_ss(a_ptr + 0); + c00 = _mm256_fmadd_ps(a, b0, c00); + c01 = _mm256_fmadd_ps(a, b1, c01); + + a = _mm256_broadcast_ss(a_ptr + 1); + c10 = _mm256_fmadd_ps(a, b0, c10); + c11 = _mm256_fmadd_ps(a, b1, c11); + + a = _mm256_broadcast_ss(a_ptr + 2); + c20 = _mm256_fmadd_ps(a, b0, c20); + c21 = _mm256_fmadd_ps(a, b1, c21); + + a = _mm256_broadcast_ss(a_ptr + 3); + c30 = _mm256_fmadd_ps(a, b0, c30); + c31 = _mm256_fmadd_ps(a, b1, c31); + + a = _mm256_broadcast_ss(a_ptr + 4); + c40 = _mm256_fmadd_ps(a, b0, c40); + c41 = _mm256_fmadd_ps(a, b1, c41); + + a = _mm256_broadcast_ss(a_ptr + 5); + c50 = _mm256_fmadd_ps(a, b0, c50); + c51 = _mm256_fmadd_ps(a, b1, c51); + } + + _mm256_storeu_ps(C_block + 0*ldc, c00); + _mm256_storeu_ps(C_block + 0*ldc + 8, c01); + _mm256_storeu_ps(C_block + 1*ldc, c10); + _mm256_storeu_ps(C_block + 1*ldc + 8, c11); + _mm256_storeu_ps(C_block + 2*ldc, c20); + _mm256_storeu_ps(C_block + 2*ldc + 8, c21); + _mm256_storeu_ps(C_block + 3*ldc, c30); + _mm256_storeu_ps(C_block + 3*ldc + 8, c31); + _mm256_storeu_ps(C_block + 4*ldc, c40); + _mm256_storeu_ps(C_block + 4*ldc + 8, c41); + _mm256_storeu_ps(C_block + 5*ldc, c50); + _mm256_storeu_ps(C_block + 5*ldc + 8, c51); +} + + +} // namespace mlx::core::simd \ No newline at end of file diff --git a/mlx/event.h b/mlx/event.h index 66a6a75df5..1ed8db234c 100644 --- a/mlx/event.h +++ b/mlx/event.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "mlx/stream.h" diff --git a/results/unknown_float16_gemv.pdf b/results/unknown_float16_gemv.pdf new file mode 100644 index 0000000000000000000000000000000000000000..7ef57cfdda1fa63a048d1c8cdf4a53e1bcb8257b GIT binary patch literal 17615 zcmbun2{={V_W)cW+zc5@rfaIqcjlThN60)6nXVx-m!!!YndeAlEQ%yjh7g&eP$E+@ z6v|LiRN~v`s`u@!`~QBP@4KGe*=L<~_F8-GwbxpEpS=>&Q&ks7N?>3@B_r^gH83O` z0r#>$4wI9EOB$W@Ccq_C?1*;mUe0hyJv(QD9~=b`4B!e1FoLHeged*n3+i5;L^zs+ zAZey+f1KbzgiCGSss$1c8xrk^1UP0Bpl3%U5_~=3Sm+ieY3O3-=<4YV$NhZj>*Zib zAi^yHSygqw3L%gPm(=tCEGYlHs{Fia!mWQ}2l;mgAig2)eF=cOP4*=X2!3Avz7C*2 z5dQ{nNgaZttDUk}AfSifxxE`>+ItznWXpi|&YKRA99Q_0iQ3)~?gwBH2*J^m#* zZGxvW(FG0~AvLb<>JIt}msEEL0#GG5csUXvmi>sn1UnB{V0MeeuqT%k^O?o6nZukK z2k%@MI1{sC4}VF8iw>r@LJdJwPtUVjheti_diOl)p5+Pa-LO)^E@4YRVu9I*nkQ#e9ZLu zq7lv2x%tIok>j7eztBh4w?$1iH}-m$<0E@h(85e=+KuVjA7M2^D#EHN>2>s>PZ*^> z%=1@TPSHn&6m;Z73sG^LMj*o{ip`C4B{*xW0|R$z9irc8IXAz0x5fQRtZq{1G~E!6 zBg5BwvgPxKxzp0$FL7Sx?mnh_%4dCXyk+%~0{5}F=`n_L^1St&`}eDqx27@Qaoc;a zvJ);eS@Fc(kfttrhk0#dQ@K@0{(|aUkAj_*6%ysy!~dZt^9wzV}s)-FxWE z9qsN!M^#X3FR!DX+6emGOWJkG*3M=|`^%RV#zSZ7>Q+B|8Pzq-kUBAMcwIq;W$)s} zH!o8N39E*Mu@Oay+e5(OEi(OCun+ ztx3GjO|98E?4D!3m#a}=MC8h~sX1SUK7rn`m>mP7zUS_a+0 zuOkbQ06$q&V=qsA*UJ>1xSq&unQz@Pd(2s24EwU_@FHKk`gU#GI4#2;jhJXj={!+3 zR)0!HCOk*%+_8<*em?m6(@SfEgWofBFTH6kJRU4~|NF#l@4bFORZq`<9DTD|_R*=} z<1kZ_vO%rSR3asRVepSnAHN)H+R$XT(c~(M9XrH3e8j17wrTLlMFkmLELU=NAA-Yg z_nXVzd^PYl4jD|R=;QZ>c3+%QgUgpw9Mu@*VCWW@s3`9@Pd9K}wOjQ(9c4WMvkFKr z-%BeXXFzu?`OyAT@;gp-OC-F}O5RiUWGVag;V991tk#Fno@p%y_6yt&6wiF_9?=-8 zt~#F4aQ&90#7o53XtSZ) zw^F!}*-Ej4Qb6#ne%aoUvz2d6dv|#pjVRmM_Yx;~`x)YBzg&RL(U3IGx24|LH%FRZ zc-}P|&1*SRzuV`dyQIW5Ly@Px2X@493iJf$?-)dKHhPy2@7b4b>vDr|-)BsOSm{*s z1#xF!;o*0I)kEv+`CkrmyXfxHc&x3wd*Y(6+nZRNy2$cDPTIRY%OgMbUl9=H<2hoG zOc&9E6e@~2HWW6K6&6_gfyio}3J(>#TTXMz4R-w5%kcaQRr{)*`V>SFxMHW2@012T z_cj&0?MM@^A+<7UC6A68T*arA#9x!I*Z3$6r%D&{L>D*;G*ej^&qwmb?cE zXCmKE%+ylwz!q3E9lb;KOysm5PZ%9Zljg_|bh+Otpz&a*)E)3HpQcB1b)|tx^BI1n zI?U>alLaMhJ~4VmCod)b_(SIr(1`HCqNQwzIKd)TSsG6-)+2_D^^mOt^Mj5na5xFJ&Uf4{u%-9kl-V zhrH&52aav^bhGNXs5Z0U4v}N;H5a&Rswa6epX3dO&AdI_H<-e2EnY0Kp=zQRGr{G&NCZuGtXe#5Bu zSsQEZ{Hprtv4*Cs^;e@ydm4;`RZi)mPGvk23e$a7yMFR!U$LCF!!})3V+IOtVrKty1==v|$ z<*DCY?Z0m_wy&)JJ=^=E;)^Q|G~W8!6>LXB{H&Vn)GS&DJ_}zz{+yS;>2C3n2sf>? z*Zq+^h0S*_A5)F$UOC;Bs2g(n!JA7-)#tnPZ=u49Y(`Z)UpoY~AJ?I}AhnQgJqw?N z&t;h(z$Ly|g3XDKRV7H3H1x}xu(H0)_wJE>)o6FuzeE!_lRQ^0FL?Qol!+i_NWiBD z`>N6Yu4{=o^u&Y1%eMTk0~cNXEuX8`kHtn!nPpjc2)L`)j!s;p6A^hkb2;HUF6QG6 zy0qfU*M$$>AB$0W;e7hztLyGRdY5XRC2A}%DGjCPgqpg**$!W2DzTbvc)RYYc$zAl zi`Q}=n@w&~idBzVN4GrV?1`9Y{h*!kd{{Bo5oaYqc}YQGQ-h6#@>&kcLt?nXmuC|c zfo0EQRmN#$R1G&Ro`@Obt5VOfWM`XCu{BC!4U6zQ5+H4G)RZ~uz(KtuuDUk>)trON zN55$tbon7{5#(u}FQa@*(~x#;ufaFZk00Lz_4lN!w|?!zoFH8C`_LazRV{jSw@S7V zVSe6Vxyb&(X)70VUN7TPo%1ZR{TgH84}?A1_FdppJFVbirot6$F&xomcHa47yoc%g z9lq6=lF@e&_Mg08S2a~~n@`_A@rtcDcH;43Ls{C|dWV4J3-3}nTBonx_7z!Xx6CxV z&TCB~D)y=g&EafYr(OuDM1Rzs3_U8qeRTf?;TtsbZB}%r=tP`e6>tbN<5iU6gCg3$ zC!LaDtJW(Ky}mVPHv}^}PK0 zy4&KY?57j@w;8o&Y7|wQ#txkiBCL$;NqzW2)r*gH3U$O%I&4UhipPQ1hLFYc9d*Wq z&FP)mr3)CCyV32eacPc{p(iqrge$Mh4Dl?x9x?lzg{6oH;HadZNonz($Gm?Z4Es_> zk+txt!`{q46H6NrV2&xBPHDM)94X4YZ$3$SM)s;$r13fP3O$z$wPE7foV~Z?r|As> zPqlGWAz(RS3-4mT)IOnYDOZVp?U(CN=w0}l<|GyVb56O%cv@s;O3>BP(^%uv3sWYa zhzXN#YKN#*Me18A2W6E~(DCkbsM$pt9V%Sq+;*0TG6bKjbwCE)tC3n>GglnTX=$PN z3@fVRZrXy^5A0WuziYk!hy1u^exSk#gdFlrF*w!JMZ0)-Vt$d9I%X~-iIVZ6(doG8 z!%VK+k2Ml1{2ogcyTz=O`_C#Hf1r}BQ{C>bO*@;zG^Anpf`-7WXPaxYOGhA8?>GxA zJZE1nFg9pn;7VwP#|QRmt1cGZ(J?AI59^^;tX|A<-*4|$eMYUVCq@h_u}?gO!MNh;19XD(bt2o=?vV}L{58F+06F23@R!QJ)1F7 zS_yOJxUN)a!d60cGtyb8*z{(Wq5ktb#vFP6ZWn8G+CRm4s##=HcXzt>%tkw#X_?Y@ ztG|EoZe48QqeGNwpZoWS${>2X?U7VIOf~~2mf~*z2s;-ZZ0saLb+g3ixG~4c)JS98 zSqY|L3?XsNVd#vPh!U6J*wb^~Gv^wCL~1c@Q(E!V$8#R?5b8`=dr5HCoJwEG@mQ+Kd$+Ur65OxDr-RKt zed#{4AiFqWf4}pMuq7Y*<81XGSkNrb`7|Ff`x+$hmn4*h4SePrIVLr`wvU6C=&l!} zm3%YDsJY&^l36zWopsn;*tJW~-l3o0nhP%c7+KO+aP%Da=LP|>rh-=*YSet}4bMBd z?}!y}^$k~_r5LxK4>i**m7Ck``pP6oi>>Qz7o|3bhipmkU2}&fBiaw_Px$&QqPq6T zltz7>8tHwAF+RDEzu6!nY+gPeYc;cvS*!W0W-H~}h7aFL+C9!~Z1Z+;t|24&-F)c+Q!j=rYnBLrRTceb_0ZRsT~Cv`6K)^lN}cK4z-4-pd-OzKALh3`h{7K}@{r>NkR$#9f~Za~TYytTdq z8HsyXe3~R=W!ddnc5$As+jQW@wlrO<{bb}v zW`&AF{%`T2u~<`t-tEe5%-61|!``Z=!-ONP6cwV7a*6yNwZE9T2tdzTpV=6#jV~D_Z>ClNQkL*vd-zWLp}GZ%c^cPvP*g~m-lUN zUGAq6r}JgTXk@rlCi6a9krTd*pFdxho*zz5bT_r@xxBBS0@0PUCcoRzug5PcX-(jm zubW*@=G#4c#BXwx6102g>~6S=)4hvRI`Q~(U(l^j2&TnP6k|#Xj%2zF?Nj_QC^+2z z9uzbhV}#J_!R}yoiroJn&7flG++NY8-4!h6U}z|qzC96#!iGKH&5^U+d#CETou>P? z8Mv`FlHml}J^F)_t*D21Tsfp3l&5qb(!8dQU5U^lH*Jpej@P>vwF4F_={&n;i2#t%6fX!<;-&q-Pvi^M-R1s=X_3v zLumi%4-V0YEm$G>WGzo%)>T#%A|!SKHsA9-kVhZA<%@8!sXmv;rXh6s(*C34`spKqG>Z^i6tWSAl6pV6rQ z%?}dbcs0yj;dqp4#zE)T(GMRU>|vOzo_C%w=5w(gz3$;qq<*CBc>}$UPBP6` zRa)mN%mVxRx$Xz`1K$~s@)gqdT*Az`I@a@rP>}gl!%h)AI zV-FniPQ`tB($u&1pBJU*+UXimL|qPl7bhN+O5W{$>bG#yR7p|Mvi* zag{*75mJykVZii4L30h7UMLKP8tqqY8;ZA0kKUeWM5!>qlR~9Aw?Z0Xls|e-ybo)F-&8SxKMi7xBF28>%M!rsTdTC)I}p(5oaDJ zR=?4*X7-8bxMy?AQtsMm2}_I2lTo!XaY71JBN+>PRn|m_a%t|vOS@2)MO2HP-000+ zRZfqfPCWV{cURLH{Q9J1@WX-I^!LZ*QA)Q3-wG?Ydg^mzF)K0IO((Gx73f_xZyM8I zc)x<8I_?!)xXXcSnYNNj*+b*ej(3XNROeESN)zLq1z8%{MxE`sY^QtgY`?`}q?&$e zt}UwgX;EyQ>LUeRoo@%zwZ%wFa&&6O-&~)F;zIno;S1xYE^SineVShM%f}`@t-ry+s`yF@oilyRkEGYA* z6!Y{Q^cJ=Q#hm&vp`xk%w8T7o1&SG?^HkWvv}dR=tp<2M(R4BLoat&WtV@xTgDvf`q5Pd$3)8S?E=r@@CG9V@HcWsFbOtg^{SkW5g7-@-P7 zTHHgXMmP=ShLI923^t^@KLt45xZSAJw$ls?;Z(b-PO9@#rR}$2GowD@Frt^B_+Z>u zykmrl(fh!;p&p~`^yepBlQfNmM+%rvZZJ6SPb@mw>u`Bxm&11fIWk<537ycGEv!D( zfH6`<(hN5kdWUF?XC!{Aya7ecmY1|&Jb6uP>mJx?c+K(g?P?(b!GToWGyEzg*VZt; zg8bK3HEF2=(h}O-v`XUUl8RWb4t<;~Usk_1$Y$|*p||i!p8k`^?fCZHxmVa~?wsml z82k`qrBSn(`h<_Gb3-pgj4D5IWfmH#>jmuR5aA?wwpxQJOS0W@$Edj)#8~I z`!8&N*x_O z1>VOOhCAm9XGO~O5bfFu}52sx@$*ox;Kpf%EPps+p3-E50D{CF80H2 zArMtd((?euN(pxW+6t&3Y1Xreo=eW7X^l zyooF)6|368iyg)@w zZ7gei{leeNu4W*D%}f@S^iLHsH+&^?I{gvD$;daJjyNF^HOZ9QLMuJ6k7uIavaXE2 zNUu=AWVqnZ?m~;);N*U;h0F35VX4U(X-0pe_aIW3oaZUGg!dHK4phN?+ZgxKJugQ?JaGOhmrlzC(GR6B*XX#4$+hR`y+W@eIufy5S$Nv+!Vl zv#M7q{4jn^$ay6b zO;ZKSY;C@~R5lu`{j74FI*y@N+M>Acr*zCvEE-sr`_F8j-)$^tT|wtiURvWVZ%2u3 z31WNu8Q~C}eE6!d*8A;LEUVAHo?Gx-3^=phwgo}@2=w3YMi?>y5E8qE00^7VNe{ZG zvX~XRi?XNtj-2j=ob$vDH=*lPbQcjF2L{>0l44b~ZRajJj0+j*mMRDe+^f;V?=IGp zqSSTKrbA~JXuVFGttZIc3&fZ9QC&`KA2s*DsB%1!i_E`1$9D9aq+d!=8V7-)y~J?z zX#3+(wu^iEe8;-)m-^ZyUwb{fF~rcVM_`*mWKUvAQVCmaFI1^tjE49Z6Pw$ezrt zcikaM_MnEghC{MO6;GHzmRW`M#Y`iS3WM=V*OsP+L5mmds&U?NYtgeijn;JES*3o^ z2`fuRe%otQCmR;4xoV>}Vu74rS4EU_e4+JH+fNl9U0ys`-)$S&wE`QEZu>;99V+#n zOh?It4`3twe=n=W4YYh);B>=&FzX!jD)p4yEU_l=!A4bbikdou?K1LBWb~2K=J0#@ z{g3X&bty)Sm)-1&=++o+mx=03jJcmI9&t)lYzp~E?osIC&5t}hUrV|=DyUl}2Tw4H ze(4J*@J+UG4B4@@*|ANT-S+3ZaVpTTpt)G^51^PQokUOc(s^cE~lWQYd>rvA)Cingu?D!)-9qhr?JoVD_z)2G564# zPVehA$}^izT1!`~(-}J5J$azpH?OalX0hB^tEUHl>&k<4a~I+zmb!xQ?$)EaN9#?| z@A>d-w8i!orVm|NGwo@InU5Lj8)gM@tbbox_$YgJeY=?7X+JW!+=!_MyB#bK*t8M0lfshb&$R0MgoX>7r!09_gvym)CCq1a z8MLghSFm=AmmThAx+e~!)Mt@5%aml}@9SQkkty;%ay7)1<{m3&U6|&!u;Z)LJcVeu z-n3$Ctv$+sB2@6DpQ{f|wD11ta|Lx*#txJgl~fg-txFbGsjx< zYfk3HBdW<6@6VJgFGn^e%PAa2b_CDiqCN7c_D}N^3UxhVmAT!vqs}Soh4GAS6VG|! zSkqk_FSnbv}p9p|O{fqimLIBbhW^gmB3TT=8Err3~3Wg;YNujqA zNhN48Q1XGGG&?W8v5j}y2Q?evw=o}l*^|XVKEf!ZIy6^a!j0lG8!sP6)yI3q>G5jj z3PwQ&CX5}g&zH1z@{Qz(eF)VQiR9CZxwtP$oZxvcco;{2=T4eoVf^iwd_=;RN_RG% zV|kYR{>g{J9*IbfOxAF>7xve_%8RK#SFjyPOQttuVkI;P_qQ-z@SY%6djA7<1|IA= zyf_HgNLp*4FCJv2@F5zc%32r&be@kex;sbwTA4P)EH!n45b=n%<0kXGL8L}eoeAHI z#Y|M$J*+cjyFp>oRKb)_N9dvE!t1-*zE+i&y=mu8;4xzp$W?7bwLF^Hm;Q9&wv*-A zbl1j&!0>F*e(^RlDg(|@>G=yQxhKW%?2LMlnSKGrZ4%9XR_SBT;kYwj_8OVNd*`_{ zy(=X_lF@(6}R_f9rd}LmxzDeLcl@$6X$Tq%D z3|E>tXpel8V{J$fV&P70A7h};J$H;JKQY zo(Y-VDEoM~eqex?dp@|8%;1oT(okE86~y&5eJPY^YP&_3!=0{=Ib?{QpV}Cpc}%^N zK1PvfF_9B{6)t94_a^D4OY;~S89GLF{;cSLhuM^(5nR(j51SkCF=(0RrO!p$u9W`U zbWSCVYq7#3GcDst4(EFVgudN#)@AaUN*yIukAztEXuq%%aAr7^iIS-Flb3=|tD9xm zC3>3dLo>%?A~g6SZd@vON~y#gdrLhI=ga!NP%?oSX&hICobwa+IM%Hr?bb7ARKe>O+RISK6&a%$5NA#ZOf$rPet5| ziLtt<$?vgc)6-wqD2~}$Mv;*rxo}cy3wB5+QIkOhEPPh7oAwliqZ_B?8gA6MEH;#? zQ2U3hO~Jk8K%h}d5w&1Ry53x8NBt4aDMBAuO^?&QX#p^pdDI+wxkk%ECO=T>-f06?l{v#!yq*u>}5lXrnke` zXo`)&gyI_+rub_i2b4Z%+Q0{Ax0|R<=xcUu=sYM^Gwi-Le^ZC@06PA&KUY!K^PnD! zgB~gDue{G}%jTXZ!yuW!1%=##(6IV=2Czb(TVY#Q6pWP&)Xh8iWWC`@Y_AE=L#xQj zRKCn5gOnn^H--v9nG)GGBLiK>G1KXl;(o#RtLh|6lN+O*qQV}!UcA3OOK!W$KyQ)0 z${wDczCy!;w?`-yF5L5E9Cf%Gl2OGs9SyfX$RVcmq*#rG-J|;UnwsS8niyFV&*Z6BMSl(M;o}rlgTA50YQ`E`+ zKpfq+jm4*PhvU^FsTEU?yvmm=cA9=$-+uM5QTpHV0iyWVqi?1pk@AcrmF@fpKWjSr z`l=?Ps)XZqCjN$Yo_>%U^YtUDxY+rEmnF6Be%4V~EKCx5N0s2`;Opv5^zwxxA!7?N zfDHZZiKK)lhz4-$N9{a7+K}>}O>e%hyqPK_jzpl4aB&RSP(ez8O?M;;1O(QAIv}BA zNAz_i!c}0LZJ&@)EQi-5B3v!^;JV9mWP3{CI;?JtDtFsFcjseEb!OO$L4i0VE`MKNqxxgiZ z2)(HR1^`m?Y7~mjLQcAR7w&>ly&+et>J}R0V<53bgI!;N|Y+35QMuDZ{0} zlJziL3e>eh))ZVCkk^AtV_=d-aA|;Q2FC*gdtW;THv*Bw5+2C%=lv#M&?iWLPe+2U zpM#e#0geaE*}?H(N;-g~D)2G^Cg}v11PNz=T~`1C&)wjX?r=#DxTGiG9gYM;CJERD z!wh=t50?ypOP&C-2T*%3;MOpZTP11g>PU0}vW4QD-_x{6!~N?F$G__In?jIA@BbKm zFtqM=&VE4tThaq1KL;r3437f6gD5~Z;wb2Qor;~e#-@M~FiEptRglsKlI>g_lsui? z34n1)Ln6V$1ghxR1(JFMI)p*}p6~W6j<`BzBM`vB0jNK>{N53=6K(Kv>X~R8SBejwA%=3ehJG8YDo# z0}v<-qKO6Xkc9G!Ip|8#ABYkLx35o_-Zb5jJ%I(u4mNlB|JVKsxbTA?X986Tb>cSAOaQqz}Irh%eGTXamxX zUj^zp4uJ(@2zZ5bfK0(lgE4{h0=Tr_%P*byb%lfi#vMkwLiOJYX{apW&}q;YhC~|gM&c^ zH5jnez|4|hkf{YjzFGgx-lIs~><^od`bpXWjv|qU{q!loFp?VB^i!nsM}QmwDA6W- z(?kL}ldznC%KxlEo(gmyfPuGwoEPXj3C9`e(q_#C4vYi|<_ZU+Myk1iNwryX2kNz1 z^MHfz(4>Yu;lQ?%@VwxdpOhe12|OtY`mdkG{G>*L{B*Y@vM}J7fV(B3`@w+{ZPtKG z1!tEw>m)}^+9d&9-E`ulT?=qW1hWXj`sJ6gKbZio8V;g(()<2Ehkn-nb)vwUksbyB zW4Bp50SBI&)D+3(V>hY)o3{kK)R8ay*k`8uo zB;=7Kl~DgD8gT3XqM`p!G=u~cZPnseD1Yw?|zXQks|NS*@M<=*F6ef8({%QiCk_fnZ zd8$IG@&{FAPzV$b9H>Jh5WsTaEQEiPNuV#m2?kjc7;t<)f55L{!Ax+1{{w@9OXvsp z{0T$iK+yUp3@eQTBK|KPRvLt6f53o?1d8-uJRCSb`Zr7(i1a^r;N~wH2m~6M^Z&&| zqk)b71BO5${(=FA@)v&?6!?<=-*=Iizv%)e2>k9Jcvv)K&Y<5vvO<6(Du2_Jf+pa9 z@h}JwU;GIp!!rgA9PXcZkgxa?CIy)U==U!#82sOOSQH4a|G>jyAOrs2cd=Lyz5fZ5 z0-^OEFdPE-mOo)g$fiKQe|f=y?5DqAILMj&7f%WS{O6xA6#g$>q;P-BSxO4{??2v^ zM*giE(iq70|Cg>b6kq)blY$(^A9&J`iTy7OkNlf19s{|@Kk%d=v+)N^`Y$+ kxi(EtCod3Uk#2ww@FUv!5=kZy9A`jcVM0P`y6Uk12Y8X8NdN!< literal 0 HcmV?d00001 diff --git a/results/unknown_float16_gemv_t.pdf b/results/unknown_float16_gemv_t.pdf new file mode 100644 index 0000000000000000000000000000000000000000..30462a86343c591f5114d7925448e83beb598836 GIT binary patch literal 17413 zcmbt+2|Sg}7jR|2_C19zLWH|tt}Xj6YeJNL-{O*#vM)t;B4tTsi$RZRewIU;nu&AgZjGz5Zw^H+!+M`(Be2BNG49c?{ff!R%mhP`-OSc7xQ&uh}%xM2Ns zrPiXF7Rd&$qT3y^o3=SB`C}r7VA~U2Ma64Q2y2Dxk3+)hO#*>o>rC zy5{TRhd1wArh1z9{lHrF8(d(fh<`XX_0gr}<=m&^1@nYcFPOM;yrQ(?UJmxnnr=Tm zG&*}EYOKp=o=@s~$lgoL;b)sUFW7HaNQuRpv~3U7#dFxll+L@yUt`hRe~GH>vI|j~ z$6NFSeHh#w{npM+nR-_oV!&7KxH{dfkJtDw35@qNoSyTRJLQhba-L|FZ4rBp4Z08~ zJR1y~c;gfpK(Jbk)VMkE`hu&4)EcgKd(N|K6D9Kt-^O|}8kif@oLZZ1&D*hQo#o9O?(8SOi+GI<%j^W}82+$Z6*;BfB8hL=@+Fs7GxpynpBl4^L!yV(bfZ zxhqsTt0%_=c{HpPt5e^&7SAd2Ad#*59ib`L*xM!@(J^O>b{~Jv2%*mp$%sr|zd=Z1 z7~+po{z3Q`H>e!v$mpSLUcSgHkvbulz z)6Dn2q-25WskW}3I~PjkU#Cp*Qb&32}cgL+0(?qx#&I^BNZXEg?cb#Xlv75C&Sudj&hdOj;RAL|FzPKass_AyT zY_tz;XrO})LtnI*)2>493mL@udu*2WpWx||3k8f__Z)h66cjx)%sW2o;LBj9%AUU5 z>s$G0m+Oi2Z`rO)t|ey zQ;xmWsu&+WMXd81qKU+yWOnY3ZW9ddRmCFtE7dQna#RR%YD6C_m(*Z-mb`;Qzi#t{=mmua--UP~2Kt#^X|xhHBY*1!D!E0aOZx4@&fB>Rwa%$mwhcW0 z@j;}fIn1)VFSv#N%gY~M4KALF4#K{;!(YU5+-=m3)-#UNE3ALc!sup37ebn!`4bzB zX{ScFD52CGkNWXq^~sEa+wu#qH?Av$oL-+mImDsQ4XNm~Bi0e(BQf`6TaUl*rZUB9 zT$by@?3k5qm&z@j2=RMc#xFPzKN=Sw`ADzRSc#@AC;Uh%r>h<%_Uay&)BZ+SUS908Y#Zb`Dz)tCrIE5C=o-=5VI zT`bRW$ka*-J1o27hyc6c{XDtefMA|9(d)@e9PRJKX9tHLw3;jlydsF$?ZHnQPTJLk zBWCFPQJP}xF1a^%TF?GapjU3X1Srs>vW!UdJszCD`=6aMH+ z=b}h&>`-!QFLz}5(fT6Bw6WX}g?^oAT}z7$czIS`UL#M&8H8_WyO_WUR>fBC@G16Z zb`5X51j{7!8Vk8bgW4s5UPw}QcDRvp#-GL38tu-*u8L;$M|Gn$#!4J~1{@Ny>CD@u zGH<8GKDntX`^BPt&&f2sd1}s`_bs}=4|2K_DD>l~!(h?ragPRDckiiV3uEMRe4rWS zOt@T7BZNitUK;*B7w8uJq%!*L!z#OD#tWGhKK2oJb6x25PvC4T1f2L%sSC97m5dH^ zX{#d^l1r1`g$G5wYlu&fn*hLwBg=mnwpGMBkeJKiiaNe(A;-?;kRsNF~|T<$=X_tMwp>>TXZ z&-tgG!M+4U^A5hZeXo=`LDYdKSuL9Esiar07v$;gA6ebrjFf-`xKqSMbOXuFQiX z$F>8^IT1qj#lWuQD>pxd%)DzUe#iOj!oztxgY&1J#vI_lKA%8hYz=HG-q*Qt!9Lz} z869LdFSRU}W#xQskGf?IeW_mm@_m5aJlms+I=nP)(J|ZXxWLDW{YHEYx6cFw4+!G- z^4~SE3epc6v@F<-81V@-(#?AmY>;4wGBhY`4?Q`{==IXcksvxW^z4bDPY(6hinhe2 zipRmMk5?=zN=3{sczjqEzuU7djU~FWBfHdN&?I<3IGjDKo?}hzMiEa_Ta#@D*=`Itw(`tinA z+Fd?16QP86Z6a=YmlJjzO1a*{qP6fLuUSM_I`Pa!yJq|nox(WB&ZoC!UOg$uJCl*D z+b?t3Q_)Iarc?gjlltg=QUo8-GZ9)D}~Ofj8*q7Kb5?_8hdSg`_9T6lQO1R zMl4}9LHjOiT<5y-8$Zjr?(&WaYK-YbgCR=LNqqvg{JOP!&ttO%8RK{s#9&R)Ngnp0%4 zr?Mk|Nd|gJDl&JeY_W6ldT~fsjK<`ap-YDP2RB7SU0nJbI?MMA`}$&O6M#p@|Eejw zC2aET#DGnjkkACaT|usJmMST2P@QEQlc6`}uGXrv+;(r-;a44u;lc)6sT z!uoDr$)CfCck6IB@XOMpzbm4dr$Xlw5q*89Jg!_Dxm;ejlB~ii@ zJ+~T(7mgNqy7E@MPIn3T!Xw!~?I!v8T*5=QXu_8lLbn{gx`uf-v0T{w>dQIv%k#yv z{aW&l#$vT!Ep#Sa4C&;l+=H);g+F_8>wxXmLRbH5+a8bauQr91pGwc#A8y8cGUWEn zzBvIGg+takO}JB52Bk^i;>_i4*=UNa+#@0T1g&7OKk;GD`Z@#`wN zWVdacW0%#Bp7hD2{+2u6XYZ^%2wwTJ%>3<+%*1k&f!`|Q8pV;_Wzx_hJab{aOi&_LJN}({yG|u1y1^ZIO__=)kzd6hK@V1^tvO)3J0q)m!w2jj)ec>+K zZZ1hjkpbu8@P(s{Zk!M1*fH#V0@{sizQzfHr3vNXgG;<4N2K4a?%>%)^e_z4O)kkb zZFt~Y$tIWn-a33b{OW}#?=gL4(;-*pqe}Y=_MhfkdMG6EsNjW`I*kC=!@dr_n-T@Q z{lnF#D8{TmhM5~&m!D>FePI@)%h~y+lTwezQ?4}RwuSv8Q@S~>E`ff_m`-ll>oH#@ zM;_N>4;G zc(_tu`6fxcL`8n`8Sjvm<)GT*d4PyARzyn2!(SUjCDv_)mgoI+A))D-%JlMxBVE5IvG^Y ze5my<9T&r{%g-azD7?ceMEz~74^wceaPc@y4$E<$)p6o_m?cug{$%&j7ixKO`Z^bx zEBl6DMNUyHz2loEL*efW{9o&A>CKRssLR#>XHZ?qR?*zD7V@dG?_Jo)=|dfv#BdJ# z4usl4;&fVcOLJ=cii^ELgrwDL;*R|$JPBv-Ia%j+*rNNqYqP5BOl^~%PUnB!UYqx+ z)G50B00tEyoyoStM*NsB^U~5anUBNCi5>@SdoS)NxP|OYT2){%@$2=ANm>;;;_GhP zoB4*DTe5`bx})CXY1?8CNrw0F%E#K5`h&_oAz5cXQM^=EbRg4ZXd7X}py2WUJt!Cq z))Z;ji`&NL6eaS9WKgkpY`xW~*BK&VZ(_1LeQP2fjSKH%;mO_VL#=k2`rwW&#_k+- zWGI0)cQ#P68S#*euYkmZ@`OP>?JJtNmDqfawjZ9zfQFp?rd9NxAN#uIO{X;fQ{J$+Ib5e7d?`GE1@V&-Z4p6s-1qx)LF?d&5%A+)`; zfkF&&6Iw_iS=S4wb=4KcNGX=OuS1tU73`@w|KjwpsPMZpyhB{-Zs|387AE|O*p!IV z@8*0eE-uJ<^f)=avdqtjeVsbh%V{^0e{!f(!`ec$>3xKg?47qp1a2C%tB@R3A&l#7jCULX^PHw%ZWMN_&_w4^5Tvb;BQ?h zoO^xwnY&8Q=No&47qy;6VSGhrT$!5xx=dz6!-e~2ugeeh zuI#cLB<>=^duT6j1MhL0kyAV#XfPxeeXtjY#veeUv4%)#jFA>I7I#_ZXfo|Xn}-|Pm&R;jo>YMb4R0V#dU061De@-P3^Z>Dn%@Qs zHLtcpR`4P_I@ub5F;lgBm&K#vm}Cy0@V#q8JyqA)u}%m@vC)Gt9aRno}5B)$t1QtmtRLEV9UZ)c(($p2t2A7`r88+@+D;H^{d5 zM4YnSr0V6nFgDMUuYTW3={>(aXWt)i;=T53_L!;k(mKZp&Xgu|Y6YWeT~u)5dyhQP zyxKiE;U0=^F=`omoL|M(*YD|Nny&unJbpmH#d`Fbr+uM@PD|fIMt%Kc+AnH!&Q;hM zt{C1N}v?R%}2yZmp0Fl3A~Ugp0C5QDD-@{N>+ z#0d+g7aE#t(DXuMu{0RJYMU^EO?vFsL|fY3^*exETl&?>!fzU0Cs^@Vxo$a{&*n^q z7BXfSjopN7D;Ub1QxjnLK!`-l?OqV_;nygUbeerxMv*0EsCTLgv%8lyRFeUfT0S9s zJ-00M;aQb=ukjDzkKiS$#_;YS<{0AiReTb=EMGQepRKh0xjJvQMGbBFzDEkyp# z&)JpT;RMc_m5Eq9jeh*|x6>=^uHp#edwiA`Px-T@jc%XX&X(bPn&+j%+83H1(UyW_ zI7r5-V+gqa9!>>gU9To3+TkeaGt0$!nS!N!Oxpy{RSg~p(V@9CLnkZEv!>74GkG?3bR@hbiIPbGcdL&Ka z{<%E*N5O<26Nr%d!5fTs#}v@YH+H`fQ)%`x;>luDX0@G4;w&sM%&~a%(rD(x3YyBz zJMId-J?|o2C6$V&R^zt!N?X*XQ%$cYo^#&K{*ZIj*^bv{>haC3Wjv;8=_jUJVv4#8 z<7(9!6%F*i?n&2^G;8)U;W@IJ8rOf4NF`o`T}WN%J#ed+-aPg=+cIHQ>+{rrhd`99 z*T6#T>s^>8G+VcgF2N=*V?y)Qu!TQs8DI6?Zer-Jv;iNz$`7YnmkfRohb_DOYkg_+ zm0s&B&)s7x)2L`{L@Qp57YoXlxUnOvRs&OS%fdRa($Gq=h1+&& z@n<~z6z)?F=*>0}1{&BOdCXoM$h(Qbhzo>0uZeQ|pcHWH!6%=poPC(L`JQCPl#G9b z!fe6>>L(~OQJ`okW{*eaD8ZEid&~|n53e23eHPG1&)*i}VMHA&932?FbL{aj^GkI+ zr}ISi5o0ksf%67FIY<-Uc(G#(_6Z?JQ{5!4xda~8?bdJLmORETdF6UW3Xk{QnT!?9 zG|wbP^&Uxjl|hf>FX&L3liSa0`cJrCX&GJiC|_RP((QrOCetA@?$oBHIat)4p#Y1z zDbBA-0*6y-uc>CLPOS~LGer_8>B^H+($AaIGovDQNho|y67ZA$zVlT6&^?QwuhU`%jO8QC5Z1OKq?=5~oxJf0nG(M|D@v5nEv zX0T|d(b=%QsRMMxe6z;l&R4hI30q$u1|liI=*F|4Puff|%|uYqR~^^bMU^IUnA4m_$9}{xLFxXOuVmW@6|;}% z>F2$s+39`9T$8j9h>aAm9baQ|7D+5T{@DKF3cdX|A$c-Xl5w3d*iEcH)qyfnL(vXD zH1P@58p}xhR9Osp&4(`NJ?(l$XX6prVFHftZd3~k?H)`u2o_W=y}F9^-7R?avo;-7 zKw3hJyKZUxbW$Nl&hz<+ibaj9L!6dNGmo!y{_HkxJsuU z)H7`S!-`My`l$UcvY4zn zI{z96MD3$X&)-a`q|hF_zJ=+qL&Now%Z2G5-^qt9JFEL@#Py$*krha`d_o8tf0D~@ zIbkJTG43f;QGM{`%&xI)gw^r~Zr$mKMlz#A#*aj8X0J*;vE?WgSk%lGZxcp_z?Mo| zJ~4mzQ69C}ryR6hg~86?(G9DEM{S7>33nthjHfkuW}3gE?Q5T!!$sdxFeJs6JTCYe z(8N=`IQi7B>~g*pLROHiGkKy!N$8~P&6bp^D$IA{;a$@Lv+Qf);!|F>TcwI;l)f16 zn(t2-GJ2ntmmt2^o!{;G!3b+sonb|(Yv}!>_bDvL?x&R~JfGdiA0WFxrn6-HROwB) zsTlMf$cx8rWsBkht6e8*(mu$8qS$H`DQ+!>#`dkZ7(yzAv%_MXL(aoJS(@NnY89EW zJZgN|Ta!AVRfX^{R~s36$hd(rn^;vSbQ0J?khE02mrPcLB3cDkj`E%jTr)#HrsgjR z$`z0auc^O2X476nWS=#ec-oxJMpqc9qn}ri+b2iew6oiqP~>p*a-6@ZzqpTHPuh~v=$8PC)L=~} zN7%DcKVFYl?Vb(WMKvy5pJ7^wg&T;PXf}O>56#ht#s{aB?De_mvHVyU$wc{Gh5h{G zS)~sQq0L<;i<4~A6B7Ybp*G&fS+#~(T9X8}^MwUmv|1Y<_;P>kXp~&fZJsjT+3{oN z2Lf7-*d=rNZ9O|8RygG@WjV+r`yvI?GI{k|RCl#a#=UU8*)RJBdcOXk;1_fHN`{qW zoIn(Q6Vj!gD91)2z@U0!f1c21AqvGFC$t!T#oPXwhjCq>IXwIxdszSTu-La@(N)5u zRI}G9o$XiO;k|Hdla&4;W(5;2sm8(8O|EIgc^ACjUi$0od`?V3sh4?jS!6k1e9WKp zy*ktVMb&5N$kIc*QNo4T(6)9y+OGE|BVX`x`873dKMGn#N(WEvojzdeYN6Xfcjw|g zCndTO6To=k6KIv(}f%K`ujoc~y?2WTrQ`HS_NaevIWN38G0}-VeqPXBRP1u<|Bn)e2dgS4gq3y5f}e zd+IDDL+;%<{9|XOPz{69a%|(#EaP)|58ZvP^dMe~nmi-JA~NpQW^|2u+&yT2P3^>0 z8_H86%Zf&+k6t`vWD)kdD>|xR#YdMZ=7nicrM|=<5oLWebTJ)CZ>4E??&9=i+ouoS zQ^qe{HU2cqoO)@xbUf>~oXC(y#*abaHnZ<)KwxS>G7Nu*oq~t>n^(O^5qz8Y zkP<$4?a*jtGwp+lhqCnq zZc`oB`aHlPzf<2K>{3e%-`$k9w-mF+hbsKvZvDt|V7K)x2K$QZH9iWql(?oK&NoX) z``Bd7oCCTawoH@K&29B-3Ztj& z?YSv$a3(jJxaKZ=jf&wsvQ2b|D?BMqRnKPny#1K4slj!{-9mS2v;m0aY4095%HIhjT<@p4nAkdM;fYn_>5`ASd~KR@|5t=xN?{t0BU5Xs$>{#p_At)# z-2J{Ud+uKM&12`U+S`yX*A(M>fyedp_t^qz>mwt9$fsn6kBkekneP4EJHR4ok~N}~ z(vxCUpxulujh(N<=-&LklM%j`am!dG<9RYolQHNh{3awvJ(0i$7DbeP_rpZBtio1G zJ8sv1T`Qs)(9D0g4a-}@rQaUJ;iCAiLL;xH*Zr-8PO@4(S2CaBHTxL3Jz9EN_Q_gR z{NX}b=C`cRXPS!NG9IgRZF=-@$nt5cTD(vEYV12|(^Z4_R;hFP;pNGwuiH&)<-+5% zKOa^fu|$3Rp@yvBS*G(=7om!Xttc9L&|?$TxdI!MY5Byc7bg9IOh?Ih3SbZZzgP5< z#=5>uaE4(&m~}4ZGtH#@J7P`X{k5v(6m< zgV)2r@36=bOXq4?{vMjP<+5k2#`m^yL{?+E{l@n6nQ2sV1>m*ZtUSEWeK*(Xjr6k8 zI3tvpQzR&#ThP`i0vk`r=Jyq+uzjCZM)ciz;OMnWowy5U?_ji@K0MH>$b59%TISXd z1{0^-$3=U5^ZScvXDgg_dwU6Gm+q%qxDYR}*A_(dH19Xq|KK3zg8+e(uE@^vV7)6x zrXAfd+Yu8ZldK?~AKw;c=H*WP*ec<7((mtK0ZQucZQ)Ib9|nUrfwqOWfrGUuGRG$p zouLv!^oi-bQmXX!NKZSD0vjhgY6lBA3SI-9u$|9hryxTO8RrQDj@|zA+!+T}T2x@A z)gyLS>B^iz_ow3Gvk$C-flg9gdx~;o4im-9FRsU%|i)r<9uS z*`bXo!gfFNb3IBM>njp_x}f&bOVR5Ed5a1bnH?`vbg%UC>Fh(B5^?jZfG zLVk~hL+ohhV!-`F{Z&Agt#l|PF4Xp~~z;4_onb&U40%a5I0 z?XT=PlS4GGRj@I-%GAZwT{qni4x&`zbp~D$&Pcd-#FZK0HdU z=ks~X7|O6)zC|#RIV-`DafUYI3z773NCSr|sG08#g)tccBjZD$Fq?5H6Lgs<1%P*+ zoqw*bW!KbE^t(vEwT~eez1Zy)B27c9!}1iQ+$k<{?h@dsn!i(&eooy&(KN`|jJfSq zbZK*kz(}sdT$r|alz`#c^E;9x9lh>^4C5JZ-b^#Oa_+|2%gBV~N)Jx{Bl(8}{gd~F zH;N-hCTjRvuMFIKk$?8V>4L2&Ix@W><947oW4I)Zw0$X*Y47#yU5s$L_R>CMZ}jBa zAZ+xYUnW|r(oaDe zKBZxvVVme>wgba<4jZW@5LtYopqo;eEv`%>9`DQX?Fu4+73O6_U&g(6 z+PrjOGL_%@sFWx-qLg=W2~WXoyQPQ@Z{yDGy}&zBcMEPm{#a(M?XF@YN_jM2(mW%eWvG+Q z?-3fctxBV9NIt)gxy6c>%^_pggF3jNb;1|3PZlYQJ$K|r6KX1*jws~U#ZJ>U-Pn3C zp)0PWu+0xixxWuMg+q>SNWJk5spX z6Uekq#_N&Z#O^-kHr5!lj|bN|oHSrpMc({&ox=g<1XPPfmYBKu79J6&H--Y|qj;t4 zP5g4VDycVLwb@a;FM2QJh@~a6&5|LBjDI4ti6a{vSY3o58PrY~jIoYC9HJ3`J$u-s z?PLFPH07}IxYBD`)^k@wMU|H_55tGvZ8cLLH`4B0)4yM&Zqjr0V~IYcDCXRfKW|}H zUr?{*9?ul67e2vTviUy#O(1Xz|FYC*)FzmQGa@j7)$sHR=MSabafm>J{5@Si9(Kh& zHsi0iimFT%$XqZ^DHM2Zq8OAZm0dG3*y)CyO0Sgk3%OfWi@2U#7wZ%gUhjJT?$#{% zt*V2M3yoB{`Fs1XnC!VRLaBJ>ju-Q&{q4|_!4mrf&DP({ zTIxnoEB76_f6Es3qaE8Ea&%H}O*VR0EZ(9%`1Qxu98J^ozxe}L@h^YYqjN|hHwYD5 zKgXXreIp|^v%PALZnkFrCbnLFkYnNNM^tsO^#z3ydbU6FXdDiPfXb*j`q}%s`VhT+ z;V8&-fM6ICe>);6_6KSMSdRU+o*;roWn1)CQrX|TbKLIay) z4O$0H=-U!~T}h!eQb;7^|A4=La!`m3AkGj({D7z$NL7JwAaySgBm>0*{Tqk$3j{-` zxjH#Hf@mHneCH4xI>Pqzv9$-0MWBB^P?(R0qZ9FG*4Ne9g$T!jX>afC>1hiG>pX;y zqpz#C11RNT>*oST1UdS8!w}wHpo_#~-f-X-Ae=yO5ct7hz@@=5Q4=l=@_HbU2qpu_Foes1CQaco0M#5$00?%zw)XChL=tHP zz|)`a>-0g#e*Rt#j=p~O-oB1-0=TbY3nvg@2zwA`1PTK(oZtu$g9XTS1t9R<9ggsT zBRt^7;CgR-AJ6o*AX1H}Wi0KG_}p))d7TOX};1|wkz z^IusIuxMFKvPu$+L%|I9(w3UnWUVe1AA^qqv`3}k6N=K=>xf&_Dg zgHa>p+`*(;&v^jxTF-gH!PzgVA}=`5tt32eIQC~tkf{U~l?46Q-eP~YMuPk_yQF4e zzYH!onj)3)g#$?<<^15_%Fud_WMD~KHK4bU%>|3R^)DnV4dyO*gZ;Ghq}4OPBf`Oo zorFcQ+&?MshXYMUg8pVxfnpq9ENiY9KHvcjskPD&#=Hhzo z|9*#B{$(|lQNMdC|HL{YRFwe|!rm6_2|*TE8T~)efR+Cj4f8+IkW!HIVGH~x8BpVI zRu7`?CpG_)2(jTWAjF8jfDn6sb!OcKSP%C`=(;*UUKW_k5c_~br|RtwOg5=MKSL>i zqWZV6Du`)@e91k)>GSdMCVIHq!2>W-D5Mlh0`5X2`uNEr5T5^hlk)a;7K0&x_vhem z4`QT$FXrRm1h<2{CNGCyRRB~{1Fqg)YEUrw9yM7s5{(CU+Av5Y4u{5Civ7luKwn2E z7*H-~7_fssf56P)z{GHZ{{w@3PUr{s`UykfWPp9%2*X3-zX1;qOz1|K45U*w;6W8~cmI;i1T(4R{1d5&Q|mgDbj!!O%dWHsawR>#?CM0r(Od zVA3eUU$g*E?Jr(QZj{*phW3>KK)jd*y-JV3vH<^mFgQT@d~0)TGB zBVZtV_-9!H9$H##fJy(&X95A3=s)qG)#S!@Q9#%>@&$#&fy*Hq%MySqxdDbk{lz~N zIN;kz9}3*7**G33l=NS75QQQ@Miu(~!z&aTXrjMhIIvpVh=+%a_J*?1nqVW00GZ)G z@o}v`QLS&i63?pRUWlge$F_y9$DsA>9WM5OVl{G|)B-urlh-^iO z%F?by{O>dBeM|HEzdrxJKKFg@bLKwxo_p?j&iS5upYsYCs%c0dC8c3PC1dd7S{M?J zfP2^kp`8=K2aW;_jNyukFoL@S)KK=f2Q)m~iEuQj zft0y{{V@Wb2$xyERX<78G$Gm%32^Ckf}tIeNbq)tW1w4@l!>#QgNwTp9Q*TEZx6f) zfe5z*!>VZjRtP7Fa4Bs!z=F!ptLo3IHr)C*c94Hh0OA|s-kSipTW4R&nBe2#>x~EV zf%rFuOX(9FTj9=?g8$QP_y3!0sZ_- za(V=JC!#YPbVBM}!^IWM6)vUW3Iw1=z3`J$SdabHlOE?R!3!@4fX@L9nyBq`j!TL#~eJ z%Il2E`&Q?^EWCIh?qtw*U`;mj$#cnP45-zMOGCMQ=i8Rsj9Q((oKtXK@l&8gemRBt z5j{L{)PL>6kljg4=GmxgOT90K5wf{CL#;905f}Q>0%l9GVrU{pwU36-YjxT-+-Vo_ zK*%t|yPZ`^MnZAT0aV=bwU4?(#^`kl_7FZ#F&ePUwoTsuQpR_`Z|;#0NAuS`ipQ&- z7-{R-v_`#|eD&4yvkbR->}=V!LfN(Dw`W##4saj6IQyL8yu$AK-NIt3pLaZ%dJwiP z+ZxufcPSXH{q|O5P@jKb=+h8xdbyS`>LzPG`=G~Ii(P%f?gB4TBH;pf)$VK|70)cK z_J_H*wsRi66cM5>(0Yv;aHD5x1dD5bX3OdDGJUnjC>FInIJ?waYP5@N@63+G zv_y`4byHJ&Zr{k}8q6w!``+hRT*vs}+rgtAt0&h!gm*?Qdpuh`<9o_Srp|x9)Qs`N zh4}!VtmK*)N$S+(M{=Pk8Rbyd+z^MevAvD#hZ)mMOM|(FJPYOoXrfA`7`fpS1Fv~p zbwvGot}c1LHHUjdR|evkM`k8&9qNON%(;3UTbi9X{b9ly*PYfr;920DMez-J&b9*3l^EY0XbQep5dnKg&ZWHYd3Uo9cG4H&=J?W8>G^ zL>BzWTw9ss^LqaepF8$yH59I#`_cKRW#-F^Vkd*1XZKD%rk9a4RTZ|aOTzDv@K>`} zTR_&|<5|Nzi*{UbZ}!`pAHyoBm>2MEYvOFdk_6u-!demgP|Bp7QN}`QZ>;`=k6x>^5r6T%J*9abp&Xatvr*Oc z44rKOp0}3S%_a+(ZV@_dD~hZo5BH5;_Fnd{7*p>oeS(t7S6o~;Y)~`5A{R=K8Pk@UYW;|s;Mp<#TH$omcDxPx z1Q*rm`ytp%l{&JLN{u|*skh&qT{P%g`|{nOZagLN(Mh$Nh5Q*IdYtyh1mpvF;DK$; zPMHt;@yp>SE=CEne+pyeNo6!{B+go`j9fvwbU!^e_A2Q}yQNip=266T_5SAGytE70 z4u`ds_zOIzc}II)`gOd+?jECA=3Ml58V^>h>A%BKqDnXZ(kspzru>UJ@Yxo2ioeQQ+JglS3~bwef;%~mXXWo3rEA-RW{Q95Fe0axXM)juaV3-~K zsBP4x-8oiHT07YjxHEG;4ZgIwVPSmW`@4JHGYeG0RTmA+&z~u)cqJ{oL(bIy+i~qL zU6gT#-n4hNMJTnH(N5@aY2`6zF6xvCJ*pH5zy}O#WL0eokziBma1x21pjp;Bx2rY(xISHmWvtJ01-r|a znT^D#qEKRHu#i$irI?7&w~TUDk6FH@CS8}18{`uG8u7T2He>(cH9=j2JmoHXExCk; z%{(F>X}2D^iGN#guO!_eqiL{nFO`m%%Bf`5v}DsQr^1gYFEFIPG{R*}-zr8eOcxv+ zxWaSDFP}@w&9q=5sTK8dOyfy}!9}s$8!_ju-;#;w31xLJ6jL?OO&_MzRK22NNU(Q6G<;k)AR!E z9CfVk^;kqtPe5Ek$oK}mZZFWcto3YyWMdd8He0ROv&+NFU zl#)wX$tQG&`cdwauSYVc+6?pR+4a8DY9;vJ-y#@PPFog*?HXK};4;c*7w#~=kg#Vj z)u|*z{*D9TD22M-#^ej~wWlTOPha*`VpK%3B3_^MFqvYLnL{cf8MJMOgIBL`y(`tV z8fbG!$aH4y|3dk>)kES1!&I4Eb8}!HU!>*gTWZeY&xlHRRc8(+T&CRoQ&=Qbt^()n z%t;1uvlu%^W#?4&Sk0@bqh0V(TJ161x7XwBoOafY1>xZW z=IHn_;0T`kvCTWEZf}+fAOa%p^qf57%DvJYlfIbm7_4 z%Za8(Oo^fDM6nl1wE=JHW-l%;m`9yvE8ZDR;0*bE#9ZsagrJT~?uS=?B@E1IbjenD zeYV=hZQ<-3J54J(y=D$tRN{?TL}L!cwe>Zhh`QJ_+xFsZ8gm{}lIoMMCv))a=+0JiHLl)K;Myygsk1$P+OOv< zoyed-RGMhER|riU#h$lmU&@ob{O8|YHrGLX`oul6V;k;Lm%;AqCp$c6nl+patj4_U z4A|~XS9`Y#SLtTovNagI=Wq+PUFGhJ`=5@2o9RmEM%?1VCx>xe;eGdM_^aApqR^j_i1x>|LzxP80#+{p@3 zy~xx}X+Z&59^`B0__I`n%W9QeToqzjd_&V!tkoL)*>VS!EvHzhi>#9jArrMym3D?AQ~j zizv1VV|t&cS*(&=P%)UVWIms;taI{~i;FCE=zhh9k`vsDl)uc0um7$NONCyIaWl4$ zT78z*y;8aUqIwDU*oCyMVDI-%>LSmH&5WjT8Xn?bRFvXCTimT)vkYI2y|gw~_x0q5 z>tFMqnA;FB%cF14pLl3;35tGx!uOwX5DDu;_#cJCVSfcjq#ywzT7pR8S3@Ww+7Kw9 zu+ktp_>U+-!MG-#0!iO_8;j&i?2apKN6|SvPF5>-3YhqfSXWf?8;Mu zllFAV@>RF@wK=b;3opTaI=nhK>{H&~Vco+iK@c8vDhht@#c+(H{w*7tt)E}Jnbq4k zR-h!dEO_V>&)89!S1UWYcN1L={dE(HubMX1dsngUPkC(}JRf}R(&N|Y{u}cF`R~F? z2J#M`=lyhFP`okknU*>YKj;1aPTrg1c{~H7HRmWMtrvpK4NB$bnO&Zl`Rj6Yz3igY z<96F$5^(DXzR{HKE$1Wt0gH$(F1gZ(uhV0F52O#B+9}Xv92&f!kc+XJ+sUfi^i{i+ z^5y-v-%8ru&aZ9pbatvGBl+J$(4UbPN?PW>1*&hP?cR)_zx$nIrD01-Ny{mN3(8Cj zYh^;b>1&m;>CdxjlARorj~Fvq|qLt zk<89zEtxu4-?_Y?!$r&HA6QQ8dt@o+S8;#&LDkE6v0@eZ>8CuyS{6fUeeV>ip82Q} zAHo9_W%y~-TrS##=H36?|XjPZ=9?(vjZ2N9{h^b?BDC~^?118J%J9M1%yK_fElPNrcDnxv3t!*ecR5-aE zrbqX4Md>(l-p>#&WP7~F>zP{ietn(GOjZ4(6QQ#dpI-6Kli~39orPZkzRYG=jMHUp zg43%mXR2s!Squ13)&DAJ?7U@XIx(0X-zlYbm^hyt-qxBFv+RsF2$8UwA?`eA!W|oR z*U|cFryZ)_qb{SS!PGAP$$ZYY9d+3sN*u$>4xy1DGU=>4ZN*M_GkyAWU3OtKAB#rQaT0~t=BU8N12Y(_mKVk#i@pge8xfOdi=dO0$O zy<^o4;n$RPu&k{qf-bX7-{qrrs5xItQs{|lTRKOgapB( zDC-#xe@s7*@5xNQK7OG6`>uX6976kA8#qKGHerPn5_H{xSyx?F43%VV_%?jyL*Cxn zv}fl>MTA~m;2GvrKbBIfcf^D*PC7B<{HwQ~6_?-dckOX>oUq7AjhsoE?d7mv%sDgM zrD1(Uq~&#pqulK|BODhER=+5#SUi7R@+9Ms@DLfh z)n_r3y0JH$`9DmQxerR()|tqu+dCZeE^&!Eba2^_y}zdE9zF9XS9MiUpZF)Qk2ZC0 zKc3>sc~a*a6<_YSur8z7@MyuorBeCf-sRmEL&V)=WDjj3Zje1@GkS`}0t<$aMjh_O zps=5~{u{df(#9{Sb5|?j03&*LU8-4kUTY$KyE;}uo(1(( zh*=JCm)iHK<49yZ|74}0@gt3t{vp`tO$*$X*73`{bEJ7`HuhT^2g9m$i8Ru&joL(H_ zceWnC?uIYW&}r+x&!DfLK>JmV&Z$~@k#qHY&%OGg?@R~zO=3fPN!nAa#-kuNMk>*Pz!6B`r;Z_NlQA!r7)oZjG~} z-Sc24&}$2y208dm!&00Tx0TBluN+nyZxCWi1XL5 z@oaLundk#{GWd%P>PYI$SHx=yG{U&=jj#}(U4(fvEs@9~fs3!v7e%j#oz6PRDZbE- zwG>HEn1)Xp8%K8JXC*y6;Px;HCg<2kDb89m;lQRZ+j&>pIGQ^N^_uURpTOA?`Sx77 zIS2jj62bK0euL5UFFc;NkB>Ol~9cb$!XK>JG+n)Gkj&VrleaM&{<1*<8e=jPLSV zTt4T^nmoQ^b_Z*!(|PXa4r^a&R>Lg>$ViZkS4ZP8|Gk_F#=7n;O0=W1>{eGFzrECR z%J|TD-FJ`Us>X$4b&<$c#My^&H8Z-_tX`oVw{31%%3nJpX=#yuD&lTbjF4jWSn48w zwKY+)LY7za{dUx4F|~q6MSa;{R8m4|;tsye-rjf?w>l*i@L;H%;m)K2O1XT`OHq|p zcO&i$R%I5u*?5kEJj1Lbjn9o1-z=l3j(J4qZ^!d|q^qJ*anowv_F8F++I*5}Y1~Dp zJ#66s+MwkzAd7#yCAwwty$4P|J&XaJqfc`cN6ZTD@oAl2d5ZNjZbj?M?4T=un4J6I`^cHy=oS=fx2-PDHam4nb7J&}FUt+y znmgUZz}?A%o_bYp&b5CsSS5~HJo2scrp-~B(N}(Zm+^*1MRPMs@p6o)e~x(h&Wt(@ z^aDF)mciwwc8Z-pyz7{yuOAnvG8edX%U&(!f~$|hJ<377r54;^6I-L}(&fSIn`pEc zf6z#6*s(WCewFnfJgc(~pyzVj$gC+De1t@A!UgKbDl<|bX(^UYg=Q(il}`4U9by_? zJF5HCuYWsVM}Vslb)ZoA$?#p1eWOgz)v+8-Q<+DNMeX_13_P@1H6)s ziC=dasLS*37re1Rp)LozeLES9JHzj_X#JcOM>8i79Lk}54ag=mr35h9b z=G&Q&A-lyDzQptU$o$xKE@$}e5&wja;kC3`0TZJy#s!l0D%KIZda_yGDr6sLJyLP+ z9Zozq-KU;rcjz5TboN&KS#4BJ!OLWRwuAJ(Jd753jdzaQH@&dvlaCp5|E}Rk5DsC8 zbx;}G5fA}?v+d>%@gyvk4RxfOVu7K9!NPW^aFR+(fZNCp()HnGj19`k^u0ZvP~W~xjFgN+ynCjzG%!;Cs(JIlBjLbbj6 zl*VqVWMLZ)a~d7|m|?8ay-9D0jxj1GPm%K@y{4Hd{U==FwGWAo<*}YxV{{UZD>&7M zzr4I1|6Nd?43}h}6Iyx`(Wg2vMrugf(fcNzfm)NPaUZIRAk=JmN$<&{2|8QXlbt5l z93Ga}2np^PN-{VrpjvWmMcR9hz_l;hbX0!Hv2Dk7OJe5Z3)r(p-c40})VMaxVex6P zFaJ@F(W8g$xOV33D;%{qPY*B-zxB7$s$EKY#Lv^YW*8_=825$=DO>iQm}hy;#Cs!X zUyxn-amnBX0%~CGTR5lH(%EJEHpu|alG*wPj9Ut>vsF>GbB`=yMV+@Cs*Tca%zf|g z?w8}5d+60yo-Jrbgm*)PkR@+@-?J?#XRYVdV=nyIr(|?QX{G&%--Bn!4lPg4m(&;U zdCIq>d+N%zZd1B&@5K)Pw?a=}N*%rtk@1$!x7IvsAQA2oV7LRMPf=>CI|c*vEcl{~apy_6CjRR)@Xph)uD# zCD07#HMtjCzoGDTPt3vbC~G}eSV_qVuy!R(6)Opzk-OQJ zSY3_&LA2RD&%eaBCMGuPZnsskXi@2_@$Pp6vBO5MGqPjF_8sRtHgY(`nnh<+QSv(K z;P^cXi^+S*#R?-!2l)Kt-jkUu8JsG!37CpT-G)%S^j6j|P7v)nQj_*U>I)-lR3y2y z=$jvIt)vgA63PsUa0*C+yD_)GIn^rCBe~UhGq=WfLQ#d#C}#&5e#pQ;*-bn?-i33!H{-=WiOtnlWS&(K5D*V|Ev76!L<)cCv6|r64{nark=EBveFft)X`6= z-R?h7RXhZeXlyF+LP=#MzGbj~uFBoxqvx-$li)zBjGo&|Jl~;{7Jg>Bl3Fy}=Wv@AU7dZrRms zjVpBU%8m9l^%e8f>q-7(H2&4^NYYtNMgr_Z+N*5Wysic93iDt_J zeE2PmNX*&f;(eZ%T|f8fA{Z%usIa9?M=8Cb4{Uv8@^PAVern2ZHqh4N6pPj{b9+4h z4&ETY%T{YI2EX20^9tMFbBp^1&(ez%X@h=kN9_|he6~i7i5AQrm$Vq-mV1_nZkxXL zJ*>ONHuY|>-qP0tgFW9?DfmPkzmX9o84!rXZbG}%s&mWFje=oK z3?R1!{8-p8>!YrJx z!V=z_XFF64_ikg_LH{6~ElPl1N1uA_hIee1P1G*Mrgy6B@d=DCF>B^O-up0=nZ%03 zyL;5Xu*ocBq+sDm$fy&vHm{ImWpTkM>-RKRNCe!yZL_+oO0bq*>2qYWSBCM$?EA+( z^LwObL`5(VbsR?nBtUFD@@xq7nwW-gwJ5gDl6#uJ%ZwR=T6 zMoBz4L(pHP1tIQ<)ea0-Hq$<=Xql&qa!{4RZmpet<=ymt~i<|vkoEh~KI zwk|Lq+GAZwkFO}L^;EE<#I*Qxy!?c~Me(Qe#^K>VYU)2-jZ=b}K4p3c=Yac)2CauQ(NIoq0 z`aH+MZ&E&q1x679wOL;QZ zN5@Vgo{(8SvJheu%cmZPV+D~3rO&+}5iP5r<&w@@55BDx(hO?myxJzsQ_HFU(4XB| z@l}OJc5UzRIdPo?wFjIDyoT5D5&QRQ>1p8;w5s`n1vAVmt<%y?#VU;_t6W+d?+;r% zX;+K!j9H0%MQyrb@Y*Wrt$uJ>0`l7q)4Kh^(b`{Z)W+oC%B6IdcSlJ9n`8c1)L2=ww-OS7?vcXuDiQXI#{s1c}hoYU0z#X8Goz zrQ&ydd|yktIx1;erG`&1?fX0sLg1fj;U2N$XtU#(HZS+(FFJkFB(JG(&(o=>HN_Sm z2}+4G8odEWr?bi~PJB(^bXiVw>%ZY&uD8D^ubju#z{f`TsD)FF9A6Jj$LAYQS&Z*) zV-KxCcl%84?Kjh?;`GC69kX)vxcI|dr#IByO5=iHTvnlg{MEdUE@9Y<*i1feF$%la z88?XDyAF9>ztV-d6m=V|?f9l%t0KMel(lT-D!qy0ty3aB-Z=w>v`ZCEy1l))8&~e7 z9C0RIVynvw>1jP^aIpR``VBvhgRaot;_w3(_H=u?QP!g-MkX2l+^gT;FTUG7OGHr7Q2wqW!$pNS;^ibQKs3$a$5pMX~d>ro-V~9FwpaH zPOiXHCoAwU?QQm5b-~(Of{%Tn;mb$E4QG{F@7kk`DT4Mq^>Oi{jr10dJfBy0<+(^{ zUiL?YBk7&b6Ls@@dG!y@ZEbnjqJ7m``|H*8G##~s)Hi1~iI8+v*%Mo*bI9ZR86VjXu1+vVhO^y(L?69&hOs_b}pFC&I)&d$OEr z;H5nwXmMr=Q`Q)+)6HKIXpM$Z%A+X9>PfGzyd=KeQg4e zkTz5Un%GrA&%7@wjL9e%8LWduZwA-J>M~OD1BjfNbFraq_pBG{RjAL}LcnErHoQWp zX<$uIwu0nwipw0k`MIm#-7ZYIsD4Dz)Zf^QsbeC%q_vZO?5gt9tw=gD zvmpaZ&>;BO#Os1{2gvQs57=3FfcxmuFkCBsc&O>EdE(cq zm&7u5sCg3(MK3ByLhi+)Gcn z0OK`_jn2eA z+x9Z&fEx!5ol9j@aE%UcWe<^o@>mA$)FKQy$PcOjsa%;Jy%W~j{|7YQS6 zZwh7FyFL3phB#h-j!)eeKD{oSFO<&F}o zW+65%y(e~pPK*cAQIb_Y3Nr9n4f9mHICrz1Xx59;p<4W*MVIorDV15HZ)n6|z1hF# zOT`ky4#gB8=Y1sHj`rxw9`BtuFL^(m#Aod#DZ(XH!t?PHmV(R3rR2SmR>Kc>rcXXM z6Q7^do$coApE~`h<9(y4ZOf%0cO~qT7tiY=roKm)&CY&Zp*U)589_#d2*Z{$8@UQX&<(Tl_ls(sm+qIU()b3hOv63p<$JyMND>|9qA%{d#52`U3CF+a zlU?h$qu7j89?y|5PxWgX?qc<6L?O3TYjh0D=QJ?2S<$jOr0%Y7fD2g1el`1WB=KX< z?fv7iwN;Ks6>=IP=V@EYw;m4oQQH=%+xXR{e>zo62+tP&KJc}GF?N48^DWbP@1hHp z01*YBc@m~qmw~uGMhO6K;M!!2IFJF2QQ8quxr9EfBWFY0p z@mBzXUhTBOB+HBY!!-P`r}ta5J@L=S6K#xND9y;RT)Y-2qWme{20r|1tC{)>Bkis= z{dVgz|BQ*Ef4XF5?buM)G3nWqDhZ!} zJJoekr3np@juF8RT+;4r&5+-!I@DKSq{_wDJCJX(w|tCJ@xpC)rg8kOz|?B~*+{tk zUT$&aM}_KaoNhfLd6@eka(&0tN*xp!3KSr?*w6c^Oqo~{e195Q67B;WWZ4OyFNtM&h9u*%esSkf!-I}Fo zn)0`NfGGaO6YGgnq--cD6+0io&zioGk(${)HNr7FGhY)scOS@&dHWDmo$b8A!%}*7 zKkFzA1||hPqek$-d%JiNJ-p#a$P*OE`4M@8fBQ2T5OGex6VUmMg)L__ONm z;^a((O9Nwv_i%HwgF_pLKCX5?&TuJzg0}};%EKK@jd;QXCPj4iCV;vl$k_t_b@2mr zAHcPg55bS%4woX3dV)3r@gA-o?r`WEB^9^~2%F+qRrKvnl4b-ZB#rt#Gw&BN zwS;34Aj9tezrg-q`#^)uKWU)t(!i?8qGjP&G!}*fp_vTO5ftzQXz*znR4{-?=y#|+ z8hjE-x}s&E_W!p6`}I2-w1o=P$FG7xBY~zslLxAZl?5H)WZ^O(BE^EY1p@|`feHu^ zP&f?umKX!af-jjc7|;oLePyIUTUo#W0@R@^^g9|9koG78o`?Fx0l$HjU8g};(qzyw zFz5;~0Zj@jSfFswvluY(^*%sbFfOSM_AO8{vZR8M!I7>w086ld4pJk*1lPxv2IFFY zZ9#)nN$58WxXJ=yL03{iLG7?4AwXAXeA1#p0t7q&fx@6MG2j`JP<}B7T}k=_jU)}- zARrMzY(rCnuAq?C0~&6K{`;4uuu72q6t98!Z{%k}FDmJIfb z5mFD*z?484p%D9!9{kfk)CwY!lnL|%-T%`I#1sMnBmMTX1047W>sNskVO<9xJ@~Vb zWDWEJ(uv;+Ngp7c_*F={@>3@uefYgVe39-!ACPYRD$vZa2n<+5z$>H!WC~6etO=wS zz^(mWe(A)oD@c^(DmPs;(y1%D!@&VY#}h;K!_y2 zrw2zu5D9odeK_e`VNf%KgOw%GOyMXfq6Rc`I9Oy*g8@qo%q)opnOd;q>-FF4J&NSf zHrRaB&!O>f6lrMKPoDw|BdLRRUq$*N3Jga8O0-U2H<3Wjq*jhV<$u;7Zw00g$gp(> z2Ifv`;{kL-z_yawdBCNAjs&?& z;7v)?e|@d=&(TPfpDvd)EDSg%;BraLec(We)@vk>3cjaWzXMK|v}FaRy6&1u`zFvG z$*q(2F~GctaLi98);%@A@}%d1^9FjfUiXCqLq?+h=2C%OBa!^zzX!>SO)1e)9(esuwwk_K?`a94v;{P(KKp%5r6 z_}~wXKwzXXI1AC=WOCA*;0Ob@1qB0c@aG@!av1PlIKuy-L2wiLgS-EvA+adnEjQ8- zkg9HIhmZwR-#`Pb{7r-2mW}POV9EZVfg3V3G-Lt(YzN>DSg${62owV7^`A7bf%O-E z($diT{6{;aGzzj5f6~xc$QJxb!$<><@Fxv{{<}W}S_XLX4eg{60M2ftp&%;;{rw|* zgftq&4I61#$ZBn92YIB8G~D0qfb;w7S@4bMU(aG7m%gDtj0|LR{~Q+pqK&$Nz#{+B z0|aoUe?NjoMCIKSy1(c8|&mEcYKpQxC)_!EGwgG(8Ect8f4(^+5i9m literal 0 HcmV?d00001 diff --git a/results/unknown_float32_gemv_t.pdf b/results/unknown_float32_gemv_t.pdf new file mode 100644 index 0000000000000000000000000000000000000000..754514309c1d654ba0e633118f16d9e78282e1d7 GIT binary patch literal 17773 zcmbun2|SeF_XnOmLu4&k8YJ0gF~i8dWz8;YVXS4JA<<%w?0ZDnvsOxENn~FNMJh{$ zNVd|VMfu-n)aTPz^ZS3i{{QK9J@+}!eeON?+;h))-}~H?kfFM!1X5BOCR8y7FRzCo z;Rv{w!*Q6rJY353l(!RHO3j{V@9yOSmol_>aU#G`pnx%4Q4!|k=?E2+{o@5qFHa&I zO)4N|Vc>Aw2~UK}Y}{%D5D%FU?TJos=?#LRJ(1|->j}p|w=gLaS9?b{PZv1$*Hd3F zyonPLZVj4M*95FM1rXs<+8%%f)n8Y&Usr9o?H}wQ|Ly?9H^jZK6X0%xeJNunf|s8! z9`py|-xw~X@8sxauj&;5T10?VNz`JLoH1O4A((K-~%N<>&;lOd$F?*?YhOvfHg~y?A6;PhUBP zOu!}%ILat3OEfW4weWgtGmcTz*i*|r(>|QcGU{`5sq@&Ug@-rl_W1Z8NoQ++R3@)$ zPUu>?R{AOBp3T{BUnf_-yBKsTts~@1@7bQ-J@$p+XCv<-_`9Z9|DFrv#m_eI zX!*fjeQ8p{H6*_q9!*cf6^bqmxKNa1+9Js<(|=H2qwpT{y~zgNdo{cJUD$G%pG7$M zROxrfEuVo+wSRi|@$0*3AW8qzKb@`OXVeP$x{Pyj*=^qh26&pbvHe@^)a3!X|Ra|&R{fk_3>@nf6J5hL%_rbCE zWxCoNC?!{u-uvy#yrUC&;NrA#Ri046dqpnar!_@)cFH%!?@Jl!hD+*&ojIwj`HI_- zlDp=PQTI1*aV{6;r*Dd8T(eIWg(op|d}e3;L>sg^+BtGXQcAHYX#LyUl^Oq7%wF{q zyZIycea+o1r+#D#zo0tnb3jd+-!}>UmawcG&O~<_%|hkfs*-nxN$3)fhQ;ny8 z9-hAu6^`dE?kll4P~9)klSIQIu6OfVXMTgQgYSO2!+Fx;qV+8T&Um%HOsAmF7Q~Yp zz7uzp`*tuye74+o9v>b{4dXB3i}63UCf6lmKfB~phl@>yhqD_jr>`U&`b;Y( ztyq70eP;Z7XMt`h!dwco)Lxknakby68p9H4Y$QeEW04}1oik_8 z;eMWI(;G`*ut-{T<4=7kN90|Mu?t z`!nC2alHRnQvXP=-Q98P#VN$|MeMu4${lTKowZ5dCgr+pr=E+X+p^6SbFNe>8pXy` zS;byT%^$sIltXd6zj?HG-r%~D!_|A<$KqNB%iq&9ox?1}m3lo!s~aa#mZRwuDOr0n)kYtoGAsIa5BI$s#kyjl%+ zT+OgyM2B~rJbH->_9Rhbp&@DJRmhTLwwri!M1Di?Zl+mAkMLGSW3L&*>)9@rHzHyg zabXN|Vl-t>hot@JJ+HrrD2cjy>%v8*%kII}mQM1ha>l3M!sY8+;(uhU)^@+3Y>}@B ze&AeTrMc^eY$rdBecCn#W=;(VT)cyRDTeuu`IeT7qUPntDl3q zpvyGzC`5iZ;9jpw;n$1u--BSLUO6hfqNj5WF-{1Ku#{3X); zkjE{dSL;D1OcU;us1MzxLcw0ZQz&(i#EBo6<905ra`ORj2F!#oTOxl_9o%@i(uis@ zf7=;Kj(+o}R19O)b#- zm{CCN(yi5SJG116)W)do0;NJF1Dqk$_N;77in>9>{5?6n+*y)^!#Sx>(S}cHBle*X z_8j>d>wf9%PAoM7O?ypaE>z( zeY?7;96nFz5I8P~qo4a275J*&{iG8&A$;Sx6w_5#BVLQl{V!X)CJS%n7>sE?DKX;U zN*|;?|DZ@ZvNfE7v zX(|loi*tw->6uaMWYI<;uK4mx=)bNzewV0}MwupSR$_b=R@w5lG~LP4dgT1TRh8$3 z3g1Ny)$YWFp0QD@rLob{6+aiQX~us%smx(0(3Iv>mY$i>S(co7`F;m%*QJ0e`G;1C z?MfPtq~CnZwBRn;5qKf2tk^Vnp=G?nda3k4LU*a`V>z=AVmD;Yow&q@TrYMhlXa~= zG}$br5ZYdBS!Q{lx;G!AlVDdhmn_P;XR%7wTMOK{B^>93E#7pk92B(RnTyHa%S3}( zCS1&qHPIPt$hEx!yVgO2vE4_b-Fu=)UzBTnINBi8newRxU%}Uo*b2Lm7(Ll95k_-{ zgAe^2dGnqzgjg{f=GoFRU6*zA;%dmhim9Kt#CF|4)L+e>;yYv8Pr;rAAK9SL^7&3y z>V*D@Q;*o$pPyWxZf?&W@KIa4QV}G^@G;||i)llohK7tqZryA!d+M1Rwny$X2o5mY zzw+@NMH>w&GL&9;k{5JKvgn%F*f?9P|xjThZBD^MZ;o3PG>eA+%5_ab6lYE4D;Pd=lW9hy|7$WnfbNau;D1%IVomI`D+{= zk`;nuxVMor8YkmZJIpUH>&)BwCv1!VXy5&f@tK&C;w3hQyKbQd4MTTUl{uaSRKZ?c zD(iD73sT*0u`}zs<(S2(vp2;bh336iJGd*q22V473`Z!9LOF6gm%aJz$F0&bm$Hk) z1rJ?cpDh~PQ;?-y+1_KNSRx^I%~JG(!iQR;*VfoK^i4ut@K^mjg$N(J-pf}{ zr>&N;)v7kITPb~Ze_yV5|I$PeK%|*F-OwDKQ$kCbWlq z3IvaSmTf#D+`al`IC7rl?W$~Tnoqn)QQxp**}&VQhe|_kMwvXvxV~f9L;Zro_GFMv z;+<0EY0;~0+OabUlSH;60Rz?k52pr2c1s@c>X|lu8riOuf)!2tk<`cHs9elr(tt&_25VpWE?N}h5?B)4yGIULpP|6u0f;Y7=#EZW;_95pH- zgP#g=_f0#y#{~u6Wv(PJbH_@#eH6_sUofPToanSsJ#oGVUzy1Ev-q}J4#BlTPnaL`CIiJ%qPBJAJx|9#r0Hr zzKNa$xpRr%%zTc2EuS3HInsVAYn!>xSw)%(nwm8=;WiNoC()Jnp`Qiube%tIFRf2B ze*ZCJ@PVzuXLX>w)VFn=;^~M4JG7qp1;~G`Y)I%GLi8vc2Jw5jLxTMf(Hg7`eiww+ zIGdn93M&op{(s?FTDkNiXEiAqHn6W}rosZq>6~1PLgGi(<%Qlr&Pa_<+l7~=l z!rD_O7wzfv)m)F2`uw-FrI+A@Zl7)rhxC;@tot}6oc2F)DGOWi*>Rks=^Y!I?J>W0 z8>_EzfCs#C<7*4S zMeidkh6)d#&{ZWvRw?dFsPJY*y%Q59;nOF zH`7P0$K`RbBKWo?{=O;wJI)9ELsn6JyW}dPzD|z~wn&?u+9S|v91*stP=K*{vxil; z^{aLV^~}9@-zvI1&aH3rc6F&IBl$lUmA}?fC~2Ah7O1|Fwr3lHvEe(%TJyH_iuO|m z7gU%R*RKolvM;vm6x6kuTxwJ2JX&KoqI!O+Rr4tQrI6*YC)r>2R81;9*DOp^GPQ=Y zA&vGCjb!$$YRNRh2G16T9w}S3_`q^f?18nMf7QL!mg<=#(Q;M!>8Cu;w5&$d2j44H zPY~3IJ@61k8Gbr-xA>zGh4((U`guq<+{WG#E(<$U9KND`Yfsi=5$7)Vl1>j%a}-Rb zEcU{8Q%x)873@P)aLQCT%F4jMVt@YHkZ?xgE;gV0k_QiRI!B(x|*btAzdRY>!fL zsB&^SPLCek6{F+Kc`s{!DchrcJ`?JB2laJwn5!R;zKocq`ZUM8K!(FV_X>V5*=4rE zVxlfw$Ie;zW)j2L@x4;&M~Dk)=Q}%6<5peq2H_GmuZepO zn{Xw>G&tMl_S&N!do^a&HJjQe4KL(>+ufM=p~Ct6bu%}!vPE>t8?W3|IrK@wqA-`eR_St z;&>C2ed!d5SQIAg@lLK>3U6BVbF@eHY%@N=-b{uQXrE{kCtFbuiMT3AJ*a~WTDHHW zi(QS*XYc;$f$(p=a`<{@YZQHUr@q@q?F{KdgClJ&gQIqO$CDi2bmArxZ?bCE*}NKl zR1zKvkD+ebapYs>IedS1+SPHTuJ7EB$#4kmRBhrAjo5+}Qb^YI1ZG`rRWU+xXY;pb zmp>F9sJ}RIZd6!k?gGy-A(IFQRXgps6|As#7k z5|PH2eLwdoXe}1?^0=O@yK#BYUiqWyW^dlm9f}sfXY87tQlX#Ix$kNfm3C5tN@Z!K zx+jQ!X>{Uv*G;<8iw_TqINz8j8cTZaI?vD7wNmo#b-~jUs{LPXhzWhvdK!uL6<%`f zm>KGtO#C?!N3EN1&6WSd%j=#`B<&haPwo#X?|aqrQip2W2|4ETWZH-J zAvd)zgWkeCsR0cd-awcPV5G68h?~;ppm{^kJS8l|qRs|U#e?YWV{HS@OwC~~i%Zoh z$pSv*d&iD;wz;o&gCV|X&NMO%k-->P+?E-V2J(Oy!Xk`*G4zi-^mxOaXdQNQWx=cZ zf;&QlUNI(e;k+MjnD%d_oEl`9A?KgbsQ=~$^QV7)veYQto_qfD`iOB4`*vlanma-c zctJ`j07U=h034uWkt%KhR8nJi_|%ktlVKNzIG7P*#ufZ-nm^3?AQ~ zvmN#idaM1KEowHsqS7!F7kMV}t$UtuUgLo)VeX2@qtr7FxV(&RX*n>! zv{1L`GHJ%|YCC?_17D)4)A{(`4t@RP?O)aDU23G4IDek&zuPqOo%t}oDGM2ak#Wdd zQm0?)Or4~uNee4hHow5XORIUHOPxQ1Vc&KK;E-vR_vFjcUE6yNT@IuB7qolW>+X{vU#5y!|ub&)r{c} z1DT_U3&q$ZHaXsGw359HKE7E4Nt-=KELNb~kNbWf7Ea)HS}@lVjxH03e~XS6xhxuV zC4f_Wu?uT0oUAYne`RbO-CcAgwMWULClw~=JV-6hTKCeCO<%URLEAW%D;4#Yuh?JU zOoe<`f!rHM{f8BTna>^@jAnk}@vc4g-0^T)Cy}r3OLq0cFdRqy>QpqAPCxGXn}t<2 zH&H3$241V2vwo~;HXCpJ?sY=VxKy+;8rgw3)00^DTGy7ayoXj8P&uONB@fC5^S-F2N6;l6ewW8^{|xTulvHrb$c-I$UMZkd zZtR;8QSIu!GVX>c0;wULJykdF(h0)UdDvIW~S8Ndjp64TdHI1r=RvYD8;LdPEfc@{X}oM9rvV_?Y{eR z&Xc^GXtXGQ==1u>n$mbxEmLiFeWZse z0!K}MJvk-)q6GspGJLPN!j~j|g3J%@v-!^&ECZ9fpRHe<6)-XSVq7BWplTb%-Ji$u zP9g6Et7X;Q_c-x_OhOag-iSNY=sXJi8EsTu$xIqQ+hImO9ww{8`*%(_v`$(L%Eyg) ze%Ew%+8@3n!BKT=cW@MZp7Q2y@l-694Q2U|YH>&R4lBEnQf{M|P_fh}^u&Dgwz9sL zw`QLB%cx1Lyev8M{+p|LhF!#5oskC25<9j1{t; zT4!?EpICBg5TCQkfd4KiPlih}&7r zIJv;`f|>VPs931|jT4e#i%zJa^>61nZI;ihI&?|~b5_hYwJ>cfxyn{e)5Z0C2`l2d z&8$8~`+mVnpr?Pndx6>9SDtNXCWLQul#n%V)8NFm^fR_^G~zD&5K}g?R9@>E_HUU$ zc58Wa&d|o+^_FkX^wyQ_pj5tacXD^&JE5mDQb(>uWxb>Kt9O)Yynp%m%&cn4_LG&{ zn2tKOR+bl(q%Y3Nhkka^@YReRijkG$Pquo53!QwF%V#xZBU3f$Ay`#+61kPVDviWW9~!WzSuUd#LIlG;Rdjw}p8t70^5bKbz}>2h4hHvc z*c|b(C$=WkN}zX~JH)lr@ePG<9JYW9zoBAGiY^~4{N~@zRrYau*x_11zKxWe0Bc|J zRJpR?X}O!7DK$0dAH<`37x^xZB|}_sMra<9u&u4u8QtM%r3d}Ha+n)g7H&@mcm$h%?1uL zDHg3~JG+wjck_n&=h&=IKKXig-6!&3|81^oJj;_OFFx__Jm!$hL7<2k6DgTJA!#+j zB{z|R?wl_E9{I4&E~6n#Z~3dzlm2f%srW>kzmX9o84!rXZb7>=5)ZOc@iVFg9nKT{ zB1om!?~D?`u6p60x*In?wt$B2kdcHBQ^ z+aJ6&8T*Pom|tJt{j;!htYYM>*n*k8o270qeQi#Ivoigd3GR8hg3?}?+-EXt}&MP7PyGOlGo(lwYclrOAZc>GU?H zQ<1Md9kD{98d51YgjNS&@6SZfu&<5}r{7YO&Tz$@Wk8FTaq~Xb#b$YnveoB~wW7c6 z@E}r|pXaNxh7T0ljnu$>JDGPgwq&x!2r%mC)2?6hO}KJ2hFh`qy&8K`GSduZ-Qvdz zVaIio1mPr4ucpbP*`-WWEIi3sje@ooRg$bMZWtB){$?wQ;D*|xKe?*~>lu|lN4NQ8 z8OP_{JKZ)GWCHT{jExG)%}4?M|NP zjOx8>(s9b-VOfHKs_h7IZ>)C6c@=Z*BZ}79di=L(j%s~*!Y+Vf{7@&*@j&lA^A2wkONyol%)e#RM= z6sx9Zw{Q{vO32ipQgNSPZM`;bXQ`nKwSlW1BRacK_hs5#laqXH0IqU~CMU6L+|ol@ zo$G;oWWm)1j>F%i2q`6LTuw|~6(-|{yLv)7F76ugebIlX(l?KduSTpj|6qHR?!oy04&~2LO7qln z>h}b)yDH9AY39`roOmOyldRstnapc=6(4o*fR>&XK3S`VFHA7Y;+E~jOjFTY#;>Z~ z+V9_cW;NWU9_Jmm7ClF6x@Pd!CiR_u*!5)Ox80_V2g72uzZ}&VvqCQZR7X^CeWv%) z*iRE4T~+$5soyTLZxuEo+xcOKUZ~7GnU0cy4`3twzn9e##=5@kaK=#r%r+POg>G7Y zj#wXXcfBS#MMIOx?j!PbWVFs{OL%R;leXHpKIMp4*US4N`n5*8?E_;=b>;X{;77Z=k^?(_8ij|H~jd^f&xqmTTAyn zor+mkY$r%iOPtmm2tGD_<$Cat_kElaGB)mHC*j%%k(;usK+{WE2{)dImK!m4_<^{pTE2RSRxrN<*`(cv_ z*?hjDRQ7MPt`U8?&3vw2?!#P)sYPo$&o^mRW!^t!D|_oFqlxqFQ^Nhe`9r1Km#bWK z2L^E0F5gYJbR}M5Yb*@!?>KC5xakObo*&0SU+Q3Wq{WRr(}8}J^_Yo~Nmd}&&+jWs z?+>2+Ng+-+O&~KYWKh&rzS^MCSQBV(mJ)p3iX`(5L%c6UT#z9#okvoQ0gv!-a4)oT zcA#~%gd^eg&}XiTS!`5)a|7(wUpvKU@a^kAKmKCCa)$;iclt%{C>Ommc=(~LET#$U zy|CTo&_mcwiAY&K(rp?N8ZBy_w%)rcRHZtRu$a|n+`h_ri@jgs`k{W7S_v4n5u1WV zrWA+3Q2)m_awXn6S3-_#uVv?M4Ab5gcKiz+UlAH^IIG;z;D9ow3f=dV;O4VE+IN5S zxx&WFFN7-#^FAtAX7;{F(JdO_)j#}(qP?eGJJ(kGYi{O69rfgl`7>3jA0wNS!GY(FnORuLe*Y8Iy8l~8_dM~9vIJte$^(QxH&r3W z(a;|2HdR|)K^i=Kbgavmzdf>kHS^9;8Iw^kGFS(R-U_Zu&}E|L2M{?sKfbwh@2n4ME`qSW z7@XtDhF6F%4XF#wQ;&)Moa-B;d8Y+2j8)LYXQGxrLZ zP3HV+NV2IskP`T zL+96;s_U=2coX<6I0W<5n^Em;Z}y}=T)N?GeKy^#IUyiCTkMHMrv;5M_qgohh1I-M z5;tk1?q;T6fbp6~bDmXspL;0o%;(*v7VyDE9&PW~Xv3sn|B&ZejJP0if1|x0(Fc%h zZ=Q2&jH!eOC11p~*AqX!_%cwH`}1k6%sn~LIcqCR{3Au~#ncXQ)Lfv&N8*PkonZ{p z%lT`w!;GxbPI1W{bd}F>k@=ZM<^i>8DLWn@JNZ8_U2f%~*ZC&TelJ0YjW@OH1=Eha zbI14!VzaRmlr#BC9vpN`leb!r)`f8B>^hOWN>Qr!CS-2?`up2WBO`lx7lS*<3=SD! zhS~~pkucKsrBd16&@c8e-1+JYe1_Qh>GhHAJ#@4?Vw8zilew{1;NnLbUniBjw!T0k zLtoIGKPxukVKJ?23fIOPV)Fdo2Y%#x>T{94FXhP%M&}CVwOEm{H|?*GTrRaEPD48v zY_BV1s&rS_vU-?<+1amaRLAVBqsYjR z9Qu>lf*sON)Miox%dpk#`@2fQ(ap2+_sZ&Bm+w`o(fNg}O~bwAtiL{MUCX z_-g*aGUa1`?ZNSc`fBH63i-{^3)|aoP#g*VQQsM&d;jav$I}_2LU^|GD0S=GPyb0IV!Bh?cyDZEO`pGk--upwOxDzLq#SBZj4bYUa0kC9>?Df z$*AFCrD-h`9$=&J!7n<6S029qGG6XerEPc}Cuhyh~Fs*gr_* zKn;~+dUw;aX$Z)FjV~T%o41mFgkl})V5vAx)p+m+;_$X@Y(Bk|j#qS2Z%wy(Reiig zd*s_siYtdq)Blza5XHZEVj~oa6yGGJYEN+bmD4veQa2Yj9Frsh0bwHW@1GnL&jgq= z1mRU6gb6ZLAofbb6GS9I!B_tVef=iEq}1Ju8gxwsPH(!ki^y*xbZ;m`&m!QGzV3YQ9W^7VpCd3l1a5l?!-q=>G*P9X0L zqOibUH-C^P0IsD7PX11wa49EJP0&Ul-pk#~6AqnGQiaQaMf4%K49M$&h%LA*V8IY3 z3z{>9%Yvd7a2zP$;A@XR;Y1{{g#+^Zb-%$I^ikB$)6vP7fcNrsg5v;#_HZ1SmUs}* zg@j2#EWxEfup4064UoX|6L2YaxReK63b5q`M}lFK0xW}}2EF!!OZmg4P6GJ@ssk8w zTNsGzk~-q%NOT4Ag^=5yA!4K<|9z_DUzPepDM%yue`o+0UUz#J0vLlW>4FLY4+Xs8 zP@s2E3($=O3i^VmX78=FAz%bd%HnqxgwugQKR3LJr;EE2U|hNIe1_ zl1BX*Yxf(OTEnpj5Ht7xKf(UL)`13_f6_qPrGZtGMa#mmXeSkcY0&b2LaGkWmJ_57mhSegiAJ zL4&TO&Y)#r&=q0=>J*f)K;fWwF`(lcb%3&C1UvwN!k{)W;2n}selrJM zN%{k|Bn{mlAQ3@qLtTTeAd$ucYHlOJFu%s^pQ<)8fWeI_P@u=)H4MZR;2e4#QiI>a z_4^8j4ECE5QVr6;lt3|_5c`lG{8K+v3L=t}3DgAL|5FRZ6aoPwJ^NJw4jjVzoghWn z&;dvf{x^|i4fF!ii9ZraA0VChok+U!OD7ZVeE2INt z5>6J338WXmt^JvP>%{LXBor|2Fwzyu|B*;zz406>^G|~G0vc85`tPCmKXI@sa8o2( z2+TJSBFXRR!I2O|0v=EwPC6?La)xj)vLu=*90jeY0nGvq1{vgFz)}MuN0^O0^I%yvR^ot0`{9W8W0U&xE6xPisefNq1F_=}j64AFp0~Rf(cR4f?vIv4A|#RG za91MHn;<7845R3ljYu=8|a0du9c{={C0u&`R;O6D24h8KW zP?tj?P*`x_4~;+oJA$(k`9mfFzD~|CU|Ucy;0AyFfS1F7dEpHIhX%n-=m+=wMMGjy z&^*{gLjX_s7Y&+Wo5~?&fPgpCfQ8*mLqTTdf69Ry?B@DFtB?iw-*UihY~~q(Kp{4_ z0bYfs#Q(gDlt$w=zl)Sc0#ClF92x`g`z9JVTe-O$XydQ?5NH|5CjC#l2x$Z~aW~PB z0F`W}p&9X=*=`4$WCr5C;N9f3<`L}P316X0IfIEKM5===YEAB4m+&>xQf}1o=0WgOJ{4 znhfxwn`yF;-}&EnabSl0MMM9k;|Lt~FTFzGWdD*g06Eyrynuz|Ut@_xU?4~PKfE9j zxWD*A0tmOczhHd?)@qyHMMB76GYtz6$!3}?SjKIl0fVzyE=UxJG5M>UEI8S-`CZ&! zvO=PfkpF~!|BNNrO-KC2GXPV6=pWJ7-p$>~m-OGFn79Qx0b2)`GWPO<3^d8zX?r?* e0kA~60e*o%wD%>F%py2kg+arFgft8^VgC;?6UOQQ literal 0 HcmV?d00001 From 1279cc1a0e50fdb0e8f9dd598ae39ac51d6d042c Mon Sep 17 00:00:00 2001 From: Andrew Sweet Date: Sun, 26 Apr 2026 07:00:35 +0000 Subject: [PATCH 2/6] bench updates --- benchmarks/python/blas/bench_gemm.py | 122 +++++++-- benchmarks/python/blas/bench_gemv.py | 259 +++++++++++++----- .../blas/results/unknown_float16_gemv.pdf | Bin 17529 -> 0 bytes .../blas/results/unknown_float16_gemv_t.pdf | Bin 17427 -> 0 bytes .../blas/results/unknown_float32_gemv.pdf | Bin 17563 -> 0 bytes .../blas/results/unknown_float32_gemv_t.pdf | Bin 17266 -> 0 bytes results/unknown_float16_gemv.pdf | Bin 17615 -> 0 bytes results/unknown_float16_gemv_t.pdf | Bin 17413 -> 0 bytes results/unknown_float32_gemv.pdf | Bin 17819 -> 0 bytes results/unknown_float32_gemv_t.pdf | Bin 17773 -> 0 bytes 10 files changed, 283 insertions(+), 98 deletions(-) delete mode 100644 benchmarks/python/blas/results/unknown_float16_gemv.pdf delete mode 100644 benchmarks/python/blas/results/unknown_float16_gemv_t.pdf delete mode 100644 benchmarks/python/blas/results/unknown_float32_gemv.pdf delete mode 100644 benchmarks/python/blas/results/unknown_float32_gemv_t.pdf delete mode 100644 results/unknown_float16_gemv.pdf delete mode 100644 results/unknown_float16_gemv_t.pdf delete mode 100644 results/unknown_float32_gemv.pdf delete mode 100644 results/unknown_float32_gemv_t.pdf diff --git a/benchmarks/python/blas/bench_gemm.py b/benchmarks/python/blas/bench_gemm.py index 3629c0ea39..edf16e187a 100644 --- a/benchmarks/python/blas/bench_gemm.py +++ b/benchmarks/python/blas/bench_gemm.py @@ -27,9 +27,17 @@ torch_device = "cpu" torch_sync = lambda: None -N_warmup = 2 -N_iter_bench = 10 -N_iter_func = 5 +FULL_WARMUP = 8 +FULL_ITER_BENCH = 80 +FULL_ITER_FUNC = 5 + +QUICK_WARMUP = 2 +QUICK_ITER_BENCH = 10 +QUICK_ITER_FUNC = 5 + +N_warmup = FULL_WARMUP +N_iter_bench = FULL_ITER_BENCH +N_iter_func = FULL_ITER_FUNC def bench(f, a, b): @@ -120,7 +128,7 @@ def gemm_tt_torch(a, b): return ys -def bench_shape(B, M, N, K, np_dtype, transpose="nn"): +def bench_shape(B, M, N, K, np_dtype, transpose="nn", max_torch_ops=None): shape_a = (B, M, K) if transpose[0] == "n" else (B, K, M) shape_b = (B, K, N) if transpose[1] == "n" else (B, N, K) @@ -149,7 +157,11 @@ def bench_shape(B, M, N, K, np_dtype, transpose="nn"): "tt": gemm_tt_torch, }[transpose] - time_torch = bench(f_pt, a_pt, b_pt) + gemm_ops = B * M * N * K + time_torch = None + if max_torch_ops is None or gemm_ops <= max_torch_ops: + time_torch = bench(f_pt, a_pt, b_pt) + time_mlx = bench(f_mx, a_mx, b_mx) t_a = (0, 1, 2) if transpose[0] == "n" else (0, 2, 1) @@ -172,34 +184,98 @@ def get_gflop_count(B, M, N, K): return float(2.0 * N_iter_bench * N_iter_func * B * M * N * K) / float(1024.0**3) -if __name__ == "__main__": +def main(): + global N_warmup, N_iter_bench, N_iter_func + parser = argparse.ArgumentParser(description="Run gemm benchmarks") + parser.add_argument( + "--quick", + action="store_true", + help="Run fewer iterations and a reduced shape set.", + ) + parser.add_argument( + "--max-torch-ops", + type=int, + default=None, + help="Skip PyTorch timing for cases where B*M*N*K exceeds this value.", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Print per-shape timing results.", + ) + parser.add_argument( + "--single-threaded", + action="store_true", + help="Set OMP_NUM_THREADS=1 and OPENBLAS_NUM_THREADS=1 for single-threaded PyTorch/NumPy comparison.", + ) + args = parser.parse_args() + + if args.single_threaded: + os.environ["OMP_NUM_THREADS"] = "1" + os.environ["OPENBLAS_NUM_THREADS"] = "1" + + if args.quick: + N_warmup = QUICK_WARMUP + N_iter_bench = QUICK_ITER_BENCH + N_iter_func = QUICK_ITER_FUNC + else: + N_warmup = FULL_WARMUP + N_iter_bench = FULL_ITER_BENCH + N_iter_func = FULL_ITER_FUNC dtypes = ("float32", "float16", "complex64") transposes = ("nn", "nt", "tn") - shapes = ( - (16, 234, 768, 3072), - # (1, 64, 64, 25344), - # (16, 1024, 1024, 1024), - (1, 1024, 1024, 2048), - # (4, 1024, 1024, 4096), - # (4, 1024, 4096, 1024), - # (1, 4096, 4096, 4096), - ) + if args.quick: + shapes = ( + (16, 234, 768, 3072), + (1, 1024, 1024, 2048), + ) + else: + shapes = ( + (16, 234, 768, 3072), + (1, 64, 64, 25344), + (16, 1024, 1024, 1024), + (1, 1024, 1024, 2048), + (4, 1024, 1024, 4096), + (4, 1024, 4096, 1024), + (1, 4096, 4096, 4096), + ) + + if args.verbose: + print(f"{'B':>3}, {'M':>4}, {'N':>4}, {'K':>4}, {'dtype':<9}, {'t':<2}, torch_gf, mlx_gf, diff") + print("-" * 66) for dtype in dtypes: for transpose in transposes: for B, M, N, K in shapes: np_dtype = getattr(np, dtype) - time_mlx, time_torch = bench_shape(B, M, N, K, np_dtype, transpose) + time_mlx, time_torch = bench_shape( + B, + M, + N, + K, + np_dtype, + transpose, + args.max_torch_ops, + ) gflop_count = get_gflop_count(B, M, N, K) gflops_mx = gflop_count / (time_mlx) - gflops_pt = gflop_count / (time_torch) - diff = gflops_mx / gflops_pt - 1.0 + if args.verbose: + if time_torch is None: + print( + f"{B:3d}, {M:4d}, {N:4d}, {K:4d}, {dtype}, {transpose}, skipped, {gflops_mx:05.3f}, n/a" + ) + else: + gflops_pt = gflop_count / (time_torch) + diff = gflops_mx / gflops_pt - 1.0 + print( + f"{B:3d}, {M:4d}, {N:4d}, {K:4d}, {dtype}, {transpose}, {gflops_pt:05.3f}, {gflops_mx:05.3f}, {100.0 * diff:+5.2f}%" + ) + if gflops_pt >= 2.0 * gflops_mx: + print("ATTENTION ^^^^^^^") - print( - f"{B:3d}, {M:4d}, {N:4d}, {K:4d}, {dtype}, {transpose}, {gflops_pt:05.3f}, {gflops_mx:05.3f}, {100.0 * diff:+5.2f}%" - ) - if gflops_pt >= 2.0 * gflops_mx: - print("ATTENTION ^^^^^^^") + +if __name__ == "__main__": + main() diff --git a/benchmarks/python/blas/bench_gemv.py b/benchmarks/python/blas/bench_gemv.py index 4457a50fbb..1dec32ee4b 100644 --- a/benchmarks/python/blas/bench_gemv.py +++ b/benchmarks/python/blas/bench_gemv.py @@ -1,5 +1,6 @@ # Copyright © 2023 Apple Inc. +import argparse import os import subprocess import time @@ -15,8 +16,9 @@ os.mkdir(results_dir) try: - device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"]) - device_name = device_name.decode("utf-8").strip("\n") + device_name = subprocess.check_output( + ["sysctl", "-n", "machdep.cpu.brand_string"], stderr=subprocess.DEVNULL + ).decode("utf-8").strip() except (subprocess.CalledProcessError, FileNotFoundError): device_name = "unknown" @@ -30,26 +32,30 @@ torch_device = "cpu" torch_sync = lambda: None -# N_warmup = 5 -# N_iter_bench = 50 -# N_iter_func = 20 -N_warmup = 2 -N_iter_bench = 10 -N_iter_func = 5 +FULL_WARMUP = 5 +FULL_ITER_BENCH = 50 +FULL_ITER_FUNC = 20 -# out_vec_sizes = [128, 512, 2048, 4096] -# in_vec_sizes = [128, 512, 2048, 4096] -out_vec_sizes = [512, 2048] -in_vec_sizes = [512, 2048] +QUICK_WARMUP = 2 +QUICK_ITER_BENCH = 10 +QUICK_ITER_FUNC = 5 -benchmark_vector_lens = [128, 1024, 4096, 11008] -# benchmark_vector_lens = [] -# benchmark_vector_lens += [(i + 1) * 4096 for i in range(8)][::2] -# benchmark_vector_lens += [(i + 1) * 4095 for i in range(8)][::2] -# benchmark_vector_lens += [(i + 1) * 4097 for i in range(8)][::2] -# benchmark_vector_lens += [64, 128, 512, 1024, 2048, 11008, 32000] +FULL_OUT_VEC_SIZES = [128, 512, 2048, 4096] +FULL_IN_VEC_SIZES = [128, 512, 2048, 4096] +FULL_BENCHMARK_VECTOR_LENS = sorted( + [(i + 1) * 4096 for i in range(8)][::2] + + [(i + 1) * 4095 for i in range(8)][::2] + + [(i + 1) * 4097 for i in range(8)][::2] + + [64, 128, 512, 1024, 2048, 11008, 32000] +) -benchmark_vector_lens.sort() +QUICK_OUT_VEC_SIZES = [512, 2048] +QUICK_IN_VEC_SIZES = [512, 2048] +QUICK_BENCHMARK_VECTOR_LENS = sorted([128, 1024, 4096, 11008]) + +N_warmup = FULL_WARMUP +N_iter_bench = FULL_ITER_BENCH +N_iter_func = FULL_ITER_FUNC def bench(f, m, v): @@ -102,7 +108,7 @@ def gemv_t_torch(m, v): return ys -def bench_lens(in_vec_len, out_vec_len, np_dtype, transpose=False): +def bench_lens(in_vec_len, out_vec_len, np_dtype, transpose=False, max_torch_elements=None): shape_mat = (in_vec_len, out_vec_len) if transpose else (out_vec_len, in_vec_len) shape_vec = (1, in_vec_len) if transpose else (in_vec_len, 1) @@ -115,11 +121,14 @@ def bench_lens(in_vec_len, out_vec_len, np_dtype, transpose=False): torch_sync() - time_torch = ( - bench(gemv_t_torch, mat_trc, vec_trc) - if transpose - else bench(gemv_torch, mat_trc, vec_trc) - ) + matrix_elements = in_vec_len * out_vec_len + time_torch = None + if max_torch_elements is None or matrix_elements <= max_torch_elements: + time_torch = ( + bench(gemv_t_torch, mat_trc, vec_trc) + if transpose + else bench(gemv_torch, mat_trc, vec_trc) + ) time_mlx = ( bench(gemv_t_mlx, mat_mlx, vec_mlx) if transpose @@ -147,35 +156,55 @@ def get_gflop_count(in_vec_len, out_vec_len): def get_gbyte_size(in_vec_len, out_vec_len, np_dtype): n_elem = in_vec_len * out_vec_len + in_vec_len + out_vec_len - item_size = 4 if np_dtype == np.float32 else 2 + item_size = np.dtype(np_dtype).itemsize return float(N_iter_bench * N_iter_func * n_elem * item_size) / float(1024**3) -def bench_with_in_len(ax, in_vec_len, out_vector_lens, dtype, transpose): +def bench_with_in_len( + ax, in_vec_len, out_vector_lens, dtype, transpose, max_torch_elements, verbose=False +): np_dtype = getattr(np, dtype) mlx_gb_s = [] mlx_gflops = [] pyt_gb_s = [] pyt_gflops = [] + if verbose: + print(f" {'in':>5}, {'out':>5}, mlx_GB/s, trc_GB/s, diff") + for out_vec_len in out_vector_lens: gflop_count = get_gflop_count(in_vec_len, out_vec_len) gbyte_size = get_gbyte_size(in_vec_len, out_vec_len, np_dtype) - time_mlx, time_torch = bench_lens(in_vec_len, out_vec_len, np_dtype, transpose) + time_mlx, time_torch = bench_lens( + in_vec_len, + out_vec_len, + np_dtype, + transpose, + max_torch_elements, + ) mlx_gb_s.append(gbyte_size / time_mlx) - pyt_gb_s.append(gbyte_size / time_torch) + pyt_gb_s.append(np.nan if time_torch is None else gbyte_size / time_torch) mlx_gflops.append(gflop_count / time_mlx) - pyt_gflops.append(gflop_count / time_torch) - - print( - f" in={in_vec_len:5d}, out={out_vec_len:5d}, " - f"mlx={gbyte_size/time_mlx:7.2f} GB/s, " - f"torch={gbyte_size/time_torch:7.2f} GB/s, " - f"diff={gbyte_size/time_mlx/(gbyte_size/time_torch) - 1:+.1%}" - ) + pyt_gflops.append(np.nan if time_torch is None else gflop_count / time_torch) + + mlx_gb_s_value = gbyte_size / time_mlx + if verbose: + if time_torch is None: + print( + f" in={in_vec_len:5d}, out={out_vec_len:5d}, " + f"mlx={mlx_gb_s_value:7.2f} GB/s, torch=skipped" + ) + else: + pyt_gb_s_value = gbyte_size / time_torch + print( + f" in={in_vec_len:5d}, out={out_vec_len:5d}, " + f"mlx={mlx_gb_s_value:7.2f} GB/s, " + f"torch={pyt_gb_s_value:7.2f} GB/s, " + f"diff={mlx_gb_s_value/pyt_gb_s_value - 1:+.1%}" + ) if transpose: title = f"gemv_t ([1, {in_vec_len}] [{in_vec_len}, out_vec_len]) | {dtype}" @@ -189,31 +218,51 @@ def bench_with_in_len(ax, in_vec_len, out_vector_lens, dtype, transpose): ax.legend() -def bench_with_out_len(ax, out_vec_len, in_vector_lens, dtype, transpose): +def bench_with_out_len( + ax, out_vec_len, in_vector_lens, dtype, transpose, max_torch_elements, verbose=False +): np_dtype = getattr(np, dtype) mlx_gb_s = [] mlx_gflops = [] pyt_gb_s = [] pyt_gflops = [] + if verbose: + print(f" {'in':>5}, {'out':>5}, mlx_GB/s, trc_GB/s, diff") + for in_vec_len in in_vector_lens: gflop_count = get_gflop_count(in_vec_len, out_vec_len) gbyte_size = get_gbyte_size(in_vec_len, out_vec_len, np_dtype) - time_mlx, time_torch = bench_lens(in_vec_len, out_vec_len, np_dtype, transpose) + time_mlx, time_torch = bench_lens( + in_vec_len, + out_vec_len, + np_dtype, + transpose, + max_torch_elements, + ) mlx_gb_s.append(gbyte_size / time_mlx) - pyt_gb_s.append(gbyte_size / time_torch) + pyt_gb_s.append(np.nan if time_torch is None else gbyte_size / time_torch) mlx_gflops.append(gflop_count / time_mlx) - pyt_gflops.append(gflop_count / time_torch) - - print( - f" in={in_vec_len:5d}, out={out_vec_len:5d}, " - f"mlx={gbyte_size/time_mlx:7.2f} GB/s, " - f"torch={gbyte_size/time_torch:7.2f} GB/s, " - f"diff={gbyte_size/time_mlx/(gbyte_size/time_torch) - 1:+.1%}" - ) + pyt_gflops.append(np.nan if time_torch is None else gflop_count / time_torch) + + mlx_gb_s_value = gbyte_size / time_mlx + if verbose: + if time_torch is None: + print( + f" in={in_vec_len:5d}, out={out_vec_len:5d}, " + f"mlx={mlx_gb_s_value:7.2f} GB/s, torch=skipped" + ) + else: + pyt_gb_s_value = gbyte_size / time_torch + print( + f" in={in_vec_len:5d}, out={out_vec_len:5d}, " + f"mlx={mlx_gb_s_value:7.2f} GB/s, " + f"torch={pyt_gb_s_value:7.2f} GB/s, " + f"diff={mlx_gb_s_value/pyt_gb_s_value - 1:+.1%}" + ) if transpose: title = f"([1, in_vec_len] [in_vec_len, {out_vec_len}])" @@ -227,34 +276,94 @@ def bench_with_out_len(ax, out_vec_len, in_vector_lens, dtype, transpose): ax.legend() -for transpose in (False, True): - for dtype in ("float32", "float16", "complex64"): - op_name = "gemv_t" if transpose else "gemv" - print(f"\n{'='*60}") - print(f"{op_name} | {dtype} | device: {torch_device}") - print(f"{'='*60}") - - fig, axs = plt.subplots( - len(in_vec_sizes), 2, figsize=(8.5, 11), layout="constrained" - ) - - print(f"--- sweep out_vec_len (fixed in_vec_len) ---") - for i, in_vec_len in enumerate(in_vec_sizes): - bench_with_in_len( - axs[i][0], in_vec_len, benchmark_vector_lens, dtype, transpose +def main(): + parser = argparse.ArgumentParser(description="Run gemv benchmarks") + parser.add_argument( + "--quick", + action="store_true", + help="Run fewer iterations and a reduced vector-length set.", + ) + parser.add_argument( + "--max-torch-elements", + type=int, + default=None, + help="Skip PyTorch timing for cases where in_vec_len*out_vec_len exceeds this value.", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Print per-shape timing results.", + ) + parser.add_argument( + "--single-threaded", + action="store_true", + help="Set OMP_NUM_THREADS=1 and OPENBLAS_NUM_THREADS=1 for single-threaded PyTorch/NumPy comparison.", + ) + args = parser.parse_args() + + if args.single_threaded: + os.environ["OMP_NUM_THREADS"] = "1" + os.environ["OPENBLAS_NUM_THREADS"] = "1" + + global N_warmup, N_iter_bench, N_iter_func + if args.quick: + N_warmup = QUICK_WARMUP + N_iter_bench = QUICK_ITER_BENCH + N_iter_func = QUICK_ITER_FUNC + out_vec_sizes = QUICK_OUT_VEC_SIZES + in_vec_sizes = QUICK_IN_VEC_SIZES + benchmark_vector_lens = QUICK_BENCHMARK_VECTOR_LENS + else: + N_warmup = FULL_WARMUP + N_iter_bench = FULL_ITER_BENCH + N_iter_func = FULL_ITER_FUNC + out_vec_sizes = FULL_OUT_VEC_SIZES + in_vec_sizes = FULL_IN_VEC_SIZES + benchmark_vector_lens = FULL_BENCHMARK_VECTOR_LENS + + for transpose in (False, True): + for dtype in ("float32", "float16", "complex64"): + op_name = "gemv_t" if transpose else "gemv" + print(f"\n{'='*60}") + print(f"{op_name} | {dtype} | device: {torch_device}") + print(f"{'='*60}") + + fig, axs = plt.subplots( + len(in_vec_sizes), 2, figsize=(8.5, 11), layout="constrained" ) - print(f"--- sweep in_vec_len (fixed out_vec_len) ---") - for i, out_vec_len in enumerate(out_vec_sizes): - bench_with_out_len( - axs[i][1], out_vec_len, benchmark_vector_lens, dtype, transpose + print(f"--- sweep out_vec_len (fixed in_vec_len) ---") + for i, in_vec_len in enumerate(in_vec_sizes): + bench_with_in_len( + axs[i][0], + in_vec_len, + benchmark_vector_lens, + dtype, + transpose, + args.max_torch_elements, + args.verbose, + ) + + print(f"--- sweep in_vec_len (fixed out_vec_len) ---") + for i, out_vec_len in enumerate(out_vec_sizes): + bench_with_out_len( + axs[i][1], + out_vec_len, + benchmark_vector_lens, + dtype, + transpose, + args.max_torch_elements, + args.verbose, + ) + + fig.suptitle(f"{device_name}: {dtype} {op_name}") + fig.savefig( + os.path.join( + results_dir, f"{device_name.replace(' ', '_')}_{dtype}_{op_name}.pdf" + ) ) + plt.close(fig) - op_name = "gemv_t" if transpose else "gemv" - fig.suptitle(f"{device_name}: {dtype} {op_name}") - fig.savefig( - os.path.join( - results_dir, f"{device_name.replace(' ', '_')}_{dtype}_{op_name}.pdf" - ) - ) - plt.close(fig) + +if __name__ == "__main__": + main() diff --git a/benchmarks/python/blas/results/unknown_float16_gemv.pdf b/benchmarks/python/blas/results/unknown_float16_gemv.pdf deleted file mode 100644 index fee838a0be7f8f30a01565f3ad2a618f57005ff7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 17529 zcmbun2Rzl^{|8>7xEa|g<66nSn`>on*(00l8fC>LY1t!W6KNQgiX^g<>=A_$DIyfI zN`p%L&-<#+r;qOM|MmF)ugB?q-mmu=uk$*u^LjncdB0x?8>wqbBBd}e;i4hOYEC34Paik9w2_k=(I1Wi1x(;dN-(0g3sg|{j|!ST-Xu7h zTtM2&(D^WtK!VGx-)aPrv`k4(BqAKMPB3yJk%)fYa4d8SlQwmCa`EtXgX4ZZ_46T^ z5=n3y(5<>AV1*b&f=lao0Txt$UDbYFb>Mb?u!H=201)30_kKjc-8%cyCPaUq06zj4 z55&I-T-t!>;^Cy~69jrhfIl1(1((62;C3)+H83bp(;tpsFR9|~?E~(RP_aJ+0X_aL zIentH8_69G8X-5X>EQ{+3YXUO1OiYe5`0{U5X=4~KcbTtEGVbN+RlethUMgChX$>^ z+WYT*JLU87)Nb1O5Iup>Z7(!;&2}90ONlSz`SJGjih1|p=*PVhZh=Jnc9%oM`8(Na zfyYWoA2V+yKAgW@cV<=p=eOuLd8_3o4p@EJTlac@{^_Ox5#n?IlMlG)PuFne37>q( z`|!Hza%T09)lNh$uT)(_&(iLXCwJTS-9FD9zUUI5tIo{i8%6rKtwu>%RNWw|mM-3a z&wKQ~j{AX*y}PJ%E~>DaqEO3M2(&Gx=v&HO{Un)>ALx@$?^%s|nDG7OsK=#p!{o4I zT;&E&ac>=;&$|W%;%!$Riyi5EJ9AW0YE7}$$gs=wi>}>i`IRqz%ig=Q=v&-vblWSO z;5|~|Y31GfU*NY7y?YC+@x8;UMj*&hVn={Vyj{5;WXM+%$oIPL$MpC2Ur(PL>(eTs zGHzXMC>6Q5&=B_#@0Ig~W<35Nn{KQ8dlXEOr9m#?Z9_0lh`P3RhNK_#cDMUkDg6DE z{RFD79MyV)0unA6L$@PCEZwPdG9B*bsvIQ@HOjd?+xc@V3u**)R7&wE8_iY_M}JHxUn zl9hz6R@OMRW_wSuc@*uIGO@gcc=1Ccy&GN#&-ZP1oIOv#RI{~7OmJp8NAlc*ZC4lq z^TrPw?$6FjVq-sex#HSx`S~8csD(bYv9aROM@pzR6#?4BiE~;XnFzJKqW3s6+dk8X zzQ8&iZ<8-W3`MlawuH6j(Z#*7RXA(I5O*m1MzP1_fu|;tOS(a?AGprAmJW_3+g!Nk zS`_rCSL6cn(y3dnZJKil3Wkj0-i05YXpILf7syig@rKD*%J7x;It)GMmr`}$zxYi2 zOMY2C7}yUcwY0xp~@gpOPPk)}Z8$4Y`x`q_5Nixfsuutj$ji6-IPOq=kW?X>blnEvJ~y@Eceg68nx(;V2^rJ zS162XIHG_mkbIr1!5sS_b@sb>iZ3Hxg>!&~c^Q_f9!g_ZA)54QxbdjEi@e2bsQD+s zo3qA=QePx5&`eL;&F|MvZVHI$<*a+AO7&=fIG(IOE_vbO$z4~>PKgn2Pv|E+c`B4R z9P*yabS!lbOwXL{oJ9y<+ShOadDFvj?FIcDp}9?sxCVOzFUR;v`YV{^xi{Puhv(kn zK3loPibw?%%CyOu_4i)K)XXhw2Ju~XGB~jyJrFk{R+oEUEB1l(sOEod}=4lth4iWy?@mr%xzpiJZX2K z#*FY?jk=27e7`T(ja{!$J*k+g_)l>$AEwWBY#z2|qk6<88Pj9z>I_`G{ZRboBY&e{UGEmhBo>(d}!~LM0l@_;KBM+f{Z15uF{<}Iaqz2`8#hh!$}+L?IUXW zjpCS|J9%Sy^ctwi6XosnH7`{2oh&>c3mt0f)XiPe{KaRb24fVRn{Lt}nE+ez*md)0 zs1U!KL8ZWwf%QaA@92KXI66E2WPYJChJLq^;>@_gekWWTId^8RtqmdNX8rRNkQ4tpq= z>q@wM;PdDZQ&KE?$-(F**P^`=`|3x(sMJg1D0*!#U*{E9n+vt7U>c^S&`btl4U<$a1Ke?4C;TJBuAZ1xnPP=K_%Y%k+(>gIB7R?Qa}t}%lSuI0>_ zuYydrVs}}K#LtS1u^+dQcyfKEc4u^Hz_qi7zV%UAS3BsHx9)dkIZ}EjBQaOKntpCl zB|ww;;R&}@CD>OdI=cZ^Hm1@$97AX3cZ&7w$&;Dt{l=@UDI1DX-g=3J&z`vFz*pj| zL$d38p3HX4Hc#rpphgdvSye5ng|2+yLuLQMaRD={+LTb``}Ec$OP)N|4E8sg#a~#C zB}|{FU}S%HTZq3$Dz5dTqD?f+%`z#;J+ZlK9=fryTvwkjN=DHZw~mFwhP^-U^PXaO z%+L>8^0Kd;i#WV{zx8|K^*TG^pr`KCQ@6nMddpcc`PVP*M{K|76}+vO>5*#c*G=er%TaNCT6so&0rttKRPpxSv&iuwy`m3$n56VX z&zH^|7^7+yoXW+ukHR;hn}Ysr%{BpC)|%U znPKCJAJd#Aa*?YstipcQ$qyA)lIoSh^E)cP5l3$7XqDYOz-)mCF2|Z2aa+7G_U&K= znn*ZWV19V0>n+Yxim|fNtE|_ApZCn_&2*iU(vdQEbp%y;=OafakD|hl?a@kd7H8;% z9qip1Kr39rKcw%`$DbeFkQ$V7r6*=v2FkcG^>k{P`BjlZoznf=k`E)}zv@c$6}F7d zVS=&`hFK^Dgw|BU)GbuNkt6cul-!06E2_VsO-MX^L@l;&Hu;OXLFYsE0xaroXtKf zv%(irNv#X*M}7h7Upo^r`i2lWN>d%Hr0?GVuhVEf+%p*_k5=#Ii+0ABwexLwht zI++Sd-&qOEsCO_?tgAlixOADfUQr&2)MMVFvgB=y-+RDIxHEC(1<_4sB4g=_*J4fH z`|UR_!2LUpcCb5VELO1yaYz#P^}F2+Up&fin7!@;E1LBgzfLoYpGl%XQDRB>z!&Zz z2bt-WJzRVwPorSHlv`KKn(F+@S>!X`+l9}DU%T-1J^I=0*^sLrql)?p4xZup@=#Es zvEa3~#twdthtE2B?n)GJ_r0nYRYdf`vMX%|bPAm1hhabKdwR@de z+v4l)RzpGZe{KqY?S)Vnng1H~gPj?}7A-q3lc~9xsftQ*E=?Bbg;H*evA(FAoo+WLWT3GMt zf-uXQOIDwlkBL9Fkqaz+xYSVoE?MlBs=~wz?m=zq0rlRGisi5U)ku%vp-M9RJJdbS z*+&*U{Mr=Yg}HwZS0-{ZT}R5`?Vg2gUSbv~n9R-ivp>uAT{$`Pc=>`y@ztcGDgv06t0Qi; zZIeN3Tha|}_fe1^g}o#W`M<@7#$qiIMt91$uw1*Q345oR0TYd~RaS~g?gkqF_FY|nGl7COxEodl9`V1KP(0VkP>-NhThMQy zePg=qhnYE3$Lc`T$>0VioAxR?4tl=)myzjIK4GOI0giU|RP3r8TrLx@MXT{#|R z@4LbJROsky^;~%a-HS}+&t8p2PEviD=9#6y;h%f`-&<~(&9IoH$I=9+S6j+a)!MQa z@~QmUblA`to6am!I2)l;THTT~n;zZPnwGHSPB4s+w0%q3bI_D4G48&r-IY!!)H9#j z?CN?mr{w3edEa-}=6))2jV>`qBO_$8SoS!I9rI)Q^5weh+^du%PfMqsi+c*n5M9YD zin~nxd;DXPR|FmWjyUyXz2oGRyv0>a)bE{jy6GuN|2{$G*rP9f!M8slm=``#ji@NO zP#7|FqOf67aJc_HDQGmt3}MuR-Nxb?weLTgLBra)wX93OD@1}|YATemH3^5phCkcI zb!DsXcJ(vcE%$6OIl@*?ffMMQX9FjjQ4h(4Qb;|hPZ&1Pj_!zGip^u|SoJ~#HeEhg z($*A1m(yn8@mVJmqt!dm?AH6rQU7qV%TybFEa@(bX0`2`=T8eG!r*b#bqtoDv(6B@ zbJDL5A87xv_ZbBap_8T!9HJ4MutJI{dfvdSt1T%-O6{uuK6vRja;Iy}J3-)DEp!0YWC{C5^uZn ze;O_E?w4|`HI>tFb~)%*uL=(?6e|q&Eb&3-9}E-L!wd z!J$Ec8A|>cjr!l*VE^>b{qwaVEmxwyt__$xVxv_SuBZ}rCJ1hmMlR!HhL%&X`TI`0 z`B|<=7)EL}7LV$%g=W674-D(v+e$pZh~}%!w0Kg`no8fUfm2jqM!gW;C5O1bJ>Z4w z%h)>pH)Vn*k2N!%4X}KEDn|Xtv|?mBjK%BLcmE$nJH39qWIY&o;=RsW)|knRqI%*u zduof-c17cAJ>;pR_nx^TxwZQ*hkGg=j#1Ct?=~9S(6GOUakhHSZOokC-ER207okv7 zx9!+HdM~ZWWk$j@2{W59$VfFdgJKW2PW53J!Tw>hw#UsgpG|x5KU}Tb$+R z)UNMoSLe^%DMafG9P)PMJ$bS_Ztsg0r|R175m~sS6#hOzG9g!tSt7<0vcA{bIZOU# z2u;B`<7EH$1fp@}K)(?(kUC+&@xo{>H>4 zw&UvMv_}WL9;LzLTzjb{SgJ=|SPf)5@9UVvbETo)^Ii)SI8~(3o-a4$V(_F$Fl+Fc z;j64=Zr@6Wmo5j>+eo}!%Q@vw!tv}iOXIP)9R>+6r)HN}J;bCK! zH%q458LknRwQoCCqpbxfNRWb8N8_>odpZ?O^t@Y?XB;o3_}3jvOHRg zJ5d+K)C(Wq?9E+P&4}EQbnrv&&c;*t)p6;NhJiZ_Rc{ngDtCn5iK@1G8*^o|s4zQC zCbJh77+pTpIAT0MvxK5K>=S==CxQDjT{(@amv-~E_sUz;XVc7zlg_ybu|8xUc5~)- zob0{3^){E8dd7*_wwN1F3gc_lo0SX=zVFY_m$Yd0Hsx|yNsI40Num+Efmuvj>@hFv z*=ZGfoaHNiMSFR&-;+N|&bxmx_AMW}1;z5jQ4jB!n>nsE`sz>s^KG8$swbpSzVv=y z{qmX9?OzO6Nw2IQ2h{q}<|)55Q2B75@wR4Zb2CcmVuEOJp2VF!*|nPJ2B%%j{Yy>l zRC|8<)$YPvKO#_WC2--MvwFf=Pk+S+)cyJkE%^Q>)<(~Ti~YHG(P%OLu$MJahi8-n z%j!P)R$M-Sp33v0FsBso5fZ%#7if^E!bpXrrCK;1d082*9Mo-L&h%={LGMN2vz@#h zA)dzDLxrP*qW8Y(eZ@4Qfn#?Y&v7sjb>=^B=zAGq%AFv3Y>|){ay0F*#C7)|d%Y(H zO`MX)SS7C(XQp!bRLy5DX{CE5GiY>6?o=J{O!dsPuP2D8>cV&M2)Y|K6S!$DO1CJOZ@ziuBDXAIft#&dYBlsi~ zmy`MZWq$5Goi}*@P;g4e;M(~~0aN2;lR_zHRlAtI-MP#k6myTT94dYA5igOSTmX>)+m>#d# z=1`Cz1yF?F#4&?LLIZO>oR)gcOoa{xdug~Y6*%35U8s|elZ^8bG&?JfYx2>g@3UvO z+M!DrGD=i_@WxNFV~B>ySLDpg9F}m$bDm>mxxVW^F@Iz36 z0+$q^6B@IL-KPdHM(Rk~R}W2nL$%*zCVeWu387}23;NF=kJ34M26dWVb9r>9T3AqM zAkFZUfLhVD6^x&dz_n!^I-0=r#I_@PMG3RXg>08!ejG3Tta)va-TKRX@72e7#*ZJh z#!D~XO{%8S9xL9$Xz$Wgd@H{&V{Td@?5mrG zpJsetoU9ywiuF@`*x1u6yw>BkGNof)f~D1#BlCQ3a-?m)&T#6@Ml@5H9SS%SxtX&n zjik1tG+O`q8ZL;am}}W zM-ghDTfs%9sOXbpZ}k>@4{YJO`FY~G^X>dRTWL7~madfXTgrkb6;&IEu#-9 z7tRTbaSJ&Q_uAD0=TI-rise%0$=RCR3GFI`UvYF$;D-VXl-A7UO zA{fywuym9=E@;gH(Yu}ZR`3;m+3=c%;x~?uYDlaLrsL0Bb6DsKgLDltYIX+qmDlyv zxsTqTd8hY0_Q2zA&WlXPzhC+SZ%l3lq51kiLCO zwHt#7g>ETfKkO!es7A7$A2pm-ed$t|V93gZLaHtiS-pm1c;bQ!%q;vFp(}WeMPice zIO0^t;$xSJ>0@{c`Mor38vD<8m`{~%yCrYhoOLAj@C<=CQT0rId&}M@cK91ENAu$Y z%mT!G^}Ex*7!Q97Jd}1yi;)O>QRL69@pvFM59M)k82+|5Rl?KM|)qLm%4u*!1%R%k21g;AE(y&v9n$!Cmdi{JVL=0x#OG zjrD(fuy!;`zWW~6ZSIA!W9Rz=+Z>!z*!{Q04T%;`9+9#h;F5ctif)^@_9N;^wPWV} zaQ%gE2l~6euTt@fx_+l1N(vwliQ9yBX(Y+BQ1R2Noj8~)xGYGe)a{BA#Vz>|UU-_+ zKeK{|-)9Z$dl?ox6&76~JWR7tOzlQknZ|izSth6hLM#f#-P24?RkwJglRmoR{Ka<` zv+}qy2B+n7UD+ka{(3HN!tdJrlW%IiUmU(XbRNcEhz;#{#6$b|z3I?5oP1tQO~-0M z+fdQKY4KTeCyzsVophBK@4G6~4VmI!-cmfk2b23sVXPFu5em5pwNW&xPM`)S?sk!T ztm=d3Zn$y8>0Mj*>gvB(ZQ||~b z^}s%!ihaknH2gfHObwIijz7HbHur@(Qb z0`Awww41&mi#1MwUe{pz+HJqY%l2`5m6|@Pu_dQ4zQe9r{ao~CC^1bGN%r=s8?(>3 z!AQl-oswNEXlGR_#lq}?RWazUx0VdKUunO(w_LD>UioWm^U-XRbGZ+X_+IUnek)@7 zf`W)Bz^%>L8jbk-(D|DBiEED3r}uqTGEQrJ{g7douy>Wnu%aywU6!afx=C&OC3=Y{ zyQ86>GY~s%wT#YPoXvN7UiY3l;mb9XPYXL=_pJS4!X;&rOPRMP61;;Ab<)XZ0c+VAEQXc&>9I&IvsndN*dm;_~s@NB_g z)fR+=FHR>kTE#-gQpqMqpZ^|>z4mfHo5EfLm#|B1F+5eN9aB^bCN`x3Q(NbDnG4yK z(GyCGYkU=*sIe` z2?C;HUvw)omwf}=K5pCPBrJ~Mz4d2&p?C96Zo8E@uuHi6v6wKzXG+nBhThnu7 zNltQ zK}2`!LBoS}mgpINJUiVDXKTv_54J34x>qa?rpBh(!Cb397Uw_8pI+T6;eXPf!n9C8 zQE2eC>3_a%#exk64cK6Gi&iOL{b2ay)6JW4b>Osv^#Qv+!f{+w`p{Fox<27oSDPkm z_?CoARbMC0Wp|mhEOC^vbxW3Lbu(8=!l;c|6|J(Q*#-K#KTpXO`s!W|wWO_N+gls1 zvnBlS@($jsXt>d&a_fC(lnGUs&u4Xp5J8j!) zU9+E?PdPU7MvKNqF{ zi>>duRCd*=wT;*M%O@tY>5~)=$$`oK>yR9~iRFQYPBpiv+UW_>5a6$d+D-Udk~-V_ zsJpt}hS9zAJ+wRA{H-sSvd8&6?%VXYFA8rkdH7!n%LAp%RKL> zO#W6dEF~m`-VBmT)MKRP2cR@3?_7NwANcxVI?{h_F65#&D?u^REVMc-S5fK+)kStb zey)m-l{YfZX&h2A3pTM}>KKhKYVG77x+3u*Oh+t=-ze_84lD z-HFRbBz`UTWao9rvk?eLIS}3~COtG>!_$7X|NiT|xVke1Tak1WMneHBp#j|A#B#xV z1FSUj6Ltz7;{9r25U!oP@{r-iAREh-9W z=BYgyPv-Bq+MLets80-v$Pw?CY_p;<**h#dcXlcFxa8gKF%Pmb&cb*sVmVH$e7vHS zaO&%BGb?!S9Jh{de5_G&NMPtoZF>9(i9ln%&*=R~)~T0!HHK6|g;UPsTWUz3N0xg^ zudKd^mw6~BHf>{li11Wt?_64|1Zq0i>NDw6e_QxY%tGGE%?|J}o}rWi_E}2G=1NVRHjN z27l&#ar8W0S8D$qdeb6P^+-S-_}gP`LX@DDxFA*GEXQ(&iYGw zIdmJy9_g92Dq5UK^p;}a7Xm@BaRhq6VjthR9NbA4yT ze?E#^ieT!0WvjAA>ovzcHxERMhl?|8Nl#)~puiFZxF)-a*GO2f;|W2~tDi7@!#wtI za0fr^#lsdIU&6?6s=djW@>@CPbJs#eRK8@{!w09gT4;vXLdJh-7@+I?;AmI1X0 z`rMZQ?!xS6!9CXdy;3<|`<~j8!!t*LK?=YHh1`VDu*P^sutT3+VqaAjikA*D%-jEX z_2J|AUJKp^+o`e~yXn=|jNPHzw_3<@4@?bwNLMu6*GpnY_1(gD?xtha5K#Db?%H$K8Eec_ z6iZ(#bJ1a{TKS)dgIl(+9_`%ba#=U6Y@*qx^mEyE%kQgOFKd}){4F0Kihn)&dhifA zB1l@*$)ETuXJBlsZXvEtJnUo0qLt4_w;E8w`zf%m*e-a`z*Gyeo)X0{?mhg1kTA8v3e&NNxo>b|d(B z`gp^kFM?FzGGNQ91(yMNeGuOSmj&G$!DTTpX*0MiC~5`Ag96TePJ|;w5}74DkmaxY zb-tiikOAH1=OoU0h!lglI7+}`}kihdJaA{Atv=?03 z8}JTCf+>>*?1E_qBMyK|2g0R~0oenpGnjBY7>I(Bw)AizxdYikIOorRD)Myy{)XdU zb^1dg$g}r<%s!Y}PbW8jApcG2fr>u?3J=4h!04bJpc_dP^uA8b$ya+_zzCSM)$c3_ zJ_BKK9t0I{H%}s9T-uaG^s<0522Mfb5rF|=P=7|I{l;-Na2x_enEn4xu>Y@ZpaIUt zfCeQEZ`A(4wXlP4-m;$v&m&=hF!vKWv+LwH%Z4ESmf2UZFebS?uW0O3)1EDR(#4rn+Q3mO4SR|W&h z$^r%uAP-%k=V*{X*q;oj54DL0J_9GaPJ^!G!JuVe&=q0=8WfapK;fXeSTOMQHb7a> zFF6lR2~aY!s0S`c+FsM%~s6!UYZ|0yYS%09OFwhMG5)s5UG&JZ65_vwL?$#3w`)khr zX=*(K7+h}x1x5_2VIi&n=TLP>4SrA8?<<%x*l$M2Ent8tfg&Oy_8~p^r+ugtL?kN{ zXbHOirxl1P1Oi5W_NxIr_~hw#f)rt02OvH8Zz9VY=mn$`ee0pFo%FwNCzk+yeya#NH2g(`!oI4iQiX9C}7@U zZpp>{;XsMjbHJs7Z_WScjDwz3vfpQs|YIf+b?5(F#%jP9H4k|{Q#gt zzjFUNQDDu;4+DX*ThASX1J6xvitO^S>%IRQVGXtX+YM@h#kbz(zt5rGem_@1{s~F` ziMf_mQvrpf2~Kb%0u&{8;Njz~4h6OESC>N}P&n{G9U1|?s6%3{MgNdVkRQ<%23ZpraD2ahz^`G! zN^ph$LxaF2^n-i{Y^td zYaaUj!yf{T2KH?O4S_-cZQf801)%1CX-Etb`2GzvH2&|gq0s=WZYYOELAC+<{mBbt zdp6K;z~5}7LHJ+;4TFTt=YQ*BP{1c`q+ucJxS<^GZ&_hv|1O6`0eHNjE*1^h!vD61 z#Q+etk%of|{f2UQ$bA2ohC={9yOD3pw(+mIL?R&Iys@tAU;F_H{9O)-{#);n7_dv*&>j*8FvtcP3h~!k1$!w77(l;& z)&l4j>;N~?p#RQdBMtx8Izpm>6gHLvdi6(pBtIt)Pof|BzX&n)2qpr%2A4MR@qvsn k*|q6-yZQi#MZN((z@OygMzfTiHsM1|h=i!`QMf*_Z5O8%x&4khGGDkex`0h_aWGCE1tA5|NZ_ zMUkyVyZ1h$e!tTE-=EL>j?eA6&w1{0?z!il^S$?ZPNK#dTGA*PEKIcU8N8qhhJqvE ze$EGAii&WA`C)$|9HH(+cJlUfha-%g+=&5jG$>#SS5}4*eO;k~a=%s3^7AFbF_Z!b zDDq#cJ)>>n5=C^b|Mqu*j0kD6PZjT`NDC~Eev7i;pFP+>kh~Nd`j|jF(Z=U zwxC-LEx-yfm<&hg`T!QxeqPmoUUlL2zp;b*djJsM5ceb^;BJ+Dgeft=FOcK{#sl$h z3P%_aT|J%D{DMJ`NbrY8q2aOwG~6DBPzQqoH3Q&;)sm{dzJA~i1r_^U5YXdalG7*p zx|2QNpb<*rTAtoutZ;;uHxPgZ(Z$b|2(cVMCJ~){V8I#9Hp9OBvK+_eOQyB?baq+9 zH+R%FA3<%N-FQ($g{PENr00R*&4lRN0#n32$JvkMxRpI`yv%ETZo|Wu7Yr^tCpOC; zdo_zl%{*Rt_uG%pw-&qiu(*F1Zay2@vVUQxe&nH39oX5^+(XY>vn{jeWzfPo4y|gwkl-^BCwBaZvQ#+k(|gfA)Ns=oDovECBxa2& z$83u>cD;!D5-RoNDYx+ClSvJx39LSLBu~wol|fs%Wiz|z#9%wTunkpt7k+ht6ky4Cx=xq=tnSZ=tCT~N#6l(u_wBi}zRe#q;Hik%<1Ut3l4S#j^BZ8h=XksXX2 zdZ=TYN?N9)o7JQ)K1jc?Goo=O-qtK$=X$3}M3$(@m~y1~oIw-oLld>L;qP@K;ue*6 z2C*{Hm(0_03Ug*<#fp8(IF=1BJ5q84yYcYklO&hM?CfJ4p>0pX$)aq`*R-=0whB43 za{kCo(L62dp&_-SLD+e*t?ZuN(ACg~-r0wuW7E%seZ|Hp>VK?v8nN@UFD`D|i#sJqCgkbjGMB?lGS?T|lD6XIeXe)M+a48H zW(Pgyt2bPH3sXpcP*o9#R|*2DYsknNMiGn zDtBMq-Cmwt9+SFg^KCdcpNj^B@pvNcoQUt>d*>d1YmSOrxlZ*pR{A=xE7=&xgH~o2 z>zM^wqa9fDL!}Lmx56~%%%uI!nvUu-U$P*dAKQBs=YV^pi17+IV)EvP`=P*%4r}4cs?XjGQ+r>S{qR81Mvc@(9FHsMa`A9x%gW;NiPtY>r>-vc zAI(vlydKOV>=VuH8`i&IVv>EFxte+T?BVkh5RlF!;Yp0*$S@`wq&{c69%F3Ff>y+HAF)lJE+ zOP%e*U*G7f+ObuLRHa0>e09y4RGmb<3f_My(lsi&BaO#adxFoEZpaoX=k_XUT0d5b zsHCfG=l+A#mzpNICtTS1(uc(2dYkCXv4XD~4|l3Q-Qp2EI}gSLI4_riOi$>621 z+y#?%cN>3`Phwk>40y)}uX@i4ie7l6ytV%cZ*N}dsOh4#Oc>`SZ;k8qFCToj8y~1~ zw0s)DRX%CDxKE}>&N=~cQ#%2}GL+n-{PC`e^#^e=|HB$*H=5a?VK0PFJ?almrJdV7 zsTtcjaE>YdO8AqPqG{*+5<)*QTF^aCdDy@26(5F^>9Ly*|4}`yF=XX**Fi=9Ko#)QoOl+0$D+!?=$W+((e7B`>3ma&mFr%q7xXG z5v|C(oOn4|rNVQ&#;*|V8^Xdj#a=Pyan+r!RD`RyXx!PH36tbK_;h%Ch-TS~JvE%T zY8X3fv{hNs@It9W>D98Pn`|XlSWvHrr^{RRXxy5Ya9i%rn?88E^WKQrD<1mec9qW@ zvNXSrY`KFy(iA-^R;8}_)v##Z@Uw1KEaFY-rD#9dw&S^TO@nk6xmDV8%}a5cWQ4iX zn!YBT;>nrx8kV%@zL8UPCfR(jbf7y>F1OrT_MX{e`OUeH%iKHOwMSgC9ou;0PQ_JE z^Y%d2-S>*ew%-=?y?o^{;W%Tv!_EPI)yK0kg>@0|;)dan0nt5E*?Tl9X5?=B4~8Cy zEEZ8Rtdh-AJ|CQ!$sq5Jl=}Md;Ak|yrF3_P9}r>^NGtaM7(a`%16i+0$E&(nJ|-wYp{Ok*YaEv!l+r@5f_ilPo13 z=9y(0o{P05ZdpAtQ3LOkPBzz{DcM%|KxKc+TWJE8uC*xBqPO_rn^INvkIu)Y_f{9@ zYB!szU(+e-xG^fjaMLOyO<=DPzi*z+y`(F0`T5H!8mGV33WrzJE6}$O1=b>sd!EeN-?F~0 zql&1W+RmQ!TzSts!CoRUV=$u6ASmdA@bgJQlcdw#HhUF&(k-qo2@|pe$tW|MTh>OA zGqr*R8k`$N&l_0fs+_*k?c`LJJ({a;MxUPz+a{G8$Ik8L)!cEJl=H!InM)yx`TVZ) zPX}&i`mXS)v4$uGtHy-#y~-Z8f9e}~`QAC73-HGehL5pY$l!E4gU;~|_lxY9b-pUU z72~j1GvZoCRD!9TyQMAXdGl_Wid`>m-`j>gABgIaq=<~k!*^znU z(|qN~mzqYe9X5v@I3}B|RWw&Uj=NmIQ_@?o<*|NI*Rr~Y96#!vPEX9stlqh5jAGv4E;g6$Px?eu>4O_+bc7B95FVyfVYa-!7Ig~K5EZ)oS)BvSnwrI zM|3wmUd`|W@ET@r1zkVxz43H_q-5PI zv47vHO5DC&IGvPr!^)$))!FGLbdz|)dRKWzcA(|__37ml zlinJhGuqfSjT4k~Xfqj1d2_@xyhb=0ji^xMs?`iHL|A?f$0UbvK9|kOPtwzIPUF#+ zto6-r8JE*%<9K8K>}~Y3_=G)QQfrc7hM!4li*s9>lg~9&%-XB{@mK?hbZApG&>Ytkv3IK;dIa*GR@N36HLGBRI7Qp*o1)*)q8uX6F zV*zCU59qFBS{YA^V(F-WrPS>=RjRE#=(wCISf`|bLg{UxQC;>mA@J?-5$%Zm_JruJ zJDIY4#b>E1`^~277vTZz2iv)wQ?BtOqiVe8mIJI<;^#FJ60w;#EPe`|kgJc>j zTo_w&YVZU9v;DHKzisE;LH0Hd(Mu?}V%}IwD(6s0d1HTS_SChDkKSPVif2#eE}kju z&oMk9@Zr9QR71{)jwXW;&;7m*ftyk}{Qbj~$7#px--cNk6)Dbc_8hSY(c|uX)k&w% z>!VP3^0u{0gE`Xz&qJYpn~2V>@XD4hs(#WT{TsjaMmgL#wv#l*jG0Pq&%_vAJ1lg*;bSFuF&ml$vzA!TsDnkyQAbG68nb_d2(-u6XLur zoq8^9&$)%{jQ^&z*({(ZAR_*o$bOQSQ%~Bfty`rFc#DYoy|Yd?yro&*#Hb#6@S#7X z_#JY~{5#qgs>-fZh727lteF%%{=X*$gTb03jeBq#Io!^O{h=B3oE;l(b?SGXlyWgM z+nKT<4v)s2>f6kFWrP1FjT4(Jx6_zr6v{KV@8|FTO<3f|prZlgBYsgW;}rh2%Y=dzCu zb!ypLi#NXscayJ}HX&?fz#H7iERf25E)&c)D0bQQ|9(CNh%dkDQ7E%6U>z?$CnHu?M4jX=%LV+0RnxA3920)4wcc z9qZ%{=+L+nY1V_o;?0pau@<0vW6=FZSg2K{9kPrc+0n_-1e}?=^Xqh8HDbIKe1de> zann>?XU8f-f{{GQR2ZUyG4Oe;$~t5pTY7_T$Q+=>slC7129)MV(zlHX=}; z2ocXdCE%`#9d{>Lev>jft=&*Gs>>CcI_3}**1^|8+{1?1QIl%XmD7^Q(x!=5Qrv=m zBDz@~S-mOniQCi2TA{I9BBl?uQu+ot-anG0dti3w#p^H*pMoy|UkjOizCPtN3_9{g z_a$e?)6K?37v4JP+H*g2jbC%W&|4t`}f6f)mJMPO7M^19UNr#jQc zYiVtQT~o0*C$v?kuBS~yD3y69qcd>Gn^d-E%P|y-o!w2_xykc%K0_J&O^kF*mgE+x z2ye*x=Cy4t`I{jO73YkX`|k1J6=0anuFOK?o@c8;)c=Kb?CN|vBG%`A(k1!X~e?LB@ur? ztpaJc`4`2s=@Q2J$M0Zv_G}5&WN6Tn`^4{GvHz*7VR9>3u=7(!dDkfdch&MlB%Z+_=IQk8GN-2`!n9hz z=F;&%j^yXtrnYgUx}V^E;kxpf;m27UVJZ@&;?*$(+<#A}lBu3=vkK$z6qntVM++Cb z51a0LUh~z@OWpLkWKATh1$pd2T;)qWdya#r+bbN3Z56K_m9e!+I~-9R6(g#A=UM8U z&>efSOqra3_7XGtlB7od!yCO>pVU%LGsGD#WHC1!Bm9^^oU9)#WxYG5gjOxx`AR~q z#n*&4okMkt(^NcnevWabb;ApjxtV1&{QkJO4%|{K zPllB*tu&mj;3E`f9WKxyR+Wtw#Yj7U_;jWUTqU^MVjuhP z%6`2kL4C}E?I*oWHie3w4L-{^);rApLKDyJK9RBCRKi*4f{}kF(u_Yw;?R;y?8$>k z2c+^mf*tg_3>vpeAL5kGElN$~^}9Qlx~!e-6VIyIEzPVp=$-Hx9m;TY+Xd~w3D4Zt z=O4XGK7OO=^2X{?84wjXwXS6jHg)G{!KQAC`-_T@Lt;&ydYbyw%HRXG(*!!El7z&R z3s%hRsPG+9N}u9|0%X7Q9nT)Bwhl>XA6mIEC2VH$$uwWaSM?jhjrP# zMS|4Tw18TM9jEWoVX`*39MeTt=D$i7;xuFlZAb~eVI9W|nlbfT>fnrYE9R<9FxXQgu|(i>V>Y9YI!>|8h0`Go&z?Ec~>p7xk$r1Tx~jMv>K@hpeq@Cuu|SX}<$ zUYASD%r0L=6sd4Y1v+7{>)3s20%N3sVjRA2<{zpvmKyi2{04-YZ7=FSemKhH=pEc) zcFpxcX{D&h&cP(3W5Vi%*S=v%JB6=((q*C#N{(&y(kqOajnC)Ge7ZPM_Fn7S5Vy^T zx!&A|*(MJkv=Q1iXI7fvR$BJ>N6!AF;CaP-Nb7*$U3wD9x zu-#!!rCu_p-V)LMD__p?*v%hXc5an9$x}F0ThB(5pT}8F-^TlN4lm(Bv#%;jx8dqi zh;LA~_tkx`Kl9UI*pQ^U2vJ*s+TIbGlw%i zd;W^eY5sb0EgMzN-5cK)vLO286~eMOB7K1=u*wxt({TCet0}ca#zRFkY!0rCMFm&$ zQ{KK-4EyM=Nz#h$kCKxYO0aoE2pfNNMbKu#PPS~^N2ILM^2OYau?&RW$C<5qv*Art zW`_!nM6KtnN;9tYAU)XB%-`53iaZJXP}us8edb5WnfHCFA=}hgoQ)bv?JN&EksD(x zq%o`~w0Y-RzMx%d9$Uf1r)gQ@BMW+Sz63S%-grOx*tz&>wjDxVn4>dcqCiFDsQk^= z#5;E|-^mU;W`*WCS0p8;e4RGP+?Z4OY`SByKX%CEO?pfV|;Q_l{w@s9DklO2!m1;6u zby%7nY`Dd8vRpJHEW-Wd1-Q@VW;l;VSz08ohCs%K_zq}SAv(;{PK6&TFi>tCy9%XF z0>?=tBYn?hvu`5dZNke3`J;kYERemM1Pem02+5tQsxKOId{9N^oHv_z+>*h;lpnm; zAf<{qq`$njzt&^4dghhh(Fp-nrCKVI+5>b^ac-Aggd{Aos z_wc#`?YcRhGDI?y~&(qF$j`Gd*x&q3Bn$F$jquqTB9{NAGvd>XfjYh5gwV_S}d8;P4~ zH@}4sEij1397`_P?SIMpW3L{PjqbY|=Y`2Al^K@MmWO8VCpl&(CW59y9sLe((HYv@ z7B93-AS~#T-OBjD=X)y$&nR@?<}K!*A3tCVf|0TBB#U7-VuFHKVL-W?(*$5-WSI)Nf&t1B4@%Q$@Exd zvu84S(E}f_o4JTnz>O^==_>D)&GOtMZ?h*!*XFuDtNVZ0|KYy#bHc^Q(Dnxcj1S+K zJ^PGT$gZkt|B=)BtZ?x7?%91#p4NIDOcj@^-Bg&KnGv2ADDBw+lmAF%tW>}e8nq6! zQ8KQKp#vxG_NTX7)rGvh?#>gXcWvdp&gYN9y*!$xlDM=42}EF$sI}RM{Lz#qw!>#$ z`nuvpB{dO=rJ~C{u*GAMueg?%q5?H2xo^coR+t1=mmEuJ`4gYG&T>66tJ4nO$HMXFd-K1MbjOgBBB zb>GWBw;S#;SO(bdrTn#Pf9j&#SxJ}R3eHH_S6-7M;RSNyq>od8ps zgfFI1ebZ$YsWbKmL*J($neDWV&tICo>h!qw4PDHKYo_n!*^@5M7LK2p*|1SdGst7knQ3_Jg4HL z)%6``FSv|}ni~}~73fXpSIVO%Ywdy)bbdyRhZLj@$KLR<1J)XlK9Ph~=;1F?bNd<)Qq# zOe?FZ$7@<@Z-PcWPlAAPo=b$nE**Uxmjsw+1zk{$mVgI zMvQ;Vx5(F<%)c4Eu}fMoI8~B>`m)WuM&VSn?k5M$XEvy}KQxeKydRnTG{xw{Bg?K2 z)pk3c>0E{l%C){@)en=Mp)yb^NCBL||M!kw+EkC!3}+b*fZ1Qcd}5eXd`+$jzPEBG zAyHF{&G9|z<(bI6N3G!%R|lFZVmeh$kCha3p6=EeZj+Dbh>NkIi9!?2yic9DTb;Nktx5xhZX5|V%W1s6^T|ZiigI&+ zG@bNOt)7$nCo@aVkA6a)-)=Ly2=9I*5DJOABkZ_jC8JFRmfahcbJ6|dy9cZ$KrV>1LvlC(~5 z(u>I?zI_MtE_dQCMpa;R-DYZa%F-GR+sobh!D8li`>=R7DZBqV<9wOBUQZ99`0~9J zYY*~8&YGO??iNEM!&*zsj1YmF>AJIxWxXd?nlsZd$9^*tv-A+&A77W|78Q>F*dP^f zG~n-P0Y>WYW8rnE9|nUrgN}taf){INsJuRr>I{_UwwMMpP|0_Tjw8t&en^ zkU&^J(yQ$k9nNi>wB4~RTBbG<`!>DPw0W857FV})iFWsv3TYUf38#`(8iHH6zx(~P ze7^tQ%uq|l3NF5yQ@S*#4t!z|%*DWsr&L<1ozbSWVLP7$cphYoB#A|y$f>#fLcAy^ z>%Ee7TE~k-z1$uF1H-qMc1#@};dCYuwTw@0)gP5IidpZOMEqU(kEWmNPj=plE`32szS$G;_`xE2Zy$ zLThcJIFOVyyGOW z)kU~mN|$1B@es=kIUiZqa~XV`*U~Cz^Wh!4`Rk)p-WK4fKrh@n7Apoi-lb8q*At<4 zfe$}xGZku%>uBqz>!g@Of9q|h_PzAyRq?!#&0ow?jHUS+&PL^3362UFS`QYY*yEFp zTHU)Q>0>LB{{>425DColPOx+tAC53LN# zQj+nay~Mpki1*H7#r2f)n%2tZA*L4W?W1Q4TRMcEU6EP{)0I3UWE^#2d%QH!x8me5 zp7rL^apWiFh_qCD4{TP(6|zReK_51B!{tXsEyr z^m-hZw23Z>R+X{3d-wZrx4aiFsk_fkt_(6hVA#YOr9!qDzY?7Zm$IyR8DHSh_yU6p zdqIEp`0ha;t4S4exUP#aE-Pp;pPm`!~

yts`{$#>Jd2I#rJ7 zVyzfFiR)`FB9?q+Urat~HbB~Ef4700SI?|f;nHN1p#4D^@vVqL{`Vj7v|CL)3zt-M z>c0E3e+aml_##+us*9&@;>g4Hr3Q1y=8J>AD)`6aFKQwtzDAc!O@01GyWi0^f{F~O zVH??XsJcO%E}J^oWGrViY|RhH)J-Ygzft8ef4@kbAu#mYB-~$7vFG4U8M5ni^!eQv z`6udb!Cl6CTd_6#7Qk6t9rA z`L_8i>BhNR&oA!w=YIdKSAFg+{cRD&7gUB#4f3pG)EFFiSz_FC+kEwHB#!PT>qC8R zQXBW?=|1K*))e%Q2cL7eX~C|E zz54BQ4p*2PFfG>U5>{3;ykc&zjD;?o;g@kS3%IgDMYH9aqynzK|Z9T|S9CBmT!|G6dfK%^(1Ue|c&& zY8^tunGo2(Zg_T?`-jTTXhg74_O6FN?mvv~wGgbgJ5!z{l(uA=m@o9wOgSV?CZp=v zVCMnsR7$yYz{$IJY7j*Ub&+lnr|LZ~+})6_xIuleH{V2kt6)!muGy~AXLQQvDty_W zyW9>3kXKb*K&>a0OTM7@*)+l^O8q&~k9_-lYnz=XW)?|}k z+520YEWiBNkg08+^0$0|DE^htYOD?=00*Jw6hQo$GcYmHu-L6ZJm6#zXy)V_009e9 z09oC`i3BPm^qqd@(Ks9o0oBnU2Dp$s{mFhLI0|wdrf`H=pfi~g2?X^4Jcpr^4~VBx zTif(%eYMpX8fg>~je<*K!Qll;794P+(7@-|gWf?G22Ny>CneBE28o355BU2h2L;>! z=8Qr74~VgWOce+Y()0x(Gf;HUzfnoQNHBzkr<)rQ#P&eJJhpJ?6?TBXlM9G50^{?C zf_}V-Zsea?lBc@|8IA?Z-o?+y#|aMhc?f?Z$d9(<^OL9_UkzYl!X%1#;=6Kpn#@8g9oaJmjewE!-1Jta*P=xY} zIp|8!AE+lRbc2LM1hEYb4Z4CvnGdMD)da)+oU?zLTFn3kSDQeC5rb+th%3N3R2@=- zU(@yL3Z@M9ixEl-SYS$^KtzarNDuyLA1VbADar&|g6{um1!4+`gi)UTY=8hhGy0Vv zMOf7VNDux@6j=kkfOO)wM9~LGCw?W0uKd&qNFRPr5MPvg&<3O%zY;WZJQ4@y5bz4= z0F^|L19JlD1#oS@r(Zhp>k0`4%sY&7h4Q~8%3QBLhsykuAiaQQ6}tZW+4t{QSPi%; ziY)}@8wio&`}E-`2qu{V`vRwY4GVI{a4@qJnmHT|Z5sj23JxY2Zb0RK z<{)nch7ZWFRR;#fPATIKbZIr`0S88cLi2=!S)=5hh`W~|a61OQZ(a`SJT^Dj37K@csl7FS#U_c_$_FR!VJ`W>nK6Zwo#R|SO-E>7S~ z2=c(H=>J&^c=>;eVg6?^qznW;oB)23111Z$5aA4A$z1^S`Fs13y*-`bK^Pen zQU)aj_aKw~1LP41pMRdo_>tTtU_3=(|QhQZlL{3erN644C?j0+kD{NT?YaC11YFx=q(&>+kS{ouYo zX(*f=0I+Lmcu0fRl*0oTxRxddEz~vTEqgRa(|ZtsNpa9;V_W#`?Fsh7QAR%OT$B! zeN8zCxvrt%kpQx+rJ*5M{byY~4l=}l(qw_&{zXGW7=2ASNNd;9WC0XdLz6@Ots8O> z^jTXDOZbZyIXq-^q2E7i0VxMT#x*nov?*CjgYf5C8Xj_O(C^>&2(o|48A3U0c>$XC z*Ic5I7|8Jd(H;tk1JHIYjR1C7YiKCcU-AQz`inml3JDF1nfnP&(Q*g81E)Xxtx k&=hy4>+1$w+v*MQ3jt&&5}9HaQFwrKVWOg%Mq04{1MuiMSpWb4 diff --git a/benchmarks/python/blas/results/unknown_float32_gemv.pdf b/benchmarks/python/blas/results/unknown_float32_gemv.pdf deleted file mode 100644 index e9a134209de4cb289d7cb7616a2c75d472f40eed..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 17563 zcmbun2|Sh0_Xl3ta_wu8<=T_w?w4!J9wBS8mhD=~?vk`wB0_c|C8EeuN<^|`--)PD ziK38_B8mSzSA9NxbbtS^*Z+ULre~h>%rkT5%$alEGtZpFj5M`nP_kH?jZz3F_VNbGm_i}+FjO<;AesDA>U;tPlf9aD=W0U_t%YRpZxH7jF9pJE*@q0PziR?@I*St+S6XA^Lgw`#OUD zK>VA)5e7sjH+yxjK+qx*{NYh(xEuiuw}l}zK&QZ)esIEiNi|PTFK~x~iv1}F=<#pK z=@UI&NUm^D3Au7@H+RriI6~VU2tbqQ=;cI&SoS0N674-;ftgKK!xwkSv7es1J=L;P z$2IX_Q%6nHNfhhMw#!bzOm@4}kct6`_ggH6OOJdTU%ZpY9jz==j-79+(|n>Et<)r5 z_+g>1p?0KtZFc$H`Iflmy=(GHMOh`Um_1kc&my!HS1;DGT;C-)KG$UT`TbMY%$Dhn z_c-Uc;{j{S(?}&jfiuy1uiji}X;7^cAq>hWS1DH_F7 zl_85sOQWI)RYp;RZ3g<5@gYsr37s|R2eV~)YO?|Z&*|)CTx&WzyA<2xI>XfRI$ScO zJhr(>)U)~wPN|AlV6`i@`{JRVoA)0qFMe`bX;;~q6?^Cv)7eM7HG<;e8s*JzrQ2|f zoG*y1*Tu(-9Q{3jAJE6DIQ zW6n*uzu`OZhRys$pdoE^I{~kR9eF!;1=C&GnxHN}Ta)WAEW~qRW_Ot;=ZJ(bF?xS% z0g@0Ed5=Ln=rp|xV(IRvcDqxfu+n?RZ_9DV_tw@{o>^S(>OpvU&NbgOJdK|)3Ft3f z6dw&4@_IneR%k2BDe}^~JiV&#t4^eUM5{eQPL#7+O?aP3Z?e?L?BdW=++JFDqX!(}`TFGE=Et ze*10N`PA~$@qH|>R8IHm%aw4c9+~HHIgioanbf9``njuX>GtPa;`S;D48KwK-m_2MAy9j?EZm83rd9Ttat-JG-8)0nwnoR0 z98G*DjN&=a_S&DNh=bRy#Nyns=#th}teIz9T*KnHn0-pD>WLjYQ+BdZi4-{G&)dV& z65508XAvpKl7rXz_@5v6JYyv)htokUxCs|M_*&_H3aj)*?kUfwS)M?jFCUb@ zeNQ^QifCJ#HqJbYU&X4_PY+!`iB~y|T=hD&I{y_h*E)Ug(HAOy9n=yLk@a1&k_ZJHNTMJ{_^D$PHwPyJjjJt9N9f&fw^*UBu zJ=1R{S$u1_B8s_s$%!_Tb|5NU*`A|oF=j5u4sp!&p7OD|lHp8bOK98Uk0)C=7WS^L zCcX4jhxTWxQZ=R9GokL6znzN7g=-%QXQ?)l!^%6ICW&`#-ChUpm_J_4X`} zUl-4CkKz^do+?`UfeJU;>7%~gC~W$ToMPcZ$4tq#?dP}BE4(m|c6~5BZ_>8PX=Ea6 zL`<&d(aN&6yfLOiKd_I)kci-Nj!6{}`cfXMlf7#jj>lDK4v;c?zl^m$q+y|1jMDV- z4!Slb%i%1akhgX@)EJs%LU~*b_`7K^OjMp%cCxiSsEtkXp_zrsMWiEN1w8YK+ufOp z;hH_B^jl=uG|ZX`zKdsmRb2Y8y7*EvBjd346?%Ls<7-nx4#InkZea8|X$E1s!03JY z;@3i=`1n2yq`xSd5B^kdtv_uo40~lKyt_-y;T@~r%#Cxa7j$0FFSHO3BMX+elk5z1 z%&74Sul>Rui0smh6yg=IWFF z7BFMXWS|wN$JUf(q-hxBSA2qC>3#Lpn**U7Q6qLqe%xXqrGxpj{g{GBHQJKt2h`gJ zr=&g3xoWta55S$ji}<`v^1=s{n1PGlVnOnPR>G+1{=!yKn7V4hyoamS+1GdA@9<^X z21C8x)v^9N=QUA}Ud9MaBT#RwvX(zsWLce?C(L@DJbk?=uIG+e$#KJCOxCRup&|*U zM^fW?5r$P&XGS`v`!(;Y9uqtC_?Vf0Tk8++vOCPFj99M|?YBOKcJB}*-Q0G++We0B zGg!%!Lp|nNcXMv|m%}s}Jks4Wl2412(LD64Swddfc7MS9&fp7JiQ4q@56eX*pKlrs zPwi%5+g&MhcbiD)9heuxllk32YfXrp2L~_h_buF=Qf;nwRjle@OqQ0v;ME~EHL14l zuxHA}ig3CgNyr_%g~C4fe?G69e3o_e08Ogx?tM0K%`1(o6%!AJa!HR2agVm0?62sa zr2pc@lXW3<+euFD@=1C=yse>udbTZh?Bt1UQ3f5R3B9gyjgLPmg*4rGRbaWhtRu%? z(a_H4)^kg~+k91rU%WSWA+=u0=WrWy>6(ftjELwHcMoEY>PhLRV|JA_C~vJO9=-Ki zg~5gD0}Y*8VOmhflj92f5}5(H34`7TKhZL2_Z;iyL0uV&HUts>{t*v%ZRV7~W&7gBQMs zMjkz)FwztBE&alZe~te@Df`qK51~A^ZO47Scpa;`=Ne|s(tO?buljRTz7#N@JdrnM zb2!k-@5N!u7I}q_5oqnOrY}9Q!ac8)`UAeD_UFcRT-J+qaW&?+A{95p(tPhR@3`gI z$sd=tS4!n%&vSkLf}(bEn>Z|a+UbIm5duC)nxISzq{*v ztyeZ|(5I#XUNU(1jHprlsz^w)C%82)zbl^ID;wQ>47#~`S2(**AS%4?l$Tl&Be4ro zsN#-O+3*taFZ8}kq-Y*VaZTxg$Ksd!WRhayiSStYlQng=5ve$WnzrnSLbw;JPp?>D zQc|Pq$;YP}jsyfoF95G^h|kMv30FT zm;1QSZMk;6?`MJ>A3d--wlqHO#XCYIw0jVAn>6a?a+h1aRqXnF*kD4tR;tBv>}jU? zym{FECMgSs6HJ1f|Rc6+&bN7@6n5$Xqz2b~SJIV2q&N$g~7q>wskAXirNo@|e3*l0`H;B~YiVh(hVHZBbkFG$!oY z>mk+=xAKhWqC1hgc+F$JD(3@j!DYB#n@<~;L+bnkc2RB_qWGZ8t+06?rlVXnpExia z{X)8p?7k*(!bNen!-l@{jU17CzrxESNOCs{(n~D7X4X*STgI-K`oT7AChW%LCm%5V zB{RW!pCgI}au1y4|5_^|{W$lfjuxE|cWr+M|6S=^zJcM&GgM=?v!ND-#Y!`*ZZFM) z^td|TbyDl|cqkSH-?wypY)1cyyGv-mDzbBjLUH8xiIKi~?7&|6DfyS}mcma{pVP2FALcjY!6aKe$%vwxkv{ zoiMzh#xlEhTa2G`ww_f)&*sg?MosR+@%z zcP(9ZE6$7eQ4_{C<-KyLZkY^R+md2vBThkn6jqyf)c+PA28TO@G`dr^h5g12ZP+{Q zRG4IhjjBpyN(a%^joJBauqAi1Dy)`+0&kcV z1myo_Liu6`Qyp@<#+-ir_A%3KKgOEv2J}p&&?=!{Y7{Rl~-t~guk8b zVJa?lZXTzJVZ|NM`<=OKGsFuxo{0Lq)XY{i*nfqktbh1*_$1ZW_xv*yIQ(-r;P;|j zZZj;#>#;Y$88sF&)%R^#3;t5p|2}l&taV2^DU8#x1EG0{G?Q|^r8zlv(bduLoQ%y| z67K<1p1A0T&bHS&?9u&R)fts_X7&kBXL6QzR%d@Haz20iAO>|#E}fm%ZqIRFmakuL z%Fhlb#=9S~@4dpCdk@)}u%gUr>euTRnXn>q#P^teZ~D6(J7fxZii!GtGxoRKWf(ui zsvU3tIuKOy1<5w|h3b`>iW7w{LwgPz1_h7*--Cj|V9k(5y|``c&Jp7O(F_`nj;;4P z^*e*59ZgL|Q@6(B(YUaFR-S8Hy=gVi(jMa7VsebLjshppPS6HUHlrRgv89lDP@goc z-~O5|W-%&m7I-mdP?e%BJ)O(ihNQo`zJd42i zN_=!}emBtiCVus0EVW)-iL20;*S9?fW$mg>6|@|j4)_+iMISt{XvEoH+3<*w^{cy< zhNNG@(+@`)x*3k8x^oBaU#1bbJ}R!qZ2o*W|G-?a((~R$L8~E>AO+b&8-E*QkK2r% z;_<+OA+hK~y*M=fAQFu=Lds!`bx;$bD8G8?JMPRoY=q9yC-#ICc8;~b-cPmVm;%dr z3hhHXjT_pR!)#)nl!FG1?jTKjaag<=@-EgKG;ai&-v$e@sI){3o|7XrGqH2W3?unRmX{YuNu`fK?L)~s#{_1@wyGP-&-;W{& zj~_2M4g{S1p!=31ax%53jyTSh)MP=cY+R{_IvxMPJzFBXdUsZsyUNi>&9vPvucPYg zclR>SRL;7*IVj|6J9^W@F<*OsOMfksfkEQ-@0#>3<=Btht7m&2)eQY$IUr=lMnPZ{ z9P*~r>6bcFCunQa!fvRVUl7`%Q`g(7DU`+_y4?XdWLj0;9C^AD@$*ttowRP@`E;f5 z53w?_*?ZWeBi$kEd$V=N?Y|kqP;kz8`TsqD7Bx&XbK7m7lKNGXpUe57iofe8V^F8T?GMzZ#$79$zzg@%`a~+AOTf!GVU@l5t z-E%T4kXw4T6>lw(s5}85Gck#3%gah`-|Nwy3{!CKqn2i`eC@r3R6x7-9;kl&-)FB)0gsd9$)!Z&z9XMyshGJiM7&=Q?k}p=_eu| zM#qY&l#ir+6e_nR$(G9V@0({pU)iIX-*u}mdr3VtoG$*rr)-ADrwOa$h~WC6J4_G8 zl+kK;MBhoOH+vfMWU#BT*-s{L<>wk@Sw4Pc{Be2_O>@*MCXd09Z-KsyM%_cFaoY#g zEt)gQX2tOrT|_x*xkgVVw%wG-FA8eyX>^_ zeXoZt{n<+RD<5=|LIhI=z4gnc&$NCuTqO-#b@^BOZqHGDYoPY&A#;g#X=5W=%E+-i&H(OuaoT+u&kDD;4ig-)dIu&11r47Q&bBJ7~sUaQ9PwL_MfK*F+d>;CSpl zcV#g9E(WtlDD*{D#L;QhfO|DxyvwupVy1FDD2youe1yVm!UY<{sWDTbwo}cW2+vZ5 zs|NO%A7mL`JEHe2pr1jYE!f?dHbm@v;Q3u+eZwrTwD4Rmg3cv!F7pkas1aSoZh;8wX7<1IH~%kM!Lr2+E6=l zIDwk}c4AWMB?|@?)Hy+E<)s86Ke?Z~&g49QXc?5)_I&Nqq_C;+l1aX-gSu_xuAXeR zPs-WH*ey#ReI`g>Pxq^#6AXVqjmh5Xcv=@-ng1?Dh~ogGKOeJI?&Al?92(wO^(n=U zc>d6KCW@b9igQvQ*%=%OpWb$Nr*twN&w;k=rkZ7HW3sXvD%fQl9V(SPNKeW!Z@ks{ z`rf<2067hr`PccUKQFtQr`d(~>3y{bJQA*QsUSVzW1@P{n1P~jrvNM$ttcaP|77C_ z&yeN49VVZCwk|icoJ~Bv8?D* zc9YB>&(ScHpU@VhNfAHHWkI*!al|N2_0gEGOxp+zi?_tt7rkbgsr|>@5_Atrj^whR zSYvh(kIz5R=Xhn2!SRQP5(O?PKqm}#6RS@xV2m_T+lOmSy+d@y(&E3A-GWfF^=191 zU9aiw+ygsIZ#cEzsT30t9ZEJlEv!*=V+HFgDtu!}m!2jdC9dU|UQz5!LOy5Ki_hbw z3)(lHb6I`;*q7IpW8Br=N@!)xzRFc~_v8Tc^G`uGI#qMYT|#^vYepf`#8F?En3{F( z@fo&PEc_**Qla*Dj>(42643)|%jdam=1wmoo@?<8R=UWDQ3-I)Aw>q>S^04t=J1crBsbARaaV{2GqYq zwdr_szoWhQ$Xlr?-CIw-d7J8mM{jlpeG+^24soa?GUF4yf0YxW`tj8l?`4uA6*3s6v-@~HHZP+ctDjoHC8nqt6QT|cy29BJas6!o@0Yh5N7X8 z94}NAIi+y7C8@j|^OJN~a7Jj3V{Ol#Nl*K&vbR2}em4>PJP`NX_(Mi^+#ac80!LpQ zI%mtae^^EKCi=kWBPy%0M=6EMFXr|N1Srf?=qv@CDz^!kia}REC?30&J%SsocAaU- z`ye$1QI+blJ9HQu+qd3h3@#JP42^UNz6AGRZGv-amZnGXX!2)nP3VAD6=K8OZ4~&S z00ZSWv8qt+B(Mh~x6|}qHC+)o*DAc|!xtU6W{&Km6(|h4CL|wLRbM=2*Iq^9m@^%J z+MLNwpC7p2Ahn7iXrQcSpvLv}!|8W=PowsB_3XI9@{pfr^l#R%6ae97cHAh6DwWk- zc89(+UFWvGRBxE@)%q0LEtM`z5)_)Hg#B=v0HRt6dcM@~?V5{MLq&pDCX|x)6H%3G zc&2VAd_Y?M&vW}lZm>&Fa-Kk*Zkz9NDt~{RV6M1}hEr?zrM81prP~S>O&imXMID`X zBu+f&SEOy))on{CaPqkx<8S7_$6LQA<*V`N_W;Y})BBi-uxCYneD1H?JsNgOXkRY= z$h;T@Hx=I?pv$1+JH~ITOA&K6ZcC@cw$_Ypug5;c`?+ zTRZ>ut`DXo-|>n$RaI@Pxh*3_LuaIB4%)j}>UGdpTzTlMNJkc)LF>{Kdd;sx~oj2ic$4j zRHILZ$;Is2W8Qf^h_@1^&nSq90^Hh+tC+`WF%25DR~e-vY<)r&QjrWc`;0DLnYnKNwB`eK?AIG6U*=enug(;`iJ0EH zO`X@70&5g-3<|fIeOE&Qa|4ob_$TZPJlNl&{AH5xRQ&4M;;C!=)NI!RTkl?Xq+v#d z=(g}hrIiV;U}IIK!!ksI)SHmf-gw=R^J?b0hg7UH^@Z-!9M)MHthq?Wu81L0s>?A|0a=_hlW_o8C&pvGnBo*^i!c z0jIaxH6h6#f&Tm52ulF~p>Uf3K)ARLCeS^Nxr|T->fWBaN`@D%ohPjw6T3;ncnR4i z@tiv>Ax1;rZswBXn3$Piv5KfjMU^grwZKS@+R#;>5tEs#_d4Z$4N<8gkWf58b0xlY z)Y1d1$?XM1EM5c!nC@KJ;in;1T=cmg|EnNa&Z3YE~Y30*AexL?1#RzNqnFX#O> zEMFD3L3~op{I!XaPfQ|6~y{wio(erJBGYO@1=eGN-hAgO5yP-M;wxdm^{n;w6v%lAt^Kinns_@VOiM9Tq=ghV-$-`}&;gy(fzp>r@=GtZ40eGFGHtt>*e_HJC4fnLsz915xRUoW%Ew`;x z9QGzI6MU>p^8v#HHHR?MLIf`Q0tc%@^=_Q$4)nw9M@)@PGlFEPYIZ|@@i^qAMBVTqPO&zkw z?$8Xpb0CI(d^?W2{Wx;gILWrb`(tX?@$G%CtGl?{UpsOo2JgFB%Fg5_Ul&<-^{{-? z;V&En=DJA7D;d&^j;`W2gDytZ^`90|Cfen86L48lYzWcT^^SD;SOvlEQ2TNyX_gQwsr5nwfL4u0?Rt2mL@lhA$S6;H2o8H8XoL9Jog-~ldw|DRPdaW%7N-#m)@jN0dG`(jUj+i7M2@0;b*^^S=$N(c@Jd7;BdI4K=qEVzK#jpCSk zu}f=2EkrEw5}~PzwD4-FxAfZTvlzKrg+1@Btt=g%sO*|eZk9&B53*PweHm;CW5CYk ztV}*-WXBR?6PxLZUl1a4(v8gnD>Rarx=<}bUzo2p@X+sHR^qIU6XW1dZhghflzsMy z!1b6+{L5|ca`t*~(J{Zd*KoKpglqqfV~LAf3-qT#-ml&Me7|OBNRWRvxS7J>P(W$u z&0qx?V_jb=we1gkq!!LO-+bkmCUt&dZD@Nt9W7I|D#_~2wU{ip^r7mv35Bi=uP~_4 zS2X9(NDX;dOsJZ{bsddx*#Vz}76hL8T%zwx8oa~kT*R^xBRMkFG=}1FsTd*-u+G@t zR!&!IE3#=6}s2TY-ZPVdB{^0|MbnP>d5gQF}Eitzpqdov9pe(AVW%U zQf?D=$RJ*qSpzJ57Be63$Uldvn^da3RpmNYTdYCnAF?t5_f}Hs^%0dNIZee}l)B6} zUUv`f_@+;Ot?hwIBT8*FN5pyhcdyWe9pYFoK>+jFIyI+hlj?d(ozf;Dw;VcWRx z=3gw67J4caN8_r>oR28y)J4r~Z@RPfQ1H*HmJq$i-w*dsr0o%Nb-Ba{)H4reV3_ApUZ@? z^7V|(b`H_31{za(`;c5e);)gY(;a@sEO8WCrvL)wHu1#)b03_TG(zIR5ff)^*bVV_ ze;hG!f;j`bZ>W!iINw$OlD0(0@JKVN=Ee^9U0ZjU7Hm)@thWb)5aV2}cEL8CSy zG@LPk8LZG}7P(ecMPm?whB>>tR%^Rr`pgCDZ6eB&h0^CulJbS#nyLh)%Vt)M40RsG zPNtU0_ys>GuSOIn)_P5t@smpV9OQ`pN?%#S^JakO*DFyF5II% zw7j}CYoA%_-|_*X_}8PahZm7!h7juZe#Bon17l-Nb16;YQG0WLQ+rQ8$c_2>ku+TG zeZk8JefwW|G!6$tK<{W0{TzMWyh&cZa1>;0K{Su4zXORJyacrY-1-4~4-f*RzOm}{ z_tn?Keq>NcGzu<*1sf_TIk4%DLIXfx3t9(F7}%41-N>;)vPdL^O2FSgIVeU5FlPip zmOxk#G-p9Pla?ol@__=H{tcx1O@bjb-JG3?ARGybKeC2HpVj@m?HxgA6zHEf6sP1) zbSC}E`ntKelHgci>>Ry3JnZ4nmYtuwy`L)_5k&O$f`bqh&^6L=FBpR4>PrN9XAs#0 z{&fogc|X84bgF_#t_9k5bM$ie@`OVtg4E%1V9B}m{3GD`#?%U}2Fd_kWe{XLzCzJ88gzC<_yFlP@ZfGOz+0;IspL>R&u zjsU@9fL%900?&`Z5$>on}Wb=C!pgdr?`XF+%x2%K|sRP%IkCj!P1rX-?=Ig~N54pbO?+7Gj{Da zjcnls!0ILBxpd)DD1u)-N9Z|J z9s>>_lCKy!sQmw1g8hDu0cD{C)$u#wFesoYQ0L{bAc2Yq@^CqDY7Y-)3Jx?b2PFXE z(F7bgxr>A2frjI7pb{{3<*=ZvJYWC`^3WA}jsXb-2Fii=p*jh`XW-@6Y0#D28H^kZ zx-=I4s?9I4p0`fOU{E`0<@ewIpO38(qKT^z70 z7!XVdJ;Q;kJP;OiB_}jg4o?;WbcNa{4;myuzylB{3~Cbx-XRO+H*?UHtUpjoSm*`` zi3nmF>Kb$fi98-qbL$C)`!#0&RJEP~46av!20aF^;UKO6=g{kr8vGuv-&ZhXu-}Z3 zYrq0i0>w;1>_d9+PyJ9Sh)7l@P!n|jPc0BrNF>{kT@aOCNCf)rt02OvH8Zz9VY z=mn$`ej0jNo8o$uu)K8d~!LngtvTGRVPzr3Pk}OoL1<81nV}ANC$i_GTMw zKKfVFj&L-&Y1l8H0t_R$f^|PdK7Rz7BLXE_r>~nxAZKzZXQ1-Ga*(G2-3MggEgrj)6(Fo^uE4wVv~UgYVGfiag=Kwvx+v!Lh$uf?Or= zq-5&9eir+yH8SOwyCpXZ1C9x}TXJzfI8dVX9B`@N?9zIk?1;&`B%rJ7PMo}J0q#g( z7D1(c`(@lOCV;Dk0~Akw-yi7EuiU>*6j(Fz!vJ9H)^o?Cp(H@S1JQ6|;{eKn%ZvEe4nE$gFQWioR z_5di!gBt&Ea}amGxcQewsLy{9q3-`hgoOILGwVKoJun@i=jH^#DKL2<@d4zf;pGo} zG`T;&;y8de`}buH5Pc1S#@ztjdAoa&+}#}D0T@{nQWhl*cO{X${S*)gkAI%YdilCY z!Vm!FIr%$+Xzo8>^LBEEJ3uhW)9H5=peVTkH!n|3DC~W=rUDv?#)AWO7$i7Rhm^CD z{6i*zzC>pjWKCee@%{P%zlH-d!5RJ!4FZ?Y5AOMkh5}{^bm_k|oIC`SHqsz1-#{Y( zd$@rHj6cw{|CR$h1BLq!4cy>2)`vu5pgI5Fa$qxkBhN@A`mZ*?tC0Em&$}or8Zu)W zXvn|&20#?hiT{?vp~2zH|I$Dk0QGL9$pM_Ofrdpw1`_)HBWENQ1J~%MCK@5gv@v5nljHaCS&2chRj@&CUb<$V=5#mLrEn=WuC_IO0o$%W62196fzp;j$)5{+9=|eY{CtCh3@y97#ktX1&40kwhZ;d&9BNPnfK^o1?S4w<{d?}qO-fBx=%3RhyZ_aNEBQSkAmC7WHrE`z>@)R{5qwYx3>@Yg@nj{RRnbVr{)Za z-mWA!IOv4jxwg9}7%N;>+Y=~2lSuG!CPE?yko<{`Ua;WY7Ml_8opNl)XRp20*`mAq z#_8eXaZ5t*$F#VLP$`EU+Ii{e&O{y$#m`+QR`@>~%Egs&)w*i?Gqva7T~aRxi$8kz zYW}|6$<@X8U)$m)zh5{=ntVO-u&sCD{`h3k@OPgt%MZ@3Zr&qzuC?m$Il0KOs;1c_ zh6@X;OS1M$w-wizjrJ6@Xbo5F&m}FCS2?E*4;04mTczq-I#mgC<(ea{c=O)5$Ij5_ zkAm5xOOXf{jqcWUJvY}YVz5}e8E2a9_hJU`^!>qi?vo!@Z4*|d#CIs+yI7k8O`ckL zM0~jRJg`r;cK&|$^S7&W(+GjgwV4B-`xdO%YVNPx`}Tpjq#<|xU5cfs2)9v>dc7By z;O-rP&IvD$2ycB55*F)N5>CwNITbo$$dvrRxy(2#?RxTQ`y#fKg0~mr&@!733ckAQ zuTl_YbV8W>etT35XVGwxNkDS&*)2R5i~ zg4@xtU6~;4n&T&={Cex-0H+qN663Px&(~Jp^-O*!F`nI1SAF+htw5nJef;*0{BOC; zx-CaP&OMbBK)l1Bqv3Q9ZWr;Udt5M{MDpb=ljGbEz~nG}BJIg3`VU$bS9mKkI3+_gnq)og%8B$S6HHWZq%QZUs}E z9dA*f``+^*&*ad>BVE%`77uGk^Co>EF|PZyv-4wWlG|XDEbRJR_D3pD?%hdqm8G#~ zv9`}g#j*B2W^AFi|HG=t>T1#0L8fKP;P3BC*AA|I{`|e~Le|6)`L#0vdU6lXWsTWC ze8QR`DBmfzV@o%0HWxR+UjxIQ#x1iO7Qbh6?~8sJ!Aqx<+Jx0Z#wM=r5cJIRJEYz2 zuEkq18^T*lU*jSFAS+Z-?eOLSzDLbD&exloo2K-;{L&cDwol0D_zE4DRAoK8y~wcP z`*Lk3abdn9p8w2&18c`tz7p6BPK_GN+Z}y#r|RX2>?!4K-J_eWrd!OqyI^gJe!k^g zf%C23+s4*J4iBff%l-V4+{U`+c|B3;j#!dzn++r{ZC)w911bY(ANF zx$G)O)2hY9v3n0Vwmj}F%xT^qquJV1WUpXfYHg==Cw0&e&KPV|r%mgnNvg4`9C+Q> zcNWhbb}dPbHg2BA@$glvCh?xA7u!-Mgg)W8Gd)mKlE-VT0vaT`b`Bbr>K%)U?@}Dj z9}3x3;o=9oTZ!~eJ9Pi~t@NCBCe|aY23h_!8om;vP27nzg16>PXhuRqXaa?CbsDC} z^YkSQO%&fq^oW*cGaQRz6x}Hzpg)on^VTHaBJ0bsSK6IXjxCxmBn|5tcuw~U^=XZ$ z& z)~JqQ@{#d`K>_P8(vk+eCd|Rrl*b$EdyE zBZo{_&RiG0IM=gD&!rJ3c+1?Rc9(K-(JFEIMyloFyI?D2e3t~T(7_Hd4kH3wUF}Fh zZG^Shl{}O@?p$Q_rl>P)#YtwHr4@9Tv5Z0u`V11RVI~&d!A1h?dRkGD5;-%c*z)=< zHggu}=ZRS1hcMCx^Ith`9J*8ac`B~le?|rN`2?f=FrSQJ#R}^K{m9+7(ql|Jce747 zzSq}W4xD(nz#EOf|47vKd81+b#4*9Rf)uQwo%KP^V=+r2tsD-|qiP-ZACsLrkziZC zuN-S6i>%vZ+cX?K9Q|qdnT(oRs5A_H*@ODjzOx1?+uTyMh<;x1iuXQ@Y;0c1Sx0Pl zSe5eHrbMZpl{#eZK^<3ihO5)ZLh6m*lvJ9^ZccSCD?^8oa^1*+Uae@$14r!pog9=1 zyk1Y=g~mpNj!4BDoEGP${@zJ1c`=sm^~1=lk66ex!ts;kd5-LD8Qa}$b(>0TEN%vA zU#yqJ@Eg4bsUQKpHLIZv#LnT$|`29m#_n5`9uFL(Y* zyd7;+wTX&g;3KtU!J)kEP5Vt9b-QKoO*<|#f+We8`6GYA;ur6-GA5V*S=Z`N>gDo}IFl5K3p_7+_y z8Ll^TdZVQ)Cw3M3^-LYNc&yAB5;~eH&C=P2O`RXKw(&eA)IGb}m3%j`bGt~0ZoJjr z3l}^r@rfX!TbMr|iSZP!*FPG|&&+slUxowo-CYBnxq67h@RC+M@O00Gr_*^YBPb+g&bapdXLT*5|zBy(XjOv8!7* zd)DejeURi1c{(hk8@Ast$vK+)Sm|A?^kU^pldYY65^ddesIY6>kSZsKOjK1|A7A9V zI-bKM?{nKl=p(0Sl%rnu7OqSI)u%I{Z2~ zoigU`uNBS7*r z2{A}1$+Kv>=YNw;G2^{`#8gDl*$3~@gXL49mp(<84i)Y_$+vi4SgNt`xvmzyAlLoD zF1{O5g*%5vs!vdl+fRpE8J8(dZF7Ha8DhZM{id77kjG20H1w7=q0xfzBUg{$kWEZC zw?bLW^2D=&2F$)Ayh2T8Q4!P17qND4dD#q_mi1a`-rWEAwY1&qNjPSwj41&t;RCFc1?tjW4d9Ruz}snnPyF{1Gi0v z)la=@(mud=Hf%QHQSKMsikEwyY8R&Ov9N`+BTaXaOyzi&bmi(`11Ac@_g`JInrA&M z*<-5^RB?Z);pUrUi4t|CiN`yi=-Lcx4t!F+`8+^_)Cmt$krSlXbU$+-s__2jra&)D z?JZoT*wqM~qR2VDO5U77ahG<_;x;b{OB77*YW(SM7TPu}bGTkqXzHs-l#Ovvsc6B< zkXU9CH@jb>jG`i!6Z@7Or)~Ah(RQjTG3i}IH+N>2*P+&2t*UgoJh_gW z3LA0)976o>pzb`_)Wx~2`)?so(9Z6URe?wRw-Cx_y0+FM7&K-L8yLpTH-EF($I`Jn z9DOXLfyK7{4kH(nz{RIg>C`^q6=H!7_6Mjr)wy__Cq@*x<*Ms!S__@OIruL8*-6{3 zY*GXVp-WbCKWQrcR9kCW!jc=oI8xf~HHmkxIZt9-t&4qLmm_M>r!J?u-oi2Y;Z(ub z?REL{r7owg?L#9Y<+9m$9V8C>vn(!NmY*I;N%Gw9*nf_fA$-0Zs`)9GNR$Yl;c%TX{PrHYKJ=)heFEd5v;TG)MIKY&J>0WZKeO16ddlq zCk2hhSRhRLv76akqD6nR3|jUss>*J|?ocU$x%sXPsw5l=8!@0htHQQR4=N7xeK~9*t6qqprVD$owKc^s=C&ERFX&}r zbOwf-T?a-S3=btcziq?6OuE6QU2Ql1@Ii58I6RK#-q!sK*(V8ox#^cj_q2c8F-So| z=!oSH3DJlRcp>E!18?BgHI`JOWVY3ReR6)jaCc4C^OGZDBJWP`e8Q!5D5J*E+MGWL zlNx#Q-ACVwb90KGeJ(C9Yzi`CU#Cs>b2`lw9DCBOZEr2s@;=f<3M zPR@J%OWou4?OihA7_4r(%d~CLQ%gfUAo=0@gH63#JTg4Fg7v@B@?RVkHDI=UGE%&E zwoK_s|B`^sFiC)d?xFLnKXi}Xh@Ikaz=I($sQvv|6mA~^g)u?Mp-pv>6XD2!2C3_w z%-nW@k&G#QVI|$;oiFsMH+d+qoT9)VIzan_zZ`l4>*O|IXmTB4-jBuLED$#^mVmtp zV80m_W>sy6sMv|<>Sk*O!A!&HT@H^rG1&_K%Kwf7-DG`t*SbLbv0UjC1fqa3aQF=i zgaP`11i~Rqe+cxCJ`DLHiF8ied2-+t-Jz{vBI8U+Jb2&1b=Us&3YQiIVJO9CH0r;l z!TuVbAI;Q>wd9@pyf$ps$-$s1a{G>m6G3>hEb_yQNuKxR1q>Q6<8p~ej zafD@#9|#KX+R;kf!;BWF%e3q*Y)xfq*TN|)v7#P}Y*Rqg(gi+tc^Z39aJ*93tVcUz zaF}i3fdoyb`R%cH;cQ+dUjx3CZt?o|lznf|(f4|<*<&U%O6!TQI8$4!=#)*X4Uor^ z-h1YY<=5@L5aFqEC`L1Lx9f}8hKAk!%v06Vt}piqy4jCj_97H(>$eTw-)dx(!mzB# z=z1G7!?k*{@9w?fZ!CKSEm$cij6y)(kU9M@XPRVfZ8}(ys^w`xZr%F+c1^*|ExQ<; zKtQHbn>FBd%}N!?B79#u?V0e_z$osciVDiz}idEd+J+}HjV2u&e4iQ$Rc=w1D+VLD96 zv}>?(bh+05R1sq^0S zHWGjLm)x7Z5qQp;rB|^ydZUD=Z>N^n-6dqrYWZx=od{%0AKgB=oh{S#B+r=h+A{s> zDH|aQ8l(`_(Rl2CPp7h(fp?24!^k9uUEYI_XZwzr?HjH8=HsDZc15Bt7TJn8-kDVW z+Q6R8FRG*JK)J0_(J>iYo9rVowQ&g|Dz~3y&IsPNC&^UE^XbfOL7kJ(Ebh5FkpD$J zBZ@w0@5lTtjmPn;uVh0ThOckEGp>wMyT0p+TFgY0Fk`bDo1MY4JnHNVF1HFmq{h{rf8Ww^+p}vsZ3yQ7R!2s`&pZQAX#fWtvYdK%_onBIyK>8zJ!Tq?aNNfFXsM~4n)kbZ-8Rf+522e@ zLT7I|X(pWZ3{bvH^T=?v1^=jtz0q^_+@t&(Xtac2_|ux`LmyOwD(}tv-oCI0{kFi1 z!kkjTM@aMrLZDHi8Z$MLfqM2x)CEpsn(OLc3_Rtvkhs`&(OQqp(>?rGA z>glZ=TWuVMuk0|53ztlL#7HWzY`)t4qVmn7AUO@`xfjL9KYewx%yftvFj%w-J{YBv zbtOA!CPh7D+(=QVn;%w)Qk0gwb+q}tci7iGU1lG@cPxG3yT;P9VwX!ngA_m!egnr0 zS_uuT^>7B7H48OH80@LBXetQ03ENP|9447(B5AkWKB6r^n=X2Q(~4f7@XRDp_3pU8 zbjLGV7GJTGPx~!$GX@X4C+qDKe^$tLWR2NXG^zN=0O8!y7Q#1SB??kffKF)426mrX zz!_;G8Ak4#`-bU`XC}?xyb7Ua+p~rbdtNX)cm{Ww7ddxcuND#BHJoOATu7s|Xa(cH zOQ`6J9wTi~dSaW0L21HNaxuq+r=MO`ENB-!;j~$t8MxF_VA|8!j&I+Vf1b1E#?c|> zCm%!XbZchQdIWcNt(k;L5l8)DB5Jn%ho@M_Soq4rCBq%Bd&opg6H!BJUr%w_%^qKJ zYLf}&DxJL7z`UvWGW$*1cAlp*IB~a4`)cC!8ZXX;cn1}DUflO?dFLiHGs3?kZwRyS_>-*}ewyt(#R(-+j3~iuIg0^_-|cAr0R8%#wuQSE1aEXA4Cz(Dt7wX^tRb<_n!kG3n#;ym9IS7YQL`mv=;pnldu}gkKfEz`7`vyZ zkNX@;Eg#S5-=bkC0K$!&xKUJ9#x_eCZbK>h?#%-kzA&Nl4e4~7s$I8=QD936`(ZZ# zM75F){Au6}noH-yg+o^+lv4GH$m%uR)?Q~^P-gM>NPXcVHmOODBZ%W2b3M+t-yOzV zD(;}=(Au5VvF~lg<`PBo=4_AHLmvpli93UebS*o2?eSNf{Vv7_S_De?8uq0xnvO09 zS*IP>VJ5;Jmj>+ge9`IEv|UX5Y}pL+QY_q9%v`5s8vf)Xy;#EW^b$$mbDp0E3=qsT z-__Z(CgM~-Fom`Dm@iDQO}%;*G#TdLbA(m*$+q@n!R>tELFeq&UOrmByXF_I*msMk zeCO=T!l+73FUa0XDtJrggU^pLR`=23W_if)@I`WD?=?T}d;VK}?I=TYC+Rcd~5 zm#-95NdW{RaT~BMtt3S@YC$HAqkHp(zX(&S^tqtKaZ5gg$DU^OgI4f}TK4dvr{S@0 z!%y878Ks>qqj4pyyu*28*(PWLLoEwmxuuyMuWoTqCw+3m1xRiwW9M^W4oSPnlebNQ z^Z9hagn!XY@3MyP;=#rHPNVps0gP_(w7 zyZl*bfJ@6v0-K#8E*qF8Vr~9h;aEm9^O5M+-p)7?2`$;w>mp12uusQh-*7CAKFp}p zz+}4NPi#R;T;0JpXn@P{m0+*Qd)9>hy48zBZF!2n!WQ0N=rDX6?%&3;ov9(4Jx+*8 z--vFl+&}Tcfw&ziO`kM4l2e%9VArg^&joC~W}YaP?Co>!<$>HQ%+#zqQ*!Er?X4mg+!U^1QvDp;?3ZJ9CjY*N@1;K3*J9?6DX547+}en*(TcBy z&et@L7CF$I5dExTn%4OI{?=_G-gm@CmF@T#v&Fs9O&WCPnWUoa{lXS95L@hYOwOE} zy6E`u-g}ya#Uiu$S(dc(Q>8DXKTvH}=XIeV8U-AK#BSu=)fms*gkT!^4m$x44Ya!b zJXPpz((1{ww|RUtta-uhH!c!rnUP_7Z98K#Zwjnn5>%xka)d+FTM$ydIK8k_YL%Y)^UGv2d7$Eh4z5agFYzrQ!aPyj$k>;?c3HnD3f7@o## zPWTp@{=OSZ#;5a6k=8s!F4Hn)Av(mKa784?YZy9AWf8_jER4%kb_rM2=;60rF_EJ& zb~9u`=N1~gNPl;as8khrE9~eNg}qG7<@cAMc6^^xPV(Qe&+qd2ZtU5(Dzu)< zhkLpe*^Ni+cP0jdYg1t}wE%b+7T>d;8HJ z1o51VSDbA2H@I_TJ28&19W*yJ&k5mK{WdrAN%6!gl~lm701DGW0Y#y~+ou0~-HHVp z3|g?k=o7!Adg-Ha@BG!PareM!2m4)4LxjUCaarpJ2KRRWiLQ0%LJA#6WG6~~Uc2)#`ahrZDBUqZu8 zCRJN&ols`f;kzCOxcf20`isV%EUY^}CRSFMzo2ZL-8GhKaH*fqXzyF9md+NvJbS(6 zyzDG}&6La!$1BtqqU%$XRCJIXp;Ne6uL4@pN&ZVB-OU^d*V{JNx#T?D_tv42|CD(A z{w$i~3(nNH2~7H--bDqhO^2TIW+x~|S%g)G=PS#2P@m%z5ahZ2sp?9`87*rS zix4wQmW~&vN?W@GpXEt?4A+y07Bq>=;!Tz&dRK*x;I`hlk#2tJ%=Ng7h{Vq~JvsRg z7T5{}rtFDmmXLk+s)n!q(xckv1#$OI7E&P@DU5~!Rzd@~zk%g~_Xb$$!*|$mc&PWt z>=U?d^2+_KSDtWC`;pAj6m2Ymx=uw|+?paS-%NkXCO2`I7}d~L$Zy3doUd7rYH5DUo6$RS-Np7q zhI@TtaAdCJBk49PTC*La^3$i6@{dU0po_ViopBn*XBo?NLhVzYPQvle+byi%1JgV8 zeB)zHl0$>Sp6W8;k4goZ3M`;^BiY|R-J$hNEleaO3*S;hS{VD%Uy--^I9~3)g2X#p z8*9P?l^xS*tx~9WAyx~d`A2OLTQIW)E0YhI*f7L|lvet(r}*fCY*WkNDvi{wJ;*k} zdFJy?JdFBZl{oGvim>ygwU04x%|CgN|6+VD?)m071$(?W>6u?vHXW!Acwo;fJ3MdV=5v(9>s^?Fw#!%ZQxe)1cd5n-Ld1_*9n4y!N zZfl$>$>wEV`~|qw{<_!6C2mb)Xk_>p?Wq%z!(LVssuplPf(bT1=u^l7|6{)_#_rTd z*O^>OSytl3pS^7vNAkE<4HJj9P1#>l&Q|LvwQCk(=Qe!kDD29-CmSVmGeB7mKB;Y$ z>6qkg$%|$?gNf1=jJkTZu$M-SExuek0q4*0?UHOFDSBT*F>)$E+UsDSk-SI$lvU~6 zL>j-npNtr{Z0XL0MI1G^se9?1s&4&vZwT#29v#9gRhSEbj&qcIJBG{_EyC` zd^uJZ^XgmtwaLll73zZywlNfRNC{5LZNLv1CFwD1fQ`>mZXHgIkx7qt; z8uWo-D-&>EC8d79T{0x+xAA8r&+dFxUkN9?9FSk@xTDgHR2wajw#p1@d(zDo(1=2A zzOCKyM5&;jrOl3k%{f!xUOimMK5^M{-a2)muS#(=vF4`BLFIz_*eQmV>s0$gzt^;d z88j{*7@WwI5FxOinhSd`WQJ4B-*(Gl%Kz%=%F(luzMKmy0~#~aw6}zn#wZM%0?gcy z_r-!2I41qKEY{zUVrgz{?J?vuW2$~JWyiq2>p~N)Iiq7(VZb(t0P@!z0j4a86u46W zfpQyo0wK|l(?1%=#z(y*p@X0St_TH;()-4!nzY+SIrXZ3zh{D38YgI)B^ zG{NjSv(#e2*XAlA*)q8`&xX4XVJ0(fN(Y4Axm_n)mQo+<5);wjo^^*RM~O;fc%ax+ zgPXs9=#u&F>(6LZPFH!ej1q2zW!@H?jDN>^m zh`pOOvHNvxcD|sWRyon^Q?XD_U(cE$r^3jpI|dMc)Qn6` zH7zAIiH9651I-=11E4VGA3)M@bMyxf%NjcVsH3o0m@M>+CNY5E@9s4 z%mbZB_9Dm0OdvDQDAfs2k3`13VL6s;pnTou3!XA*6L>!WL<%LFn5BQx2q=+ z2rg?*B6?Xu6(h%B@`%8IFsNTsvVP(?TR08@Qmg*|3+(^34>Z8p7%+ht90D$nmWSie zI2a!62IPQ^pg%++x`zc9C{owgP+s&^9rU6_EQjY4;bJ|pj1Fee8>*|=^vtk8p*~4dV+rc(+eaN0s$l6 z`_Ta&eDd_OK!&hx1CSm3UdXxzb^+PMFNJImkWKt7WLx=R6OcXpS|GW|zd;|6ZTu|I z$Z-fPm_r~dWCIimULMQ|WEUW${aSw7#Lp|F6fo~F@)fH8Qpj_?ejlRwr$BZA%_?;L z_Y?GA!Js+_QDk2T+&54nIp!I{kr4U-(a#7@{xTQTOyFQ<$!!*J6tw3BZB}qF$)E-U zo*KAWavS7o!IZDpfARMyay0wH=c9hGCcsf-*03Kj1vo}>2kUW){QVJNM+8Q+-oEZ4 zfu6}!F2Lk})SyTOh7X!Sw19#a7(1E971+{x%?%Em1i8%}4rYy9^8kx#z2*tbYrW$Ssh5V^@+kwZRqo%_G3#Zb>b!=N@;eCvJwdmrNV^S&DLS4i?tvazg&8X%G-IKq)o zM3PlQ{ZBFw*8e3#|4%Z641_iu0Z@_$J^m8rAnATc^RI}|nEz^ohW}S1q|~2-S&#Ya z`PQ-q?#>XL0*ez{ivKK`!aFj)ZeoC66UN%_~ue4Sn3P7qA;cK+D~ASE+!_wm+* zvblF_DxeT39QdFPjR0TNA@Mfizvv{`pXdUEya@~hz8`-;T*HEu-~#`r4FZ?YAGr6A zHY5m#VDkTH!&0EbLgxDi9S#J8KilLW_5DEy`NiMcz%Lw_#^2}=2sE_je{Vx0pr!JA z8v=#+Qx*go0r`^O=#U^}{xvqV+}~qEBO%xI`?DB4K()WO0UjW>{Mm*<{7okZxxL@| zL!jlMMfZCf2HG$D*#>0={n-Wx|7ychpu31Gj1o)ca&o&g~{QsasLmuLfHXOi1 zf40g0t$Qr~FWW%iPykH)@hnab^1Q$Ez~LbW`Fk5s-(R|t!$3a$4?6k3Z3Bq+m+s}k ziuv9NP4}U3>tdu!9+x~jJ09^2QjkMYybcN diff --git a/results/unknown_float16_gemv.pdf b/results/unknown_float16_gemv.pdf deleted file mode 100644 index 7ef57cfdda1fa63a048d1c8cdf4a53e1bcb8257b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 17615 zcmbun2{={V_W)cW+zc5@rfaIqcjlThN60)6nXVx-m!!!YndeAlEQ%yjh7g&eP$E+@ z6v|LiRN~v`s`u@!`~QBP@4KGe*=L<~_F8-GwbxpEpS=>&Q&ks7N?>3@B_r^gH83O` z0r#>$4wI9EOB$W@Ccq_C?1*;mUe0hyJv(QD9~=b`4B!e1FoLHeged*n3+i5;L^zs+ zAZey+f1KbzgiCGSss$1c8xrk^1UP0Bpl3%U5_~=3Sm+ieY3O3-=<4YV$NhZj>*Zib zAi^yHSygqw3L%gPm(=tCEGYlHs{Fia!mWQ}2l;mgAig2)eF=cOP4*=X2!3Avz7C*2 z5dQ{nNgaZttDUk}AfSifxxE`>+ItznWXpi|&YKRA99Q_0iQ3)~?gwBH2*J^m#* zZGxvW(FG0~AvLb<>JIt}msEEL0#GG5csUXvmi>sn1UnB{V0MeeuqT%k^O?o6nZukK z2k%@MI1{sC4}VF8iw>r@LJdJwPtUVjheti_diOl)p5+Pa-LO)^E@4YRVu9I*nkQ#e9ZLu zq7lv2x%tIok>j7eztBh4w?$1iH}-m$<0E@h(85e=+KuVjA7M2^D#EHN>2>s>PZ*^> z%=1@TPSHn&6m;Z73sG^LMj*o{ip`C4B{*xW0|R$z9irc8IXAz0x5fQRtZq{1G~E!6 zBg5BwvgPxKxzp0$FL7Sx?mnh_%4dCXyk+%~0{5}F=`n_L^1St&`}eDqx27@Qaoc;a zvJ);eS@Fc(kfttrhk0#dQ@K@0{(|aUkAj_*6%ysy!~dZt^9wzV}s)-FxWE z9qsN!M^#X3FR!DX+6emGOWJkG*3M=|`^%RV#zSZ7>Q+B|8Pzq-kUBAMcwIq;W$)s} zH!o8N39E*Mu@Oay+e5(OEi(OCun+ ztx3GjO|98E?4D!3m#a}=MC8h~sX1SUK7rn`m>mP7zUS_a+0 zuOkbQ06$q&V=qsA*UJ>1xSq&unQz@Pd(2s24EwU_@FHKk`gU#GI4#2;jhJXj={!+3 zR)0!HCOk*%+_8<*em?m6(@SfEgWofBFTH6kJRU4~|NF#l@4bFORZq`<9DTD|_R*=} z<1kZ_vO%rSR3asRVepSnAHN)H+R$XT(c~(M9XrH3e8j17wrTLlMFkmLELU=NAA-Yg z_nXVzd^PYl4jD|R=;QZ>c3+%QgUgpw9Mu@*VCWW@s3`9@Pd9K}wOjQ(9c4WMvkFKr z-%BeXXFzu?`OyAT@;gp-OC-F}O5RiUWGVag;V991tk#Fno@p%y_6yt&6wiF_9?=-8 zt~#F4aQ&90#7o53XtSZ) zw^F!}*-Ej4Qb6#ne%aoUvz2d6dv|#pjVRmM_Yx;~`x)YBzg&RL(U3IGx24|LH%FRZ zc-}P|&1*SRzuV`dyQIW5Ly@Px2X@493iJf$?-)dKHhPy2@7b4b>vDr|-)BsOSm{*s z1#xF!;o*0I)kEv+`CkrmyXfxHc&x3wd*Y(6+nZRNy2$cDPTIRY%OgMbUl9=H<2hoG zOc&9E6e@~2HWW6K6&6_gfyio}3J(>#TTXMz4R-w5%kcaQRr{)*`V>SFxMHW2@012T z_cj&0?MM@^A+<7UC6A68T*arA#9x!I*Z3$6r%D&{L>D*;G*ej^&qwmb?cE zXCmKE%+ylwz!q3E9lb;KOysm5PZ%9Zljg_|bh+Otpz&a*)E)3HpQcB1b)|tx^BI1n zI?U>alLaMhJ~4VmCod)b_(SIr(1`HCqNQwzIKd)TSsG6-)+2_D^^mOt^Mj5na5xFJ&Uf4{u%-9kl-V zhrH&52aav^bhGNXs5Z0U4v}N;H5a&Rswa6epX3dO&AdI_H<-e2EnY0Kp=zQRGr{G&NCZuGtXe#5Bu zSsQEZ{Hprtv4*Cs^;e@ydm4;`RZi)mPGvk23e$a7yMFR!U$LCF!!})3V+IOtVrKty1==v|$ z<*DCY?Z0m_wy&)JJ=^=E;)^Q|G~W8!6>LXB{H&Vn)GS&DJ_}zz{+yS;>2C3n2sf>? z*Zq+^h0S*_A5)F$UOC;Bs2g(n!JA7-)#tnPZ=u49Y(`Z)UpoY~AJ?I}AhnQgJqw?N z&t;h(z$Ly|g3XDKRV7H3H1x}xu(H0)_wJE>)o6FuzeE!_lRQ^0FL?Qol!+i_NWiBD z`>N6Yu4{=o^u&Y1%eMTk0~cNXEuX8`kHtn!nPpjc2)L`)j!s;p6A^hkb2;HUF6QG6 zy0qfU*M$$>AB$0W;e7hztLyGRdY5XRC2A}%DGjCPgqpg**$!W2DzTbvc)RYYc$zAl zi`Q}=n@w&~idBzVN4GrV?1`9Y{h*!kd{{Bo5oaYqc}YQGQ-h6#@>&kcLt?nXmuC|c zfo0EQRmN#$R1G&Ro`@Obt5VOfWM`XCu{BC!4U6zQ5+H4G)RZ~uz(KtuuDUk>)trON zN55$tbon7{5#(u}FQa@*(~x#;ufaFZk00Lz_4lN!w|?!zoFH8C`_LazRV{jSw@S7V zVSe6Vxyb&(X)70VUN7TPo%1ZR{TgH84}?A1_FdppJFVbirot6$F&xomcHa47yoc%g z9lq6=lF@e&_Mg08S2a~~n@`_A@rtcDcH;43Ls{C|dWV4J3-3}nTBonx_7z!Xx6CxV z&TCB~D)y=g&EafYr(OuDM1Rzs3_U8qeRTf?;TtsbZB}%r=tP`e6>tbN<5iU6gCg3$ zC!LaDtJW(Ky}mVPHv}^}PK0 zy4&KY?57j@w;8o&Y7|wQ#txkiBCL$;NqzW2)r*gH3U$O%I&4UhipPQ1hLFYc9d*Wq z&FP)mr3)CCyV32eacPc{p(iqrge$Mh4Dl?x9x?lzg{6oH;HadZNonz($Gm?Z4Es_> zk+txt!`{q46H6NrV2&xBPHDM)94X4YZ$3$SM)s;$r13fP3O$z$wPE7foV~Z?r|As> zPqlGWAz(RS3-4mT)IOnYDOZVp?U(CN=w0}l<|GyVb56O%cv@s;O3>BP(^%uv3sWYa zhzXN#YKN#*Me18A2W6E~(DCkbsM$pt9V%Sq+;*0TG6bKjbwCE)tC3n>GglnTX=$PN z3@fVRZrXy^5A0WuziYk!hy1u^exSk#gdFlrF*w!JMZ0)-Vt$d9I%X~-iIVZ6(doG8 z!%VK+k2Ml1{2ogcyTz=O`_C#Hf1r}BQ{C>bO*@;zG^Anpf`-7WXPaxYOGhA8?>GxA zJZE1nFg9pn;7VwP#|QRmt1cGZ(J?AI59^^;tX|A<-*4|$eMYUVCq@h_u}?gO!MNh;19XD(bt2o=?vV}L{58F+06F23@R!QJ)1F7 zS_yOJxUN)a!d60cGtyb8*z{(Wq5ktb#vFP6ZWn8G+CRm4s##=HcXzt>%tkw#X_?Y@ ztG|EoZe48QqeGNwpZoWS${>2X?U7VIOf~~2mf~*z2s;-ZZ0saLb+g3ixG~4c)JS98 zSqY|L3?XsNVd#vPh!U6J*wb^~Gv^wCL~1c@Q(E!V$8#R?5b8`=dr5HCoJwEG@mQ+Kd$+Ur65OxDr-RKt zed#{4AiFqWf4}pMuq7Y*<81XGSkNrb`7|Ff`x+$hmn4*h4SePrIVLr`wvU6C=&l!} zm3%YDsJY&^l36zWopsn;*tJW~-l3o0nhP%c7+KO+aP%Da=LP|>rh-=*YSet}4bMBd z?}!y}^$k~_r5LxK4>i**m7Ck``pP6oi>>Qz7o|3bhipmkU2}&fBiaw_Px$&QqPq6T zltz7>8tHwAF+RDEzu6!nY+gPeYc;cvS*!W0W-H~}h7aFL+C9!~Z1Z+;t|24&-F)c+Q!j=rYnBLrRTceb_0ZRsT~Cv`6K)^lN}cK4z-4-pd-OzKALh3`h{7K}@{r>NkR$#9f~Za~TYytTdq z8HsyXe3~R=W!ddnc5$As+jQW@wlrO<{bb}v zW`&AF{%`T2u~<`t-tEe5%-61|!``Z=!-ONP6cwV7a*6yNwZE9T2tdzTpV=6#jV~D_Z>ClNQkL*vd-zWLp}GZ%c^cPvP*g~m-lUN zUGAq6r}JgTXk@rlCi6a9krTd*pFdxho*zz5bT_r@xxBBS0@0PUCcoRzug5PcX-(jm zubW*@=G#4c#BXwx6102g>~6S=)4hvRI`Q~(U(l^j2&TnP6k|#Xj%2zF?Nj_QC^+2z z9uzbhV}#J_!R}yoiroJn&7flG++NY8-4!h6U}z|qzC96#!iGKH&5^U+d#CETou>P? z8Mv`FlHml}J^F)_t*D21Tsfp3l&5qb(!8dQU5U^lH*Jpej@P>vwF4F_={&n;i2#t%6fX!<;-&q-Pvi^M-R1s=X_3v zLumi%4-V0YEm$G>WGzo%)>T#%A|!SKHsA9-kVhZA<%@8!sXmv;rXh6s(*C34`spKqG>Z^i6tWSAl6pV6rQ z%?}dbcs0yj;dqp4#zE)T(GMRU>|vOzo_C%w=5w(gz3$;qq<*CBc>}$UPBP6` zRa)mN%mVxRx$Xz`1K$~s@)gqdT*Az`I@a@rP>}gl!%h)AI zV-FniPQ`tB($u&1pBJU*+UXimL|qPl7bhN+O5W{$>bG#yR7p|Mvi* zag{*75mJykVZii4L30h7UMLKP8tqqY8;ZA0kKUeWM5!>qlR~9Aw?Z0Xls|e-ybo)F-&8SxKMi7xBF28>%M!rsTdTC)I}p(5oaDJ zR=?4*X7-8bxMy?AQtsMm2}_I2lTo!XaY71JBN+>PRn|m_a%t|vOS@2)MO2HP-000+ zRZfqfPCWV{cURLH{Q9J1@WX-I^!LZ*QA)Q3-wG?Ydg^mzF)K0IO((Gx73f_xZyM8I zc)x<8I_?!)xXXcSnYNNj*+b*ej(3XNROeESN)zLq1z8%{MxE`sY^QtgY`?`}q?&$e zt}UwgX;EyQ>LUeRoo@%zwZ%wFa&&6O-&~)F;zIno;S1xYE^SineVShM%f}`@t-ry+s`yF@oilyRkEGYA* z6!Y{Q^cJ=Q#hm&vp`xk%w8T7o1&SG?^HkWvv}dR=tp<2M(R4BLoat&WtV@xTgDvf`q5Pd$3)8S?E=r@@CG9V@HcWsFbOtg^{SkW5g7-@-P7 zTHHgXMmP=ShLI923^t^@KLt45xZSAJw$ls?;Z(b-PO9@#rR}$2GowD@Frt^B_+Z>u zykmrl(fh!;p&p~`^yepBlQfNmM+%rvZZJ6SPb@mw>u`Bxm&11fIWk<537ycGEv!D( zfH6`<(hN5kdWUF?XC!{Aya7ecmY1|&Jb6uP>mJx?c+K(g?P?(b!GToWGyEzg*VZt; zg8bK3HEF2=(h}O-v`XUUl8RWb4t<;~Usk_1$Y$|*p||i!p8k`^?fCZHxmVa~?wsml z82k`qrBSn(`h<_Gb3-pgj4D5IWfmH#>jmuR5aA?wwpxQJOS0W@$Edj)#8~I z`!8&N*x_O z1>VOOhCAm9XGO~O5bfFu}52sx@$*ox;Kpf%EPps+p3-E50D{CF80H2 zArMtd((?euN(pxW+6t&3Y1Xreo=eW7X^l zyooF)6|368iyg)@w zZ7gei{leeNu4W*D%}f@S^iLHsH+&^?I{gvD$;daJjyNF^HOZ9QLMuJ6k7uIavaXE2 zNUu=AWVqnZ?m~;);N*U;h0F35VX4U(X-0pe_aIW3oaZUGg!dHK4phN?+ZgxKJugQ?JaGOhmrlzC(GR6B*XX#4$+hR`y+W@eIufy5S$Nv+!Vl zv#M7q{4jn^$ay6b zO;ZKSY;C@~R5lu`{j74FI*y@N+M>Acr*zCvEE-sr`_F8j-)$^tT|wtiURvWVZ%2u3 z31WNu8Q~C}eE6!d*8A;LEUVAHo?Gx-3^=phwgo}@2=w3YMi?>y5E8qE00^7VNe{ZG zvX~XRi?XNtj-2j=ob$vDH=*lPbQcjF2L{>0l44b~ZRajJj0+j*mMRDe+^f;V?=IGp zqSSTKrbA~JXuVFGttZIc3&fZ9QC&`KA2s*DsB%1!i_E`1$9D9aq+d!=8V7-)y~J?z zX#3+(wu^iEe8;-)m-^ZyUwb{fF~rcVM_`*mWKUvAQVCmaFI1^tjE49Z6Pw$ezrt zcikaM_MnEghC{MO6;GHzmRW`M#Y`iS3WM=V*OsP+L5mmds&U?NYtgeijn;JES*3o^ z2`fuRe%otQCmR;4xoV>}Vu74rS4EU_e4+JH+fNl9U0ys`-)$S&wE`QEZu>;99V+#n zOh?It4`3twe=n=W4YYh);B>=&FzX!jD)p4yEU_l=!A4bbikdou?K1LBWb~2K=J0#@ z{g3X&bty)Sm)-1&=++o+mx=03jJcmI9&t)lYzp~E?osIC&5t}hUrV|=DyUl}2Tw4H ze(4J*@J+UG4B4@@*|ANT-S+3ZaVpTTpt)G^51^PQokUOc(s^cE~lWQYd>rvA)Cingu?D!)-9qhr?JoVD_z)2G564# zPVehA$}^izT1!`~(-}J5J$azpH?OalX0hB^tEUHl>&k<4a~I+zmb!xQ?$)EaN9#?| z@A>d-w8i!orVm|NGwo@InU5Lj8)gM@tbbox_$YgJeY=?7X+JW!+=!_MyB#bK*t8M0lfshb&$R0MgoX>7r!09_gvym)CCq1a z8MLghSFm=AmmThAx+e~!)Mt@5%aml}@9SQkkty;%ay7)1<{m3&U6|&!u;Z)LJcVeu z-n3$Ctv$+sB2@6DpQ{f|wD11ta|Lx*#txJgl~fg-txFbGsjx< zYfk3HBdW<6@6VJgFGn^e%PAa2b_CDiqCN7c_D}N^3UxhVmAT!vqs}Soh4GAS6VG|! zSkqk_FSnbv}p9p|O{fqimLIBbhW^gmB3TT=8Err3~3Wg;YNujqA zNhN48Q1XGGG&?W8v5j}y2Q?evw=o}l*^|XVKEf!ZIy6^a!j0lG8!sP6)yI3q>G5jj z3PwQ&CX5}g&zH1z@{Qz(eF)VQiR9CZxwtP$oZxvcco;{2=T4eoVf^iwd_=;RN_RG% zV|kYR{>g{J9*IbfOxAF>7xve_%8RK#SFjyPOQttuVkI;P_qQ-z@SY%6djA7<1|IA= zyf_HgNLp*4FCJv2@F5zc%32r&be@kex;sbwTA4P)EH!n45b=n%<0kXGL8L}eoeAHI z#Y|M$J*+cjyFp>oRKb)_N9dvE!t1-*zE+i&y=mu8;4xzp$W?7bwLF^Hm;Q9&wv*-A zbl1j&!0>F*e(^RlDg(|@>G=yQxhKW%?2LMlnSKGrZ4%9XR_SBT;kYwj_8OVNd*`_{ zy(=X_lF@(6}R_f9rd}LmxzDeLcl@$6X$Tq%D z3|E>tXpel8V{J$fV&P70A7h};J$H;JKQY zo(Y-VDEoM~eqex?dp@|8%;1oT(okE86~y&5eJPY^YP&_3!=0{=Ib?{QpV}Cpc}%^N zK1PvfF_9B{6)t94_a^D4OY;~S89GLF{;cSLhuM^(5nR(j51SkCF=(0RrO!p$u9W`U zbWSCVYq7#3GcDst4(EFVgudN#)@AaUN*yIukAztEXuq%%aAr7^iIS-Flb3=|tD9xm zC3>3dLo>%?A~g6SZd@vON~y#gdrLhI=ga!NP%?oSX&hICobwa+IM%Hr?bb7ARKe>O+RISK6&a%$5NA#ZOf$rPet5| ziLtt<$?vgc)6-wqD2~}$Mv;*rxo}cy3wB5+QIkOhEPPh7oAwliqZ_B?8gA6MEH;#? zQ2U3hO~Jk8K%h}d5w&1Ry53x8NBt4aDMBAuO^?&QX#p^pdDI+wxkk%ECO=T>-f06?l{v#!yq*u>}5lXrnke` zXo`)&gyI_+rub_i2b4Z%+Q0{Ax0|R<=xcUu=sYM^Gwi-Le^ZC@06PA&KUY!K^PnD! zgB~gDue{G}%jTXZ!yuW!1%=##(6IV=2Czb(TVY#Q6pWP&)Xh8iWWC`@Y_AE=L#xQj zRKCn5gOnn^H--v9nG)GGBLiK>G1KXl;(o#RtLh|6lN+O*qQV}!UcA3OOK!W$KyQ)0 z${wDczCy!;w?`-yF5L5E9Cf%Gl2OGs9SyfX$RVcmq*#rG-J|;UnwsS8niyFV&*Z6BMSl(M;o}rlgTA50YQ`E`+ zKpfq+jm4*PhvU^FsTEU?yvmm=cA9=$-+uM5QTpHV0iyWVqi?1pk@AcrmF@fpKWjSr z`l=?Ps)XZqCjN$Yo_>%U^YtUDxY+rEmnF6Be%4V~EKCx5N0s2`;Opv5^zwxxA!7?N zfDHZZiKK)lhz4-$N9{a7+K}>}O>e%hyqPK_jzpl4aB&RSP(ez8O?M;;1O(QAIv}BA zNAz_i!c}0LZJ&@)EQi-5B3v!^;JV9mWP3{CI;?JtDtFsFcjseEb!OO$L4i0VE`MKNqxxgiZ z2)(HR1^`m?Y7~mjLQcAR7w&>ly&+et>J}R0V<53bgI!;N|Y+35QMuDZ{0} zlJziL3e>eh))ZVCkk^AtV_=d-aA|;Q2FC*gdtW;THv*Bw5+2C%=lv#M&?iWLPe+2U zpM#e#0geaE*}?H(N;-g~D)2G^Cg}v11PNz=T~`1C&)wjX?r=#DxTGiG9gYM;CJERD z!wh=t50?ypOP&C-2T*%3;MOpZTP11g>PU0}vW4QD-_x{6!~N?F$G__In?jIA@BbKm zFtqM=&VE4tThaq1KL;r3437f6gD5~Z;wb2Qor;~e#-@M~FiEptRglsKlI>g_lsui? z34n1)Ln6V$1ghxR1(JFMI)p*}p6~W6j<`BzBM`vB0jNK>{N53=6K(Kv>X~R8SBejwA%=3ehJG8YDo# z0}v<-qKO6Xkc9G!Ip|8#ABYkLx35o_-Zb5jJ%I(u4mNlB|JVKsxbTA?X986Tb>cSAOaQqz}Irh%eGTXamxX zUj^zp4uJ(@2zZ5bfK0(lgE4{h0=Tr_%P*byb%lfi#vMkwLiOJYX{apW&}q;YhC~|gM&c^ zH5jnez|4|hkf{YjzFGgx-lIs~><^od`bpXWjv|qU{q!loFp?VB^i!nsM}QmwDA6W- z(?kL}ldznC%KxlEo(gmyfPuGwoEPXj3C9`e(q_#C4vYi|<_ZU+Myk1iNwryX2kNz1 z^MHfz(4>Yu;lQ?%@VwxdpOhe12|OtY`mdkG{G>*L{B*Y@vM}J7fV(B3`@w+{ZPtKG z1!tEw>m)}^+9d&9-E`ulT?=qW1hWXj`sJ6gKbZio8V;g(()<2Ehkn-nb)vwUksbyB zW4Bp50SBI&)D+3(V>hY)o3{kK)R8ay*k`8uo zB;=7Kl~DgD8gT3XqM`p!G=u~cZPnseD1Yw?|zXQks|NS*@M<=*F6ef8({%QiCk_fnZ zd8$IG@&{FAPzV$b9H>Jh5WsTaEQEiPNuV#m2?kjc7;t<)f55L{!Ax+1{{w@9OXvsp z{0T$iK+yUp3@eQTBK|KPRvLt6f53o?1d8-uJRCSb`Zr7(i1a^r;N~wH2m~6M^Z&&| zqk)b71BO5${(=FA@)v&?6!?<=-*=Iizv%)e2>k9Jcvv)K&Y<5vvO<6(Du2_Jf+pa9 z@h}JwU;GIp!!rgA9PXcZkgxa?CIy)U==U!#82sOOSQH4a|G>jyAOrs2cd=Lyz5fZ5 z0-^OEFdPE-mOo)g$fiKQe|f=y?5DqAILMj&7f%WS{O6xA6#g$>q;P-BSxO4{??2v^ zM*giE(iq70|Cg>b6kq)blY$(^A9&J`iTy7OkNlf19s{|@Kk%d=v+)N^`Y$+ kxi(EtCod3Uk#2ww@FUv!5=kZy9A`jcVM0P`y6Uk12Y8X8NdN!< diff --git a/results/unknown_float16_gemv_t.pdf b/results/unknown_float16_gemv_t.pdf deleted file mode 100644 index 30462a86343c591f5114d7925448e83beb598836..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 17413 zcmbt+2|Sg}7jR|2_C19zLWH|tt}Xj6YeJNL-{O*#vM)t;B4tTsi$RZRewIU;nu&AgZjGz5Zw^H+!+M`(Be2BNG49c?{ff!R%mhP`-OSc7xQ&uh}%xM2Ns zrPiXF7Rd&$qT3y^o3=SB`C}r7VA~U2Ma64Q2y2Dxk3+)hO#*>o>rC zy5{TRhd1wArh1z9{lHrF8(d(fh<`XX_0gr}<=m&^1@nYcFPOM;yrQ(?UJmxnnr=Tm zG&*}EYOKp=o=@s~$lgoL;b)sUFW7HaNQuRpv~3U7#dFxll+L@yUt`hRe~GH>vI|j~ z$6NFSeHh#w{npM+nR-_oV!&7KxH{dfkJtDw35@qNoSyTRJLQhba-L|FZ4rBp4Z08~ zJR1y~c;gfpK(Jbk)VMkE`hu&4)EcgKd(N|K6D9Kt-^O|}8kif@oLZZ1&D*hQo#o9O?(8SOi+GI<%j^W}82+$Z6*;BfB8hL=@+Fs7GxpynpBl4^L!yV(bfZ zxhqsTt0%_=c{HpPt5e^&7SAd2Ad#*59ib`L*xM!@(J^O>b{~Jv2%*mp$%sr|zd=Z1 z7~+po{z3Q`H>e!v$mpSLUcSgHkvbulz z)6Dn2q-25WskW}3I~PjkU#Cp*Qb&32}cgL+0(?qx#&I^BNZXEg?cb#Xlv75C&Sudj&hdOj;RAL|FzPKass_AyT zY_tz;XrO})LtnI*)2>493mL@udu*2WpWx||3k8f__Z)h66cjx)%sW2o;LBj9%AUU5 z>s$G0m+Oi2Z`rO)t|ey zQ;xmWsu&+WMXd81qKU+yWOnY3ZW9ddRmCFtE7dQna#RR%YD6C_m(*Z-mb`;Qzi#t{=mmua--UP~2Kt#^X|xhHBY*1!D!E0aOZx4@&fB>Rwa%$mwhcW0 z@j;}fIn1)VFSv#N%gY~M4KALF4#K{;!(YU5+-=m3)-#UNE3ALc!sup37ebn!`4bzB zX{ScFD52CGkNWXq^~sEa+wu#qH?Av$oL-+mImDsQ4XNm~Bi0e(BQf`6TaUl*rZUB9 zT$by@?3k5qm&z@j2=RMc#xFPzKN=Sw`ADzRSc#@AC;Uh%r>h<%_Uay&)BZ+SUS908Y#Zb`Dz)tCrIE5C=o-=5VI zT`bRW$ka*-J1o27hyc6c{XDtefMA|9(d)@e9PRJKX9tHLw3;jlydsF$?ZHnQPTJLk zBWCFPQJP}xF1a^%TF?GapjU3X1Srs>vW!UdJszCD`=6aMH+ z=b}h&>`-!QFLz}5(fT6Bw6WX}g?^oAT}z7$czIS`UL#M&8H8_WyO_WUR>fBC@G16Z zb`5X51j{7!8Vk8bgW4s5UPw}QcDRvp#-GL38tu-*u8L;$M|Gn$#!4J~1{@Ny>CD@u zGH<8GKDntX`^BPt&&f2sd1}s`_bs}=4|2K_DD>l~!(h?ragPRDckiiV3uEMRe4rWS zOt@T7BZNitUK;*B7w8uJq%!*L!z#OD#tWGhKK2oJb6x25PvC4T1f2L%sSC97m5dH^ zX{#d^l1r1`g$G5wYlu&fn*hLwBg=mnwpGMBkeJKiiaNe(A;-?;kRsNF~|T<$=X_tMwp>>TXZ z&-tgG!M+4U^A5hZeXo=`LDYdKSuL9Esiar07v$;gA6ebrjFf-`xKqSMbOXuFQiX z$F>8^IT1qj#lWuQD>pxd%)DzUe#iOj!oztxgY&1J#vI_lKA%8hYz=HG-q*Qt!9Lz} z869LdFSRU}W#xQskGf?IeW_mm@_m5aJlms+I=nP)(J|ZXxWLDW{YHEYx6cFw4+!G- z^4~SE3epc6v@F<-81V@-(#?AmY>;4wGBhY`4?Q`{==IXcksvxW^z4bDPY(6hinhe2 zipRmMk5?=zN=3{sczjqEzuU7djU~FWBfHdN&?I<3IGjDKo?}hzMiEa_Ta#@D*=`Itw(`tinA z+Fd?16QP86Z6a=YmlJjzO1a*{qP6fLuUSM_I`Pa!yJq|nox(WB&ZoC!UOg$uJCl*D z+b?t3Q_)Iarc?gjlltg=QUo8-GZ9)D}~Ofj8*q7Kb5?_8hdSg`_9T6lQO1R zMl4}9LHjOiT<5y-8$Zjr?(&WaYK-YbgCR=LNqqvg{JOP!&ttO%8RK{s#9&R)Ngnp0%4 zr?Mk|Nd|gJDl&JeY_W6ldT~fsjK<`ap-YDP2RB7SU0nJbI?MMA`}$&O6M#p@|Eejw zC2aET#DGnjkkACaT|usJmMST2P@QEQlc6`}uGXrv+;(r-;a44u;lc)6sT z!uoDr$)CfCck6IB@XOMpzbm4dr$Xlw5q*89Jg!_Dxm;ejlB~ii@ zJ+~T(7mgNqy7E@MPIn3T!Xw!~?I!v8T*5=QXu_8lLbn{gx`uf-v0T{w>dQIv%k#yv z{aW&l#$vT!Ep#Sa4C&;l+=H);g+F_8>wxXmLRbH5+a8bauQr91pGwc#A8y8cGUWEn zzBvIGg+takO}JB52Bk^i;>_i4*=UNa+#@0T1g&7OKk;GD`Z@#`wN zWVdacW0%#Bp7hD2{+2u6XYZ^%2wwTJ%>3<+%*1k&f!`|Q8pV;_Wzx_hJab{aOi&_LJN}({yG|u1y1^ZIO__=)kzd6hK@V1^tvO)3J0q)m!w2jj)ec>+K zZZ1hjkpbu8@P(s{Zk!M1*fH#V0@{sizQzfHr3vNXgG;<4N2K4a?%>%)^e_z4O)kkb zZFt~Y$tIWn-a33b{OW}#?=gL4(;-*pqe}Y=_MhfkdMG6EsNjW`I*kC=!@dr_n-T@Q z{lnF#D8{TmhM5~&m!D>FePI@)%h~y+lTwezQ?4}RwuSv8Q@S~>E`ff_m`-ll>oH#@ zM;_N>4;G zc(_tu`6fxcL`8n`8Sjvm<)GT*d4PyARzyn2!(SUjCDv_)mgoI+A))D-%JlMxBVE5IvG^Y ze5my<9T&r{%g-azD7?ceMEz~74^wceaPc@y4$E<$)p6o_m?cug{$%&j7ixKO`Z^bx zEBl6DMNUyHz2loEL*efW{9o&A>CKRssLR#>XHZ?qR?*zD7V@dG?_Jo)=|dfv#BdJ# z4usl4;&fVcOLJ=cii^ELgrwDL;*R|$JPBv-Ia%j+*rNNqYqP5BOl^~%PUnB!UYqx+ z)G50B00tEyoyoStM*NsB^U~5anUBNCi5>@SdoS)NxP|OYT2){%@$2=ANm>;;;_GhP zoB4*DTe5`bx})CXY1?8CNrw0F%E#K5`h&_oAz5cXQM^=EbRg4ZXd7X}py2WUJt!Cq z))Z;ji`&NL6eaS9WKgkpY`xW~*BK&VZ(_1LeQP2fjSKH%;mO_VL#=k2`rwW&#_k+- zWGI0)cQ#P68S#*euYkmZ@`OP>?JJtNmDqfawjZ9zfQFp?rd9NxAN#uIO{X;fQ{J$+Ib5e7d?`GE1@V&-Z4p6s-1qx)LF?d&5%A+)`; zfkF&&6Iw_iS=S4wb=4KcNGX=OuS1tU73`@w|KjwpsPMZpyhB{-Zs|387AE|O*p!IV z@8*0eE-uJ<^f)=avdqtjeVsbh%V{^0e{!f(!`ec$>3xKg?47qp1a2C%tB@R3A&l#7jCULX^PHw%ZWMN_&_w4^5Tvb;BQ?h zoO^xwnY&8Q=No&47qy;6VSGhrT$!5xx=dz6!-e~2ugeeh zuI#cLB<>=^duT6j1MhL0kyAV#XfPxeeXtjY#veeUv4%)#jFA>I7I#_ZXfo|Xn}-|Pm&R;jo>YMb4R0V#dU061De@-P3^Z>Dn%@Qs zHLtcpR`4P_I@ub5F;lgBm&K#vm}Cy0@V#q8JyqA)u}%m@vC)Gt9aRno}5B)$t1QtmtRLEV9UZ)c(($p2t2A7`r88+@+D;H^{d5 zM4YnSr0V6nFgDMUuYTW3={>(aXWt)i;=T53_L!;k(mKZp&Xgu|Y6YWeT~u)5dyhQP zyxKiE;U0=^F=`omoL|M(*YD|Nny&unJbpmH#d`Fbr+uM@PD|fIMt%Kc+AnH!&Q;hM zt{C1N}v?R%}2yZmp0Fl3A~Ugp0C5QDD-@{N>+ z#0d+g7aE#t(DXuMu{0RJYMU^EO?vFsL|fY3^*exETl&?>!fzU0Cs^@Vxo$a{&*n^q z7BXfSjopN7D;Ub1QxjnLK!`-l?OqV_;nygUbeerxMv*0EsCTLgv%8lyRFeUfT0S9s zJ-00M;aQb=ukjDzkKiS$#_;YS<{0AiReTb=EMGQepRKh0xjJvQMGbBFzDEkyp# z&)JpT;RMc_m5Eq9jeh*|x6>=^uHp#edwiA`Px-T@jc%XX&X(bPn&+j%+83H1(UyW_ zI7r5-V+gqa9!>>gU9To3+TkeaGt0$!nS!N!Oxpy{RSg~p(V@9CLnkZEv!>74GkG?3bR@hbiIPbGcdL&Ka z{<%E*N5O<26Nr%d!5fTs#}v@YH+H`fQ)%`x;>luDX0@G4;w&sM%&~a%(rD(x3YyBz zJMId-J?|o2C6$V&R^zt!N?X*XQ%$cYo^#&K{*ZIj*^bv{>haC3Wjv;8=_jUJVv4#8 z<7(9!6%F*i?n&2^G;8)U;W@IJ8rOf4NF`o`T}WN%J#ed+-aPg=+cIHQ>+{rrhd`99 z*T6#T>s^>8G+VcgF2N=*V?y)Qu!TQs8DI6?Zer-Jv;iNz$`7YnmkfRohb_DOYkg_+ zm0s&B&)s7x)2L`{L@Qp57YoXlxUnOvRs&OS%fdRa($Gq=h1+&& z@n<~z6z)?F=*>0}1{&BOdCXoM$h(Qbhzo>0uZeQ|pcHWH!6%=poPC(L`JQCPl#G9b z!fe6>>L(~OQJ`okW{*eaD8ZEid&~|n53e23eHPG1&)*i}VMHA&932?FbL{aj^GkI+ zr}ISi5o0ksf%67FIY<-Uc(G#(_6Z?JQ{5!4xda~8?bdJLmORETdF6UW3Xk{QnT!?9 zG|wbP^&Uxjl|hf>FX&L3liSa0`cJrCX&GJiC|_RP((QrOCetA@?$oBHIat)4p#Y1z zDbBA-0*6y-uc>CLPOS~LGer_8>B^H+($AaIGovDQNho|y67ZA$zVlT6&^?QwuhU`%jO8QC5Z1OKq?=5~oxJf0nG(M|D@v5nEv zX0T|d(b=%QsRMMxe6z;l&R4hI30q$u1|liI=*F|4Puff|%|uYqR~^^bMU^IUnA4m_$9}{xLFxXOuVmW@6|;}% z>F2$s+39`9T$8j9h>aAm9baQ|7D+5T{@DKF3cdX|A$c-Xl5w3d*iEcH)qyfnL(vXD zH1P@58p}xhR9Osp&4(`NJ?(l$XX6prVFHftZd3~k?H)`u2o_W=y}F9^-7R?avo;-7 zKw3hJyKZUxbW$Nl&hz<+ibaj9L!6dNGmo!y{_HkxJsuU z)H7`S!-`My`l$UcvY4zn zI{z96MD3$X&)-a`q|hF_zJ=+qL&Now%Z2G5-^qt9JFEL@#Py$*krha`d_o8tf0D~@ zIbkJTG43f;QGM{`%&xI)gw^r~Zr$mKMlz#A#*aj8X0J*;vE?WgSk%lGZxcp_z?Mo| zJ~4mzQ69C}ryR6hg~86?(G9DEM{S7>33nthjHfkuW}3gE?Q5T!!$sdxFeJs6JTCYe z(8N=`IQi7B>~g*pLROHiGkKy!N$8~P&6bp^D$IA{;a$@Lv+Qf);!|F>TcwI;l)f16 zn(t2-GJ2ntmmt2^o!{;G!3b+sonb|(Yv}!>_bDvL?x&R~JfGdiA0WFxrn6-HROwB) zsTlMf$cx8rWsBkht6e8*(mu$8qS$H`DQ+!>#`dkZ7(yzAv%_MXL(aoJS(@NnY89EW zJZgN|Ta!AVRfX^{R~s36$hd(rn^;vSbQ0J?khE02mrPcLB3cDkj`E%jTr)#HrsgjR z$`z0auc^O2X476nWS=#ec-oxJMpqc9qn}ri+b2iew6oiqP~>p*a-6@ZzqpTHPuh~v=$8PC)L=~} zN7%DcKVFYl?Vb(WMKvy5pJ7^wg&T;PXf}O>56#ht#s{aB?De_mvHVyU$wc{Gh5h{G zS)~sQq0L<;i<4~A6B7Ybp*G&fS+#~(T9X8}^MwUmv|1Y<_;P>kXp~&fZJsjT+3{oN z2Lf7-*d=rNZ9O|8RygG@WjV+r`yvI?GI{k|RCl#a#=UU8*)RJBdcOXk;1_fHN`{qW zoIn(Q6Vj!gD91)2z@U0!f1c21AqvGFC$t!T#oPXwhjCq>IXwIxdszSTu-La@(N)5u zRI}G9o$XiO;k|Hdla&4;W(5;2sm8(8O|EIgc^ACjUi$0od`?V3sh4?jS!6k1e9WKp zy*ktVMb&5N$kIc*QNo4T(6)9y+OGE|BVX`x`873dKMGn#N(WEvojzdeYN6Xfcjw|g zCndTO6To=k6KIv(}f%K`ujoc~y?2WTrQ`HS_NaevIWN38G0}-VeqPXBRP1u<|Bn)e2dgS4gq3y5f}e zd+IDDL+;%<{9|XOPz{69a%|(#EaP)|58ZvP^dMe~nmi-JA~NpQW^|2u+&yT2P3^>0 z8_H86%Zf&+k6t`vWD)kdD>|xR#YdMZ=7nicrM|=<5oLWebTJ)CZ>4E??&9=i+ouoS zQ^qe{HU2cqoO)@xbUf>~oXC(y#*abaHnZ<)KwxS>G7Nu*oq~t>n^(O^5qz8Y zkP<$4?a*jtGwp+lhqCnq zZc`oB`aHlPzf<2K>{3e%-`$k9w-mF+hbsKvZvDt|V7K)x2K$QZH9iWql(?oK&NoX) z``Bd7oCCTawoH@K&29B-3Ztj& z?YSv$a3(jJxaKZ=jf&wsvQ2b|D?BMqRnKPny#1K4slj!{-9mS2v;m0aY4095%HIhjT<@p4nAkdM;fYn_>5`ASd~KR@|5t=xN?{t0BU5Xs$>{#p_At)# z-2J{Ud+uKM&12`U+S`yX*A(M>fyedp_t^qz>mwt9$fsn6kBkekneP4EJHR4ok~N}~ z(vxCUpxulujh(N<=-&LklM%j`am!dG<9RYolQHNh{3awvJ(0i$7DbeP_rpZBtio1G zJ8sv1T`Qs)(9D0g4a-}@rQaUJ;iCAiLL;xH*Zr-8PO@4(S2CaBHTxL3Jz9EN_Q_gR z{NX}b=C`cRXPS!NG9IgRZF=-@$nt5cTD(vEYV12|(^Z4_R;hFP;pNGwuiH&)<-+5% zKOa^fu|$3Rp@yvBS*G(=7om!Xttc9L&|?$TxdI!MY5Byc7bg9IOh?Ih3SbZZzgP5< z#=5>uaE4(&m~}4ZGtH#@J7P`X{k5v(6m< zgV)2r@36=bOXq4?{vMjP<+5k2#`m^yL{?+E{l@n6nQ2sV1>m*ZtUSEWeK*(Xjr6k8 zI3tvpQzR&#ThP`i0vk`r=Jyq+uzjCZM)ciz;OMnWowy5U?_ji@K0MH>$b59%TISXd z1{0^-$3=U5^ZScvXDgg_dwU6Gm+q%qxDYR}*A_(dH19Xq|KK3zg8+e(uE@^vV7)6x zrXAfd+Yu8ZldK?~AKw;c=H*WP*ec<7((mtK0ZQucZQ)Ib9|nUrfwqOWfrGUuGRG$p zouLv!^oi-bQmXX!NKZSD0vjhgY6lBA3SI-9u$|9hryxTO8RrQDj@|zA+!+T}T2x@A z)gyLS>B^iz_ow3Gvk$C-flg9gdx~;o4im-9FRsU%|i)r<9uS z*`bXo!gfFNb3IBM>njp_x}f&bOVR5Ed5a1bnH?`vbg%UC>Fh(B5^?jZfG zLVk~hL+ohhV!-`F{Z&Agt#l|PF4Xp~~z;4_onb&U40%a5I0 z?XT=PlS4GGRj@I-%GAZwT{qni4x&`zbp~D$&Pcd-#FZK0HdU z=ks~X7|O6)zC|#RIV-`DafUYI3z773NCSr|sG08#g)tccBjZD$Fq?5H6Lgs<1%P*+ zoqw*bW!KbE^t(vEwT~eez1Zy)B27c9!}1iQ+$k<{?h@dsn!i(&eooy&(KN`|jJfSq zbZK*kz(}sdT$r|alz`#c^E;9x9lh>^4C5JZ-b^#Oa_+|2%gBV~N)Jx{Bl(8}{gd~F zH;N-hCTjRvuMFIKk$?8V>4L2&Ix@W><947oW4I)Zw0$X*Y47#yU5s$L_R>CMZ}jBa zAZ+xYUnW|r(oaDe zKBZxvVVme>wgba<4jZW@5LtYopqo;eEv`%>9`DQX?Fu4+73O6_U&g(6 z+PrjOGL_%@sFWx-qLg=W2~WXoyQPQ@Z{yDGy}&zBcMEPm{#a(M?XF@YN_jM2(mW%eWvG+Q z?-3fctxBV9NIt)gxy6c>%^_pggF3jNb;1|3PZlYQJ$K|r6KX1*jws~U#ZJ>U-Pn3C zp)0PWu+0xixxWuMg+q>SNWJk5spX z6Uekq#_N&Z#O^-kHr5!lj|bN|oHSrpMc({&ox=g<1XPPfmYBKu79J6&H--Y|qj;t4 zP5g4VDycVLwb@a;FM2QJh@~a6&5|LBjDI4ti6a{vSY3o58PrY~jIoYC9HJ3`J$u-s z?PLFPH07}IxYBD`)^k@wMU|H_55tGvZ8cLLH`4B0)4yM&Zqjr0V~IYcDCXRfKW|}H zUr?{*9?ul67e2vTviUy#O(1Xz|FYC*)FzmQGa@j7)$sHR=MSabafm>J{5@Si9(Kh& zHsi0iimFT%$XqZ^DHM2Zq8OAZm0dG3*y)CyO0Sgk3%OfWi@2U#7wZ%gUhjJT?$#{% zt*V2M3yoB{`Fs1XnC!VRLaBJ>ju-Q&{q4|_!4mrf&DP({ zTIxnoEB76_f6Es3qaE8Ea&%H}O*VR0EZ(9%`1Qxu98J^ozxe}L@h^YYqjN|hHwYD5 zKgXXreIp|^v%PALZnkFrCbnLFkYnNNM^tsO^#z3ydbU6FXdDiPfXb*j`q}%s`VhT+ z;V8&-fM6ICe>);6_6KSMSdRU+o*;roWn1)CQrX|TbKLIay) z4O$0H=-U!~T}h!eQb;7^|A4=La!`m3AkGj({D7z$NL7JwAaySgBm>0*{Tqk$3j{-` zxjH#Hf@mHneCH4xI>Pqzv9$-0MWBB^P?(R0qZ9FG*4Ne9g$T!jX>afC>1hiG>pX;y zqpz#C11RNT>*oST1UdS8!w}wHpo_#~-f-X-Ae=yO5ct7hz@@=5Q4=l=@_HbU2qpu_Foes1CQaco0M#5$00?%zw)XChL=tHP zz|)`a>-0g#e*Rt#j=p~O-oB1-0=TbY3nvg@2zwA`1PTK(oZtu$g9XTS1t9R<9ggsT zBRt^7;CgR-AJ6o*AX1H}Wi0KG_}p))d7TOX};1|wkz z^IusIuxMFKvPu$+L%|I9(w3UnWUVe1AA^qqv`3}k6N=K=>xf&_Dg zgHa>p+`*(;&v^jxTF-gH!PzgVA}=`5tt32eIQC~tkf{U~l?46Q-eP~YMuPk_yQF4e zzYH!onj)3)g#$?<<^15_%Fud_WMD~KHK4bU%>|3R^)DnV4dyO*gZ;Ghq}4OPBf`Oo zorFcQ+&?MshXYMUg8pVxfnpq9ENiY9KHvcjskPD&#=Hhzo z|9*#B{$(|lQNMdC|HL{YRFwe|!rm6_2|*TE8T~)efR+Cj4f8+IkW!HIVGH~x8BpVI zRu7`?CpG_)2(jTWAjF8jfDn6sb!OcKSP%C`=(;*UUKW_k5c_~br|RtwOg5=MKSL>i zqWZV6Du`)@e91k)>GSdMCVIHq!2>W-D5Mlh0`5X2`uNEr5T5^hlk)a;7K0&x_vhem z4`QT$FXrRm1h<2{CNGCyRRB~{1Fqg)YEUrw9yM7s5{(CU+Av5Y4u{5Civ7luKwn2E z7*H-~7_fssf56P)z{GHZ{{w@3PUr{s`UykfWPp9%2*X3-zX1;qOz1|K45U*w;6W8~cmI;i1T(4R{1d5&Q|mgDbj!!O%dWHsawR>#?CM0r(Od zVA3eUU$g*E?Jr(QZj{*phW3>KK)jd*y-JV3vH<^mFgQT@d~0)TGB zBVZtV_-9!H9$H##fJy(&X95A3=s)qG)#S!@Q9#%>@&$#&fy*Hq%MySqxdDbk{lz~N zIN;kz9}3*7**G33l=NS75QQQ@Miu(~!z&aTXrjMhIIvpVh=+%a_J*?1nqVW00GZ)G z@o}v`QLS&i63?pRUWlge$F_y9$DsA>9WM5OVl{G|)B-urlh-^iO z%F?by{O>dBeM|HEzdrxJKKFg@bLKwxo_p?j&iS5upYsYCs%c0dC8c3PC1dd7S{M?J zfP2^kp`8=K2aW;_jNyukFoL@S)KK=f2Q)m~iEuQj zft0y{{V@Wb2$xyERX<78G$Gm%32^Ckf}tIeNbq)tW1w4@l!>#QgNwTp9Q*TEZx6f) zfe5z*!>VZjRtP7Fa4Bs!z=F!ptLo3IHr)C*c94Hh0OA|s-kSipTW4R&nBe2#>x~EV zf%rFuOX(9FTj9=?g8$QP_y3!0sZ_- za(V=JC!#YPbVBM}!^IWM6)vUW3Iw1=z3`J$SdabHlOE?R!3!@4fX@L9nyBq`j!TL#~eJ z%Il2E`&Q?^EWCIh?qtw*U`;mj$#cnP45-zMOGCMQ=i8Rsj9Q((oKtXK@l&8gemRBt z5j{L{)PL>6kljg4=GmxgOT90K5wf{CL#;905f}Q>0%l9GVrU{pwU36-YjxT-+-Vo_ zK*%t|yPZ`^MnZAT0aV=bwU4?(#^`kl_7FZ#F&ePUwoTsuQpR_`Z|;#0NAuS`ipQ&- z7-{R-v_`#|eD&4yvkbR->}=V!LfN(Dw`W##4saj6IQyL8yu$AK-NIt3pLaZ%dJwiP z+ZxufcPSXH{q|O5P@jKb=+h8xdbyS`>LzPG`=G~Ii(P%f?gB4TBH;pf)$VK|70)cK z_J_H*wsRi66cM5>(0Yv;aHD5x1dD5bX3OdDGJUnjC>FInIJ?waYP5@N@63+G zv_y`4byHJ&Zr{k}8q6w!``+hRT*vs}+rgtAt0&h!gm*?Qdpuh`<9o_Srp|x9)Qs`N zh4}!VtmK*)N$S+(M{=Pk8Rbyd+z^MevAvD#hZ)mMOM|(FJPYOoXrfA`7`fpS1Fv~p zbwvGot}c1LHHUjdR|evkM`k8&9qNON%(;3UTbi9X{b9ly*PYfr;920DMez-J&b9*3l^EY0XbQep5dnKg&ZWHYd3Uo9cG4H&=J?W8>G^ zL>BzWTw9ss^LqaepF8$yH59I#`_cKRW#-F^Vkd*1XZKD%rk9a4RTZ|aOTzDv@K>`} zTR_&|<5|Nzi*{UbZ}!`pAHyoBm>2MEYvOFdk_6u-!demgP|Bp7QN}`QZ>;`=k6x>^5r6T%J*9abp&Xatvr*Oc z44rKOp0}3S%_a+(ZV@_dD~hZo5BH5;_Fnd{7*p>oeS(t7S6o~;Y)~`5A{R=K8Pk@UYW;|s;Mp<#TH$omcDxPx z1Q*rm`ytp%l{&JLN{u|*skh&qT{P%g`|{nOZagLN(Mh$Nh5Q*IdYtyh1mpvF;DK$; zPMHt;@yp>SE=CEne+pyeNo6!{B+go`j9fvwbU!^e_A2Q}yQNip=266T_5SAGytE70 z4u`ds_zOIzc}II)`gOd+?jECA=3Ml58V^>h>A%BKqDnXZ(kspzru>UJ@Yxo2ioeQQ+JglS3~bwef;%~mXXWo3rEA-RW{Q95Fe0axXM)juaV3-~K zsBP4x-8oiHT07YjxHEG;4ZgIwVPSmW`@4JHGYeG0RTmA+&z~u)cqJ{oL(bIy+i~qL zU6gT#-n4hNMJTnH(N5@aY2`6zF6xvCJ*pH5zy}O#WL0eokziBma1x21pjp;Bx2rY(xISHmWvtJ01-r|a znT^D#qEKRHu#i$irI?7&w~TUDk6FH@CS8}18{`uG8u7T2He>(cH9=j2JmoHXExCk; z%{(F>X}2D^iGN#guO!_eqiL{nFO`m%%Bf`5v}DsQr^1gYFEFIPG{R*}-zr8eOcxv+ zxWaSDFP}@w&9q=5sTK8dOyfy}!9}s$8!_ju-;#;w31xLJ6jL?OO&_MzRK22NNU(Q6G<;k)AR!E z9CfVk^;kqtPe5Ek$oK}mZZFWcto3YyWMdd8He0ROv&+NFU zl#)wX$tQG&`cdwauSYVc+6?pR+4a8DY9;vJ-y#@PPFog*?HXK};4;c*7w#~=kg#Vj z)u|*z{*D9TD22M-#^ej~wWlTOPha*`VpK%3B3_^MFqvYLnL{cf8MJMOgIBL`y(`tV z8fbG!$aH4y|3dk>)kES1!&I4Eb8}!HU!>*gTWZeY&xlHRRc8(+T&CRoQ&=Qbt^()n z%t;1uvlu%^W#?4&Sk0@bqh0V(TJ161x7XwBoOafY1>xZW z=IHn_;0T`kvCTWEZf}+fAOa%p^qf57%DvJYlfIbm7_4 z%Za8(Oo^fDM6nl1wE=JHW-l%;m`9yvE8ZDR;0*bE#9ZsagrJT~?uS=?B@E1IbjenD zeYV=hZQ<-3J54J(y=D$tRN{?TL}L!cwe>Zhh`QJ_+xFsZ8gm{}lIoMMCv))a=+0JiHLl)K;Myygsk1$P+OOv< zoyed-RGMhER|riU#h$lmU&@ob{O8|YHrGLX`oul6V;k;Lm%;AqCp$c6nl+patj4_U z4A|~XS9`Y#SLtTovNagI=Wq+PUFGhJ`=5@2o9RmEM%?1VCx>xe;eGdM_^aApqR^j_i1x>|LzxP80#+{p@3 zy~xx}X+Z&59^`B0__I`n%W9QeToqzjd_&V!tkoL)*>VS!EvHzhi>#9jArrMym3D?AQ~j zizv1VV|t&cS*(&=P%)UVWIms;taI{~i;FCE=zhh9k`vsDl)uc0um7$NONCyIaWl4$ zT78z*y;8aUqIwDU*oCyMVDI-%>LSmH&5WjT8Xn?bRFvXCTimT)vkYI2y|gw~_x0q5 z>tFMqnA;FB%cF14pLl3;35tGx!uOwX5DDu;_#cJCVSfcjq#ywzT7pR8S3@Ww+7Kw9 zu+ktp_>U+-!MG-#0!iO_8;j&i?2apKN6|SvPF5>-3YhqfSXWf?8;Mu zllFAV@>RF@wK=b;3opTaI=nhK>{H&~Vco+iK@c8vDhht@#c+(H{w*7tt)E}Jnbq4k zR-h!dEO_V>&)89!S1UWYcN1L={dE(HubMX1dsngUPkC(}JRf}R(&N|Y{u}cF`R~F? z2J#M`=lyhFP`okknU*>YKj;1aPTrg1c{~H7HRmWMtrvpK4NB$bnO&Zl`Rj6Yz3igY z<96F$5^(DXzR{HKE$1Wt0gH$(F1gZ(uhV0F52O#B+9}Xv92&f!kc+XJ+sUfi^i{i+ z^5y-v-%8ru&aZ9pbatvGBl+J$(4UbPN?PW>1*&hP?cR)_zx$nIrD01-Ny{mN3(8Cj zYh^;b>1&m;>CdxjlARorj~Fvq|qLt zk<89zEtxu4-?_Y?!$r&HA6QQ8dt@o+S8;#&LDkE6v0@eZ>8CuyS{6fUeeV>ip82Q} zAHo9_W%y~-TrS##=H36?|XjPZ=9?(vjZ2N9{h^b?BDC~^?118J%J9M1%yK_fElPNrcDnxv3t!*ecR5-aE zrbqX4Md>(l-p>#&WP7~F>zP{ietn(GOjZ4(6QQ#dpI-6Kli~39orPZkzRYG=jMHUp zg43%mXR2s!Squ13)&DAJ?7U@XIx(0X-zlYbm^hyt-qxBFv+RsF2$8UwA?`eA!W|oR z*U|cFryZ)_qb{SS!PGAP$$ZYY9d+3sN*u$>4xy1DGU=>4ZN*M_GkyAWU3OtKAB#rQaT0~t=BU8N12Y(_mKVk#i@pge8xfOdi=dO0$O zy<^o4;n$RPu&k{qf-bX7-{qrrs5xItQs{|lTRKOgapB( zDC-#xe@s7*@5xNQK7OG6`>uX6976kA8#qKGHerPn5_H{xSyx?F43%VV_%?jyL*Cxn zv}fl>MTA~m;2GvrKbBIfcf^D*PC7B<{HwQ~6_?-dckOX>oUq7AjhsoE?d7mv%sDgM zrD1(Uq~&#pqulK|BODhER=+5#SUi7R@+9Ms@DLfh z)n_r3y0JH$`9DmQxerR()|tqu+dCZeE^&!Eba2^_y}zdE9zF9XS9MiUpZF)Qk2ZC0 zKc3>sc~a*a6<_YSur8z7@MyuorBeCf-sRmEL&V)=WDjj3Zje1@GkS`}0t<$aMjh_O zps=5~{u{df(#9{Sb5|?j03&*LU8-4kUTY$KyE;}uo(1(( zh*=JCm)iHK<49yZ|74}0@gt3t{vp`tO$*$X*73`{bEJ7`HuhT^2g9m$i8Ru&joL(H_ zceWnC?uIYW&}r+x&!DfLK>JmV&Z$~@k#qHY&%OGg?@R~zO=3fPN!nAa#-kuNMk>*Pz!6B`r;Z_NlQA!r7)oZjG~} z-Sc24&}$2y208dm!&00Tx0TBluN+nyZxCWi1XL5 z@oaLundk#{GWd%P>PYI$SHx=yG{U&=jj#}(U4(fvEs@9~fs3!v7e%j#oz6PRDZbE- zwG>HEn1)Xp8%K8JXC*y6;Px;HCg<2kDb89m;lQRZ+j&>pIGQ^N^_uURpTOA?`Sx77 zIS2jj62bK0euL5UFFc;NkB>Ol~9cb$!XK>JG+n)Gkj&VrleaM&{<1*<8e=jPLSV zTt4T^nmoQ^b_Z*!(|PXa4r^a&R>Lg>$ViZkS4ZP8|Gk_F#=7n;O0=W1>{eGFzrECR z%J|TD-FJ`Us>X$4b&<$c#My^&H8Z-_tX`oVw{31%%3nJpX=#yuD&lTbjF4jWSn48w zwKY+)LY7za{dUx4F|~q6MSa;{R8m4|;tsye-rjf?w>l*i@L;H%;m)K2O1XT`OHq|p zcO&i$R%I5u*?5kEJj1Lbjn9o1-z=l3j(J4qZ^!d|q^qJ*anowv_F8F++I*5}Y1~Dp zJ#66s+MwkzAd7#yCAwwty$4P|J&XaJqfc`cN6ZTD@oAl2d5ZNjZbj?M?4T=un4J6I`^cHy=oS=fx2-PDHam4nb7J&}FUt+y znmgUZz}?A%o_bYp&b5CsSS5~HJo2scrp-~B(N}(Zm+^*1MRPMs@p6o)e~x(h&Wt(@ z^aDF)mciwwc8Z-pyz7{yuOAnvG8edX%U&(!f~$|hJ<377r54;^6I-L}(&fSIn`pEc zf6z#6*s(WCewFnfJgc(~pyzVj$gC+De1t@A!UgKbDl<|bX(^UYg=Q(il}`4U9by_? zJF5HCuYWsVM}Vslb)ZoA$?#p1eWOgz)v+8-Q<+DNMeX_13_P@1H6)s ziC=dasLS*37re1Rp)LozeLES9JHzj_X#JcOM>8i79Lk}54ag=mr35h9b z=G&Q&A-lyDzQptU$o$xKE@$}e5&wja;kC3`0TZJy#s!l0D%KIZda_yGDr6sLJyLP+ z9Zozq-KU;rcjz5TboN&KS#4BJ!OLWRwuAJ(Jd753jdzaQH@&dvlaCp5|E}Rk5DsC8 zbx;}G5fA}?v+d>%@gyvk4RxfOVu7K9!NPW^aFR+(fZNCp()HnGj19`k^u0ZvP~W~xjFgN+ynCjzG%!;Cs(JIlBjLbbj6 zl*VqVWMLZ)a~d7|m|?8ay-9D0jxj1GPm%K@y{4Hd{U==FwGWAo<*}YxV{{UZD>&7M zzr4I1|6Nd?43}h}6Iyx`(Wg2vMrugf(fcNzfm)NPaUZIRAk=JmN$<&{2|8QXlbt5l z93Ga}2np^PN-{VrpjvWmMcR9hz_l;hbX0!Hv2Dk7OJe5Z3)r(p-c40})VMaxVex6P zFaJ@F(W8g$xOV33D;%{qPY*B-zxB7$s$EKY#Lv^YW*8_=825$=DO>iQm}hy;#Cs!X zUyxn-amnBX0%~CGTR5lH(%EJEHpu|alG*wPj9Ut>vsF>GbB`=yMV+@Cs*Tca%zf|g z?w8}5d+60yo-Jrbgm*)PkR@+@-?J?#XRYVdV=nyIr(|?QX{G&%--Bn!4lPg4m(&;U zdCIq>d+N%zZd1B&@5K)Pw?a=}N*%rtk@1$!x7IvsAQA2oV7LRMPf=>CI|c*vEcl{~apy_6CjRR)@Xph)uD# zCD07#HMtjCzoGDTPt3vbC~G}eSV_qVuy!R(6)Opzk-OQJ zSY3_&LA2RD&%eaBCMGuPZnsskXi@2_@$Pp6vBO5MGqPjF_8sRtHgY(`nnh<+QSv(K z;P^cXi^+S*#R?-!2l)Kt-jkUu8JsG!37CpT-G)%S^j6j|P7v)nQj_*U>I)-lR3y2y z=$jvIt)vgA63PsUa0*C+yD_)GIn^rCBe~UhGq=WfLQ#d#C}#&5e#pQ;*-bn?-i33!H{-=WiOtnlWS&(K5D*V|Ev76!L<)cCv6|r64{nark=EBveFft)X`6= z-R?h7RXhZeXlyF+LP=#MzGbj~uFBoxqvx-$li)zBjGo&|Jl~;{7Jg>Bl3Fy}=Wv@AU7dZrRms zjVpBU%8m9l^%e8f>q-7(H2&4^NYYtNMgr_Z+N*5Wysic93iDt_J zeE2PmNX*&f;(eZ%T|f8fA{Z%usIa9?M=8Cb4{Uv8@^PAVern2ZHqh4N6pPj{b9+4h z4&ETY%T{YI2EX20^9tMFbBp^1&(ez%X@h=kN9_|he6~i7i5AQrm$Vq-mV1_nZkxXL zJ*>ONHuY|>-qP0tgFW9?DfmPkzmX9o84!rXZbG}%s&mWFje=oK z3?R1!{8-p8>!YrJx z!V=z_XFF64_ikg_LH{6~ElPl1N1uA_hIee1P1G*Mrgy6B@d=DCF>B^O-up0=nZ%03 zyL;5Xu*ocBq+sDm$fy&vHm{ImWpTkM>-RKRNCe!yZL_+oO0bq*>2qYWSBCM$?EA+( z^LwObL`5(VbsR?nBtUFD@@xq7nwW-gwJ5gDl6#uJ%ZwR=T6 zMoBz4L(pHP1tIQ<)ea0-Hq$<=Xql&qa!{4RZmpet<=ymt~i<|vkoEh~KI zwk|Lq+GAZwkFO}L^;EE<#I*Qxy!?c~Me(Qe#^K>VYU)2-jZ=b}K4p3c=Yac)2CauQ(NIoq0 z`aH+MZ&E&q1x679wOL;QZ zN5@Vgo{(8SvJheu%cmZPV+D~3rO&+}5iP5r<&w@@55BDx(hO?myxJzsQ_HFU(4XB| z@l}OJc5UzRIdPo?wFjIDyoT5D5&QRQ>1p8;w5s`n1vAVmt<%y?#VU;_t6W+d?+;r% zX;+K!j9H0%MQyrb@Y*Wrt$uJ>0`l7q)4Kh^(b`{Z)W+oC%B6IdcSlJ9n`8c1)L2=ww-OS7?vcXuDiQXI#{s1c}hoYU0z#X8Goz zrQ&ydd|yktIx1;erG`&1?fX0sLg1fj;U2N$XtU#(HZS+(FFJkFB(JG(&(o=>HN_Sm z2}+4G8odEWr?bi~PJB(^bXiVw>%ZY&uD8D^ubju#z{f`TsD)FF9A6Jj$LAYQS&Z*) zV-KxCcl%84?Kjh?;`GC69kX)vxcI|dr#IByO5=iHTvnlg{MEdUE@9Y<*i1feF$%la z88?XDyAF9>ztV-d6m=V|?f9l%t0KMel(lT-D!qy0ty3aB-Z=w>v`ZCEy1l))8&~e7 z9C0RIVynvw>1jP^aIpR``VBvhgRaot;_w3(_H=u?QP!g-MkX2l+^gT;FTUG7OGHr7Q2wqW!$pNS;^ibQKs3$a$5pMX~d>ro-V~9FwpaH zPOiXHCoAwU?QQm5b-~(Of{%Tn;mb$E4QG{F@7kk`DT4Mq^>Oi{jr10dJfBy0<+(^{ zUiL?YBk7&b6Ls@@dG!y@ZEbnjqJ7m``|H*8G##~s)Hi1~iI8+v*%Mo*bI9ZR86VjXu1+vVhO^y(L?69&hOs_b}pFC&I)&d$OEr z;H5nwXmMr=Q`Q)+)6HKIXpM$Z%A+X9>PfGzyd=KeQg4e zkTz5Un%GrA&%7@wjL9e%8LWduZwA-J>M~OD1BjfNbFraq_pBG{RjAL}LcnErHoQWp zX<$uIwu0nwipw0k`MIm#-7ZYIsD4Dz)Zf^QsbeC%q_vZO?5gt9tw=gD zvmpaZ&>;BO#Os1{2gvQs57=3FfcxmuFkCBsc&O>EdE(cq zm&7u5sCg3(MK3ByLhi+)Gcn z0OK`_jn2eA z+x9Z&fEx!5ol9j@aE%UcWe<^o@>mA$)FKQy$PcOjsa%;Jy%W~j{|7YQS6 zZwh7FyFL3phB#h-j!)eeKD{oSFO<&F}o zW+65%y(e~pPK*cAQIb_Y3Nr9n4f9mHICrz1Xx59;p<4W*MVIorDV15HZ)n6|z1hF# zOT`ky4#gB8=Y1sHj`rxw9`BtuFL^(m#Aod#DZ(XH!t?PHmV(R3rR2SmR>Kc>rcXXM z6Q7^do$coApE~`h<9(y4ZOf%0cO~qT7tiY=roKm)&CY&Zp*U)589_#d2*Z{$8@UQX&<(Tl_ls(sm+qIU()b3hOv63p<$JyMND>|9qA%{d#52`U3CF+a zlU?h$qu7j89?y|5PxWgX?qc<6L?O3TYjh0D=QJ?2S<$jOr0%Y7fD2g1el`1WB=KX< z?fv7iwN;Ks6>=IP=V@EYw;m4oQQH=%+xXR{e>zo62+tP&KJc}GF?N48^DWbP@1hHp z01*YBc@m~qmw~uGMhO6K;M!!2IFJF2QQ8quxr9EfBWFY0p z@mBzXUhTBOB+HBY!!-P`r}ta5J@L=S6K#xND9y;RT)Y-2qWme{20r|1tC{)>Bkis= z{dVgz|BQ*Ef4XF5?buM)G3nWqDhZ!} zJJoekr3np@juF8RT+;4r&5+-!I@DKSq{_wDJCJX(w|tCJ@xpC)rg8kOz|?B~*+{tk zUT$&aM}_KaoNhfLd6@eka(&0tN*xp!3KSr?*w6c^Oqo~{e195Q67B;WWZ4OyFNtM&h9u*%esSkf!-I}Fo zn)0`NfGGaO6YGgnq--cD6+0io&zioGk(${)HNr7FGhY)scOS@&dHWDmo$b8A!%}*7 zKkFzA1||hPqek$-d%JiNJ-p#a$P*OE`4M@8fBQ2T5OGex6VUmMg)L__ONm z;^a((O9Nwv_i%HwgF_pLKCX5?&TuJzg0}};%EKK@jd;QXCPj4iCV;vl$k_t_b@2mr zAHcPg55bS%4woX3dV)3r@gA-o?r`WEB^9^~2%F+qRrKvnl4b-ZB#rt#Gw&BN zwS;34Aj9tezrg-q`#^)uKWU)t(!i?8qGjP&G!}*fp_vTO5ftzQXz*znR4{-?=y#|+ z8hjE-x}s&E_W!p6`}I2-w1o=P$FG7xBY~zslLxAZl?5H)WZ^O(BE^EY1p@|`feHu^ zP&f?umKX!af-jjc7|;oLePyIUTUo#W0@R@^^g9|9koG78o`?Fx0l$HjU8g};(qzyw zFz5;~0Zj@jSfFswvluY(^*%sbFfOSM_AO8{vZR8M!I7>w086ld4pJk*1lPxv2IFFY zZ9#)nN$58WxXJ=yL03{iLG7?4AwXAXeA1#p0t7q&fx@6MG2j`JP<}B7T}k=_jU)}- zARrMzY(rCnuAq?C0~&6K{`;4uuu72q6t98!Z{%k}FDmJIfb z5mFD*z?484p%D9!9{kfk)CwY!lnL|%-T%`I#1sMnBmMTX1047W>sNskVO<9xJ@~Vb zWDWEJ(uv;+Ngp7c_*F={@>3@uefYgVe39-!ACPYRD$vZa2n<+5z$>H!WC~6etO=wS zz^(mWe(A)oD@c^(DmPs;(y1%D!@&VY#}h;K!_y2 zrw2zu5D9odeK_e`VNf%KgOw%GOyMXfq6Rc`I9Oy*g8@qo%q)opnOd;q>-FF4J&NSf zHrRaB&!O>f6lrMKPoDw|BdLRRUq$*N3Jga8O0-U2H<3Wjq*jhV<$u;7Zw00g$gp(> z2Ifv`;{kL-z_yawdBCNAjs&?& z;7v)?e|@d=&(TPfpDvd)EDSg%;BraLec(We)@vk>3cjaWzXMK|v}FaRy6&1u`zFvG z$*q(2F~GctaLi98);%@A@}%d1^9FjfUiXCqLq?+h=2C%OBa!^zzX!>SO)1e)9(esuwwk_K?`a94v;{P(KKp%5r6 z_}~wXKwzXXI1AC=WOCA*;0Ob@1qB0c@aG@!av1PlIKuy-L2wiLgS-EvA+adnEjQ8- zkg9HIhmZwR-#`Pb{7r-2mW}POV9EZVfg3V3G-Lt(YzN>DSg${62owV7^`A7bf%O-E z($diT{6{;aGzzj5f6~xc$QJxb!$<><@Fxv{{<}W}S_XLX4eg{60M2ftp&%;;{rw|* zgftq&4I61#$ZBn92YIB8G~D0qfb;w7S@4bMU(aG7m%gDtj0|LR{~Q+pqK&$Nz#{+B z0|aoUe?NjoMCIKSy1(c8|&mEcYKpQxC)_!EGwgG(8Ect8f4(^+5i9m diff --git a/results/unknown_float32_gemv_t.pdf b/results/unknown_float32_gemv_t.pdf deleted file mode 100644 index 754514309c1d654ba0e633118f16d9e78282e1d7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 17773 zcmbun2|SeF_XnOmLu4&k8YJ0gF~i8dWz8;YVXS4JA<<%w?0ZDnvsOxENn~FNMJh{$ zNVd|VMfu-n)aTPz^ZS3i{{QK9J@+}!eeON?+;h))-}~H?kfFM!1X5BOCR8y7FRzCo z;Rv{w!*Q6rJY353l(!RHO3j{V@9yOSmol_>aU#G`pnx%4Q4!|k=?E2+{o@5qFHa&I zO)4N|Vc>Aw2~UK}Y}{%D5D%FU?TJos=?#LRJ(1|->j}p|w=gLaS9?b{PZv1$*Hd3F zyonPLZVj4M*95FM1rXs<+8%%f)n8Y&Usr9o?H}wQ|Ly?9H^jZK6X0%xeJNunf|s8! z9`py|-xw~X@8sxauj&;5T10?VNz`JLoH1O4A((K-~%N<>&;lOd$F?*?YhOvfHg~y?A6;PhUBP zOu!}%ILat3OEfW4weWgtGmcTz*i*|r(>|QcGU{`5sq@&Ug@-rl_W1Z8NoQ++R3@)$ zPUu>?R{AOBp3T{BUnf_-yBKsTts~@1@7bQ-J@$p+XCv<-_`9Z9|DFrv#m_eI zX!*fjeQ8p{H6*_q9!*cf6^bqmxKNa1+9Js<(|=H2qwpT{y~zgNdo{cJUD$G%pG7$M zROxrfEuVo+wSRi|@$0*3AW8qzKb@`OXVeP$x{Pyj*=^qh26&pbvHe@^)a3!X|Ra|&R{fk_3>@nf6J5hL%_rbCE zWxCoNC?!{u-uvy#yrUC&;NrA#Ri046dqpnar!_@)cFH%!?@Jl!hD+*&ojIwj`HI_- zlDp=PQTI1*aV{6;r*Dd8T(eIWg(op|d}e3;L>sg^+BtGXQcAHYX#LyUl^Oq7%wF{q zyZIycea+o1r+#D#zo0tnb3jd+-!}>UmawcG&O~<_%|hkfs*-nxN$3)fhQ;ny8 z9-hAu6^`dE?kll4P~9)klSIQIu6OfVXMTgQgYSO2!+Fx;qV+8T&Um%HOsAmF7Q~Yp zz7uzp`*tuye74+o9v>b{4dXB3i}63UCf6lmKfB~phl@>yhqD_jr>`U&`b;Y( ztyq70eP;Z7XMt`h!dwco)Lxknakby68p9H4Y$QeEW04}1oik_8 z;eMWI(;G`*ut-{T<4=7kN90|Mu?t z`!nC2alHRnQvXP=-Q98P#VN$|MeMu4${lTKowZ5dCgr+pr=E+X+p^6SbFNe>8pXy` zS;byT%^$sIltXd6zj?HG-r%~D!_|A<$KqNB%iq&9ox?1}m3lo!s~aa#mZRwuDOr0n)kYtoGAsIa5BI$s#kyjl%+ zT+OgyM2B~rJbH->_9Rhbp&@DJRmhTLwwri!M1Di?Zl+mAkMLGSW3L&*>)9@rHzHyg zabXN|Vl-t>hot@JJ+HrrD2cjy>%v8*%kII}mQM1ha>l3M!sY8+;(uhU)^@+3Y>}@B ze&AeTrMc^eY$rdBecCn#W=;(VT)cyRDTeuu`IeT7qUPntDl3q zpvyGzC`5iZ;9jpw;n$1u--BSLUO6hfqNj5WF-{1Ku#{3X); zkjE{dSL;D1OcU;us1MzxLcw0ZQz&(i#EBo6<905ra`ORj2F!#oTOxl_9o%@i(uis@ zf7=;Kj(+o}R19O)b#- zm{CCN(yi5SJG116)W)do0;NJF1Dqk$_N;77in>9>{5?6n+*y)^!#Sx>(S}cHBle*X z_8j>d>wf9%PAoM7O?ypaE>z( zeY?7;96nFz5I8P~qo4a275J*&{iG8&A$;Sx6w_5#BVLQl{V!X)CJS%n7>sE?DKX;U zN*|;?|DZ@ZvNfE7v zX(|loi*tw->6uaMWYI<;uK4mx=)bNzewV0}MwupSR$_b=R@w5lG~LP4dgT1TRh8$3 z3g1Ny)$YWFp0QD@rLob{6+aiQX~us%smx(0(3Iv>mY$i>S(co7`F;m%*QJ0e`G;1C z?MfPtq~CnZwBRn;5qKf2tk^Vnp=G?nda3k4LU*a`V>z=AVmD;Yow&q@TrYMhlXa~= zG}$br5ZYdBS!Q{lx;G!AlVDdhmn_P;XR%7wTMOK{B^>93E#7pk92B(RnTyHa%S3}( zCS1&qHPIPt$hEx!yVgO2vE4_b-Fu=)UzBTnINBi8newRxU%}Uo*b2Lm7(Ll95k_-{ zgAe^2dGnqzgjg{f=GoFRU6*zA;%dmhim9Kt#CF|4)L+e>;yYv8Pr;rAAK9SL^7&3y z>V*D@Q;*o$pPyWxZf?&W@KIa4QV}G^@G;||i)llohK7tqZryA!d+M1Rwny$X2o5mY zzw+@NMH>w&GL&9;k{5JKvgn%F*f?9P|xjThZBD^MZ;o3PG>eA+%5_ab6lYE4D;Pd=lW9hy|7$WnfbNau;D1%IVomI`D+{= zk`;nuxVMor8YkmZJIpUH>&)BwCv1!VXy5&f@tK&C;w3hQyKbQd4MTTUl{uaSRKZ?c zD(iD73sT*0u`}zs<(S2(vp2;bh336iJGd*q22V473`Z!9LOF6gm%aJz$F0&bm$Hk) z1rJ?cpDh~PQ;?-y+1_KNSRx^I%~JG(!iQR;*VfoK^i4ut@K^mjg$N(J-pf}{ zr>&N;)v7kITPb~Ze_yV5|I$PeK%|*F-OwDKQ$kCbWlq z3IvaSmTf#D+`al`IC7rl?W$~Tnoqn)QQxp**}&VQhe|_kMwvXvxV~f9L;Zro_GFMv z;+<0EY0;~0+OabUlSH;60Rz?k52pr2c1s@c>X|lu8riOuf)!2tk<`cHs9elr(tt&_25VpWE?N}h5?B)4yGIULpP|6u0f;Y7=#EZW;_95pH- zgP#g=_f0#y#{~u6Wv(PJbH_@#eH6_sUofPToanSsJ#oGVUzy1Ev-q}J4#BlTPnaL`CIiJ%qPBJAJx|9#r0Hr zzKNa$xpRr%%zTc2EuS3HInsVAYn!>xSw)%(nwm8=;WiNoC()Jnp`Qiube%tIFRf2B ze*ZCJ@PVzuXLX>w)VFn=;^~M4JG7qp1;~G`Y)I%GLi8vc2Jw5jLxTMf(Hg7`eiww+ zIGdn93M&op{(s?FTDkNiXEiAqHn6W}rosZq>6~1PLgGi(<%Qlr&Pa_<+l7~=l z!rD_O7wzfv)m)F2`uw-FrI+A@Zl7)rhxC;@tot}6oc2F)DGOWi*>Rks=^Y!I?J>W0 z8>_EzfCs#C<7*4S zMeidkh6)d#&{ZWvRw?dFsPJY*y%Q59;nOF zH`7P0$K`RbBKWo?{=O;wJI)9ELsn6JyW}dPzD|z~wn&?u+9S|v91*stP=K*{vxil; z^{aLV^~}9@-zvI1&aH3rc6F&IBl$lUmA}?fC~2Ah7O1|Fwr3lHvEe(%TJyH_iuO|m z7gU%R*RKolvM;vm6x6kuTxwJ2JX&KoqI!O+Rr4tQrI6*YC)r>2R81;9*DOp^GPQ=Y zA&vGCjb!$$YRNRh2G16T9w}S3_`q^f?18nMf7QL!mg<=#(Q;M!>8Cu;w5&$d2j44H zPY~3IJ@61k8Gbr-xA>zGh4((U`guq<+{WG#E(<$U9KND`Yfsi=5$7)Vl1>j%a}-Rb zEcU{8Q%x)873@P)aLQCT%F4jMVt@YHkZ?xgE;gV0k_QiRI!B(x|*btAzdRY>!fL zsB&^SPLCek6{F+Kc`s{!DchrcJ`?JB2laJwn5!R;zKocq`ZUM8K!(FV_X>V5*=4rE zVxlfw$Ie;zW)j2L@x4;&M~Dk)=Q}%6<5peq2H_GmuZepO zn{Xw>G&tMl_S&N!do^a&HJjQe4KL(>+ufM=p~Ct6bu%}!vPE>t8?W3|IrK@wqA-`eR_St z;&>C2ed!d5SQIAg@lLK>3U6BVbF@eHY%@N=-b{uQXrE{kCtFbuiMT3AJ*a~WTDHHW zi(QS*XYc;$f$(p=a`<{@YZQHUr@q@q?F{KdgClJ&gQIqO$CDi2bmArxZ?bCE*}NKl zR1zKvkD+ebapYs>IedS1+SPHTuJ7EB$#4kmRBhrAjo5+}Qb^YI1ZG`rRWU+xXY;pb zmp>F9sJ}RIZd6!k?gGy-A(IFQRXgps6|As#7k z5|PH2eLwdoXe}1?^0=O@yK#BYUiqWyW^dlm9f}sfXY87tQlX#Ix$kNfm3C5tN@Z!K zx+jQ!X>{Uv*G;<8iw_TqINz8j8cTZaI?vD7wNmo#b-~jUs{LPXhzWhvdK!uL6<%`f zm>KGtO#C?!N3EN1&6WSd%j=#`B<&haPwo#X?|aqrQip2W2|4ETWZH-J zAvd)zgWkeCsR0cd-awcPV5G68h?~;ppm{^kJS8l|qRs|U#e?YWV{HS@OwC~~i%Zoh z$pSv*d&iD;wz;o&gCV|X&NMO%k-->P+?E-V2J(Oy!Xk`*G4zi-^mxOaXdQNQWx=cZ zf;&QlUNI(e;k+MjnD%d_oEl`9A?KgbsQ=~$^QV7)veYQto_qfD`iOB4`*vlanma-c zctJ`j07U=h034uWkt%KhR8nJi_|%ktlVKNzIG7P*#ufZ-nm^3?AQ~ zvmN#idaM1KEowHsqS7!F7kMV}t$UtuUgLo)VeX2@qtr7FxV(&RX*n>! zv{1L`GHJ%|YCC?_17D)4)A{(`4t@RP?O)aDU23G4IDek&zuPqOo%t}oDGM2ak#Wdd zQm0?)Or4~uNee4hHow5XORIUHOPxQ1Vc&KK;E-vR_vFjcUE6yNT@IuB7qolW>+X{vU#5y!|ub&)r{c} z1DT_U3&q$ZHaXsGw359HKE7E4Nt-=KELNb~kNbWf7Ea)HS}@lVjxH03e~XS6xhxuV zC4f_Wu?uT0oUAYne`RbO-CcAgwMWULClw~=JV-6hTKCeCO<%URLEAW%D;4#Yuh?JU zOoe<`f!rHM{f8BTna>^@jAnk}@vc4g-0^T)Cy}r3OLq0cFdRqy>QpqAPCxGXn}t<2 zH&H3$241V2vwo~;HXCpJ?sY=VxKy+;8rgw3)00^DTGy7ayoXj8P&uONB@fC5^S-F2N6;l6ewW8^{|xTulvHrb$c-I$UMZkd zZtR;8QSIu!GVX>c0;wULJykdF(h0)UdDvIW~S8Ndjp64TdHI1r=RvYD8;LdPEfc@{X}oM9rvV_?Y{eR z&Xc^GXtXGQ==1u>n$mbxEmLiFeWZse z0!K}MJvk-)q6GspGJLPN!j~j|g3J%@v-!^&ECZ9fpRHe<6)-XSVq7BWplTb%-Ji$u zP9g6Et7X;Q_c-x_OhOag-iSNY=sXJi8EsTu$xIqQ+hImO9ww{8`*%(_v`$(L%Eyg) ze%Ew%+8@3n!BKT=cW@MZp7Q2y@l-694Q2U|YH>&R4lBEnQf{M|P_fh}^u&Dgwz9sL zw`QLB%cx1Lyev8M{+p|LhF!#5oskC25<9j1{t; zT4!?EpICBg5TCQkfd4KiPlih}&7r zIJv;`f|>VPs931|jT4e#i%zJa^>61nZI;ihI&?|~b5_hYwJ>cfxyn{e)5Z0C2`l2d z&8$8~`+mVnpr?Pndx6>9SDtNXCWLQul#n%V)8NFm^fR_^G~zD&5K}g?R9@>E_HUU$ zc58Wa&d|o+^_FkX^wyQ_pj5tacXD^&JE5mDQb(>uWxb>Kt9O)Yynp%m%&cn4_LG&{ zn2tKOR+bl(q%Y3Nhkka^@YReRijkG$Pquo53!QwF%V#xZBU3f$Ay`#+61kPVDviWW9~!WzSuUd#LIlG;Rdjw}p8t70^5bKbz}>2h4hHvc z*c|b(C$=WkN}zX~JH)lr@ePG<9JYW9zoBAGiY^~4{N~@zRrYau*x_11zKxWe0Bc|J zRJpR?X}O!7DK$0dAH<`37x^xZB|}_sMra<9u&u4u8QtM%r3d}Ha+n)g7H&@mcm$h%?1uL zDHg3~JG+wjck_n&=h&=IKKXig-6!&3|81^oJj;_OFFx__Jm!$hL7<2k6DgTJA!#+j zB{z|R?wl_E9{I4&E~6n#Z~3dzlm2f%srW>kzmX9o84!rXZb7>=5)ZOc@iVFg9nKT{ zB1om!?~D?`u6p60x*In?wt$B2kdcHBQ^ z+aJ6&8T*Pom|tJt{j;!htYYM>*n*k8o270qeQi#Ivoigd3GR8hg3?}?+-EXt}&MP7PyGOlGo(lwYclrOAZc>GU?H zQ<1Md9kD{98d51YgjNS&@6SZfu&<5}r{7YO&Tz$@Wk8FTaq~Xb#b$YnveoB~wW7c6 z@E}r|pXaNxh7T0ljnu$>JDGPgwq&x!2r%mC)2?6hO}KJ2hFh`qy&8K`GSduZ-Qvdz zVaIio1mPr4ucpbP*`-WWEIi3sje@ooRg$bMZWtB){$?wQ;D*|xKe?*~>lu|lN4NQ8 z8OP_{JKZ)GWCHT{jExG)%}4?M|NP zjOx8>(s9b-VOfHKs_h7IZ>)C6c@=Z*BZ}79di=L(j%s~*!Y+Vf{7@&*@j&lA^A2wkONyol%)e#RM= z6sx9Zw{Q{vO32ipQgNSPZM`;bXQ`nKwSlW1BRacK_hs5#laqXH0IqU~CMU6L+|ol@ zo$G;oWWm)1j>F%i2q`6LTuw|~6(-|{yLv)7F76ugebIlX(l?KduSTpj|6qHR?!oy04&~2LO7qln z>h}b)yDH9AY39`roOmOyldRstnapc=6(4o*fR>&XK3S`VFHA7Y;+E~jOjFTY#;>Z~ z+V9_cW;NWU9_Jmm7ClF6x@Pd!CiR_u*!5)Ox80_V2g72uzZ}&VvqCQZR7X^CeWv%) z*iRE4T~+$5soyTLZxuEo+xcOKUZ~7GnU0cy4`3twzn9e##=5@kaK=#r%r+POg>G7Y zj#wXXcfBS#MMIOx?j!PbWVFs{OL%R;leXHpKIMp4*US4N`n5*8?E_;=b>;X{;77Z=k^?(_8ij|H~jd^f&xqmTTAyn zor+mkY$r%iOPtmm2tGD_<$Cat_kElaGB)mHC*j%%k(;usK+{WE2{)dImK!m4_<^{pTE2RSRxrN<*`(cv_ z*?hjDRQ7MPt`U8?&3vw2?!#P)sYPo$&o^mRW!^t!D|_oFqlxqFQ^Nhe`9r1Km#bWK z2L^E0F5gYJbR}M5Yb*@!?>KC5xakObo*&0SU+Q3Wq{WRr(}8}J^_Yo~Nmd}&&+jWs z?+>2+Ng+-+O&~KYWKh&rzS^MCSQBV(mJ)p3iX`(5L%c6UT#z9#okvoQ0gv!-a4)oT zcA#~%gd^eg&}XiTS!`5)a|7(wUpvKU@a^kAKmKCCa)$;iclt%{C>Ommc=(~LET#$U zy|CTo&_mcwiAY&K(rp?N8ZBy_w%)rcRHZtRu$a|n+`h_ri@jgs`k{W7S_v4n5u1WV zrWA+3Q2)m_awXn6S3-_#uVv?M4Ab5gcKiz+UlAH^IIG;z;D9ow3f=dV;O4VE+IN5S zxx&WFFN7-#^FAtAX7;{F(JdO_)j#}(qP?eGJJ(kGYi{O69rfgl`7>3jA0wNS!GY(FnORuLe*Y8Iy8l~8_dM~9vIJte$^(QxH&r3W z(a;|2HdR|)K^i=Kbgavmzdf>kHS^9;8Iw^kGFS(R-U_Zu&}E|L2M{?sKfbwh@2n4ME`qSW z7@XtDhF6F%4XF#wQ;&)Moa-B;d8Y+2j8)LYXQGxrLZ zP3HV+NV2IskP`T zL+96;s_U=2coX<6I0W<5n^Em;Z}y}=T)N?GeKy^#IUyiCTkMHMrv;5M_qgohh1I-M z5;tk1?q;T6fbp6~bDmXspL;0o%;(*v7VyDE9&PW~Xv3sn|B&ZejJP0if1|x0(Fc%h zZ=Q2&jH!eOC11p~*AqX!_%cwH`}1k6%sn~LIcqCR{3Au~#ncXQ)Lfv&N8*PkonZ{p z%lT`w!;GxbPI1W{bd}F>k@=ZM<^i>8DLWn@JNZ8_U2f%~*ZC&TelJ0YjW@OH1=Eha zbI14!VzaRmlr#BC9vpN`leb!r)`f8B>^hOWN>Qr!CS-2?`up2WBO`lx7lS*<3=SD! zhS~~pkucKsrBd16&@c8e-1+JYe1_Qh>GhHAJ#@4?Vw8zilew{1;NnLbUniBjw!T0k zLtoIGKPxukVKJ?23fIOPV)Fdo2Y%#x>T{94FXhP%M&}CVwOEm{H|?*GTrRaEPD48v zY_BV1s&rS_vU-?<+1amaRLAVBqsYjR z9Qu>lf*sON)Miox%dpk#`@2fQ(ap2+_sZ&Bm+w`o(fNg}O~bwAtiL{MUCX z_-g*aGUa1`?ZNSc`fBH63i-{^3)|aoP#g*VQQsM&d;jav$I}_2LU^|GD0S=GPyb0IV!Bh?cyDZEO`pGk--upwOxDzLq#SBZj4bYUa0kC9>?Df z$*AFCrD-h`9$=&J!7n<6S029qGG6XerEPc}Cuhyh~Fs*gr_* zKn;~+dUw;aX$Z)FjV~T%o41mFgkl})V5vAx)p+m+;_$X@Y(Bk|j#qS2Z%wy(Reiig zd*s_siYtdq)Blza5XHZEVj~oa6yGGJYEN+bmD4veQa2Yj9Frsh0bwHW@1GnL&jgq= z1mRU6gb6ZLAofbb6GS9I!B_tVef=iEq}1Ju8gxwsPH(!ki^y*xbZ;m`&m!QGzV3YQ9W^7VpCd3l1a5l?!-q=>G*P9X0L zqOibUH-C^P0IsD7PX11wa49EJP0&Ul-pk#~6AqnGQiaQaMf4%K49M$&h%LA*V8IY3 z3z{>9%Yvd7a2zP$;A@XR;Y1{{g#+^Zb-%$I^ikB$)6vP7fcNrsg5v;#_HZ1SmUs}* zg@j2#EWxEfup4064UoX|6L2YaxReK63b5q`M}lFK0xW}}2EF!!OZmg4P6GJ@ssk8w zTNsGzk~-q%NOT4Ag^=5yA!4K<|9z_DUzPepDM%yue`o+0UUz#J0vLlW>4FLY4+Xs8 zP@s2E3($=O3i^VmX78=FAz%bd%HnqxgwugQKR3LJr;EE2U|hNIe1_ zl1BX*Yxf(OTEnpj5Ht7xKf(UL)`13_f6_qPrGZtGMa#mmXeSkcY0&b2LaGkWmJ_57mhSegiAJ zL4&TO&Y)#r&=q0=>J*f)K;fWwF`(lcb%3&C1UvwN!k{)W;2n}selrJM zN%{k|Bn{mlAQ3@qLtTTeAd$ucYHlOJFu%s^pQ<)8fWeI_P@u=)H4MZR;2e4#QiI>a z_4^8j4ECE5QVr6;lt3|_5c`lG{8K+v3L=t}3DgAL|5FRZ6aoPwJ^NJw4jjVzoghWn z&;dvf{x^|i4fF!ii9ZraA0VChok+U!OD7ZVeE2INt z5>6J338WXmt^JvP>%{LXBor|2Fwzyu|B*;zz406>^G|~G0vc85`tPCmKXI@sa8o2( z2+TJSBFXRR!I2O|0v=EwPC6?La)xj)vLu=*90jeY0nGvq1{vgFz)}MuN0^O0^I%yvR^ot0`{9W8W0U&xE6xPisefNq1F_=}j64AFp0~Rf(cR4f?vIv4A|#RG za91MHn;<7845R3ljYu=8|a0du9c{={C0u&`R;O6D24h8KW zP?tj?P*`x_4~;+oJA$(k`9mfFzD~|CU|Ucy;0AyFfS1F7dEpHIhX%n-=m+=wMMGjy z&^*{gLjX_s7Y&+Wo5~?&fPgpCfQ8*mLqTTdf69Ry?B@DFtB?iw-*UihY~~q(Kp{4_ z0bYfs#Q(gDlt$w=zl)Sc0#ClF92x`g`z9JVTe-O$XydQ?5NH|5CjC#l2x$Z~aW~PB z0F`W}p&9X=*=`4$WCr5C;N9f3<`L}P316X0IfIEKM5===YEAB4m+&>xQf}1o=0WgOJ{4 znhfxwn`yF;-}&EnabSl0MMM9k;|Lt~FTFzGWdD*g06Eyrynuz|Ut@_xU?4~PKfE9j zxWD*A0tmOczhHd?)@qyHMMB76GYtz6$!3}?SjKIl0fVzyE=UxJG5M>UEI8S-`CZ&! zvO=PfkpF~!|BNNrO-KC2GXPV6=pWJ7-p$>~m-OGFn79Qx0b2)`GWPO<3^d8zX?r?* e0kA~60e*o%wD%>F%py2kg+arFgft8^VgC;?6UOQQ From 85a222475538834badd0ce985ff2e008ae286d1c Mon Sep 17 00:00:00 2001 From: Andrew Sweet Date: Sun, 26 Apr 2026 07:35:26 +0000 Subject: [PATCH 3/6] fix beta bug --- mlx/backend/cpu/gemms/avx_simd_gemm.h | 37 ++++++++++++--------------- mlx/backend/cpu/gemms/avx_simd_gemv.h | 26 ++++++++----------- 2 files changed, 28 insertions(+), 35 deletions(-) diff --git a/mlx/backend/cpu/gemms/avx_simd_gemm.h b/mlx/backend/cpu/gemms/avx_simd_gemm.h index 69d4129613..0a6791addd 100644 --- a/mlx/backend/cpu/gemms/avx_simd_gemm.h +++ b/mlx/backend/cpu/gemms/avx_simd_gemm.h @@ -242,26 +242,15 @@ void simd_gemm_optimized_higher_precision( pack_A_block( a, A_packed, M, K, ldA, ic, pc, mc, kc, a_trans); - // Initialize C_acc on first K-panel + // Initialize C_acc on first K-panel. + // Note: we always start from zero and apply both alpha (to a@b) + // and beta*c at writeback time. This avoids the bug where + // pre-loading beta*c here and multiplying the whole accumulator + // by alpha at writeback would produce alpha*beta*c instead of + // beta*c. if (first_k) { - if (beta != 0.0f) { - simd::float8 beta_vec(beta); - for (int i = 0; i < mc; ++i) { - const T* c_row = c + (ic + i) * ldC + jc; - float* acc_row = C_acc + (ic + i) * NC_BLOCK; - int j = 0; - for (; j + sw <= nc; j += sw) { - simd::float8 cv = simd::load_convert_to_float(c_row + j); - simd::store(acc_row + j, beta_vec * cv); - } - for (; j < nc; ++j) { - acc_row[j] = beta * static_cast(c_row[j]); - } - } - } else { - for (int i = 0; i < mc; ++i) { - std::memset(C_acc + (ic + i) * NC_BLOCK, 0, nc * sizeof(float)); - } + for (int i = 0; i < mc; ++i) { + std::memset(C_acc + (ic + i) * NC_BLOCK, 0, nc * sizeof(float)); } } @@ -302,10 +291,13 @@ void simd_gemm_optimized_higher_precision( } } - // Write C_acc back to output on last K-panel + // Write C_acc back to output on last K-panel: + // c_out = alpha * (a@b) + beta * c_in if (last_k) { bool apply_alpha = (alpha != 1.0f); + bool apply_beta = (beta != 0.0f); simd::float8 alpha_vec(alpha); + simd::float8 beta_vec(beta); for (int i = 0; i < mc; ++i) { T* c_row = c + (ic + i) * ldC + jc; @@ -314,11 +306,16 @@ void simd_gemm_optimized_higher_precision( for (; j + sw <= nc; j += sw) { simd::float8 acc = simd::load(acc_row + j); if (apply_alpha) acc = alpha_vec * acc; + if (apply_beta) { + simd::float8 cv = simd::load_convert_to_float(c_row + j); + acc = acc + beta_vec * cv; + } simd::store_convert_from_float(c_row + j, acc); } for (; j < nc; ++j) { float val = acc_row[j]; if (apply_alpha) val *= alpha; + if (apply_beta) val += beta * static_cast(c_row[j]); c_row[j] = static_cast(val); } } diff --git a/mlx/backend/cpu/gemms/avx_simd_gemv.h b/mlx/backend/cpu/gemms/avx_simd_gemv.h index 89eedd335b..8c22a0646f 100644 --- a/mlx/backend/cpu/gemms/avx_simd_gemv.h +++ b/mlx/backend/cpu/gemms/avx_simd_gemv.h @@ -161,23 +161,12 @@ void simd_gemv( acc_buf.reset(out_len); float* acc = acc_buf.get(); - // Initialize accumulator: acc = beta * C + // Initialize accumulator to zero. We add beta*C at writeback so that + // alpha is only applied to (a@b), not to beta*c. // When M=1, C is 1×N contiguous. When N=1, C is M×1 contiguous (ldC=1). constexpr int sw = 8; - if (beta != 0.0f) { - simd::float8 beta_vec(beta); - int j = 0; - for (; j + sw <= out_len; j += sw) { - simd::float8 cv = simd::load_convert_to_float(c + j); - simd::store(acc + j, beta_vec * cv); - } - for (; j < out_len; j++) { - acc[j] = beta * static_cast(c[j]); - } - } else { - std::memset(acc, 0, out_len * sizeof(float)); - } + std::memset(acc, 0, out_len * sizeof(float)); // Accumulate: acc += op(A) * op(B) if (M == 1) { @@ -200,18 +189,25 @@ void simd_gemv( } } - // Write back: C = alpha * acc (convert fp32 → T) + // Write back: C = alpha * acc + beta * C (convert fp32 → T) bool apply_alpha = (alpha != 1.0f); + bool apply_beta = (beta != 0.0f); simd::float8 alpha_vec(alpha); + simd::float8 beta_vec(beta); int j = 0; for (; j + sw <= out_len; j += sw) { simd::float8 val = simd::load(acc + j); if (apply_alpha) val = alpha_vec * val; + if (apply_beta) { + simd::float8 cv = simd::load_convert_to_float(c + j); + val = val + beta_vec * cv; + } simd::store_convert_from_float(c + j, val); } for (; j < out_len; j++) { float val = acc[j]; if (apply_alpha) val *= alpha; + if (apply_beta) val += beta * static_cast(c[j]); c[j] = static_cast(val); } } From 7590c04386503832dcf2a630dabda268c30c5730 Mon Sep 17 00:00:00 2001 From: Andrew Sweet Date: Sun, 26 Apr 2026 23:16:17 +0000 Subject: [PATCH 4/6] formatting --- CMakeLists.txt | 11 +- benchmarks/python/blas/bench_gemm.py | 14 +- benchmarks/python/blas/bench_gemv.py | 17 +- mlx/backend/cpu/gemms/aligned_buffer.h | 4 +- mlx/backend/cpu/gemms/avx_simd_gemm.h | 687 +++++++++++++------------ mlx/backend/cpu/gemms/avx_simd_gemv.h | 328 ++++++------ mlx/backend/cpu/simd/avx_simd.h | 627 +++++++++++----------- mlx/event.h | 1 - 8 files changed, 873 insertions(+), 816 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2dcb89e57f..5890405cde 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,13 +57,16 @@ if(MLX_BUILD_CPU AND (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i[3-9]86")) check_cxx_compiler_flag("-mfma" HAS_FMA) check_cxx_compiler_flag("-mf16c" HAS_F16C) - if(HAS_AVX2 AND HAS_FMA AND HAS_F16C) - message(STATUS "Compiler supports AVX2/FMA/F16C - enabling AVX SIMD backend") + if(HAS_AVX2 + AND HAS_FMA + AND HAS_F16C) + message( + STATUS "Compiler supports AVX2/FMA/F16C - enabling AVX SIMD backend") add_compile_options(-mavx2 -mfma -mf16c) add_compile_definitions(MLX_USE_AVX) else() - message(STATUS - "Missing required x86 SIMD support - using base SIMD backend") + message( + STATUS "Missing required x86 SIMD support - using base SIMD backend") if(NOT HAS_AVX2) message(STATUS " Missing: AVX2") endif() diff --git a/benchmarks/python/blas/bench_gemm.py b/benchmarks/python/blas/bench_gemm.py index edf16e187a..ee643f3be2 100644 --- a/benchmarks/python/blas/bench_gemm.py +++ b/benchmarks/python/blas/bench_gemm.py @@ -11,9 +11,13 @@ import torch try: - device_name = subprocess.check_output( - ["sysctl", "-n", "machdep.cpu.brand_string"], stderr=subprocess.DEVNULL - ).decode("utf-8").strip() + device_name = ( + subprocess.check_output( + ["sysctl", "-n", "machdep.cpu.brand_string"], stderr=subprocess.DEVNULL + ) + .decode("utf-8") + .strip() + ) except (subprocess.CalledProcessError, FileNotFoundError): device_name = "unknown" @@ -243,7 +247,9 @@ def main(): ) if args.verbose: - print(f"{'B':>3}, {'M':>4}, {'N':>4}, {'K':>4}, {'dtype':<9}, {'t':<2}, torch_gf, mlx_gf, diff") + print( + f"{'B':>3}, {'M':>4}, {'N':>4}, {'K':>4}, {'dtype':<9}, {'t':<2}, torch_gf, mlx_gf, diff" + ) print("-" * 66) for dtype in dtypes: diff --git a/benchmarks/python/blas/bench_gemv.py b/benchmarks/python/blas/bench_gemv.py index 1dec32ee4b..e0c781562f 100644 --- a/benchmarks/python/blas/bench_gemv.py +++ b/benchmarks/python/blas/bench_gemv.py @@ -16,9 +16,13 @@ os.mkdir(results_dir) try: - device_name = subprocess.check_output( - ["sysctl", "-n", "machdep.cpu.brand_string"], stderr=subprocess.DEVNULL - ).decode("utf-8").strip() + device_name = ( + subprocess.check_output( + ["sysctl", "-n", "machdep.cpu.brand_string"], stderr=subprocess.DEVNULL + ) + .decode("utf-8") + .strip() + ) except (subprocess.CalledProcessError, FileNotFoundError): device_name = "unknown" @@ -108,7 +112,9 @@ def gemv_t_torch(m, v): return ys -def bench_lens(in_vec_len, out_vec_len, np_dtype, transpose=False, max_torch_elements=None): +def bench_lens( + in_vec_len, out_vec_len, np_dtype, transpose=False, max_torch_elements=None +): shape_mat = (in_vec_len, out_vec_len) if transpose else (out_vec_len, in_vec_len) shape_vec = (1, in_vec_len) if transpose else (in_vec_len, 1) @@ -359,7 +365,8 @@ def main(): fig.suptitle(f"{device_name}: {dtype} {op_name}") fig.savefig( os.path.join( - results_dir, f"{device_name.replace(' ', '_')}_{dtype}_{op_name}.pdf" + results_dir, + f"{device_name.replace(' ', '_')}_{dtype}_{op_name}.pdf", ) ) plt.close(fig) diff --git a/mlx/backend/cpu/gemms/aligned_buffer.h b/mlx/backend/cpu/gemms/aligned_buffer.h index 08e5aeadd5..33966ae2ed 100644 --- a/mlx/backend/cpu/gemms/aligned_buffer.h +++ b/mlx/backend/cpu/gemms/aligned_buffer.h @@ -48,7 +48,9 @@ class aligned_unique_ptr { aligned_unique_ptr(const aligned_unique_ptr&) = delete; aligned_unique_ptr& operator=(const aligned_unique_ptr&) = delete; - T* get() const { return ptr_; } + T* get() const { + return ptr_; + } void reset(size_t new_size) { if (new_size > size_) { diff --git a/mlx/backend/cpu/gemms/avx_simd_gemm.h b/mlx/backend/cpu/gemms/avx_simd_gemm.h index 0a6791addd..8ba143246f 100644 --- a/mlx/backend/cpu/gemms/avx_simd_gemm.h +++ b/mlx/backend/cpu/gemms/avx_simd_gemm.h @@ -1,9 +1,9 @@ // Copyright © 2025 Apple Inc. #pragma once +#include #include #include -#include #include #include #include @@ -15,314 +15,339 @@ namespace mlx::core { template -inline void pack_transpose_8x8( - const T* src, float* dst, int src_stride, int dst_stride) { - simd::transpose_8x8_block(src, dst, src_stride, dst_stride); +inline void +pack_transpose_8x8(const T* src, float* dst, int src_stride, int dst_stride) { + simd::transpose_8x8_block(src, dst, src_stride, dst_stride); } // Pack A block (m_block x k_block) into A_packed (MC x KC float, column-major). template static void pack_A_block( - const T* A, float* A_packed, - int M, int K, int ldA, - int M_offset, int K_offset, - int m_block, int k_block, bool a_trans) -{ - static_assert(std::is_same_v || std::is_same_v, - "T must be float16 or bfloat16"); - constexpr int simd_width = 8; - - // Zero-fill only the portions we access (edge tiles) - if (m_block < MC || k_block < KC) { - for (int k = 0; k < k_block; ++k) { - std::fill(A_packed + k * MC, A_packed + k * MC + m_block, 0.0f); - } + const T* A, + float* A_packed, + int M, + int K, + int ldA, + int M_offset, + int K_offset, + int m_block, + int k_block, + bool a_trans) { + static_assert( + std::is_same_v || std::is_same_v, + "T must be float16 or bfloat16"); + constexpr int simd_width = 8; + + // Zero-fill only the portions we access (edge tiles) + if (m_block < MC || k_block < KC) { + for (int k = 0; k < k_block; ++k) { + std::fill(A_packed + k * MC, A_packed + k * MC + m_block, 0.0f); } - - if (!a_trans) { - // A is row-major (M x K). Pack with 8x8 transpose blocks. - for (int k = 0; k < k_block; k += 8) { - int k_chunk = std::min(8, k_block - k); - - if (k_chunk == 8) { - for (int i = 0; i < m_block; i += 8) { - int m_chunk = std::min(8, m_block - i); - - if (m_chunk == 8) { - const T* a_block_start = A + (M_offset + i) * ldA + K_offset + k; - pack_transpose_8x8(a_block_start, A_packed + k * MC + i, ldA, MC); - } else { - for (int ii = 0; ii < m_chunk; ++ii) { - const T* a_src_row_ptr = A + (M_offset + i + ii) * ldA + K_offset + k; - for (int kk = 0; kk < k_chunk; ++kk) { - A_packed[(k + kk) * MC + (i + ii)] = static_cast(a_src_row_ptr[kk]); - } - } - } - } - } else { - for (int i = 0; i < m_block; ++i) { - const T* a_src_row_ptr = A + (M_offset + i) * ldA + K_offset + k; - for (int kk = 0; kk < k_chunk; ++kk) { - A_packed[(k + kk) * MC + i] = static_cast(a_src_row_ptr[kk]); - } - } + } + + if (!a_trans) { + // A is row-major (M x K). Pack with 8x8 transpose blocks. + for (int k = 0; k < k_block; k += 8) { + int k_chunk = std::min(8, k_block - k); + + if (k_chunk == 8) { + for (int i = 0; i < m_block; i += 8) { + int m_chunk = std::min(8, m_block - i); + + if (m_chunk == 8) { + const T* a_block_start = A + (M_offset + i) * ldA + K_offset + k; + pack_transpose_8x8( + a_block_start, A_packed + k * MC + i, ldA, MC); + } else { + for (int ii = 0; ii < m_chunk; ++ii) { + const T* a_src_row_ptr = + A + (M_offset + i + ii) * ldA + K_offset + k; + for (int kk = 0; kk < k_chunk; ++kk) { + A_packed[(k + kk) * MC + (i + ii)] = + static_cast(a_src_row_ptr[kk]); + } } + } } - } else { - // A is transposed (K x M row-major). Contiguous copy with SIMD convert. - for (int k = 0; k < k_block; ++k) { - const T* a_src_row_ptr = A + (K_offset + k) * ldA + M_offset; - float* a_dst_col_ptr = A_packed + k * MC; - int i = 0; - for (; i + simd_width <= m_block; i += simd_width) { - simd::float8 a_vec = simd::load_convert_to_float(a_src_row_ptr + i); - simd::store(a_dst_col_ptr + i, a_vec); - } - for (; i < m_block; ++i) { - a_dst_col_ptr[i] = static_cast(a_src_row_ptr[i]); - } + } else { + for (int i = 0; i < m_block; ++i) { + const T* a_src_row_ptr = A + (M_offset + i) * ldA + K_offset + k; + for (int kk = 0; kk < k_chunk; ++kk) { + A_packed[(k + kk) * MC + i] = static_cast(a_src_row_ptr[kk]); + } } + } + } + } else { + // A is transposed (K x M row-major). Contiguous copy with SIMD convert. + for (int k = 0; k < k_block; ++k) { + const T* a_src_row_ptr = A + (K_offset + k) * ldA + M_offset; + float* a_dst_col_ptr = A_packed + k * MC; + int i = 0; + for (; i + simd_width <= m_block; i += simd_width) { + simd::float8 a_vec = simd::load_convert_to_float(a_src_row_ptr + i); + simd::store(a_dst_col_ptr + i, a_vec); + } + for (; i < m_block; ++i) { + a_dst_col_ptr[i] = static_cast(a_src_row_ptr[i]); + } } + } } // Pack B block (k_block x n_block) into B_packed (KC x NC float, row-major). template static void pack_B_block( - const T* B, float* B_packed, - int K, int N, int ldB, - int K_offset, int N_offset, - int k_block, int n_block, bool b_trans) -{ - static_assert(std::is_same_v || std::is_same_v, - "T must be float16 or bfloat16"); - constexpr int simd_width = 8; - - if (k_block < KC || n_block < NC) { - for (int k = 0; k < k_block; ++k) { - std::fill(B_packed + k * NC, B_packed + k * NC + n_block, 0.0f); - } + const T* B, + float* B_packed, + int K, + int N, + int ldB, + int K_offset, + int N_offset, + int k_block, + int n_block, + bool b_trans) { + static_assert( + std::is_same_v || std::is_same_v, + "T must be float16 or bfloat16"); + constexpr int simd_width = 8; + + if (k_block < KC || n_block < NC) { + for (int k = 0; k < k_block; ++k) { + std::fill(B_packed + k * NC, B_packed + k * NC + n_block, 0.0f); } - - if (!b_trans) { - // B is row-major (K x N). Contiguous copy with SIMD convert. - for (int k = 0; k < k_block; ++k) { - const T* b_src_row_ptr = B + (K_offset + k) * ldB + N_offset; - float* b_dst_row_ptr = B_packed + k * NC; - int j = 0; - for (; j + simd_width <= n_block; j += simd_width) { - simd::float8 b_vec = simd::load_convert_to_float(b_src_row_ptr + j); - simd::store(b_dst_row_ptr + j, b_vec); + } + + if (!b_trans) { + // B is row-major (K x N). Contiguous copy with SIMD convert. + for (int k = 0; k < k_block; ++k) { + const T* b_src_row_ptr = B + (K_offset + k) * ldB + N_offset; + float* b_dst_row_ptr = B_packed + k * NC; + int j = 0; + for (; j + simd_width <= n_block; j += simd_width) { + simd::float8 b_vec = simd::load_convert_to_float(b_src_row_ptr + j); + simd::store(b_dst_row_ptr + j, b_vec); + } + for (; j < n_block; ++j) { + b_dst_row_ptr[j] = static_cast(b_src_row_ptr[j]); + } + } + } else { + // B is transposed (N x K row-major). Pack with 8x8 transpose blocks. + for (int k = 0; k < k_block; k += 8) { + int k_chunk = std::min(8, k_block - k); + + if (k_chunk == 8) { + for (int j = 0; j < n_block; j += 8) { + int n_chunk = std::min(8, n_block - j); + + if (n_chunk == 8) { + const T* b_block_start = B + (N_offset + j) * ldB + K_offset + k; + float tmp_transpose[64]; + pack_transpose_8x8(b_block_start, tmp_transpose, ldB, 8); + for (int kk = 0; kk < 8; ++kk) { + for (int jj = 0; jj < 8; ++jj) { + B_packed[(k + kk) * NC + (j + jj)] = tmp_transpose[kk * 8 + jj]; + } } - for (; j < n_block; ++j) { - b_dst_row_ptr[j] = static_cast(b_src_row_ptr[j]); + } else { + for (int kk = 0; kk < k_chunk; ++kk) { + float* b_dst_row_ptr = B_packed + (k + kk) * NC + j; + for (int jj = 0; jj < n_chunk; ++jj) { + b_dst_row_ptr[jj] = static_cast( + B[(N_offset + j + jj) * ldB + (K_offset + k + kk)]); + } } + } } - } else { - // B is transposed (N x K row-major). Pack with 8x8 transpose blocks. - for (int k = 0; k < k_block; k += 8) { - int k_chunk = std::min(8, k_block - k); - - if (k_chunk == 8) { - for (int j = 0; j < n_block; j += 8) { - int n_chunk = std::min(8, n_block - j); - - if (n_chunk == 8) { - const T* b_block_start = B + (N_offset + j) * ldB + K_offset + k; - float tmp_transpose[64]; - pack_transpose_8x8(b_block_start, tmp_transpose, ldB, 8); - for (int kk = 0; kk < 8; ++kk) { - for (int jj = 0; jj < 8; ++jj) { - B_packed[(k + kk) * NC + (j + jj)] = tmp_transpose[kk * 8 + jj]; - } - } - } else { - for (int kk = 0; kk < k_chunk; ++kk) { - float* b_dst_row_ptr = B_packed + (k + kk) * NC + j; - for (int jj = 0; jj < n_chunk; ++jj) { - b_dst_row_ptr[jj] = static_cast(B[(N_offset + j + jj) * ldB + (K_offset + k + kk)]); - } - } - } - } - } else { - for (int kk = 0; kk < k_chunk; ++kk) { - float* b_dst_row_ptr = B_packed + (k + kk) * NC; - for (int j = 0; j < n_block; ++j) { - b_dst_row_ptr[j] = static_cast(B[(N_offset + j) * ldB + (K_offset + k + kk)]); - } - } - } + } else { + for (int kk = 0; kk < k_chunk; ++kk) { + float* b_dst_row_ptr = B_packed + (k + kk) * NC; + for (int j = 0; j < n_block; ++j) { + b_dst_row_ptr[j] = static_cast( + B[(N_offset + j) * ldB + (K_offset + k + kk)]); + } } + } } + } } -/** - * Optimized single-threaded matrix multiplication using AVX/FMA with - * float32 accumulation. Inputs/outputs are float16 or bfloat16. - * - * Uses jc→pc→ic loop order (classic Goto) so B is packed once per (jc,pc) - * and reused across all ic blocks. An M×NC_BLOCK fp32 accumulator keeps - * partial C sums in fp32 across K-panels, eliminating fp16 round-trips. - */ +// Single-threaded fp16/bf16 GEMM with fp32 accumulation. Goto-style +// jc→pc→ic blocking; A and B are packed to fp32 once per panel. template void simd_gemm_optimized_higher_precision( - const T* a, const T* b, T* c, - bool a_trans, bool b_trans, - int M, int N, int K, - int ldA, int ldB, int ldC, - float alpha, float beta) -{ - static_assert(std::is_same_v || std::is_same_v, - "GEMM kernel requires float16_t or bfloat16_t."); - - // --- Blocking Parameters --- - constexpr int MR = 6; - constexpr int NR = 16; - static_assert(NR % 8 == 0, "NR must be multiple of float SIMD width (8)"); - - constexpr int KC_BLOCK = 256; - constexpr int MC_BLOCK = 96; - constexpr int NC_BLOCK = 256; - - static_assert(MC_BLOCK % MR == 0, "MC_BLOCK must be a multiple of MR"); - static_assert(NC_BLOCK % NR == 0, "NC_BLOCK must be a multiple of NR"); - - // Thread-local buffers (grow-only, reused across calls) - thread_local aligned_unique_ptr A_packed_buf(MC_BLOCK * KC_BLOCK); - thread_local aligned_unique_ptr B_packed_buf(KC_BLOCK * NC_BLOCK); - thread_local aligned_unique_ptr C_acc_buf(1); - - A_packed_buf.reset(MC_BLOCK * KC_BLOCK); - B_packed_buf.reset(KC_BLOCK * NC_BLOCK); - C_acc_buf.reset(M * NC_BLOCK); - - float* A_packed = A_packed_buf.get(); - float* B_packed = B_packed_buf.get(); - float* C_acc = C_acc_buf.get(); - - // Scalar fallback for edge tiles (m_micro < MR or n_micro < NR) - auto compute_block_scalar_partial = []( - - const float* A_panel, - const float* B_panel, - float* C_sub, - int ldc_acc, - int m_micro, int n_micro, int k_block, - int a_stride, - int b_stride) - { - for (int i = 0; i < m_micro; ++i) { - for (int j = 0; j < n_micro; ++j) { - float acc = C_sub[i * ldc_acc + j]; - for (int k = 0; k < k_block; ++k) { - acc += A_panel[i + k * a_stride] * B_panel[k * b_stride + j]; - } - C_sub[i * ldc_acc + j] = acc; + const T* a, + const T* b, + T* c, + bool a_trans, + bool b_trans, + int M, + int N, + int K, + int ldA, + int ldB, + int ldC, + float alpha, + float beta) { + static_assert( + std::is_same_v || std::is_same_v, + "GEMM kernel requires float16_t or bfloat16_t."); + + // Blocking parameters. + constexpr int MR = 6; + constexpr int NR = 16; + static_assert(NR % 8 == 0, "NR must be multiple of float SIMD width (8)"); + + constexpr int KC_BLOCK = 256; + constexpr int MC_BLOCK = 96; + constexpr int NC_BLOCK = 256; + + static_assert(MC_BLOCK % MR == 0, "MC_BLOCK must be a multiple of MR"); + static_assert(NC_BLOCK % NR == 0, "NC_BLOCK must be a multiple of NR"); + + // Thread-local buffers (grow-only, reused across calls) + thread_local aligned_unique_ptr A_packed_buf(MC_BLOCK * KC_BLOCK); + thread_local aligned_unique_ptr B_packed_buf(KC_BLOCK * NC_BLOCK); + thread_local aligned_unique_ptr C_acc_buf(1); + + A_packed_buf.reset(MC_BLOCK * KC_BLOCK); + B_packed_buf.reset(KC_BLOCK * NC_BLOCK); + C_acc_buf.reset(M * NC_BLOCK); + + float* A_packed = A_packed_buf.get(); + float* B_packed = B_packed_buf.get(); + float* C_acc = C_acc_buf.get(); + + // Scalar fallback for edge tiles (m_micro < MR or n_micro < NR) + auto compute_block_scalar_partial = []( + + const float* A_panel, + const float* B_panel, + float* C_sub, + int ldc_acc, + int m_micro, + int n_micro, + int k_block, + int a_stride, + int b_stride) { + for (int i = 0; i < m_micro; ++i) { + for (int j = 0; j < n_micro; ++j) { + float acc = C_sub[i * ldc_acc + j]; + for (int k = 0; k < k_block; ++k) { + acc += A_panel[i + k * a_stride] * B_panel[k * b_stride + j]; + } + C_sub[i * ldc_acc + j] = acc; + } + } + }; + + constexpr int sw = 8; + + for (int jc = 0; jc < N; jc += NC_BLOCK) { + int nc = std::min(NC_BLOCK, N - jc); + + for (int pc = 0; pc < K; pc += KC_BLOCK) { + int kc = std::min(KC_BLOCK, K - pc); + bool first_k = (pc == 0); + bool last_k = (pc + kc >= K); + + pack_B_block( + b, B_packed, K, N, ldB, pc, jc, kc, nc, b_trans); + + for (int ic = 0; ic < M; ic += MC_BLOCK) { + int mc = std::min(MC_BLOCK, M - ic); + + pack_A_block( + a, A_packed, M, K, ldA, ic, pc, mc, kc, a_trans); + + // Zero C_acc on first K-panel; alpha and beta*C are applied at + // writeback. + if (first_k) { + for (int i = 0; i < mc; ++i) { + std::memset(C_acc + (ic + i) * NC_BLOCK, 0, nc * sizeof(float)); + } + } + + // Microkernel loop + for (int ir = 0; ir < mc; ir += MR) { + int m_micro = std::min(MR, mc - ir); + + for (int jr = 0; jr < nc; jr += NR) { + int n_micro = std::min(NR, nc - jr); + + const float* a_ptr = A_packed + ir; + const float* b_ptr = B_packed + jr; + float* c_ptr = C_acc + (ic + ir) * NC_BLOCK + jr; + + // Prefetch next C_acc tile into L2 + if (jr + NR < nc) { + for (int pi = 0; pi < MR && ir + pi < mc; ++pi) + _mm_prefetch( + reinterpret_cast( + C_acc + (ic + ir + pi) * NC_BLOCK + jr + NR), + _MM_HINT_T1); + } else if (ir + MR < mc) { + for (int pi = 0; pi < MR && ir + MR + pi < mc; ++pi) + _mm_prefetch( + reinterpret_cast( + C_acc + (ic + ir + MR + pi) * NC_BLOCK), + _MM_HINT_T1); + } + + if (m_micro == MR && n_micro == NR) { + simd::micro_kernel_6x16( + a_ptr, b_ptr, c_ptr, NC_BLOCK, kc, MC_BLOCK, NC_BLOCK); + } else { + compute_block_scalar_partial( + a_ptr, + b_ptr, + c_ptr, + NC_BLOCK, + m_micro, + n_micro, + kc, + MC_BLOCK, + NC_BLOCK); } + } } - }; - - constexpr int sw = 8; - - for (int jc = 0; jc < N; jc += NC_BLOCK) { - int nc = std::min(NC_BLOCK, N - jc); - - for (int pc = 0; pc < K; pc += KC_BLOCK) { - int kc = std::min(KC_BLOCK, K - pc); - bool first_k = (pc == 0); - bool last_k = (pc + kc >= K); - - pack_B_block( - b, B_packed, K, N, ldB, pc, jc, kc, nc, b_trans); - - for (int ic = 0; ic < M; ic += MC_BLOCK) { - int mc = std::min(MC_BLOCK, M - ic); - - pack_A_block( - a, A_packed, M, K, ldA, ic, pc, mc, kc, a_trans); - - // Initialize C_acc on first K-panel. - // Note: we always start from zero and apply both alpha (to a@b) - // and beta*c at writeback time. This avoids the bug where - // pre-loading beta*c here and multiplying the whole accumulator - // by alpha at writeback would produce alpha*beta*c instead of - // beta*c. - if (first_k) { - for (int i = 0; i < mc; ++i) { - std::memset(C_acc + (ic + i) * NC_BLOCK, 0, nc * sizeof(float)); - } - } - - // Microkernel loop - for (int ir = 0; ir < mc; ir += MR) { - int m_micro = std::min(MR, mc - ir); - - for (int jr = 0; jr < nc; jr += NR) { - int n_micro = std::min(NR, nc - jr); - - const float* a_ptr = A_packed + ir; - const float* b_ptr = B_packed + jr; - float* c_ptr = C_acc + (ic + ir) * NC_BLOCK + jr; - - // Prefetch next C_acc tile into L2 - if (jr + NR < nc) { - for (int pi = 0; pi < MR && ir + pi < mc; ++pi) - _mm_prefetch(reinterpret_cast( - C_acc + (ic + ir + pi) * NC_BLOCK + jr + NR), _MM_HINT_T1); - } else if (ir + MR < mc) { - for (int pi = 0; pi < MR && ir + MR + pi < mc; ++pi) - _mm_prefetch(reinterpret_cast( - C_acc + (ic + ir + MR + pi) * NC_BLOCK), _MM_HINT_T1); - } - - if (m_micro == MR && n_micro == NR) { - simd::micro_kernel_6x16( - a_ptr, b_ptr, c_ptr, - NC_BLOCK, kc, - MC_BLOCK, NC_BLOCK); - } else { - compute_block_scalar_partial( - a_ptr, b_ptr, c_ptr, - NC_BLOCK, - m_micro, n_micro, kc, - MC_BLOCK, NC_BLOCK); - } - } - } - - // Write C_acc back to output on last K-panel: - // c_out = alpha * (a@b) + beta * c_in - if (last_k) { - bool apply_alpha = (alpha != 1.0f); - bool apply_beta = (beta != 0.0f); - simd::float8 alpha_vec(alpha); - simd::float8 beta_vec(beta); - - for (int i = 0; i < mc; ++i) { - T* c_row = c + (ic + i) * ldC + jc; - float* acc_row = C_acc + (ic + i) * NC_BLOCK; - int j = 0; - for (; j + sw <= nc; j += sw) { - simd::float8 acc = simd::load(acc_row + j); - if (apply_alpha) acc = alpha_vec * acc; - if (apply_beta) { - simd::float8 cv = simd::load_convert_to_float(c_row + j); - acc = acc + beta_vec * cv; - } - simd::store_convert_from_float(c_row + j, acc); - } - for (; j < nc; ++j) { - float val = acc_row[j]; - if (apply_alpha) val *= alpha; - if (apply_beta) val += beta * static_cast(c_row[j]); - c_row[j] = static_cast(val); - } - } - } - } // ic - } // pc - } // jc + + // Writeback: C = alpha * acc + beta * C + if (last_k) { + bool apply_alpha = (alpha != 1.0f); + bool apply_beta = (beta != 0.0f); + simd::float8 alpha_vec(alpha); + simd::float8 beta_vec(beta); + + for (int i = 0; i < mc; ++i) { + T* c_row = c + (ic + i) * ldC + jc; + float* acc_row = C_acc + (ic + i) * NC_BLOCK; + int j = 0; + for (; j + sw <= nc; j += sw) { + simd::float8 acc = simd::load(acc_row + j); + if (apply_alpha) + acc = alpha_vec * acc; + if (apply_beta) { + simd::float8 cv = simd::load_convert_to_float(c_row + j); + acc = acc + beta_vec * cv; + } + simd::store_convert_from_float(c_row + j, acc); + } + for (; j < nc; ++j) { + float val = acc_row[j]; + if (apply_alpha) + val *= alpha; + if (apply_beta) + val += beta * static_cast(c_row[j]); + c_row[j] = static_cast(val); + } + } + } + } // ic + } // pc + } // jc } // Public interface: validates dimensions and dispatches to the blocked kernel. @@ -337,58 +362,56 @@ void simd_gemm( size_t N_s, size_t K_s, float alpha = 1.0f, - float beta = 0.0f) -{ - static_assert(std::is_same_v || std::is_same_v, - "simd_gemm requires T = float16_t or bfloat16_t."); - static_assert(std::is_same_v, - "simd_gemm requires AccT = float."); - - if (M_s > static_cast(std::numeric_limits::max()) || - N_s > static_cast(std::numeric_limits::max()) || - K_s > static_cast(std::numeric_limits::max())) { - throw std::overflow_error("Matrix dimensions exceed int limits."); - } - int M = static_cast(M_s); - int N = static_cast(N_s); - int K = static_cast(K_s); - - if (M <= 0 || N <= 0) return; - - int ldA = (!a_trans) ? K : M; - int ldB = (!b_trans) ? N : K; - int ldC = N; - - // K=0: C = beta * C - if (K <= 0) { - if (beta == 0.0f) { - for (int i = 0; i < M; ++i) { - T zero_val = static_cast(0.0f); - std::fill(c + i * ldC, c + i * ldC + N, zero_val); - } - } else if (beta != 1.0f) { - for (int i = 0; i < M; ++i) { - for (int j = 0; j < N; ++j) { - float c_old_f = static_cast(c[i * ldC + j]); - c[i * ldC + j] = static_cast(beta * c_old_f); - } - } + float beta = 0.0f) { + static_assert( + std::is_same_v || std::is_same_v, + "simd_gemm requires T = float16_t or bfloat16_t."); + static_assert( + std::is_same_v, "simd_gemm requires AccT = float."); + + if (M_s > static_cast(std::numeric_limits::max()) || + N_s > static_cast(std::numeric_limits::max()) || + K_s > static_cast(std::numeric_limits::max())) { + throw std::overflow_error("Matrix dimensions exceed int limits."); + } + int M = static_cast(M_s); + int N = static_cast(N_s); + int K = static_cast(K_s); + + if (M <= 0 || N <= 0) + return; + + int ldA = (!a_trans) ? K : M; + int ldB = (!b_trans) ? N : K; + int ldC = N; + + // K=0: C = beta * C + if (K <= 0) { + if (beta == 0.0f) { + for (int i = 0; i < M; ++i) { + T zero_val = static_cast(0.0f); + std::fill(c + i * ldC, c + i * ldC + N, zero_val); + } + } else if (beta != 1.0f) { + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + float c_old_f = static_cast(c[i * ldC + j]); + c[i * ldC + j] = static_cast(beta * c_old_f); } - return; + } } - - // Dispatch to GEMV for M=1 or N=1 (avoids blocked GEMM overhead) - if (M == 1 || N == 1) { - simd_gemv(a, b, c, a_trans, b_trans, M, N, K, ldA, ldB, ldC, alpha, beta); - return; - } - - simd_gemm_optimized_higher_precision( - a, b, c, - a_trans, b_trans, - M, N, K, - ldA, ldB, ldC, - alpha, beta); + return; + } + + // Dispatch to GEMV for M=1 or N=1 (avoids blocked GEMM overhead) + if (M == 1 || N == 1) { + simd_gemv( + a, b, c, a_trans, b_trans, M, N, K, ldA, ldB, ldC, alpha, beta); + return; + } + + simd_gemm_optimized_higher_precision( + a, b, c, a_trans, b_trans, M, N, K, ldA, ldB, ldC, alpha, beta); } } // namespace mlx::core \ No newline at end of file diff --git a/mlx/backend/cpu/gemms/avx_simd_gemv.h b/mlx/backend/cpu/gemms/avx_simd_gemv.h index 8c22a0646f..178e9dd450 100644 --- a/mlx/backend/cpu/gemms/avx_simd_gemv.h +++ b/mlx/backend/cpu/gemms/avx_simd_gemv.h @@ -1,215 +1,193 @@ // Copyright © 2025 Apple Inc. #pragma once +#include #include #include -#include #include "mlx/backend/cpu/gemms/aligned_buffer.h" #include "mlx/backend/cpu/simd/avx_simd.h" namespace mlx::core { -// Block size for output dimension in outer-product GEMV. -// 4096 floats = 16KB of fp32 accumulator, fits comfortably in L1 cache -// alongside the B row data and vector operand. +// Output-dim block: 4096 fp32 = 16KB, fits in L1 alongside B rows. constexpr int GEMV_NC_BLOCK = 4096; -// -------------------------------------------------------------------------- -// Outer-product GEMV core. -// acc[0:width] += sum_k vec[k] * mat[k * mat_stride + 0 : width] -// -// vec: K contiguous T elements (the "vector" operand) -// mat: K rows of `width` T elements, row stride = mat_stride -// acc: `width` fp32 elements (caller-initialized) -// -// Blocks along the output dimension so the accumulator fits in L1. -// -------------------------------------------------------------------------- +// acc[0:width] += sum_k vec[k] * mat[k * mat_stride + 0:width] template static void gemv_outer_product( const T* vec, const T* mat, float* acc, - int K, int width, int mat_stride) -{ - constexpr int sw = 8; - - for (int jc = 0; jc < width; jc += GEMV_NC_BLOCK) { - int nc = std::min(GEMV_NC_BLOCK, width - jc); - float* acc_block = acc + jc; - - for (int k = 0; k < K; k++) { - float v = static_cast(vec[k]); - simd::float8 v_bcast(v); - const T* mat_row = mat + k * mat_stride + jc; - - // Prefetch start of next row for this block - if (k + 1 < K) { - _mm_prefetch( - reinterpret_cast(mat + (k + 1) * mat_stride + jc), - _MM_HINT_T0); - } - - int j = 0; - for (; j + sw <= nc; j += sw) { - simd::float8 m = simd::load_convert_to_float(mat_row + j); - simd::float8 c = simd::load(acc_block + j); - simd::store(acc_block + j, - simd::fma(v_bcast, m, c)); - } - for (; j < nc; j++) { - acc_block[j] += v * static_cast(mat_row[j]); - } - } + int K, + int width, + int mat_stride) { + constexpr int sw = 8; + + for (int jc = 0; jc < width; jc += GEMV_NC_BLOCK) { + int nc = std::min(GEMV_NC_BLOCK, width - jc); + float* acc_block = acc + jc; + + for (int k = 0; k < K; k++) { + float v = static_cast(vec[k]); + simd::float8 v_bcast(v); + const T* mat_row = mat + k * mat_stride + jc; + + // Prefetch start of next row for this block + if (k + 1 < K) { + _mm_prefetch( + reinterpret_cast(mat + (k + 1) * mat_stride + jc), + _MM_HINT_T0); + } + + int j = 0; + for (; j + sw <= nc; j += sw) { + simd::float8 m = simd::load_convert_to_float(mat_row + j); + simd::float8 c = simd::load(acc_block + j); + simd::store( + acc_block + j, simd::fma(v_bcast, m, c)); + } + for (; j < nc; j++) { + acc_block[j] += v * static_cast(mat_row[j]); + } } + } } -// -------------------------------------------------------------------------- -// Dot-product GEMV core. -// acc[i] += dot(mat[i * mat_stride : +K], vec[0:K]) for i = 0..n_outputs-1 -// -// Processes 4 rows at once to amortize vec loads across rows. -// -------------------------------------------------------------------------- +// acc[i] += dot(mat[i*mat_stride : +K], vec[0:K]); 4-row unroll to share vec +// loads. template static void gemv_dot_product( const T* mat, const T* vec, float* acc, - int n_outputs, int K, int mat_stride) -{ - constexpr int sw = 8; - constexpr int UNROLL = 4; - - int i = 0; - for (; i + UNROLL <= n_outputs; i += UNROLL) { - simd::float8 s0, s1, s2, s3; - - const T* r0 = mat + (i + 0) * mat_stride; - const T* r1 = mat + (i + 1) * mat_stride; - const T* r2 = mat + (i + 2) * mat_stride; - const T* r3 = mat + (i + 3) * mat_stride; - - int k = 0; - for (; k + sw <= K; k += sw) { - simd::float8 v = simd::load_convert_to_float(vec + k); - s0 = simd::fma(simd::load_convert_to_float(r0 + k), v, s0); - s1 = simd::fma(simd::load_convert_to_float(r1 + k), v, s1); - s2 = simd::fma(simd::load_convert_to_float(r2 + k), v, s2); - s3 = simd::fma(simd::load_convert_to_float(r3 + k), v, s3); - } - - float d0 = simd::sum(s0); - float d1 = simd::sum(s1); - float d2 = simd::sum(s2); - float d3 = simd::sum(s3); - - for (; k < K; k++) { - float vk = static_cast(vec[k]); - d0 += vk * static_cast(r0[k]); - d1 += vk * static_cast(r1[k]); - d2 += vk * static_cast(r2[k]); - d3 += vk * static_cast(r3[k]); - } - - acc[i + 0] += d0; - acc[i + 1] += d1; - acc[i + 2] += d2; - acc[i + 3] += d3; + int n_outputs, + int K, + int mat_stride) { + constexpr int sw = 8; + constexpr int UNROLL = 4; + + int i = 0; + for (; i + UNROLL <= n_outputs; i += UNROLL) { + simd::float8 s0, s1, s2, s3; + + const T* r0 = mat + (i + 0) * mat_stride; + const T* r1 = mat + (i + 1) * mat_stride; + const T* r2 = mat + (i + 2) * mat_stride; + const T* r3 = mat + (i + 3) * mat_stride; + + int k = 0; + for (; k + sw <= K; k += sw) { + simd::float8 v = simd::load_convert_to_float(vec + k); + s0 = simd::fma(simd::load_convert_to_float(r0 + k), v, s0); + s1 = simd::fma(simd::load_convert_to_float(r1 + k), v, s1); + s2 = simd::fma(simd::load_convert_to_float(r2 + k), v, s2); + s3 = simd::fma(simd::load_convert_to_float(r3 + k), v, s3); + } + + float d0 = simd::sum(s0); + float d1 = simd::sum(s1); + float d2 = simd::sum(s2); + float d3 = simd::sum(s3); + + for (; k < K; k++) { + float vk = static_cast(vec[k]); + d0 += vk * static_cast(r0[k]); + d1 += vk * static_cast(r1[k]); + d2 += vk * static_cast(r2[k]); + d3 += vk * static_cast(r3[k]); + } + + acc[i + 0] += d0; + acc[i + 1] += d1; + acc[i + 2] += d2; + acc[i + 3] += d3; + } + + for (; i < n_outputs; i++) { + simd::float8 s; + const T* row = mat + i * mat_stride; + + int k = 0; + for (; k + sw <= K; k += sw) { + simd::float8 v = simd::load_convert_to_float(vec + k); + s = simd::fma(simd::load_convert_to_float(row + k), v, s); } - for (; i < n_outputs; i++) { - simd::float8 s; - const T* row = mat + i * mat_stride; - - int k = 0; - for (; k + sw <= K; k += sw) { - simd::float8 v = simd::load_convert_to_float(vec + k); - s = simd::fma(simd::load_convert_to_float(row + k), v, s); - } - - float d = simd::sum(s); - for (; k < K; k++) { - d += static_cast(vec[k]) * static_cast(row[k]); - } - acc[i] += d; + float d = simd::sum(s); + for (; k < K; k++) { + d += static_cast(vec[k]) * static_cast(row[k]); } + acc[i] += d; + } } -// -------------------------------------------------------------------------- -// Public GEMV interface. -// Handles M=1 and N=1 with all transpose combinations. -// C = alpha * op(A) * op(B) + beta * C -// -// Dispatch logic: -// M=1, B not transposed → outer product (SIMD along N, stream B rows) -// M=1, B transposed → dot product (SIMD along K, one dot per j) -// N=1, A not transposed → dot product (SIMD along K, one dot per i) -// N=1, A transposed → outer product (SIMD along M, stream A cols) -// -------------------------------------------------------------------------- +// C = alpha * op(A) * op(B) + beta * C, for M=1 or N=1. +// Dispatches to outer-product or dot-product core based on shape and transpose. template void simd_gemv( - const T* a, const T* b, T* c, - bool a_trans, bool b_trans, - int M, int N, int K, - int ldA, int ldB, int ldC, - float alpha, float beta) -{ - int out_len = (M == 1) ? N : M; - - // fp32 accumulator (thread-local, grow-only) - thread_local aligned_unique_ptr acc_buf(1); - acc_buf.reset(out_len); - float* acc = acc_buf.get(); - - // Initialize accumulator to zero. We add beta*C at writeback so that - // alpha is only applied to (a@b), not to beta*c. - // When M=1, C is 1×N contiguous. When N=1, C is M×1 contiguous (ldC=1). - constexpr int sw = 8; - - std::memset(acc, 0, out_len * sizeof(float)); - - // Accumulate: acc += op(A) * op(B) - if (M == 1) { - // A is always contiguous for M=1: a[k] for k=0..K-1 - if (!b_trans) { - // B is row-major K×N, stride ldB → outer product along N - gemv_outer_product(a, b, acc, K, N, ldB); - } else { - // B stored as N×K, stride ldB → dot product per output j - gemv_dot_product(b, a, acc, N, K, ldB); - } + const T* a, + const T* b, + T* c, + bool a_trans, + bool b_trans, + int M, + int N, + int K, + int ldA, + int ldB, + int ldC, + float alpha, + float beta) { + int out_len = (M == 1) ? N : M; + + // Thread-local fp32 accumulator (grow-only). + thread_local aligned_unique_ptr acc_buf(1); + acc_buf.reset(out_len); + float* acc = acc_buf.get(); + + constexpr int sw = 8; + std::memset(acc, 0, out_len * sizeof(float)); + + // acc += op(A) * op(B) + if (M == 1) { + if (!b_trans) { + gemv_outer_product(a, b, acc, K, N, ldB); } else { - // N=1: B is always contiguous: b[k] for k=0..K-1 - if (!a_trans) { - // A is row-major M×K, stride ldA → dot product per output i - gemv_dot_product(a, b, acc, M, K, ldA); - } else { - // A stored as K×M, stride ldA → outer product along M - gemv_outer_product(b, a, acc, K, M, ldA); - } + gemv_dot_product(b, a, acc, N, K, ldB); } - - // Write back: C = alpha * acc + beta * C (convert fp32 → T) - bool apply_alpha = (alpha != 1.0f); - bool apply_beta = (beta != 0.0f); - simd::float8 alpha_vec(alpha); - simd::float8 beta_vec(beta); - int j = 0; - for (; j + sw <= out_len; j += sw) { - simd::float8 val = simd::load(acc + j); - if (apply_alpha) val = alpha_vec * val; - if (apply_beta) { - simd::float8 cv = simd::load_convert_to_float(c + j); - val = val + beta_vec * cv; - } - simd::store_convert_from_float(c + j, val); + } else { + if (!a_trans) { + gemv_dot_product(a, b, acc, M, K, ldA); + } else { + gemv_outer_product(b, a, acc, K, M, ldA); } - for (; j < out_len; j++) { - float val = acc[j]; - if (apply_alpha) val *= alpha; - if (apply_beta) val += beta * static_cast(c[j]); - c[j] = static_cast(val); + } + + // Writeback: C = alpha * acc + beta * C (convert fp32 → T) + bool apply_alpha = (alpha != 1.0f); + bool apply_beta = (beta != 0.0f); + simd::float8 alpha_vec(alpha); + simd::float8 beta_vec(beta); + int j = 0; + for (; j + sw <= out_len; j += sw) { + simd::float8 val = simd::load(acc + j); + if (apply_alpha) + val = alpha_vec * val; + if (apply_beta) { + simd::float8 cv = simd::load_convert_to_float(c + j); + val = val + beta_vec * cv; } + simd::store_convert_from_float(c + j, val); + } + for (; j < out_len; j++) { + float val = acc[j]; + if (apply_alpha) + val *= alpha; + if (apply_beta) + val += beta * static_cast(c[j]); + c[j] = static_cast(val); + } } } // namespace mlx::core diff --git a/mlx/backend/cpu/simd/avx_simd.h b/mlx/backend/cpu/simd/avx_simd.h index cb3d684530..58ef4c7c02 100644 --- a/mlx/backend/cpu/simd/avx_simd.h +++ b/mlx/backend/cpu/simd/avx_simd.h @@ -11,198 +11,225 @@ namespace mlx::core::simd { // Forward declarations -template struct Simd; -template inline Simd load(const T* ptr); -template inline void store(T* ptr, Simd x); -template inline Simd broadcast(const T* ptr); -template inline Simd fma(Simd a, Simd b, Simd c); - +template +struct Simd; +template +inline Simd load(const T* ptr); +template +inline void store(T* ptr, Simd x); +template +inline Simd broadcast(const T* ptr); +template +inline Simd fma(Simd a, Simd b, Simd c); // Simd — wraps __m256 for AVX operations. using float8 = Simd; template <> struct Simd { - static constexpr int size = 8; - __m256 value; - - Simd() : value(_mm256_setzero_ps()) {} - Simd(float v) : value(_mm256_set1_ps(v)) {} - explicit Simd(__m256 v) : value(v) {} - Simd(const Simd& other) = default; - Simd& operator=(const Simd& other) = default; - operator __m256() const { return value; } + static constexpr int size = 8; + __m256 value; + + Simd() : value(_mm256_setzero_ps()) {} + Simd(float v) : value(_mm256_set1_ps(v)) {} + explicit Simd(__m256 v) : value(v) {} + Simd(const Simd& other) = default; + Simd& operator=(const Simd& other) = default; + operator __m256() const { + return value; + } }; // --- Load/Store (float) --- -template <> inline float8 load(const float* x) { - return float8(_mm256_loadu_ps(x)); +template <> +inline float8 load(const float* x) { + return float8(_mm256_loadu_ps(x)); } -template <> inline void store(float* dst, float8 x) { - _mm256_storeu_ps(dst, x.value); +template <> +inline void store(float* dst, float8 x) { + _mm256_storeu_ps(dst, x.value); } -template <> inline float8 broadcast(const float* x) { - return float8(_mm256_broadcast_ss(x)); +template <> +inline float8 broadcast(const float* x) { + return float8(_mm256_broadcast_ss(x)); } // --- Arithmetic --- -inline float8 operator+(float8 a, float8 b) { return float8(_mm256_add_ps(a, b)); } -inline float8 operator-(float8 a, float8 b) { return float8(_mm256_sub_ps(a, b)); } -inline float8 operator*(float8 a, float8 b) { return float8(_mm256_mul_ps(a, b)); } -inline float8 operator/(float8 a, float8 b) { return float8(_mm256_div_ps(a, b)); } +inline float8 operator+(float8 a, float8 b) { + return float8(_mm256_add_ps(a, b)); +} +inline float8 operator-(float8 a, float8 b) { + return float8(_mm256_sub_ps(a, b)); +} +inline float8 operator*(float8 a, float8 b) { + return float8(_mm256_mul_ps(a, b)); +} +inline float8 operator/(float8 a, float8 b) { + return float8(_mm256_div_ps(a, b)); +} // --- FMA --- -template <> inline float8 fma(float8 a, float8 b, float8 c) { +template <> +inline float8 fma(float8 a, float8 b, float8 c) { #ifdef __AVX2__ - return float8(_mm256_fmadd_ps(a, b, c)); + return float8(_mm256_fmadd_ps(a, b, c)); #else - return float8(_mm256_add_ps(_mm256_mul_ps(a, b), c)); + return float8(_mm256_add_ps(_mm256_mul_ps(a, b), c)); #endif } // --- Horizontal Sum --- inline float sum(float8 x) { - __m256 val = x.value; - __m128 vlow = _mm256_castps256_ps128(val); - __m128 vhigh = _mm256_extractf128_ps(val, 1); // high 128 - vlow = _mm_add_ps(vlow, vhigh); // add the low 128 - __m128 shuf = _mm_movehdup_ps(vlow); // broadcast elements 3,1 to 2,0 - __m128 sums = _mm_add_ps(vlow, shuf); - shuf = _mm_movehl_ps(shuf, sums); // high half -> low half - sums = _mm_add_ss(sums, shuf); - return _mm_cvtss_f32(sums); + __m256 val = x.value; + __m128 vlow = _mm256_castps256_ps128(val); + __m128 vhigh = _mm256_extractf128_ps(val, 1); // high 128 + vlow = _mm_add_ps(vlow, vhigh); // add the low 128 + __m128 shuf = _mm_movehdup_ps(vlow); // broadcast elements 3,1 to 2,0 + __m128 sums = _mm_add_ps(vlow, shuf); + shuf = _mm_movehl_ps(shuf, sums); // high half -> low half + sums = _mm_add_ss(sums, shuf); + return _mm_cvtss_f32(sums); } // 8x8 block transpose with fp16/bf16 → fp32 conversion. // Loads 8 rows of 8 half-precision values, converts and transposes to fp32. template -inline void transpose_8x8_block(const T* src, float* dst, int src_stride, int dst_stride) { - static_assert(std::is_same_v || std::is_same_v, - "transpose_8x8_block requires float16_t or bfloat16_t input"); - - if constexpr (std::is_same_v) { +inline void +transpose_8x8_block(const T* src, float* dst, int src_stride, int dst_stride) { + static_assert( + std::is_same_v || std::is_same_v, + "transpose_8x8_block requires float16_t or bfloat16_t input"); + + if constexpr (std::is_same_v) { #ifdef __F16C__ - // Load 8 rows of 8 float16 values, convert to fp32 - __m128i row0 = _mm_loadu_si128(reinterpret_cast(src)); - __m128i row1 = _mm_loadu_si128(reinterpret_cast(src + src_stride)); - __m128i row2 = _mm_loadu_si128(reinterpret_cast(src + 2 * src_stride)); - __m128i row3 = _mm_loadu_si128(reinterpret_cast(src + 3 * src_stride)); - __m128i row4 = _mm_loadu_si128(reinterpret_cast(src + 4 * src_stride)); - __m128i row5 = _mm_loadu_si128(reinterpret_cast(src + 5 * src_stride)); - __m128i row6 = _mm_loadu_si128(reinterpret_cast(src + 6 * src_stride)); - __m128i row7 = _mm_loadu_si128(reinterpret_cast(src + 7 * src_stride)); - - // Convert to fp32 (vcvtph2ps: 1/cycle throughput, 3 cycle latency) - __m256 frow0 = _mm256_cvtph_ps(row0); - __m256 frow1 = _mm256_cvtph_ps(row1); - __m256 frow2 = _mm256_cvtph_ps(row2); - __m256 frow3 = _mm256_cvtph_ps(row3); - __m256 frow4 = _mm256_cvtph_ps(row4); - __m256 frow5 = _mm256_cvtph_ps(row5); - __m256 frow6 = _mm256_cvtph_ps(row6); - __m256 frow7 = _mm256_cvtph_ps(row7); - - // Transpose via unpack / shuffle / permute - __m256 t0 = _mm256_unpacklo_ps(frow0, frow1); - __m256 t1 = _mm256_unpackhi_ps(frow0, frow1); - __m256 t2 = _mm256_unpacklo_ps(frow2, frow3); - __m256 t3 = _mm256_unpackhi_ps(frow2, frow3); - __m256 t4 = _mm256_unpacklo_ps(frow4, frow5); - __m256 t5 = _mm256_unpackhi_ps(frow4, frow5); - __m256 t6 = _mm256_unpacklo_ps(frow6, frow7); - __m256 t7 = _mm256_unpackhi_ps(frow6, frow7); - - __m256 tt0 = _mm256_shuffle_ps(t0, t2, 0x44); - __m256 tt1 = _mm256_shuffle_ps(t0, t2, 0xEE); - __m256 tt2 = _mm256_shuffle_ps(t1, t3, 0x44); - __m256 tt3 = _mm256_shuffle_ps(t1, t3, 0xEE); - __m256 tt4 = _mm256_shuffle_ps(t4, t6, 0x44); - __m256 tt5 = _mm256_shuffle_ps(t4, t6, 0xEE); - __m256 tt6 = _mm256_shuffle_ps(t5, t7, 0x44); - __m256 tt7 = _mm256_shuffle_ps(t5, t7, 0xEE); - - __m256 r0 = _mm256_permute2f128_ps(tt0, tt4, 0x20); - __m256 r1 = _mm256_permute2f128_ps(tt1, tt5, 0x20); - __m256 r2 = _mm256_permute2f128_ps(tt2, tt6, 0x20); - __m256 r3 = _mm256_permute2f128_ps(tt3, tt7, 0x20); - __m256 r4 = _mm256_permute2f128_ps(tt0, tt4, 0x31); - __m256 r5 = _mm256_permute2f128_ps(tt1, tt5, 0x31); - __m256 r6 = _mm256_permute2f128_ps(tt2, tt6, 0x31); - __m256 r7 = _mm256_permute2f128_ps(tt3, tt7, 0x31); - - _mm256_storeu_ps(dst + 0*dst_stride, r0); - _mm256_storeu_ps(dst + 1*dst_stride, r1); - _mm256_storeu_ps(dst + 2*dst_stride, r2); - _mm256_storeu_ps(dst + 3*dst_stride, r3); - _mm256_storeu_ps(dst + 4*dst_stride, r4); - _mm256_storeu_ps(dst + 5*dst_stride, r5); - _mm256_storeu_ps(dst + 6*dst_stride, r6); - _mm256_storeu_ps(dst + 7*dst_stride, r7); + // Load 8 rows of 8 float16 values, convert to fp32 + __m128i row0 = _mm_loadu_si128(reinterpret_cast(src)); + __m128i row1 = + _mm_loadu_si128(reinterpret_cast(src + src_stride)); + __m128i row2 = + _mm_loadu_si128(reinterpret_cast(src + 2 * src_stride)); + __m128i row3 = + _mm_loadu_si128(reinterpret_cast(src + 3 * src_stride)); + __m128i row4 = + _mm_loadu_si128(reinterpret_cast(src + 4 * src_stride)); + __m128i row5 = + _mm_loadu_si128(reinterpret_cast(src + 5 * src_stride)); + __m128i row6 = + _mm_loadu_si128(reinterpret_cast(src + 6 * src_stride)); + __m128i row7 = + _mm_loadu_si128(reinterpret_cast(src + 7 * src_stride)); + + // Convert to fp32 (vcvtph2ps: 1/cycle throughput, 3 cycle latency) + __m256 frow0 = _mm256_cvtph_ps(row0); + __m256 frow1 = _mm256_cvtph_ps(row1); + __m256 frow2 = _mm256_cvtph_ps(row2); + __m256 frow3 = _mm256_cvtph_ps(row3); + __m256 frow4 = _mm256_cvtph_ps(row4); + __m256 frow5 = _mm256_cvtph_ps(row5); + __m256 frow6 = _mm256_cvtph_ps(row6); + __m256 frow7 = _mm256_cvtph_ps(row7); + + // Transpose via unpack / shuffle / permute + __m256 t0 = _mm256_unpacklo_ps(frow0, frow1); + __m256 t1 = _mm256_unpackhi_ps(frow0, frow1); + __m256 t2 = _mm256_unpacklo_ps(frow2, frow3); + __m256 t3 = _mm256_unpackhi_ps(frow2, frow3); + __m256 t4 = _mm256_unpacklo_ps(frow4, frow5); + __m256 t5 = _mm256_unpackhi_ps(frow4, frow5); + __m256 t6 = _mm256_unpacklo_ps(frow6, frow7); + __m256 t7 = _mm256_unpackhi_ps(frow6, frow7); + + __m256 tt0 = _mm256_shuffle_ps(t0, t2, 0x44); + __m256 tt1 = _mm256_shuffle_ps(t0, t2, 0xEE); + __m256 tt2 = _mm256_shuffle_ps(t1, t3, 0x44); + __m256 tt3 = _mm256_shuffle_ps(t1, t3, 0xEE); + __m256 tt4 = _mm256_shuffle_ps(t4, t6, 0x44); + __m256 tt5 = _mm256_shuffle_ps(t4, t6, 0xEE); + __m256 tt6 = _mm256_shuffle_ps(t5, t7, 0x44); + __m256 tt7 = _mm256_shuffle_ps(t5, t7, 0xEE); + + __m256 r0 = _mm256_permute2f128_ps(tt0, tt4, 0x20); + __m256 r1 = _mm256_permute2f128_ps(tt1, tt5, 0x20); + __m256 r2 = _mm256_permute2f128_ps(tt2, tt6, 0x20); + __m256 r3 = _mm256_permute2f128_ps(tt3, tt7, 0x20); + __m256 r4 = _mm256_permute2f128_ps(tt0, tt4, 0x31); + __m256 r5 = _mm256_permute2f128_ps(tt1, tt5, 0x31); + __m256 r6 = _mm256_permute2f128_ps(tt2, tt6, 0x31); + __m256 r7 = _mm256_permute2f128_ps(tt3, tt7, 0x31); + + _mm256_storeu_ps(dst + 0 * dst_stride, r0); + _mm256_storeu_ps(dst + 1 * dst_stride, r1); + _mm256_storeu_ps(dst + 2 * dst_stride, r2); + _mm256_storeu_ps(dst + 3 * dst_stride, r3); + _mm256_storeu_ps(dst + 4 * dst_stride, r4); + _mm256_storeu_ps(dst + 5 * dst_stride, r5); + _mm256_storeu_ps(dst + 6 * dst_stride, r6); + _mm256_storeu_ps(dst + 7 * dst_stride, r7); #else - // Fallback without F16C - for (int i = 0; i < 8; i++) { - for (int j = 0; j < 8; j++) { - dst[j * dst_stride + i] = static_cast(src[i * src_stride + j]); - } - } + // Fallback without F16C + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + dst[j * dst_stride + i] = static_cast(src[i * src_stride + j]); + } + } #endif - } else { // bfloat16_t + } else { // bfloat16_t #ifdef __AVX2__ - // bf16 → fp32: zero-extend to 32-bit, shift left 16 - __m256 rows[8]; - for (int i = 0; i < 8; i++) { - __m128i bf16_vals_u16 = _mm_loadu_si128( - reinterpret_cast(src + i * src_stride)); - __m256i bf16_vals_u32 = _mm256_cvtepu16_epi32(bf16_vals_u16); - __m256i fp32_bits = _mm256_slli_epi32(bf16_vals_u32, 16); - rows[i] = _mm256_castsi256_ps(fp32_bits); - } - - // Transpose the 8 rows using AVX shuffles - __m256 t0 = _mm256_unpacklo_ps(rows[0], rows[1]); - __m256 t1 = _mm256_unpackhi_ps(rows[0], rows[1]); - __m256 t2 = _mm256_unpacklo_ps(rows[2], rows[3]); - __m256 t3 = _mm256_unpackhi_ps(rows[2], rows[3]); - __m256 t4 = _mm256_unpacklo_ps(rows[4], rows[5]); - __m256 t5 = _mm256_unpackhi_ps(rows[4], rows[5]); - __m256 t6 = _mm256_unpacklo_ps(rows[6], rows[7]); - __m256 t7 = _mm256_unpackhi_ps(rows[6], rows[7]); - - __m256 tt0 = _mm256_shuffle_ps(t0, t2, 0x44); - __m256 tt1 = _mm256_shuffle_ps(t0, t2, 0xEE); - __m256 tt2 = _mm256_shuffle_ps(t1, t3, 0x44); - __m256 tt3 = _mm256_shuffle_ps(t1, t3, 0xEE); - __m256 tt4 = _mm256_shuffle_ps(t4, t6, 0x44); - __m256 tt5 = _mm256_shuffle_ps(t4, t6, 0xEE); - __m256 tt6 = _mm256_shuffle_ps(t5, t7, 0x44); - __m256 tt7 = _mm256_shuffle_ps(t5, t7, 0xEE); - - __m256 r0 = _mm256_permute2f128_ps(tt0, tt4, 0x20); - __m256 r1 = _mm256_permute2f128_ps(tt1, tt5, 0x20); - __m256 r2 = _mm256_permute2f128_ps(tt2, tt6, 0x20); - __m256 r3 = _mm256_permute2f128_ps(tt3, tt7, 0x20); - __m256 r4 = _mm256_permute2f128_ps(tt0, tt4, 0x31); - __m256 r5 = _mm256_permute2f128_ps(tt1, tt5, 0x31); - __m256 r6 = _mm256_permute2f128_ps(tt2, tt6, 0x31); - __m256 r7 = _mm256_permute2f128_ps(tt3, tt7, 0x31); - - _mm256_storeu_ps(dst + 0*dst_stride, r0); - _mm256_storeu_ps(dst + 1*dst_stride, r1); - _mm256_storeu_ps(dst + 2*dst_stride, r2); - _mm256_storeu_ps(dst + 3*dst_stride, r3); - _mm256_storeu_ps(dst + 4*dst_stride, r4); - _mm256_storeu_ps(dst + 5*dst_stride, r5); - _mm256_storeu_ps(dst + 6*dst_stride, r6); - _mm256_storeu_ps(dst + 7*dst_stride, r7); + // bf16 → fp32: zero-extend to 32-bit, shift left 16 + __m256 rows[8]; + for (int i = 0; i < 8; i++) { + __m128i bf16_vals_u16 = _mm_loadu_si128( + reinterpret_cast(src + i * src_stride)); + __m256i bf16_vals_u32 = _mm256_cvtepu16_epi32(bf16_vals_u16); + __m256i fp32_bits = _mm256_slli_epi32(bf16_vals_u32, 16); + rows[i] = _mm256_castsi256_ps(fp32_bits); + } + + // Transpose the 8 rows using AVX shuffles + __m256 t0 = _mm256_unpacklo_ps(rows[0], rows[1]); + __m256 t1 = _mm256_unpackhi_ps(rows[0], rows[1]); + __m256 t2 = _mm256_unpacklo_ps(rows[2], rows[3]); + __m256 t3 = _mm256_unpackhi_ps(rows[2], rows[3]); + __m256 t4 = _mm256_unpacklo_ps(rows[4], rows[5]); + __m256 t5 = _mm256_unpackhi_ps(rows[4], rows[5]); + __m256 t6 = _mm256_unpacklo_ps(rows[6], rows[7]); + __m256 t7 = _mm256_unpackhi_ps(rows[6], rows[7]); + + __m256 tt0 = _mm256_shuffle_ps(t0, t2, 0x44); + __m256 tt1 = _mm256_shuffle_ps(t0, t2, 0xEE); + __m256 tt2 = _mm256_shuffle_ps(t1, t3, 0x44); + __m256 tt3 = _mm256_shuffle_ps(t1, t3, 0xEE); + __m256 tt4 = _mm256_shuffle_ps(t4, t6, 0x44); + __m256 tt5 = _mm256_shuffle_ps(t4, t6, 0xEE); + __m256 tt6 = _mm256_shuffle_ps(t5, t7, 0x44); + __m256 tt7 = _mm256_shuffle_ps(t5, t7, 0xEE); + + __m256 r0 = _mm256_permute2f128_ps(tt0, tt4, 0x20); + __m256 r1 = _mm256_permute2f128_ps(tt1, tt5, 0x20); + __m256 r2 = _mm256_permute2f128_ps(tt2, tt6, 0x20); + __m256 r3 = _mm256_permute2f128_ps(tt3, tt7, 0x20); + __m256 r4 = _mm256_permute2f128_ps(tt0, tt4, 0x31); + __m256 r5 = _mm256_permute2f128_ps(tt1, tt5, 0x31); + __m256 r6 = _mm256_permute2f128_ps(tt2, tt6, 0x31); + __m256 r7 = _mm256_permute2f128_ps(tt3, tt7, 0x31); + + _mm256_storeu_ps(dst + 0 * dst_stride, r0); + _mm256_storeu_ps(dst + 1 * dst_stride, r1); + _mm256_storeu_ps(dst + 2 * dst_stride, r2); + _mm256_storeu_ps(dst + 3 * dst_stride, r3); + _mm256_storeu_ps(dst + 4 * dst_stride, r4); + _mm256_storeu_ps(dst + 5 * dst_stride, r5); + _mm256_storeu_ps(dst + 6 * dst_stride, r6); + _mm256_storeu_ps(dst + 7 * dst_stride, r7); #else - // Scalar fallback - for (int i = 0; i < 8; i++) { - for (int j = 0; j < 8; j++) { - dst[j * dst_stride + i] = static_cast(src[i * src_stride + j]); - } - } -#endif + // Scalar fallback + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + dst[j * dst_stride + i] = static_cast(src[i * src_stride + j]); + } } +#endif + } } // ========================================================================== @@ -213,94 +240,102 @@ inline void transpose_8x8_block(const T* src, float* dst, int src_stride, int ds // Load 8 half-precision values, convert to float8. template inline float8 load_convert_to_float(const T* src) { - static_assert(std::is_same_v || std::is_same_v, - "load_convert_to_float requires float16_t or bfloat16_t input for this specialization."); - static_assert(sizeof(T) == 2, "Input type T must be 2 bytes."); + static_assert( + std::is_same_v || std::is_same_v, + "load_convert_to_float requires float16_t or bfloat16_t input for this specialization."); + static_assert(sizeof(T) == 2, "Input type T must be 2 bytes."); - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { #ifdef __F16C__ - __m128i f16_vals = _mm_loadu_si128(reinterpret_cast(src)); - return float8(_mm256_cvtph_ps(f16_vals)); + __m128i f16_vals = _mm_loadu_si128(reinterpret_cast(src)); + return float8(_mm256_cvtph_ps(f16_vals)); #else - float buffer[8]; - for (int i = 0; i < 8; ++i) buffer[i] = static_cast(src[i]); - return load(buffer); + float buffer[8]; + for (int i = 0; i < 8; ++i) + buffer[i] = static_cast(src[i]); + return load(buffer); #endif - } else { // bfloat16_t + } else { // bfloat16_t #ifdef __AVX2__ - // bf16 → fp32: zero-extend to 32-bit then shift left 16 - __m128i bf16_vals_u16 = _mm_loadu_si128(reinterpret_cast(src)); - __m256i bf16_vals_u32 = _mm256_cvtepu16_epi32(bf16_vals_u16); - __m256i fp32_bits = _mm256_slli_epi32(bf16_vals_u32, 16); - return float8(_mm256_castsi256_ps(fp32_bits)); + // bf16 → fp32: zero-extend to 32-bit then shift left 16 + __m128i bf16_vals_u16 = + _mm_loadu_si128(reinterpret_cast(src)); + __m256i bf16_vals_u32 = _mm256_cvtepu16_epi32(bf16_vals_u16); + __m256i fp32_bits = _mm256_slli_epi32(bf16_vals_u32, 16); + return float8(_mm256_castsi256_ps(fp32_bits)); #else - // Scalar fallback - float buffer[8]; - for (int i = 0; i < 8; ++i) { - uint32_t val_int = static_cast(reinterpret_cast(src)[i]) << 16; - std::memcpy(&buffer[i], &val_int, sizeof(float)); - } - return load(buffer); -#endif + // Scalar fallback + float buffer[8]; + for (int i = 0; i < 8; ++i) { + uint32_t val_int = + static_cast(reinterpret_cast(src)[i]) + << 16; + std::memcpy(&buffer[i], &val_int, sizeof(float)); } + return load(buffer); +#endif + } } // fp32 → bf16 with round-to-nearest-even. #ifdef __AVX2__ inline __m128i convert_float_to_bfloat16_rne_avx2(__m256 src) { - __m256i val_int = _mm256_castps_si256(src); - __m256i bias = _mm256_set1_epi32(0x7FFF); - __m256i rounded_val = _mm256_add_epi32(val_int, bias); - __m256i bf16_bits_32 = _mm256_srli_epi32(rounded_val, 16); - __m128i bf16_bits_low = _mm256_castsi256_si128(bf16_bits_32); - __m128i bf16_bits_high = _mm256_extracti128_si256(bf16_bits_32, 1); - // Use signed pack to preserve negative values - return _mm_packs_epi32(bf16_bits_low, bf16_bits_high); + __m256i val_int = _mm256_castps_si256(src); + __m256i bias = _mm256_set1_epi32(0x7FFF); + __m256i rounded_val = _mm256_add_epi32(val_int, bias); + __m256i bf16_bits_32 = _mm256_srli_epi32(rounded_val, 16); + __m128i bf16_bits_low = _mm256_castsi256_si128(bf16_bits_32); + __m128i bf16_bits_high = _mm256_extracti128_si256(bf16_bits_32, 1); + // Use signed pack to preserve negative values + return _mm_packs_epi32(bf16_bits_low, bf16_bits_high); } #endif // Store float8, converting back to 8 half-precision values. template inline void store_convert_from_float(T* dst, float8 src) { - static_assert(std::is_same_v || std::is_same_v, - "store_convert_from_float requires float16_t or bfloat16_t output for this specialization."); - static_assert(sizeof(T) == 2, "Output type T must be 2 bytes."); + static_assert( + std::is_same_v || std::is_same_v, + "store_convert_from_float requires float16_t or bfloat16_t output for this specialization."); + static_assert(sizeof(T) == 2, "Output type T must be 2 bytes."); - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { #ifdef __F16C__ - __m128i f16_result = _mm256_cvtps_ph( - src.value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), f16_result); + __m128i f16_result = _mm256_cvtps_ph( + src.value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), f16_result); #else - float buffer[8]; - store(buffer, src); - for(int i=0; i<8; ++i) dst[i] = static_cast(buffer[i]); + float buffer[8]; + store(buffer, src); + for (int i = 0; i < 8; ++i) + dst[i] = static_cast(buffer[i]); #endif - } else { // bfloat16_t + } else { // bfloat16_t #ifdef __AVX2__ - __m128i bf16_result = convert_float_to_bfloat16_rne_avx2(src.value); - _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), bf16_result); + __m128i bf16_result = convert_float_to_bfloat16_rne_avx2(src.value); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), bf16_result); #else - // Scalar fallback with RNE - float buffer[8]; - store(buffer, src); - alignas(16) uint16_t bf16_bits_arr[8]; - for (int i = 0; i < 8; ++i) { - uint32_t val_int; - std::memcpy(&val_int, &buffer[i], sizeof(float)); - - // Handle NaN - if ((val_int & 0x7F800000) == 0x7F800000 && (val_int & 0x007FFFFF) != 0) { - bf16_bits_arr[i] = 0x7FC0 | static_cast((val_int >> 16) & 0x003F); - } else { - uint32_t rounding_bias = ((val_int >> 16) & 1) + 0x7FFF; - val_int += rounding_bias; - bf16_bits_arr[i] = static_cast(val_int >> 16); - } - } - std::memcpy(dst, bf16_bits_arr, 8 * sizeof(uint16_t)); -#endif + // Scalar fallback with RNE + float buffer[8]; + store(buffer, src); + alignas(16) uint16_t bf16_bits_arr[8]; + for (int i = 0; i < 8; ++i) { + uint32_t val_int; + std::memcpy(&val_int, &buffer[i], sizeof(float)); + + // Handle NaN + if ((val_int & 0x7F800000) == 0x7F800000 && (val_int & 0x007FFFFF) != 0) { + bf16_bits_arr[i] = + 0x7FC0 | static_cast((val_int >> 16) & 0x003F); + } else { + uint32_t rounding_bias = ((val_int >> 16) & 1) + 0x7FFF; + val_int += rounding_bias; + bf16_bits_arr[i] = static_cast(val_int >> 16); + } } + std::memcpy(dst, bf16_bits_arr, 8 * sizeof(uint16_t)); +#endif + } } // 6×16 AVX2 microkernel: C[6][16] += A[6][kc] * B[kc][16] @@ -313,81 +348,85 @@ inline void micro_kernel_6x16( int ldc, int kc, int a_stride, - int b_stride) -{ - static_assert(MR == 6, "This kernel requires MR=6"); - static_assert(NR == 16, "This kernel requires NR=16"); - - // 12 accumulators + 2 B loads + 1 A broadcast = 15 YMM registers - __m256 c00 = _mm256_loadu_ps(C_block + 0*ldc); - __m256 c01 = _mm256_loadu_ps(C_block + 0*ldc + 8); - __m256 c10 = _mm256_loadu_ps(C_block + 1*ldc); - __m256 c11 = _mm256_loadu_ps(C_block + 1*ldc + 8); - __m256 c20 = _mm256_loadu_ps(C_block + 2*ldc); - __m256 c21 = _mm256_loadu_ps(C_block + 2*ldc + 8); - __m256 c30 = _mm256_loadu_ps(C_block + 3*ldc); - __m256 c31 = _mm256_loadu_ps(C_block + 3*ldc + 8); - __m256 c40 = _mm256_loadu_ps(C_block + 4*ldc); - __m256 c41 = _mm256_loadu_ps(C_block + 4*ldc + 8); - __m256 c50 = _mm256_loadu_ps(C_block + 5*ldc); - __m256 c51 = _mm256_loadu_ps(C_block + 5*ldc + 8); - - // Prefetch B and A data 8 iterations ahead into L1 - constexpr int PF_DIST = 8; - - for (int k = 0; k < kc; ++k) { - const float* b_ptr = B_panel + k * b_stride; - const float* a_ptr = A_panel + k * a_stride; - - // Prefetch next B and A rows into L1 - if (k + PF_DIST < kc) { - _mm_prefetch(reinterpret_cast(B_panel + (k + PF_DIST) * b_stride), _MM_HINT_T0); - _mm_prefetch(reinterpret_cast(B_panel + (k + PF_DIST) * b_stride + 8), _MM_HINT_T0); - _mm_prefetch(reinterpret_cast(A_panel + (k + PF_DIST) * a_stride), _MM_HINT_T0); - } - - __m256 b0 = _mm256_loadu_ps(b_ptr); - __m256 b1 = _mm256_loadu_ps(b_ptr + 8); - - __m256 a; - a = _mm256_broadcast_ss(a_ptr + 0); - c00 = _mm256_fmadd_ps(a, b0, c00); - c01 = _mm256_fmadd_ps(a, b1, c01); - - a = _mm256_broadcast_ss(a_ptr + 1); - c10 = _mm256_fmadd_ps(a, b0, c10); - c11 = _mm256_fmadd_ps(a, b1, c11); - - a = _mm256_broadcast_ss(a_ptr + 2); - c20 = _mm256_fmadd_ps(a, b0, c20); - c21 = _mm256_fmadd_ps(a, b1, c21); - - a = _mm256_broadcast_ss(a_ptr + 3); - c30 = _mm256_fmadd_ps(a, b0, c30); - c31 = _mm256_fmadd_ps(a, b1, c31); - - a = _mm256_broadcast_ss(a_ptr + 4); - c40 = _mm256_fmadd_ps(a, b0, c40); - c41 = _mm256_fmadd_ps(a, b1, c41); - - a = _mm256_broadcast_ss(a_ptr + 5); - c50 = _mm256_fmadd_ps(a, b0, c50); - c51 = _mm256_fmadd_ps(a, b1, c51); + int b_stride) { + static_assert(MR == 6, "This kernel requires MR=6"); + static_assert(NR == 16, "This kernel requires NR=16"); + + // 12 accumulators + 2 B loads + 1 A broadcast = 15 YMM registers + __m256 c00 = _mm256_loadu_ps(C_block + 0 * ldc); + __m256 c01 = _mm256_loadu_ps(C_block + 0 * ldc + 8); + __m256 c10 = _mm256_loadu_ps(C_block + 1 * ldc); + __m256 c11 = _mm256_loadu_ps(C_block + 1 * ldc + 8); + __m256 c20 = _mm256_loadu_ps(C_block + 2 * ldc); + __m256 c21 = _mm256_loadu_ps(C_block + 2 * ldc + 8); + __m256 c30 = _mm256_loadu_ps(C_block + 3 * ldc); + __m256 c31 = _mm256_loadu_ps(C_block + 3 * ldc + 8); + __m256 c40 = _mm256_loadu_ps(C_block + 4 * ldc); + __m256 c41 = _mm256_loadu_ps(C_block + 4 * ldc + 8); + __m256 c50 = _mm256_loadu_ps(C_block + 5 * ldc); + __m256 c51 = _mm256_loadu_ps(C_block + 5 * ldc + 8); + + // Prefetch B and A data 8 iterations ahead into L1 + constexpr int PF_DIST = 8; + + for (int k = 0; k < kc; ++k) { + const float* b_ptr = B_panel + k * b_stride; + const float* a_ptr = A_panel + k * a_stride; + + // Prefetch next B and A rows into L1 + if (k + PF_DIST < kc) { + _mm_prefetch( + reinterpret_cast(B_panel + (k + PF_DIST) * b_stride), + _MM_HINT_T0); + _mm_prefetch( + reinterpret_cast(B_panel + (k + PF_DIST) * b_stride + 8), + _MM_HINT_T0); + _mm_prefetch( + reinterpret_cast(A_panel + (k + PF_DIST) * a_stride), + _MM_HINT_T0); } - _mm256_storeu_ps(C_block + 0*ldc, c00); - _mm256_storeu_ps(C_block + 0*ldc + 8, c01); - _mm256_storeu_ps(C_block + 1*ldc, c10); - _mm256_storeu_ps(C_block + 1*ldc + 8, c11); - _mm256_storeu_ps(C_block + 2*ldc, c20); - _mm256_storeu_ps(C_block + 2*ldc + 8, c21); - _mm256_storeu_ps(C_block + 3*ldc, c30); - _mm256_storeu_ps(C_block + 3*ldc + 8, c31); - _mm256_storeu_ps(C_block + 4*ldc, c40); - _mm256_storeu_ps(C_block + 4*ldc + 8, c41); - _mm256_storeu_ps(C_block + 5*ldc, c50); - _mm256_storeu_ps(C_block + 5*ldc + 8, c51); + __m256 b0 = _mm256_loadu_ps(b_ptr); + __m256 b1 = _mm256_loadu_ps(b_ptr + 8); + + __m256 a; + a = _mm256_broadcast_ss(a_ptr + 0); + c00 = _mm256_fmadd_ps(a, b0, c00); + c01 = _mm256_fmadd_ps(a, b1, c01); + + a = _mm256_broadcast_ss(a_ptr + 1); + c10 = _mm256_fmadd_ps(a, b0, c10); + c11 = _mm256_fmadd_ps(a, b1, c11); + + a = _mm256_broadcast_ss(a_ptr + 2); + c20 = _mm256_fmadd_ps(a, b0, c20); + c21 = _mm256_fmadd_ps(a, b1, c21); + + a = _mm256_broadcast_ss(a_ptr + 3); + c30 = _mm256_fmadd_ps(a, b0, c30); + c31 = _mm256_fmadd_ps(a, b1, c31); + + a = _mm256_broadcast_ss(a_ptr + 4); + c40 = _mm256_fmadd_ps(a, b0, c40); + c41 = _mm256_fmadd_ps(a, b1, c41); + + a = _mm256_broadcast_ss(a_ptr + 5); + c50 = _mm256_fmadd_ps(a, b0, c50); + c51 = _mm256_fmadd_ps(a, b1, c51); + } + + _mm256_storeu_ps(C_block + 0 * ldc, c00); + _mm256_storeu_ps(C_block + 0 * ldc + 8, c01); + _mm256_storeu_ps(C_block + 1 * ldc, c10); + _mm256_storeu_ps(C_block + 1 * ldc + 8, c11); + _mm256_storeu_ps(C_block + 2 * ldc, c20); + _mm256_storeu_ps(C_block + 2 * ldc + 8, c21); + _mm256_storeu_ps(C_block + 3 * ldc, c30); + _mm256_storeu_ps(C_block + 3 * ldc + 8, c31); + _mm256_storeu_ps(C_block + 4 * ldc, c40); + _mm256_storeu_ps(C_block + 4 * ldc + 8, c41); + _mm256_storeu_ps(C_block + 5 * ldc, c50); + _mm256_storeu_ps(C_block + 5 * ldc + 8, c51); } - } // namespace mlx::core::simd \ No newline at end of file diff --git a/mlx/event.h b/mlx/event.h index 1ed8db234c..66a6a75df5 100644 --- a/mlx/event.h +++ b/mlx/event.h @@ -4,7 +4,6 @@ #include #include #include -#include #include "mlx/stream.h" From f128c41cd998a2ca15aa737f0ceaeafa6f0b6676 Mon Sep 17 00:00:00 2001 From: Andrew Sweet Date: Wed, 13 May 2026 22:57:37 +0000 Subject: [PATCH 5/6] rename gemm and gemv files and namespace --- .../avx_simd.h => gemms/avx_gemm_simd.h} | 12 ++++- mlx/backend/cpu/gemms/avx_simd_gemm.h | 24 ++++----- mlx/backend/cpu/gemms/avx_simd_gemv.h | 50 +++++++++---------- 3 files changed, 47 insertions(+), 39 deletions(-) rename mlx/backend/cpu/{simd/avx_simd.h => gemms/avx_gemm_simd.h} (96%) diff --git a/mlx/backend/cpu/simd/avx_simd.h b/mlx/backend/cpu/gemms/avx_gemm_simd.h similarity index 96% rename from mlx/backend/cpu/simd/avx_simd.h rename to mlx/backend/cpu/gemms/avx_gemm_simd.h index 58ef4c7c02..8d125d310e 100644 --- a/mlx/backend/cpu/simd/avx_simd.h +++ b/mlx/backend/cpu/gemms/avx_gemm_simd.h @@ -8,7 +8,15 @@ #include "mlx/backend/cpu/simd/base_simd.h" -namespace mlx::core::simd { +// GEMM-private SIMD helpers for the AVX2 fp16/bf16 matmul path. +// +// This header intentionally lives under backend/cpu/gemms/ and uses the +// mlx::core::detail namespace rather than mlx::core::simd. The helpers here +// are scaffolding for the GEMM/GEMV kernels in this directory; they are not +// a general SIMD abstraction layer and should not be depended on from +// outside the gemms/ tree. A future, broader AVX2 SIMD layer can land under +// backend/cpu/simd/ in mlx::core::simd without colliding with these symbols. +namespace mlx::core::detail { // Forward declarations template @@ -429,4 +437,4 @@ inline void micro_kernel_6x16( _mm256_storeu_ps(C_block + 5 * ldc + 8, c51); } -} // namespace mlx::core::simd \ No newline at end of file +} // namespace mlx::core::detail diff --git a/mlx/backend/cpu/gemms/avx_simd_gemm.h b/mlx/backend/cpu/gemms/avx_simd_gemm.h index 8ba143246f..23c2a8f9cd 100644 --- a/mlx/backend/cpu/gemms/avx_simd_gemm.h +++ b/mlx/backend/cpu/gemms/avx_simd_gemm.h @@ -9,15 +9,15 @@ #include #include "mlx/backend/cpu/gemms/aligned_buffer.h" +#include "mlx/backend/cpu/gemms/avx_gemm_simd.h" #include "mlx/backend/cpu/gemms/avx_simd_gemv.h" -#include "mlx/backend/cpu/simd/avx_simd.h" namespace mlx::core { template inline void pack_transpose_8x8(const T* src, float* dst, int src_stride, int dst_stride) { - simd::transpose_8x8_block(src, dst, src_stride, dst_stride); + detail::transpose_8x8_block(src, dst, src_stride, dst_stride); } // Pack A block (m_block x k_block) into A_packed (MC x KC float, column-major). @@ -85,8 +85,8 @@ static void pack_A_block( float* a_dst_col_ptr = A_packed + k * MC; int i = 0; for (; i + simd_width <= m_block; i += simd_width) { - simd::float8 a_vec = simd::load_convert_to_float(a_src_row_ptr + i); - simd::store(a_dst_col_ptr + i, a_vec); + detail::float8 a_vec = detail::load_convert_to_float(a_src_row_ptr + i); + detail::store(a_dst_col_ptr + i, a_vec); } for (; i < m_block; ++i) { a_dst_col_ptr[i] = static_cast(a_src_row_ptr[i]); @@ -126,8 +126,8 @@ static void pack_B_block( float* b_dst_row_ptr = B_packed + k * NC; int j = 0; for (; j + simd_width <= n_block; j += simd_width) { - simd::float8 b_vec = simd::load_convert_to_float(b_src_row_ptr + j); - simd::store(b_dst_row_ptr + j, b_vec); + detail::float8 b_vec = detail::load_convert_to_float(b_src_row_ptr + j); + detail::store(b_dst_row_ptr + j, b_vec); } for (; j < n_block; ++j) { b_dst_row_ptr[j] = static_cast(b_src_row_ptr[j]); @@ -297,7 +297,7 @@ void simd_gemm_optimized_higher_precision( } if (m_micro == MR && n_micro == NR) { - simd::micro_kernel_6x16( + detail::micro_kernel_6x16( a_ptr, b_ptr, c_ptr, NC_BLOCK, kc, MC_BLOCK, NC_BLOCK); } else { compute_block_scalar_partial( @@ -318,22 +318,22 @@ void simd_gemm_optimized_higher_precision( if (last_k) { bool apply_alpha = (alpha != 1.0f); bool apply_beta = (beta != 0.0f); - simd::float8 alpha_vec(alpha); - simd::float8 beta_vec(beta); + detail::float8 alpha_vec(alpha); + detail::float8 beta_vec(beta); for (int i = 0; i < mc; ++i) { T* c_row = c + (ic + i) * ldC + jc; float* acc_row = C_acc + (ic + i) * NC_BLOCK; int j = 0; for (; j + sw <= nc; j += sw) { - simd::float8 acc = simd::load(acc_row + j); + detail::float8 acc = detail::load(acc_row + j); if (apply_alpha) acc = alpha_vec * acc; if (apply_beta) { - simd::float8 cv = simd::load_convert_to_float(c_row + j); + detail::float8 cv = detail::load_convert_to_float(c_row + j); acc = acc + beta_vec * cv; } - simd::store_convert_from_float(c_row + j, acc); + detail::store_convert_from_float(c_row + j, acc); } for (; j < nc; ++j) { float val = acc_row[j]; diff --git a/mlx/backend/cpu/gemms/avx_simd_gemv.h b/mlx/backend/cpu/gemms/avx_simd_gemv.h index 178e9dd450..dfda359aeb 100644 --- a/mlx/backend/cpu/gemms/avx_simd_gemv.h +++ b/mlx/backend/cpu/gemms/avx_simd_gemv.h @@ -6,7 +6,7 @@ #include #include "mlx/backend/cpu/gemms/aligned_buffer.h" -#include "mlx/backend/cpu/simd/avx_simd.h" +#include "mlx/backend/cpu/gemms/avx_gemm_simd.h" namespace mlx::core { @@ -30,7 +30,7 @@ static void gemv_outer_product( for (int k = 0; k < K; k++) { float v = static_cast(vec[k]); - simd::float8 v_bcast(v); + detail::float8 v_bcast(v); const T* mat_row = mat + k * mat_stride + jc; // Prefetch start of next row for this block @@ -42,10 +42,10 @@ static void gemv_outer_product( int j = 0; for (; j + sw <= nc; j += sw) { - simd::float8 m = simd::load_convert_to_float(mat_row + j); - simd::float8 c = simd::load(acc_block + j); - simd::store( - acc_block + j, simd::fma(v_bcast, m, c)); + detail::float8 m = detail::load_convert_to_float(mat_row + j); + detail::float8 c = detail::load(acc_block + j); + detail::store( + acc_block + j, detail::fma(v_bcast, m, c)); } for (; j < nc; j++) { acc_block[j] += v * static_cast(mat_row[j]); @@ -69,7 +69,7 @@ static void gemv_dot_product( int i = 0; for (; i + UNROLL <= n_outputs; i += UNROLL) { - simd::float8 s0, s1, s2, s3; + detail::float8 s0, s1, s2, s3; const T* r0 = mat + (i + 0) * mat_stride; const T* r1 = mat + (i + 1) * mat_stride; @@ -78,17 +78,17 @@ static void gemv_dot_product( int k = 0; for (; k + sw <= K; k += sw) { - simd::float8 v = simd::load_convert_to_float(vec + k); - s0 = simd::fma(simd::load_convert_to_float(r0 + k), v, s0); - s1 = simd::fma(simd::load_convert_to_float(r1 + k), v, s1); - s2 = simd::fma(simd::load_convert_to_float(r2 + k), v, s2); - s3 = simd::fma(simd::load_convert_to_float(r3 + k), v, s3); + detail::float8 v = detail::load_convert_to_float(vec + k); + s0 = detail::fma(detail::load_convert_to_float(r0 + k), v, s0); + s1 = detail::fma(detail::load_convert_to_float(r1 + k), v, s1); + s2 = detail::fma(detail::load_convert_to_float(r2 + k), v, s2); + s3 = detail::fma(detail::load_convert_to_float(r3 + k), v, s3); } - float d0 = simd::sum(s0); - float d1 = simd::sum(s1); - float d2 = simd::sum(s2); - float d3 = simd::sum(s3); + float d0 = detail::sum(s0); + float d1 = detail::sum(s1); + float d2 = detail::sum(s2); + float d3 = detail::sum(s3); for (; k < K; k++) { float vk = static_cast(vec[k]); @@ -105,16 +105,16 @@ static void gemv_dot_product( } for (; i < n_outputs; i++) { - simd::float8 s; + detail::float8 s; const T* row = mat + i * mat_stride; int k = 0; for (; k + sw <= K; k += sw) { - simd::float8 v = simd::load_convert_to_float(vec + k); - s = simd::fma(simd::load_convert_to_float(row + k), v, s); + detail::float8 v = detail::load_convert_to_float(vec + k); + s = detail::fma(detail::load_convert_to_float(row + k), v, s); } - float d = simd::sum(s); + float d = detail::sum(s); for (; k < K; k++) { d += static_cast(vec[k]) * static_cast(row[k]); } @@ -167,18 +167,18 @@ void simd_gemv( // Writeback: C = alpha * acc + beta * C (convert fp32 → T) bool apply_alpha = (alpha != 1.0f); bool apply_beta = (beta != 0.0f); - simd::float8 alpha_vec(alpha); - simd::float8 beta_vec(beta); + detail::float8 alpha_vec(alpha); + detail::float8 beta_vec(beta); int j = 0; for (; j + sw <= out_len; j += sw) { - simd::float8 val = simd::load(acc + j); + detail::float8 val = detail::load(acc + j); if (apply_alpha) val = alpha_vec * val; if (apply_beta) { - simd::float8 cv = simd::load_convert_to_float(c + j); + detail::float8 cv = detail::load_convert_to_float(c + j); val = val + beta_vec * cv; } - simd::store_convert_from_float(c + j, val); + detail::store_convert_from_float(c + j, val); } for (; j < out_len; j++) { float val = acc[j]; From 3d8d892509897882f47f0cccb57b30284e74b45c Mon Sep 17 00:00:00 2001 From: Andrew Sweet Date: Fri, 15 May 2026 07:46:53 +0000 Subject: [PATCH 6/6] addressing PR feedback --- .gitignore | 1 + CMakeLists.txt | 59 +++++++++++++-------------- mlx/backend/cpu/gemms/avx_gemm_simd.h | 9 +--- mlx/backend/cpu/gemms/simd_bf16.cpp | 2 +- mlx/backend/cpu/gemms/simd_fp16.cpp | 2 +- 5 files changed, 33 insertions(+), 40 deletions(-) diff --git a/.gitignore b/.gitignore index 1daaa46d12..f6e9b94480 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,4 @@ uv.lock .cache/ # vim *.swp +results/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 5890405cde..6eb3d739ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,36 +49,6 @@ option(USE_ASAN "Enable AddressSanitizer (ASan)" OFF) option(USE_UBSAN "Enable UndefinedBehaviorSanitizer (UBSan)" OFF) option(USE_TSAN "Enable ThreadSanitizer (TSan)" OFF) -# ----------------------------- x86 SIMD Detection ----------------------------- -if(MLX_BUILD_CPU AND (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i[3-9]86")) - include(CheckCXXCompilerFlag) - - check_cxx_compiler_flag("-mavx2" HAS_AVX2) - check_cxx_compiler_flag("-mfma" HAS_FMA) - check_cxx_compiler_flag("-mf16c" HAS_F16C) - - if(HAS_AVX2 - AND HAS_FMA - AND HAS_F16C) - message( - STATUS "Compiler supports AVX2/FMA/F16C - enabling AVX SIMD backend") - add_compile_options(-mavx2 -mfma -mf16c) - add_compile_definitions(MLX_USE_AVX) - else() - message( - STATUS "Missing required x86 SIMD support - using base SIMD backend") - if(NOT HAS_AVX2) - message(STATUS " Missing: AVX2") - endif() - if(NOT HAS_FMA) - message(STATUS " Missing: FMA") - endif() - if(NOT HAS_F16C) - message(STATUS " Missing: F16C") - endif() - endif() -endif() - # --------------------- Processor tests ------------------------- message( STATUS @@ -274,6 +244,35 @@ if(WIN32) endif() if(MLX_BUILD_CPU) + # ----------------------------- x86 SIMD -------------------------------- + if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i[3-9]86") + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag("-mavx2" HAS_AVX2) + check_cxx_compiler_flag("-mfma" HAS_FMA) + check_cxx_compiler_flag("-mf16c" HAS_F16C) + + if(HAS_AVX2 + AND HAS_FMA + AND HAS_F16C) + message( + STATUS "Compiler supports AVX2/FMA/F16C - enabling AVX2 SIMD backend") + target_compile_options(mlx PRIVATE -mavx2 -mfma -mf16c) + target_compile_definitions(mlx PRIVATE MLX_USE_AVX2) + else() + message( + STATUS "Missing required x86 SIMD support - using base SIMD backend") + if(NOT HAS_AVX2) + message(STATUS " Missing: AVX2") + endif() + if(NOT HAS_FMA) + message(STATUS " Missing: FMA") + endif() + if(NOT HAS_F16C) + message(STATUS " Missing: F16C") + endif() + endif() + endif() + find_library(ACCELERATE_LIBRARY Accelerate) if(ACCELERATE_LIBRARY) message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}") diff --git a/mlx/backend/cpu/gemms/avx_gemm_simd.h b/mlx/backend/cpu/gemms/avx_gemm_simd.h index 8d125d310e..d63e3ff908 100644 --- a/mlx/backend/cpu/gemms/avx_gemm_simd.h +++ b/mlx/backend/cpu/gemms/avx_gemm_simd.h @@ -8,14 +8,7 @@ #include "mlx/backend/cpu/simd/base_simd.h" -// GEMM-private SIMD helpers for the AVX2 fp16/bf16 matmul path. -// -// This header intentionally lives under backend/cpu/gemms/ and uses the -// mlx::core::detail namespace rather than mlx::core::simd. The helpers here -// are scaffolding for the GEMM/GEMV kernels in this directory; they are not -// a general SIMD abstraction layer and should not be depended on from -// outside the gemms/ tree. A future, broader AVX2 SIMD layer can land under -// backend/cpu/simd/ in mlx::core::simd without colliding with these symbols. +// GEMM-private AVX2 SIMD helpers for fp16/bf16 matmul namespace mlx::core::detail { // Forward declarations diff --git a/mlx/backend/cpu/gemms/simd_bf16.cpp b/mlx/backend/cpu/gemms/simd_bf16.cpp index b841ffe450..11ef34f46f 100644 --- a/mlx/backend/cpu/gemms/simd_bf16.cpp +++ b/mlx/backend/cpu/gemms/simd_bf16.cpp @@ -3,7 +3,7 @@ #include "mlx/backend/common/utils.h" #include "mlx/backend/cpu/gemm.h" -#ifdef MLX_USE_AVX +#ifdef MLX_USE_AVX2 #include "mlx/backend/cpu/gemms/avx_simd_gemm.h" #else #include "mlx/backend/cpu/gemms/simd_gemm.h" diff --git a/mlx/backend/cpu/gemms/simd_fp16.cpp b/mlx/backend/cpu/gemms/simd_fp16.cpp index 5e298a3a94..826a2ca60a 100644 --- a/mlx/backend/cpu/gemms/simd_fp16.cpp +++ b/mlx/backend/cpu/gemms/simd_fp16.cpp @@ -3,7 +3,7 @@ #include "mlx/backend/common/utils.h" #include "mlx/backend/cpu/gemm.h" -#ifdef MLX_USE_AVX +#ifdef MLX_USE_AVX2 #include "mlx/backend/cpu/gemms/avx_simd_gemm.h" #else #include "mlx/backend/cpu/gemms/simd_gemm.h"