@@ -138,39 +138,29 @@ static bool
138138add_impl (ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
139139 int nthreads)
140140{
141- #if defined(OIIO_USE_HWY) && OIIO_USE_HWY
142- if (OIIO::pvt::enable_hwy && R.localpixels () && A.localpixels ()
143- && B.localpixels ()) {
144- auto Rv = HwyPixels (R);
145- auto Av = HwyPixels (A);
146- auto Bv = HwyPixels (B);
147- const int nchannels = roi.nchannels ();
148- const bool contig = ChannelsContiguous<Rtype>(Rv, nchannels)
149- && ChannelsContiguous<Atype>(Av, nchannels)
150- && ChannelsContiguous<Btype>(Bv, nchannels);
151- if (contig) {
152- // Use native integer path for scale-invariant add when all types
153- // match and are integer types (much faster: 6-12x vs 3-5x with
154- // float conversion).
155- constexpr bool all_same = std::is_same_v<Rtype, Atype>
156- && std::is_same_v<Atype, Btype>;
157- constexpr bool is_integer = std::is_integral_v<Rtype>;
158- if constexpr (all_same && is_integer)
159- return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
160- return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
161- }
162-
141+ #if OIIO_USE_HWY
142+ // First case: hwy enabled, all images have local pixels and the
143+ // number of channels in the ROI, and fully encompass the ROI.
144+ if (OIIO::pvt::enable_hwy && HwySupports<Rtype>(R, roi)
145+ && HwySupports<Atype>(A, roi) && HwySupports<Btype>(B, roi)) {
146+ // Use native integer path for scale-invariant add when all types
147+ // match and are integer types (much faster: 6-12x vs 3-5x with
148+ // float conversion).
149+ constexpr bool all_same = std::is_same_v<Rtype, Atype>
150+ && std::is_same_v<Atype, Btype>;
151+ constexpr bool is_integer = std::is_integral_v<Rtype>;
152+ if constexpr (all_same && is_integer)
153+ return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
154+ return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
155+ }
156+ // Second case: the buffers are RGBA but we are only adding RGB
157+ // (preserving alpha).
158+ // Is this a case we will actually encounter?
159+ if (OIIO::pvt::enable_hwy && HwySupports<Rtype>(R, roi, 4 )
160+ && HwySupports<Atype>(A, roi, 4 ) && HwySupports<Btype>(B, roi, 4 )
161+ && (roi.chbegin == 0 && roi.chend == 3 )) {
163162 // Handle the common RGBA + RGB ROI strided case (preserving alpha).
164- if (roi.chbegin == 0 && roi.chend == 3 ) {
165- const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
166- && Bv.nchannels >= 4 )
167- && ChannelsContiguous<Rtype>(Rv, 4 )
168- && ChannelsContiguous<Atype>(Av, 4 )
169- && ChannelsContiguous<Btype>(Bv, 4 );
170- if (contig4)
171- return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
172- nthreads);
173- }
163+ return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
174164 }
175165#endif
176166 return add_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
@@ -181,7 +171,7 @@ static bool
181171add_impl (ImageBuf& R, const ImageBuf& A, cspan<float > b, ROI roi, int nthreads)
182172{
183173#if OIIO_USE_HWY
184- if (OIIO::pvt::enable_hwy && R. localpixels () && A. localpixels ( ))
174+ if (OIIO::pvt::enable_hwy && HwySupports<Rtype>(R, roi ))
185175 return add_impl_hwy<Rtype, Atype>(R, A, b, roi, nthreads);
186176#endif
187177 return add_impl_scalar<Rtype, Atype>(R, A, b, roi, nthreads);
@@ -238,39 +228,29 @@ static bool
238228sub_impl (ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
239229 int nthreads)
240230{
241- #if defined(OIIO_USE_HWY) && OIIO_USE_HWY
242- if (OIIO::pvt::enable_hwy && R.localpixels () && A.localpixels ()
243- && B.localpixels ()) {
244- auto Rv = HwyPixels (R);
245- auto Av = HwyPixels (A);
246- auto Bv = HwyPixels (B);
247- const int nchannels = roi.nchannels ();
248- const bool contig = ChannelsContiguous<Rtype>(Rv, nchannels)
249- && ChannelsContiguous<Atype>(Av, nchannels)
250- && ChannelsContiguous<Btype>(Bv, nchannels);
251- if (contig) {
252- // Use native integer path for scale-invariant sub when all types
253- // match and are integer types (much faster: 6-12x vs 3-5x with
254- // float conversion).
255- constexpr bool all_same = std::is_same_v<Rtype, Atype>
256- && std::is_same_v<Atype, Btype>;
257- constexpr bool is_integer = std::is_integral_v<Rtype>;
258- if constexpr (all_same && is_integer)
259- return sub_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
260- return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
261- }
262-
231+ #if OIIO_USE_HWY
232+ // First case: hwy enabled, all images have local pixels and the
233+ // number of channels in the ROI, and fully encompass the ROI.
234+ if (OIIO::pvt::enable_hwy && HwySupports<Rtype>(R, roi)
235+ && HwySupports<Atype>(A, roi) && HwySupports<Btype>(B, roi)) {
236+ // Use native integer path for scale-invariant sub when all types
237+ // match and are integer types (much faster: 6-12x vs 3-5x with
238+ // float conversion).
239+ constexpr bool all_same = std::is_same_v<Rtype, Atype>
240+ && std::is_same_v<Atype, Btype>;
241+ constexpr bool is_integer = std::is_integral_v<Rtype>;
242+ if constexpr (all_same && is_integer)
243+ return sub_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
244+ return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
245+ }
246+ // Second case: the buffers are RGBA but we are only subtracting RGB
247+ // (preserving alpha).
248+ // Is this a case we will actually encounter?
249+ if (OIIO::pvt::enable_hwy && HwySupports<Rtype>(R, roi, 4 )
250+ && HwySupports<Atype>(A, roi, 4 ) && HwySupports<Btype>(B, roi, 4 )
251+ && (roi.chbegin == 0 && roi.chend == 3 )) {
263252 // Handle the common RGBA + RGB ROI strided case (preserving alpha).
264- if (roi.chbegin == 0 && roi.chend == 3 ) {
265- const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
266- && Bv.nchannels >= 4 )
267- && ChannelsContiguous<Rtype>(Rv, 4 )
268- && ChannelsContiguous<Atype>(Av, 4 )
269- && ChannelsContiguous<Btype>(Bv, 4 );
270- if (contig4)
271- return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
272- nthreads);
273- }
253+ return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
274254 }
275255#endif
276256 return sub_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
0 commit comments