diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index 353303e54c..6920ab4713 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -150,6 +150,8 @@ jobs: - { name: 'half', image: '2026', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_BLOSC=OFF -DUSE_IMATH_HALF=ON' } - { name: 'sse', image: '2026', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=SSE42' } - { name: 'avx', image: '2026', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=AVX' } + - { name: 'simd', image: '2026', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=AVX -DUSE_STD_SIMD=ON' } + - { name: 'simd-fallback', image: '2026', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=AVX -DUSE_STD_SIMD=OFF' } - { name: 'pygrid', image: '2026', build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_PYTHON_WRAP_ALL_GRID_TYPES=ON' } - { name: 'asan', image: '2026', build: 'asan', components: 'core,test', cmake: '-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON -DNANOVDB_USE_OPENVDB=ON -DOPENVDB_AX_STATIC=OFF -DOPENVDB_CORE_STATIC=OFF -DUSE_BLOSC=OFF' } # We never called blosc_destroy(), so disable blosc to silence these errors - { name: 'ubsan', image: '2026', build: 'ubsan', components: 'core,test', cmake: '-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON -DCMAKE_CXX_FLAGS="-Wno-deprecated-declarations" ' } diff --git a/CMakeLists.txt b/CMakeLists.txt index 089381aa41..25b53106cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,6 +131,13 @@ required.]=] ${USE_EXR}) option(USE_PNG "Use PNG while building openvdb components." OFF) option(USE_AX "Use OpenVDB AX while building openvdb components." ${OPENVDB_BUILD_AX}) option(USE_NANOVDB "Use NanoVDB while building openvdb components." ${OPENVDB_BUILD_NANOVDB}) +option(USE_STD_SIMD [=[ +Suppress auto-detection of std::experimental::simd (C++ Parallelism TS v2) and +force the std::array fallback backend for openvdb::simd::Simd. By default +the TS v2 header is used when the compiler provides it; set this to OFF to +disable. Has no effect on non-x86 or compilers that do not ship the TS header. +Note: unlike USE_VCL this option requires no external dependency — the TS header +is part of the compiler's standard library.]=] ON) cmake_dependent_option(OPENVDB_DISABLE_BOOST_IMPLICIT_LINKING "Disable the implicit linking of Boost libraries on Windows" ON "WIN32" OFF) @@ -420,6 +427,11 @@ elseif(OPENVDB_SIMD STREQUAL "SSE42") add_compile_definitions("$<$:OPENVDB_USE_SSE42>") endif() +# Suppress std::experimental::simd auto-detection if USE_STD_SIMD is OFF. +if(NOT USE_STD_SIMD) + add_compile_definitions("$<$:OPENVDB_NO_STD_SIMD>") +endif() + ######################################################################### # Configure our cmake modules to only search for static libraries diff --git a/openvdb/openvdb/Platform.h b/openvdb/openvdb/Platform.h index 017876490b..dae09ee8db 100644 --- a/openvdb/openvdb/Platform.h +++ b/openvdb/openvdb/Platform.h @@ -55,6 +55,17 @@ #endif #endif +/// Auto-detect std::experimental::simd (C++ Parallelism TS v2). +/// When available, openvdb::simd::Simd wraps it to emit native SIMD +/// instructions without relying on the auto-vectorizer. +/// Suppress with -DOPENVDB_NO_STD_SIMD to force the std::array fallback. +#if !defined(OPENVDB_NO_STD_SIMD) && defined(__has_include) && __has_include() +# include +# ifdef __cpp_lib_experimental_parallel_simd +# define OPENVDB_USE_STD_SIMD 1 +# endif +#endif + /// Windows defines #ifdef _WIN32 ///Disable the non-portable Windows definitions of min() and max() macros diff --git a/openvdb/openvdb/points/impl/PointRasterizeEllipsoidsSDFImpl.h b/openvdb/openvdb/points/impl/PointRasterizeEllipsoidsSDFImpl.h index 1e70a12677..f394141e91 100644 --- a/openvdb/openvdb/points/impl/PointRasterizeEllipsoidsSDFImpl.h +++ b/openvdb/openvdb/points/impl/PointRasterizeEllipsoidsSDFImpl.h @@ -399,6 +399,20 @@ struct EllipsoidTransferQuat final : this->BaseT::rasterizePoint(ijk, id, bounds, R); } + /// @brief Point-by-point dispatch — ellipsoids are not batchable via + /// the spherical SIMD path. This override prevents SphericalTransfer:: + /// rasterizePoints (which calls rasterizeN2) from being instantiated for + /// the FixedBandRadius radius type used by ellipsoids. + inline void rasterizePoints(const Coord& ijk, + const Index start, + const Index end, + const CoordBBox& bounds) + { + for (Index i = start; i < end; ++i) { + this->rasterizePoint(ijk, i, bounds); + } + } + private: std::unique_ptr mRotationHandle; }; @@ -446,6 +460,17 @@ struct EllipsoidTransferMat3 final : this->BaseT::rasterizePoint(ijk, id, bounds, mXformHandle->get(id)); } + /// @brief See EllipsoidTransferQuat::rasterizePoints. + inline void rasterizePoints(const Coord& ijk, + const Index start, + const Index end, + const CoordBBox& bounds) + { + for (Index i = start; i < end; ++i) { + this->rasterizePoint(ijk, i, bounds); + } + } + private: std::unique_ptr mXformHandle; }; diff --git a/openvdb/openvdb/points/impl/PointRasterizeSDFImpl.h b/openvdb/openvdb/points/impl/PointRasterizeSDFImpl.h index 393eb26c22..563efdf658 100644 --- a/openvdb/openvdb/points/impl/PointRasterizeSDFImpl.h +++ b/openvdb/openvdb/points/impl/PointRasterizeSDFImpl.h @@ -9,6 +9,13 @@ #ifndef OPENVDB_POINTS_RASTERIZE_SDF_IMPL_HAS_BEEN_INCLUDED #define OPENVDB_POINTS_RASTERIZE_SDF_IMPL_HAS_BEEN_INCLUDED +#include + +/// @brief Set to 1 to disable batched SIMD rasterization and always use the +/// scalar per-point path, regardless of the ISA in use. Useful for +/// debugging or performance comparison. +#define OPENVDB_DISABLE_BATCHED_TRANSFERS 0 + namespace openvdb { OPENVDB_USE_VERSION_NAMESPACE namespace OPENVDB_VERSION_NAME { @@ -177,6 +184,19 @@ struct SignedDistanceFieldTransfer : using FilteredTransferT = FilteredTransfer; using PositionHandleT = AttributeHandle; + /// @brief Return the SIMD batch width for arithmetic type RealT. + /// Returns 1 (scalar fallback) when batched transfers are disabled or + /// no ISA is configured. + template + static constexpr size_t GetBatchSize() + { +#if OPENVDB_DISABLE_BATCHED_TRANSFERS + return 1; +#else + return simd::SimdNativeT::width; +#endif + } + // typically the max radius of all points rounded up inline Vec3i range(const Coord&, size_t) const { return mMaxKernelWidth; } @@ -305,10 +325,7 @@ struct SphericalTransfer : const std::unordered_map* ids = nullptr) : SphericalTransfer(pidx, Vec3i(width), rt, source, filter, interrupt, surface, cpg, ids) {} - /// @brief For each point, stamp a sphere with a given radius by running - /// over all intersecting voxels and calculating if this point is closer - /// than the currently held distance value. Note that the default value - /// of the surface buffer should be the background value of the surface. + /// @brief Dispatch: batched SIMD path when N2 > 1, scalar fallback when N2 == 1. inline void rasterizePoint(const Coord& ijk, const Index id, const CoordBBox& bounds) @@ -328,6 +345,46 @@ struct SphericalTransfer : this->rasterizePoint(P, id, bounds, this->mRadius.eval(id)); } + /// @brief For each point in [start, end), batch into groups of N2 and + /// dispatch to rasterizeN2. Partial final batches pad with -1 sentinels + /// and fall through to scalar for size-1 remainders. + inline void rasterizePoints(const Coord& ijk, + const Index start, + const Index end, + const CoordBBox& bounds) + { + constexpr size_t N2 = BaseT::template GetBatchSize(); + if constexpr (N2 == 1) { + // Scalar fallback: no ISA configured or batching disabled + for (Index i = start; i < end; ++i) { + this->rasterizePoint(ijk, i, bounds); + } + } else { + // Batched SIMD path. N2 must be a power of two. + static_assert((N2 > 1) && !(N2 & (N2 - 1))); + + std::array ids; + size_t offset = 0; + for (Index i = start; i < end; ++i) { + if (!BaseT::filter(i)) continue; + ids[offset++] = int64_t(i); + if (offset == N2) { + this->rasterizeN2(ijk, ids, bounds); + offset = 0; + } + } + + if (offset == 0) return; + if (offset == 1) { + this->rasterizePoint(ijk, Index(ids[0]), bounds); + } else { + // Pad remaining slots with -1 sentinel; stamp will ignore them. + for (; offset < N2; ++offset) ids[offset] = int64_t(-1); + this->rasterizeN2(ijk, ids, bounds); + } + } + } + /// @brief Allow early termination if all voxels in the surface have been /// deactivated (all interior) inline bool endPointLeaf(const PointDataTree::LeafNodeType&) @@ -371,9 +428,6 @@ struct SphericalTransfer : /// to pass a different P and scaled FixedBandRadius from its ellipsoid /// path (as isolated points are stamped as spheres with a different /// scale and positions may have been smoothed). - /// @todo I would prefer this second function wasn't necessary but there - /// is no easy way to allow differently scaled radii to exist in a more - /// efficient manner, nor use a different P. inline void rasterizePoint(const Vec3d& P, const Index id, const CoordBBox& bounds, @@ -383,64 +437,179 @@ struct SphericalTransfer : CoordBBox intersectBox(Coord::round(P - max), Coord::round(P + max)); intersectBox.intersect(bounds); if (intersectBox.empty()) return; + this->stamp(RealT(P[0]), RealT(P[1]), RealT(P[2]), + RealT(r.get()), RealT(r.minSq()), RealT(r.maxSq()), &id, intersectBox); + } + + /// @brief Gather N2 points from the point-data leaf, convert AoS→SoA, + /// load into Simd vectors, and call the generic stamp. + template + inline void rasterizeN2(const Coord& ijk, + const std::array()>& points, + const CoordBBox& bounds) + { + using SimdT = simd::Simd; + + // Scratch: 3 SoA position arrays (+ 3 radius arrays for varying radii) + constexpr size_t CacheWords = (RadiusType::Fixed ? 3 : 6) * Size; + std::array cache; + + const Vec3d ijkd = ijk.asVec3d(); + Vec3d tmp{}; + + // Find last valid index; sentinel -1 marks padding slots. + // It is guaranteed at least 2 valid entries (caller routes size-1 to scalar). + int64_t firstInvalid = Size; + for (size_t i = 0; i < Size; ++i) { + if (points[i] == int64_t(-1)) { firstInvalid = int64_t(i); break; } + } + + // AoS → SoA: convert positions (and optionally radii) to RealT arrays. + for (size_t i = 0; i < Size; ++i) { + if (int64_t(i) < firstInvalid) { + tmp = ijkd + Vec3d(this->mPosition->get(Index(points[i]))); + tmp = this->transformSourceToTarget(tmp); + } + cache[i + Size*0] = tmp[0]; + cache[i + Size*1] = tmp[1]; + cache[i + Size*2] = tmp[2]; + if constexpr (!RadiusType::Fixed) { + if (int64_t(i) < firstInvalid) { + auto r = this->mRadius.eval(Index(points[i])); + cache[i + Size*3] = RealT(r.get()); + cache[i + Size*4] = RealT(r.minSq()); + cache[i + Size*5] = RealT(r.maxSq()); + } else { + cache[i + Size*3] = cache[i + Size*4] = cache[i + Size*5] = RealT(0); + } + } + } + + // Load SoA arrays into SIMD registers. + SimdT px(cache.data() + Size*0); + SimdT py(cache.data() + Size*1); + SimdT pz(cache.data() + Size*2); + + SimdT rad, rmin2, rmax2; + if constexpr (RadiusType::Fixed) { + auto r = this->mRadius.eval(Index(0)); // same for all + rad = SimdT(RealT(r.get())); + rmin2 = SimdT(RealT(r.minSq())); + rmax2 = SimdT(RealT(r.maxSq())); + } else { + rad = SimdT(cache.data() + Size*3); + rmin2 = SimdT(cache.data() + Size*4); + rmax2 = SimdT(cache.data() + Size*5); + } + + // Compute bounding box from the widest point in the batch. + const SimdT rmax_broad = simd::hmax(rmax2); // broadcast max radius² + // Conservative: use sqrt(max rmax2) as the search radius + const RealT searchR = std::sqrt(RealT(simd::hmax(rmax2))); + const SimdT pxMin = simd::hmin(px), pyMin = simd::hmin(py), pzMin = simd::hmin(pz); + const SimdT pxMax = simd::hmax(px), pyMax = simd::hmax(py), pzMax = simd::hmax(pz); + + CoordBBox intersectBox( + Coord::round(Vec3d(RealT(pxMin) - searchR, RealT(pyMin) - searchR, RealT(pzMin) - searchR)), + Coord::round(Vec3d(RealT(pxMax) + searchR, RealT(pyMax) + searchR, RealT(pzMax) + searchR))); + intersectBox.intersect(bounds); + if (intersectBox.empty()) return; + + this->stamp(px, py, pz, rad, rmin2, rmax2, points.data(), intersectBox); + } + + /// @brief Generic-T stamp: ScalarT is either RealT (scalar) or Simd + /// (SIMD batch). The same source compiles for both with zero #ifdef. + /// + /// Key design points: + /// - Arithmetic operators (+, -, *, /) and comparisons (>=, <=, ==) are + /// overloaded for Simd and resolve to the built-in operators for + /// scalar T. No simd::add() free-function calls. + /// - simd::where(mask, a, b) replaces ternary; scalar overload is a + /// plain ternary. + /// - simd::hmin(dist) reduces all lanes to the minimum and broadcasts + /// back, keeping the result in ScalarT space. For scalar T it is the + /// identity. + /// - ScalarT(hmin_result) invokes explicit operator T() on Simd to + /// extract the scalar at the single write boundary; for scalar T this + /// is a trivial copy. + /// - simd::hall / simd::hany return bool for both scalar and SIMD. + template + inline void stamp(const ScalarT& Px, const ScalarT& Py, const ScalarT& Pz, + const ScalarT& r, const ScalarT& Rmin2, const ScalarT& Rmax2, + const IdT ids, const CoordBBox& intersection) + { + using simd::where; + using simd::hall; + using simd::hany; + using simd::hmin; + using simd::hfirst; + using ElemT = simd::scalar_t; + + if (intersection.empty()) return; auto* const data = this->template buffer<0>(); - auto* const cpg = CPG ? this->template buffer() : nullptr; + [[maybe_unused]] auto* const cpgbuf = + CPG ? this->template buffer() : nullptr; auto& mask = *(this->template mask<0>()); - // If min2 == 0.0, then the index space radius is equal to or less than - // the desired half band. In this case each sphere interior always needs - // to be filled with distance values as we won't ever reach the negative - // background value. If, however, a point overlaps a voxel coord exactly, - // x2y2z2 will be 0.0. Forcing min2 to be less than zero here avoids - // incorrectly setting these voxels to inactive -background values as - // x2y2z2 will never be < 0.0. We still want the lteq logic in the - // (x2y2z2 <= min2) check as this is valid when min2 > 0.0. - const RealT min2 = r.minSq() == 0.0 ? -1.0 : r.minSq(); - const RealT max2 = r.maxSq(); + // When Rmin2 == 0 the sphere interior must be filled but x2y2z2 can + // equal 0 at the centre, which would incorrectly trigger the <= check. + // Force min2 negative so x2y2z2 <= min2 is never true in that case. + const ScalarT min2 = where(Rmin2 == ScalarT(ElemT(0)), ScalarT(ElemT(-1)), Rmin2); + const ScalarT max2 = Rmax2; + const ScalarT vdx(ElemT(this->mDx)); - const Coord& a(intersectBox.min()); - const Coord& b(intersectBox.max()); + const Coord& a(intersection.min()); + const Coord& b(intersection.max()); for (Coord c = a; c.x() <= b.x(); ++c.x()) { - const RealT x2 = static_cast(math::Pow2(c.x() - P[0])); - const Index i = ((c.x() & (DIM-1u)) << 2*LOG2DIM); // unsigned bit shift mult + const ScalarT x2 = simd::square(ScalarT(ElemT(c.x())) - Px); + const Index ii = ((c.x() & (DIM-1u)) << 2*LOG2DIM); for (c.y() = a.y(); c.y() <= b.y(); ++c.y()) { - const RealT x2y2 = static_cast(x2 + math::Pow2(c.y() - P[1])); - const Index ij = i + ((c.y() & (DIM-1u)) << LOG2DIM); + const ScalarT x2y2 = x2 + simd::square(ScalarT(ElemT(c.y())) - Py); + const Index ij = ii + ((c.y() & (DIM-1u)) << LOG2DIM); for (c.z() = a.z(); c.z() <= b.z(); ++c.z()) { - const Index offset = ij + /*k*/(c.z() & (DIM-1u)); - if (!mask.isOn(offset)) continue; // inside existing level set or not in range + const Index offset = ij + (c.z() & (DIM-1u)); + if (!mask.isOn(offset)) continue; + + const ScalarT x2y2z2 = x2y2 + simd::square(ScalarT(ElemT(c.z())) - Pz); - const RealT x2y2z2 = static_cast(x2y2 + math::Pow2(c.z() - P[2])); - if (x2y2z2 >= max2) continue; //outside narrow band of particle in positive direction - if (x2y2z2 <= min2) { //outside narrow band of the particle in negative direction. can disable this to fill interior + // All lanes outside positive band — skip voxel entirely. + if (hall(x2y2z2 >= max2)) continue; + + // Any lane inside negative band — voxel is interior. + if (hany(x2y2z2 <= min2)) { data[offset] = -(this->mBackground); mask.setOff(offset); continue; } - const ValueT d = ValueT(this->mDx * (math::Sqrt(x2y2z2) - r.get())); // back to world space + // Distance to surface, back to world space. + const ScalarT dist = vdx * (simd::sqrt(x2y2z2) - r); + + // hmin reduces all lanes to the minimum distance and + // broadcasts it back into ScalarT so subsequent arithmetic + // stays in the same type domain. operator ElemT() then + // extracts the scalar at this single write boundary. + const ScalarT mindist = hmin(dist); + const ValueT d = ValueT(ElemT(mindist)); + ValueT& v = data[offset]; if (d < v) { v = d; - OPENVDB_NO_TYPE_CONVERSION_WARNING_BEGIN - if (CPG) cpg[offset] = Int64(this->mPLeafMask | Index64(id)); - OPENVDB_NO_TYPE_CONVERSION_WARNING_END - // transfer attributes - we can't use this here as the exposed - // function signatures take vector of attributes (i.e. an unbounded - // size). If we instead clamped the attribute transfer to a fixed - // amount of attributes we could get rid of the closest point logic - // entirely. @todo consider adding another signature for increased - // performance - // this->foreach([&](auto* buffer, const size_t idx) { - // using Type = typename std::remove_pointer::type; - // buffer[offset] = mAttributes.template get(idx); - // }) + if constexpr (CPG) { + // Find which lane contributed the minimum distance. + const int winner = hfirst(mindist == dist); + OPENVDB_ASSERT(winner >= 0); + OPENVDB_NO_TYPE_CONVERSION_WARNING_BEGIN + cpgbuf[offset] = Int64(this->mPLeafMask | Index64(ids[winner])); + OPENVDB_NO_TYPE_CONVERSION_WARNING_END + } } } } - } // outer sdf voxel - } // point idx + } + } }; /// @brief The transfer implementation for averaging of positions followed by diff --git a/openvdb/openvdb/simd/ASSEMBLY_NOTES.md b/openvdb/openvdb/simd/ASSEMBLY_NOTES.md new file mode 100644 index 0000000000..ef487b79b6 --- /dev/null +++ b/openvdb/openvdb/simd/ASSEMBLY_NOTES.md @@ -0,0 +1,94 @@ +# Assembly notes for `Simd` — `rasterizeSdf` demo path + +Examined symbol: +`SphericalTransfer<..., NullCodec, FixedBandRadius, NullFilter, false>::rasterizeN2<4>` + +Object file: `build/.../unittest/CMakeFiles/vdb_test.dir/TestPointRasterizeSDF.cc.o` +Build flags: `-O3 -mavx -msse4.2` (CPU: Intel Core Ultra 9 285K — supports AVX2) + +--- + +## Inner per-voxel z-loop (hot path) + +The `stamp>` kernel is fully inlined into `rasterizeN2<4>`. +Key instructions in the innermost loop body: + +```nasm +; --- broadcast voxel coord to all 4 particle lanes --- +vbroadcastsd %xmm0, %ymm0 ; voxel.z → 4 lanes + +; --- x²+y²+z² for 4 particles simultaneously --- +vsubpd Pz_simd(%rbp), %ymm0, %ymm0 ; vox.z - Pz[0..3] +vfmadd213pd xy2_simd(%rbp), %ymm0, %ymm0 ; z² + (x²+y²) [AVX2 FMA] + +; --- all-outside early exit (zero extra cost) --- +vcmplepd %ymm0, rmax2_ymm, %ymm1 ; rmax² <= dist² lane-wise +vmovmskpd %ymm1, %eax ; fold 4 comparison bits → scalar int +not %eax +test $0xf, %al +je next_voxel ; all 4 outside → skip with one branch + +; --- 4 square roots in one instruction --- +vsqrtpd %ymm0, %ymm1 ; sqrt(x²+y²+z²) for all 4 particles + +; --- dist = (sqrt(dist²) - r) × vdx --- +vsubpd r_simd(%rbp), %ymm1, %ymm1 +vmulpd vdx_simd(%rbp), %ymm1, %ymm1 + +; --- hmin: tree reduction 4→2→1 --- +vmovapd %xmm1, %xmm0 ; lower 2 lanes +vextractf128 $1, %ymm1, %xmm1 ; upper 2 lanes +; (stdx::reduce helper — see note below) +vmovsd %xmm0, %xmm0, %xmm1 +vunpckhpd %xmm0, %xmm0, %xmm0 +vminsd %xmm1, %xmm0, %xmm1 ; scalar minimum + +; --- scalar write boundary --- +vcvtsd2ss %xmm1, %xmm1, %xmm1 ; double min → float (explicit operator T()) +vcomiss %xmm1, grid_val ; compare with existing voxel value +vmovss %xmm1, grid_val ; conditional store +``` + +--- + +## Key observations + +| | Observation | +|---|---| +| ✓ | All arithmetic uses 256-bit YMM registers (`vsubpd`, `vmulpd`, `vaddpd`, `vcmplepd`) | +| ✓ | `vsqrtpd` — 4 double square roots in **one** instruction vs. 4 serial `sqrtsd` in scalar | +| ✓ | `vcmplepd + vmovmskpd` — 4-lane comparison folds to a single branch (all-outside check) | +| ✓ | `vbroadcastsd` — voxel coordinates broadcast across all 4 particle lanes for free | +| ✓ | `vfmadd213pd` — fused multiply-add for z²+(x²+y²), no extra instruction | +| ✓ | `vcvtsd2ss` at write boundary — clean extraction of scalar min (matches `explicit operator T()`) | +| ⚠ | Two `vzeroupper + call` in the inner loop — AVX→SSE ABI transition before stdx helpers | + +--- + +## The `vzeroupper` concern + +There are two `vzeroupper + call` sequences per voxel iteration: + +1. Before `stdx::any_of` — used for `hany(x2y2z2 <= min2)` (interior-fill check). +2. Before `stdx::reduce` — used for the `hmin` 4→2 tree step. + +These arise because GCC's `std::experimental::fixed_size_simd` calls +internal helpers for reductions rather than emitting everything inline. +On Arrow Lake (`vzeroupper` ≈ 2 cycles), this is minor. +On older microarchitectures (Skylake, ~100 cycle penalty), it matters more. + +**Mitigation if needed:** replace `stdx::reduce` with explicit `vextractf128 + +vminpd + vpermilpd + vminsd` in `hmin`, bypassing the helper entirely. This +is mechanical and can be added to Backend A without changing the public API. + +--- + +## Scalar comparison (`-march=native` vs `-mavx`) + +Recompiling with `-march=native` (enables AVX2 + FMA) yields: +- `vbroadcastsd` replaces `vmovddup + vinsertf128` (cleaner 1-instruction broadcast). +- `vfmadd213pd` replaces `vmulpd + vaddpd` (fused multiply-add, saves an instruction). +- The `vzeroupper + call` pattern is **unchanged** — it is a stdx implementation + detail, not an ISA limitation. + +Both builds produce correct results; all 8 `TestPointRasterizeSDF` test cases pass. diff --git a/openvdb/openvdb/simd/Simd.h b/openvdb/openvdb/simd/Simd.h new file mode 100644 index 0000000000..d02b778bee --- /dev/null +++ b/openvdb/openvdb/simd/Simd.h @@ -0,0 +1,421 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +/// +/// @file Simd.h +/// +/// @brief Generic-T SIMD abstraction for OpenVDB. +/// +/// @details Provides Simd and SimdMask wrapper types together with +/// free functions (where, hmin, hmax, hall, hany, hfirst, sqrt, square) that +/// have matching scalar overloads, enabling kernels to be written once as +/// templates on a value type T and compiled for both: +/// +/// T = float / double — scalar path, used directly or on GPU +/// T = Simd — W-wide SIMD path for CPU batch processing +/// +/// The same source — operators, where(), hmin() etc. — compiles correctly +/// for both instantiations with no #ifdef or duplicated logic. +/// +/// @note Two backends are provided, selected automatically: +/// +/// Backend A (OPENVDB_USE_STD_SIMD): wraps std::experimental::simd +/// (Parallelism TS v2) in a thin class to provide the full API including +/// explicit operator T() and value_type. The compiler emits native SIMD +/// instructions directly; no auto-vectorizer involvement. +/// +/// Backend B (default, C++17): wraps std::array with element-wise +/// operator loops. The compiler auto-vectorizes fixed-count loops. +/// Annotated __hostdev__ for CUDA compatibility (NanoVDB use). +/// +/// @note Migration to std::simd (C++26) will be a one-line change in the +/// backend detection guard; all call sites are unchanged. + +#ifndef OPENVDB_SIMD_SIMD_HAS_BEEN_INCLUDED +#define OPENVDB_SIMD_SIMD_HAS_BEEN_INCLUDED + +#include +#include +#include +#include +#include + +#ifdef OPENVDB_USE_STD_SIMD +#include +#endif + +namespace openvdb { +OPENVDB_USE_VERSION_NAMESPACE +namespace OPENVDB_VERSION_NAME { +namespace simd { + +// ============================================================================ +// Register-width constants — derived from OPENVDB_SIMD compile flags. +// SimdNativeT uses this to select the natural lane count for element type T. +// ============================================================================ + +static constexpr size_t RegisterBits = +#if defined(OPENVDB_USE_AVX) + 256; +#elif defined(OPENVDB_USE_SSE42) + 128; +#else + 0; // no explicit ISA targeting; fall back to scalar (width 1) +#endif + +// ============================================================================ +// Backend A: std::experimental::simd — thin wrapper class +// ============================================================================ +#ifdef OPENVDB_USE_STD_SIMD + +namespace stdx = std::experimental; + +/// @brief SIMD mask wrapper for Backend A. +template +struct SimdMask +{ + using MaskT = stdx::fixed_size_simd_mask; + MaskT m; + + SimdMask() = default; + /*implicit*/ SimdMask(MaskT mask) : m(mask) {} + + bool operator[](int i) const { return m[i]; } +}; + +/// @brief SIMD value wrapper for Backend A. +template +struct Simd +{ + using value_type = T; + using VecT = stdx::fixed_size_simd; + VecT v; + + Simd() = default; + /*implicit*/ Simd(T scalar) : v(scalar) {} // broadcast + explicit Simd(const T* p) : v(p, stdx::element_aligned) {} // load + /*implicit*/ Simd(VecT vec) : v(vec) {} + + /// @brief Extract lane 0. When all lanes hold the same value (e.g. + /// after hmin/hmax), this converts a SIMD result back to a scalar. + explicit operator T() const { return v[0]; } + + T operator[](int i) const { return v[i]; } + + void store(T* p) const { v.copy_to(p, stdx::element_aligned); } + + // Arithmetic operators + Simd operator-() const { return Simd(-v); } + Simd operator+(Simd o) const { return Simd(v + o.v); } + Simd operator-(Simd o) const { return Simd(v - o.v); } + Simd operator*(Simd o) const { return Simd(v * o.v); } + Simd operator/(Simd o) const { return Simd(v / o.v); } + + // Compound assignment + Simd& operator+=(Simd o) { v += o.v; return *this; } + Simd& operator-=(Simd o) { v -= o.v; return *this; } + Simd& operator*=(Simd o) { v *= o.v; return *this; } + Simd& operator/=(Simd o) { v /= o.v; return *this; } + + // Comparison — return mask + SimdMask operator>=(Simd o) const { return SimdMask(v >= o.v); } + SimdMask operator<=(Simd o) const { return SimdMask(v <= o.v); } + SimdMask operator> (Simd o) const { return SimdMask(v > o.v); } + SimdMask operator< (Simd o) const { return SimdMask(v < o.v); } + SimdMask operator==(Simd o) const { return SimdMask(v == o.v); } + SimdMask operator!=(Simd o) const { return SimdMask(v != o.v); } +}; + +// Mixed scalar-SIMD operators +template Simd operator+(T a, Simd b) { return Simd(a) + b; } +template Simd operator+(Simd a, T b) { return a + Simd(b); } +template Simd operator-(T a, Simd b) { return Simd(a) - b; } +template Simd operator-(Simd a, T b) { return a - Simd(b); } +template Simd operator*(T a, Simd b) { return Simd(a) * b; } +template Simd operator*(Simd a, T b) { return a * Simd(b); } +template Simd operator/(T a, Simd b) { return Simd(a) / b; } +template Simd operator/(Simd a, T b) { return a / Simd(b); } + +// SIMD free functions — Backend A + +template +Simd where(SimdMask mask, Simd a, Simd b) { + auto result = b.v; + stdx::where(mask.m, result) = a.v; + return Simd(result); +} + +template +Simd min(Simd a, Simd b) { return Simd(stdx::min(a.v, b.v)); } + +template +Simd max(Simd a, Simd b) { return Simd(stdx::max(a.v, b.v)); } + +template +Simd sqrt(Simd v) { return Simd(stdx::sqrt(v.v)); } + +template +Simd abs(Simd v) { return Simd(stdx::abs(v.v)); } + +/// @brief Horizontal min — reduces all lanes to the minimum value and +/// broadcasts it back to all lanes. The result stays in Simd so +/// subsequent arithmetic (e.g. equality comparison to find winner lane) +/// remains in the same type domain. Use explicit operator T() to extract +/// the scalar at the write boundary: T result = T(hmin(v)); +template +Simd hmin(Simd v) { + // stdx::reduce passes intermediate simd chunks to the binary op (tree reduction), + // so the lambda must use auto parameters and stdx::min for element-wise selection. + T m = stdx::reduce(v.v, [](auto a, auto b){ return stdx::min(a, b); }); + return Simd(m); +} + +template +Simd hmax(Simd v) { + T m = stdx::reduce(v.v, [](auto a, auto b){ return stdx::max(a, b); }); + return Simd(m); +} + +/// @brief Returns true if all lanes of the mask are set. +template +bool hall(SimdMask mask) { return stdx::all_of(mask.m); } + +/// @brief Returns true if any lane of the mask is set. +template +bool hany(SimdMask mask) { return stdx::any_of(mask.m); } + +/// @brief Returns the index of the first set lane, or -1 if none. +template +int hfirst(SimdMask mask) { + for (int i = 0; i < W; ++i) if (mask[i]) return i; + return -1; +} + +// ============================================================================ +// Backend B: std::array fallback — auto-vectorizer target +// ============================================================================ +#else // !OPENVDB_USE_STD_SIMD + +/// @brief SIMD mask wrapper for Backend B. +template +struct SimdMask +{ + std::array m{}; + + SimdMask() = default; + + bool operator[](int i) const { return m[i]; } + bool& operator[](int i) { return m[i]; } +}; + +/// @brief SIMD value wrapper for Backend B. Element-wise operator loops are +/// the auto-vectorizer target; fixed-count, no struct indirection. +template +struct Simd +{ + using value_type = T; + std::array v{}; + + Simd() = default; + /*implicit*/ Simd(T scalar) { v.fill(scalar); } // broadcast + explicit Simd(const T* p) { // load + for (int i = 0; i < W; ++i) v[i] = p[i]; + } + + /// @brief Extract lane 0. See hmin() for the intended usage pattern. + explicit operator T() const { return v[0]; } + + T operator[](int i) const { return v[i]; } + T& operator[](int i) { return v[i]; } + + void store(T* p) const { + for (int i = 0; i < W; ++i) p[i] = v[i]; + } + + Simd operator-() const { + Simd r; for (int i = 0; i < W; ++i) r[i] = -v[i]; return r; + } + Simd operator+(Simd o) const { + Simd r; for (int i = 0; i < W; ++i) r[i] = v[i] + o[i]; return r; + } + Simd operator-(Simd o) const { + Simd r; for (int i = 0; i < W; ++i) r[i] = v[i] - o[i]; return r; + } + Simd operator*(Simd o) const { + Simd r; for (int i = 0; i < W; ++i) r[i] = v[i] * o[i]; return r; + } + Simd operator/(Simd o) const { + Simd r; for (int i = 0; i < W; ++i) r[i] = v[i] / o[i]; return r; + } + + Simd& operator+=(Simd o) { for (int i = 0; i < W; ++i) v[i] += o[i]; return *this; } + Simd& operator-=(Simd o) { for (int i = 0; i < W; ++i) v[i] -= o[i]; return *this; } + Simd& operator*=(Simd o) { for (int i = 0; i < W; ++i) v[i] *= o[i]; return *this; } + Simd& operator/=(Simd o) { for (int i = 0; i < W; ++i) v[i] /= o[i]; return *this; } + + SimdMask operator>=(Simd o) const { + SimdMask r; for (int i = 0; i < W; ++i) r[i] = v[i] >= o[i]; return r; + } + SimdMask operator<=(Simd o) const { + SimdMask r; for (int i = 0; i < W; ++i) r[i] = v[i] <= o[i]; return r; + } + SimdMask operator>(Simd o) const { + SimdMask r; for (int i = 0; i < W; ++i) r[i] = v[i] > o[i]; return r; + } + SimdMask operator<(Simd o) const { + SimdMask r; for (int i = 0; i < W; ++i) r[i] = v[i] < o[i]; return r; + } + SimdMask operator==(Simd o) const { + SimdMask r; for (int i = 0; i < W; ++i) r[i] = v[i] == o[i]; return r; + } + SimdMask operator!=(Simd o) const { + SimdMask r; for (int i = 0; i < W; ++i) r[i] = v[i] != o[i]; return r; + } +}; + +// Mixed scalar-SIMD operators +template Simd operator+(T a, Simd b) { return Simd(a) + b; } +template Simd operator+(Simd a, T b) { return a + Simd(b); } +template Simd operator-(T a, Simd b) { return Simd(a) - b; } +template Simd operator-(Simd a, T b) { return a - Simd(b); } +template Simd operator*(T a, Simd b) { return Simd(a) * b; } +template Simd operator*(Simd a, T b) { return a * Simd(b); } +template Simd operator/(T a, Simd b) { return Simd(a) / b; } +template Simd operator/(Simd a, T b) { return a / Simd(b); } + +// SIMD free functions — Backend B + +template +Simd where(SimdMask mask, Simd a, Simd b) { + Simd r; for (int i = 0; i < W; ++i) r[i] = mask[i] ? a[i] : b[i]; return r; +} + +template +Simd min(Simd a, Simd b) { + Simd r; for (int i = 0; i < W; ++i) r[i] = a[i] < b[i] ? a[i] : b[i]; return r; +} + +template +Simd max(Simd a, Simd b) { + Simd r; for (int i = 0; i < W; ++i) r[i] = a[i] > b[i] ? a[i] : b[i]; return r; +} + +template +Simd sqrt(Simd v) { + Simd r; for (int i = 0; i < W; ++i) r[i] = std::sqrt(v[i]); return r; +} + +template +Simd abs(Simd v) { + Simd r; for (int i = 0; i < W; ++i) r[i] = std::abs(v[i]); return r; +} + +template +Simd hmin(Simd v) { + T m = v[0]; for (int i = 1; i < W; ++i) m = m < v[i] ? m : v[i]; + return Simd(m); +} + +template +Simd hmax(Simd v) { + T m = v[0]; for (int i = 1; i < W; ++i) m = m > v[i] ? m : v[i]; + return Simd(m); +} + +template +bool hall(SimdMask mask) { + for (int i = 0; i < W; ++i) if (!mask[i]) return false; + return true; +} + +template +bool hany(SimdMask mask) { + for (int i = 0; i < W; ++i) if (mask[i]) return true; + return false; +} + +template +int hfirst(SimdMask mask) { + for (int i = 0; i < W; ++i) if (mask[i]) return i; + return -1; +} + +#endif // OPENVDB_USE_STD_SIMD + +// ============================================================================ +// Scalar overloads — always present. +// +// These make the Generic-T pattern complete: a kernel templated on T compiles +// identically for T=float (scalar/GPU) and T=Simd (CPU batch). +// The scalar overloads are the identity in every case. +// ============================================================================ + +template T where(bool m, T a, T b) { return m ? a : b; } +template T min(T a, T b) { return a < b ? a : b; } +template T max(T a, T b) { return a > b ? a : b; } +template T sqrt(T v) { return std::sqrt(v); } +template T abs(T v) { return std::abs(v); } +template T hmin(T v) { return v; } // identity: one lane +template T hmax(T v) { return v; } // identity: one lane +inline bool hall(bool b) { return b; } // identity: one lane +inline bool hany(bool b) { return b; } // identity: one lane +inline int hfirst(bool b) { return b ? 0 : -1; } + +// square — useful shorthand, defined after scalar overloads so T works for both +template T square(T v) { return v * v; } + +// ============================================================================ +// Scalar — extract the underlying element type from T. +// +// scalar_t = float (primary: T has no value_type) +// scalar_t> = float (specialization: T::value_type exists) +// +// Detection uses std::void_t so it generalises to any type with a value_type +// member, without an explicit Simd specialization. +// ============================================================================ + +template +struct Scalar { using type = T; }; + +template +struct Scalar> { + using type = typename T::value_type; +}; + +template +using scalar_t = typename Scalar::type; + +// ============================================================================ +// SimdTraits — compile-time lane count for a SIMD type. +// +// SimdTraits::size = 1 +// SimdTraits>::size = W +// ============================================================================ + +template +struct SimdTraits { static constexpr size_t size = 1; }; + +template +struct SimdTraits> { static constexpr size_t size = size_t(W); }; + +// ============================================================================ +// SimdNativeT — select the natural SIMD type for element type T. +// +// SimdNativeT::Type on AVX → Simd (256 / 64 bits) +// SimdNativeT::Type on AVX → Simd (256 / 32 bits) +// SimdNativeT::Type no ISA → Simd (scalar fallback) +// ============================================================================ + +template +struct SimdNativeT +{ + static constexpr size_t elemBits = sizeof(T) * CHAR_BIT; + static constexpr size_t width = (RegisterBits > 0) + ? (RegisterBits / elemBits) : 1; + using Type = Simd; +}; + +} // namespace simd +} // namespace OPENVDB_VERSION_NAME +} // namespace openvdb + +#endif // OPENVDB_SIMD_SIMD_HAS_BEEN_INCLUDED