diff --git a/meson.build b/meson.build
index d6c7a6082..8aa493d70 100644
--- a/meson.build
+++ b/meson.build
@@ -71,6 +71,7 @@ add_project_arguments(global_cpp_args, language: 'cpp')
 sdl2_dep = dependency('sdl2')
 thread_dep = dependency('threads')
 dl_dep = cc.find_library('dl', required: true)
+glm_dep = dependency('glm')
 
 stb = subproject('stb').get_variable('stb_inc')
 stb_dep = declare_dependency(include_directories: stb)
diff --git a/targets/app/linux/Stubs/DirectXMath/DirectXCollision.h b/targets/app/linux/Stubs/DirectXMath/DirectXCollision.h
deleted file mode 100644
index 6605197bd..000000000
--- a/targets/app/linux/Stubs/DirectXMath/DirectXCollision.h
+++ /dev/null
@@ -1,448 +0,0 @@
-//-------------------------------------------------------------------------------------
-// DirectXCollision.h -- C++ Collision Math library
-//
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT License.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#pragma once
-
-#include "DirectXMath.h"
-
-namespace DirectX {
-
-enum ContainmentType { DISJOINT = 0, INTERSECTS = 1, CONTAINS = 2 };
-
-enum PlaneIntersectionType { FRONT = 0, INTERSECTING = 1, BACK = 2 };
-
-struct BoundingBox;
-struct BoundingOrientedBox;
-struct BoundingFrustum;
-
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4324 4820)
-// C4324: alignment padding warnings
-// C4820: Off by default noise
-#endif
-
-//-------------------------------------------------------------------------------------
-// Bounding sphere
-//-------------------------------------------------------------------------------------
-struct BoundingSphere {
-    XMFLOAT3 Center;  // Center of the sphere.
-    float Radius;     // Radius of the sphere.
-
-    // Creators
-    BoundingSphere() noexcept : Center(0, 0, 0), Radius(1.f) {}
-
-    BoundingSphere(const BoundingSphere&) = default;
-    BoundingSphere& operator=(const BoundingSphere&) = default;
-
-    BoundingSphere(BoundingSphere&&) = default;
-    BoundingSphere& operator=(BoundingSphere&&) = default;
-
-    constexpr BoundingSphere(_In_ const XMFLOAT3& center,
-                             _In_ float radius) noexcept
-        : Center(center), Radius(radius) {}
-
-    // Methods
-    void XM_CALLCONV Transform(_Out_ BoundingSphere& Out,
-                               _In_ FXMMATRIX M) const noexcept;
-    void XM_CALLCONV Transform(_Out_ BoundingSphere& Out, _In_ float Scale,
-                               _In_ FXMVECTOR Rotation,
-                               _In_ FXMVECTOR Translation) const noexcept;
-    // Transform the sphere
-
-    ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept;
-    ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1,
-                                         _In_ FXMVECTOR V2) const noexcept;
-    ContainmentType Contains(_In_ const BoundingSphere& sh) const noexcept;
-    ContainmentType Contains(_In_ const BoundingBox& box) const noexcept;
-    ContainmentType Contains(
-        _In_ const BoundingOrientedBox& box) const noexcept;
-    ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept;
-
-    bool Intersects(_In_ const BoundingSphere& sh) const noexcept;
-    bool Intersects(_In_ const BoundingBox& box) const noexcept;
-    bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept;
-    bool Intersects(_In_ const BoundingFrustum& fr) const noexcept;
-
-    bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1,
-                                _In_ FXMVECTOR V2) const noexcept;
-    // Triangle-sphere test
-
-    PlaneIntersectionType XM_CALLCONV
-    Intersects(_In_ FXMVECTOR Plane) const noexcept;
-    // Plane-sphere test
-
-    bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction,
-                                _Out_ float& Dist) const noexcept;
-    // Ray-sphere test
-
-    ContainmentType XM_CALLCONV
-    ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1,
-                _In_ FXMVECTOR Plane2, _In_ GXMVECTOR Plane3,
-                _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept;
-    // Test sphere against six planes (see BoundingFrustum::GetPlanes)
-
-    // Static methods
-    static void CreateMerged(_Out_ BoundingSphere& Out,
-                             _In_ const BoundingSphere& S1,
-                             _In_ const BoundingSphere& S2) noexcept;
-
-    static void CreateFromBoundingBox(_Out_ BoundingSphere& Out,
-                                      _In_ const BoundingBox& box) noexcept;
-    static void CreateFromBoundingBox(
-        _Out_ BoundingSphere& Out,
-        _In_ const BoundingOrientedBox& box) noexcept;
-
-    static void CreateFromPoints(_Out_ BoundingSphere& Out, _In_ size_t Count,
-                                 _In_reads_bytes_(sizeof(XMFLOAT3) +
-                                                  Stride * (Count - 1))
-                                     const XMFLOAT3* pPoints,
-                                 _In_ size_t Stride) noexcept;
-
-    static void CreateFromFrustum(_Out_ BoundingSphere& Out,
-                                  _In_ const BoundingFrustum& fr) noexcept;
-};
-
-//-------------------------------------------------------------------------------------
-// Axis-aligned bounding box
-//-------------------------------------------------------------------------------------
-struct BoundingBox {
-    static constexpr size_t CORNER_COUNT = 8;
-
-    XMFLOAT3 Center;   // Center of the box.
-    XMFLOAT3 Extents;  // Distance from the center to each side.
-
-    // Creators
-    BoundingBox() noexcept : Center(0, 0, 0), Extents(1.f, 1.f, 1.f) {}
-
-    BoundingBox(const BoundingBox&) = default;
-    BoundingBox& operator=(const BoundingBox&) = default;
-
-    BoundingBox(BoundingBox&&) = default;
-    BoundingBox& operator=(BoundingBox&&) = default;
-
-    constexpr BoundingBox(_In_ const XMFLOAT3& center,
-                          _In_ const XMFLOAT3& extents) noexcept
-        : Center(center), Extents(extents) {}
-
-    // Methods
-    void XM_CALLCONV Transform(_Out_ BoundingBox& Out,
-                               _In_ FXMMATRIX M) const noexcept;
-    void XM_CALLCONV Transform(_Out_ BoundingBox& Out, _In_ float Scale,
-                               _In_ FXMVECTOR Rotation,
-                               _In_ FXMVECTOR Translation) const noexcept;
-
-    void GetCorners(_Out_writes_(8) XMFLOAT3* Corners) const noexcept;
-    // Gets the 8 corners of the box
-
-    ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept;
-    ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1,
-                                         _In_ FXMVECTOR V2) const noexcept;
-    ContainmentType Contains(_In_ const BoundingSphere& sh) const noexcept;
-    ContainmentType Contains(_In_ const BoundingBox& box) const noexcept;
-    ContainmentType Contains(
-        _In_ const BoundingOrientedBox& box) const noexcept;
-    ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept;
-
-    bool Intersects(_In_ const BoundingSphere& sh) const noexcept;
-    bool Intersects(_In_ const BoundingBox& box) const noexcept;
-    bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept;
-    bool Intersects(_In_ const BoundingFrustum& fr) const noexcept;
-
-    bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1,
-                                _In_ FXMVECTOR V2) const noexcept;
-    // Triangle-Box test
-
-    PlaneIntersectionType XM_CALLCONV
-    Intersects(_In_ FXMVECTOR Plane) const noexcept;
-    // Plane-box test
-
-    bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction,
-                                _Out_ float& Dist) const noexcept;
-    // Ray-Box test
-
-    ContainmentType XM_CALLCONV
-    ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1,
-                _In_ FXMVECTOR Plane2, _In_ GXMVECTOR Plane3,
-                _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept;
-    // Test box against six planes (see BoundingFrustum::GetPlanes)
-
-    // Static methods
-    static void CreateMerged(_Out_ BoundingBox& Out, _In_ const BoundingBox& b1,
-                             _In_ const BoundingBox& b2) noexcept;
-
-    static void CreateFromSphere(_Out_ BoundingBox& Out,
-                                 _In_ const BoundingSphere& sh) noexcept;
-
-    static void XM_CALLCONV CreateFromPoints(_Out_ BoundingBox& Out,
-                                             _In_ FXMVECTOR pt1,
-                                             _In_ FXMVECTOR pt2) noexcept;
-    static void CreateFromPoints(_Out_ BoundingBox& Out, _In_ size_t Count,
-                                 _In_reads_bytes_(sizeof(XMFLOAT3) +
-                                                  Stride * (Count - 1))
-                                     const XMFLOAT3* pPoints,
-                                 _In_ size_t Stride) noexcept;
-};
-
-//-------------------------------------------------------------------------------------
-// Oriented bounding box
-//-------------------------------------------------------------------------------------
-struct BoundingOrientedBox {
-    static constexpr size_t CORNER_COUNT = 8;
-
-    XMFLOAT3 Center;   // Center of the box.
-    XMFLOAT3 Extents;  // Distance from the center to each side.
-    XMFLOAT4
-    Orientation;  // Unit quaternion representing rotation (box -> world).
-
-    // Creators
-    BoundingOrientedBox() noexcept
-        : Center(0, 0, 0), Extents(1.f, 1.f, 1.f), Orientation(0, 0, 0, 1.f) {}
-
-    BoundingOrientedBox(const BoundingOrientedBox&) = default;
-    BoundingOrientedBox& operator=(const BoundingOrientedBox&) = default;
-
-    BoundingOrientedBox(BoundingOrientedBox&&) = default;
-    BoundingOrientedBox& operator=(BoundingOrientedBox&&) = default;
-
-    constexpr BoundingOrientedBox(_In_ const XMFLOAT3& center,
-                                  _In_ const XMFLOAT3& extents,
-                                  _In_ const XMFLOAT4& orientation) noexcept
-        : Center(center), Extents(extents), Orientation(orientation) {}
-
-    // Methods
-    void XM_CALLCONV Transform(_Out_ BoundingOrientedBox& Out,
-                               _In_ FXMMATRIX M) const noexcept;
-    void XM_CALLCONV Transform(_Out_ BoundingOrientedBox& Out, _In_ float Scale,
-                               _In_ FXMVECTOR Rotation,
-                               _In_ FXMVECTOR Translation) const noexcept;
-
-    void GetCorners(_Out_writes_(8) XMFLOAT3* Corners) const noexcept;
-    // Gets the 8 corners of the box
-
-    ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept;
-    ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1,
-                                         _In_ FXMVECTOR V2) const noexcept;
-    ContainmentType Contains(_In_ const BoundingSphere& sh) const noexcept;
-    ContainmentType Contains(_In_ const BoundingBox& box) const noexcept;
-    ContainmentType Contains(
-        _In_ const BoundingOrientedBox& box) const noexcept;
-    ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept;
-
-    bool Intersects(_In_ const BoundingSphere& sh) const noexcept;
-    bool Intersects(_In_ const BoundingBox& box) const noexcept;
-    bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept;
-    bool Intersects(_In_ const BoundingFrustum& fr) const noexcept;
-
-    bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1,
-                                _In_ FXMVECTOR V2) const noexcept;
-    // Triangle-OrientedBox test
-
-    PlaneIntersectionType XM_CALLCONV
-    Intersects(_In_ FXMVECTOR Plane) const noexcept;
-    // Plane-OrientedBox test
-
-    bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction,
-                                _Out_ float& Dist) const noexcept;
-    // Ray-OrientedBox test
-
-    ContainmentType XM_CALLCONV
-    ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1,
-                _In_ FXMVECTOR Plane2, _In_ GXMVECTOR Plane3,
-                _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept;
-    // Test OrientedBox against six planes (see BoundingFrustum::GetPlanes)
-
-    // Static methods
-    static void CreateFromBoundingBox(_Out_ BoundingOrientedBox& Out,
-                                      _In_ const BoundingBox& box) noexcept;
-
-    static void CreateFromPoints(_Out_ BoundingOrientedBox& Out,
-                                 _In_ size_t Count,
-                                 _In_reads_bytes_(sizeof(XMFLOAT3) +
-                                                  Stride * (Count - 1))
-                                     const XMFLOAT3* pPoints,
-                                 _In_ size_t Stride) noexcept;
-};
-
-//-------------------------------------------------------------------------------------
-// Bounding frustum
-//-------------------------------------------------------------------------------------
-struct BoundingFrustum {
-    static constexpr size_t CORNER_COUNT = 8;
-
-    XMFLOAT3 Origin;       // Origin of the frustum (and projection).
-    XMFLOAT4 Orientation;  // Quaternion representing rotation.
-
-    float RightSlope;   // Positive X (X/Z)
-    float LeftSlope;    // Negative X
-    float TopSlope;     // Positive Y (Y/Z)
-    float BottomSlope;  // Negative Y
-    float Near, Far;    // Z of the near plane and far plane.
-
-    // Creators
-    BoundingFrustum() noexcept
-        : Origin(0, 0, 0),
-          Orientation(0, 0, 0, 1.f),
-          RightSlope(1.f),
-          LeftSlope(-1.f),
-          TopSlope(1.f),
-          BottomSlope(-1.f),
-          Near(0),
-          Far(1.f) {}
-
-    BoundingFrustum(const BoundingFrustum&) = default;
-    BoundingFrustum& operator=(const BoundingFrustum&) = default;
-
-    BoundingFrustum(BoundingFrustum&&) = default;
-    BoundingFrustum& operator=(BoundingFrustum&&) = default;
-
-    constexpr BoundingFrustum(_In_ const XMFLOAT3& origin,
-                              _In_ const XMFLOAT4& orientation,
-                              _In_ float rightSlope, _In_ float leftSlope,
-                              _In_ float topSlope, _In_ float bottomSlope,
-                              _In_ float nearPlane,
-                              _In_ float farPlane) noexcept
-        : Origin(origin),
-          Orientation(orientation),
-          RightSlope(rightSlope),
-          LeftSlope(leftSlope),
-          TopSlope(topSlope),
-          BottomSlope(bottomSlope),
-          Near(nearPlane),
-          Far(farPlane) {}
-    BoundingFrustum(_In_ CXMMATRIX Projection, bool rhcoords = false) noexcept;
-
-    // Methods
-    void XM_CALLCONV Transform(_Out_ BoundingFrustum& Out,
-                               _In_ FXMMATRIX M) const noexcept;
-    void XM_CALLCONV Transform(_Out_ BoundingFrustum& Out, _In_ float Scale,
-                               _In_ FXMVECTOR Rotation,
-                               _In_ FXMVECTOR Translation) const noexcept;
-
-    void GetCorners(_Out_writes_(8) XMFLOAT3* Corners) const noexcept;
-    // Gets the 8 corners of the frustum
-
-    ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR Point) const noexcept;
-    ContainmentType XM_CALLCONV Contains(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1,
-                                         _In_ FXMVECTOR V2) const noexcept;
-    ContainmentType Contains(_In_ const BoundingSphere& sp) const noexcept;
-    ContainmentType Contains(_In_ const BoundingBox& box) const noexcept;
-    ContainmentType Contains(
-        _In_ const BoundingOrientedBox& box) const noexcept;
-    ContainmentType Contains(_In_ const BoundingFrustum& fr) const noexcept;
-    // Frustum-Frustum test
-
-    bool Intersects(_In_ const BoundingSphere& sh) const noexcept;
-    bool Intersects(_In_ const BoundingBox& box) const noexcept;
-    bool Intersects(_In_ const BoundingOrientedBox& box) const noexcept;
-    bool Intersects(_In_ const BoundingFrustum& fr) const noexcept;
-
-    bool XM_CALLCONV Intersects(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1,
-                                _In_ FXMVECTOR V2) const noexcept;
-    // Triangle-Frustum test
-
-    PlaneIntersectionType XM_CALLCONV
-    Intersects(_In_ FXMVECTOR Plane) const noexcept;
-    // Plane-Frustum test
-
-    bool XM_CALLCONV Intersects(_In_ FXMVECTOR rayOrigin,
-                                _In_ FXMVECTOR Direction,
-                                _Out_ float& Dist) const noexcept;
-    // Ray-Frustum test
-
-    ContainmentType XM_CALLCONV
-    ContainedBy(_In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1,
-                _In_ FXMVECTOR Plane2, _In_ GXMVECTOR Plane3,
-                _In_ HXMVECTOR Plane4, _In_ HXMVECTOR Plane5) const noexcept;
-    // Test frustum against six planes (see BoundingFrustum::GetPlanes)
-
-    void GetPlanes(_Out_opt_ XMVECTOR* NearPlane, _Out_opt_ XMVECTOR* FarPlane,
-                   _Out_opt_ XMVECTOR* RightPlane,
-                   _Out_opt_ XMVECTOR* LeftPlane, _Out_opt_ XMVECTOR* TopPlane,
-                   _Out_opt_ XMVECTOR* BottomPlane) const noexcept;
-    // Create 6 Planes representation of Frustum
-
-    // Static methods
-    static void XM_CALLCONV CreateFromMatrix(_Out_ BoundingFrustum& Out,
-                                             _In_ FXMMATRIX Projection,
-                                             bool rhcoords = false) noexcept;
-};
-
-//-----------------------------------------------------------------------------
-// Triangle intersection testing routines.
-//-----------------------------------------------------------------------------
-namespace TriangleTests {
-bool XM_CALLCONV Intersects(_In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction,
-                            _In_ FXMVECTOR V0, _In_ GXMVECTOR V1,
-                            _In_ HXMVECTOR V2, _Out_ float& Dist) noexcept;
-// Ray-Triangle
-
-bool XM_CALLCONV Intersects(_In_ FXMVECTOR A0, _In_ FXMVECTOR A1,
-                            _In_ FXMVECTOR A2, _In_ GXMVECTOR B0,
-                            _In_ HXMVECTOR B1, _In_ HXMVECTOR B2) noexcept;
-// Triangle-Triangle
-
-PlaneIntersectionType XM_CALLCONV Intersects(_In_ FXMVECTOR V0,
-                                             _In_ FXMVECTOR V1,
-                                             _In_ FXMVECTOR V2,
-                                             _In_ GXMVECTOR Plane) noexcept;
-// Plane-Triangle
-
-ContainmentType XM_CALLCONV
-ContainedBy(_In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2,
-            _In_ GXMVECTOR Plane0, _In_ HXMVECTOR Plane1, _In_ HXMVECTOR Plane2,
-            _In_ CXMVECTOR Plane3, _In_ CXMVECTOR Plane4,
-            _In_ CXMVECTOR Plane5) noexcept;
-// Test a triangle against six planes at once (see BoundingFrustum::GetPlanes)
-}  // namespace TriangleTests
-
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-/****************************************************************************
- *
- * Implementation
- *
- ****************************************************************************/
-
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4068 4365 4616 6001)
-// C4068/4616: ignore unknown pragmas
-// C4365: Off by default noise
-// C6001: False positives
-#endif
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
-#pragma prefast(disable : 26495, "Union initialization confuses /analyze")
-#endif
-
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wfloat-equal"
-#pragma clang diagnostic ignored "-Wunknown-warning-option"
-#pragma clang diagnostic ignored "-Wunsafe-buffer-usage"
-#endif
-
-#include "DirectXCollision.inl"
-
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-}  // namespace DirectX
diff --git a/targets/app/linux/Stubs/DirectXMath/DirectXCollision.inl b/targets/app/linux/Stubs/DirectXMath/DirectXCollision.inl
deleted file mode 100644
index f2db42359..000000000
--- a/targets/app/linux/Stubs/DirectXMath/DirectXCollision.inl
+++ /dev/null
@@ -1,4921 +0,0 @@
-//-------------------------------------------------------------------------------------
-// DirectXCollision.inl -- C++ Collision Math library
-//
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT License.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#pragma once
-
-XMGLOBALCONST XMVECTORF32 g_BoxOffset[8] = {
-    {{{-1.0f, -1.0f, 1.0f, 0.0f}}},  {{{1.0f, -1.0f, 1.0f, 0.0f}}},
-    {{{1.0f, 1.0f, 1.0f, 0.0f}}},    {{{-1.0f, 1.0f, 1.0f, 0.0f}}},
-    {{{-1.0f, -1.0f, -1.0f, 0.0f}}}, {{{1.0f, -1.0f, -1.0f, 0.0f}}},
-    {{{1.0f, 1.0f, -1.0f, 0.0f}}},   {{{-1.0f, 1.0f, -1.0f, 0.0f}}},
-};
-
-XMGLOBALCONST XMVECTORF32 g_RayEpsilon = {{{1e-20f, 1e-20f, 1e-20f, 1e-20f}}};
-XMGLOBALCONST XMVECTORF32 g_RayNegEpsilon = {
-    {{-1e-20f, -1e-20f, -1e-20f, -1e-20f}}};
-XMGLOBALCONST XMVECTORF32 g_FltMin = {
-    {{-FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX}}};
-XMGLOBALCONST XMVECTORF32 g_FltMax = {{{FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX}}};
-
-namespace MathInternal {
-
-//-----------------------------------------------------------------------------
-// Return true if any of the elements of a 3 vector are equal to 0xffffffff.
-// Slightly more efficient than using XMVector3EqualInt.
-//-----------------------------------------------------------------------------
-inline bool XMVector3AnyTrue(_In_ FXMVECTOR V) noexcept {
-    // Duplicate the fourth element from the first element.
-    XMVECTOR C =
-        XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>(
-            V);
-
-    return XMComparisonAnyTrue(XMVector4EqualIntR(C, XMVectorTrueInt()));
-}
-
-//-----------------------------------------------------------------------------
-// Return true if all of the elements of a 3 vector are equal to 0xffffffff.
-// Slightly more efficient than using XMVector3EqualInt.
-//-----------------------------------------------------------------------------
-inline bool XMVector3AllTrue(_In_ FXMVECTOR V) noexcept {
-    // Duplicate the fourth element from the first element.
-    XMVECTOR C =
-        XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>(
-            V);
-
-    return XMComparisonAllTrue(XMVector4EqualIntR(C, XMVectorTrueInt()));
-}
-
-#if defined(_PREFAST_) || !defined(NDEBUG)
-
-XMGLOBALCONST XMVECTORF32 g_UnitVectorEpsilon = {
-    {{1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f}}};
-XMGLOBALCONST XMVECTORF32 g_UnitQuaternionEpsilon = {
-    {{1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f}}};
-XMGLOBALCONST XMVECTORF32 g_UnitPlaneEpsilon = {
-    {{1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f}}};
-
-//-----------------------------------------------------------------------------
-// Return true if the vector is a unit vector (length == 1).
-//-----------------------------------------------------------------------------
-inline bool XMVector3IsUnit(_In_ FXMVECTOR V) noexcept {
-    XMVECTOR Difference =
-        XMVectorSubtract(XMVector3Length(V), XMVectorSplatOne());
-    return XMVector4Less(XMVectorAbs(Difference), g_UnitVectorEpsilon);
-}
-
-//-----------------------------------------------------------------------------
-// Return true if the quaterion is a unit quaternion.
-//-----------------------------------------------------------------------------
-inline bool XMQuaternionIsUnit(_In_ FXMVECTOR Q) noexcept {
-    XMVECTOR Difference =
-        XMVectorSubtract(XMVector4Length(Q), XMVectorSplatOne());
-    return XMVector4Less(XMVectorAbs(Difference), g_UnitQuaternionEpsilon);
-}
-
-//-----------------------------------------------------------------------------
-// Return true if the plane is a unit plane.
-//-----------------------------------------------------------------------------
-inline bool XMPlaneIsUnit(_In_ FXMVECTOR Plane) noexcept {
-    XMVECTOR Difference =
-        XMVectorSubtract(XMVector3Length(Plane), XMVectorSplatOne());
-    return XMVector4Less(XMVectorAbs(Difference), g_UnitPlaneEpsilon);
-}
-
-#endif  // _PREFAST_ || !NDEBUG
-
-//-----------------------------------------------------------------------------
-inline XMVECTOR XMPlaneTransform(_In_ FXMVECTOR Plane, _In_ FXMVECTOR Rotation,
-                                 _In_ FXMVECTOR Translation) noexcept {
-    XMVECTOR vNormal = XMVector3Rotate(Plane, Rotation);
-    XMVECTOR vD = XMVectorSubtract(XMVectorSplatW(Plane),
-                                   XMVector3Dot(vNormal, Translation));
-
-    return XMVectorInsert<0, 0, 0, 0, 1>(vNormal, vD);
-}
-
-//-----------------------------------------------------------------------------
-// Return the point on the line segement (S1, S2) nearest the point P.
-//-----------------------------------------------------------------------------
-inline XMVECTOR PointOnLineSegmentNearestPoint(_In_ FXMVECTOR S1,
-                                               _In_ FXMVECTOR S2,
-                                               _In_ FXMVECTOR P) noexcept {
-    XMVECTOR Dir = XMVectorSubtract(S2, S1);
-    XMVECTOR Projection =
-        XMVectorSubtract(XMVector3Dot(P, Dir), XMVector3Dot(S1, Dir));
-    XMVECTOR LengthSq = XMVector3Dot(Dir, Dir);
-
-    XMVECTOR t = XMVectorMultiply(Projection, XMVectorReciprocal(LengthSq));
-    XMVECTOR Point = XMVectorMultiplyAdd(t, Dir, S1);
-
-    // t < 0
-    XMVECTOR SelectS1 = XMVectorLess(Projection, XMVectorZero());
-    Point = XMVectorSelect(Point, S1, SelectS1);
-
-    // t > 1
-    XMVECTOR SelectS2 = XMVectorGreater(Projection, LengthSq);
-    Point = XMVectorSelect(Point, S2, SelectS2);
-
-    return Point;
-}
-
-//-----------------------------------------------------------------------------
-// Test if the point (P) on the plane of the triangle is inside the triangle
-// (V0, V1, V2).
-//-----------------------------------------------------------------------------
-inline XMVECTOR XM_CALLCONV
-PointOnPlaneInsideTriangle(_In_ FXMVECTOR P, _In_ FXMVECTOR V0,
-                           _In_ FXMVECTOR V1, _In_ GXMVECTOR V2) noexcept {
-    // Compute the triangle normal.
-    XMVECTOR N =
-        XMVector3Cross(XMVectorSubtract(V2, V0), XMVectorSubtract(V1, V0));
-
-    // Compute the cross products of the vector from the base of each edge to
-    // the point with each edge vector.
-    XMVECTOR C0 =
-        XMVector3Cross(XMVectorSubtract(P, V0), XMVectorSubtract(V1, V0));
-    XMVECTOR C1 =
-        XMVector3Cross(XMVectorSubtract(P, V1), XMVectorSubtract(V2, V1));
-    XMVECTOR C2 =
-        XMVector3Cross(XMVectorSubtract(P, V2), XMVectorSubtract(V0, V2));
-
-    // If the cross product points in the same direction as the normal the the
-    // point is inside the edge (it is zero if is on the edge).
-    XMVECTOR Zero = XMVectorZero();
-    XMVECTOR Inside0 = XMVectorGreaterOrEqual(XMVector3Dot(C0, N), Zero);
-    XMVECTOR Inside1 = XMVectorGreaterOrEqual(XMVector3Dot(C1, N), Zero);
-    XMVECTOR Inside2 = XMVectorGreaterOrEqual(XMVector3Dot(C2, N), Zero);
-
-    // If the point inside all of the edges it is inside.
-    return XMVectorAndInt(XMVectorAndInt(Inside0, Inside1), Inside2);
-}
-
-//-----------------------------------------------------------------------------
-inline bool SolveCubic(_In_ float e, _In_ float f, _In_ float g, _Out_ float* t,
-                       _Out_ float* u, _Out_ float* v) noexcept {
-    float p, q, h, rc, d, theta, costh3, sinth3;
-
-    p = f - e * e / 3.0f;
-    q = g - e * f / 3.0f + e * e * e * 2.0f / 27.0f;
-    h = q * q / 4.0f + p * p * p / 27.0f;
-
-    if (h > 0) {
-        *t = *u = *v = 0.f;
-        return false;  // only one real root
-    }
-
-    if ((h == 0) && (q == 0))  // all the same root
-    {
-        *t = -e / 3;
-        *u = -e / 3;
-        *v = -e / 3;
-
-        return true;
-    }
-
-    d = sqrtf(q * q / 4.0f - h);
-    if (d < 0)
-        rc = -powf(-d, 1.0f / 3.0f);
-    else
-        rc = powf(d, 1.0f / 3.0f);
-
-    theta = XMScalarACos(-q / (2.0f * d));
-    costh3 = XMScalarCos(theta / 3.0f);
-    sinth3 = sqrtf(3.0f) * XMScalarSin(theta / 3.0f);
-    *t = 2.0f * rc * costh3 - e / 3.0f;
-    *u = -rc * (costh3 + sinth3) - e / 3.0f;
-    *v = -rc * (costh3 - sinth3) - e / 3.0f;
-
-    return true;
-}
-
-//-----------------------------------------------------------------------------
-inline XMVECTOR CalculateEigenVector(_In_ float m11, _In_ float m12,
-                                     _In_ float m13, _In_ float m22,
-                                     _In_ float m23, _In_ float m33,
-                                     _In_ float e) noexcept {
-    float fTmp[3];
-    fTmp[0] = m12 * m23 - m13 * (m22 - e);
-    fTmp[1] = m13 * m12 - m23 * (m11 - e);
-    fTmp[2] = (m11 - e) * (m22 - e) - m12 * m12;
-
-    XMVECTOR vTmp = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(fTmp));
-
-    if (XMVector3Equal(vTmp, XMVectorZero()))  // planar or linear
-    {
-        float f1, f2, f3;
-
-        // we only have one equation - find a valid one
-        if ((m11 - e != 0) || (m12 != 0) || (m13 != 0)) {
-            f1 = m11 - e;
-            f2 = m12;
-            f3 = m13;
-        } else if ((m12 != 0) || (m22 - e != 0) || (m23 != 0)) {
-            f1 = m12;
-            f2 = m22 - e;
-            f3 = m23;
-        } else if ((m13 != 0) || (m23 != 0) || (m33 - e != 0)) {
-            f1 = m13;
-            f2 = m23;
-            f3 = m33 - e;
-        } else {
-            // error, we'll just make something up - we have NO context
-            f1 = 1.0f;
-            f2 = 0.0f;
-            f3 = 0.0f;
-        }
-
-        if (f1 == 0)
-            vTmp = XMVectorSetX(vTmp, 0.0f);
-        else
-            vTmp = XMVectorSetX(vTmp, 1.0f);
-
-        if (f2 == 0)
-            vTmp = XMVectorSetY(vTmp, 0.0f);
-        else
-            vTmp = XMVectorSetY(vTmp, 1.0f);
-
-        if (f3 == 0) {
-            vTmp = XMVectorSetZ(vTmp, 0.0f);
-            // recalculate y to make equation work
-            if (m12 != 0) vTmp = XMVectorSetY(vTmp, -f1 / f2);
-        } else {
-            vTmp = XMVectorSetZ(vTmp, (f2 - f1) / f3);
-        }
-    }
-
-    if (XMVectorGetX(XMVector3LengthSq(vTmp)) > 1e-5f) {
-        return XMVector3Normalize(vTmp);
-    } else {
-        // Multiply by a value large enough to make the vector non-zero.
-        vTmp = XMVectorScale(vTmp, 1e5f);
-        return XMVector3Normalize(vTmp);
-    }
-}
-
-//-----------------------------------------------------------------------------
-inline bool CalculateEigenVectors(_In_ float m11, _In_ float m12,
-                                  _In_ float m13, _In_ float m22,
-                                  _In_ float m23, _In_ float m33, _In_ float e1,
-                                  _In_ float e2, _In_ float e3,
-                                  _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2,
-                                  _Out_ XMVECTOR* pV3) noexcept {
-    *pV1 = DirectX::MathInternal::CalculateEigenVector(m11, m12, m13, m22, m23,
-                                                       m33, e1);
-    *pV2 = DirectX::MathInternal::CalculateEigenVector(m11, m12, m13, m22, m23,
-                                                       m33, e2);
-    *pV3 = DirectX::MathInternal::CalculateEigenVector(m11, m12, m13, m22, m23,
-                                                       m33, e3);
-
-    bool v1z = false;
-    bool v2z = false;
-    bool v3z = false;
-
-    XMVECTOR Zero = XMVectorZero();
-
-    if (XMVector3Equal(*pV1, Zero)) v1z = true;
-
-    if (XMVector3Equal(*pV2, Zero)) v2z = true;
-
-    if (XMVector3Equal(*pV3, Zero)) v3z = true;
-
-    bool e12 = (fabsf(XMVectorGetX(XMVector3Dot(*pV1, *pV2))) >
-                0.1f);  // check for non-orthogonal vectors
-    bool e13 = (fabsf(XMVectorGetX(XMVector3Dot(*pV1, *pV3))) > 0.1f);
-    bool e23 = (fabsf(XMVectorGetX(XMVector3Dot(*pV2, *pV3))) > 0.1f);
-
-    if ((v1z && v2z && v3z) || (e12 && e13 && e23) || (e12 && v3z) ||
-        (e13 && v2z) || (e23 && v1z))  // all eigenvectors are 0- any basis set
-    {
-        *pV1 = g_XMIdentityR0.v;
-        *pV2 = g_XMIdentityR1.v;
-        *pV3 = g_XMIdentityR2.v;
-        return true;
-    }
-
-    if (v1z && v2z) {
-        XMVECTOR vTmp = XMVector3Cross(g_XMIdentityR1, *pV3);
-        if (XMVectorGetX(XMVector3LengthSq(vTmp)) < 1e-5f) {
-            vTmp = XMVector3Cross(g_XMIdentityR0, *pV3);
-        }
-        *pV1 = XMVector3Normalize(vTmp);
-        *pV2 = XMVector3Cross(*pV3, *pV1);
-        return true;
-    }
-
-    if (v3z && v1z) {
-        XMVECTOR vTmp = XMVector3Cross(g_XMIdentityR1, *pV2);
-        if (XMVectorGetX(XMVector3LengthSq(vTmp)) < 1e-5f) {
-            vTmp = XMVector3Cross(g_XMIdentityR0, *pV2);
-        }
-        *pV3 = XMVector3Normalize(vTmp);
-        *pV1 = XMVector3Cross(*pV2, *pV3);
-        return true;
-    }
-
-    if (v2z && v3z) {
-        XMVECTOR vTmp = XMVector3Cross(g_XMIdentityR1, *pV1);
-        if (XMVectorGetX(XMVector3LengthSq(vTmp)) < 1e-5f) {
-            vTmp = XMVector3Cross(g_XMIdentityR0, *pV1);
-        }
-        *pV2 = XMVector3Normalize(vTmp);
-        *pV3 = XMVector3Cross(*pV1, *pV2);
-        return true;
-    }
-
-    if ((v1z) || e12) {
-        *pV1 = XMVector3Cross(*pV2, *pV3);
-        return true;
-    }
-
-    if ((v2z) || e23) {
-        *pV2 = XMVector3Cross(*pV3, *pV1);
-        return true;
-    }
-
-    if ((v3z) || e13) {
-        *pV3 = XMVector3Cross(*pV1, *pV2);
-        return true;
-    }
-
-    return true;
-}
-
-//-----------------------------------------------------------------------------
-inline bool CalculateEigenVectorsFromCovarianceMatrix(
-    _In_ float Cxx, _In_ float Cyy, _In_ float Czz, _In_ float Cxy,
-    _In_ float Cxz, _In_ float Cyz, _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2,
-    _Out_ XMVECTOR* pV3) noexcept {
-    // Calculate the eigenvalues by solving a cubic equation.
-    float e = -(Cxx + Cyy + Czz);
-    float f =
-        Cxx * Cyy + Cyy * Czz + Czz * Cxx - Cxy * Cxy - Cxz * Cxz - Cyz * Cyz;
-    float g = Cxy * Cxy * Czz + Cxz * Cxz * Cyy + Cyz * Cyz * Cxx -
-              Cxy * Cyz * Cxz * 2.0f - Cxx * Cyy * Czz;
-
-    float ev1, ev2, ev3;
-    if (!DirectX::MathInternal::SolveCubic(e, f, g, &ev1, &ev2, &ev3)) {
-        // set them to arbitrary orthonormal basis set
-        *pV1 = g_XMIdentityR0.v;
-        *pV2 = g_XMIdentityR1.v;
-        *pV3 = g_XMIdentityR2.v;
-        return false;
-    }
-
-    return DirectX::MathInternal::CalculateEigenVectors(
-        Cxx, Cxy, Cxz, Cyy, Cyz, Czz, ev1, ev2, ev3, pV1, pV2, pV3);
-}
-
-//-----------------------------------------------------------------------------
-inline void XM_CALLCONV FastIntersectTrianglePlane(FXMVECTOR V0, FXMVECTOR V1,
-                                                   FXMVECTOR V2,
-                                                   GXMVECTOR Plane,
-                                                   XMVECTOR& Outside,
-                                                   XMVECTOR& Inside) noexcept {
-    // Plane0
-    XMVECTOR Dist0 = XMVector4Dot(V0, Plane);
-    XMVECTOR Dist1 = XMVector4Dot(V1, Plane);
-    XMVECTOR Dist2 = XMVector4Dot(V2, Plane);
-
-    XMVECTOR MinDist = XMVectorMin(Dist0, Dist1);
-    MinDist = XMVectorMin(MinDist, Dist2);
-
-    XMVECTOR MaxDist = XMVectorMax(Dist0, Dist1);
-    MaxDist = XMVectorMax(MaxDist, Dist2);
-
-    XMVECTOR Zero = XMVectorZero();
-
-    // Outside the plane?
-    Outside = XMVectorGreater(MinDist, Zero);
-
-    // Fully inside the plane?
-    Inside = XMVectorLess(MaxDist, Zero);
-}
-
-//-----------------------------------------------------------------------------
-inline void FastIntersectSpherePlane(_In_ FXMVECTOR Center,
-                                     _In_ FXMVECTOR Radius,
-                                     _In_ FXMVECTOR Plane,
-                                     _Out_ XMVECTOR& Outside,
-                                     _Out_ XMVECTOR& Inside) noexcept {
-    XMVECTOR Dist = XMVector4Dot(Center, Plane);
-
-    // Outside the plane?
-    Outside = XMVectorGreater(Dist, Radius);
-
-    // Fully inside the plane?
-    Inside = XMVectorLess(Dist, XMVectorNegate(Radius));
-}
-
-//-----------------------------------------------------------------------------
-inline void FastIntersectAxisAlignedBoxPlane(_In_ FXMVECTOR Center,
-                                             _In_ FXMVECTOR Extents,
-                                             _In_ FXMVECTOR Plane,
-                                             _Out_ XMVECTOR& Outside,
-                                             _Out_ XMVECTOR& Inside) noexcept {
-    // Compute the distance to the center of the box.
-    XMVECTOR Dist = XMVector4Dot(Center, Plane);
-
-    // Project the axes of the box onto the normal of the plane.  Half the
-    // length of the projection (sometime called the "radius") is equal to
-    // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
-    // where h(i) are extents of the box, n is the plane normal, and b(i) are
-    // the axes of the box. In this case b(i) = [(1,0,0), (0,1,0), (0,0,1)].
-    XMVECTOR Radius = XMVector3Dot(Extents, XMVectorAbs(Plane));
-
-    // Outside the plane?
-    Outside = XMVectorGreater(Dist, Radius);
-
-    // Fully inside the plane?
-    Inside = XMVectorLess(Dist, XMVectorNegate(Radius));
-}
-
-//-----------------------------------------------------------------------------
-inline void XM_CALLCONV FastIntersectOrientedBoxPlane(
-    _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Axis0,
-    _In_ GXMVECTOR Axis1, _In_ HXMVECTOR Axis2, _In_ HXMVECTOR Plane,
-    _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside) noexcept {
-    // Compute the distance to the center of the box.
-    XMVECTOR Dist = XMVector4Dot(Center, Plane);
-
-    // Project the axes of the box onto the normal of the plane.  Half the
-    // length of the projection (sometime called the "radius") is equal to
-    // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w))
-    // where h(i) are extents of the box, n is the plane normal, and b(i) are
-    // the axes of the box.
-    XMVECTOR Radius = XMVector3Dot(Plane, Axis0);
-    Radius = XMVectorInsert<0, 0, 1, 0, 0>(Radius, XMVector3Dot(Plane, Axis1));
-    Radius = XMVectorInsert<0, 0, 0, 1, 0>(Radius, XMVector3Dot(Plane, Axis2));
-    Radius = XMVector3Dot(Extents, XMVectorAbs(Radius));
-
-    // Outside the plane?
-    Outside = XMVectorGreater(Dist, Radius);
-
-    // Fully inside the plane?
-    Inside = XMVectorLess(Dist, XMVectorNegate(Radius));
-}
-
-//-----------------------------------------------------------------------------
-inline void XM_CALLCONV FastIntersectFrustumPlane(
-    _In_ FXMVECTOR Point0, _In_ FXMVECTOR Point1, _In_ FXMVECTOR Point2,
-    _In_ GXMVECTOR Point3, _In_ HXMVECTOR Point4, _In_ HXMVECTOR Point5,
-    _In_ CXMVECTOR Point6, _In_ CXMVECTOR Point7, _In_ CXMVECTOR Plane,
-    _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside) noexcept {
-    // Find the min/max projection of the frustum onto the plane normal.
-    XMVECTOR Min, Max, Dist;
-
-    Min = Max = XMVector3Dot(Plane, Point0);
-
-    Dist = XMVector3Dot(Plane, Point1);
-    Min = XMVectorMin(Min, Dist);
-    Max = XMVectorMax(Max, Dist);
-
-    Dist = XMVector3Dot(Plane, Point2);
-    Min = XMVectorMin(Min, Dist);
-    Max = XMVectorMax(Max, Dist);
-
-    Dist = XMVector3Dot(Plane, Point3);
-    Min = XMVectorMin(Min, Dist);
-    Max = XMVectorMax(Max, Dist);
-
-    Dist = XMVector3Dot(Plane, Point4);
-    Min = XMVectorMin(Min, Dist);
-    Max = XMVectorMax(Max, Dist);
-
-    Dist = XMVector3Dot(Plane, Point5);
-    Min = XMVectorMin(Min, Dist);
-    Max = XMVectorMax(Max, Dist);
-
-    Dist = XMVector3Dot(Plane, Point6);
-    Min = XMVectorMin(Min, Dist);
-    Max = XMVectorMax(Max, Dist);
-
-    Dist = XMVector3Dot(Plane, Point7);
-    Min = XMVectorMin(Min, Dist);
-    Max = XMVectorMax(Max, Dist);
-
-    XMVECTOR PlaneDist = XMVectorNegate(XMVectorSplatW(Plane));
-
-    // Outside the plane?
-    Outside = XMVectorGreater(Min, PlaneDist);
-
-    // Fully inside the plane?
-    Inside = XMVectorLess(Max, PlaneDist);
-}
-
-}  // namespace MathInternal
-
-/****************************************************************************
- *
- * BoundingSphere
- *
- ****************************************************************************/
-
-//-----------------------------------------------------------------------------
-// Transform a sphere by an angle preserving transform.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-BoundingSphere::Transform(BoundingSphere& Out, FXMMATRIX M) const noexcept {
-    // Load the center of the sphere.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-
-    // Transform the center of the sphere.
-    XMVECTOR C = XMVector3Transform(vCenter, M);
-
-    XMVECTOR dX = XMVector3Dot(M.r[0], M.r[0]);
-    XMVECTOR dY = XMVector3Dot(M.r[1], M.r[1]);
-    XMVECTOR dZ = XMVector3Dot(M.r[2], M.r[2]);
-
-    XMVECTOR d = XMVectorMax(dX, XMVectorMax(dY, dZ));
-
-    // Store the center sphere.
-    XMStoreFloat3(&Out.Center, C);
-
-    // Scale the radius of the pshere.
-    float Scale = sqrtf(XMVectorGetX(d));
-    Out.Radius = Radius * Scale;
-}
-
-_Use_decl_annotations_ inline void XM_CALLCONV
-BoundingSphere::Transform(BoundingSphere& Out, float Scale, FXMVECTOR Rotation,
-                          FXMVECTOR Translation) const noexcept {
-    // Load the center of the sphere.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-
-    // Transform the center of the sphere.
-    vCenter = XMVectorAdd(
-        XMVector3Rotate(XMVectorScale(vCenter, Scale), Rotation), Translation);
-
-    // Store the center sphere.
-    XMStoreFloat3(&Out.Center, vCenter);
-
-    // Scale the radius of the pshere.
-    Out.Radius = Radius * Scale;
-}
-
-//-----------------------------------------------------------------------------
-// Point in sphere test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType XM_CALLCONV
-BoundingSphere::Contains(FXMVECTOR Point) const noexcept {
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vRadius = XMVectorReplicatePtr(&Radius);
-
-    XMVECTOR DistanceSquared =
-        XMVector3LengthSq(XMVectorSubtract(Point, vCenter));
-    XMVECTOR RadiusSquared = XMVectorMultiply(vRadius, vRadius);
-
-    return XMVector3LessOrEqual(DistanceSquared, RadiusSquared) ? CONTAINS
-                                                                : DISJOINT;
-}
-
-//-----------------------------------------------------------------------------
-// Triangle in sphere test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType XM_CALLCONV
-BoundingSphere::Contains(FXMVECTOR V0, FXMVECTOR V1,
-                         FXMVECTOR V2) const noexcept {
-    if (!Intersects(V0, V1, V2)) return DISJOINT;
-
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vRadius = XMVectorReplicatePtr(&Radius);
-    XMVECTOR RadiusSquared = XMVectorMultiply(vRadius, vRadius);
-
-    XMVECTOR DistanceSquared = XMVector3LengthSq(XMVectorSubtract(V0, vCenter));
-    XMVECTOR Inside = XMVectorLessOrEqual(DistanceSquared, RadiusSquared);
-
-    DistanceSquared = XMVector3LengthSq(XMVectorSubtract(V1, vCenter));
-    Inside = XMVectorAndInt(
-        Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared));
-
-    DistanceSquared = XMVector3LengthSq(XMVectorSubtract(V2, vCenter));
-    Inside = XMVectorAndInt(
-        Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared));
-
-    return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? CONTAINS
-                                                          : INTERSECTS;
-}
-
-//-----------------------------------------------------------------------------
-// Sphere in sphere test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingSphere::Contains(
-    const BoundingSphere& sh) const noexcept {
-    XMVECTOR Center1 = XMLoadFloat3(&Center);
-    float r1 = Radius;
-
-    XMVECTOR Center2 = XMLoadFloat3(&sh.Center);
-    float r2 = sh.Radius;
-
-    XMVECTOR V = XMVectorSubtract(Center2, Center1);
-
-    XMVECTOR Dist = XMVector3Length(V);
-
-    float d = XMVectorGetX(Dist);
-
-    return (r1 + r2 >= d) ? ((r1 - r2 >= d) ? CONTAINS : INTERSECTS) : DISJOINT;
-}
-
-//-----------------------------------------------------------------------------
-// Axis-aligned box in sphere test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingSphere::Contains(
-    const BoundingBox& box) const noexcept {
-    if (!box.Intersects(*this)) return DISJOINT;
-
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vRadius = XMVectorReplicatePtr(&Radius);
-    XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius);
-
-    XMVECTOR boxCenter = XMLoadFloat3(&box.Center);
-    XMVECTOR boxExtents = XMLoadFloat3(&box.Extents);
-
-    XMVECTOR InsideAll = XMVectorTrueInt();
-
-    XMVECTOR offset = XMVectorSubtract(boxCenter, vCenter);
-
-    for (size_t i = 0; i < BoundingBox::CORNER_COUNT; ++i) {
-        XMVECTOR C = XMVectorMultiplyAdd(boxExtents, g_BoxOffset[i], offset);
-        XMVECTOR d = XMVector3LengthSq(C);
-        InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(d, RadiusSq));
-    }
-
-    return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? CONTAINS
-                                                             : INTERSECTS;
-}
-
-//-----------------------------------------------------------------------------
-// Oriented box in sphere test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingSphere::Contains(
-    const BoundingOrientedBox& box) const noexcept {
-    if (!box.Intersects(*this)) return DISJOINT;
-
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vRadius = XMVectorReplicatePtr(&Radius);
-    XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius);
-
-    XMVECTOR boxCenter = XMLoadFloat3(&box.Center);
-    XMVECTOR boxExtents = XMLoadFloat3(&box.Extents);
-    XMVECTOR boxOrientation = XMLoadFloat4(&box.Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(boxOrientation));
-
-    XMVECTOR InsideAll = XMVectorTrueInt();
-
-    for (size_t i = 0; i < BoundingOrientedBox::CORNER_COUNT; ++i) {
-        XMVECTOR C = XMVectorAdd(
-            XMVector3Rotate(XMVectorMultiply(boxExtents, g_BoxOffset[i]),
-                            boxOrientation),
-            boxCenter);
-        XMVECTOR d = XMVector3LengthSq(XMVectorSubtract(vCenter, C));
-        InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(d, RadiusSq));
-    }
-
-    return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? CONTAINS
-                                                             : INTERSECTS;
-}
-
-//-----------------------------------------------------------------------------
-// Frustum in sphere test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingSphere::Contains(
-    const BoundingFrustum& fr) const noexcept {
-    if (!fr.Intersects(*this)) return DISJOINT;
-
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vRadius = XMVectorReplicatePtr(&Radius);
-    XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius);
-
-    XMVECTOR vOrigin = XMLoadFloat3(&fr.Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&fr.Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(vOrientation));
-
-    // Build the corners of the frustum.
-    XMVECTOR vRightTop = XMVectorSet(fr.RightSlope, fr.TopSlope, 1.0f, 0.0f);
-    XMVECTOR vRightBottom =
-        XMVectorSet(fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f);
-    XMVECTOR vLeftTop = XMVectorSet(fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f);
-    XMVECTOR vLeftBottom =
-        XMVectorSet(fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f);
-    XMVECTOR vNear = XMVectorReplicatePtr(&fr.Near);
-    XMVECTOR vFar = XMVectorReplicatePtr(&fr.Far);
-
-    XMVECTOR Corners[BoundingFrustum::CORNER_COUNT];
-    Corners[0] = XMVectorMultiply(vRightTop, vNear);
-    Corners[1] = XMVectorMultiply(vRightBottom, vNear);
-    Corners[2] = XMVectorMultiply(vLeftTop, vNear);
-    Corners[3] = XMVectorMultiply(vLeftBottom, vNear);
-    Corners[4] = XMVectorMultiply(vRightTop, vFar);
-    Corners[5] = XMVectorMultiply(vRightBottom, vFar);
-    Corners[6] = XMVectorMultiply(vLeftTop, vFar);
-    Corners[7] = XMVectorMultiply(vLeftBottom, vFar);
-
-    XMVECTOR InsideAll = XMVectorTrueInt();
-    for (size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i) {
-        XMVECTOR C =
-            XMVectorAdd(XMVector3Rotate(Corners[i], vOrientation), vOrigin);
-        XMVECTOR d = XMVector3LengthSq(XMVectorSubtract(vCenter, C));
-        InsideAll = XMVectorAndInt(InsideAll, XMVectorLessOrEqual(d, RadiusSq));
-    }
-
-    return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? CONTAINS
-                                                             : INTERSECTS;
-}
-
-//-----------------------------------------------------------------------------
-// Sphere vs. sphere test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingSphere::Intersects(
-    const BoundingSphere& sh) const noexcept {
-    // Load A.
-    XMVECTOR vCenterA = XMLoadFloat3(&Center);
-    XMVECTOR vRadiusA = XMVectorReplicatePtr(&Radius);
-
-    // Load B.
-    XMVECTOR vCenterB = XMLoadFloat3(&sh.Center);
-    XMVECTOR vRadiusB = XMVectorReplicatePtr(&sh.Radius);
-
-    // Distance squared between centers.
-    XMVECTOR Delta = XMVectorSubtract(vCenterB, vCenterA);
-    XMVECTOR DistanceSquared = XMVector3LengthSq(Delta);
-
-    // Sum of the radii squared.
-    XMVECTOR RadiusSquared = XMVectorAdd(vRadiusA, vRadiusB);
-    RadiusSquared = XMVectorMultiply(RadiusSquared, RadiusSquared);
-
-    return XMVector3LessOrEqual(DistanceSquared, RadiusSquared);
-}
-
-//-----------------------------------------------------------------------------
-// Box vs. sphere test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingSphere::Intersects(
-    const BoundingBox& box) const noexcept {
-    return box.Intersects(*this);
-}
-
-_Use_decl_annotations_ inline bool BoundingSphere::Intersects(
-    const BoundingOrientedBox& box) const noexcept {
-    return box.Intersects(*this);
-}
-
-//-----------------------------------------------------------------------------
-// Frustum vs. sphere test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingSphere::Intersects(
-    const BoundingFrustum& fr) const noexcept {
-    return fr.Intersects(*this);
-}
-
-//-----------------------------------------------------------------------------
-// Triangle vs sphere test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool XM_CALLCONV BoundingSphere::Intersects(
-    FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept {
-    // Load the sphere.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vRadius = XMVectorReplicatePtr(&Radius);
-
-    // Compute the plane of the triangle (has to be normalized).
-    XMVECTOR N = XMVector3Normalize(
-        XMVector3Cross(XMVectorSubtract(V1, V0), XMVectorSubtract(V2, V0)));
-
-    // Assert that the triangle is not degenerate.
-    assert(!XMVector3Equal(N, XMVectorZero()));
-
-    // Find the nearest feature on the triangle to the sphere.
-    XMVECTOR Dist = XMVector3Dot(XMVectorSubtract(vCenter, V0), N);
-
-    // If the center of the sphere is farther from the plane of the triangle
-    // than the radius of the sphere, then there cannot be an intersection.
-    XMVECTOR NoIntersection = XMVectorLess(Dist, XMVectorNegate(vRadius));
-    NoIntersection =
-        XMVectorOrInt(NoIntersection, XMVectorGreater(Dist, vRadius));
-
-    // Project the center of the sphere onto the plane of the triangle.
-    XMVECTOR Point = XMVectorNegativeMultiplySubtract(N, Dist, vCenter);
-
-    // Is it inside all the edges? If so we intersect because the distance
-    // to the plane is less than the radius.
-    XMVECTOR Intersection =
-        DirectX::MathInternal::PointOnPlaneInsideTriangle(Point, V0, V1, V2);
-
-    // Find the nearest point on each edge.
-    XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius);
-
-    // Edge 0,1
-    Point =
-        DirectX::MathInternal::PointOnLineSegmentNearestPoint(V0, V1, vCenter);
-
-    // If the distance to the center of the sphere to the point is less than
-    // the radius of the sphere then it must intersect.
-    Intersection = XMVectorOrInt(
-        Intersection,
-        XMVectorLessOrEqual(XMVector3LengthSq(XMVectorSubtract(vCenter, Point)),
-                            RadiusSq));
-
-    // Edge 1,2
-    Point =
-        DirectX::MathInternal::PointOnLineSegmentNearestPoint(V1, V2, vCenter);
-
-    // If the distance to the center of the sphere to the point is less than
-    // the radius of the sphere then it must intersect.
-    Intersection = XMVectorOrInt(
-        Intersection,
-        XMVectorLessOrEqual(XMVector3LengthSq(XMVectorSubtract(vCenter, Point)),
-                            RadiusSq));
-
-    // Edge 2,0
-    Point =
-        DirectX::MathInternal::PointOnLineSegmentNearestPoint(V2, V0, vCenter);
-
-    // If the distance to the center of the sphere to the point is less than
-    // the radius of the sphere then it must intersect.
-    Intersection = XMVectorOrInt(
-        Intersection,
-        XMVectorLessOrEqual(XMVector3LengthSq(XMVectorSubtract(vCenter, Point)),
-                            RadiusSq));
-
-    return XMVector4EqualInt(XMVectorAndCInt(Intersection, NoIntersection),
-                             XMVectorTrueInt());
-}
-
-//-----------------------------------------------------------------------------
-// Sphere-plane intersection
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline PlaneIntersectionType XM_CALLCONV
-BoundingSphere::Intersects(FXMVECTOR Plane) const noexcept {
-    assert(DirectX::MathInternal::XMPlaneIsUnit(Plane));
-
-    // Load the sphere.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vRadius = XMVectorReplicatePtr(&Radius);
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
-
-    XMVECTOR Outside, Inside;
-    DirectX::MathInternal::FastIntersectSpherePlane(vCenter, vRadius, Plane,
-                                                    Outside, Inside);
-
-    // If the sphere is outside any plane it is outside.
-    if (XMVector4EqualInt(Outside, XMVectorTrueInt())) return FRONT;
-
-    // If the sphere is inside all planes it is inside.
-    if (XMVector4EqualInt(Inside, XMVectorTrueInt())) return BACK;
-
-    // The sphere is not inside all planes or outside a plane it intersects.
-    return INTERSECTING;
-}
-
-//-----------------------------------------------------------------------------
-// Compute the intersection of a ray (Origin, Direction) with a sphere.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool XM_CALLCONV BoundingSphere::Intersects(
-    FXMVECTOR Origin, FXMVECTOR Direction, float& Dist) const noexcept {
-    assert(DirectX::MathInternal::XMVector3IsUnit(Direction));
-
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vRadius = XMVectorReplicatePtr(&Radius);
-
-    // l is the vector from the ray origin to the center of the sphere.
-    XMVECTOR l = XMVectorSubtract(vCenter, Origin);
-
-    // s is the projection of the l onto the ray direction.
-    XMVECTOR s = XMVector3Dot(l, Direction);
-
-    XMVECTOR l2 = XMVector3Dot(l, l);
-
-    XMVECTOR r2 = XMVectorMultiply(vRadius, vRadius);
-
-    // m2 is squared distance from the center of the sphere to the projection.
-    XMVECTOR m2 = XMVectorNegativeMultiplySubtract(s, s, l2);
-
-    XMVECTOR NoIntersection;
-
-    // If the ray origin is outside the sphere and the center of the sphere is
-    // behind the ray origin there is no intersection.
-    NoIntersection = XMVectorAndInt(XMVectorLess(s, XMVectorZero()),
-                                    XMVectorGreater(l2, r2));
-
-    // If the squared distance from the center of the sphere to the projection
-    // is greater than the radius squared the ray will miss the sphere.
-    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(m2, r2));
-
-    // The ray hits the sphere, compute the nearest intersection point.
-    XMVECTOR q = XMVectorSqrt(XMVectorSubtract(r2, m2));
-    XMVECTOR t1 = XMVectorSubtract(s, q);
-    XMVECTOR t2 = XMVectorAdd(s, q);
-
-    XMVECTOR OriginInside = XMVectorLessOrEqual(l2, r2);
-    XMVECTOR t = XMVectorSelect(t1, t2, OriginInside);
-
-    if (XMVector4NotEqualInt(NoIntersection, XMVectorTrueInt())) {
-        // Store the x-component to *pDist.
-        XMStoreFloat(&Dist, t);
-        return true;
-    }
-
-    Dist = 0.f;
-    return false;
-}
-
-//-----------------------------------------------------------------------------
-// Test a sphere vs 6 planes (typically forming a frustum).
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType XM_CALLCONV
-BoundingSphere::ContainedBy(FXMVECTOR Plane0, FXMVECTOR Plane1,
-                            FXMVECTOR Plane2, GXMVECTOR Plane3,
-                            HXMVECTOR Plane4, HXMVECTOR Plane5) const noexcept {
-    // Load the sphere.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vRadius = XMVectorReplicatePtr(&Radius);
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
-
-    XMVECTOR Outside, Inside;
-
-    // Test against each plane.
-    DirectX::MathInternal::FastIntersectSpherePlane(vCenter, vRadius, Plane0,
-                                                    Outside, Inside);
-
-    XMVECTOR AnyOutside = Outside;
-    XMVECTOR AllInside = Inside;
-
-    DirectX::MathInternal::FastIntersectSpherePlane(vCenter, vRadius, Plane1,
-                                                    Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectSpherePlane(vCenter, vRadius, Plane2,
-                                                    Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectSpherePlane(vCenter, vRadius, Plane3,
-                                                    Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectSpherePlane(vCenter, vRadius, Plane4,
-                                                    Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectSpherePlane(vCenter, vRadius, Plane5,
-                                                    Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    // If the sphere is outside any plane it is outside.
-    if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) return DISJOINT;
-
-    // If the sphere is inside all planes it is inside.
-    if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) return CONTAINS;
-
-    // The sphere is not inside all planes or outside a plane, it may intersect.
-    return INTERSECTS;
-}
-
-//-----------------------------------------------------------------------------
-// Creates a bounding sphere that contains two other bounding spheres
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void BoundingSphere::CreateMerged(
-    BoundingSphere& Out, const BoundingSphere& S1,
-    const BoundingSphere& S2) noexcept {
-    XMVECTOR Center1 = XMLoadFloat3(&S1.Center);
-    float r1 = S1.Radius;
-
-    XMVECTOR Center2 = XMLoadFloat3(&S2.Center);
-    float r2 = S2.Radius;
-
-    XMVECTOR V = XMVectorSubtract(Center2, Center1);
-
-    XMVECTOR Dist = XMVector3Length(V);
-
-    float d = XMVectorGetX(Dist);
-
-    if (r1 + r2 >= d) {
-        if (r1 - r2 >= d) {
-            Out = S1;
-            return;
-        } else if (r2 - r1 >= d) {
-            Out = S2;
-            return;
-        }
-    }
-
-    XMVECTOR N = XMVectorDivide(V, Dist);
-
-    float t1 = XMMin(-r1, d - r2);
-    float t2 = XMMax(r1, d + r2);
-    float t_5 = (t2 - t1) * 0.5f;
-
-    XMVECTOR NCenter =
-        XMVectorAdd(Center1, XMVectorMultiply(N, XMVectorReplicate(t_5 + t1)));
-
-    XMStoreFloat3(&Out.Center, NCenter);
-    Out.Radius = t_5;
-}
-
-//-----------------------------------------------------------------------------
-// Create sphere enscribing bounding box
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void BoundingSphere::CreateFromBoundingBox(
-    BoundingSphere& Out, const BoundingBox& box) noexcept {
-    Out.Center = box.Center;
-    XMVECTOR vExtents = XMLoadFloat3(&box.Extents);
-    Out.Radius = XMVectorGetX(XMVector3Length(vExtents));
-}
-
-_Use_decl_annotations_ inline void BoundingSphere::CreateFromBoundingBox(
-    BoundingSphere& Out, const BoundingOrientedBox& box) noexcept {
-    // Bounding box orientation is irrelevant because a sphere is rotationally
-    // invariant
-    Out.Center = box.Center;
-    XMVECTOR vExtents = XMLoadFloat3(&box.Extents);
-    Out.Radius = XMVectorGetX(XMVector3Length(vExtents));
-}
-
-//-----------------------------------------------------------------------------
-// Find the approximate smallest enclosing bounding sphere for a set of
-// points. Exact computation of the smallest enclosing bounding sphere is
-// possible but is slower and requires a more complex algorithm.
-// The algorithm is based on  Jack Ritter, "An Efficient Bounding Sphere",
-// Graphics Gems.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void BoundingSphere::CreateFromPoints(
-    BoundingSphere& Out, size_t Count, const XMFLOAT3* pPoints,
-    size_t Stride) noexcept {
-    assert(Count > 0);
-    assert(pPoints);
-
-    // Find the points with minimum and maximum x, y, and z
-    XMVECTOR MinX, MaxX, MinY, MaxY, MinZ, MaxZ;
-
-    MinX = MaxX = MinY = MaxY = MinZ = MaxZ = XMLoadFloat3(pPoints);
-
-    for (size_t i = 1; i < Count; ++i) {
-        XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(
-            reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
-
-        float px = XMVectorGetX(Point);
-        float py = XMVectorGetY(Point);
-        float pz = XMVectorGetZ(Point);
-
-        if (px < XMVectorGetX(MinX)) MinX = Point;
-
-        if (px > XMVectorGetX(MaxX)) MaxX = Point;
-
-        if (py < XMVectorGetY(MinY)) MinY = Point;
-
-        if (py > XMVectorGetY(MaxY)) MaxY = Point;
-
-        if (pz < XMVectorGetZ(MinZ)) MinZ = Point;
-
-        if (pz > XMVectorGetZ(MaxZ)) MaxZ = Point;
-    }
-
-    // Use the min/max pair that are farthest apart to form the initial sphere.
-    XMVECTOR DeltaX = XMVectorSubtract(MaxX, MinX);
-    XMVECTOR DistX = XMVector3Length(DeltaX);
-
-    XMVECTOR DeltaY = XMVectorSubtract(MaxY, MinY);
-    XMVECTOR DistY = XMVector3Length(DeltaY);
-
-    XMVECTOR DeltaZ = XMVectorSubtract(MaxZ, MinZ);
-    XMVECTOR DistZ = XMVector3Length(DeltaZ);
-
-    XMVECTOR vCenter;
-    XMVECTOR vRadius;
-
-    if (XMVector3Greater(DistX, DistY)) {
-        if (XMVector3Greater(DistX, DistZ)) {
-            // Use min/max x.
-            vCenter = XMVectorLerp(MaxX, MinX, 0.5f);
-            vRadius = XMVectorScale(DistX, 0.5f);
-        } else {
-            // Use min/max z.
-            vCenter = XMVectorLerp(MaxZ, MinZ, 0.5f);
-            vRadius = XMVectorScale(DistZ, 0.5f);
-        }
-    } else  // Y >= X
-    {
-        if (XMVector3Greater(DistY, DistZ)) {
-            // Use min/max y.
-            vCenter = XMVectorLerp(MaxY, MinY, 0.5f);
-            vRadius = XMVectorScale(DistY, 0.5f);
-        } else {
-            // Use min/max z.
-            vCenter = XMVectorLerp(MaxZ, MinZ, 0.5f);
-            vRadius = XMVectorScale(DistZ, 0.5f);
-        }
-    }
-
-    // Add any points not inside the sphere.
-    for (size_t i = 0; i < Count; ++i) {
-        XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(
-            reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
-
-        XMVECTOR Delta = XMVectorSubtract(Point, vCenter);
-
-        XMVECTOR Dist = XMVector3Length(Delta);
-
-        if (XMVector3Greater(Dist, vRadius)) {
-            // Adjust sphere to include the new point.
-            vRadius = XMVectorScale(XMVectorAdd(vRadius, Dist), 0.5f);
-            vCenter = XMVectorAdd(
-                vCenter, XMVectorMultiply(
-                             XMVectorSubtract(XMVectorReplicate(1.0f),
-                                              XMVectorDivide(vRadius, Dist)),
-                             Delta));
-        }
-    }
-
-    XMStoreFloat3(&Out.Center, vCenter);
-    XMStoreFloat(&Out.Radius, vRadius);
-}
-
-//-----------------------------------------------------------------------------
-// Create sphere containing frustum
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void BoundingSphere::CreateFromFrustum(
-    BoundingSphere& Out, const BoundingFrustum& fr) noexcept {
-    XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT];
-    fr.GetCorners(Corners);
-    CreateFromPoints(Out, BoundingFrustum::CORNER_COUNT, Corners,
-                     sizeof(XMFLOAT3));
-}
-
-/****************************************************************************
- *
- * BoundingBox
- *
- ****************************************************************************/
-
-//-----------------------------------------------------------------------------
-// Transform an axis aligned box by an angle preserving transform.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-BoundingBox::Transform(BoundingBox& Out, FXMMATRIX M) const noexcept {
-    // Load center and extents.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-
-    // Compute and transform the corners and find new min/max bounds.
-    XMVECTOR Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[0], vCenter);
-    Corner = XMVector3Transform(Corner, M);
-
-    XMVECTOR Min, Max;
-    Min = Max = Corner;
-
-    for (size_t i = 1; i < CORNER_COUNT; ++i) {
-        Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[i], vCenter);
-        Corner = XMVector3Transform(Corner, M);
-
-        Min = XMVectorMin(Min, Corner);
-        Max = XMVectorMax(Max, Corner);
-    }
-
-    // Store center and extents.
-    XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f));
-    XMStoreFloat3(&Out.Extents,
-                  XMVectorScale(XMVectorSubtract(Max, Min), 0.5f));
-}
-
-_Use_decl_annotations_ inline void XM_CALLCONV
-BoundingBox::Transform(BoundingBox& Out, float Scale, FXMVECTOR Rotation,
-                       FXMVECTOR Translation) const noexcept {
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(Rotation));
-
-    // Load center and extents.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-
-    XMVECTOR VectorScale = XMVectorReplicate(Scale);
-
-    // Compute and transform the corners and find new min/max bounds.
-    XMVECTOR Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[0], vCenter);
-    Corner = XMVectorAdd(
-        XMVector3Rotate(XMVectorMultiply(Corner, VectorScale), Rotation),
-        Translation);
-
-    XMVECTOR Min, Max;
-    Min = Max = Corner;
-
-    for (size_t i = 1; i < CORNER_COUNT; ++i) {
-        Corner = XMVectorMultiplyAdd(vExtents, g_BoxOffset[i], vCenter);
-        Corner = XMVectorAdd(
-            XMVector3Rotate(XMVectorMultiply(Corner, VectorScale), Rotation),
-            Translation);
-
-        Min = XMVectorMin(Min, Corner);
-        Max = XMVectorMax(Max, Corner);
-    }
-
-    // Store center and extents.
-    XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f));
-    XMStoreFloat3(&Out.Extents,
-                  XMVectorScale(XMVectorSubtract(Max, Min), 0.5f));
-}
-
-//-----------------------------------------------------------------------------
-// Get the corner points of the box
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void BoundingBox::GetCorners(
-    XMFLOAT3* Corners) const noexcept {
-    assert(Corners != nullptr);
-
-    // Load the box
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-
-    for (size_t i = 0; i < CORNER_COUNT; ++i) {
-        XMVECTOR C = XMVectorMultiplyAdd(vExtents, g_BoxOffset[i], vCenter);
-        XMStoreFloat3(&Corners[i], C);
-    }
-}
-
-//-----------------------------------------------------------------------------
-// Point in axis-aligned box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType XM_CALLCONV
-BoundingBox::Contains(FXMVECTOR Point) const noexcept {
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-
-    return XMVector3InBounds(XMVectorSubtract(Point, vCenter), vExtents)
-               ? CONTAINS
-               : DISJOINT;
-}
-
-//-----------------------------------------------------------------------------
-// Triangle in axis-aligned box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType XM_CALLCONV
-BoundingBox::Contains(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept {
-    if (!Intersects(V0, V1, V2)) return DISJOINT;
-
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-
-    XMVECTOR d = XMVectorAbs(XMVectorSubtract(V0, vCenter));
-    XMVECTOR Inside = XMVectorLessOrEqual(d, vExtents);
-
-    d = XMVectorAbs(XMVectorSubtract(V1, vCenter));
-    Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents));
-
-    d = XMVectorAbs(XMVectorSubtract(V2, vCenter));
-    Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents));
-
-    return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? CONTAINS
-                                                          : INTERSECTS;
-}
-
-//-----------------------------------------------------------------------------
-// Sphere in axis-aligned box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingBox::Contains(
-    const BoundingSphere& sh) const noexcept {
-    XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center);
-    XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius);
-
-    XMVECTOR BoxCenter = XMLoadFloat3(&Center);
-    XMVECTOR BoxExtents = XMLoadFloat3(&Extents);
-
-    XMVECTOR BoxMin = XMVectorSubtract(BoxCenter, BoxExtents);
-    XMVECTOR BoxMax = XMVectorAdd(BoxCenter, BoxExtents);
-
-    // Find the distance to the nearest point on the box.
-    // for each i in (x, y, z)
-    // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2
-    // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i))
-    // ^ 2
-
-    XMVECTOR d = XMVectorZero();
-
-    // Compute d for each dimension.
-    XMVECTOR LessThanMin = XMVectorLess(SphereCenter, BoxMin);
-    XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxMax);
-
-    XMVECTOR MinDelta = XMVectorSubtract(SphereCenter, BoxMin);
-    XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxMax);
-
-    // Choose value for each dimension based on the comparison.
-    d = XMVectorSelect(d, MinDelta, LessThanMin);
-    d = XMVectorSelect(d, MaxDelta, GreaterThanMax);
-
-    // Use a dot-product to square them and sum them together.
-    XMVECTOR d2 = XMVector3Dot(d, d);
-
-    if (XMVector3Greater(d2, XMVectorMultiply(SphereRadius, SphereRadius)))
-        return DISJOINT;
-
-    XMVECTOR InsideAll =
-        XMVectorLessOrEqual(XMVectorAdd(BoxMin, SphereRadius), SphereCenter);
-    InsideAll = XMVectorAndInt(
-        InsideAll, XMVectorLessOrEqual(SphereCenter,
-                                       XMVectorSubtract(BoxMax, SphereRadius)));
-    InsideAll = XMVectorAndInt(
-        InsideAll,
-        XMVectorGreater(XMVectorSubtract(BoxMax, BoxMin), SphereRadius));
-
-    return (XMVector3EqualInt(InsideAll, XMVectorTrueInt())) ? CONTAINS
-                                                             : INTERSECTS;
-}
-
-//-----------------------------------------------------------------------------
-// Axis-aligned box in axis-aligned box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingBox::Contains(
-    const BoundingBox& box) const noexcept {
-    XMVECTOR CenterA = XMLoadFloat3(&Center);
-    XMVECTOR ExtentsA = XMLoadFloat3(&Extents);
-
-    XMVECTOR CenterB = XMLoadFloat3(&box.Center);
-    XMVECTOR ExtentsB = XMLoadFloat3(&box.Extents);
-
-    XMVECTOR MinA = XMVectorSubtract(CenterA, ExtentsA);
-    XMVECTOR MaxA = XMVectorAdd(CenterA, ExtentsA);
-
-    XMVECTOR MinB = XMVectorSubtract(CenterB, ExtentsB);
-    XMVECTOR MaxB = XMVectorAdd(CenterB, ExtentsB);
-
-    // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i)
-    // then return false
-    XMVECTOR Disjoint =
-        XMVectorOrInt(XMVectorGreater(MinA, MaxB), XMVectorGreater(MinB, MaxA));
-
-    if (DirectX::MathInternal::XMVector3AnyTrue(Disjoint)) return DISJOINT;
-
-    // for each i in (x, y, z) if a_min(i) <= b_min(i) and b_max(i) <= a_max(i)
-    // then A contains B
-    XMVECTOR Inside = XMVectorAndInt(XMVectorLessOrEqual(MinA, MinB),
-                                     XMVectorLessOrEqual(MaxB, MaxA));
-
-    return DirectX::MathInternal::XMVector3AllTrue(Inside) ? CONTAINS
-                                                           : INTERSECTS;
-}
-
-//-----------------------------------------------------------------------------
-// Oriented box in axis-aligned box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingBox::Contains(
-    const BoundingOrientedBox& box) const noexcept {
-    if (!box.Intersects(*this)) return DISJOINT;
-
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-
-    // Subtract off the AABB center to remove a subtract below
-    XMVECTOR oCenter = XMVectorSubtract(XMLoadFloat3(&box.Center), vCenter);
-
-    XMVECTOR oExtents = XMLoadFloat3(&box.Extents);
-    XMVECTOR oOrientation = XMLoadFloat4(&box.Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(oOrientation));
-
-    XMVECTOR Inside = XMVectorTrueInt();
-
-    for (size_t i = 0; i < BoundingOrientedBox::CORNER_COUNT; ++i) {
-        XMVECTOR C = XMVectorAdd(
-            XMVector3Rotate(XMVectorMultiply(oExtents, g_BoxOffset[i]),
-                            oOrientation),
-            oCenter);
-        XMVECTOR d = XMVectorAbs(C);
-        Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents));
-    }
-
-    return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? CONTAINS
-                                                          : INTERSECTS;
-}
-
-//-----------------------------------------------------------------------------
-// Frustum in axis-aligned box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingBox::Contains(
-    const BoundingFrustum& fr) const noexcept {
-    if (!fr.Intersects(*this)) return DISJOINT;
-
-    XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT];
-    fr.GetCorners(Corners);
-
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-
-    XMVECTOR Inside = XMVectorTrueInt();
-
-    for (size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i) {
-        XMVECTOR Point = XMLoadFloat3(&Corners[i]);
-        XMVECTOR d = XMVectorAbs(XMVectorSubtract(Point, vCenter));
-        Inside = XMVectorAndInt(Inside, XMVectorLessOrEqual(d, vExtents));
-    }
-
-    return (XMVector3EqualInt(Inside, XMVectorTrueInt())) ? CONTAINS
-                                                          : INTERSECTS;
-}
-
-//-----------------------------------------------------------------------------
-// Sphere vs axis-aligned box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingBox::Intersects(
-    const BoundingSphere& sh) const noexcept {
-    XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center);
-    XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius);
-
-    XMVECTOR BoxCenter = XMLoadFloat3(&Center);
-    XMVECTOR BoxExtents = XMLoadFloat3(&Extents);
-
-    XMVECTOR BoxMin = XMVectorSubtract(BoxCenter, BoxExtents);
-    XMVECTOR BoxMax = XMVectorAdd(BoxCenter, BoxExtents);
-
-    // Find the distance to the nearest point on the box.
-    // for each i in (x, y, z)
-    // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2
-    // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i))
-    // ^ 2
-
-    XMVECTOR d = XMVectorZero();
-
-    // Compute d for each dimension.
-    XMVECTOR LessThanMin = XMVectorLess(SphereCenter, BoxMin);
-    XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxMax);
-
-    XMVECTOR MinDelta = XMVectorSubtract(SphereCenter, BoxMin);
-    XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxMax);
-
-    // Choose value for each dimension based on the comparison.
-    d = XMVectorSelect(d, MinDelta, LessThanMin);
-    d = XMVectorSelect(d, MaxDelta, GreaterThanMax);
-
-    // Use a dot-product to square them and sum them together.
-    XMVECTOR d2 = XMVector3Dot(d, d);
-
-    return XMVector3LessOrEqual(d2,
-                                XMVectorMultiply(SphereRadius, SphereRadius));
-}
-
-//-----------------------------------------------------------------------------
-// Axis-aligned box vs. axis-aligned box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingBox::Intersects(
-    const BoundingBox& box) const noexcept {
-    XMVECTOR CenterA = XMLoadFloat3(&Center);
-    XMVECTOR ExtentsA = XMLoadFloat3(&Extents);
-
-    XMVECTOR CenterB = XMLoadFloat3(&box.Center);
-    XMVECTOR ExtentsB = XMLoadFloat3(&box.Extents);
-
-    XMVECTOR MinA = XMVectorSubtract(CenterA, ExtentsA);
-    XMVECTOR MaxA = XMVectorAdd(CenterA, ExtentsA);
-
-    XMVECTOR MinB = XMVectorSubtract(CenterB, ExtentsB);
-    XMVECTOR MaxB = XMVectorAdd(CenterB, ExtentsB);
-
-    // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i)
-    // then return false
-    XMVECTOR Disjoint =
-        XMVectorOrInt(XMVectorGreater(MinA, MaxB), XMVectorGreater(MinB, MaxA));
-
-    return !DirectX::MathInternal::XMVector3AnyTrue(Disjoint);
-}
-
-//-----------------------------------------------------------------------------
-// Oriented box vs. axis-aligned box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingBox::Intersects(
-    const BoundingOrientedBox& box) const noexcept {
-    return box.Intersects(*this);
-}
-
-//-----------------------------------------------------------------------------
-// Frustum vs. axis-aligned box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingBox::Intersects(
-    const BoundingFrustum& fr) const noexcept {
-    return fr.Intersects(*this);
-}
-
-//-----------------------------------------------------------------------------
-// Triangle vs. axis aligned box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool XM_CALLCONV BoundingBox::Intersects(
-    FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept {
-    XMVECTOR Zero = XMVectorZero();
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-
-    XMVECTOR BoxMin = XMVectorSubtract(vCenter, vExtents);
-    XMVECTOR BoxMax = XMVectorAdd(vCenter, vExtents);
-
-    // Test the axes of the box (in effect test the AAB against the minimal AAB
-    // around the triangle).
-    XMVECTOR TriMin = XMVectorMin(XMVectorMin(V0, V1), V2);
-    XMVECTOR TriMax = XMVectorMax(XMVectorMax(V0, V1), V2);
-
-    // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i)
-    // then disjoint
-    XMVECTOR Disjoint = XMVectorOrInt(XMVectorGreater(TriMin, BoxMax),
-                                      XMVectorGreater(BoxMin, TriMax));
-    if (DirectX::MathInternal::XMVector3AnyTrue(Disjoint)) return false;
-
-    // Test the plane of the triangle.
-    XMVECTOR Normal =
-        XMVector3Cross(XMVectorSubtract(V1, V0), XMVectorSubtract(V2, V0));
-    XMVECTOR Dist = XMVector3Dot(Normal, V0);
-
-    // Assert that the triangle is not degenerate.
-    assert(!XMVector3Equal(Normal, Zero));
-
-    // for each i in (x, y, z) if n(i) >= 0 then v_min(i)=b_min(i),
-    // v_max(i)=b_max(i) else v_min(i)=b_max(i), v_max(i)=b_min(i)
-    XMVECTOR NormalSelect = XMVectorGreater(Normal, Zero);
-    XMVECTOR V_Min = XMVectorSelect(BoxMax, BoxMin, NormalSelect);
-    XMVECTOR V_Max = XMVectorSelect(BoxMin, BoxMax, NormalSelect);
-
-    // if n dot v_min + d > 0 || n dot v_max + d < 0 then disjoint
-    XMVECTOR MinDist = XMVector3Dot(V_Min, Normal);
-    XMVECTOR MaxDist = XMVector3Dot(V_Max, Normal);
-
-    XMVECTOR NoIntersection = XMVectorGreater(MinDist, Dist);
-    NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(MaxDist, Dist));
-
-    // Move the box center to zero to simplify the following tests.
-    XMVECTOR TV0 = XMVectorSubtract(V0, vCenter);
-    XMVECTOR TV1 = XMVectorSubtract(V1, vCenter);
-    XMVECTOR TV2 = XMVectorSubtract(V2, vCenter);
-
-    // Test the edge/edge axes (3*3).
-    XMVECTOR e0 = XMVectorSubtract(TV1, TV0);
-    XMVECTOR e1 = XMVectorSubtract(TV2, TV1);
-    XMVECTOR e2 = XMVectorSubtract(TV0, TV2);
-
-    // Make w zero.
-    e0 = XMVectorInsert<0, 0, 0, 0, 1>(e0, Zero);
-    e1 = XMVectorInsert<0, 0, 0, 0, 1>(e1, Zero);
-    e2 = XMVectorInsert<0, 0, 0, 0, 1>(e2, Zero);
-
-    XMVECTOR Axis;
-    XMVECTOR p0, p1, p2;
-    XMVECTOR Min, Max;
-    XMVECTOR Radius;
-
-    // Axis == (1,0,0) x e0 = (0, -e0.z, e0.y)
-    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y,
-                           XM_PERMUTE_0X>(e0, XMVectorNegate(e0));
-    p0 = XMVector3Dot(TV0, Axis);
-    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
-    p2 = XMVector3Dot(TV2, Axis);
-    Min = XMVectorMin(p0, p2);
-    Max = XMVectorMax(p0, p2);
-    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
-    NoIntersection =
-        XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
-    NoIntersection = XMVectorOrInt(NoIntersection,
-                                   XMVectorLess(Max, XMVectorNegate(Radius)));
-
-    // Axis == (1,0,0) x e1 = (0, -e1.z, e1.y)
-    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y,
-                           XM_PERMUTE_0X>(e1, XMVectorNegate(e1));
-    p0 = XMVector3Dot(TV0, Axis);
-    p1 = XMVector3Dot(TV1, Axis);
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
-    Min = XMVectorMin(p0, p1);
-    Max = XMVectorMax(p0, p1);
-    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
-    NoIntersection =
-        XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
-    NoIntersection = XMVectorOrInt(NoIntersection,
-                                   XMVectorLess(Max, XMVectorNegate(Radius)));
-
-    // Axis == (1,0,0) x e2 = (0, -e2.z, e2.y)
-    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y,
-                           XM_PERMUTE_0X>(e2, XMVectorNegate(e2));
-    p0 = XMVector3Dot(TV0, Axis);
-    p1 = XMVector3Dot(TV1, Axis);
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
-    Min = XMVectorMin(p0, p1);
-    Max = XMVectorMax(p0, p1);
-    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
-    NoIntersection =
-        XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
-    NoIntersection = XMVectorOrInt(NoIntersection,
-                                   XMVectorLess(Max, XMVectorNegate(Radius)));
-
-    // Axis == (0,1,0) x e0 = (e0.z, 0, -e0.x)
-    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X,
-                           XM_PERMUTE_0Y>(e0, XMVectorNegate(e0));
-    p0 = XMVector3Dot(TV0, Axis);
-    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
-    p2 = XMVector3Dot(TV2, Axis);
-    Min = XMVectorMin(p0, p2);
-    Max = XMVectorMax(p0, p2);
-    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
-    NoIntersection =
-        XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
-    NoIntersection = XMVectorOrInt(NoIntersection,
-                                   XMVectorLess(Max, XMVectorNegate(Radius)));
-
-    // Axis == (0,1,0) x e1 = (e1.z, 0, -e1.x)
-    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X,
-                           XM_PERMUTE_0Y>(e1, XMVectorNegate(e1));
-    p0 = XMVector3Dot(TV0, Axis);
-    p1 = XMVector3Dot(TV1, Axis);
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
-    Min = XMVectorMin(p0, p1);
-    Max = XMVectorMax(p0, p1);
-    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
-    NoIntersection =
-        XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
-    NoIntersection = XMVectorOrInt(NoIntersection,
-                                   XMVectorLess(Max, XMVectorNegate(Radius)));
-
-    // Axis == (0,0,1) x e2 = (e2.z, 0, -e2.x)
-    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X,
-                           XM_PERMUTE_0Y>(e2, XMVectorNegate(e2));
-    p0 = XMVector3Dot(TV0, Axis);
-    p1 = XMVector3Dot(TV1, Axis);
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
-    Min = XMVectorMin(p0, p1);
-    Max = XMVectorMax(p0, p1);
-    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
-    NoIntersection =
-        XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
-    NoIntersection = XMVectorOrInt(NoIntersection,
-                                   XMVectorLess(Max, XMVectorNegate(Radius)));
-
-    // Axis == (0,0,1) x e0 = (-e0.y, e0.x, 0)
-    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W,
-                           XM_PERMUTE_0Z>(e0, XMVectorNegate(e0));
-    p0 = XMVector3Dot(TV0, Axis);
-    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
-    p2 = XMVector3Dot(TV2, Axis);
-    Min = XMVectorMin(p0, p2);
-    Max = XMVectorMax(p0, p2);
-    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
-    NoIntersection =
-        XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
-    NoIntersection = XMVectorOrInt(NoIntersection,
-                                   XMVectorLess(Max, XMVectorNegate(Radius)));
-
-    // Axis == (0,0,1) x e1 = (-e1.y, e1.x, 0)
-    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W,
-                           XM_PERMUTE_0Z>(e1, XMVectorNegate(e1));
-    p0 = XMVector3Dot(TV0, Axis);
-    p1 = XMVector3Dot(TV1, Axis);
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
-    Min = XMVectorMin(p0, p1);
-    Max = XMVectorMax(p0, p1);
-    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
-    NoIntersection =
-        XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
-    NoIntersection = XMVectorOrInt(NoIntersection,
-                                   XMVectorLess(Max, XMVectorNegate(Radius)));
-
-    // Axis == (0,0,1) x e2 = (-e2.y, e2.x, 0)
-    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W,
-                           XM_PERMUTE_0Z>(e2, XMVectorNegate(e2));
-    p0 = XMVector3Dot(TV0, Axis);
-    p1 = XMVector3Dot(TV1, Axis);
-    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
-    Min = XMVectorMin(p0, p1);
-    Max = XMVectorMax(p0, p1);
-    Radius = XMVector3Dot(vExtents, XMVectorAbs(Axis));
-    NoIntersection =
-        XMVectorOrInt(NoIntersection, XMVectorGreater(Min, Radius));
-    NoIntersection = XMVectorOrInt(NoIntersection,
-                                   XMVectorLess(Max, XMVectorNegate(Radius)));
-
-    return XMVector4NotEqualInt(NoIntersection, XMVectorTrueInt());
-}
-
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline PlaneIntersectionType XM_CALLCONV
-BoundingBox::Intersects(FXMVECTOR Plane) const noexcept {
-    assert(DirectX::MathInternal::XMPlaneIsUnit(Plane));
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
-
-    XMVECTOR Outside, Inside;
-    DirectX::MathInternal::FastIntersectAxisAlignedBoxPlane(
-        vCenter, vExtents, Plane, Outside, Inside);
-
-    // If the box is outside any plane it is outside.
-    if (XMVector4EqualInt(Outside, XMVectorTrueInt())) return FRONT;
-
-    // If the box is inside all planes it is inside.
-    if (XMVector4EqualInt(Inside, XMVectorTrueInt())) return BACK;
-
-    // The box is not inside all planes or outside a plane it intersects.
-    return INTERSECTING;
-}
-
-//-----------------------------------------------------------------------------
-// Compute the intersection of a ray (Origin, Direction) with an axis aligned
-// box using the slabs method.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool XM_CALLCONV BoundingBox::Intersects(
-    FXMVECTOR Origin, FXMVECTOR Direction, float& Dist) const noexcept {
-    assert(DirectX::MathInternal::XMVector3IsUnit(Direction));
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-
-    // Adjust ray origin to be relative to center of the box.
-    XMVECTOR TOrigin = XMVectorSubtract(vCenter, Origin);
-
-    // Compute the dot product againt each axis of the box.
-    // Since the axii are (1,0,0), (0,1,0), (0,0,1) no computation is necessary.
-    XMVECTOR AxisDotOrigin = TOrigin;
-    XMVECTOR AxisDotDirection = Direction;
-
-    // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the
-    // slab.
-    XMVECTOR IsParallel =
-        XMVectorLessOrEqual(XMVectorAbs(AxisDotDirection), g_RayEpsilon);
-
-    // Test against all three axii simultaneously.
-    XMVECTOR InverseAxisDotDirection = XMVectorReciprocal(AxisDotDirection);
-    XMVECTOR t1 = XMVectorMultiply(XMVectorSubtract(AxisDotOrigin, vExtents),
-                                   InverseAxisDotDirection);
-    XMVECTOR t2 = XMVectorMultiply(XMVectorAdd(AxisDotOrigin, vExtents),
-                                   InverseAxisDotDirection);
-
-    // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't
-    // use the results from any directions parallel to the slab.
-    XMVECTOR t_min = XMVectorSelect(XMVectorMin(t1, t2), g_FltMin, IsParallel);
-    XMVECTOR t_max = XMVectorSelect(XMVectorMax(t1, t2), g_FltMax, IsParallel);
-
-    // t_min.x = maximum( t_min.x, t_min.y, t_min.z );
-    // t_max.x = minimum( t_max.x, t_max.y, t_max.z );
-    t_min = XMVectorMax(t_min, XMVectorSplatY(t_min));  // x = max(x,y)
-    t_min = XMVectorMax(t_min, XMVectorSplatZ(t_min));  // x = max(max(x,y),z)
-    t_max = XMVectorMin(t_max, XMVectorSplatY(t_max));  // x = min(x,y)
-    t_max = XMVectorMin(t_max, XMVectorSplatZ(t_max));  // x = min(min(x,y),z)
-
-    // if ( t_min > t_max ) return false;
-    XMVECTOR NoIntersection =
-        XMVectorGreater(XMVectorSplatX(t_min), XMVectorSplatX(t_max));
-
-    // if ( t_max < 0.0f ) return false;
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorLess(XMVectorSplatX(t_max), XMVectorZero()));
-
-    // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin))
-    // return false;
-    XMVECTOR ParallelOverlap = XMVectorInBounds(AxisDotOrigin, vExtents);
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorAndCInt(IsParallel, ParallelOverlap));
-
-    if (!DirectX::MathInternal::XMVector3AnyTrue(NoIntersection)) {
-        // Store the x-component to *pDist
-        XMStoreFloat(&Dist, t_min);
-        return true;
-    }
-
-    Dist = 0.f;
-    return false;
-}
-
-//-----------------------------------------------------------------------------
-// Test an axis alinged box vs 6 planes (typically forming a frustum).
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType XM_CALLCONV
-BoundingBox::ContainedBy(FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
-                         GXMVECTOR Plane3, HXMVECTOR Plane4,
-                         HXMVECTOR Plane5) const noexcept {
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
-
-    XMVECTOR Outside, Inside;
-
-    // Test against each plane.
-    DirectX::MathInternal::FastIntersectAxisAlignedBoxPlane(
-        vCenter, vExtents, Plane0, Outside, Inside);
-
-    XMVECTOR AnyOutside = Outside;
-    XMVECTOR AllInside = Inside;
-
-    DirectX::MathInternal::FastIntersectAxisAlignedBoxPlane(
-        vCenter, vExtents, Plane1, Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectAxisAlignedBoxPlane(
-        vCenter, vExtents, Plane2, Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectAxisAlignedBoxPlane(
-        vCenter, vExtents, Plane3, Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectAxisAlignedBoxPlane(
-        vCenter, vExtents, Plane4, Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectAxisAlignedBoxPlane(
-        vCenter, vExtents, Plane5, Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    // If the box is outside any plane it is outside.
-    if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) return DISJOINT;
-
-    // If the box is inside all planes it is inside.
-    if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) return CONTAINS;
-
-    // The box is not inside all planes or outside a plane, it may intersect.
-    return INTERSECTS;
-}
-
-//-----------------------------------------------------------------------------
-// Create axis-aligned box that contains two other bounding boxes
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void BoundingBox::CreateMerged(
-    BoundingBox& Out, const BoundingBox& b1, const BoundingBox& b2) noexcept {
-    XMVECTOR b1Center = XMLoadFloat3(&b1.Center);
-    XMVECTOR b1Extents = XMLoadFloat3(&b1.Extents);
-
-    XMVECTOR b2Center = XMLoadFloat3(&b2.Center);
-    XMVECTOR b2Extents = XMLoadFloat3(&b2.Extents);
-
-    XMVECTOR Min = XMVectorSubtract(b1Center, b1Extents);
-    Min = XMVectorMin(Min, XMVectorSubtract(b2Center, b2Extents));
-
-    XMVECTOR Max = XMVectorAdd(b1Center, b1Extents);
-    Max = XMVectorMax(Max, XMVectorAdd(b2Center, b2Extents));
-
-    assert(XMVector3LessOrEqual(Min, Max));
-
-    XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f));
-    XMStoreFloat3(&Out.Extents,
-                  XMVectorScale(XMVectorSubtract(Max, Min), 0.5f));
-}
-
-//-----------------------------------------------------------------------------
-// Create axis-aligned box that contains a bounding sphere
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void BoundingBox::CreateFromSphere(
-    BoundingBox& Out, const BoundingSphere& sh) noexcept {
-    XMVECTOR spCenter = XMLoadFloat3(&sh.Center);
-    XMVECTOR shRadius = XMVectorReplicatePtr(&sh.Radius);
-
-    XMVECTOR Min = XMVectorSubtract(spCenter, shRadius);
-    XMVECTOR Max = XMVectorAdd(spCenter, shRadius);
-
-    assert(XMVector3LessOrEqual(Min, Max));
-
-    XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f));
-    XMStoreFloat3(&Out.Extents,
-                  XMVectorScale(XMVectorSubtract(Max, Min), 0.5f));
-}
-
-//-----------------------------------------------------------------------------
-// Create axis-aligned box from min/max points
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV BoundingBox::CreateFromPoints(
-    BoundingBox& Out, FXMVECTOR pt1, FXMVECTOR pt2) noexcept {
-    XMVECTOR Min = XMVectorMin(pt1, pt2);
-    XMVECTOR Max = XMVectorMax(pt1, pt2);
-
-    // Store center and extents.
-    XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(Min, Max), 0.5f));
-    XMStoreFloat3(&Out.Extents,
-                  XMVectorScale(XMVectorSubtract(Max, Min), 0.5f));
-}
-
-//-----------------------------------------------------------------------------
-// Find the minimum axis aligned bounding box containing a set of points.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void BoundingBox::CreateFromPoints(
-    BoundingBox& Out, size_t Count, const XMFLOAT3* pPoints,
-    size_t Stride) noexcept {
-    assert(Count > 0);
-    assert(pPoints);
-
-    // Find the minimum and maximum x, y, and z
-    XMVECTOR vMin, vMax;
-
-    vMin = vMax = XMLoadFloat3(pPoints);
-
-    for (size_t i = 1; i < Count; ++i) {
-        XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(
-            reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
-
-        vMin = XMVectorMin(vMin, Point);
-        vMax = XMVectorMax(vMax, Point);
-    }
-
-    // Store center and extents.
-    XMStoreFloat3(&Out.Center, XMVectorScale(XMVectorAdd(vMin, vMax), 0.5f));
-    XMStoreFloat3(&Out.Extents,
-                  XMVectorScale(XMVectorSubtract(vMax, vMin), 0.5f));
-}
-
-/****************************************************************************
- *
- * BoundingOrientedBox
- *
- ****************************************************************************/
-
-//-----------------------------------------------------------------------------
-// Transform an oriented box by an angle preserving transform.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV BoundingOrientedBox::Transform(
-    BoundingOrientedBox& Out, FXMMATRIX M) const noexcept {
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(vOrientation));
-
-    // Composite the box rotation and the transform rotation.
-    XMMATRIX nM;
-    nM.r[0] = XMVector3Normalize(M.r[0]);
-    nM.r[1] = XMVector3Normalize(M.r[1]);
-    nM.r[2] = XMVector3Normalize(M.r[2]);
-    nM.r[3] = g_XMIdentityR3;
-    XMVECTOR Rotation = XMQuaternionRotationMatrix(nM);
-    vOrientation = XMQuaternionMultiply(vOrientation, Rotation);
-
-    // Transform the center.
-    vCenter = XMVector3Transform(vCenter, M);
-
-    // Scale the box extents.
-    XMVECTOR dX = XMVector3Length(M.r[0]);
-    XMVECTOR dY = XMVector3Length(M.r[1]);
-    XMVECTOR dZ = XMVector3Length(M.r[2]);
-
-    XMVECTOR VectorScale = XMVectorSelect(dY, dX, g_XMSelect1000);
-    VectorScale = XMVectorSelect(dZ, VectorScale, g_XMSelect1100);
-    vExtents = XMVectorMultiply(vExtents, VectorScale);
-
-    // Store the box.
-    XMStoreFloat3(&Out.Center, vCenter);
-    XMStoreFloat3(&Out.Extents, vExtents);
-    XMStoreFloat4(&Out.Orientation, vOrientation);
-}
-
-_Use_decl_annotations_ inline void XM_CALLCONV BoundingOrientedBox::Transform(
-    BoundingOrientedBox& Out, float Scale, FXMVECTOR Rotation,
-    FXMVECTOR Translation) const noexcept {
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(Rotation));
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(vOrientation));
-
-    // Composite the box rotation and the transform rotation.
-    vOrientation = XMQuaternionMultiply(vOrientation, Rotation);
-
-    // Transform the center.
-    XMVECTOR VectorScale = XMVectorReplicate(Scale);
-    vCenter = XMVectorAdd(
-        XMVector3Rotate(XMVectorMultiply(vCenter, VectorScale), Rotation),
-        Translation);
-
-    // Scale the box extents.
-    vExtents = XMVectorMultiply(vExtents, VectorScale);
-
-    // Store the box.
-    XMStoreFloat3(&Out.Center, vCenter);
-    XMStoreFloat3(&Out.Extents, vExtents);
-    XMStoreFloat4(&Out.Orientation, vOrientation);
-}
-
-//-----------------------------------------------------------------------------
-// Get the corner points of the box
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void BoundingOrientedBox::GetCorners(
-    XMFLOAT3* Corners) const noexcept {
-    assert(Corners != nullptr);
-
-    // Load the box
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(vOrientation));
-
-    for (size_t i = 0; i < CORNER_COUNT; ++i) {
-        XMVECTOR C = XMVectorAdd(
-            XMVector3Rotate(XMVectorMultiply(vExtents, g_BoxOffset[i]),
-                            vOrientation),
-            vCenter);
-        XMStoreFloat3(&Corners[i], C);
-    }
-}
-
-//-----------------------------------------------------------------------------
-// Point in oriented box test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType XM_CALLCONV
-BoundingOrientedBox::Contains(FXMVECTOR Point) const noexcept {
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    // Transform the point to be local to the box.
-    XMVECTOR TPoint =
-        XMVector3InverseRotate(XMVectorSubtract(Point, vCenter), vOrientation);
-
-    return XMVector3InBounds(TPoint, vExtents) ? CONTAINS : DISJOINT;
-}
-
-//-----------------------------------------------------------------------------
-// Triangle in oriented bounding box
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType XM_CALLCONV
-BoundingOrientedBox::Contains(FXMVECTOR V0, FXMVECTOR V1,
-                              FXMVECTOR V2) const noexcept {
-    // Load the box center & orientation.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    // Transform the triangle vertices into the space of the box.
-    XMVECTOR TV0 =
-        XMVector3InverseRotate(XMVectorSubtract(V0, vCenter), vOrientation);
-    XMVECTOR TV1 =
-        XMVector3InverseRotate(XMVectorSubtract(V1, vCenter), vOrientation);
-    XMVECTOR TV2 =
-        XMVector3InverseRotate(XMVectorSubtract(V2, vCenter), vOrientation);
-
-    BoundingBox box;
-    box.Center = XMFLOAT3(0.0f, 0.0f, 0.0f);
-    box.Extents = Extents;
-
-    // Use the triangle vs axis aligned box intersection routine.
-    return box.Contains(TV0, TV1, TV2);
-}
-
-//-----------------------------------------------------------------------------
-// Sphere in oriented bounding box
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingOrientedBox::Contains(
-    const BoundingSphere& sh) const noexcept {
-    XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center);
-    XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius);
-
-    XMVECTOR BoxCenter = XMLoadFloat3(&Center);
-    XMVECTOR BoxExtents = XMLoadFloat3(&Extents);
-    XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(BoxOrientation));
-
-    // Transform the center of the sphere to be local to the box.
-    // BoxMin = -BoxExtents
-    // BoxMax = +BoxExtents
-    SphereCenter = XMVector3InverseRotate(
-        XMVectorSubtract(SphereCenter, BoxCenter), BoxOrientation);
-
-    // Find the distance to the nearest point on the box.
-    // for each i in (x, y, z)
-    // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2
-    // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i))
-    // ^ 2
-
-    XMVECTOR d = XMVectorZero();
-
-    // Compute d for each dimension.
-    XMVECTOR LessThanMin =
-        XMVectorLess(SphereCenter, XMVectorNegate(BoxExtents));
-    XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxExtents);
-
-    XMVECTOR MinDelta = XMVectorAdd(SphereCenter, BoxExtents);
-    XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxExtents);
-
-    // Choose value for each dimension based on the comparison.
-    d = XMVectorSelect(d, MinDelta, LessThanMin);
-    d = XMVectorSelect(d, MaxDelta, GreaterThanMax);
-
-    // Use a dot-product to square them and sum them together.
-    XMVECTOR d2 = XMVector3Dot(d, d);
-    XMVECTOR SphereRadiusSq = XMVectorMultiply(SphereRadius, SphereRadius);
-
-    if (XMVector4Greater(d2, SphereRadiusSq)) return DISJOINT;
-
-    // See if we are completely inside the box
-    XMVECTOR SMin = XMVectorSubtract(SphereCenter, SphereRadius);
-    XMVECTOR SMax = XMVectorAdd(SphereCenter, SphereRadius);
-
-    return (XMVector3InBounds(SMin, BoxExtents) &&
-            XMVector3InBounds(SMax, BoxExtents))
-               ? CONTAINS
-               : INTERSECTS;
-}
-
-//-----------------------------------------------------------------------------
-// Axis aligned box vs. oriented box. Constructs an oriented box and uses
-// the oriented box vs. oriented box test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingOrientedBox::Contains(
-    const BoundingBox& box) const noexcept {
-    // Make the axis aligned box oriented and do an OBB vs OBB test.
-    BoundingOrientedBox obox(box.Center, box.Extents,
-                             XMFLOAT4(0.f, 0.f, 0.f, 1.f));
-    return Contains(obox);
-}
-
-//-----------------------------------------------------------------------------
-// Oriented bounding box in oriented bounding box
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingOrientedBox::Contains(
-    const BoundingOrientedBox& box) const noexcept {
-    if (!Intersects(box)) return DISJOINT;
-
-    // Load the boxes
-    XMVECTOR aCenter = XMLoadFloat3(&Center);
-    XMVECTOR aExtents = XMLoadFloat3(&Extents);
-    XMVECTOR aOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(aOrientation));
-
-    XMVECTOR bCenter = XMLoadFloat3(&box.Center);
-    XMVECTOR bExtents = XMLoadFloat3(&box.Extents);
-    XMVECTOR bOrientation = XMLoadFloat4(&box.Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(bOrientation));
-
-    XMVECTOR offset = XMVectorSubtract(bCenter, aCenter);
-
-    for (size_t i = 0; i < CORNER_COUNT; ++i) {
-        // Cb = rotate( bExtents * corneroffset[i], bOrientation ) + bcenter
-        // Ca = invrotate( Cb - aCenter, aOrientation )
-
-        XMVECTOR C = XMVectorAdd(
-            XMVector3Rotate(XMVectorMultiply(bExtents, g_BoxOffset[i]),
-                            bOrientation),
-            offset);
-        C = XMVector3InverseRotate(C, aOrientation);
-
-        if (!XMVector3InBounds(C, aExtents)) return INTERSECTS;
-    }
-
-    return CONTAINS;
-}
-
-//-----------------------------------------------------------------------------
-// Frustum in oriented bounding box
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingOrientedBox::Contains(
-    const BoundingFrustum& fr) const noexcept {
-    if (!fr.Intersects(*this)) return DISJOINT;
-
-    XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT];
-    fr.GetCorners(Corners);
-
-    // Load the box
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(vOrientation));
-
-    for (size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i) {
-        XMVECTOR C = XMVector3InverseRotate(
-            XMVectorSubtract(XMLoadFloat3(&Corners[i]), vCenter), vOrientation);
-
-        if (!XMVector3InBounds(C, vExtents)) return INTERSECTS;
-    }
-
-    return CONTAINS;
-}
-
-//-----------------------------------------------------------------------------
-// Sphere vs. oriented box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingOrientedBox::Intersects(
-    const BoundingSphere& sh) const noexcept {
-    XMVECTOR SphereCenter = XMLoadFloat3(&sh.Center);
-    XMVECTOR SphereRadius = XMVectorReplicatePtr(&sh.Radius);
-
-    XMVECTOR BoxCenter = XMLoadFloat3(&Center);
-    XMVECTOR BoxExtents = XMLoadFloat3(&Extents);
-    XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(BoxOrientation));
-
-    // Transform the center of the sphere to be local to the box.
-    // BoxMin = -BoxExtents
-    // BoxMax = +BoxExtents
-    SphereCenter = XMVector3InverseRotate(
-        XMVectorSubtract(SphereCenter, BoxCenter), BoxOrientation);
-
-    // Find the distance to the nearest point on the box.
-    // for each i in (x, y, z)
-    // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2
-    // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i))
-    // ^ 2
-
-    XMVECTOR d = XMVectorZero();
-
-    // Compute d for each dimension.
-    XMVECTOR LessThanMin =
-        XMVectorLess(SphereCenter, XMVectorNegate(BoxExtents));
-    XMVECTOR GreaterThanMax = XMVectorGreater(SphereCenter, BoxExtents);
-
-    XMVECTOR MinDelta = XMVectorAdd(SphereCenter, BoxExtents);
-    XMVECTOR MaxDelta = XMVectorSubtract(SphereCenter, BoxExtents);
-
-    // Choose value for each dimension based on the comparison.
-    d = XMVectorSelect(d, MinDelta, LessThanMin);
-    d = XMVectorSelect(d, MaxDelta, GreaterThanMax);
-
-    // Use a dot-product to square them and sum them together.
-    XMVECTOR d2 = XMVector3Dot(d, d);
-
-    return XMVector4LessOrEqual(d2,
-                                XMVectorMultiply(SphereRadius, SphereRadius))
-               ? true
-               : false;
-}
-
-//-----------------------------------------------------------------------------
-// Axis aligned box vs. oriented box. Constructs an oriented box and uses
-// the oriented box vs. oriented box test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingOrientedBox::Intersects(
-    const BoundingBox& box) const noexcept {
-    // Make the axis aligned box oriented and do an OBB vs OBB test.
-    BoundingOrientedBox obox(box.Center, box.Extents,
-                             XMFLOAT4(0.f, 0.f, 0.f, 1.f));
-    return Intersects(obox);
-}
-
-//-----------------------------------------------------------------------------
-// Fast oriented box / oriented box intersection test using the separating axis
-// theorem.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingOrientedBox::Intersects(
-    const BoundingOrientedBox& box) const noexcept {
-    // Build the 3x3 rotation matrix that defines the orientation of B relative
-    // to A.
-    XMVECTOR A_quat = XMLoadFloat4(&Orientation);
-    XMVECTOR B_quat = XMLoadFloat4(&box.Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(A_quat));
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(B_quat));
-
-    XMVECTOR Q = XMQuaternionMultiply(A_quat, XMQuaternionConjugate(B_quat));
-    XMMATRIX R = XMMatrixRotationQuaternion(Q);
-
-    // Compute the translation of B relative to A.
-    XMVECTOR A_cent = XMLoadFloat3(&Center);
-    XMVECTOR B_cent = XMLoadFloat3(&box.Center);
-    XMVECTOR t =
-        XMVector3InverseRotate(XMVectorSubtract(B_cent, A_cent), A_quat);
-
-    //
-    // h(A) = extents of A.
-    // h(B) = extents of B.
-    //
-    // a(u) = axes of A = (1,0,0), (0,1,0), (0,0,1)
-    // b(u) = axes of B relative to A = (r00,r10,r20), (r01,r11,r21),
-    // (r02,r12,r22)
-    //
-    // For each possible separating axis l:
-    //   d(A) = sum (for i = u,v,w) h(A)(i) * abs( a(i) dot l )
-    //   d(B) = sum (for i = u,v,w) h(B)(i) * abs( b(i) dot l )
-    //   if abs( t dot l ) > d(A) + d(B) then disjoint
-    //
-
-    // Load extents of A and B.
-    XMVECTOR h_A = XMLoadFloat3(&Extents);
-    XMVECTOR h_B = XMLoadFloat3(&box.Extents);
-
-    // Rows. Note R[0,1,2]X.w = 0.
-    XMVECTOR R0X = R.r[0];
-    XMVECTOR R1X = R.r[1];
-    XMVECTOR R2X = R.r[2];
-
-    R = XMMatrixTranspose(R);
-
-    // Columns. Note RX[0,1,2].w = 0.
-    XMVECTOR RX0 = R.r[0];
-    XMVECTOR RX1 = R.r[1];
-    XMVECTOR RX2 = R.r[2];
-
-    // Absolute value of rows.
-    XMVECTOR AR0X = XMVectorAbs(R0X);
-    XMVECTOR AR1X = XMVectorAbs(R1X);
-    XMVECTOR AR2X = XMVectorAbs(R2X);
-
-    // Absolute value of columns.
-    XMVECTOR ARX0 = XMVectorAbs(RX0);
-    XMVECTOR ARX1 = XMVectorAbs(RX1);
-    XMVECTOR ARX2 = XMVectorAbs(RX2);
-
-    // Test each of the 15 possible seperating axii.
-    XMVECTOR d, d_A, d_B;
-
-    // l = a(u) = (1, 0, 0)
-    // t dot l = t.x
-    // d(A) = h(A).x
-    // d(B) = h(B) dot abs(r00, r01, r02)
-    d = XMVectorSplatX(t);
-    d_A = XMVectorSplatX(h_A);
-    d_B = XMVector3Dot(h_B, AR0X);
-    XMVECTOR NoIntersection =
-        XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B));
-
-    // l = a(v) = (0, 1, 0)
-    // t dot l = t.y
-    // d(A) = h(A).y
-    // d(B) = h(B) dot abs(r10, r11, r12)
-    d = XMVectorSplatY(t);
-    d_A = XMVectorSplatY(h_A);
-    d_B = XMVector3Dot(h_B, AR1X);
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
-
-    // l = a(w) = (0, 0, 1)
-    // t dot l = t.z
-    // d(A) = h(A).z
-    // d(B) = h(B) dot abs(r20, r21, r22)
-    d = XMVectorSplatZ(t);
-    d_A = XMVectorSplatZ(h_A);
-    d_B = XMVector3Dot(h_B, AR2X);
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
-
-    // l = b(u) = (r00, r10, r20)
-    // d(A) = h(A) dot abs(r00, r10, r20)
-    // d(B) = h(B).x
-    d = XMVector3Dot(t, RX0);
-    d_A = XMVector3Dot(h_A, ARX0);
-    d_B = XMVectorSplatX(h_B);
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
-
-    // l = b(v) = (r01, r11, r21)
-    // d(A) = h(A) dot abs(r01, r11, r21)
-    // d(B) = h(B).y
-    d = XMVector3Dot(t, RX1);
-    d_A = XMVector3Dot(h_A, ARX1);
-    d_B = XMVectorSplatY(h_B);
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
-
-    // l = b(w) = (r02, r12, r22)
-    // d(A) = h(A) dot abs(r02, r12, r22)
-    // d(B) = h(B).z
-    d = XMVector3Dot(t, RX2);
-    d_A = XMVector3Dot(h_A, ARX2);
-    d_B = XMVectorSplatZ(h_B);
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
-
-    // l = a(u) x b(u) = (0, -r20, r10)
-    // d(A) = h(A) dot abs(0, r20, r10)
-    // d(B) = h(B) dot abs(0, r02, r01)
-    d = XMVector3Dot(
-        t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y,
-                           XM_PERMUTE_0X>(RX0, XMVectorNegate(RX0)));
-    d_A = XMVector3Dot(
-        h_A,
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(
-            ARX0));
-    d_B = XMVector3Dot(
-        h_B,
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(
-            AR0X));
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
-
-    // l = a(u) x b(v) = (0, -r21, r11)
-    // d(A) = h(A) dot abs(0, r21, r11)
-    // d(B) = h(B) dot abs(r02, 0, r00)
-    d = XMVector3Dot(
-        t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y,
-                           XM_PERMUTE_0X>(RX1, XMVectorNegate(RX1)));
-    d_A = XMVector3Dot(
-        h_A,
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(
-            ARX1));
-    d_B = XMVector3Dot(
-        h_B,
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(
-            AR0X));
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
-
-    // l = a(u) x b(w) = (0, -r22, r12)
-    // d(A) = h(A) dot abs(0, r22, r12)
-    // d(B) = h(B) dot abs(r01, r00, 0)
-    d = XMVector3Dot(
-        t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y,
-                           XM_PERMUTE_0X>(RX2, XMVectorNegate(RX2)));
-    d_A = XMVector3Dot(
-        h_A,
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(
-            ARX2));
-    d_B = XMVector3Dot(
-        h_B,
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(
-            AR0X));
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
-
-    // l = a(v) x b(u) = (r20, 0, -r00)
-    // d(A) = h(A) dot abs(r20, 0, r00)
-    // d(B) = h(B) dot abs(0, r12, r11)
-    d = XMVector3Dot(
-        t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X,
-                           XM_PERMUTE_0Y>(RX0, XMVectorNegate(RX0)));
-    d_A = XMVector3Dot(
-        h_A,
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(
-            ARX0));
-    d_B = XMVector3Dot(
-        h_B,
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(
-            AR1X));
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
-
-    // l = a(v) x b(v) = (r21, 0, -r01)
-    // d(A) = h(A) dot abs(r21, 0, r01)
-    // d(B) = h(B) dot abs(r12, 0, r10)
-    d = XMVector3Dot(
-        t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X,
-                           XM_PERMUTE_0Y>(RX1, XMVectorNegate(RX1)));
-    d_A = XMVector3Dot(
-        h_A,
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(
-            ARX1));
-    d_B = XMVector3Dot(
-        h_B,
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(
-            AR1X));
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
-
-    // l = a(v) x b(w) = (r22, 0, -r02)
-    // d(A) = h(A) dot abs(r22, 0, r02)
-    // d(B) = h(B) dot abs(r11, r10, 0)
-    d = XMVector3Dot(
-        t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X,
-                           XM_PERMUTE_0Y>(RX2, XMVectorNegate(RX2)));
-    d_A = XMVector3Dot(
-        h_A,
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(
-            ARX2));
-    d_B = XMVector3Dot(
-        h_B,
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(
-            AR1X));
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
-
-    // l = a(w) x b(u) = (-r10, r00, 0)
-    // d(A) = h(A) dot abs(r10, r00, 0)
-    // d(B) = h(B) dot abs(0, r22, r21)
-    d = XMVector3Dot(
-        t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W,
-                           XM_PERMUTE_0Z>(RX0, XMVectorNegate(RX0)));
-    d_A = XMVector3Dot(
-        h_A,
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(
-            ARX0));
-    d_B = XMVector3Dot(
-        h_B,
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>(
-            AR2X));
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
-
-    // l = a(w) x b(v) = (-r11, r01, 0)
-    // d(A) = h(A) dot abs(r11, r01, 0)
-    // d(B) = h(B) dot abs(r22, 0, r20)
-    d = XMVector3Dot(
-        t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W,
-                           XM_PERMUTE_0Z>(RX1, XMVectorNegate(RX1)));
-    d_A = XMVector3Dot(
-        h_A,
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(
-            ARX1));
-    d_B = XMVector3Dot(
-        h_B,
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>(
-            AR2X));
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
-
-    // l = a(w) x b(w) = (-r12, r02, 0)
-    // d(A) = h(A) dot abs(r12, r02, 0)
-    // d(B) = h(B) dot abs(r21, r20, 0)
-    d = XMVector3Dot(
-        t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W,
-                           XM_PERMUTE_0Z>(RX2, XMVectorNegate(RX2)));
-    d_A = XMVector3Dot(
-        h_A,
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(
-            ARX2));
-    d_B = XMVector3Dot(
-        h_B,
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>(
-            AR2X));
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorGreater(XMVectorAbs(d), XMVectorAdd(d_A, d_B)));
-
-    // No seperating axis found, boxes must intersect.
-    return XMVector4NotEqualInt(NoIntersection, XMVectorTrueInt()) ? true
-                                                                   : false;
-}
-
-//-----------------------------------------------------------------------------
-// Frustum vs. oriented box test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingOrientedBox::Intersects(
-    const BoundingFrustum& fr) const noexcept {
-    return fr.Intersects(*this);
-}
-
-//-----------------------------------------------------------------------------
-// Triangle vs. oriented box test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool XM_CALLCONV BoundingOrientedBox::Intersects(
-    FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept {
-    // Load the box center & orientation.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    // Transform the triangle vertices into the space of the box.
-    XMVECTOR TV0 =
-        XMVector3InverseRotate(XMVectorSubtract(V0, vCenter), vOrientation);
-    XMVECTOR TV1 =
-        XMVector3InverseRotate(XMVectorSubtract(V1, vCenter), vOrientation);
-    XMVECTOR TV2 =
-        XMVector3InverseRotate(XMVectorSubtract(V2, vCenter), vOrientation);
-
-    BoundingBox box;
-    box.Center = XMFLOAT3(0.0f, 0.0f, 0.0f);
-    box.Extents = Extents;
-
-    // Use the triangle vs axis aligned box intersection routine.
-    return box.Intersects(TV0, TV1, TV2);
-}
-
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline PlaneIntersectionType XM_CALLCONV
-BoundingOrientedBox::Intersects(FXMVECTOR Plane) const noexcept {
-    assert(DirectX::MathInternal::XMPlaneIsUnit(Plane));
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-    XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(BoxOrientation));
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
-
-    // Build the 3x3 rotation matrix that defines the box axes.
-    XMMATRIX R = XMMatrixRotationQuaternion(BoxOrientation);
-
-    XMVECTOR Outside, Inside;
-    DirectX::MathInternal::FastIntersectOrientedBoxPlane(
-        vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane, Outside, Inside);
-
-    // If the box is outside any plane it is outside.
-    if (XMVector4EqualInt(Outside, XMVectorTrueInt())) return FRONT;
-
-    // If the box is inside all planes it is inside.
-    if (XMVector4EqualInt(Inside, XMVectorTrueInt())) return BACK;
-
-    // The box is not inside all planes or outside a plane it intersects.
-    return INTERSECTING;
-}
-
-//-----------------------------------------------------------------------------
-// Compute the intersection of a ray (Origin, Direction) with an oriented box
-// using the slabs method.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool XM_CALLCONV BoundingOrientedBox::Intersects(
-    FXMVECTOR Origin, FXMVECTOR Direction, float& Dist) const noexcept {
-    assert(DirectX::MathInternal::XMVector3IsUnit(Direction));
-
-    static const XMVECTORU32 SelectY = {
-        {{XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0}}};
-    static const XMVECTORU32 SelectZ = {
-        {{XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0}}};
-
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(vOrientation));
-
-    // Get the boxes normalized side directions.
-    XMMATRIX R = XMMatrixRotationQuaternion(vOrientation);
-
-    // Adjust ray origin to be relative to center of the box.
-    XMVECTOR TOrigin = XMVectorSubtract(vCenter, Origin);
-
-    // Compute the dot product againt each axis of the box.
-    XMVECTOR AxisDotOrigin = XMVector3Dot(R.r[0], TOrigin);
-    AxisDotOrigin =
-        XMVectorSelect(AxisDotOrigin, XMVector3Dot(R.r[1], TOrigin), SelectY);
-    AxisDotOrigin =
-        XMVectorSelect(AxisDotOrigin, XMVector3Dot(R.r[2], TOrigin), SelectZ);
-
-    XMVECTOR AxisDotDirection = XMVector3Dot(R.r[0], Direction);
-    AxisDotDirection = XMVectorSelect(AxisDotDirection,
-                                      XMVector3Dot(R.r[1], Direction), SelectY);
-    AxisDotDirection = XMVectorSelect(AxisDotDirection,
-                                      XMVector3Dot(R.r[2], Direction), SelectZ);
-
-    // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the
-    // slab.
-    XMVECTOR IsParallel =
-        XMVectorLessOrEqual(XMVectorAbs(AxisDotDirection), g_RayEpsilon);
-
-    // Test against all three axes simultaneously.
-    XMVECTOR InverseAxisDotDirection = XMVectorReciprocal(AxisDotDirection);
-    XMVECTOR t1 = XMVectorMultiply(XMVectorSubtract(AxisDotOrigin, vExtents),
-                                   InverseAxisDotDirection);
-    XMVECTOR t2 = XMVectorMultiply(XMVectorAdd(AxisDotOrigin, vExtents),
-                                   InverseAxisDotDirection);
-
-    // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't
-    // use the results from any directions parallel to the slab.
-    XMVECTOR t_min = XMVectorSelect(XMVectorMin(t1, t2), g_FltMin, IsParallel);
-    XMVECTOR t_max = XMVectorSelect(XMVectorMax(t1, t2), g_FltMax, IsParallel);
-
-    // t_min.x = maximum( t_min.x, t_min.y, t_min.z );
-    // t_max.x = minimum( t_max.x, t_max.y, t_max.z );
-    t_min = XMVectorMax(t_min, XMVectorSplatY(t_min));  // x = max(x,y)
-    t_min =
-        XMVectorMax(t_min, XMVectorSplatZ(t_min));  // x = max(std::max(x,y),z)
-    t_max = XMVectorMin(t_max, XMVectorSplatY(t_max));  // x = min(x,y)
-    t_max =
-        XMVectorMin(t_max, XMVectorSplatZ(t_max));  // x = min(std::min(x,y),z)
-
-    // if ( t_min > t_max ) return false;
-    XMVECTOR NoIntersection =
-        XMVectorGreater(XMVectorSplatX(t_min), XMVectorSplatX(t_max));
-
-    // if ( t_max < 0.0f ) return false;
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorLess(XMVectorSplatX(t_max), XMVectorZero()));
-
-    // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin))
-    // return false;
-    XMVECTOR ParallelOverlap = XMVectorInBounds(AxisDotOrigin, vExtents);
-    NoIntersection = XMVectorOrInt(
-        NoIntersection, XMVectorAndCInt(IsParallel, ParallelOverlap));
-
-    if (!DirectX::MathInternal::XMVector3AnyTrue(NoIntersection)) {
-        // Store the x-component to *pDist
-        XMStoreFloat(&Dist, t_min);
-        return true;
-    }
-
-    Dist = 0.f;
-    return false;
-}
-
-//-----------------------------------------------------------------------------
-// Test an oriented box vs 6 planes (typically forming a frustum).
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType XM_CALLCONV
-BoundingOrientedBox::ContainedBy(FXMVECTOR Plane0, FXMVECTOR Plane1,
-                                 FXMVECTOR Plane2, GXMVECTOR Plane3,
-                                 HXMVECTOR Plane4,
-                                 HXMVECTOR Plane5) const noexcept {
-    // Load the box.
-    XMVECTOR vCenter = XMLoadFloat3(&Center);
-    XMVECTOR vExtents = XMLoadFloat3(&Extents);
-    XMVECTOR BoxOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(BoxOrientation));
-
-    // Set w of the center to one so we can dot4 with a plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
-
-    // Build the 3x3 rotation matrix that defines the box axes.
-    XMMATRIX R = XMMatrixRotationQuaternion(BoxOrientation);
-
-    XMVECTOR Outside, Inside;
-
-    // Test against each plane.
-    DirectX::MathInternal::FastIntersectOrientedBoxPlane(
-        vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane0, Outside, Inside);
-
-    XMVECTOR AnyOutside = Outside;
-    XMVECTOR AllInside = Inside;
-
-    DirectX::MathInternal::FastIntersectOrientedBoxPlane(
-        vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane1, Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectOrientedBoxPlane(
-        vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane2, Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectOrientedBoxPlane(
-        vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane3, Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectOrientedBoxPlane(
-        vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane4, Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectOrientedBoxPlane(
-        vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane5, Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    // If the box is outside any plane it is outside.
-    if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) return DISJOINT;
-
-    // If the box is inside all planes it is inside.
-    if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) return CONTAINS;
-
-    // The box is not inside all planes or outside a plane, it may intersect.
-    return INTERSECTS;
-}
-
-//-----------------------------------------------------------------------------
-// Create oriented bounding box from axis-aligned bounding box
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void BoundingOrientedBox::CreateFromBoundingBox(
-    BoundingOrientedBox& Out, const BoundingBox& box) noexcept {
-    Out.Center = box.Center;
-    Out.Extents = box.Extents;
-    Out.Orientation = XMFLOAT4(0.f, 0.f, 0.f, 1.f);
-}
-
-//-----------------------------------------------------------------------------
-// Find the approximate minimum oriented bounding box containing a set of
-// points.  Exact computation of minimum oriented bounding box is possible but
-// is slower and requires a more complex algorithm.
-// The algorithm works by computing the inertia tensor of the points and then
-// using the eigenvectors of the intertia tensor as the axes of the box.
-// Computing the intertia tensor of the convex hull of the points will usually
-// result in better bounding box but the computation is more complex.
-// Exact computation of the minimum oriented bounding box is possible but the
-// best know algorithm is O(N^3) and is significanly more complex to implement.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void BoundingOrientedBox::CreateFromPoints(
-    BoundingOrientedBox& Out, size_t Count, const XMFLOAT3* pPoints,
-    size_t Stride) noexcept {
-    assert(Count > 0);
-    assert(pPoints != nullptr);
-
-    XMVECTOR CenterOfMass = XMVectorZero();
-
-    // Compute the center of mass and inertia tensor of the points.
-    for (size_t i = 0; i < Count; ++i) {
-        XMVECTOR Point = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(
-            reinterpret_cast<const uint8_t*>(pPoints) + i * Stride));
-
-        CenterOfMass = XMVectorAdd(CenterOfMass, Point);
-    }
-
-    CenterOfMass = XMVectorMultiply(
-        CenterOfMass, XMVectorReciprocal(XMVectorReplicate(float(Count))));
-
-    // Compute the inertia tensor of the points around the center of mass.
-    // Using the center of mass is not strictly necessary, but will hopefully
-    // improve the stability of finding the eigenvectors.
-    XMVECTOR XX_YY_ZZ = XMVectorZero();
-    XMVECTOR XY_XZ_YZ = XMVectorZero();
-
-    for (size_t i = 0; i < Count; ++i) {
-        XMVECTOR Point = XMVectorSubtract(
-            XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(
-                reinterpret_cast<const uint8_t*>(pPoints) + i * Stride)),
-            CenterOfMass);
-
-        XX_YY_ZZ = XMVectorAdd(XX_YY_ZZ, XMVectorMultiply(Point, Point));
-
-        XMVECTOR XXY = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y,
-                                       XM_SWIZZLE_W>(Point);
-        XMVECTOR YZZ = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Z,
-                                       XM_SWIZZLE_W>(Point);
-
-        XY_XZ_YZ = XMVectorAdd(XY_XZ_YZ, XMVectorMultiply(XXY, YZZ));
-    }
-
-    XMVECTOR v1, v2, v3;
-
-    // Compute the eigenvectors of the inertia tensor.
-    DirectX::MathInternal::CalculateEigenVectorsFromCovarianceMatrix(
-        XMVectorGetX(XX_YY_ZZ), XMVectorGetY(XX_YY_ZZ), XMVectorGetZ(XX_YY_ZZ),
-        XMVectorGetX(XY_XZ_YZ), XMVectorGetY(XY_XZ_YZ), XMVectorGetZ(XY_XZ_YZ),
-        &v1, &v2, &v3);
-
-    // Put them in a matrix.
-    XMMATRIX R;
-
-    R.r[0] = XMVectorSetW(v1, 0.f);
-    R.r[1] = XMVectorSetW(v2, 0.f);
-    R.r[2] = XMVectorSetW(v3, 0.f);
-    R.r[3] = g_XMIdentityR3.v;
-
-    // Multiply by -1 to convert the matrix into a right handed coordinate
-    // system (Det ~= 1) in case the eigenvectors form a left handed
-    // coordinate system (Det ~= -1) because XMQuaternionRotationMatrix only
-    // works on right handed matrices.
-    XMVECTOR Det = XMMatrixDeterminant(R);
-
-    if (XMVector4Less(Det, XMVectorZero())) {
-        R.r[0] = XMVectorMultiply(R.r[0], g_XMNegativeOne.v);
-        R.r[1] = XMVectorMultiply(R.r[1], g_XMNegativeOne.v);
-        R.r[2] = XMVectorMultiply(R.r[2], g_XMNegativeOne.v);
-    }
-
-    // Get the rotation quaternion from the matrix.
-    XMVECTOR vOrientation = XMQuaternionRotationMatrix(R);
-
-    // Make sure it is normal (in case the vectors are slightly non-orthogonal).
-    vOrientation = XMQuaternionNormalize(vOrientation);
-
-    // Rebuild the rotation matrix from the quaternion.
-    R = XMMatrixRotationQuaternion(vOrientation);
-
-    // Build the rotation into the rotated space.
-    XMMATRIX InverseR = XMMatrixTranspose(R);
-
-    // Find the minimum OBB using the eigenvectors as the axes.
-    XMVECTOR vMin, vMax;
-
-    vMin = vMax = XMVector3TransformNormal(XMLoadFloat3(pPoints), InverseR);
-
-    for (size_t i = 1; i < Count; ++i) {
-        XMVECTOR Point = XMVector3TransformNormal(
-            XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(
-                reinterpret_cast<const uint8_t*>(pPoints) + i * Stride)),
-            InverseR);
-
-        vMin = XMVectorMin(vMin, Point);
-        vMax = XMVectorMax(vMax, Point);
-    }
-
-    // Rotate the center into world space.
-    XMVECTOR vCenter = XMVectorScale(XMVectorAdd(vMin, vMax), 0.5f);
-    vCenter = XMVector3TransformNormal(vCenter, R);
-
-    // Store center, extents, and orientation.
-    XMStoreFloat3(&Out.Center, vCenter);
-    XMStoreFloat3(&Out.Extents,
-                  XMVectorScale(XMVectorSubtract(vMax, vMin), 0.5f));
-    XMStoreFloat4(&Out.Orientation, vOrientation);
-}
-
-/****************************************************************************
- *
- * BoundingFrustum
- *
- ****************************************************************************/
-
-_Use_decl_annotations_ inline BoundingFrustum::BoundingFrustum(
-    CXMMATRIX Projection, bool rhcoords) noexcept {
-    CreateFromMatrix(*this, Projection, rhcoords);
-}
-
-//-----------------------------------------------------------------------------
-// Transform a frustum by an angle preserving transform.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-BoundingFrustum::Transform(BoundingFrustum& Out, FXMMATRIX M) const noexcept {
-    // Load the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(vOrientation));
-
-    // Composite the frustum rotation and the transform rotation
-    XMMATRIX nM;
-    nM.r[0] = XMVector3Normalize(M.r[0]);
-    nM.r[1] = XMVector3Normalize(M.r[1]);
-    nM.r[2] = XMVector3Normalize(M.r[2]);
-    nM.r[3] = g_XMIdentityR3;
-    XMVECTOR Rotation = XMQuaternionRotationMatrix(nM);
-    vOrientation = XMQuaternionMultiply(vOrientation, Rotation);
-
-    // Transform the center.
-    vOrigin = XMVector3Transform(vOrigin, M);
-
-    // Store the frustum.
-    XMStoreFloat3(&Out.Origin, vOrigin);
-    XMStoreFloat4(&Out.Orientation, vOrientation);
-
-    // Scale the near and far distances (the slopes remain the same).
-    XMVECTOR dX = XMVector3Dot(M.r[0], M.r[0]);
-    XMVECTOR dY = XMVector3Dot(M.r[1], M.r[1]);
-    XMVECTOR dZ = XMVector3Dot(M.r[2], M.r[2]);
-
-    XMVECTOR d = XMVectorMax(dX, XMVectorMax(dY, dZ));
-    float Scale = sqrtf(XMVectorGetX(d));
-
-    Out.Near = Near * Scale;
-    Out.Far = Far * Scale;
-
-    // Copy the slopes.
-    Out.RightSlope = RightSlope;
-    Out.LeftSlope = LeftSlope;
-    Out.TopSlope = TopSlope;
-    Out.BottomSlope = BottomSlope;
-}
-
-_Use_decl_annotations_ inline void XM_CALLCONV BoundingFrustum::Transform(
-    BoundingFrustum& Out, float Scale, FXMVECTOR Rotation,
-    FXMVECTOR Translation) const noexcept {
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(Rotation));
-
-    // Load the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(vOrientation));
-
-    // Composite the frustum rotation and the transform rotation.
-    vOrientation = XMQuaternionMultiply(vOrientation, Rotation);
-
-    // Transform the origin.
-    vOrigin = XMVectorAdd(
-        XMVector3Rotate(XMVectorScale(vOrigin, Scale), Rotation), Translation);
-
-    // Store the frustum.
-    XMStoreFloat3(&Out.Origin, vOrigin);
-    XMStoreFloat4(&Out.Orientation, vOrientation);
-
-    // Scale the near and far distances (the slopes remain the same).
-    Out.Near = Near * Scale;
-    Out.Far = Far * Scale;
-
-    // Copy the slopes.
-    Out.RightSlope = RightSlope;
-    Out.LeftSlope = LeftSlope;
-    Out.TopSlope = TopSlope;
-    Out.BottomSlope = BottomSlope;
-}
-
-//-----------------------------------------------------------------------------
-// Get the corner points of the frustum
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void BoundingFrustum::GetCorners(
-    XMFLOAT3* Corners) const noexcept {
-    assert(Corners != nullptr);
-
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(vOrientation));
-
-    // Build the corners of the frustum.
-    XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f);
-    XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f);
-    XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f);
-    XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f);
-    XMVECTOR vNear = XMVectorReplicatePtr(&Near);
-    XMVECTOR vFar = XMVectorReplicatePtr(&Far);
-
-    // Returns 8 corners position of bounding frustum.
-    //     Near    Far
-    //    0----1  4----5
-    //    |    |  |    |
-    //    |    |  |    |
-    //    3----2  7----6
-
-    XMVECTOR vCorners[CORNER_COUNT];
-    vCorners[0] = XMVectorMultiply(vLeftTop, vNear);
-    vCorners[1] = XMVectorMultiply(vRightTop, vNear);
-    vCorners[2] = XMVectorMultiply(vRightBottom, vNear);
-    vCorners[3] = XMVectorMultiply(vLeftBottom, vNear);
-    vCorners[4] = XMVectorMultiply(vLeftTop, vFar);
-    vCorners[5] = XMVectorMultiply(vRightTop, vFar);
-    vCorners[6] = XMVectorMultiply(vRightBottom, vFar);
-    vCorners[7] = XMVectorMultiply(vLeftBottom, vFar);
-
-    for (size_t i = 0; i < CORNER_COUNT; ++i) {
-        XMVECTOR C =
-            XMVectorAdd(XMVector3Rotate(vCorners[i], vOrientation), vOrigin);
-        XMStoreFloat3(&Corners[i], C);
-    }
-}
-
-//-----------------------------------------------------------------------------
-// Point in frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType XM_CALLCONV
-BoundingFrustum::Contains(FXMVECTOR Point) const noexcept {
-    // Build frustum planes.
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near);
-    Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far);
-    Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
-    Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
-    Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
-    Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
-
-    // Load origin and orientation.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(vOrientation));
-
-    // Transform point into local space of frustum.
-    XMVECTOR TPoint =
-        XMVector3InverseRotate(XMVectorSubtract(Point, vOrigin), vOrientation);
-
-    // Set w to one.
-    TPoint = XMVectorInsert<0, 0, 0, 0, 1>(TPoint, XMVectorSplatOne());
-
-    XMVECTOR Zero = XMVectorZero();
-    XMVECTOR Outside = Zero;
-
-    // Test point against each plane of the frustum.
-    for (size_t i = 0; i < 6; ++i) {
-        XMVECTOR Dot = XMVector4Dot(TPoint, Planes[i]);
-        Outside = XMVectorOrInt(Outside, XMVectorGreater(Dot, Zero));
-    }
-
-    return XMVector4NotEqualInt(Outside, XMVectorTrueInt()) ? CONTAINS
-                                                            : DISJOINT;
-}
-
-//-----------------------------------------------------------------------------
-// Triangle vs frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType XM_CALLCONV
-BoundingFrustum::Contains(FXMVECTOR V0, FXMVECTOR V1,
-                          FXMVECTOR V2) const noexcept {
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    // Create 6 planes (do it inline to encourage use of registers)
-    XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near);
-    NearPlane = DirectX::MathInternal::XMPlaneTransform(NearPlane, vOrientation,
-                                                        vOrigin);
-    NearPlane = XMPlaneNormalize(NearPlane);
-
-    XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far);
-    FarPlane = DirectX::MathInternal::XMPlaneTransform(FarPlane, vOrientation,
-                                                       vOrigin);
-    FarPlane = XMPlaneNormalize(FarPlane);
-
-    XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
-    RightPlane = DirectX::MathInternal::XMPlaneTransform(RightPlane,
-                                                         vOrientation, vOrigin);
-    RightPlane = XMPlaneNormalize(RightPlane);
-
-    XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
-    LeftPlane = DirectX::MathInternal::XMPlaneTransform(LeftPlane, vOrientation,
-                                                        vOrigin);
-    LeftPlane = XMPlaneNormalize(LeftPlane);
-
-    XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
-    TopPlane = DirectX::MathInternal::XMPlaneTransform(TopPlane, vOrientation,
-                                                       vOrigin);
-    TopPlane = XMPlaneNormalize(TopPlane);
-
-    XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
-    BottomPlane = DirectX::MathInternal::XMPlaneTransform(
-        BottomPlane, vOrientation, vOrigin);
-    BottomPlane = XMPlaneNormalize(BottomPlane);
-
-    return TriangleTests::ContainedBy(V0, V1, V2, NearPlane, FarPlane,
-                                      RightPlane, LeftPlane, TopPlane,
-                                      BottomPlane);
-}
-
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingFrustum::Contains(
-    const BoundingSphere& sh) const noexcept {
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    // Create 6 planes (do it inline to encourage use of registers)
-    XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near);
-    NearPlane = DirectX::MathInternal::XMPlaneTransform(NearPlane, vOrientation,
-                                                        vOrigin);
-    NearPlane = XMPlaneNormalize(NearPlane);
-
-    XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far);
-    FarPlane = DirectX::MathInternal::XMPlaneTransform(FarPlane, vOrientation,
-                                                       vOrigin);
-    FarPlane = XMPlaneNormalize(FarPlane);
-
-    XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
-    RightPlane = DirectX::MathInternal::XMPlaneTransform(RightPlane,
-                                                         vOrientation, vOrigin);
-    RightPlane = XMPlaneNormalize(RightPlane);
-
-    XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
-    LeftPlane = DirectX::MathInternal::XMPlaneTransform(LeftPlane, vOrientation,
-                                                        vOrigin);
-    LeftPlane = XMPlaneNormalize(LeftPlane);
-
-    XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
-    TopPlane = DirectX::MathInternal::XMPlaneTransform(TopPlane, vOrientation,
-                                                       vOrigin);
-    TopPlane = XMPlaneNormalize(TopPlane);
-
-    XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
-    BottomPlane = DirectX::MathInternal::XMPlaneTransform(
-        BottomPlane, vOrientation, vOrigin);
-    BottomPlane = XMPlaneNormalize(BottomPlane);
-
-    return sh.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane,
-                          BottomPlane);
-}
-
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingFrustum::Contains(
-    const BoundingBox& box) const noexcept {
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    // Create 6 planes (do it inline to encourage use of registers)
-    XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near);
-    NearPlane = DirectX::MathInternal::XMPlaneTransform(NearPlane, vOrientation,
-                                                        vOrigin);
-    NearPlane = XMPlaneNormalize(NearPlane);
-
-    XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far);
-    FarPlane = DirectX::MathInternal::XMPlaneTransform(FarPlane, vOrientation,
-                                                       vOrigin);
-    FarPlane = XMPlaneNormalize(FarPlane);
-
-    XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
-    RightPlane = DirectX::MathInternal::XMPlaneTransform(RightPlane,
-                                                         vOrientation, vOrigin);
-    RightPlane = XMPlaneNormalize(RightPlane);
-
-    XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
-    LeftPlane = DirectX::MathInternal::XMPlaneTransform(LeftPlane, vOrientation,
-                                                        vOrigin);
-    LeftPlane = XMPlaneNormalize(LeftPlane);
-
-    XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
-    TopPlane = DirectX::MathInternal::XMPlaneTransform(TopPlane, vOrientation,
-                                                       vOrigin);
-    TopPlane = XMPlaneNormalize(TopPlane);
-
-    XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
-    BottomPlane = DirectX::MathInternal::XMPlaneTransform(
-        BottomPlane, vOrientation, vOrigin);
-    BottomPlane = XMPlaneNormalize(BottomPlane);
-
-    return box.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane,
-                           BottomPlane);
-}
-
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingFrustum::Contains(
-    const BoundingOrientedBox& box) const noexcept {
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    // Create 6 planes (do it inline to encourage use of registers)
-    XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near);
-    NearPlane = DirectX::MathInternal::XMPlaneTransform(NearPlane, vOrientation,
-                                                        vOrigin);
-    NearPlane = XMPlaneNormalize(NearPlane);
-
-    XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far);
-    FarPlane = DirectX::MathInternal::XMPlaneTransform(FarPlane, vOrientation,
-                                                       vOrigin);
-    FarPlane = XMPlaneNormalize(FarPlane);
-
-    XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
-    RightPlane = DirectX::MathInternal::XMPlaneTransform(RightPlane,
-                                                         vOrientation, vOrigin);
-    RightPlane = XMPlaneNormalize(RightPlane);
-
-    XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
-    LeftPlane = DirectX::MathInternal::XMPlaneTransform(LeftPlane, vOrientation,
-                                                        vOrigin);
-    LeftPlane = XMPlaneNormalize(LeftPlane);
-
-    XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
-    TopPlane = DirectX::MathInternal::XMPlaneTransform(TopPlane, vOrientation,
-                                                       vOrigin);
-    TopPlane = XMPlaneNormalize(TopPlane);
-
-    XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
-    BottomPlane = DirectX::MathInternal::XMPlaneTransform(
-        BottomPlane, vOrientation, vOrigin);
-    BottomPlane = XMPlaneNormalize(BottomPlane);
-
-    return box.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane,
-                           BottomPlane);
-}
-
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType BoundingFrustum::Contains(
-    const BoundingFrustum& fr) const noexcept {
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    // Create 6 planes (do it inline to encourage use of registers)
-    XMVECTOR NearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near);
-    NearPlane = DirectX::MathInternal::XMPlaneTransform(NearPlane, vOrientation,
-                                                        vOrigin);
-    NearPlane = XMPlaneNormalize(NearPlane);
-
-    XMVECTOR FarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far);
-    FarPlane = DirectX::MathInternal::XMPlaneTransform(FarPlane, vOrientation,
-                                                       vOrigin);
-    FarPlane = XMPlaneNormalize(FarPlane);
-
-    XMVECTOR RightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
-    RightPlane = DirectX::MathInternal::XMPlaneTransform(RightPlane,
-                                                         vOrientation, vOrigin);
-    RightPlane = XMPlaneNormalize(RightPlane);
-
-    XMVECTOR LeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
-    LeftPlane = DirectX::MathInternal::XMPlaneTransform(LeftPlane, vOrientation,
-                                                        vOrigin);
-    LeftPlane = XMPlaneNormalize(LeftPlane);
-
-    XMVECTOR TopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
-    TopPlane = DirectX::MathInternal::XMPlaneTransform(TopPlane, vOrientation,
-                                                       vOrigin);
-    TopPlane = XMPlaneNormalize(TopPlane);
-
-    XMVECTOR BottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
-    BottomPlane = DirectX::MathInternal::XMPlaneTransform(
-        BottomPlane, vOrientation, vOrigin);
-    BottomPlane = XMPlaneNormalize(BottomPlane);
-
-    return fr.ContainedBy(NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane,
-                          BottomPlane);
-}
-
-//-----------------------------------------------------------------------------
-// Exact sphere vs frustum test.  The algorithm first checks the sphere against
-// the planes of the frustum, then if the plane checks were indeterminate finds
-// the nearest feature (plane, line, point) on the frustum to the center of the
-// sphere and compares the distance to the nearest feature to the radius of the
-// sphere
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingFrustum::Intersects(
-    const BoundingSphere& sh) const noexcept {
-    XMVECTOR Zero = XMVectorZero();
-
-    // Build the frustum planes.
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near);
-    Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far);
-    Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
-    Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
-    Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
-    Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
-
-    // Normalize the planes so we can compare to the sphere radius.
-    Planes[2] = XMVector3Normalize(Planes[2]);
-    Planes[3] = XMVector3Normalize(Planes[3]);
-    Planes[4] = XMVector3Normalize(Planes[4]);
-    Planes[5] = XMVector3Normalize(Planes[5]);
-
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(vOrientation));
-
-    // Load the sphere.
-    XMVECTOR vCenter = XMLoadFloat3(&sh.Center);
-    XMVECTOR vRadius = XMVectorReplicatePtr(&sh.Radius);
-
-    // Transform the center of the sphere into the local space of frustum.
-    vCenter = XMVector3InverseRotate(XMVectorSubtract(vCenter, vOrigin),
-                                     vOrientation);
-
-    // Set w of the center to one so we can dot4 with the plane.
-    vCenter = XMVectorInsert<0, 0, 0, 0, 1>(vCenter, XMVectorSplatOne());
-
-    // Check against each plane of the frustum.
-    XMVECTOR Outside = XMVectorFalseInt();
-    XMVECTOR InsideAll = XMVectorTrueInt();
-    XMVECTOR CenterInsideAll = XMVectorTrueInt();
-
-    XMVECTOR Dist[6];
-
-    for (size_t i = 0; i < 6; ++i) {
-        Dist[i] = XMVector4Dot(vCenter, Planes[i]);
-
-        // Outside the plane?
-        Outside = XMVectorOrInt(Outside, XMVectorGreater(Dist[i], vRadius));
-
-        // Fully inside the plane?
-        InsideAll = XMVectorAndInt(
-            InsideAll, XMVectorLessOrEqual(Dist[i], XMVectorNegate(vRadius)));
-
-        // Check if the center is inside the plane.
-        CenterInsideAll =
-            XMVectorAndInt(CenterInsideAll, XMVectorLessOrEqual(Dist[i], Zero));
-    }
-
-    // If the sphere is outside any of the planes it is outside.
-    if (XMVector4EqualInt(Outside, XMVectorTrueInt())) return false;
-
-    // If the sphere is inside all planes it is fully inside.
-    if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) return true;
-
-    // If the center of the sphere is inside all planes and the sphere
-    // intersects one or more planes then it must intersect.
-    if (XMVector4EqualInt(CenterInsideAll, XMVectorTrueInt())) return true;
-
-    // The sphere may be outside the frustum or intersecting the frustum.
-    // Find the nearest feature (face, edge, or corner) on the frustum
-    // to the sphere.
-
-    // The faces adjacent to each face are:
-    static const size_t adjacent_faces[6][4] = {{2, 3, 4, 5},   // 0
-                                                {2, 3, 4, 5},   // 1
-                                                {0, 1, 4, 5},   // 2
-                                                {0, 1, 4, 5},   // 3
-                                                {0, 1, 2, 3},   // 4
-                                                {0, 1, 2, 3}};  // 5
-
-    XMVECTOR Intersects = XMVectorFalseInt();
-
-    // Check to see if the nearest feature is one of the planes.
-    for (size_t i = 0; i < 6; ++i) {
-        // Find the nearest point on the plane to the center of the sphere.
-        XMVECTOR Point =
-            XMVectorNegativeMultiplySubtract(Planes[i], Dist[i], vCenter);
-
-        // Set w of the point to one.
-        Point = XMVectorInsert<0, 0, 0, 0, 1>(Point, XMVectorSplatOne());
-
-        // If the point is inside the face (inside the adjacent planes) then
-        // this plane is the nearest feature.
-        XMVECTOR InsideFace = XMVectorTrueInt();
-
-        for (size_t j = 0; j < 4; j++) {
-            size_t plane_index = adjacent_faces[i][j];
-
-            InsideFace = XMVectorAndInt(
-                InsideFace,
-                XMVectorLessOrEqual(XMVector4Dot(Point, Planes[plane_index]),
-                                    Zero));
-        }
-
-        // Since we have already checked distance from the plane we know that
-        // the sphere must intersect if this plane is the nearest feature.
-        Intersects = XMVectorOrInt(
-            Intersects,
-            XMVectorAndInt(XMVectorGreater(Dist[i], Zero), InsideFace));
-    }
-
-    if (XMVector4EqualInt(Intersects, XMVectorTrueInt())) return true;
-
-    // Build the corners of the frustum.
-    XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f);
-    XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f);
-    XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f);
-    XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f);
-    XMVECTOR vNear = XMVectorReplicatePtr(&Near);
-    XMVECTOR vFar = XMVectorReplicatePtr(&Far);
-
-    XMVECTOR Corners[CORNER_COUNT];
-    Corners[0] = XMVectorMultiply(vRightTop, vNear);
-    Corners[1] = XMVectorMultiply(vRightBottom, vNear);
-    Corners[2] = XMVectorMultiply(vLeftTop, vNear);
-    Corners[3] = XMVectorMultiply(vLeftBottom, vNear);
-    Corners[4] = XMVectorMultiply(vRightTop, vFar);
-    Corners[5] = XMVectorMultiply(vRightBottom, vFar);
-    Corners[6] = XMVectorMultiply(vLeftTop, vFar);
-    Corners[7] = XMVectorMultiply(vLeftBottom, vFar);
-
-    // The Edges are:
-    static const size_t edges[12][2] = {
-        {0, 1}, {2, 3}, {0, 2}, {1, 3},  // Near plane
-        {4, 5}, {6, 7}, {4, 6}, {5, 7},  // Far plane
-        {0, 4}, {1, 5}, {2, 6}, {3, 7},
-    };  // Near to far
-
-    XMVECTOR RadiusSq = XMVectorMultiply(vRadius, vRadius);
-
-    // Check to see if the nearest feature is one of the edges (or corners).
-    for (size_t i = 0; i < 12; ++i) {
-        size_t ei0 = edges[i][0];
-        size_t ei1 = edges[i][1];
-
-        // Find the nearest point on the edge to the center of the sphere.
-        // The corners of the frustum are included as the endpoints of the
-        // edges.
-        XMVECTOR Point = DirectX::MathInternal::PointOnLineSegmentNearestPoint(
-            Corners[ei0], Corners[ei1], vCenter);
-
-        XMVECTOR Delta = XMVectorSubtract(vCenter, Point);
-
-        XMVECTOR DistSq = XMVector3Dot(Delta, Delta);
-
-        // If the distance to the center of the sphere to the point is less than
-        // the radius of the sphere then it must intersect.
-        Intersects =
-            XMVectorOrInt(Intersects, XMVectorLessOrEqual(DistSq, RadiusSq));
-    }
-
-    if (XMVector4EqualInt(Intersects, XMVectorTrueInt())) return true;
-
-    // The sphere must be outside the frustum.
-    return false;
-}
-
-//-----------------------------------------------------------------------------
-// Exact axis aligned box vs frustum test.  Constructs an oriented box and uses
-// the oriented box vs frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingFrustum::Intersects(
-    const BoundingBox& box) const noexcept {
-    // Make the axis aligned box oriented and do an OBB vs frustum test.
-    BoundingOrientedBox obox(box.Center, box.Extents,
-                             XMFLOAT4(0.f, 0.f, 0.f, 1.f));
-    return Intersects(obox);
-}
-
-//-----------------------------------------------------------------------------
-// Exact oriented box vs frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingFrustum::Intersects(
-    const BoundingOrientedBox& box) const noexcept {
-    static const XMVECTORU32 SelectY = {
-        {{XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0}}};
-    static const XMVECTORU32 SelectZ = {
-        {{XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0}}};
-
-    XMVECTOR Zero = XMVectorZero();
-
-    // Build the frustum planes.
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near);
-    Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far);
-    Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
-    Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
-    Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
-    Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
-
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR FrustumOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(FrustumOrientation));
-
-    // Load the box.
-    XMVECTOR Center = XMLoadFloat3(&box.Center);
-    XMVECTOR Extents = XMLoadFloat3(&box.Extents);
-    XMVECTOR BoxOrientation = XMLoadFloat4(&box.Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(BoxOrientation));
-
-    // Transform the oriented box into the space of the frustum in order to
-    // minimize the number of transforms we have to do.
-    Center = XMVector3InverseRotate(XMVectorSubtract(Center, vOrigin),
-                                    FrustumOrientation);
-    BoxOrientation = XMQuaternionMultiply(
-        BoxOrientation, XMQuaternionConjugate(FrustumOrientation));
-
-    // Set w of the center to one so we can dot4 with the plane.
-    Center = XMVectorInsert<0, 0, 0, 0, 1>(Center, XMVectorSplatOne());
-
-    // Build the 3x3 rotation matrix that defines the box axes.
-    XMMATRIX R = XMMatrixRotationQuaternion(BoxOrientation);
-
-    // Check against each plane of the frustum.
-    XMVECTOR Outside = XMVectorFalseInt();
-    XMVECTOR InsideAll = XMVectorTrueInt();
-    XMVECTOR CenterInsideAll = XMVectorTrueInt();
-
-    for (size_t i = 0; i < 6; ++i) {
-        // Compute the distance to the center of the box.
-        XMVECTOR Dist = XMVector4Dot(Center, Planes[i]);
-
-        // Project the axes of the box onto the normal of the plane.  Half the
-        // length of the projection (sometime called the "radius") is equal to
-        // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot
-        // b(w)) where h(i) are extents of the box, n is the plane normal, and
-        // b(i) are the axes of the box.
-        XMVECTOR Radius = XMVector3Dot(Planes[i], R.r[0]);
-        Radius =
-            XMVectorSelect(Radius, XMVector3Dot(Planes[i], R.r[1]), SelectY);
-        Radius =
-            XMVectorSelect(Radius, XMVector3Dot(Planes[i], R.r[2]), SelectZ);
-        Radius = XMVector3Dot(Extents, XMVectorAbs(Radius));
-
-        // Outside the plane?
-        Outside = XMVectorOrInt(Outside, XMVectorGreater(Dist, Radius));
-
-        // Fully inside the plane?
-        InsideAll = XMVectorAndInt(
-            InsideAll, XMVectorLessOrEqual(Dist, XMVectorNegate(Radius)));
-
-        // Check if the center is inside the plane.
-        CenterInsideAll =
-            XMVectorAndInt(CenterInsideAll, XMVectorLessOrEqual(Dist, Zero));
-    }
-
-    // If the box is outside any of the planes it is outside.
-    if (XMVector4EqualInt(Outside, XMVectorTrueInt())) return false;
-
-    // If the box is inside all planes it is fully inside.
-    if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) return true;
-
-    // If the center of the box is inside all planes and the box intersects
-    // one or more planes then it must intersect.
-    if (XMVector4EqualInt(CenterInsideAll, XMVectorTrueInt())) return true;
-
-    // Build the corners of the frustum.
-    XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f);
-    XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f);
-    XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f);
-    XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f);
-    XMVECTOR vNear = XMVectorReplicatePtr(&Near);
-    XMVECTOR vFar = XMVectorReplicatePtr(&Far);
-
-    XMVECTOR Corners[CORNER_COUNT];
-    Corners[0] = XMVectorMultiply(vRightTop, vNear);
-    Corners[1] = XMVectorMultiply(vRightBottom, vNear);
-    Corners[2] = XMVectorMultiply(vLeftTop, vNear);
-    Corners[3] = XMVectorMultiply(vLeftBottom, vNear);
-    Corners[4] = XMVectorMultiply(vRightTop, vFar);
-    Corners[5] = XMVectorMultiply(vRightBottom, vFar);
-    Corners[6] = XMVectorMultiply(vLeftTop, vFar);
-    Corners[7] = XMVectorMultiply(vLeftBottom, vFar);
-
-    // Test against box axes (3)
-    {
-        // Find the min/max values of the projection of the frustum onto each
-        // axis.
-        XMVECTOR FrustumMin, FrustumMax;
-
-        FrustumMin = XMVector3Dot(Corners[0], R.r[0]);
-        FrustumMin = XMVectorSelect(FrustumMin,
-                                    XMVector3Dot(Corners[0], R.r[1]), SelectY);
-        FrustumMin = XMVectorSelect(FrustumMin,
-                                    XMVector3Dot(Corners[0], R.r[2]), SelectZ);
-        FrustumMax = FrustumMin;
-
-        for (size_t i = 1; i < BoundingOrientedBox::CORNER_COUNT; ++i) {
-            XMVECTOR Temp = XMVector3Dot(Corners[i], R.r[0]);
-            Temp =
-                XMVectorSelect(Temp, XMVector3Dot(Corners[i], R.r[1]), SelectY);
-            Temp =
-                XMVectorSelect(Temp, XMVector3Dot(Corners[i], R.r[2]), SelectZ);
-
-            FrustumMin = XMVectorMin(FrustumMin, Temp);
-            FrustumMax = XMVectorMax(FrustumMax, Temp);
-        }
-
-        // Project the center of the box onto the axes.
-        XMVECTOR BoxDist = XMVector3Dot(Center, R.r[0]);
-        BoxDist =
-            XMVectorSelect(BoxDist, XMVector3Dot(Center, R.r[1]), SelectY);
-        BoxDist =
-            XMVectorSelect(BoxDist, XMVector3Dot(Center, R.r[2]), SelectZ);
-
-        // The projection of the box onto the axis is just its Center and
-        // Extents. if (min > box_max || max < box_min) reject;
-        XMVECTOR Result = XMVectorOrInt(
-            XMVectorGreater(FrustumMin, XMVectorAdd(BoxDist, Extents)),
-            XMVectorLess(FrustumMax, XMVectorSubtract(BoxDist, Extents)));
-
-        if (DirectX::MathInternal::XMVector3AnyTrue(Result)) return false;
-    }
-
-    // Test against edge/edge axes (3*6).
-    XMVECTOR FrustumEdgeAxis[6];
-
-    FrustumEdgeAxis[0] = vRightTop;
-    FrustumEdgeAxis[1] = vRightBottom;
-    FrustumEdgeAxis[2] = vLeftTop;
-    FrustumEdgeAxis[3] = vLeftBottom;
-    FrustumEdgeAxis[4] = XMVectorSubtract(vRightTop, vLeftTop);
-    FrustumEdgeAxis[5] = XMVectorSubtract(vLeftBottom, vLeftTop);
-
-    for (size_t i = 0; i < 3; ++i) {
-        for (size_t j = 0; j < 6; j++) {
-            // Compute the axis we are going to test.
-            XMVECTOR Axis = XMVector3Cross(R.r[i], FrustumEdgeAxis[j]);
-
-            // Find the min/max values of the projection of the frustum onto the
-            // axis.
-            XMVECTOR FrustumMin, FrustumMax;
-
-            FrustumMin = FrustumMax = XMVector3Dot(Axis, Corners[0]);
-
-            for (size_t k = 1; k < CORNER_COUNT; k++) {
-                XMVECTOR Temp = XMVector3Dot(Axis, Corners[k]);
-                FrustumMin = XMVectorMin(FrustumMin, Temp);
-                FrustumMax = XMVectorMax(FrustumMax, Temp);
-            }
-
-            // Project the center of the box onto the axis.
-            XMVECTOR Dist = XMVector3Dot(Center, Axis);
-
-            // Project the axes of the box onto the axis to find the "radius" of
-            // the box.
-            XMVECTOR Radius = XMVector3Dot(Axis, R.r[0]);
-            Radius =
-                XMVectorSelect(Radius, XMVector3Dot(Axis, R.r[1]), SelectY);
-            Radius =
-                XMVectorSelect(Radius, XMVector3Dot(Axis, R.r[2]), SelectZ);
-            Radius = XMVector3Dot(Extents, XMVectorAbs(Radius));
-
-            // if (center > max + radius || center < min - radius) reject;
-            Outside = XMVectorOrInt(
-                Outside,
-                XMVectorGreater(Dist, XMVectorAdd(FrustumMax, Radius)));
-            Outside = XMVectorOrInt(
-                Outside,
-                XMVectorLess(Dist, XMVectorSubtract(FrustumMin, Radius)));
-        }
-    }
-
-    if (XMVector4EqualInt(Outside, XMVectorTrueInt())) return false;
-
-    // If we did not find a separating plane then the box must intersect the
-    // frustum.
-    return true;
-}
-
-//-----------------------------------------------------------------------------
-// Exact frustum vs frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool BoundingFrustum::Intersects(
-    const BoundingFrustum& fr) const noexcept {
-    // Load origin and orientation of frustum B.
-    XMVECTOR OriginB = XMLoadFloat3(&Origin);
-    XMVECTOR OrientationB = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(OrientationB));
-
-    // Build the planes of frustum B.
-    XMVECTOR AxisB[6];
-    AxisB[0] = XMVectorSet(0.0f, 0.0f, -1.0f, 0.0f);
-    AxisB[1] = XMVectorSet(0.0f, 0.0f, 1.0f, 0.0f);
-    AxisB[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
-    AxisB[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
-    AxisB[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
-    AxisB[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
-
-    XMVECTOR PlaneDistB[6];
-    PlaneDistB[0] = XMVectorNegate(XMVectorReplicatePtr(&Near));
-    PlaneDistB[1] = XMVectorReplicatePtr(&Far);
-    PlaneDistB[2] = XMVectorZero();
-    PlaneDistB[3] = XMVectorZero();
-    PlaneDistB[4] = XMVectorZero();
-    PlaneDistB[5] = XMVectorZero();
-
-    // Load origin and orientation of frustum A.
-    XMVECTOR OriginA = XMLoadFloat3(&fr.Origin);
-    XMVECTOR OrientationA = XMLoadFloat4(&fr.Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(OrientationA));
-
-    // Transform frustum A into the space of the frustum B in order to
-    // minimize the number of transforms we have to do.
-    OriginA = XMVector3InverseRotate(XMVectorSubtract(OriginA, OriginB),
-                                     OrientationB);
-    OrientationA =
-        XMQuaternionMultiply(OrientationA, XMQuaternionConjugate(OrientationB));
-
-    // Build the corners of frustum A (in the local space of B).
-    XMVECTOR RightTopA = XMVectorSet(fr.RightSlope, fr.TopSlope, 1.0f, 0.0f);
-    XMVECTOR RightBottomA =
-        XMVectorSet(fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f);
-    XMVECTOR LeftTopA = XMVectorSet(fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f);
-    XMVECTOR LeftBottomA =
-        XMVectorSet(fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f);
-    XMVECTOR NearA = XMVectorReplicatePtr(&fr.Near);
-    XMVECTOR FarA = XMVectorReplicatePtr(&fr.Far);
-
-    RightTopA = XMVector3Rotate(RightTopA, OrientationA);
-    RightBottomA = XMVector3Rotate(RightBottomA, OrientationA);
-    LeftTopA = XMVector3Rotate(LeftTopA, OrientationA);
-    LeftBottomA = XMVector3Rotate(LeftBottomA, OrientationA);
-
-    XMVECTOR CornersA[CORNER_COUNT];
-    CornersA[0] = XMVectorMultiplyAdd(RightTopA, NearA, OriginA);
-    CornersA[1] = XMVectorMultiplyAdd(RightBottomA, NearA, OriginA);
-    CornersA[2] = XMVectorMultiplyAdd(LeftTopA, NearA, OriginA);
-    CornersA[3] = XMVectorMultiplyAdd(LeftBottomA, NearA, OriginA);
-    CornersA[4] = XMVectorMultiplyAdd(RightTopA, FarA, OriginA);
-    CornersA[5] = XMVectorMultiplyAdd(RightBottomA, FarA, OriginA);
-    CornersA[6] = XMVectorMultiplyAdd(LeftTopA, FarA, OriginA);
-    CornersA[7] = XMVectorMultiplyAdd(LeftBottomA, FarA, OriginA);
-
-    // Check frustum A against each plane of frustum B.
-    XMVECTOR Outside = XMVectorFalseInt();
-    XMVECTOR InsideAll = XMVectorTrueInt();
-
-    for (size_t i = 0; i < 6; ++i) {
-        // Find the min/max projection of the frustum onto the plane normal.
-        XMVECTOR Min, Max;
-
-        Min = Max = XMVector3Dot(AxisB[i], CornersA[0]);
-
-        for (size_t j = 1; j < CORNER_COUNT; j++) {
-            XMVECTOR Temp = XMVector3Dot(AxisB[i], CornersA[j]);
-            Min = XMVectorMin(Min, Temp);
-            Max = XMVectorMax(Max, Temp);
-        }
-
-        // Outside the plane?
-        Outside = XMVectorOrInt(Outside, XMVectorGreater(Min, PlaneDistB[i]));
-
-        // Fully inside the plane?
-        InsideAll =
-            XMVectorAndInt(InsideAll, XMVectorLessOrEqual(Max, PlaneDistB[i]));
-    }
-
-    // If the frustum A is outside any of the planes of frustum B it is outside.
-    if (XMVector4EqualInt(Outside, XMVectorTrueInt())) return false;
-
-    // If frustum A is inside all planes of frustum B it is fully inside.
-    if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) return true;
-
-    // Build the corners of frustum B.
-    XMVECTOR RightTopB = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f);
-    XMVECTOR RightBottomB = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f);
-    XMVECTOR LeftTopB = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f);
-    XMVECTOR LeftBottomB = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f);
-    XMVECTOR NearB = XMVectorReplicatePtr(&Near);
-    XMVECTOR FarB = XMVectorReplicatePtr(&Far);
-
-    XMVECTOR CornersB[BoundingFrustum::CORNER_COUNT];
-    CornersB[0] = XMVectorMultiply(RightTopB, NearB);
-    CornersB[1] = XMVectorMultiply(RightBottomB, NearB);
-    CornersB[2] = XMVectorMultiply(LeftTopB, NearB);
-    CornersB[3] = XMVectorMultiply(LeftBottomB, NearB);
-    CornersB[4] = XMVectorMultiply(RightTopB, FarB);
-    CornersB[5] = XMVectorMultiply(RightBottomB, FarB);
-    CornersB[6] = XMVectorMultiply(LeftTopB, FarB);
-    CornersB[7] = XMVectorMultiply(LeftBottomB, FarB);
-
-    // Build the planes of frustum A (in the local space of B).
-    XMVECTOR AxisA[6];
-    XMVECTOR PlaneDistA[6];
-
-    AxisA[0] = XMVectorSet(0.0f, 0.0f, -1.0f, 0.0f);
-    AxisA[1] = XMVectorSet(0.0f, 0.0f, 1.0f, 0.0f);
-    AxisA[2] = XMVectorSet(1.0f, 0.0f, -fr.RightSlope, 0.0f);
-    AxisA[3] = XMVectorSet(-1.0f, 0.0f, fr.LeftSlope, 0.0f);
-    AxisA[4] = XMVectorSet(0.0f, 1.0f, -fr.TopSlope, 0.0f);
-    AxisA[5] = XMVectorSet(0.0f, -1.0f, fr.BottomSlope, 0.0f);
-
-    AxisA[0] = XMVector3Rotate(AxisA[0], OrientationA);
-    AxisA[1] = XMVectorNegate(AxisA[0]);
-    AxisA[2] = XMVector3Rotate(AxisA[2], OrientationA);
-    AxisA[3] = XMVector3Rotate(AxisA[3], OrientationA);
-    AxisA[4] = XMVector3Rotate(AxisA[4], OrientationA);
-    AxisA[5] = XMVector3Rotate(AxisA[5], OrientationA);
-
-    PlaneDistA[0] =
-        XMVector3Dot(AxisA[0], CornersA[0]);  // Re-use corner on near plane.
-    PlaneDistA[1] =
-        XMVector3Dot(AxisA[1], CornersA[4]);  // Re-use corner on far plane.
-    PlaneDistA[2] = XMVector3Dot(AxisA[2], OriginA);
-    PlaneDistA[3] = XMVector3Dot(AxisA[3], OriginA);
-    PlaneDistA[4] = XMVector3Dot(AxisA[4], OriginA);
-    PlaneDistA[5] = XMVector3Dot(AxisA[5], OriginA);
-
-    // Check each axis of frustum A for a seperating plane (5).
-    for (size_t i = 0; i < 6; ++i) {
-        // Find the minimum projection of the frustum onto the plane normal.
-        XMVECTOR Min;
-
-        Min = XMVector3Dot(AxisA[i], CornersB[0]);
-
-        for (size_t j = 1; j < CORNER_COUNT; j++) {
-            XMVECTOR Temp = XMVector3Dot(AxisA[i], CornersB[j]);
-            Min = XMVectorMin(Min, Temp);
-        }
-
-        // Outside the plane?
-        Outside = XMVectorOrInt(Outside, XMVectorGreater(Min, PlaneDistA[i]));
-    }
-
-    // If the frustum B is outside any of the planes of frustum A it is outside.
-    if (XMVector4EqualInt(Outside, XMVectorTrueInt())) return false;
-
-    // Check edge/edge axes (6 * 6).
-    XMVECTOR FrustumEdgeAxisA[6];
-    FrustumEdgeAxisA[0] = RightTopA;
-    FrustumEdgeAxisA[1] = RightBottomA;
-    FrustumEdgeAxisA[2] = LeftTopA;
-    FrustumEdgeAxisA[3] = LeftBottomA;
-    FrustumEdgeAxisA[4] = XMVectorSubtract(RightTopA, LeftTopA);
-    FrustumEdgeAxisA[5] = XMVectorSubtract(LeftBottomA, LeftTopA);
-
-    XMVECTOR FrustumEdgeAxisB[6];
-    FrustumEdgeAxisB[0] = RightTopB;
-    FrustumEdgeAxisB[1] = RightBottomB;
-    FrustumEdgeAxisB[2] = LeftTopB;
-    FrustumEdgeAxisB[3] = LeftBottomB;
-    FrustumEdgeAxisB[4] = XMVectorSubtract(RightTopB, LeftTopB);
-    FrustumEdgeAxisB[5] = XMVectorSubtract(LeftBottomB, LeftTopB);
-
-    for (size_t i = 0; i < 6; ++i) {
-        for (size_t j = 0; j < 6; j++) {
-            // Compute the axis we are going to test.
-            XMVECTOR Axis =
-                XMVector3Cross(FrustumEdgeAxisA[i], FrustumEdgeAxisB[j]);
-
-            // Find the min/max values of the projection of both frustums onto
-            // the axis.
-            XMVECTOR MinA, MaxA;
-            XMVECTOR MinB, MaxB;
-
-            MinA = MaxA = XMVector3Dot(Axis, CornersA[0]);
-            MinB = MaxB = XMVector3Dot(Axis, CornersB[0]);
-
-            for (size_t k = 1; k < CORNER_COUNT; k++) {
-                XMVECTOR TempA = XMVector3Dot(Axis, CornersA[k]);
-                MinA = XMVectorMin(MinA, TempA);
-                MaxA = XMVectorMax(MaxA, TempA);
-
-                XMVECTOR TempB = XMVector3Dot(Axis, CornersB[k]);
-                MinB = XMVectorMin(MinB, TempB);
-                MaxB = XMVectorMax(MaxB, TempB);
-            }
-
-            // if (MinA > MaxB || MinB > MaxA) reject
-            Outside = XMVectorOrInt(Outside, XMVectorGreater(MinA, MaxB));
-            Outside = XMVectorOrInt(Outside, XMVectorGreater(MinB, MaxA));
-        }
-    }
-
-    // If there is a seperating plane, then the frustums do not intersect.
-    if (XMVector4EqualInt(Outside, XMVectorTrueInt())) return false;
-
-    // If we did not find a separating plane then the frustums intersect.
-    return true;
-}
-
-//-----------------------------------------------------------------------------
-// Triangle vs frustum test.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool XM_CALLCONV BoundingFrustum::Intersects(
-    FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2) const noexcept {
-    // Build the frustum planes (NOTE: D is negated from the usual).
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, -Near);
-    Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, Far);
-    Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
-    Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
-    Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
-    Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
-
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(vOrientation));
-
-    // Transform triangle into the local space of frustum.
-    XMVECTOR TV0 =
-        XMVector3InverseRotate(XMVectorSubtract(V0, vOrigin), vOrientation);
-    XMVECTOR TV1 =
-        XMVector3InverseRotate(XMVectorSubtract(V1, vOrigin), vOrientation);
-    XMVECTOR TV2 =
-        XMVector3InverseRotate(XMVectorSubtract(V2, vOrigin), vOrientation);
-
-    // Test each vertex of the triangle against the frustum planes.
-    XMVECTOR Outside = XMVectorFalseInt();
-    XMVECTOR InsideAll = XMVectorTrueInt();
-
-    for (size_t i = 0; i < 6; ++i) {
-        XMVECTOR Dist0 = XMVector3Dot(TV0, Planes[i]);
-        XMVECTOR Dist1 = XMVector3Dot(TV1, Planes[i]);
-        XMVECTOR Dist2 = XMVector3Dot(TV2, Planes[i]);
-
-        XMVECTOR MinDist = XMVectorMin(Dist0, Dist1);
-        MinDist = XMVectorMin(MinDist, Dist2);
-        XMVECTOR MaxDist = XMVectorMax(Dist0, Dist1);
-        MaxDist = XMVectorMax(MaxDist, Dist2);
-
-        XMVECTOR PlaneDist = XMVectorSplatW(Planes[i]);
-
-        // Outside the plane?
-        Outside = XMVectorOrInt(Outside, XMVectorGreater(MinDist, PlaneDist));
-
-        // Fully inside the plane?
-        InsideAll =
-            XMVectorAndInt(InsideAll, XMVectorLessOrEqual(MaxDist, PlaneDist));
-    }
-
-    // If the triangle is outside any of the planes it is outside.
-    if (XMVector4EqualInt(Outside, XMVectorTrueInt())) return false;
-
-    // If the triangle is inside all planes it is fully inside.
-    if (XMVector4EqualInt(InsideAll, XMVectorTrueInt())) return true;
-
-    // Build the corners of the frustum.
-    XMVECTOR vRightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f);
-    XMVECTOR vRightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f);
-    XMVECTOR vLeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f);
-    XMVECTOR vLeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f);
-    XMVECTOR vNear = XMVectorReplicatePtr(&Near);
-    XMVECTOR vFar = XMVectorReplicatePtr(&Far);
-
-    XMVECTOR Corners[CORNER_COUNT];
-    Corners[0] = XMVectorMultiply(vRightTop, vNear);
-    Corners[1] = XMVectorMultiply(vRightBottom, vNear);
-    Corners[2] = XMVectorMultiply(vLeftTop, vNear);
-    Corners[3] = XMVectorMultiply(vLeftBottom, vNear);
-    Corners[4] = XMVectorMultiply(vRightTop, vFar);
-    Corners[5] = XMVectorMultiply(vRightBottom, vFar);
-    Corners[6] = XMVectorMultiply(vLeftTop, vFar);
-    Corners[7] = XMVectorMultiply(vLeftBottom, vFar);
-
-    // Test the plane of the triangle.
-    XMVECTOR Normal =
-        XMVector3Cross(XMVectorSubtract(V1, V0), XMVectorSubtract(V2, V0));
-    XMVECTOR Dist = XMVector3Dot(Normal, V0);
-
-    XMVECTOR MinDist, MaxDist;
-    MinDist = MaxDist = XMVector3Dot(Corners[0], Normal);
-    for (size_t i = 1; i < CORNER_COUNT; ++i) {
-        XMVECTOR Temp = XMVector3Dot(Corners[i], Normal);
-        MinDist = XMVectorMin(MinDist, Temp);
-        MaxDist = XMVectorMax(MaxDist, Temp);
-    }
-
-    Outside = XMVectorOrInt(XMVectorGreater(MinDist, Dist),
-                            XMVectorLess(MaxDist, Dist));
-    if (XMVector4EqualInt(Outside, XMVectorTrueInt())) return false;
-
-    // Check the edge/edge axes (3*6).
-    XMVECTOR TriangleEdgeAxis[3];
-    TriangleEdgeAxis[0] = XMVectorSubtract(V1, V0);
-    TriangleEdgeAxis[1] = XMVectorSubtract(V2, V1);
-    TriangleEdgeAxis[2] = XMVectorSubtract(V0, V2);
-
-    XMVECTOR FrustumEdgeAxis[6];
-    FrustumEdgeAxis[0] = vRightTop;
-    FrustumEdgeAxis[1] = vRightBottom;
-    FrustumEdgeAxis[2] = vLeftTop;
-    FrustumEdgeAxis[3] = vLeftBottom;
-    FrustumEdgeAxis[4] = XMVectorSubtract(vRightTop, vLeftTop);
-    FrustumEdgeAxis[5] = XMVectorSubtract(vLeftBottom, vLeftTop);
-
-    for (size_t i = 0; i < 3; ++i) {
-        for (size_t j = 0; j < 6; j++) {
-            // Compute the axis we are going to test.
-            XMVECTOR Axis =
-                XMVector3Cross(TriangleEdgeAxis[i], FrustumEdgeAxis[j]);
-
-            // Find the min/max of the projection of the triangle onto the axis.
-            XMVECTOR MinA, MaxA;
-
-            XMVECTOR Dist0 = XMVector3Dot(V0, Axis);
-            XMVECTOR Dist1 = XMVector3Dot(V1, Axis);
-            XMVECTOR Dist2 = XMVector3Dot(V2, Axis);
-
-            MinA = XMVectorMin(Dist0, Dist1);
-            MinA = XMVectorMin(MinA, Dist2);
-            MaxA = XMVectorMax(Dist0, Dist1);
-            MaxA = XMVectorMax(MaxA, Dist2);
-
-            // Find the min/max of the projection of the frustum onto the axis.
-            XMVECTOR MinB, MaxB;
-
-            MinB = MaxB = XMVector3Dot(Axis, Corners[0]);
-
-            for (size_t k = 1; k < CORNER_COUNT; k++) {
-                XMVECTOR Temp = XMVector3Dot(Axis, Corners[k]);
-                MinB = XMVectorMin(MinB, Temp);
-                MaxB = XMVectorMax(MaxB, Temp);
-            }
-
-            // if (MinA > MaxB || MinB > MaxA) reject;
-            Outside = XMVectorOrInt(Outside, XMVectorGreater(MinA, MaxB));
-            Outside = XMVectorOrInt(Outside, XMVectorGreater(MinB, MaxA));
-        }
-    }
-
-    if (XMVector4EqualInt(Outside, XMVectorTrueInt())) return false;
-
-    // If we did not find a separating plane then the triangle must intersect
-    // the frustum.
-    return true;
-}
-
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline PlaneIntersectionType XM_CALLCONV
-BoundingFrustum::Intersects(FXMVECTOR Plane) const noexcept {
-    assert(DirectX::MathInternal::XMPlaneIsUnit(Plane));
-
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(vOrientation));
-
-    // Set w of the origin to one so we can dot4 with a plane.
-    vOrigin = XMVectorInsert<0, 0, 0, 0, 1>(vOrigin, XMVectorSplatOne());
-
-    // Build the corners of the frustum (in world space).
-    XMVECTOR RightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f);
-    XMVECTOR RightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f);
-    XMVECTOR LeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f);
-    XMVECTOR LeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f);
-    XMVECTOR vNear = XMVectorReplicatePtr(&Near);
-    XMVECTOR vFar = XMVectorReplicatePtr(&Far);
-
-    RightTop = XMVector3Rotate(RightTop, vOrientation);
-    RightBottom = XMVector3Rotate(RightBottom, vOrientation);
-    LeftTop = XMVector3Rotate(LeftTop, vOrientation);
-    LeftBottom = XMVector3Rotate(LeftBottom, vOrientation);
-
-    XMVECTOR Corners0 = XMVectorMultiplyAdd(RightTop, vNear, vOrigin);
-    XMVECTOR Corners1 = XMVectorMultiplyAdd(RightBottom, vNear, vOrigin);
-    XMVECTOR Corners2 = XMVectorMultiplyAdd(LeftTop, vNear, vOrigin);
-    XMVECTOR Corners3 = XMVectorMultiplyAdd(LeftBottom, vNear, vOrigin);
-    XMVECTOR Corners4 = XMVectorMultiplyAdd(RightTop, vFar, vOrigin);
-    XMVECTOR Corners5 = XMVectorMultiplyAdd(RightBottom, vFar, vOrigin);
-    XMVECTOR Corners6 = XMVectorMultiplyAdd(LeftTop, vFar, vOrigin);
-    XMVECTOR Corners7 = XMVectorMultiplyAdd(LeftBottom, vFar, vOrigin);
-
-    XMVECTOR Outside, Inside;
-    DirectX::MathInternal::FastIntersectFrustumPlane(
-        Corners0, Corners1, Corners2, Corners3, Corners4, Corners5, Corners6,
-        Corners7, Plane, Outside, Inside);
-
-    // If the frustum is outside any plane it is outside.
-    if (XMVector4EqualInt(Outside, XMVectorTrueInt())) return FRONT;
-
-    // If the frustum is inside all planes it is inside.
-    if (XMVector4EqualInt(Inside, XMVectorTrueInt())) return BACK;
-
-    // The frustum is not inside all planes or outside a plane it intersects.
-    return INTERSECTING;
-}
-
-//-----------------------------------------------------------------------------
-// Ray vs. frustum test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool XM_CALLCONV BoundingFrustum::Intersects(
-    FXMVECTOR rayOrigin, FXMVECTOR Direction, float& Dist) const noexcept {
-    // If ray starts inside the frustum, return a distance of 0 for the hit
-    if (Contains(rayOrigin) == CONTAINS) {
-        Dist = 0.0f;
-        return true;
-    }
-
-    // Build the frustum planes.
-    XMVECTOR Planes[6];
-    Planes[0] = XMVectorSet(0.0f, 0.0f, -1.0f, Near);
-    Planes[1] = XMVectorSet(0.0f, 0.0f, 1.0f, -Far);
-    Planes[2] = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
-    Planes[3] = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
-    Planes[4] = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
-    Planes[5] = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
-
-    // Load origin and orientation of the frustum.
-    XMVECTOR frOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR frOrientation = XMLoadFloat4(&Orientation);
-
-    // This algorithm based on "Fast Ray-Convex Polyhedron Intersectin," in
-    // James Arvo, ed., Graphics Gems II pp. 247-250
-    float tnear = -FLT_MAX;
-    float tfar = FLT_MAX;
-
-    for (size_t i = 0; i < 6; ++i) {
-        XMVECTOR Plane = DirectX::MathInternal::XMPlaneTransform(
-            Planes[i], frOrientation, frOrigin);
-        Plane = XMPlaneNormalize(Plane);
-
-        XMVECTOR AxisDotOrigin = XMPlaneDotCoord(Plane, rayOrigin);
-        XMVECTOR AxisDotDirection = XMVector3Dot(Plane, Direction);
-
-        if (XMVector3LessOrEqual(XMVectorAbs(AxisDotDirection), g_RayEpsilon)) {
-            // Ray is parallel to plane - check if ray origin is inside plane's
-            if (XMVector3Greater(AxisDotOrigin, g_XMZero)) {
-                // Ray origin is outside half-space.
-                Dist = 0.f;
-                return false;
-            }
-        } else {
-            // Ray not parallel - get distance to plane.
-            float vd = XMVectorGetX(AxisDotDirection);
-            float vn = XMVectorGetX(AxisDotOrigin);
-            float t = -vn / vd;
-            if (vd < 0.0f) {
-                // Front face - T is a near point.
-                if (t > tfar) {
-                    Dist = 0.f;
-                    return false;
-                }
-                if (t > tnear) {
-                    // Hit near face.
-                    tnear = t;
-                }
-            } else {
-                // back face - T is far point.
-                if (t < tnear) {
-                    Dist = 0.f;
-                    return false;
-                }
-                if (t < tfar) {
-                    // Hit far face.
-                    tfar = t;
-                }
-            }
-        }
-    }
-
-    // Survived all tests.
-    // Note: if ray originates on polyhedron, may want to change 0.0f to some
-    // epsilon to avoid intersecting the originating face.
-    float distance = (tnear >= 0.0f) ? tnear : tfar;
-    if (distance >= 0.0f) {
-        Dist = distance;
-        return true;
-    }
-
-    Dist = 0.f;
-    return false;
-}
-
-//-----------------------------------------------------------------------------
-// Test a frustum vs 6 planes (typically forming another frustum).
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType XM_CALLCONV
-BoundingFrustum::ContainedBy(FXMVECTOR Plane0, FXMVECTOR Plane1,
-                             FXMVECTOR Plane2, GXMVECTOR Plane3,
-                             HXMVECTOR Plane4,
-                             HXMVECTOR Plane5) const noexcept {
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    assert(DirectX::MathInternal::XMQuaternionIsUnit(vOrientation));
-
-    // Set w of the origin to one so we can dot4 with a plane.
-    vOrigin = XMVectorInsert<0, 0, 0, 0, 1>(vOrigin, XMVectorSplatOne());
-
-    // Build the corners of the frustum (in world space).
-    XMVECTOR RightTop = XMVectorSet(RightSlope, TopSlope, 1.0f, 0.0f);
-    XMVECTOR RightBottom = XMVectorSet(RightSlope, BottomSlope, 1.0f, 0.0f);
-    XMVECTOR LeftTop = XMVectorSet(LeftSlope, TopSlope, 1.0f, 0.0f);
-    XMVECTOR LeftBottom = XMVectorSet(LeftSlope, BottomSlope, 1.0f, 0.0f);
-    XMVECTOR vNear = XMVectorReplicatePtr(&Near);
-    XMVECTOR vFar = XMVectorReplicatePtr(&Far);
-
-    RightTop = XMVector3Rotate(RightTop, vOrientation);
-    RightBottom = XMVector3Rotate(RightBottom, vOrientation);
-    LeftTop = XMVector3Rotate(LeftTop, vOrientation);
-    LeftBottom = XMVector3Rotate(LeftBottom, vOrientation);
-
-    XMVECTOR Corners0 = XMVectorMultiplyAdd(RightTop, vNear, vOrigin);
-    XMVECTOR Corners1 = XMVectorMultiplyAdd(RightBottom, vNear, vOrigin);
-    XMVECTOR Corners2 = XMVectorMultiplyAdd(LeftTop, vNear, vOrigin);
-    XMVECTOR Corners3 = XMVectorMultiplyAdd(LeftBottom, vNear, vOrigin);
-    XMVECTOR Corners4 = XMVectorMultiplyAdd(RightTop, vFar, vOrigin);
-    XMVECTOR Corners5 = XMVectorMultiplyAdd(RightBottom, vFar, vOrigin);
-    XMVECTOR Corners6 = XMVectorMultiplyAdd(LeftTop, vFar, vOrigin);
-    XMVECTOR Corners7 = XMVectorMultiplyAdd(LeftBottom, vFar, vOrigin);
-
-    XMVECTOR Outside, Inside;
-
-    // Test against each plane.
-    DirectX::MathInternal::FastIntersectFrustumPlane(
-        Corners0, Corners1, Corners2, Corners3, Corners4, Corners5, Corners6,
-        Corners7, Plane0, Outside, Inside);
-
-    XMVECTOR AnyOutside = Outside;
-    XMVECTOR AllInside = Inside;
-
-    DirectX::MathInternal::FastIntersectFrustumPlane(
-        Corners0, Corners1, Corners2, Corners3, Corners4, Corners5, Corners6,
-        Corners7, Plane1, Outside, Inside);
-
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectFrustumPlane(
-        Corners0, Corners1, Corners2, Corners3, Corners4, Corners5, Corners6,
-        Corners7, Plane2, Outside, Inside);
-
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectFrustumPlane(
-        Corners0, Corners1, Corners2, Corners3, Corners4, Corners5, Corners6,
-        Corners7, Plane3, Outside, Inside);
-
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectFrustumPlane(
-        Corners0, Corners1, Corners2, Corners3, Corners4, Corners5, Corners6,
-        Corners7, Plane4, Outside, Inside);
-
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectFrustumPlane(
-        Corners0, Corners1, Corners2, Corners3, Corners4, Corners5, Corners6,
-        Corners7, Plane5, Outside, Inside);
-
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    // If the frustum is outside any plane it is outside.
-    if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) return DISJOINT;
-
-    // If the frustum is inside all planes it is inside.
-    if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) return CONTAINS;
-
-    // The frustum is not inside all planes or outside a plane, it may
-    // intersect.
-    return INTERSECTS;
-}
-
-//-----------------------------------------------------------------------------
-// Build the 6 frustum planes from a frustum.
-//
-// The intended use for these routines is for fast culling to a view frustum.
-// When the volume being tested against a view frustum is small relative to the
-// view frustum it is usually either inside all six planes of the frustum
-// (CONTAINS) or outside one of the planes of the frustum (DISJOINT). If neither
-// of these cases is true then it may or may not be intersecting the frustum
-// (INTERSECTS)
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void BoundingFrustum::GetPlanes(
-    XMVECTOR* NearPlane, XMVECTOR* FarPlane, XMVECTOR* RightPlane,
-    XMVECTOR* LeftPlane, XMVECTOR* TopPlane,
-    XMVECTOR* BottomPlane) const noexcept {
-    // Load origin and orientation of the frustum.
-    XMVECTOR vOrigin = XMLoadFloat3(&Origin);
-    XMVECTOR vOrientation = XMLoadFloat4(&Orientation);
-
-    if (NearPlane) {
-        XMVECTOR vNearPlane = XMVectorSet(0.0f, 0.0f, -1.0f, Near);
-        vNearPlane = DirectX::MathInternal::XMPlaneTransform(
-            vNearPlane, vOrientation, vOrigin);
-        *NearPlane = XMPlaneNormalize(vNearPlane);
-    }
-
-    if (FarPlane) {
-        XMVECTOR vFarPlane = XMVectorSet(0.0f, 0.0f, 1.0f, -Far);
-        vFarPlane = DirectX::MathInternal::XMPlaneTransform(
-            vFarPlane, vOrientation, vOrigin);
-        *FarPlane = XMPlaneNormalize(vFarPlane);
-    }
-
-    if (RightPlane) {
-        XMVECTOR vRightPlane = XMVectorSet(1.0f, 0.0f, -RightSlope, 0.0f);
-        vRightPlane = DirectX::MathInternal::XMPlaneTransform(
-            vRightPlane, vOrientation, vOrigin);
-        *RightPlane = XMPlaneNormalize(vRightPlane);
-    }
-
-    if (LeftPlane) {
-        XMVECTOR vLeftPlane = XMVectorSet(-1.0f, 0.0f, LeftSlope, 0.0f);
-        vLeftPlane = DirectX::MathInternal::XMPlaneTransform(
-            vLeftPlane, vOrientation, vOrigin);
-        *LeftPlane = XMPlaneNormalize(vLeftPlane);
-    }
-
-    if (TopPlane) {
-        XMVECTOR vTopPlane = XMVectorSet(0.0f, 1.0f, -TopSlope, 0.0f);
-        vTopPlane = DirectX::MathInternal::XMPlaneTransform(
-            vTopPlane, vOrientation, vOrigin);
-        *TopPlane = XMPlaneNormalize(vTopPlane);
-    }
-
-    if (BottomPlane) {
-        XMVECTOR vBottomPlane = XMVectorSet(0.0f, -1.0f, BottomSlope, 0.0f);
-        vBottomPlane = DirectX::MathInternal::XMPlaneTransform(
-            vBottomPlane, vOrientation, vOrigin);
-        *BottomPlane = XMPlaneNormalize(vBottomPlane);
-    }
-}
-
-//-----------------------------------------------------------------------------
-// Build a frustum from a persepective projection matrix.  The matrix may only
-// contain a projection; any rotation, translation or scale will cause the
-// constructed frustum to be incorrect.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-BoundingFrustum::CreateFromMatrix(BoundingFrustum& Out, FXMMATRIX Projection,
-                                  bool rhcoords) noexcept {
-    // Corners of the projection frustum in NDC space.
-    static XMVECTORF32 NDCPoints[6] = {
-        {{{1.0f, 0.0f, 1.0f, 1.0f}}},   // right (at far plane)
-        {{{-1.0f, 0.0f, 1.0f, 1.0f}}},  // left
-        {{{0.0f, 1.0f, 1.0f, 1.0f}}},   // top
-        {{{0.0f, -1.0f, 1.0f, 1.0f}}},  // bottom
-
-        {{{0.0f, 0.0f, 0.0f, 1.0f}}},  // near
-        {{{0.0f, 0.0f, 1.0f, 1.0f}}}   // far
-    };
-
-    XMVECTOR Determinant;
-    XMMATRIX matInverse = XMMatrixInverse(&Determinant, Projection);
-
-    // Compute the frustum corners in world space.
-    XMVECTOR Points[6];
-
-    for (size_t i = 0; i < 6; ++i) {
-        // Transform point.
-        Points[i] = XMVector4Transform(NDCPoints[i], matInverse);
-    }
-
-    Out.Origin = XMFLOAT3(0.0f, 0.0f, 0.0f);
-    Out.Orientation = XMFLOAT4(0.0f, 0.0f, 0.0f, 1.0f);
-
-    // Compute the slopes.
-    Points[0] = XMVectorMultiply(Points[0],
-                                 XMVectorReciprocal(XMVectorSplatZ(Points[0])));
-    Points[1] = XMVectorMultiply(Points[1],
-                                 XMVectorReciprocal(XMVectorSplatZ(Points[1])));
-    Points[2] = XMVectorMultiply(Points[2],
-                                 XMVectorReciprocal(XMVectorSplatZ(Points[2])));
-    Points[3] = XMVectorMultiply(Points[3],
-                                 XMVectorReciprocal(XMVectorSplatZ(Points[3])));
-
-    Out.RightSlope = XMVectorGetX(Points[0]);
-    Out.LeftSlope = XMVectorGetX(Points[1]);
-    Out.TopSlope = XMVectorGetY(Points[2]);
-    Out.BottomSlope = XMVectorGetY(Points[3]);
-
-    // Compute near and far.
-    Points[4] = XMVectorMultiply(Points[4],
-                                 XMVectorReciprocal(XMVectorSplatW(Points[4])));
-    Points[5] = XMVectorMultiply(Points[5],
-                                 XMVectorReciprocal(XMVectorSplatW(Points[5])));
-
-    if (rhcoords) {
-        Out.Near = XMVectorGetZ(Points[5]);
-        Out.Far = XMVectorGetZ(Points[4]);
-    } else {
-        Out.Near = XMVectorGetZ(Points[4]);
-        Out.Far = XMVectorGetZ(Points[5]);
-    }
-}
-
-/****************************************************************************
- *
- * TriangleTests
- *
- ****************************************************************************/
-
-namespace TriangleTests {
-
-//-----------------------------------------------------------------------------
-// Compute the intersection of a ray (Origin, Direction) with a triangle
-// (V0, V1, V2).  Return true if there is an intersection and also set *pDist
-// to the distance along the ray to the intersection.
-//
-// The algorithm is based on Moller, Tomas and Trumbore, "Fast, Minimum Storage
-// Ray-Triangle Intersection", Journal of Graphics Tools, vol. 2, no. 1,
-// pp 21-28, 1997.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool XM_CALLCONV
-Intersects(FXMVECTOR Origin, FXMVECTOR Direction, FXMVECTOR V0, GXMVECTOR V1,
-           HXMVECTOR V2, float& Dist) noexcept {
-    assert(DirectX::MathInternal::XMVector3IsUnit(Direction));
-
-    XMVECTOR Zero = XMVectorZero();
-
-    XMVECTOR e1 = XMVectorSubtract(V1, V0);
-    XMVECTOR e2 = XMVectorSubtract(V2, V0);
-
-    // p = Direction ^ e2;
-    XMVECTOR p = XMVector3Cross(Direction, e2);
-
-    // det = e1 * p;
-    XMVECTOR det = XMVector3Dot(e1, p);
-
-    XMVECTOR u, v, t;
-
-    if (XMVector3GreaterOrEqual(det, g_RayEpsilon)) {
-        // Determinate is positive (front side of the triangle).
-        XMVECTOR s = XMVectorSubtract(Origin, V0);
-
-        // u = s * p;
-        u = XMVector3Dot(s, p);
-
-        XMVECTOR NoIntersection = XMVectorLess(u, Zero);
-        NoIntersection = XMVectorOrInt(NoIntersection, XMVectorGreater(u, det));
-
-        // q = s ^ e1;
-        XMVECTOR q = XMVector3Cross(s, e1);
-
-        // v = Direction * q;
-        v = XMVector3Dot(Direction, q);
-
-        NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(v, Zero));
-        NoIntersection = XMVectorOrInt(NoIntersection,
-                                       XMVectorGreater(XMVectorAdd(u, v), det));
-
-        // t = e2 * q;
-        t = XMVector3Dot(e2, q);
-
-        NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(t, Zero));
-
-        if (XMVector4EqualInt(NoIntersection, XMVectorTrueInt())) {
-            Dist = 0.f;
-            return false;
-        }
-    } else if (XMVector3LessOrEqual(det, g_RayNegEpsilon)) {
-        // Determinate is negative (back side of the triangle).
-        XMVECTOR s = XMVectorSubtract(Origin, V0);
-
-        // u = s * p;
-        u = XMVector3Dot(s, p);
-
-        XMVECTOR NoIntersection = XMVectorGreater(u, Zero);
-        NoIntersection = XMVectorOrInt(NoIntersection, XMVectorLess(u, det));
-
-        // q = s ^ e1;
-        XMVECTOR q = XMVector3Cross(s, e1);
-
-        // v = Direction * q;
-        v = XMVector3Dot(Direction, q);
-
-        NoIntersection =
-            XMVectorOrInt(NoIntersection, XMVectorGreater(v, Zero));
-        NoIntersection =
-            XMVectorOrInt(NoIntersection, XMVectorLess(XMVectorAdd(u, v), det));
-
-        // t = e2 * q;
-        t = XMVector3Dot(e2, q);
-
-        NoIntersection =
-            XMVectorOrInt(NoIntersection, XMVectorGreater(t, Zero));
-
-        if (XMVector4EqualInt(NoIntersection, XMVectorTrueInt())) {
-            Dist = 0.f;
-            return false;
-        }
-    } else {
-        // Parallel ray.
-        Dist = 0.f;
-        return false;
-    }
-
-    t = XMVectorDivide(t, det);
-
-    // (u / det) and (v / dev) are the barycentric cooridinates of the
-    // intersection.
-
-    // Store the x-component to *pDist
-    XMStoreFloat(&Dist, t);
-
-    return true;
-}
-
-//-----------------------------------------------------------------------------
-// Test if two triangles intersect.
-//
-// The final test of algorithm is based on Shen, Heng, and Tang, "A Fast
-// Triangle-Triangle Overlap Test Using Signed Distances", Journal of Graphics
-// Tools, vol. 8, no. 1, pp 17-23, 2003 and Guigue and Devillers, "Fast and
-// Robust Triangle-Triangle Overlap Test Using Orientation Predicates", Journal
-// of Graphics Tools, vol. 8, no. 1, pp 25-32, 2003.
-//
-// The final test could be considered an edge-edge separating plane test with
-// the 9 possible cases narrowed down to the only two pairs of edges that can
-// actaully result in a seperation.
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline bool XM_CALLCONV
-Intersects(FXMVECTOR A0, FXMVECTOR A1, FXMVECTOR A2, GXMVECTOR B0, HXMVECTOR B1,
-           HXMVECTOR B2) noexcept {
-    static const XMVECTORU32 SelectY = {
-        {{XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0}}};
-    static const XMVECTORU32 SelectZ = {
-        {{XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0}}};
-    static const XMVECTORU32 Select0111 = {
-        {{XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_1}}};
-    static const XMVECTORU32 Select1011 = {
-        {{XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1}}};
-    static const XMVECTORU32 Select1101 = {
-        {{XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1}}};
-
-    XMVECTOR Zero = XMVectorZero();
-
-    // Compute the normal of triangle A.
-    XMVECTOR N1 =
-        XMVector3Cross(XMVectorSubtract(A1, A0), XMVectorSubtract(A2, A0));
-
-    // Assert that the triangle is not degenerate.
-    assert(!XMVector3Equal(N1, Zero));
-
-    // Test points of B against the plane of A.
-    XMVECTOR BDist = XMVector3Dot(N1, XMVectorSubtract(B0, A0));
-    BDist = XMVectorSelect(BDist, XMVector3Dot(N1, XMVectorSubtract(B1, A0)),
-                           SelectY);
-    BDist = XMVectorSelect(BDist, XMVector3Dot(N1, XMVectorSubtract(B2, A0)),
-                           SelectZ);
-
-    // Ensure robustness with co-planar triangles by zeroing small distances.
-    uint32_t BDistIsZeroCR;
-    XMVECTOR BDistIsZero =
-        XMVectorGreaterR(&BDistIsZeroCR, g_RayEpsilon, XMVectorAbs(BDist));
-    BDist = XMVectorSelect(BDist, Zero, BDistIsZero);
-
-    uint32_t BDistIsLessCR;
-    XMVECTOR BDistIsLess = XMVectorGreaterR(&BDistIsLessCR, Zero, BDist);
-
-    uint32_t BDistIsGreaterCR;
-    XMVECTOR BDistIsGreater = XMVectorGreaterR(&BDistIsGreaterCR, BDist, Zero);
-
-    // If all the points are on the same side we don't intersect.
-    if (XMComparisonAllTrue(BDistIsLessCR) ||
-        XMComparisonAllTrue(BDistIsGreaterCR))
-        return false;
-
-    // Compute the normal of triangle B.
-    XMVECTOR N2 =
-        XMVector3Cross(XMVectorSubtract(B1, B0), XMVectorSubtract(B2, B0));
-
-    // Assert that the triangle is not degenerate.
-    assert(!XMVector3Equal(N2, Zero));
-
-    // Test points of A against the plane of B.
-    XMVECTOR ADist = XMVector3Dot(N2, XMVectorSubtract(A0, B0));
-    ADist = XMVectorSelect(ADist, XMVector3Dot(N2, XMVectorSubtract(A1, B0)),
-                           SelectY);
-    ADist = XMVectorSelect(ADist, XMVector3Dot(N2, XMVectorSubtract(A2, B0)),
-                           SelectZ);
-
-    // Ensure robustness with co-planar triangles by zeroing small distances.
-    uint32_t ADistIsZeroCR;
-    XMVECTOR ADistIsZero =
-        XMVectorGreaterR(&ADistIsZeroCR, g_RayEpsilon, XMVectorAbs(ADist));
-    ADist = XMVectorSelect(ADist, Zero, ADistIsZero);
-
-    uint32_t ADistIsLessCR;
-    XMVECTOR ADistIsLess = XMVectorGreaterR(&ADistIsLessCR, Zero, ADist);
-
-    uint32_t ADistIsGreaterCR;
-    XMVECTOR ADistIsGreater = XMVectorGreaterR(&ADistIsGreaterCR, ADist, Zero);
-
-    // If all the points are on the same side we don't intersect.
-    if (XMComparisonAllTrue(ADistIsLessCR) ||
-        XMComparisonAllTrue(ADistIsGreaterCR))
-        return false;
-
-    // Special case for co-planar triangles.
-    if (XMComparisonAllTrue(ADistIsZeroCR) ||
-        XMComparisonAllTrue(BDistIsZeroCR)) {
-        XMVECTOR Axis, Dist, MinDist;
-
-        // Compute an axis perpindicular to the edge (points out).
-        Axis = XMVector3Cross(N1, XMVectorSubtract(A1, A0));
-        Dist = XMVector3Dot(Axis, A0);
-
-        // Test points of B against the axis.
-        MinDist = XMVector3Dot(B0, Axis);
-        MinDist = XMVectorMin(MinDist, XMVector3Dot(B1, Axis));
-        MinDist = XMVectorMin(MinDist, XMVector3Dot(B2, Axis));
-        if (XMVector4GreaterOrEqual(MinDist, Dist)) return false;
-
-        // Edge (A1, A2)
-        Axis = XMVector3Cross(N1, XMVectorSubtract(A2, A1));
-        Dist = XMVector3Dot(Axis, A1);
-
-        MinDist = XMVector3Dot(B0, Axis);
-        MinDist = XMVectorMin(MinDist, XMVector3Dot(B1, Axis));
-        MinDist = XMVectorMin(MinDist, XMVector3Dot(B2, Axis));
-        if (XMVector4GreaterOrEqual(MinDist, Dist)) return false;
-
-        // Edge (A2, A0)
-        Axis = XMVector3Cross(N1, XMVectorSubtract(A0, A2));
-        Dist = XMVector3Dot(Axis, A2);
-
-        MinDist = XMVector3Dot(B0, Axis);
-        MinDist = XMVectorMin(MinDist, XMVector3Dot(B1, Axis));
-        MinDist = XMVectorMin(MinDist, XMVector3Dot(B2, Axis));
-        if (XMVector4GreaterOrEqual(MinDist, Dist)) return false;
-
-        // Edge (B0, B1)
-        Axis = XMVector3Cross(N2, XMVectorSubtract(B1, B0));
-        Dist = XMVector3Dot(Axis, B0);
-
-        MinDist = XMVector3Dot(A0, Axis);
-        MinDist = XMVectorMin(MinDist, XMVector3Dot(A1, Axis));
-        MinDist = XMVectorMin(MinDist, XMVector3Dot(A2, Axis));
-        if (XMVector4GreaterOrEqual(MinDist, Dist)) return false;
-
-        // Edge (B1, B2)
-        Axis = XMVector3Cross(N2, XMVectorSubtract(B2, B1));
-        Dist = XMVector3Dot(Axis, B1);
-
-        MinDist = XMVector3Dot(A0, Axis);
-        MinDist = XMVectorMin(MinDist, XMVector3Dot(A1, Axis));
-        MinDist = XMVectorMin(MinDist, XMVector3Dot(A2, Axis));
-        if (XMVector4GreaterOrEqual(MinDist, Dist)) return false;
-
-        // Edge (B2,B0)
-        Axis = XMVector3Cross(N2, XMVectorSubtract(B0, B2));
-        Dist = XMVector3Dot(Axis, B2);
-
-        MinDist = XMVector3Dot(A0, Axis);
-        MinDist = XMVectorMin(MinDist, XMVector3Dot(A1, Axis));
-        MinDist = XMVectorMin(MinDist, XMVector3Dot(A2, Axis));
-        if (XMVector4GreaterOrEqual(MinDist, Dist)) return false;
-
-        return true;
-    }
-
-    //
-    // Find the single vertex of A and B (ie the vertex on the opposite side
-    // of the plane from the other two) and reorder the edges so we can compute
-    // the signed edge/edge distances.
-    //
-    // if ( (V0 >= 0 && V1 <  0 && V2 <  0) ||
-    //      (V0 >  0 && V1 <= 0 && V2 <= 0) ||
-    //      (V0 <= 0 && V1 >  0 && V2 >  0) ||
-    //      (V0 <  0 && V1 >= 0 && V2 >= 0) ) then V0 is singular;
-    //
-    // If our singular vertex is not on the positive side of the plane we
-    // reverse the triangle winding so that the overlap comparisons will compare
-    // the correct edges with the correct signs.
-    //
-    XMVECTOR ADistIsLessEqual = XMVectorOrInt(ADistIsLess, ADistIsZero);
-    XMVECTOR ADistIsGreaterEqual = XMVectorOrInt(ADistIsGreater, ADistIsZero);
-
-    XMVECTOR AA0, AA1, AA2;
-    bool bPositiveA;
-
-    if (DirectX::MathInternal::XMVector3AllTrue(
-            XMVectorSelect(ADistIsGreaterEqual, ADistIsLess, Select0111)) ||
-        DirectX::MathInternal::XMVector3AllTrue(
-            XMVectorSelect(ADistIsGreater, ADistIsLessEqual, Select0111))) {
-        // A0 is singular, crossing from positive to negative.
-        AA0 = A0;
-        AA1 = A1;
-        AA2 = A2;
-        bPositiveA = true;
-    } else if (DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   ADistIsLessEqual, ADistIsGreater, Select0111)) ||
-               DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   ADistIsLess, ADistIsGreaterEqual, Select0111))) {
-        // A0 is singular, crossing from negative to positive.
-        AA0 = A0;
-        AA1 = A2;
-        AA2 = A1;
-        bPositiveA = false;
-    } else if (DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   ADistIsGreaterEqual, ADistIsLess, Select1011)) ||
-               DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   ADistIsGreater, ADistIsLessEqual, Select1011))) {
-        // A1 is singular, crossing from positive to negative.
-        AA0 = A1;
-        AA1 = A2;
-        AA2 = A0;
-        bPositiveA = true;
-    } else if (DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   ADistIsLessEqual, ADistIsGreater, Select1011)) ||
-               DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   ADistIsLess, ADistIsGreaterEqual, Select1011))) {
-        // A1 is singular, crossing from negative to positive.
-        AA0 = A1;
-        AA1 = A0;
-        AA2 = A2;
-        bPositiveA = false;
-    } else if (DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   ADistIsGreaterEqual, ADistIsLess, Select1101)) ||
-               DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   ADistIsGreater, ADistIsLessEqual, Select1101))) {
-        // A2 is singular, crossing from positive to negative.
-        AA0 = A2;
-        AA1 = A0;
-        AA2 = A1;
-        bPositiveA = true;
-    } else if (DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   ADistIsLessEqual, ADistIsGreater, Select1101)) ||
-               DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   ADistIsLess, ADistIsGreaterEqual, Select1101))) {
-        // A2 is singular, crossing from negative to positive.
-        AA0 = A2;
-        AA1 = A1;
-        AA2 = A0;
-        bPositiveA = false;
-    } else {
-        assert(false);
-        return false;
-    }
-
-    XMVECTOR BDistIsLessEqual = XMVectorOrInt(BDistIsLess, BDistIsZero);
-    XMVECTOR BDistIsGreaterEqual = XMVectorOrInt(BDistIsGreater, BDistIsZero);
-
-    XMVECTOR BB0, BB1, BB2;
-    bool bPositiveB;
-
-    if (DirectX::MathInternal::XMVector3AllTrue(
-            XMVectorSelect(BDistIsGreaterEqual, BDistIsLess, Select0111)) ||
-        DirectX::MathInternal::XMVector3AllTrue(
-            XMVectorSelect(BDistIsGreater, BDistIsLessEqual, Select0111))) {
-        // B0 is singular, crossing from positive to negative.
-        BB0 = B0;
-        BB1 = B1;
-        BB2 = B2;
-        bPositiveB = true;
-    } else if (DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   BDistIsLessEqual, BDistIsGreater, Select0111)) ||
-               DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   BDistIsLess, BDistIsGreaterEqual, Select0111))) {
-        // B0 is singular, crossing from negative to positive.
-        BB0 = B0;
-        BB1 = B2;
-        BB2 = B1;
-        bPositiveB = false;
-    } else if (DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   BDistIsGreaterEqual, BDistIsLess, Select1011)) ||
-               DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   BDistIsGreater, BDistIsLessEqual, Select1011))) {
-        // B1 is singular, crossing from positive to negative.
-        BB0 = B1;
-        BB1 = B2;
-        BB2 = B0;
-        bPositiveB = true;
-    } else if (DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   BDistIsLessEqual, BDistIsGreater, Select1011)) ||
-               DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   BDistIsLess, BDistIsGreaterEqual, Select1011))) {
-        // B1 is singular, crossing from negative to positive.
-        BB0 = B1;
-        BB1 = B0;
-        BB2 = B2;
-        bPositiveB = false;
-    } else if (DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   BDistIsGreaterEqual, BDistIsLess, Select1101)) ||
-               DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   BDistIsGreater, BDistIsLessEqual, Select1101))) {
-        // B2 is singular, crossing from positive to negative.
-        BB0 = B2;
-        BB1 = B0;
-        BB2 = B1;
-        bPositiveB = true;
-    } else if (DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   BDistIsLessEqual, BDistIsGreater, Select1101)) ||
-               DirectX::MathInternal::XMVector3AllTrue(XMVectorSelect(
-                   BDistIsLess, BDistIsGreaterEqual, Select1101))) {
-        // B2 is singular, crossing from negative to positive.
-        BB0 = B2;
-        BB1 = B1;
-        BB2 = B0;
-        bPositiveB = false;
-    } else {
-        assert(false);
-        return false;
-    }
-
-    XMVECTOR Delta0, Delta1;
-
-    // Reverse the direction of the test depending on whether the singular
-    // vertices are the same sign or different signs.
-    if (bPositiveA ^ bPositiveB) {
-        Delta0 = XMVectorSubtract(BB0, AA0);
-        Delta1 = XMVectorSubtract(AA0, BB0);
-    } else {
-        Delta0 = XMVectorSubtract(AA0, BB0);
-        Delta1 = XMVectorSubtract(BB0, AA0);
-    }
-
-    // Check if the triangles overlap on the line of intersection between the
-    // planes of the two triangles by finding the signed line distances.
-    XMVECTOR Dist0 = XMVector3Dot(
-        Delta0,
-        XMVector3Cross(XMVectorSubtract(BB2, BB0), XMVectorSubtract(AA2, AA0)));
-    if (XMVector4Greater(Dist0, Zero)) return false;
-
-    XMVECTOR Dist1 = XMVector3Dot(
-        Delta1,
-        XMVector3Cross(XMVectorSubtract(BB1, BB0), XMVectorSubtract(AA1, AA0)));
-    if (XMVector4Greater(Dist1, Zero)) return false;
-
-    return true;
-}
-
-//-----------------------------------------------------------------------------
-// Ray-triangle test
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline PlaneIntersectionType XM_CALLCONV
-Intersects(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane) noexcept {
-    XMVECTOR One = XMVectorSplatOne();
-
-    assert(DirectX::MathInternal::XMPlaneIsUnit(Plane));
-
-    // Set w of the points to one so we can dot4 with a plane.
-    XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One);
-    XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One);
-    XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One);
-
-    XMVECTOR Outside, Inside;
-    DirectX::MathInternal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane,
-                                                      Outside, Inside);
-
-    // If the triangle is outside any plane it is outside.
-    if (XMVector4EqualInt(Outside, XMVectorTrueInt())) return FRONT;
-
-    // If the triangle is inside all planes it is inside.
-    if (XMVector4EqualInt(Inside, XMVectorTrueInt())) return BACK;
-
-    // The triangle is not inside all planes or outside a plane it intersects.
-    return INTERSECTING;
-}
-
-//-----------------------------------------------------------------------------
-// Test a triangle vs 6 planes (typically forming a frustum).
-//-----------------------------------------------------------------------------
-_Use_decl_annotations_ inline ContainmentType XM_CALLCONV
-ContainedBy(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane0,
-            HXMVECTOR Plane1, HXMVECTOR Plane2, CXMVECTOR Plane3,
-            CXMVECTOR Plane4, CXMVECTOR Plane5) noexcept {
-    XMVECTOR One = XMVectorSplatOne();
-
-    // Set w of the points to one so we can dot4 with a plane.
-    XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One);
-    XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One);
-    XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One);
-
-    XMVECTOR Outside, Inside;
-
-    // Test against each plane.
-    DirectX::MathInternal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane0,
-                                                      Outside, Inside);
-
-    XMVECTOR AnyOutside = Outside;
-    XMVECTOR AllInside = Inside;
-
-    DirectX::MathInternal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane1,
-                                                      Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane2,
-                                                      Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane3,
-                                                      Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane4,
-                                                      Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    DirectX::MathInternal::FastIntersectTrianglePlane(TV0, TV1, TV2, Plane5,
-                                                      Outside, Inside);
-    AnyOutside = XMVectorOrInt(AnyOutside, Outside);
-    AllInside = XMVectorAndInt(AllInside, Inside);
-
-    // If the triangle is outside any plane it is outside.
-    if (XMVector4EqualInt(AnyOutside, XMVectorTrueInt())) return DISJOINT;
-
-    // If the triangle is inside all planes it is inside.
-    if (XMVector4EqualInt(AllInside, XMVectorTrueInt())) return CONTAINS;
-
-    // The triangle is not inside all planes or outside a plane, it may
-    // intersect.
-    return INTERSECTS;
-}
-
-}  // namespace TriangleTests
diff --git a/targets/app/linux/Stubs/DirectXMath/DirectXColors.h b/targets/app/linux/Stubs/DirectXMath/DirectXColors.h
deleted file mode 100644
index 218fe17c6..000000000
--- a/targets/app/linux/Stubs/DirectXMath/DirectXColors.h
+++ /dev/null
@@ -1,500 +0,0 @@
-//-------------------------------------------------------------------------------------
-// DirectXColors.h -- C++ Color Math library
-//
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT License.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#pragma once
-
-#include "DirectXMath.h"
-
-namespace DirectX {
-
-namespace Colors {
-// Standard colors (Red/Green/Blue/Alpha) in sRGB colorspace
-XMGLOBALCONST XMVECTORF32 AliceBlue = {
-    {{0.941176534f, 0.972549081f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 AntiqueWhite = {
-    {{0.980392218f, 0.921568692f, 0.843137324f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Aqua = {{{0.f, 1.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Aquamarine = {
-    {{0.498039246f, 1.f, 0.831372619f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Azure = {{{0.941176534f, 1.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Beige = {
-    {{0.960784376f, 0.960784376f, 0.862745166f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Bisque = {{{1.f, 0.894117713f, 0.768627524f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Black = {{{0.f, 0.f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 BlanchedAlmond = {
-    {{1.f, 0.921568692f, 0.803921640f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Blue = {{{0.f, 0.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 BlueViolet = {
-    {{0.541176498f, 0.168627456f, 0.886274576f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Brown = {
-    {{0.647058845f, 0.164705887f, 0.164705887f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 BurlyWood = {
-    {{0.870588303f, 0.721568644f, 0.529411793f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 CadetBlue = {
-    {{0.372549027f, 0.619607866f, 0.627451003f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Chartreuse = {{{0.498039246f, 1.f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Chocolate = {
-    {{0.823529482f, 0.411764741f, 0.117647067f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Coral = {{{1.f, 0.498039246f, 0.313725501f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 CornflowerBlue = {
-    {{0.392156899f, 0.584313750f, 0.929411829f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Cornsilk = {{{1.f, 0.972549081f, 0.862745166f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Crimson = {
-    {{0.862745166f, 0.078431375f, 0.235294133f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Cyan = {{{0.f, 1.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkBlue = {{{0.f, 0.f, 0.545098066f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkCyan = {{{0.f, 0.545098066f, 0.545098066f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkGoldenrod = {
-    {{0.721568644f, 0.525490224f, 0.043137256f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkGray = {
-    {{0.662745118f, 0.662745118f, 0.662745118f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkGreen = {{{0.f, 0.392156899f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkKhaki = {
-    {{0.741176486f, 0.717647076f, 0.419607878f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkMagenta = {
-    {{0.545098066f, 0.f, 0.545098066f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkOliveGreen = {
-    {{0.333333343f, 0.419607878f, 0.184313729f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkOrange = {{{1.f, 0.549019635f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkOrchid = {
-    {{0.600000024f, 0.196078449f, 0.800000072f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkRed = {{{0.545098066f, 0.f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkSalmon = {
-    {{0.913725555f, 0.588235319f, 0.478431404f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkSeaGreen = {
-    {{0.560784340f, 0.737254918f, 0.545098066f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkSlateBlue = {
-    {{0.282352954f, 0.239215702f, 0.545098066f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkSlateGray = {
-    {{0.184313729f, 0.309803933f, 0.309803933f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkTurquoise = {
-    {{0.f, 0.807843208f, 0.819607913f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkViolet = {
-    {{0.580392182f, 0.f, 0.827451050f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DeepPink = {{{1.f, 0.078431375f, 0.576470613f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DeepSkyBlue = {{{0.f, 0.749019623f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DimGray = {
-    {{0.411764741f, 0.411764741f, 0.411764741f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DodgerBlue = {
-    {{0.117647067f, 0.564705908f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Firebrick = {
-    {{0.698039234f, 0.133333340f, 0.133333340f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 FloralWhite = {
-    {{1.f, 0.980392218f, 0.941176534f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 ForestGreen = {
-    {{0.133333340f, 0.545098066f, 0.133333340f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Fuchsia = {{{1.f, 0.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Gainsboro = {
-    {{0.862745166f, 0.862745166f, 0.862745166f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 GhostWhite = {
-    {{0.972549081f, 0.972549081f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Gold = {{{1.f, 0.843137324f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Goldenrod = {
-    {{0.854902029f, 0.647058845f, 0.125490203f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Gray = {
-    {{0.501960814f, 0.501960814f, 0.501960814f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Green = {{{0.f, 0.501960814f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 GreenYellow = {
-    {{0.678431392f, 1.f, 0.184313729f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Honeydew = {{{0.941176534f, 1.f, 0.941176534f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 HotPink = {{{1.f, 0.411764741f, 0.705882370f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 IndianRed = {
-    {{0.803921640f, 0.360784322f, 0.360784322f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Indigo = {{{0.294117659f, 0.f, 0.509803951f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Ivory = {{{1.f, 1.f, 0.941176534f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Khaki = {
-    {{0.941176534f, 0.901960850f, 0.549019635f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Lavender = {
-    {{0.901960850f, 0.901960850f, 0.980392218f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LavenderBlush = {
-    {{1.f, 0.941176534f, 0.960784376f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LawnGreen = {
-    {{0.486274540f, 0.988235354f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LemonChiffon = {
-    {{1.f, 0.980392218f, 0.803921640f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightBlue = {
-    {{0.678431392f, 0.847058892f, 0.901960850f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightCoral = {
-    {{0.941176534f, 0.501960814f, 0.501960814f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightCyan = {{{0.878431439f, 1.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightGoldenrodYellow = {
-    {{0.980392218f, 0.980392218f, 0.823529482f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightGray = {
-    {{0.827451050f, 0.827451050f, 0.827451050f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightGreen = {
-    {{0.564705908f, 0.933333397f, 0.564705908f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightPink = {
-    {{1.f, 0.713725507f, 0.756862819f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightSalmon = {
-    {{1.f, 0.627451003f, 0.478431404f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightSeaGreen = {
-    {{0.125490203f, 0.698039234f, 0.666666687f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightSkyBlue = {
-    {{0.529411793f, 0.807843208f, 0.980392218f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightSlateGray = {
-    {{0.466666698f, 0.533333361f, 0.600000024f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightSteelBlue = {
-    {{0.690196097f, 0.768627524f, 0.870588303f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightYellow = {{{1.f, 1.f, 0.878431439f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Lime = {{{0.f, 1.f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LimeGreen = {
-    {{0.196078449f, 0.803921640f, 0.196078449f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Linen = {
-    {{0.980392218f, 0.941176534f, 0.901960850f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Magenta = {{{1.f, 0.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Maroon = {{{0.501960814f, 0.f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumAquamarine = {
-    {{0.400000036f, 0.803921640f, 0.666666687f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumBlue = {{{0.f, 0.f, 0.803921640f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumOrchid = {
-    {{0.729411781f, 0.333333343f, 0.827451050f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumPurple = {
-    {{0.576470613f, 0.439215720f, 0.858823597f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumSeaGreen = {
-    {{0.235294133f, 0.701960802f, 0.443137288f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumSlateBlue = {
-    {{0.482352972f, 0.407843173f, 0.933333397f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumSpringGreen = {
-    {{0.f, 0.980392218f, 0.603921592f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumTurquoise = {
-    {{0.282352954f, 0.819607913f, 0.800000072f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumVioletRed = {
-    {{0.780392230f, 0.082352944f, 0.521568656f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MidnightBlue = {
-    {{0.098039225f, 0.098039225f, 0.439215720f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MintCream = {
-    {{0.960784376f, 1.f, 0.980392218f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MistyRose = {
-    {{1.f, 0.894117713f, 0.882353008f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Moccasin = {{{1.f, 0.894117713f, 0.709803939f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 NavajoWhite = {
-    {{1.f, 0.870588303f, 0.678431392f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Navy = {{{0.f, 0.f, 0.501960814f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 OldLace = {
-    {{0.992156923f, 0.960784376f, 0.901960850f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Olive = {{{0.501960814f, 0.501960814f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 OliveDrab = {
-    {{0.419607878f, 0.556862772f, 0.137254909f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Orange = {{{1.f, 0.647058845f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 OrangeRed = {{{1.f, 0.270588249f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Orchid = {
-    {{0.854902029f, 0.439215720f, 0.839215755f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 PaleGoldenrod = {
-    {{0.933333397f, 0.909803987f, 0.666666687f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 PaleGreen = {
-    {{0.596078455f, 0.984313786f, 0.596078455f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 PaleTurquoise = {
-    {{0.686274529f, 0.933333397f, 0.933333397f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 PaleVioletRed = {
-    {{0.858823597f, 0.439215720f, 0.576470613f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 PapayaWhip = {
-    {{1.f, 0.937254965f, 0.835294187f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 PeachPuff = {
-    {{1.f, 0.854902029f, 0.725490212f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Peru = {
-    {{0.803921640f, 0.521568656f, 0.247058839f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Pink = {{{1.f, 0.752941251f, 0.796078503f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Plum = {
-    {{0.866666734f, 0.627451003f, 0.866666734f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 PowderBlue = {
-    {{0.690196097f, 0.878431439f, 0.901960850f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Purple = {{{0.501960814f, 0.f, 0.501960814f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Red = {{{1.f, 0.f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 RosyBrown = {
-    {{0.737254918f, 0.560784340f, 0.560784340f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 RoyalBlue = {
-    {{0.254901975f, 0.411764741f, 0.882353008f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SaddleBrown = {
-    {{0.545098066f, 0.270588249f, 0.074509807f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Salmon = {
-    {{0.980392218f, 0.501960814f, 0.447058856f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SandyBrown = {
-    {{0.956862807f, 0.643137276f, 0.376470625f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SeaGreen = {
-    {{0.180392161f, 0.545098066f, 0.341176480f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SeaShell = {{{1.f, 0.960784376f, 0.933333397f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Sienna = {
-    {{0.627451003f, 0.321568638f, 0.176470593f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Silver = {
-    {{0.752941251f, 0.752941251f, 0.752941251f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SkyBlue = {
-    {{0.529411793f, 0.807843208f, 0.921568692f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SlateBlue = {
-    {{0.415686309f, 0.352941185f, 0.803921640f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SlateGray = {
-    {{0.439215720f, 0.501960814f, 0.564705908f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Snow = {{{1.f, 0.980392218f, 0.980392218f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SpringGreen = {{{0.f, 1.f, 0.498039246f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SteelBlue = {
-    {{0.274509817f, 0.509803951f, 0.705882370f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Tan = {
-    {{0.823529482f, 0.705882370f, 0.549019635f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Teal = {{{0.f, 0.501960814f, 0.501960814f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Thistle = {
-    {{0.847058892f, 0.749019623f, 0.847058892f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Tomato = {{{1.f, 0.388235331f, 0.278431386f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Transparent = {{{0.f, 0.f, 0.f, 0.f}}};
-XMGLOBALCONST XMVECTORF32 Turquoise = {
-    {{0.250980407f, 0.878431439f, 0.815686345f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Violet = {
-    {{0.933333397f, 0.509803951f, 0.933333397f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Wheat = {
-    {{0.960784376f, 0.870588303f, 0.701960802f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 White = {{{1.f, 1.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 WhiteSmoke = {
-    {{0.960784376f, 0.960784376f, 0.960784376f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Yellow = {{{1.f, 1.f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 YellowGreen = {
-    {{0.603921592f, 0.803921640f, 0.196078449f, 1.f}}};
-
-}  // namespace Colors
-
-namespace ColorsLinear {
-// Standard colors (Red/Green/Blue/Alpha) in linear colorspace
-XMGLOBALCONST XMVECTORF32 AliceBlue = {
-    {{0.871367335f, 0.938685894f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 AntiqueWhite = {
-    {{0.955973506f, 0.830770075f, 0.679542601f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Aqua = {{{0.f, 1.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Aquamarine = {
-    {{0.212230787f, 1.f, 0.658374965f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Azure = {{{0.871367335f, 1.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Beige = {
-    {{0.913098991f, 0.913098991f, 0.715693772f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Bisque = {{{1.f, 0.775822461f, 0.552011609f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Black = {{{0.f, 0.f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 BlanchedAlmond = {
-    {{1.f, 0.830770075f, 0.610495746f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Blue = {{{0.f, 0.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 BlueViolet = {
-    {{0.254152179f, 0.024157630f, 0.760524750f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Brown = {
-    {{0.376262218f, 0.023153365f, 0.023153365f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 BurlyWood = {
-    {{0.730461001f, 0.479320228f, 0.242281199f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 CadetBlue = {
-    {{0.114435382f, 0.341914445f, 0.351532698f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Chartreuse = {{{0.212230787f, 1.f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Chocolate = {
-    {{0.644479871f, 0.141263321f, 0.012983031f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Coral = {{{1.f, 0.212230787f, 0.080219828f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 CornflowerBlue = {
-    {{0.127437726f, 0.300543845f, 0.846873462f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Cornsilk = {{{1.f, 0.938685894f, 0.715693772f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Crimson = {
-    {{0.715693772f, 0.006995410f, 0.045186214f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Cyan = {{{0.f, 1.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkBlue = {{{0.f, 0.f, 0.258182913f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkCyan = {{{0.f, 0.258182913f, 0.258182913f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkGoldenrod = {
-    {{0.479320228f, 0.238397658f, 0.003346536f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkGray = {
-    {{0.396755308f, 0.396755308f, 0.396755308f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkGreen = {{{0.f, 0.127437726f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkKhaki = {
-    {{0.508881450f, 0.473531544f, 0.147027299f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkMagenta = {
-    {{0.258182913f, 0.f, 0.258182913f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkOliveGreen = {
-    {{0.090841733f, 0.147027299f, 0.028426038f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkOrange = {{{1.f, 0.262250721f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkOrchid = {
-    {{0.318546832f, 0.031896040f, 0.603827536f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkRed = {{{0.258182913f, 0.f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkSalmon = {
-    {{0.814846814f, 0.304987371f, 0.194617867f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkSeaGreen = {
-    {{0.274677366f, 0.502886593f, 0.258182913f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkSlateBlue = {
-    {{0.064803280f, 0.046665095f, 0.258182913f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkSlateGray = {
-    {{0.028426038f, 0.078187428f, 0.078187428f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkTurquoise = {
-    {{0.f, 0.617206752f, 0.637597024f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DarkViolet = {
-    {{0.296138316f, 0.f, 0.651405811f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DeepPink = {{{1.f, 0.006995410f, 0.291770697f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DeepSkyBlue = {{{0.f, 0.520995677f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DimGray = {
-    {{0.141263321f, 0.141263321f, 0.141263321f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 DodgerBlue = {
-    {{0.012983031f, 0.278894335f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Firebrick = {
-    {{0.445201248f, 0.015996292f, 0.015996292f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 FloralWhite = {
-    {{1.f, 0.955973506f, 0.871367335f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 ForestGreen = {
-    {{0.015996292f, 0.258182913f, 0.015996292f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Fuchsia = {{{1.f, 0.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Gainsboro = {
-    {{0.715693772f, 0.715693772f, 0.715693772f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 GhostWhite = {
-    {{0.938685894f, 0.938685894f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Gold = {{{1.f, 0.679542601f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Goldenrod = {
-    {{0.701102138f, 0.376262218f, 0.014443844f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Gray = {
-    {{0.215860531f, 0.215860531f, 0.215860531f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Green = {{{0.f, 0.215860531f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 GreenYellow = {
-    {{0.417885154f, 1.f, 0.028426038f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Honeydew = {{{0.871367335f, 1.f, 0.871367335f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 HotPink = {{{1.f, 0.141263321f, 0.456411064f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 IndianRed = {
-    {{0.610495746f, 0.107023112f, 0.107023112f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Indigo = {{{0.070360109f, 0.f, 0.223227978f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Ivory = {{{1.f, 1.f, 0.871367335f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Khaki = {
-    {{0.871367335f, 0.791298151f, 0.262250721f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Lavender = {
-    {{0.791298151f, 0.791298151f, 0.955973506f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LavenderBlush = {
-    {{1.f, 0.871367335f, 0.913098991f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LawnGreen = {
-    {{0.201556295f, 0.973445475f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LemonChiffon = {
-    {{1.f, 0.955973506f, 0.610495746f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightBlue = {
-    {{0.417885154f, 0.686685443f, 0.791298151f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightCoral = {
-    {{0.871367335f, 0.215860531f, 0.215860531f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightCyan = {{{0.745404482f, 1.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightGoldenrodYellow = {
-    {{0.955973506f, 0.955973506f, 0.644479871f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightGray = {
-    {{0.651405811f, 0.651405811f, 0.651405811f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightGreen = {
-    {{0.278894335f, 0.854992807f, 0.278894335f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightPink = {
-    {{1.f, 0.467783839f, 0.533276618f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightSalmon = {
-    {{1.f, 0.351532698f, 0.194617867f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightSeaGreen = {
-    {{0.014443844f, 0.445201248f, 0.401977867f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightSkyBlue = {
-    {{0.242281199f, 0.617206752f, 0.955973506f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightSlateGray = {
-    {{0.184475034f, 0.246201396f, 0.318546832f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightSteelBlue = {
-    {{0.434153706f, 0.552011609f, 0.730461001f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LightYellow = {{{1.f, 1.f, 0.745404482f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Lime = {{{0.f, 1.f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 LimeGreen = {
-    {{0.031896040f, 0.610495746f, 0.031896040f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Linen = {
-    {{0.955973506f, 0.871367335f, 0.791298151f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Magenta = {{{1.f, 0.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Maroon = {{{0.215860531f, 0.f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumAquamarine = {
-    {{0.132868364f, 0.610495746f, 0.401977867f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumBlue = {{{0.f, 0.f, 0.610495746f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumOrchid = {
-    {{0.491020888f, 0.090841733f, 0.651405811f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumPurple = {
-    {{0.291770697f, 0.162029430f, 0.708376050f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumSeaGreen = {
-    {{0.045186214f, 0.450785846f, 0.165132239f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumSlateBlue = {
-    {{0.198069349f, 0.138431653f, 0.854992807f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumSpringGreen = {
-    {{0.f, 0.955973506f, 0.323143244f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumTurquoise = {
-    {{0.064803280f, 0.637597024f, 0.603827536f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MediumVioletRed = {
-    {{0.571125031f, 0.007499032f, 0.234550655f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MidnightBlue = {
-    {{0.009721218f, 0.009721218f, 0.162029430f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MintCream = {
-    {{0.913098991f, 1.f, 0.955973506f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 MistyRose = {
-    {{1.f, 0.775822461f, 0.752942443f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Moccasin = {{{1.f, 0.775822461f, 0.462077051f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 NavajoWhite = {
-    {{1.f, 0.730461001f, 0.417885154f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Navy = {{{0.f, 0.f, 0.215860531f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 OldLace = {
-    {{0.982250869f, 0.913098991f, 0.791298151f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Olive = {{{0.215860531f, 0.215860531f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 OliveDrab = {
-    {{0.147027299f, 0.270497859f, 0.016807375f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Orange = {{{1.f, 0.376262218f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 OrangeRed = {{{1.f, 0.059511241f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Orchid = {
-    {{0.701102138f, 0.162029430f, 0.672443330f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 PaleGoldenrod = {
-    {{0.854992807f, 0.806952477f, 0.401977867f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 PaleGreen = {
-    {{0.313988745f, 0.964686573f, 0.313988745f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 PaleTurquoise = {
-    {{0.428690553f, 0.854992807f, 0.854992807f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 PaleVioletRed = {
-    {{0.708376050f, 0.162029430f, 0.291770697f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 PapayaWhip = {
-    {{1.f, 0.863157392f, 0.665387452f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 PeachPuff = {
-    {{1.f, 0.701102138f, 0.485149980f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Peru = {
-    {{0.610495746f, 0.234550655f, 0.049706575f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Pink = {{{1.f, 0.527115345f, 0.597202003f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Plum = {
-    {{0.723055363f, 0.351532698f, 0.723055363f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 PowderBlue = {
-    {{0.434153706f, 0.745404482f, 0.791298151f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Purple = {{{0.215860531f, 0.f, 0.215860531f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Red = {{{1.f, 0.f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 RosyBrown = {
-    {{0.502886593f, 0.274677366f, 0.274677366f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 RoyalBlue = {
-    {{0.052860655f, 0.141263321f, 0.752942443f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SaddleBrown = {
-    {{0.258182913f, 0.059511241f, 0.006512091f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Salmon = {
-    {{0.955973506f, 0.215860531f, 0.168269455f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SandyBrown = {
-    {{0.904661357f, 0.371237785f, 0.116970696f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SeaGreen = {
-    {{0.027320892f, 0.258182913f, 0.095307484f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SeaShell = {{{1.f, 0.913098991f, 0.854992807f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Sienna = {
-    {{0.351532698f, 0.084376216f, 0.026241222f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Silver = {
-    {{0.527115345f, 0.527115345f, 0.527115345f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SkyBlue = {
-    {{0.242281199f, 0.617206752f, 0.830770075f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SlateBlue = {
-    {{0.144128501f, 0.102241747f, 0.610495746f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SlateGray = {
-    {{0.162029430f, 0.215860531f, 0.278894335f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Snow = {{{1.f, 0.955973506f, 0.955973506f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SpringGreen = {{{0.f, 1.f, 0.212230787f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 SteelBlue = {
-    {{0.061246071f, 0.223227978f, 0.456411064f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Tan = {
-    {{0.644479871f, 0.456411064f, 0.262250721f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Teal = {{{0.f, 0.215860531f, 0.215860531f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Thistle = {
-    {{0.686685443f, 0.520995677f, 0.686685443f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Tomato = {{{1.f, 0.124771863f, 0.063010029f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Transparent = {{{0.f, 0.f, 0.f, 0.f}}};
-XMGLOBALCONST XMVECTORF32 Turquoise = {
-    {{0.051269468f, 0.745404482f, 0.630757332f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Violet = {
-    {{0.854992807f, 0.223227978f, 0.854992807f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Wheat = {
-    {{0.913098991f, 0.730461001f, 0.450785846f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 White = {{{1.f, 1.f, 1.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 WhiteSmoke = {
-    {{0.913098991f, 0.913098991f, 0.913098991f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 Yellow = {{{1.f, 1.f, 0.f, 1.f}}};
-XMGLOBALCONST XMVECTORF32 YellowGreen = {
-    {{0.323143244f, 0.610495746f, 0.031896040f, 1.f}}};
-
-}  // namespace ColorsLinear
-
-}  // namespace DirectX
diff --git a/targets/app/linux/Stubs/DirectXMath/DirectXMath.h b/targets/app/linux/Stubs/DirectXMath/DirectXMath.h
deleted file mode 100644
index e3e629732..000000000
--- a/targets/app/linux/Stubs/DirectXMath/DirectXMath.h
+++ /dev/null
@@ -1,3092 +0,0 @@
-//-------------------------------------------------------------------------------------
-// DirectXMath.h -- SIMD C++ Math library
-//
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT License.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#pragma once
-
-#ifndef __cplusplus
-#error DirectX Math requires C++
-#endif
-
-#define DIRECTX_MATH_VERSION 320
-
-#if defined(_MSC_VER) && (_MSC_VER < 1910)
-#error DirectX Math requires Visual C++ 2017 or later.
-#endif
-
-#if defined(_MSC_VER) && !defined(_M_ARM) && !defined(_M_ARM64) &&          \
-    !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC) && (!_MANAGED) && \
-    (!_M_CEE) && (!defined(_M_IX86_FP) || (_M_IX86_FP > 1)) &&              \
-    !defined(_XM_NO_INTRINSICS_) && !defined(_XM_VECTORCALL_)
-#define _XM_VECTORCALL_ 1
-#endif
-
-#if _XM_VECTORCALL_
-#define XM_CALLCONV __vectorcall
-#elif defined(__GNUC__)
-#define XM_CALLCONV
-#else
-#define XM_CALLCONV __fastcall
-#endif
-
-#ifndef XM_DEPRECATED
-#if (__cplusplus >= 201402L)
-#define XM_DEPRECATED [[deprecated]]
-#elif defined(__GNUC__)
-#define XM_DEPRECATED __attribute__((deprecated))
-#else
-#define XM_DEPRECATED      \
-    __declspec(deprecated( \
-        "This is deprecated and will be removed in a future version."))
-#endif
-#endif
-
-#if !defined(_XM_AVX2_INTRINSICS_) && defined(__AVX2__) && \
-    !defined(_XM_NO_INTRINSICS_)
-#define _XM_AVX2_INTRINSICS_
-#endif
-
-#if !defined(_XM_FMA3_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && \
-    !defined(_XM_NO_INTRINSICS_)
-#define _XM_FMA3_INTRINSICS_
-#endif
-
-#if !defined(_XM_F16C_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && \
-    !defined(_XM_NO_INTRINSICS_)
-#define _XM_F16C_INTRINSICS_
-#endif
-
-#if !defined(_XM_F16C_INTRINSICS_) && defined(__F16C__) && \
-    !defined(_XM_NO_INTRINSICS_)
-#define _XM_F16C_INTRINSICS_
-#endif
-
-#if defined(_XM_FMA3_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
-#define _XM_AVX_INTRINSICS_
-#endif
-
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
-#define _XM_AVX_INTRINSICS_
-#endif
-
-#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && \
-    !defined(_XM_NO_INTRINSICS_)
-#define _XM_AVX_INTRINSICS_
-#endif
-
-#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_)
-#define _XM_SSE4_INTRINSICS_
-#endif
-
-#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE3_INTRINSICS_)
-#define _XM_SSE3_INTRINSICS_
-#endif
-
-#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_)
-#define _XM_SSE_INTRINSICS_
-#endif
-
-#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && \
-    !defined(_XM_NO_INTRINSICS_)
-#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && \
-    !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC)
-#define _XM_SSE_INTRINSICS_
-#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __arm__ || __aarch64__
-#define _XM_ARM_NEON_INTRINSICS_
-#elif !defined(_XM_NO_INTRINSICS_)
-#error DirectX Math does not support this target
-#endif
-#endif  // !_XM_ARM_NEON_INTRINSICS_ && !_XM_SSE_INTRINSICS_ &&
-        // !_XM_NO_INTRINSICS_
-
-#if defined(_XM_SSE_INTRINSICS_) && defined(_MSC_VER) && (_MSC_VER >= 1920) && \
-    !defined(__clang__) && !defined(_XM_SVML_INTRINSICS_) &&                   \
-    !defined(_XM_DISABLE_INTEL_SVML_)
-#define _XM_SVML_INTRINSICS_
-#endif
-
-#if !defined(_XM_NO_XMVECTOR_OVERLOADS_) && \
-    (defined(__clang__) || defined(__GNUC__)) && !defined(_XM_NO_INTRINSICS_)
-#define _XM_NO_XMVECTOR_OVERLOADS_
-#endif
-
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4514 4820)
-// C4514/4820: Off by default noise
-#endif
-#include <float.h>
-#include <math.h>
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-#ifndef _XM_NO_INTRINSICS_
-
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4987)
-// C4987: Off by default noise
-#endif
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-#if (defined(__clang__) || defined(__GNUC__)) && (__x86_64__ || __i386__) && \
-    !defined(__MINGW32__)
-#include <cpuid.h>
-#endif
-
-#ifdef _XM_SSE_INTRINSICS_
-#include <emmintrin.h>
-#include <xmmintrin.h>
-
-#ifdef _XM_SSE3_INTRINSICS_
-#include <pmmintrin.h>
-#endif
-
-#ifdef _XM_SSE4_INTRINSICS_
-#include <smmintrin.h>
-#endif
-
-#ifdef _XM_AVX_INTRINSICS_
-#include <immintrin.h>
-#endif
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC))
-#include <arm64_neon.h>
-#else
-#include <arm_neon.h>
-#endif
-#endif
-#endif  // !_XM_NO_INTRINSICS_
-
-#include <assert.h>
-
-#include "sal.h"
-
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4005 4668)
-// C4005/4668: Old header issue
-#endif
-#include <stdint.h>
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-#if (__cplusplus >= 201703L)
-#define XM_ALIGNED_DATA(x) alignas(x)
-#define XM_ALIGNED_STRUCT(x) struct alignas(x)
-#elif defined(__GNUC__)
-#define XM_ALIGNED_DATA(x) __attribute__((aligned(x)))
-#define XM_ALIGNED_STRUCT(x) struct __attribute__((aligned(x)))
-#else
-#define XM_ALIGNED_DATA(x) __declspec(align(x))
-#define XM_ALIGNED_STRUCT(x) __declspec(align(x)) struct
-#endif
-
-#if (__cplusplus >= 202002L)
-#include <compare>
-#endif
-
-/****************************************************************************
- *
- * Conditional intrinsics
- *
- ****************************************************************************/
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-
-#if defined(_XM_NO_MOVNT_)
-#define XM_STREAM_PS(p, a) _mm_store_ps((p), (a))
-#define XM256_STREAM_PS(p, a) _mm256_store_ps((p), (a))
-#define XM_SFENCE()
-#else
-#define XM_STREAM_PS(p, a) _mm_stream_ps((p), (a))
-#define XM256_STREAM_PS(p, a) _mm256_stream_ps((p), (a))
-#define XM_SFENCE() _mm_sfence()
-#endif
-
-#if defined(_XM_FMA3_INTRINSICS_)
-#define XM_FMADD_PS(a, b, c) _mm_fmadd_ps((a), (b), (c))
-#define XM_FNMADD_PS(a, b, c) _mm_fnmadd_ps((a), (b), (c))
-#else
-#define XM_FMADD_PS(a, b, c) _mm_add_ps(_mm_mul_ps((a), (b)), (c))
-#define XM_FNMADD_PS(a, b, c) _mm_sub_ps((c), _mm_mul_ps((a), (b)))
-#endif
-
-#if defined(_XM_AVX_INTRINSICS_) && defined(_XM_FAVOR_INTEL_)
-#define XM_PERMUTE_PS(v, c) _mm_permute_ps((v), c)
-#else
-#define XM_PERMUTE_PS(v, c) _mm_shuffle_ps((v), (v), c)
-#endif
-
-#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 11)
-#define XM_LOADU_SI16(p) \
-    _mm_cvtsi32_si128(*reinterpret_cast<unsigned short const*>(p))
-#else
-#define XM_LOADU_SI16(p) _mm_loadu_si16(p)
-#endif
-
-#endif  // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
-#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-
-#if defined(__clang__) || defined(__GNUC__)
-#define XM_PREFETCH(a) __builtin_prefetch(a)
-#elif defined(_MSC_VER)
-#define XM_PREFETCH(a) __prefetch(a)
-#else
-#define XM_PREFETCH(a)
-#endif
-
-#endif  // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
-namespace DirectX {
-
-/****************************************************************************
- *
- * Constant definitions
- *
- ****************************************************************************/
-
-#if defined(__XNAMATH_H__) && defined(XM_PI)
-#undef XM_PI
-#undef XM_2PI
-#undef XM_1DIVPI
-#undef XM_1DIV2PI
-#undef XM_PIDIV2
-#undef XM_PIDIV4
-#undef XM_SELECT_0
-#undef XM_SELECT_1
-#undef XM_PERMUTE_0X
-#undef XM_PERMUTE_0Y
-#undef XM_PERMUTE_0Z
-#undef XM_PERMUTE_0W
-#undef XM_PERMUTE_1X
-#undef XM_PERMUTE_1Y
-#undef XM_PERMUTE_1Z
-#undef XM_PERMUTE_1W
-#undef XM_CRMASK_CR6
-#undef XM_CRMASK_CR6TRUE
-#undef XM_CRMASK_CR6FALSE
-#undef XM_CRMASK_CR6BOUNDS
-#undef XM_CACHE_LINE_SIZE
-#endif
-
-constexpr float XM_PI = 3.141592654f;
-constexpr float XM_2PI = 6.283185307f;
-constexpr float XM_1DIVPI = 0.318309886f;
-constexpr float XM_1DIV2PI = 0.159154943f;
-constexpr float XM_PIDIV2 = 1.570796327f;
-constexpr float XM_PIDIV4 = 0.785398163f;
-
-constexpr uint32_t XM_SELECT_0 = 0x00000000;
-constexpr uint32_t XM_SELECT_1 = 0xFFFFFFFF;
-
-constexpr uint32_t XM_PERMUTE_0X = 0;
-constexpr uint32_t XM_PERMUTE_0Y = 1;
-constexpr uint32_t XM_PERMUTE_0Z = 2;
-constexpr uint32_t XM_PERMUTE_0W = 3;
-constexpr uint32_t XM_PERMUTE_1X = 4;
-constexpr uint32_t XM_PERMUTE_1Y = 5;
-constexpr uint32_t XM_PERMUTE_1Z = 6;
-constexpr uint32_t XM_PERMUTE_1W = 7;
-
-constexpr uint32_t XM_SWIZZLE_X = 0;
-constexpr uint32_t XM_SWIZZLE_Y = 1;
-constexpr uint32_t XM_SWIZZLE_Z = 2;
-constexpr uint32_t XM_SWIZZLE_W = 3;
-
-constexpr uint32_t XM_CRMASK_CR6 = 0x000000F0;
-constexpr uint32_t XM_CRMASK_CR6TRUE = 0x00000080;
-constexpr uint32_t XM_CRMASK_CR6FALSE = 0x00000020;
-constexpr uint32_t XM_CRMASK_CR6BOUNDS = XM_CRMASK_CR6FALSE;
-
-#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __arm__ || __aarch64__
-constexpr size_t XM_CACHE_LINE_SIZE = 128;
-#else
-constexpr size_t XM_CACHE_LINE_SIZE = 64;
-#endif
-
-/****************************************************************************
- *
- * Macros
- *
- ****************************************************************************/
-
-#if defined(__XNAMATH_H__) && defined(XMComparisonAllTrue)
-#undef XMComparisonAllTrue
-#undef XMComparisonAnyTrue
-#undef XMComparisonAllFalse
-#undef XMComparisonAnyFalse
-#undef XMComparisonMixed
-#undef XMComparisonAllInBounds
-#undef XMComparisonAnyOutOfBounds
-#endif
-
-// Unit conversion
-
-constexpr float XMConvertToRadians(float fDegrees) noexcept {
-    return fDegrees * (XM_PI / 180.0f);
-}
-constexpr float XMConvertToDegrees(float fRadians) noexcept {
-    return fRadians * (180.0f / XM_PI);
-}
-
-// Condition register evaluation proceeding a recording (R) comparison
-
-constexpr bool XMComparisonAllTrue(uint32_t CR) noexcept {
-    return (CR & XM_CRMASK_CR6TRUE) == XM_CRMASK_CR6TRUE;
-}
-constexpr bool XMComparisonAnyTrue(uint32_t CR) noexcept {
-    return (CR & XM_CRMASK_CR6FALSE) != XM_CRMASK_CR6FALSE;
-}
-constexpr bool XMComparisonAllFalse(uint32_t CR) noexcept {
-    return (CR & XM_CRMASK_CR6FALSE) == XM_CRMASK_CR6FALSE;
-}
-constexpr bool XMComparisonAnyFalse(uint32_t CR) noexcept {
-    return (CR & XM_CRMASK_CR6TRUE) != XM_CRMASK_CR6TRUE;
-}
-constexpr bool XMComparisonMixed(uint32_t CR) noexcept {
-    return (CR & XM_CRMASK_CR6) == 0;
-}
-constexpr bool XMComparisonAllInBounds(uint32_t CR) noexcept {
-    return (CR & XM_CRMASK_CR6BOUNDS) == XM_CRMASK_CR6BOUNDS;
-}
-constexpr bool XMComparisonAnyOutOfBounds(uint32_t CR) noexcept {
-    return (CR & XM_CRMASK_CR6BOUNDS) != XM_CRMASK_CR6BOUNDS;
-}
-
-/****************************************************************************
- *
- * Data types
- *
- ****************************************************************************/
-
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4068 4201 4365 4324 4820)
-// C4068: ignore unknown pragmas
-// C4201: nonstandard extension used : nameless struct/union
-// C4365: Off by default noise
-// C4324/4820: padding warnings
-#endif
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
-#endif
-
-//------------------------------------------------------------------------------
-#if defined(_XM_NO_INTRINSICS_)
-struct __vector4 {
-    union {
-        float vector4_f32[4];
-        uint32_t vector4_u32[4];
-    };
-};
-#endif  // _XM_NO_INTRINSICS_
-
-//------------------------------------------------------------------------------
-// Vector intrinsic: Four 32 bit floating point components aligned on a 16 byte
-// boundary and mapped to hardware vector registers
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-using XMVECTOR = __m128;
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-using XMVECTOR = float32x4_t;
-#else
-using XMVECTOR = __vector4;
-#endif
-
-// Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86,
-// ARM, ARM64, and vector call; by reference otherwise
-#if (defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || \
-     _XM_VECTORCALL_ || __i386__ || __arm__ || __aarch64__) &&   \
-    !defined(_XM_NO_INTRINSICS_)
-typedef const XMVECTOR FXMVECTOR;
-#else
-typedef const XMVECTOR& FXMVECTOR;
-#endif
-
-// Fix-up for (4th) XMVECTOR parameter to pass in-register for ARM, ARM64, and
-// vector call; by reference otherwise
-#if (defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-     defined(_M_ARM64EC) || _XM_VECTORCALL_ || __arm__ || __aarch64__) &&    \
-    !defined(_XM_NO_INTRINSICS_)
-typedef const XMVECTOR GXMVECTOR;
-#else
-typedef const XMVECTOR& GXMVECTOR;
-#endif
-
-// Fix-up for (5th & 6th) XMVECTOR parameter to pass in-register for ARM64 and
-// vector call; by reference otherwise
-#if (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) ||      \
-     defined(_M_ARM64EC) || _XM_VECTORCALL_ || __aarch64__) && \
-    !defined(_XM_NO_INTRINSICS_)
-typedef const XMVECTOR HXMVECTOR;
-#else
-typedef const XMVECTOR& HXMVECTOR;
-#endif
-
-// Fix-up for (7th+) XMVECTOR parameters to pass by reference
-typedef const XMVECTOR& CXMVECTOR;
-
-//------------------------------------------------------------------------------
-// Conversion types for constants
-XM_ALIGNED_STRUCT(16) XMVECTORF32 {
-    union {
-        float f[4];
-        XMVECTOR v;
-    };
-
-    inline operator XMVECTOR() const noexcept { return v; }
-    inline operator const float*() const noexcept { return f; }
-#ifdef _XM_NO_INTRINSICS_
-#elif defined(_XM_SSE_INTRINSICS_)
-    inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
-    inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && \
-    (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES))
-    inline operator int32x4_t() const noexcept {
-        return vreinterpretq_s32_f32(v);
-    }
-    inline operator uint32x4_t() const noexcept {
-        return vreinterpretq_u32_f32(v);
-    }
-#endif
-};
-
-XM_ALIGNED_STRUCT(16) XMVECTORI32 {
-    union {
-        int32_t i[4];
-        XMVECTOR v;
-    };
-
-    inline operator XMVECTOR() const noexcept { return v; }
-#ifdef _XM_NO_INTRINSICS_
-#elif defined(_XM_SSE_INTRINSICS_)
-    inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
-    inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && \
-    (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES))
-    inline operator int32x4_t() const noexcept {
-        return vreinterpretq_s32_f32(v);
-    }
-    inline operator uint32x4_t() const noexcept {
-        return vreinterpretq_u32_f32(v);
-    }
-#endif
-};
-
-XM_ALIGNED_STRUCT(16) XMVECTORU8 {
-    union {
-        uint8_t u[16];
-        XMVECTOR v;
-    };
-
-    inline operator XMVECTOR() const noexcept { return v; }
-#ifdef _XM_NO_INTRINSICS_
-#elif defined(_XM_SSE_INTRINSICS_)
-    inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
-    inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && \
-    (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES))
-    inline operator int32x4_t() const noexcept {
-        return vreinterpretq_s32_f32(v);
-    }
-    inline operator uint32x4_t() const noexcept {
-        return vreinterpretq_u32_f32(v);
-    }
-#endif
-};
-
-XM_ALIGNED_STRUCT(16) XMVECTORU32 {
-    union {
-        uint32_t u[4];
-        XMVECTOR v;
-    };
-
-    inline operator XMVECTOR() const noexcept { return v; }
-#ifdef _XM_NO_INTRINSICS_
-#elif defined(_XM_SSE_INTRINSICS_)
-    inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
-    inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && \
-    (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES))
-    inline operator int32x4_t() const noexcept {
-        return vreinterpretq_s32_f32(v);
-    }
-    inline operator uint32x4_t() const noexcept {
-        return vreinterpretq_u32_f32(v);
-    }
-#endif
-};
-
-//------------------------------------------------------------------------------
-// Vector operators
-
-#ifndef _XM_NO_XMVECTOR_OVERLOADS_
-XMVECTOR XM_CALLCONV operator+(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV operator-(FXMVECTOR V) noexcept;
-
-XMVECTOR& XM_CALLCONV operator+=(XMVECTOR& V1, FXMVECTOR V2) noexcept;
-XMVECTOR& XM_CALLCONV operator-=(XMVECTOR& V1, FXMVECTOR V2) noexcept;
-XMVECTOR& XM_CALLCONV operator*=(XMVECTOR& V1, FXMVECTOR V2) noexcept;
-XMVECTOR& XM_CALLCONV operator/=(XMVECTOR& V1, FXMVECTOR V2) noexcept;
-
-XMVECTOR& operator*=(XMVECTOR& V, float S) noexcept;
-XMVECTOR& operator/=(XMVECTOR& V, float S) noexcept;
-
-XMVECTOR XM_CALLCONV operator+(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV operator-(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV operator*(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV operator/(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV operator*(FXMVECTOR V, float S) noexcept;
-XMVECTOR XM_CALLCONV operator*(float S, FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV operator/(FXMVECTOR V, float S) noexcept;
-#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */
-
-//------------------------------------------------------------------------------
-// Matrix type: Sixteen 32 bit floating point components aligned on a
-// 16 byte boundary and mapped to four hardware vector registers
-
-struct XMMATRIX;
-
-// Fix-up for (1st) XMMATRIX parameter to pass in-register for ARM64 and vector
-// call; by reference otherwise
-#if (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) ||      \
-     defined(_M_ARM64EC) || _XM_VECTORCALL_ || __aarch64__) && \
-    !defined(_XM_NO_INTRINSICS_)
-typedef const XMMATRIX FXMMATRIX;
-#else
-typedef const XMMATRIX& FXMMATRIX;
-#endif
-
-// Fix-up for (2nd+) XMMATRIX parameters to pass by reference
-typedef const XMMATRIX& CXMMATRIX;
-
-#ifdef _XM_NO_INTRINSICS_
-struct XMMATRIX
-#else
-XM_ALIGNED_STRUCT(16)
-XMMATRIX
-#endif
-{
-#ifdef _XM_NO_INTRINSICS_
-    union {
-        XMVECTOR r[4];
-        struct {
-            float _11, _12, _13, _14;
-            float _21, _22, _23, _24;
-            float _31, _32, _33, _34;
-            float _41, _42, _43, _44;
-        };
-        float m[4][4];
-    };
-#else
-    XMVECTOR r[4];
-#endif
-
-    XMMATRIX() = default;
-
-    XMMATRIX(const XMMATRIX&) = default;
-
-#if defined(_MSC_VER) && (_MSC_FULL_VER < 191426431)
-    XMMATRIX& operator=(const XMMATRIX& M) noexcept {
-        r[0] = M.r[0];
-        r[1] = M.r[1];
-        r[2] = M.r[2];
-        r[3] = M.r[3];
-        return *this;
-    }
-#else
-    XMMATRIX& operator=(const XMMATRIX&) = default;
-
-    XMMATRIX(XMMATRIX&&) = default;
-    XMMATRIX& operator=(XMMATRIX&&) = default;
-#endif
-
-    constexpr XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2,
-                       CXMVECTOR R3) noexcept
-        : r{R0, R1, R2, R3} {}
-    XMMATRIX(float m00, float m01, float m02, float m03, float m10, float m11,
-             float m12, float m13, float m20, float m21, float m22, float m23,
-             float m30, float m31, float m32, float m33) noexcept;
-    explicit XMMATRIX(_In_reads_(16) const float* pArray) noexcept;
-
-#ifdef _XM_NO_INTRINSICS_
-    float operator()(size_t Row, size_t Column) const noexcept {
-        return m[Row][Column];
-    }
-    float& operator()(size_t Row, size_t Column) noexcept {
-        return m[Row][Column];
-    }
-#endif
-
-    XMMATRIX operator+() const noexcept { return *this; }
-    XMMATRIX operator-() const noexcept;
-
-    XMMATRIX& XM_CALLCONV operator+=(FXMMATRIX M) noexcept;
-    XMMATRIX& XM_CALLCONV operator-=(FXMMATRIX M) noexcept;
-    XMMATRIX& XM_CALLCONV operator*=(FXMMATRIX M) noexcept;
-    XMMATRIX& operator*=(float S) noexcept;
-    XMMATRIX& operator/=(float S) noexcept;
-
-    XMMATRIX XM_CALLCONV operator+(FXMMATRIX M) const noexcept;
-    XMMATRIX XM_CALLCONV operator-(FXMMATRIX M) const noexcept;
-    XMMATRIX XM_CALLCONV operator*(FXMMATRIX M) const noexcept;
-    XMMATRIX operator*(float S) const noexcept;
-    XMMATRIX operator/(float S) const noexcept;
-
-    friend XMMATRIX XM_CALLCONV operator*(float S, FXMMATRIX M) noexcept;
-};
-
-//------------------------------------------------------------------------------
-// 2D Vector; 32 bit floating point components
-struct XMFLOAT2 {
-    float x;
-    float y;
-
-    XMFLOAT2() = default;
-
-    XMFLOAT2(const XMFLOAT2&) = default;
-    XMFLOAT2& operator=(const XMFLOAT2&) = default;
-
-    XMFLOAT2(XMFLOAT2&&) = default;
-    XMFLOAT2& operator=(XMFLOAT2&&) = default;
-
-    constexpr XMFLOAT2(float _x, float _y) noexcept : x(_x), y(_y) {}
-    explicit XMFLOAT2(_In_reads_(2) const float* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]) {}
-
-#if (__cplusplus >= 202002L)
-    bool operator==(const XMFLOAT2&) const = default;
-    auto operator<=>(const XMFLOAT2&) const = default;
-#endif
-};
-
-// 2D Vector; 32 bit floating point components aligned on a 16 byte boundary
-XM_ALIGNED_STRUCT(16) XMFLOAT2A : public XMFLOAT2 { using XMFLOAT2::XMFLOAT2; };
-
-//------------------------------------------------------------------------------
-// 2D Vector; 32 bit signed integer components
-struct XMINT2 {
-    int32_t x;
-    int32_t y;
-
-    XMINT2() = default;
-
-    XMINT2(const XMINT2&) = default;
-    XMINT2& operator=(const XMINT2&) = default;
-
-    XMINT2(XMINT2&&) = default;
-    XMINT2& operator=(XMINT2&&) = default;
-
-    constexpr XMINT2(int32_t _x, int32_t _y) noexcept : x(_x), y(_y) {}
-    explicit XMINT2(_In_reads_(2) const int32_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]) {}
-
-#if (__cplusplus >= 202002L)
-    bool operator==(const XMINT2&) const = default;
-    auto operator<=>(const XMINT2&) const = default;
-#endif
-};
-
-// 2D Vector; 32 bit unsigned integer components
-struct XMUINT2 {
-    uint32_t x;
-    uint32_t y;
-
-    XMUINT2() = default;
-
-    XMUINT2(const XMUINT2&) = default;
-    XMUINT2& operator=(const XMUINT2&) = default;
-
-    XMUINT2(XMUINT2&&) = default;
-    XMUINT2& operator=(XMUINT2&&) = default;
-
-    constexpr XMUINT2(uint32_t _x, uint32_t _y) noexcept : x(_x), y(_y) {}
-    explicit XMUINT2(_In_reads_(2) const uint32_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]) {}
-
-#if (__cplusplus >= 202002L)
-    bool operator==(const XMUINT2&) const = default;
-    auto operator<=>(const XMUINT2&) const = default;
-#endif
-};
-
-//------------------------------------------------------------------------------
-// 3D Vector; 32 bit floating point components
-struct XMFLOAT3 {
-    float x;
-    float y;
-    float z;
-
-    XMFLOAT3() = default;
-
-    XMFLOAT3(const XMFLOAT3&) = default;
-    XMFLOAT3& operator=(const XMFLOAT3&) = default;
-
-    XMFLOAT3(XMFLOAT3&&) = default;
-    XMFLOAT3& operator=(XMFLOAT3&&) = default;
-
-    constexpr XMFLOAT3(float _x, float _y, float _z) noexcept
-        : x(_x), y(_y), z(_z) {}
-    explicit XMFLOAT3(_In_reads_(3) const float* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]) {}
-};
-
-// 3D Vector; 32 bit floating point components aligned on a 16 byte boundary
-XM_ALIGNED_STRUCT(16) XMFLOAT3A : public XMFLOAT3 { using XMFLOAT3::XMFLOAT3; };
-
-//------------------------------------------------------------------------------
-// 3D Vector; 32 bit signed integer components
-struct XMINT3 {
-    int32_t x;
-    int32_t y;
-    int32_t z;
-
-    XMINT3() = default;
-
-    XMINT3(const XMINT3&) = default;
-    XMINT3& operator=(const XMINT3&) = default;
-
-    XMINT3(XMINT3&&) = default;
-    XMINT3& operator=(XMINT3&&) = default;
-
-    constexpr XMINT3(int32_t _x, int32_t _y, int32_t _z) noexcept
-        : x(_x), y(_y), z(_z) {}
-    explicit XMINT3(_In_reads_(3) const int32_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]) {}
-
-#if (__cplusplus >= 202002L)
-    bool operator==(const XMINT3&) const = default;
-    auto operator<=>(const XMINT3&) const = default;
-#endif
-};
-
-// 3D Vector; 32 bit unsigned integer components
-struct XMUINT3 {
-    uint32_t x;
-    uint32_t y;
-    uint32_t z;
-
-    XMUINT3() = default;
-
-    XMUINT3(const XMUINT3&) = default;
-    XMUINT3& operator=(const XMUINT3&) = default;
-
-    XMUINT3(XMUINT3&&) = default;
-    XMUINT3& operator=(XMUINT3&&) = default;
-
-    constexpr XMUINT3(uint32_t _x, uint32_t _y, uint32_t _z) noexcept
-        : x(_x), y(_y), z(_z) {}
-    explicit XMUINT3(_In_reads_(3) const uint32_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]) {}
-
-#if (__cplusplus >= 202002L)
-    bool operator==(const XMUINT3&) const = default;
-    auto operator<=>(const XMUINT3&) const = default;
-#endif
-};
-
-//------------------------------------------------------------------------------
-// 4D Vector; 32 bit floating point components
-struct XMFLOAT4 {
-    float x;
-    float y;
-    float z;
-    float w;
-
-    XMFLOAT4() = default;
-
-    XMFLOAT4(const XMFLOAT4&) = default;
-    XMFLOAT4& operator=(const XMFLOAT4&) = default;
-
-    XMFLOAT4(XMFLOAT4&&) = default;
-    XMFLOAT4& operator=(XMFLOAT4&&) = default;
-
-    constexpr XMFLOAT4(float _x, float _y, float _z, float _w) noexcept
-        : x(_x), y(_y), z(_z), w(_w) {}
-    explicit XMFLOAT4(_In_reads_(4) const float* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
-
-#if (__cplusplus >= 202002L)
-    bool operator==(const XMFLOAT4&) const = default;
-    auto operator<=>(const XMFLOAT4&) const = default;
-#endif
-};
-
-// 4D Vector; 32 bit floating point components aligned on a 16 byte boundary
-XM_ALIGNED_STRUCT(16) XMFLOAT4A : public XMFLOAT4 { using XMFLOAT4::XMFLOAT4; };
-
-//------------------------------------------------------------------------------
-// 4D Vector; 32 bit signed integer components
-struct XMINT4 {
-    int32_t x;
-    int32_t y;
-    int32_t z;
-    int32_t w;
-
-    XMINT4() = default;
-
-    XMINT4(const XMINT4&) = default;
-    XMINT4& operator=(const XMINT4&) = default;
-
-    XMINT4(XMINT4&&) = default;
-    XMINT4& operator=(XMINT4&&) = default;
-
-    constexpr XMINT4(int32_t _x, int32_t _y, int32_t _z, int32_t _w) noexcept
-        : x(_x), y(_y), z(_z), w(_w) {}
-    explicit XMINT4(_In_reads_(4) const int32_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
-
-#if (__cplusplus >= 202002L)
-    bool operator==(const XMINT4&) const = default;
-    auto operator<=>(const XMINT4&) const = default;
-#endif
-};
-
-// 4D Vector; 32 bit unsigned integer components
-struct XMUINT4 {
-    uint32_t x;
-    uint32_t y;
-    uint32_t z;
-    uint32_t w;
-
-    XMUINT4() = default;
-
-    XMUINT4(const XMUINT4&) = default;
-    XMUINT4& operator=(const XMUINT4&) = default;
-
-    XMUINT4(XMUINT4&&) = default;
-    XMUINT4& operator=(XMUINT4&&) = default;
-
-    constexpr XMUINT4(uint32_t _x, uint32_t _y, uint32_t _z,
-                      uint32_t _w) noexcept
-        : x(_x), y(_y), z(_z), w(_w) {}
-    explicit XMUINT4(_In_reads_(4) const uint32_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
-
-#if (__cplusplus >= 202002L)
-    bool operator==(const XMUINT4&) const = default;
-    auto operator<=>(const XMUINT4&) const = default;
-#endif
-};
-
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
-#pragma clang diagnostic ignored "-Wnested-anon-types"
-#pragma clang diagnostic ignored "-Wunknown-warning-option"
-#pragma clang diagnostic ignored "-Wunsafe-buffer-usage"
-#endif
-
-//------------------------------------------------------------------------------
-// 3x3 Matrix: 32 bit floating point components
-struct XMFLOAT3X3 {
-    union {
-        struct {
-            float _11, _12, _13;
-            float _21, _22, _23;
-            float _31, _32, _33;
-        };
-        float m[3][3];
-    };
-
-    XMFLOAT3X3() = default;
-
-    XMFLOAT3X3(const XMFLOAT3X3&) = default;
-    XMFLOAT3X3& operator=(const XMFLOAT3X3&) = default;
-
-    XMFLOAT3X3(XMFLOAT3X3&&) = default;
-    XMFLOAT3X3& operator=(XMFLOAT3X3&&) = default;
-
-    constexpr XMFLOAT3X3(float m00, float m01, float m02, float m10, float m11,
-                         float m12, float m20, float m21, float m22) noexcept
-        : _11(m00),
-          _12(m01),
-          _13(m02),
-          _21(m10),
-          _22(m11),
-          _23(m12),
-          _31(m20),
-          _32(m21),
-          _33(m22) {}
-    explicit XMFLOAT3X3(_In_reads_(9) const float* pArray) noexcept;
-
-    float operator()(size_t Row, size_t Column) const noexcept {
-        return m[Row][Column];
-    }
-    float& operator()(size_t Row, size_t Column) noexcept {
-        return m[Row][Column];
-    }
-
-#if (__cplusplus >= 202002L)
-    bool operator==(const XMFLOAT3X3&) const = delete;
-    auto operator<=>(const XMFLOAT3X3&) const = delete;
-#endif
-};
-
-//------------------------------------------------------------------------------
-// 4x3 Row-major Matrix: 32 bit floating point components
-struct XMFLOAT4X3 {
-    union {
-        struct {
-            float _11, _12, _13;
-            float _21, _22, _23;
-            float _31, _32, _33;
-            float _41, _42, _43;
-        };
-        float m[4][3];
-        float f[12];
-    };
-
-    XMFLOAT4X3() = default;
-
-    XMFLOAT4X3(const XMFLOAT4X3&) = default;
-    XMFLOAT4X3& operator=(const XMFLOAT4X3&) = default;
-
-    XMFLOAT4X3(XMFLOAT4X3&&) = default;
-    XMFLOAT4X3& operator=(XMFLOAT4X3&&) = default;
-
-    constexpr XMFLOAT4X3(float m00, float m01, float m02, float m10, float m11,
-                         float m12, float m20, float m21, float m22, float m30,
-                         float m31, float m32) noexcept
-        : _11(m00),
-          _12(m01),
-          _13(m02),
-          _21(m10),
-          _22(m11),
-          _23(m12),
-          _31(m20),
-          _32(m21),
-          _33(m22),
-          _41(m30),
-          _42(m31),
-          _43(m32) {}
-    explicit XMFLOAT4X3(_In_reads_(12) const float* pArray) noexcept;
-
-    float operator()(size_t Row, size_t Column) const noexcept {
-        return m[Row][Column];
-    }
-    float& operator()(size_t Row, size_t Column) noexcept {
-        return m[Row][Column];
-    }
-
-#if (__cplusplus >= 202002L)
-    bool operator==(const XMFLOAT4X3&) const = delete;
-    auto operator<=>(const XMFLOAT4X3&) const = delete;
-#endif
-};
-
-// 4x3 Row-major Matrix: 32 bit floating point components aligned on a 16 byte
-// boundary
-XM_ALIGNED_STRUCT(16) XMFLOAT4X3A : public XMFLOAT4X3 {
-    using XMFLOAT4X3::XMFLOAT4X3;
-};
-
-//------------------------------------------------------------------------------
-// 3x4 Column-major Matrix: 32 bit floating point components
-struct XMFLOAT3X4 {
-    union {
-        struct {
-            float _11, _12, _13, _14;
-            float _21, _22, _23, _24;
-            float _31, _32, _33, _34;
-        };
-        float m[3][4];
-        float f[12];
-    };
-
-    XMFLOAT3X4() = default;
-
-    XMFLOAT3X4(const XMFLOAT3X4&) = default;
-    XMFLOAT3X4& operator=(const XMFLOAT3X4&) = default;
-
-    XMFLOAT3X4(XMFLOAT3X4&&) = default;
-    XMFLOAT3X4& operator=(XMFLOAT3X4&&) = default;
-
-    constexpr XMFLOAT3X4(float m00, float m01, float m02, float m03, float m10,
-                         float m11, float m12, float m13, float m20, float m21,
-                         float m22, float m23) noexcept
-        : _11(m00),
-          _12(m01),
-          _13(m02),
-          _14(m03),
-          _21(m10),
-          _22(m11),
-          _23(m12),
-          _24(m13),
-          _31(m20),
-          _32(m21),
-          _33(m22),
-          _34(m23) {}
-    explicit XMFLOAT3X4(_In_reads_(12) const float* pArray) noexcept;
-
-    float operator()(size_t Row, size_t Column) const noexcept {
-        return m[Row][Column];
-    }
-    float& operator()(size_t Row, size_t Column) noexcept {
-        return m[Row][Column];
-    }
-
-#if (__cplusplus >= 202002L)
-    bool operator==(const XMFLOAT3X4&) const = delete;
-    auto operator<=>(const XMFLOAT3X4&) const = delete;
-#endif
-};
-
-// 3x4 Column-major Matrix: 32 bit floating point components aligned on a 16
-// byte boundary
-XM_ALIGNED_STRUCT(16) XMFLOAT3X4A : public XMFLOAT3X4 {
-    using XMFLOAT3X4::XMFLOAT3X4;
-};
-
-//------------------------------------------------------------------------------
-// 4x4 Matrix: 32 bit floating point components
-struct XMFLOAT4X4 {
-    union {
-        struct {
-            float _11, _12, _13, _14;
-            float _21, _22, _23, _24;
-            float _31, _32, _33, _34;
-            float _41, _42, _43, _44;
-        };
-        float m[4][4];
-    };
-
-    XMFLOAT4X4() = default;
-
-    XMFLOAT4X4(const XMFLOAT4X4&) = default;
-    XMFLOAT4X4& operator=(const XMFLOAT4X4&) = default;
-
-    XMFLOAT4X4(XMFLOAT4X4&&) = default;
-    XMFLOAT4X4& operator=(XMFLOAT4X4&&) = default;
-
-    constexpr XMFLOAT4X4(float m00, float m01, float m02, float m03, float m10,
-                         float m11, float m12, float m13, float m20, float m21,
-                         float m22, float m23, float m30, float m31, float m32,
-                         float m33) noexcept
-        : _11(m00),
-          _12(m01),
-          _13(m02),
-          _14(m03),
-          _21(m10),
-          _22(m11),
-          _23(m12),
-          _24(m13),
-          _31(m20),
-          _32(m21),
-          _33(m22),
-          _34(m23),
-          _41(m30),
-          _42(m31),
-          _43(m32),
-          _44(m33) {}
-    explicit XMFLOAT4X4(_In_reads_(16) const float* pArray) noexcept;
-
-    float operator()(size_t Row, size_t Column) const noexcept {
-        return m[Row][Column];
-    }
-    float& operator()(size_t Row, size_t Column) noexcept {
-        return m[Row][Column];
-    }
-
-#if (__cplusplus >= 202002L)
-    bool operator==(const XMFLOAT4X4&) const = delete;
-    auto operator<=>(const XMFLOAT4X4&) const = delete;
-#endif
-};
-
-// 4x4 Matrix: 32 bit floating point components aligned on a 16 byte boundary
-XM_ALIGNED_STRUCT(16) XMFLOAT4X4A : public XMFLOAT4X4 {
-    using XMFLOAT4X4::XMFLOAT4X4;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-/****************************************************************************
- *
- * Data conversion operations
- *
- ****************************************************************************/
-
-XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat(FXMVECTOR VInt,
-                                               uint32_t DivExponent) noexcept;
-XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt(FXMVECTOR VFloat,
-                                               uint32_t MulExponent) noexcept;
-XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat(FXMVECTOR VUInt,
-                                                uint32_t DivExponent) noexcept;
-XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt(FXMVECTOR VFloat,
-                                                uint32_t MulExponent) noexcept;
-
-#if defined(__XNAMATH_H__) && defined(XMVectorSetBinaryConstant)
-#undef XMVectorSetBinaryConstant
-#undef XMVectorSplatConstant
-#undef XMVectorSplatConstantInt
-#endif
-
-XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1,
-                                               uint32_t C2,
-                                               uint32_t C3) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant,
-                                           uint32_t DivExponent) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant) noexcept;
-
-/****************************************************************************
- *
- * Load operations
- *
- ****************************************************************************/
-
-XMVECTOR XM_CALLCONV XMLoadInt(_In_ const uint32_t* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadFloat(_In_ const float* pSource) noexcept;
-
-XMVECTOR XM_CALLCONV XMLoadInt2(_In_reads_(2) const uint32_t* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadInt2A(_In_reads_(2)
-                                     const uint32_t* PSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadFloat2(_In_ const XMFLOAT2* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadFloat2A(_In_ const XMFLOAT2A* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadSInt2(_In_ const XMINT2* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUInt2(_In_ const XMUINT2* pSource) noexcept;
-
-XMVECTOR XM_CALLCONV XMLoadInt3(_In_reads_(3) const uint32_t* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadInt3A(_In_reads_(3)
-                                     const uint32_t* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadFloat3(_In_ const XMFLOAT3* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadFloat3A(_In_ const XMFLOAT3A* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadSInt3(_In_ const XMINT3* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUInt3(_In_ const XMUINT3* pSource) noexcept;
-
-XMVECTOR XM_CALLCONV XMLoadInt4(_In_reads_(4) const uint32_t* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadInt4A(_In_reads_(4)
-                                     const uint32_t* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadFloat4(_In_ const XMFLOAT4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadFloat4A(_In_ const XMFLOAT4A* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadSInt4(_In_ const XMINT4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUInt4(_In_ const XMUINT4* pSource) noexcept;
-
-XMMATRIX XM_CALLCONV XMLoadFloat3x3(_In_ const XMFLOAT3X3* pSource) noexcept;
-XMMATRIX XM_CALLCONV XMLoadFloat4x3(_In_ const XMFLOAT4X3* pSource) noexcept;
-XMMATRIX XM_CALLCONV XMLoadFloat4x3A(_In_ const XMFLOAT4X3A* pSource) noexcept;
-XMMATRIX XM_CALLCONV XMLoadFloat3x4(_In_ const XMFLOAT3X4* pSource) noexcept;
-XMMATRIX XM_CALLCONV XMLoadFloat3x4A(_In_ const XMFLOAT3X4A* pSource) noexcept;
-XMMATRIX XM_CALLCONV XMLoadFloat4x4(_In_ const XMFLOAT4X4* pSource) noexcept;
-XMMATRIX XM_CALLCONV XMLoadFloat4x4A(_In_ const XMFLOAT4X4A* pSource) noexcept;
-
-/****************************************************************************
- *
- * Store operations
- *
- ****************************************************************************/
-
-void XM_CALLCONV XMStoreInt(_Out_ uint32_t* pDestination,
-                            _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreFloat(_Out_ float* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-
-void XM_CALLCONV XMStoreInt2(_Out_writes_(2) uint32_t* pDestination,
-                             _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreInt2A(_Out_writes_(2) uint32_t* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreFloat2(_Out_ XMFLOAT2* pDestination,
-                               _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreFloat2A(_Out_ XMFLOAT2A* pDestination,
-                                _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreSInt2(_Out_ XMINT2* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUInt2(_Out_ XMUINT2* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-
-void XM_CALLCONV XMStoreInt3(_Out_writes_(3) uint32_t* pDestination,
-                             _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreInt3A(_Out_writes_(3) uint32_t* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreFloat3(_Out_ XMFLOAT3* pDestination,
-                               _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreFloat3A(_Out_ XMFLOAT3A* pDestination,
-                                _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreSInt3(_Out_ XMINT3* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUInt3(_Out_ XMUINT3* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-
-void XM_CALLCONV XMStoreInt4(_Out_writes_(4) uint32_t* pDestination,
-                             _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreInt4A(_Out_writes_(4) uint32_t* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreFloat4(_Out_ XMFLOAT4* pDestination,
-                               _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreFloat4A(_Out_ XMFLOAT4A* pDestination,
-                                _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreSInt4(_Out_ XMINT4* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUInt4(_Out_ XMUINT4* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-
-void XM_CALLCONV XMStoreFloat3x3(_Out_ XMFLOAT3X3* pDestination,
-                                 _In_ FXMMATRIX M) noexcept;
-void XM_CALLCONV XMStoreFloat4x3(_Out_ XMFLOAT4X3* pDestination,
-                                 _In_ FXMMATRIX M) noexcept;
-void XM_CALLCONV XMStoreFloat4x3A(_Out_ XMFLOAT4X3A* pDestination,
-                                  _In_ FXMMATRIX M) noexcept;
-void XM_CALLCONV XMStoreFloat3x4(_Out_ XMFLOAT3X4* pDestination,
-                                 _In_ FXMMATRIX M) noexcept;
-void XM_CALLCONV XMStoreFloat3x4A(_Out_ XMFLOAT3X4A* pDestination,
-                                  _In_ FXMMATRIX M) noexcept;
-void XM_CALLCONV XMStoreFloat4x4(_Out_ XMFLOAT4X4* pDestination,
-                                 _In_ FXMMATRIX M) noexcept;
-void XM_CALLCONV XMStoreFloat4x4A(_Out_ XMFLOAT4X4A* pDestination,
-                                  _In_ FXMMATRIX M) noexcept;
-
-/****************************************************************************
- *
- * General vector operations
- *
- ****************************************************************************/
-
-XMVECTOR XM_CALLCONV XMVectorZero() noexcept;
-XMVECTOR XM_CALLCONV XMVectorSet(float x, float y, float z, float w) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetInt(uint32_t x, uint32_t y, uint32_t z,
-                                    uint32_t w) noexcept;
-XMVECTOR XM_CALLCONV XMVectorReplicate(float Value) noexcept;
-XMVECTOR XM_CALLCONV XMVectorReplicatePtr(_In_ const float* pValue) noexcept;
-XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value) noexcept;
-XMVECTOR XM_CALLCONV
-XMVectorReplicateIntPtr(_In_ const uint32_t* pValue) noexcept;
-XMVECTOR XM_CALLCONV XMVectorTrueInt() noexcept;
-XMVECTOR XM_CALLCONV XMVectorFalseInt() noexcept;
-XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSplatOne() noexcept;
-XMVECTOR XM_CALLCONV XMVectorSplatInfinity() noexcept;
-XMVECTOR XM_CALLCONV XMVectorSplatQNaN() noexcept;
-XMVECTOR XM_CALLCONV XMVectorSplatEpsilon() noexcept;
-XMVECTOR XM_CALLCONV XMVectorSplatSignMask() noexcept;
-
-float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) noexcept;
-float XM_CALLCONV XMVectorGetX(FXMVECTOR V) noexcept;
-float XM_CALLCONV XMVectorGetY(FXMVECTOR V) noexcept;
-float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) noexcept;
-float XM_CALLCONV XMVectorGetW(FXMVECTOR V) noexcept;
-
-void XM_CALLCONV XMVectorGetByIndexPtr(_Out_ float* f, _In_ FXMVECTOR V,
-                                       _In_ size_t i) noexcept;
-void XM_CALLCONV XMVectorGetXPtr(_Out_ float* x, _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMVectorGetYPtr(_Out_ float* y, _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMVectorGetZPtr(_Out_ float* z, _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMVectorGetWPtr(_Out_ float* w, _In_ FXMVECTOR V) noexcept;
-
-uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i) noexcept;
-uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V) noexcept;
-uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) noexcept;
-uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) noexcept;
-uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) noexcept;
-
-void XM_CALLCONV XMVectorGetIntByIndexPtr(_Out_ uint32_t* x, _In_ FXMVECTOR V,
-                                          _In_ size_t i) noexcept;
-void XM_CALLCONV XMVectorGetIntXPtr(_Out_ uint32_t* x,
-                                    _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t* y,
-                                    _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t* z,
-                                    _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t* w,
-                                    _In_ FXMVECTOR V) noexcept;
-
-XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f,
-                                        size_t i) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) noexcept;
-
-XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(_In_ FXMVECTOR V,
-                                           _In_ const float* f,
-                                           _In_ size_t i) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetXPtr(_In_ FXMVECTOR V,
-                                     _In_ const float* x) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetYPtr(_In_ FXMVECTOR V,
-                                     _In_ const float* y) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetZPtr(_In_ FXMVECTOR V,
-                                     _In_ const float* z) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetWPtr(_In_ FXMVECTOR V,
-                                     _In_ const float* w) noexcept;
-
-XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x,
-                                           size_t i) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) noexcept;
-
-XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(_In_ FXMVECTOR V,
-                                              _In_ const uint32_t* x,
-                                              _In_ size_t i) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(_In_ FXMVECTOR V,
-                                        _In_ const uint32_t* x) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(_In_ FXMVECTOR V,
-                                        _In_ const uint32_t* y) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(_In_ FXMVECTOR V,
-                                        _In_ const uint32_t* z) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(_In_ FXMVECTOR V,
-                                        _In_ const uint32_t* w) noexcept;
-
-#if defined(__XNAMATH_H__) && defined(XMVectorSwizzle)
-#undef XMVectorSwizzle
-#endif
-
-XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V, uint32_t E0, uint32_t E1,
-                                     uint32_t E2, uint32_t E3) noexcept;
-XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2,
-                                     uint32_t PermuteX, uint32_t PermuteY,
-                                     uint32_t PermuteZ,
-                                     uint32_t PermuteW) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSelectControl(uint32_t VectorIndex0,
-                                           uint32_t VectorIndex1,
-                                           uint32_t VectorIndex2,
-                                           uint32_t VectorIndex3) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2,
-                                    FXMVECTOR Control) noexcept;
-XMVECTOR XM_CALLCONV XMVectorMergeXY(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorMergeZW(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-
-#if defined(__XNAMATH_H__) && defined(XMVectorShiftLeft)
-#undef XMVectorShiftLeft
-#undef XMVectorRotateLeft
-#undef XMVectorRotateRight
-#undef XMVectorInsert
-#endif
-
-XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2,
-                                       uint32_t Elements) noexcept;
-XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V,
-                                        uint32_t Elements) noexcept;
-XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V,
-                                         uint32_t Elements) noexcept;
-XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS,
-                                    uint32_t VSLeftRotateElements,
-                                    uint32_t Select0, uint32_t Select1,
-                                    uint32_t Select2,
-                                    uint32_t Select3) noexcept;
-
-XMVECTOR XM_CALLCONV XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1,
-                                    _In_ FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorEqualIntR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V,
-                                       _In_ FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2,
-                                       FXMVECTOR Epsilon) noexcept;
-XMVECTOR XM_CALLCONV XMVectorNotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorNotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorGreater(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorGreaterR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1,
-                                      _In_ FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual(FXMVECTOR V1,
-                                            FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR(_Out_ uint32_t* pCR,
-                                             _In_ FXMVECTOR V1,
-                                             _In_ FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorLess(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorLessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorInBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept;
-XMVECTOR XM_CALLCONV XMVectorInBoundsR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V,
-                                       _In_ FXMVECTOR Bounds) noexcept;
-
-XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V) noexcept;
-
-XMVECTOR XM_CALLCONV XMVectorMin(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorMax(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorClamp(FXMVECTOR V, FXMVECTOR Min,
-                                   FXMVECTOR Max) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V) noexcept;
-
-XMVECTOR XM_CALLCONV XMVectorAndInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorAndCInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorNorInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-
-XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSubtractAngles(FXMVECTOR V1,
-                                            FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorMultiply(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2,
-                                         FXMVECTOR V3) noexcept;
-XMVECTOR XM_CALLCONV XMVectorDivide(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract(FXMVECTOR V1,
-                                                      FXMVECTOR V2,
-                                                      FXMVECTOR V3) noexcept;
-XMVECTOR XM_CALLCONV XMVectorScale(FXMVECTOR V, float ScaleFactor) noexcept;
-XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorExp10(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorLog10(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorPow(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorMod(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V) noexcept;
-void XM_CALLCONV XMVectorSinCos(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos,
-                                _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMVectorSinCosEst(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos,
-                                   _In_ FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVectorATan2(FXMVECTOR Y, FXMVECTOR X) noexcept;
-XMVECTOR XM_CALLCONV XMVectorATan2Est(FXMVECTOR Y, FXMVECTOR X) noexcept;
-XMVECTOR XM_CALLCONV XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1, float t) noexcept;
-XMVECTOR XM_CALLCONV XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1,
-                                   FXMVECTOR T) noexcept;
-XMVECTOR XM_CALLCONV XMVectorHermite(FXMVECTOR Position0, FXMVECTOR Tangent0,
-                                     FXMVECTOR Position1, GXMVECTOR Tangent1,
-                                     float t) noexcept;
-XMVECTOR XM_CALLCONV XMVectorHermiteV(FXMVECTOR Position0, FXMVECTOR Tangent0,
-                                      FXMVECTOR Position1, GXMVECTOR Tangent1,
-                                      HXMVECTOR T) noexcept;
-XMVECTOR XM_CALLCONV XMVectorCatmullRom(FXMVECTOR Position0,
-                                        FXMVECTOR Position1,
-                                        FXMVECTOR Position2,
-                                        GXMVECTOR Position3, float t) noexcept;
-XMVECTOR XM_CALLCONV XMVectorCatmullRomV(FXMVECTOR Position0,
-                                         FXMVECTOR Position1,
-                                         FXMVECTOR Position2,
-                                         GXMVECTOR Position3,
-                                         HXMVECTOR T) noexcept;
-XMVECTOR XM_CALLCONV XMVectorBaryCentric(FXMVECTOR Position0,
-                                         FXMVECTOR Position1,
-                                         FXMVECTOR Position2, float f,
-                                         float g) noexcept;
-XMVECTOR XM_CALLCONV XMVectorBaryCentricV(FXMVECTOR Position0,
-                                          FXMVECTOR Position1,
-                                          FXMVECTOR Position2, GXMVECTOR F,
-                                          HXMVECTOR G) noexcept;
-
-/****************************************************************************
- *
- * 2D vector operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-uint32_t XM_CALLCONV XMVector2EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-uint32_t XM_CALLCONV XMVector2EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2,
-                                    FXMVECTOR Epsilon) noexcept;
-bool XM_CALLCONV XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector2NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-uint32_t XM_CALLCONV XMVector2GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector2GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-uint32_t XM_CALLCONV XMVector2GreaterOrEqualR(FXMVECTOR V1,
-                                              FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector2Less(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector2LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector2InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept;
-
-bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V) noexcept;
-bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V) noexcept;
-
-XMVECTOR XM_CALLCONV XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVector2Cross(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector2ClampLength(FXMVECTOR V, float LengthMin,
-                                          float LengthMax) noexcept;
-XMVECTOR XM_CALLCONV XMVector2ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin,
-                                           FXMVECTOR LengthMax) noexcept;
-XMVECTOR XM_CALLCONV XMVector2Reflect(FXMVECTOR Incident,
-                                      FXMVECTOR Normal) noexcept;
-XMVECTOR XM_CALLCONV XMVector2Refract(FXMVECTOR Incident, FXMVECTOR Normal,
-                                      float RefractionIndex) noexcept;
-XMVECTOR XM_CALLCONV XMVector2RefractV(FXMVECTOR Incident, FXMVECTOR Normal,
-                                       FXMVECTOR RefractionIndex) noexcept;
-XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst(FXMVECTOR N1,
-                                                     FXMVECTOR N2) noexcept;
-XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals(FXMVECTOR N1,
-                                                  FXMVECTOR N2) noexcept;
-XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors(FXMVECTOR V1,
-                                                  FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVector2LinePointDistance(FXMVECTOR LinePoint1,
-                                                FXMVECTOR LinePoint2,
-                                                FXMVECTOR Point) noexcept;
-XMVECTOR XM_CALLCONV XMVector2IntersectLine(FXMVECTOR Line1Point1,
-                                            FXMVECTOR Line1Point2,
-                                            FXMVECTOR Line2Point1,
-                                            GXMVECTOR Line2Point2) noexcept;
-XMVECTOR XM_CALLCONV XMVector2Transform(FXMVECTOR V, FXMMATRIX M) noexcept;
-XMFLOAT4* XM_CALLCONV XMVector2TransformStream(
-    _Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1))
-        XMFLOAT4* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1))
-        const XMFLOAT2* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount,
-    _In_ FXMMATRIX M) noexcept;
-XMVECTOR XM_CALLCONV XMVector2TransformCoord(FXMVECTOR V, FXMMATRIX M) noexcept;
-XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream(
-    _Out_writes_bytes_(sizeof(XMFLOAT2) + OutputStride * (VectorCount - 1))
-        XMFLOAT2* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1))
-        const XMFLOAT2* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount,
-    _In_ FXMMATRIX M) noexcept;
-XMVECTOR XM_CALLCONV XMVector2TransformNormal(FXMVECTOR V,
-                                              FXMMATRIX M) noexcept;
-XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream(
-    _Out_writes_bytes_(sizeof(XMFLOAT2) + OutputStride * (VectorCount - 1))
-        XMFLOAT2* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT2) + InputStride * (VectorCount - 1))
-        const XMFLOAT2* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount,
-    _In_ FXMMATRIX M) noexcept;
-
-/****************************************************************************
- *
- * 3D vector operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-uint32_t XM_CALLCONV XMVector3EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-uint32_t XM_CALLCONV XMVector3EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2,
-                                    FXMVECTOR Epsilon) noexcept;
-bool XM_CALLCONV XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector3NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-uint32_t XM_CALLCONV XMVector3GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector3GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-uint32_t XM_CALLCONV XMVector3GreaterOrEqualR(FXMVECTOR V1,
-                                              FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector3Less(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector3LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector3InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept;
-
-bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V) noexcept;
-bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V) noexcept;
-
-XMVECTOR XM_CALLCONV XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVector3Cross(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector3ClampLength(FXMVECTOR V, float LengthMin,
-                                          float LengthMax) noexcept;
-XMVECTOR XM_CALLCONV XMVector3ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin,
-                                           FXMVECTOR LengthMax) noexcept;
-XMVECTOR XM_CALLCONV XMVector3Reflect(FXMVECTOR Incident,
-                                      FXMVECTOR Normal) noexcept;
-XMVECTOR XM_CALLCONV XMVector3Refract(FXMVECTOR Incident, FXMVECTOR Normal,
-                                      float RefractionIndex) noexcept;
-XMVECTOR XM_CALLCONV XMVector3RefractV(FXMVECTOR Incident, FXMVECTOR Normal,
-                                       FXMVECTOR RefractionIndex) noexcept;
-XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst(FXMVECTOR N1,
-                                                     FXMVECTOR N2) noexcept;
-XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals(FXMVECTOR N1,
-                                                  FXMVECTOR N2) noexcept;
-XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors(FXMVECTOR V1,
-                                                  FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVector3LinePointDistance(FXMVECTOR LinePoint1,
-                                                FXMVECTOR LinePoint2,
-                                                FXMVECTOR Point) noexcept;
-void XM_CALLCONV XMVector3ComponentsFromNormal(_Out_ XMVECTOR* pParallel,
-                                               _Out_ XMVECTOR* pPerpendicular,
-                                               _In_ FXMVECTOR V,
-                                               _In_ FXMVECTOR Normal) noexcept;
-XMVECTOR XM_CALLCONV XMVector3Rotate(FXMVECTOR V,
-                                     FXMVECTOR RotationQuaternion) noexcept;
-XMVECTOR XM_CALLCONV
-XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion) noexcept;
-XMVECTOR XM_CALLCONV XMVector3Transform(FXMVECTOR V, FXMMATRIX M) noexcept;
-XMFLOAT4* XM_CALLCONV XMVector3TransformStream(
-    _Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1))
-        XMFLOAT4* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1))
-        const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount,
-    _In_ FXMMATRIX M) noexcept;
-XMVECTOR XM_CALLCONV XMVector3TransformCoord(FXMVECTOR V, FXMMATRIX M) noexcept;
-XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream(
-    _Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1))
-        XMFLOAT3* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1))
-        const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount,
-    _In_ FXMMATRIX M) noexcept;
-XMVECTOR XM_CALLCONV XMVector3TransformNormal(FXMVECTOR V,
-                                              FXMMATRIX M) noexcept;
-XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream(
-    _Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1))
-        XMFLOAT3* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1))
-        const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount,
-    _In_ FXMMATRIX M) noexcept;
-XMVECTOR XM_CALLCONV XMVector3Project(FXMVECTOR V, float ViewportX,
-                                      float ViewportY, float ViewportWidth,
-                                      float ViewportHeight, float ViewportMinZ,
-                                      float ViewportMaxZ, FXMMATRIX Projection,
-                                      CXMMATRIX View, CXMMATRIX World) noexcept;
-XMFLOAT3* XM_CALLCONV XMVector3ProjectStream(
-    _Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1))
-        XMFLOAT3* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1))
-        const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ float ViewportX,
-    _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight,
-    _In_ float ViewportMinZ, _In_ float ViewportMaxZ, _In_ FXMMATRIX Projection,
-    _In_ CXMMATRIX View, _In_ CXMMATRIX World) noexcept;
-XMVECTOR XM_CALLCONV XMVector3Unproject(FXMVECTOR V, float ViewportX,
-                                        float ViewportY, float ViewportWidth,
-                                        float ViewportHeight,
-                                        float ViewportMinZ, float ViewportMaxZ,
-                                        FXMMATRIX Projection, CXMMATRIX View,
-                                        CXMMATRIX World) noexcept;
-XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream(
-    _Out_writes_bytes_(sizeof(XMFLOAT3) + OutputStride * (VectorCount - 1))
-        XMFLOAT3* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT3) + InputStride * (VectorCount - 1))
-        const XMFLOAT3* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount, _In_ float ViewportX,
-    _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight,
-    _In_ float ViewportMinZ, _In_ float ViewportMaxZ, _In_ FXMMATRIX Projection,
-    _In_ CXMMATRIX View, _In_ CXMMATRIX World) noexcept;
-
-/****************************************************************************
- *
- * 4D vector operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-uint32_t XM_CALLCONV XMVector4EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-uint32_t XM_CALLCONV XMVector4EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2,
-                                    FXMVECTOR Epsilon) noexcept;
-bool XM_CALLCONV XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector4NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-uint32_t XM_CALLCONV XMVector4GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector4GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-uint32_t XM_CALLCONV XMVector4GreaterOrEqualR(FXMVECTOR V1,
-                                              FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector4Less(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector4LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-bool XM_CALLCONV XMVector4InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept;
-
-bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V) noexcept;
-bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V) noexcept;
-
-XMVECTOR XM_CALLCONV XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2,
-                                    FXMVECTOR V3) noexcept;
-XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector4ClampLength(FXMVECTOR V, float LengthMin,
-                                          float LengthMax) noexcept;
-XMVECTOR XM_CALLCONV XMVector4ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin,
-                                           FXMVECTOR LengthMax) noexcept;
-XMVECTOR XM_CALLCONV XMVector4Reflect(FXMVECTOR Incident,
-                                      FXMVECTOR Normal) noexcept;
-XMVECTOR XM_CALLCONV XMVector4Refract(FXMVECTOR Incident, FXMVECTOR Normal,
-                                      float RefractionIndex) noexcept;
-XMVECTOR XM_CALLCONV XMVector4RefractV(FXMVECTOR Incident, FXMVECTOR Normal,
-                                       FXMVECTOR RefractionIndex) noexcept;
-XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst(FXMVECTOR N1,
-                                                     FXMVECTOR N2) noexcept;
-XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals(FXMVECTOR N1,
-                                                  FXMVECTOR N2) noexcept;
-XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors(FXMVECTOR V1,
-                                                  FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMVector4Transform(FXMVECTOR V, FXMMATRIX M) noexcept;
-XMFLOAT4* XM_CALLCONV XMVector4TransformStream(
-    _Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (VectorCount - 1))
-        XMFLOAT4* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT4) + InputStride * (VectorCount - 1))
-        const XMFLOAT4* pInputStream,
-    _In_ size_t InputStride, _In_ size_t VectorCount,
-    _In_ FXMMATRIX M) noexcept;
-
-/****************************************************************************
- *
- * Matrix operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept;
-bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept;
-bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept;
-
-XMMATRIX XM_CALLCONV XMMatrixMultiply(FXMMATRIX M1, CXMMATRIX M2) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose(FXMMATRIX M1,
-                                               CXMMATRIX M2) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixInverse(_Out_opt_ XMVECTOR* pDeterminant,
-                                     _In_ FXMMATRIX M) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixVectorTensorProduct(FXMVECTOR V1,
-                                                 FXMVECTOR V2) noexcept;
-XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M) noexcept;
-
-_Success_(return) bool XM_CALLCONV
-    XMMatrixDecompose(_Out_ XMVECTOR* outScale, _Out_ XMVECTOR* outRotQuat,
-                      _Out_ XMVECTOR* outTrans, _In_ FXMMATRIX M) noexcept;
-
-XMMATRIX XM_CALLCONV XMMatrixIdentity() noexcept;
-XMMATRIX XM_CALLCONV XMMatrixSet(float m00, float m01, float m02, float m03,
-                                 float m10, float m11, float m12, float m13,
-                                 float m20, float m21, float m22, float m23,
-                                 float m30, float m31, float m32,
-                                 float m33) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixTranslation(float OffsetX, float OffsetY,
-                                         float OffsetZ) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixScaling(float ScaleX, float ScaleY,
-                                     float ScaleZ) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept;
-
-// Rotates about y-axis (Yaw), then x-axis (Pitch), then z-axis (Roll)
-XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw(float Pitch, float Yaw,
-                                                  float Roll) noexcept;
-
-// Rotates about y-axis (Angles.y), then x-axis (Angles.x), then z-axis
-// (Angles.z)
-XMMATRIX XM_CALLCONV
-XMMatrixRotationRollPitchYawFromVector(FXMVECTOR Angles) noexcept;
-
-XMMATRIX XM_CALLCONV XMMatrixRotationNormal(FXMVECTOR NormalAxis,
-                                            float Angle) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixRotationAxis(FXMVECTOR Axis, float Angle) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixTransformation2D(
-    FXMVECTOR ScalingOrigin, float ScalingOrientation, FXMVECTOR Scaling,
-    FXMVECTOR RotationOrigin, float Rotation, GXMVECTOR Translation) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixTransformation(
-    FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion,
-    FXMVECTOR Scaling, GXMVECTOR RotationOrigin, HXMVECTOR RotationQuaternion,
-    HXMVECTOR Translation) noexcept;
-XMMATRIX XM_CALLCONV
-XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR RotationOrigin,
-                               float Rotation, FXMVECTOR Translation) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixAffineTransformation(
-    FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion,
-    GXMVECTOR Translation) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixShadow(FXMVECTOR ShadowPlane,
-                                    FXMVECTOR LightPosition) noexcept;
-
-XMMATRIX XM_CALLCONV XMMatrixLookAtLH(FXMVECTOR EyePosition,
-                                      FXMVECTOR FocusPosition,
-                                      FXMVECTOR UpDirection) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixLookAtRH(FXMVECTOR EyePosition,
-                                      FXMVECTOR FocusPosition,
-                                      FXMVECTOR UpDirection) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixLookToLH(FXMVECTOR EyePosition,
-                                      FXMVECTOR EyeDirection,
-                                      FXMVECTOR UpDirection) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixLookToRH(FXMVECTOR EyePosition,
-                                      FXMVECTOR EyeDirection,
-                                      FXMVECTOR UpDirection) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH(float ViewWidth, float ViewHeight,
-                                           float NearZ, float FarZ) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH(float ViewWidth, float ViewHeight,
-                                           float NearZ, float FarZ) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH(float FovAngleY,
-                                              float AspectRatio, float NearZ,
-                                              float FarZ) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH(float FovAngleY,
-                                              float AspectRatio, float NearZ,
-                                              float FarZ) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH(float ViewLeft,
-                                                    float ViewRight,
-                                                    float ViewBottom,
-                                                    float ViewTop, float NearZ,
-                                                    float FarZ) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH(float ViewLeft,
-                                                    float ViewRight,
-                                                    float ViewBottom,
-                                                    float ViewTop, float NearZ,
-                                                    float FarZ) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixOrthographicLH(float ViewWidth, float ViewHeight,
-                                            float NearZ, float FarZ) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixOrthographicRH(float ViewWidth, float ViewHeight,
-                                            float NearZ, float FarZ) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH(float ViewLeft,
-                                                     float ViewRight,
-                                                     float ViewBottom,
-                                                     float ViewTop, float NearZ,
-                                                     float FarZ) noexcept;
-XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH(float ViewLeft,
-                                                     float ViewRight,
-                                                     float ViewBottom,
-                                                     float ViewTop, float NearZ,
-                                                     float FarZ) noexcept;
-
-/****************************************************************************
- *
- * Quaternion operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2) noexcept;
-bool XM_CALLCONV XMQuaternionNotEqual(FXMVECTOR Q1, FXMVECTOR Q2) noexcept;
-
-bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q) noexcept;
-bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q) noexcept;
-bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q) noexcept;
-
-XMVECTOR XM_CALLCONV XMQuaternionDot(FXMVECTOR Q1, FXMVECTOR Q2) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionMultiply(FXMVECTOR Q1, FXMVECTOR Q2) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1,
-                                       float t) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1,
-                                        FXMVECTOR T) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2,
-                                       GXMVECTOR Q3, float t) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1,
-                                        FXMVECTOR Q2, GXMVECTOR Q3,
-                                        HXMVECTOR T) noexcept;
-void XM_CALLCONV XMQuaternionSquadSetup(_Out_ XMVECTOR* pA, _Out_ XMVECTOR* pB,
-                                        _Out_ XMVECTOR* pC, _In_ FXMVECTOR Q0,
-                                        _In_ FXMVECTOR Q1, _In_ FXMVECTOR Q2,
-                                        _In_ GXMVECTOR Q3) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1,
-                                             FXMVECTOR Q2, float f,
-                                             float g) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1,
-                                              FXMVECTOR Q2, GXMVECTOR F,
-                                              HXMVECTOR G) noexcept;
-
-XMVECTOR XM_CALLCONV XMQuaternionIdentity() noexcept;
-
-// Rotates about y-axis (Yaw), then x-axis (Pitch), then z-axis (Roll)
-XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw(float Pitch, float Yaw,
-                                                      float Roll) noexcept;
-
-// Rotates about y-axis (Angles.y), then x-axis (Angles.x), then z-axis
-// (Angles.z)
-XMVECTOR XM_CALLCONV
-XMQuaternionRotationRollPitchYawFromVector(FXMVECTOR Angles) noexcept;
-
-XMVECTOR XM_CALLCONV XMQuaternionRotationNormal(FXMVECTOR NormalAxis,
-                                                float Angle) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionRotationAxis(FXMVECTOR Axis,
-                                              float Angle) noexcept;
-XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept;
-
-void XM_CALLCONV XMQuaternionToAxisAngle(_Out_ XMVECTOR* pAxis,
-                                         _Out_ float* pAngle,
-                                         _In_ FXMVECTOR Q) noexcept;
-
-/****************************************************************************
- *
- * Plane operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2) noexcept;
-bool XM_CALLCONV XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2,
-                                  FXMVECTOR Epsilon) noexcept;
-bool XM_CALLCONV XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2) noexcept;
-
-bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P) noexcept;
-bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P) noexcept;
-
-XMVECTOR XM_CALLCONV XMPlaneDot(FXMVECTOR P, FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMPlaneDotNormal(FXMVECTOR P, FXMVECTOR V) noexcept;
-XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P) noexcept;
-XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P) noexcept;
-XMVECTOR XM_CALLCONV XMPlaneIntersectLine(FXMVECTOR P, FXMVECTOR LinePoint1,
-                                          FXMVECTOR LinePoint2) noexcept;
-void XM_CALLCONV XMPlaneIntersectPlane(_Out_ XMVECTOR* pLinePoint1,
-                                       _Out_ XMVECTOR* pLinePoint2,
-                                       _In_ FXMVECTOR P1,
-                                       _In_ FXMVECTOR P2) noexcept;
-
-// Transforms a plane given an inverse transpose matrix
-XMVECTOR XM_CALLCONV XMPlaneTransform(FXMVECTOR P, FXMMATRIX ITM) noexcept;
-
-// Transforms an array of planes given an inverse transpose matrix
-XMFLOAT4* XM_CALLCONV XMPlaneTransformStream(
-    _Out_writes_bytes_(sizeof(XMFLOAT4) + OutputStride * (PlaneCount - 1))
-        XMFLOAT4* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(XMFLOAT4) + InputStride * (PlaneCount - 1))
-        const XMFLOAT4* pInputStream,
-    _In_ size_t InputStride, _In_ size_t PlaneCount,
-    _In_ FXMMATRIX ITM) noexcept;
-
-XMVECTOR XM_CALLCONV XMPlaneFromPointNormal(FXMVECTOR Point,
-                                            FXMVECTOR Normal) noexcept;
-XMVECTOR XM_CALLCONV XMPlaneFromPoints(FXMVECTOR Point1, FXMVECTOR Point2,
-                                       FXMVECTOR Point3) noexcept;
-
-/****************************************************************************
- *
- * Color operations
- *
- ****************************************************************************/
-
-bool XM_CALLCONV XMColorEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept;
-bool XM_CALLCONV XMColorNotEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept;
-bool XM_CALLCONV XMColorGreater(FXMVECTOR C1, FXMVECTOR C2) noexcept;
-bool XM_CALLCONV XMColorGreaterOrEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept;
-bool XM_CALLCONV XMColorLess(FXMVECTOR C1, FXMVECTOR C2) noexcept;
-bool XM_CALLCONV XMColorLessOrEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept;
-
-bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C) noexcept;
-bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C) noexcept;
-
-XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR C) noexcept;
-XMVECTOR XM_CALLCONV XMColorModulate(FXMVECTOR C1, FXMVECTOR C2) noexcept;
-XMVECTOR XM_CALLCONV XMColorAdjustSaturation(FXMVECTOR C,
-                                             float Saturation) noexcept;
-XMVECTOR XM_CALLCONV XMColorAdjustContrast(FXMVECTOR C,
-                                           float Contrast) noexcept;
-
-XMVECTOR XM_CALLCONV XMColorRGBToHSL(FXMVECTOR rgb) noexcept;
-XMVECTOR XM_CALLCONV XMColorHSLToRGB(FXMVECTOR hsl) noexcept;
-
-XMVECTOR XM_CALLCONV XMColorRGBToHSV(FXMVECTOR rgb) noexcept;
-XMVECTOR XM_CALLCONV XMColorHSVToRGB(FXMVECTOR hsv) noexcept;
-
-XMVECTOR XM_CALLCONV XMColorRGBToYUV(FXMVECTOR rgb) noexcept;
-XMVECTOR XM_CALLCONV XMColorYUVToRGB(FXMVECTOR yuv) noexcept;
-
-XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD(FXMVECTOR rgb) noexcept;
-XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD(FXMVECTOR yuv) noexcept;
-
-XMVECTOR XM_CALLCONV XMColorRGBToYUV_UHD(FXMVECTOR rgb) noexcept;
-XMVECTOR XM_CALLCONV XMColorYUVToRGB_UHD(FXMVECTOR yuv) noexcept;
-
-XMVECTOR XM_CALLCONV XMColorRGBToXYZ(FXMVECTOR rgb) noexcept;
-XMVECTOR XM_CALLCONV XMColorXYZToRGB(FXMVECTOR xyz) noexcept;
-
-XMVECTOR XM_CALLCONV XMColorXYZToSRGB(FXMVECTOR xyz) noexcept;
-XMVECTOR XM_CALLCONV XMColorSRGBToXYZ(FXMVECTOR srgb) noexcept;
-
-XMVECTOR XM_CALLCONV XMColorRGBToSRGB(FXMVECTOR rgb) noexcept;
-XMVECTOR XM_CALLCONV XMColorSRGBToRGB(FXMVECTOR srgb) noexcept;
-
-/****************************************************************************
- *
- * Miscellaneous operations
- *
- ****************************************************************************/
-
-bool XMVerifyCPUSupport() noexcept;
-
-XMVECTOR XM_CALLCONV XMFresnelTerm(FXMVECTOR CosIncidentAngle,
-                                   FXMVECTOR RefractionIndex) noexcept;
-
-bool XMScalarNearEqual(float S1, float S2, float Epsilon) noexcept;
-float XMScalarModAngle(float Value) noexcept;
-
-float XMScalarSin(float Value) noexcept;
-float XMScalarSinEst(float Value) noexcept;
-
-float XMScalarCos(float Value) noexcept;
-float XMScalarCosEst(float Value) noexcept;
-
-void XMScalarSinCos(_Out_ float* pSin, _Out_ float* pCos, float Value) noexcept;
-void XMScalarSinCosEst(_Out_ float* pSin, _Out_ float* pCos,
-                       float Value) noexcept;
-
-float XMScalarASin(float Value) noexcept;
-float XMScalarASinEst(float Value) noexcept;
-
-float XMScalarACos(float Value) noexcept;
-float XMScalarACosEst(float Value) noexcept;
-
-/****************************************************************************
- *
- * Templates
- *
- ****************************************************************************/
-
-#if defined(__XNAMATH_H__) && defined(XMMin)
-#undef XMMin
-#undef XMMax
-#endif
-
-template <class T>
-inline T XMMin(T a, T b) noexcept {
-    return (a < b) ? a : b;
-}
-template <class T>
-inline T XMMax(T a, T b) noexcept {
-    return (a > b) ? a : b;
-}
-
-//------------------------------------------------------------------------------
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-
-// PermuteHelper internal template (SSE only)
-namespace MathInternal {
-// Slow path fallback for permutes that do not map to a single SSE shuffle
-// opcode.
-template <uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW>
-struct PermuteHelper {
-    static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept {
-        static const XMVECTORU32 selectMask = {{{
-            WhichX ? 0xFFFFFFFF : 0,
-            WhichY ? 0xFFFFFFFF : 0,
-            WhichZ ? 0xFFFFFFFF : 0,
-            WhichW ? 0xFFFFFFFF : 0,
-        }}};
-
-        XMVECTOR shuffled1 = XM_PERMUTE_PS(v1, Shuffle);
-        XMVECTOR shuffled2 = XM_PERMUTE_PS(v2, Shuffle);
-
-        XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
-        XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
-
-        return _mm_or_ps(masked1, masked2);
-    }
-};
-
-// Fast path for permutes that only read from the first vector.
-template <uint32_t Shuffle>
-struct PermuteHelper<Shuffle, false, false, false, false> {
-    static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR) noexcept {
-        return XM_PERMUTE_PS(v1, Shuffle);
-    }
-};
-
-// Fast path for permutes that only read from the second vector.
-template <uint32_t Shuffle>
-struct PermuteHelper<Shuffle, true, true, true, true> {
-    static XMVECTOR XM_CALLCONV Permute(FXMVECTOR, FXMVECTOR v2) noexcept {
-        return XM_PERMUTE_PS(v2, Shuffle);
-    }
-};
-
-// Fast path for permutes that read XY from the first vector, ZW from the
-// second.
-template <uint32_t Shuffle>
-struct PermuteHelper<Shuffle, false, false, true, true> {
-    static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept {
-        return _mm_shuffle_ps(v1, v2, Shuffle);
-    }
-};
-
-// Fast path for permutes that read XY from the second vector, ZW from the
-// first.
-template <uint32_t Shuffle>
-struct PermuteHelper<Shuffle, true, true, false, false> {
-    static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept {
-        return _mm_shuffle_ps(v2, v1, Shuffle);
-    }
-};
-}  // namespace MathInternal
-
-#endif  // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
-// General permute template
-template <uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ,
-          uint32_t PermuteW>
-inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1,
-                                            FXMVECTOR V2) noexcept {
-    static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
-    static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
-    static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
-    static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    constexpr uint32_t Shuffle =
-        _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
-
-    constexpr bool WhichX = PermuteX > 3;
-    constexpr bool WhichY = PermuteY > 3;
-    constexpr bool WhichZ = PermuteZ > 3;
-    constexpr bool WhichW = PermuteW > 3;
-
-    return MathInternal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ,
-                                       WhichW>::Permute(V1, V2);
-#else
-
-    return XMVectorPermute(V1, V2, PermuteX, PermuteY, PermuteZ, PermuteW);
-
-#endif
-}
-
-// Special-case permute templates
-template <>
-constexpr XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 2, 3>(FXMVECTOR V1,
-                                                           FXMVECTOR) noexcept {
-    return V1;
-}
-template <>
-constexpr XMVECTOR XM_CALLCONV
-XMVectorPermute<4, 5, 6, 7>(FXMVECTOR, FXMVECTOR V2) noexcept {
-    return V2;
-}
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 4, 5>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_movelh_ps(V1, V2);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<6, 7, 2, 3>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_movehl_ps(V1, V2);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 1, 5>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_unpacklo_ps(V1, V2);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 6, 3, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_unpackhi_ps(V1, V2);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 6, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(V1), _mm_castps_pd(V2)));
-}
-#endif
-
-#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 2, 3>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_blend_ps(V1, V2, 0x1);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 2, 3>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_blend_ps(V1, V2, 0x2);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 2, 3>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_blend_ps(V1, V2, 0x3);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 3>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_blend_ps(V1, V2, 0x4);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 6, 3>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_blend_ps(V1, V2, 0x5);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 6, 3>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_blend_ps(V1, V2, 0x6);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 6, 3>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_blend_ps(V1, V2, 0x7);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 2, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_blend_ps(V1, V2, 0x8);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 2, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_blend_ps(V1, V2, 0x9);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 2, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_blend_ps(V1, V2, 0xA);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 2, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_blend_ps(V1, V2, 0xB);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_blend_ps(V1, V2, 0xC);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 6, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_blend_ps(V1, V2, 0xD);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 6, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return _mm_blend_ps(V1, V2, 0xE);
-}
-#endif
-
-#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-
-// If the indices are all in the range 0-3 or 4-7, then use XMVectorSwizzle
-// instead The mirror cases are not spelled out here as the programmer can
-// always swap the arguments (i.e. prefer permutes where the X element comes
-// from the V1 vector instead of the V2 vector)
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 4, 5>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vget_low_f32(V1), vget_low_f32(V2));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 4, 5>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vget_low_f32(V2));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 5, 4>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vget_low_f32(V1), vrev64_f32(vget_low_f32(V2)));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 5, 4>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vrev64_f32(vget_low_f32(V1)),
-                        vrev64_f32(vget_low_f32(V2)));
-}
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 6, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vget_high_f32(V1), vget_high_f32(V2));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 6, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vget_high_f32(V2));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 7, 6>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vget_high_f32(V1), vrev64_f32(vget_high_f32(V2)));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 7, 6>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vrev64_f32(vget_high_f32(V1)),
-                        vrev64_f32(vget_high_f32(V2)));
-}
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vget_low_f32(V1), vget_high_f32(V2));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 6, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vrev64_f32(vget_low_f32(V1)), vget_high_f32(V2));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 7, 6>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vget_low_f32(V1), vrev64_f32(vget_high_f32(V2)));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 7, 6>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vrev64_f32(vget_low_f32(V1)),
-                        vrev64_f32(vget_high_f32(V2)));
-}
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 4, 5>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vrev64_f32(vget_high_f32(V1)), vget_low_f32(V2));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 5, 4>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vget_high_f32(V1), vrev64_f32(vget_low_f32(V2)));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 5, 4>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vcombine_f32(vrev64_f32(vget_high_f32(V1)),
-                        vrev64_f32(vget_low_f32(V2)));
-}
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 2, 6>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vtrnq_f32(V1, V2).val[0];
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 5, 3, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vtrnq_f32(V1, V2).val[1];
-}
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 1, 5>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vzipq_f32(V1, V2).val[0];
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 6, 3, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vzipq_f32(V1, V2).val[1];
-}
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 2, 4, 6>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vuzpq_f32(V1, V2).val[0];
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 3, 5, 7>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vuzpq_f32(V1, V2).val[1];
-}
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 2, 3, 4>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vextq_f32(V1, V2, 1);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 4, 5>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vextq_f32(V1, V2, 2);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 4, 5, 6>(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    return vextq_f32(V1, V2, 3);
-}
-
-#endif  // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
-//------------------------------------------------------------------------------
-
-// General swizzle template
-template <uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ,
-          uint32_t SwizzleW>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) noexcept {
-    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
-    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
-    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
-    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    return XM_PERMUTE_PS(V,
-                         _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
-#else
-
-    return XMVectorSwizzle(V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
-
-#endif
-}
-
-// Specialized swizzles
-template <>
-constexpr XMVECTOR XM_CALLCONV
-XMVectorSwizzle<0, 1, 2, 3>(FXMVECTOR V) noexcept {
-    return V;
-}
-
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 0, 1>(FXMVECTOR V) noexcept {
-    return _mm_movelh_ps(V, V);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 2, 3>(FXMVECTOR V) noexcept {
-    return _mm_movehl_ps(V, V);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 1, 1>(FXMVECTOR V) noexcept {
-    return _mm_unpacklo_ps(V, V);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 3, 3>(FXMVECTOR V) noexcept {
-    return _mm_unpackhi_ps(V, V);
-}
-#endif
-
-#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 2, 2>(FXMVECTOR V) noexcept {
-    return _mm_moveldup_ps(V);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 3, 3>(FXMVECTOR V) noexcept {
-    return _mm_movehdup_ps(V);
-}
-#endif
-
-#if defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) && \
-    defined(_XM_FAVOR_INTEL_)
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 0, 0>(FXMVECTOR V) noexcept {
-    return _mm_broadcastss_ps(V);
-}
-#endif
-
-#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 0, 0>(FXMVECTOR V) noexcept {
-    return vdupq_lane_f32(vget_low_f32(V), 0);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 1, 1>(FXMVECTOR V) noexcept {
-    return vdupq_lane_f32(vget_low_f32(V), 1);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 2, 2>(FXMVECTOR V) noexcept {
-    return vdupq_lane_f32(vget_high_f32(V), 0);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 3, 3, 3>(FXMVECTOR V) noexcept {
-    return vdupq_lane_f32(vget_high_f32(V), 1);
-}
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 3, 2>(FXMVECTOR V) noexcept {
-    return vrev64q_f32(V);
-}
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 0, 1>(FXMVECTOR V) noexcept {
-    float32x2_t vt = vget_low_f32(V);
-    return vcombine_f32(vt, vt);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 2, 3>(FXMVECTOR V) noexcept {
-    float32x2_t vt = vget_high_f32(V);
-    return vcombine_f32(vt, vt);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 1, 0>(FXMVECTOR V) noexcept {
-    float32x2_t vt = vrev64_f32(vget_low_f32(V));
-    return vcombine_f32(vt, vt);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 3, 2>(FXMVECTOR V) noexcept {
-    float32x2_t vt = vrev64_f32(vget_high_f32(V));
-    return vcombine_f32(vt, vt);
-}
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 3, 2>(FXMVECTOR V) noexcept {
-    return vcombine_f32(vget_low_f32(V), vrev64_f32(vget_high_f32(V)));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 2, 3>(FXMVECTOR V) noexcept {
-    return vcombine_f32(vrev64_f32(vget_low_f32(V)), vget_high_f32(V));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 1, 0>(FXMVECTOR V) noexcept {
-    return vcombine_f32(vget_high_f32(V), vrev64_f32(vget_low_f32(V)));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 0, 1>(FXMVECTOR V) noexcept {
-    return vcombine_f32(vrev64_f32(vget_high_f32(V)), vget_low_f32(V));
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 1, 0>(FXMVECTOR V) noexcept {
-    return vcombine_f32(vrev64_f32(vget_high_f32(V)),
-                        vrev64_f32(vget_low_f32(V)));
-}
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 2, 2>(FXMVECTOR V) noexcept {
-    return vtrnq_f32(V, V).val[0];
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 3, 3>(FXMVECTOR V) noexcept {
-    return vtrnq_f32(V, V).val[1];
-}
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 1, 1>(FXMVECTOR V) noexcept {
-    return vzipq_f32(V, V).val[0];
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 3, 3>(FXMVECTOR V) noexcept {
-    return vzipq_f32(V, V).val[1];
-}
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 2, 0, 2>(FXMVECTOR V) noexcept {
-    return vuzpq_f32(V, V).val[0];
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 3, 1, 3>(FXMVECTOR V) noexcept {
-    return vuzpq_f32(V, V).val[1];
-}
-
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 2, 3, 0>(FXMVECTOR V) noexcept {
-    return vextq_f32(V, V, 1);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 0, 1>(FXMVECTOR V) noexcept {
-    return vextq_f32(V, V, 2);
-}
-template <>
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 0, 1, 2>(FXMVECTOR V) noexcept {
-    return vextq_f32(V, V, 3);
-}
-
-#endif  // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_
-
-//------------------------------------------------------------------------------
-
-template <uint32_t Elements>
-inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1,
-                                              FXMVECTOR V2) noexcept {
-    static_assert(Elements < 4, "Elements template parameter out of range");
-    return XMVectorPermute<Elements, (Elements + 1), (Elements + 2),
-                           (Elements + 3)>(V1, V2);
-}
-
-template <uint32_t Elements>
-inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V) noexcept {
-    static_assert(Elements < 4, "Elements template parameter out of range");
-    return XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3,
-                           (Elements + 3) & 3>(V);
-}
-
-template <uint32_t Elements>
-inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) noexcept {
-    static_assert(Elements < 4, "Elements template parameter out of range");
-    return XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3,
-                           (6 - Elements) & 3, (7 - Elements) & 3>(V);
-}
-
-template <uint32_t VSLeftRotateElements, uint32_t Select0, uint32_t Select1,
-          uint32_t Select2, uint32_t Select3>
-inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD,
-                                           FXMVECTOR VS) noexcept {
-    XMVECTOR Control = XMVectorSelectControl(Select0 & 1, Select1 & 1,
-                                             Select2 & 1, Select3 & 1);
-    return XMVectorSelect(VD, XMVectorRotateLeft<VSLeftRotateElements>(VS),
-                          Control);
-}
-
-/****************************************************************************
- *
- * Globals
- *
- ****************************************************************************/
-
-// The purpose of the following global constants is to prevent redundant
-// reloading of the constants when they are referenced by more than one
-// separate inline math routine called within the same function.  Declaring
-// a constant locally within a routine is sufficient to prevent redundant
-// reloads of that constant when that single routine is called multiple
-// times in a function, but if the constant is used (and declared) in a
-// separate math routine it would be reloaded.
-
-#ifndef XMGLOBALCONST
-#if defined(__GNUC__) && !defined(__MINGW32__)
-#define XMGLOBALCONST extern const __attribute__((weak))
-#else
-#define XMGLOBALCONST extern const __declspec(selectany)
-#endif
-#endif
-
-XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients0 = {
-    {{-0.16666667f, +0.0083333310f, -0.00019840874f, +2.7525562e-06f}}};
-XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients1 = {
-    {{-2.3889859e-08f, -0.16665852f /*Est1*/, +0.0083139502f /*Est2*/,
-      -0.00018524670f /*Est3*/}}};
-XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients0 = {
-    {{-0.5f, +0.041666638f, -0.0013888378f, +2.4760495e-05f}}};
-XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients1 = {
-    {{-2.6051615e-07f, -0.49992746f /*Est1*/, +0.041493919f /*Est2*/,
-      -0.0012712436f /*Est3*/}}};
-XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients0 = {
-    {{1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f}}};
-XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients1 = {
-    {{2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f}}};
-XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients2 = {
-    {{5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f}}};
-XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients0 = {
-    {{+1.5707963050f, -0.2145988016f, +0.0889789874f, -0.0501743046f}}};
-XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients1 = {
-    {{+0.0308918810f, -0.0170881256f, +0.0066700901f, -0.0012624911f}}};
-XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients0 = {
-    {{-0.3333314528f, +0.1999355085f, -0.1420889944f, +0.1065626393f}}};
-XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients1 = {
-    {{-0.0752896400f, +0.0429096138f, -0.0161657367f, +0.0028662257f}}};
-XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients0 = {
-    {{+0.999866f, +0.999866f, +0.999866f, +0.999866f}}};
-XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients1 = {
-    {{-0.3302995f, +0.180141f, -0.085133f, +0.0208351f}}};
-XMGLOBALCONST XMVECTORF32 g_XMTanEstCoefficients = {
-    {{2.484f, -1.954923183e-1f, 2.467401101f, XM_1DIVPI}}};
-XMGLOBALCONST XMVECTORF32 g_XMArcEstCoefficients = {
-    {{+1.5707288f, -0.2121144f, +0.0742610f, -0.0187293f}}};
-XMGLOBALCONST XMVECTORF32 g_XMPiConstants0 = {
-    {{XM_PI, XM_2PI, XM_1DIVPI, XM_1DIV2PI}}};
-XMGLOBALCONST XMVECTORF32 g_XMIdentityR0 = {{{1.0f, 0.0f, 0.0f, 0.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMIdentityR1 = {{{0.0f, 1.0f, 0.0f, 0.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMIdentityR2 = {{{0.0f, 0.0f, 1.0f, 0.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMIdentityR3 = {{{0.0f, 0.0f, 0.0f, 1.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = {{{-1.0f, 0.0f, 0.0f, 0.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = {{{0.0f, -1.0f, 0.0f, 0.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = {{{0.0f, 0.0f, -1.0f, 0.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = {{{0.0f, 0.0f, 0.0f, -1.0f}}};
-XMGLOBALCONST XMVECTORU32 g_XMNegativeZero = {
-    {{0x80000000, 0x80000000, 0x80000000, 0x80000000}}};
-XMGLOBALCONST XMVECTORU32 g_XMNegate3 = {
-    {{0x80000000, 0x80000000, 0x80000000, 0x00000000}}};
-XMGLOBALCONST XMVECTORU32 g_XMMaskXY = {
-    {{0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000}}};
-XMGLOBALCONST XMVECTORU32 g_XMMask3 = {
-    {{0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000}}};
-XMGLOBALCONST XMVECTORU32 g_XMMaskX = {
-    {{0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000}}};
-XMGLOBALCONST XMVECTORU32 g_XMMaskY = {
-    {{0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000}}};
-XMGLOBALCONST XMVECTORU32 g_XMMaskZ = {
-    {{0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000}}};
-XMGLOBALCONST XMVECTORU32 g_XMMaskW = {
-    {{0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF}}};
-XMGLOBALCONST XMVECTORF32 g_XMOne = {{{1.0f, 1.0f, 1.0f, 1.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMOne3 = {{{1.0f, 1.0f, 1.0f, 0.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMZero = {{{0.0f, 0.0f, 0.0f, 0.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMTwo = {{{2.f, 2.f, 2.f, 2.f}}};
-XMGLOBALCONST XMVECTORF32 g_XMFour = {{{4.f, 4.f, 4.f, 4.f}}};
-XMGLOBALCONST XMVECTORF32 g_XMSix = {{{6.f, 6.f, 6.f, 6.f}}};
-XMGLOBALCONST XMVECTORF32 g_XMNegativeOne = {{{-1.0f, -1.0f, -1.0f, -1.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMOneHalf = {{{0.5f, 0.5f, 0.5f, 0.5f}}};
-XMGLOBALCONST XMVECTORF32 g_XMNegativeOneHalf = {
-    {{-0.5f, -0.5f, -0.5f, -0.5f}}};
-XMGLOBALCONST XMVECTORF32 g_XMNegativeTwoPi = {
-    {{-XM_2PI, -XM_2PI, -XM_2PI, -XM_2PI}}};
-XMGLOBALCONST XMVECTORF32 g_XMNegativePi = {{{-XM_PI, -XM_PI, -XM_PI, -XM_PI}}};
-XMGLOBALCONST XMVECTORF32 g_XMHalfPi = {
-    {{XM_PIDIV2, XM_PIDIV2, XM_PIDIV2, XM_PIDIV2}}};
-XMGLOBALCONST XMVECTORF32 g_XMPi = {{{XM_PI, XM_PI, XM_PI, XM_PI}}};
-XMGLOBALCONST XMVECTORF32 g_XMReciprocalPi = {
-    {{XM_1DIVPI, XM_1DIVPI, XM_1DIVPI, XM_1DIVPI}}};
-XMGLOBALCONST XMVECTORF32 g_XMTwoPi = {{{XM_2PI, XM_2PI, XM_2PI, XM_2PI}}};
-XMGLOBALCONST XMVECTORF32 g_XMReciprocalTwoPi = {
-    {{XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI}}};
-XMGLOBALCONST XMVECTORF32 g_XMEpsilon = {
-    {{1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f}}};
-XMGLOBALCONST XMVECTORI32 g_XMInfinity = {
-    {{0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}}};
-XMGLOBALCONST XMVECTORI32 g_XMQNaN = {
-    {{0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000}}};
-XMGLOBALCONST XMVECTORI32 g_XMQNaNTest = {
-    {{0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}}};
-XMGLOBALCONST XMVECTORI32 g_XMAbsMask = {
-    {{0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}}};
-XMGLOBALCONST XMVECTORI32 g_XMFltMin = {
-    {{0x00800000, 0x00800000, 0x00800000, 0x00800000}}};
-XMGLOBALCONST XMVECTORI32 g_XMFltMax = {
-    {{0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF}}};
-XMGLOBALCONST XMVECTORU32 g_XMNegOneMask = {
-    {{0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}}};
-XMGLOBALCONST XMVECTORU32 g_XMMaskA8R8G8B8 = {
-    {{0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000}}};
-XMGLOBALCONST XMVECTORU32 g_XMFlipA8R8G8B8 = {
-    {{0x00000000, 0x00000000, 0x00000000, 0x80000000}}};
-XMGLOBALCONST XMVECTORF32 g_XMFixAA8R8G8B8 = {
-    {{0.0f, 0.0f, 0.0f, float(0x80000000U)}}};
-XMGLOBALCONST XMVECTORF32 g_XMNormalizeA8R8G8B8 = {
-    {{1.0f / (255.0f * float(0x10000)), 1.0f / (255.0f * float(0x100)),
-      1.0f / 255.0f, 1.0f / (255.0f * float(0x1000000))}}};
-XMGLOBALCONST XMVECTORU32 g_XMMaskA2B10G10R10 = {
-    {{0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000}}};
-XMGLOBALCONST XMVECTORU32 g_XMFlipA2B10G10R10 = {
-    {{0x00000200, 0x00080000, 0x20000000, 0x80000000}}};
-XMGLOBALCONST XMVECTORF32 g_XMFixAA2B10G10R10 = {
-    {{-512.0f, -512.0f * float(0x400), -512.0f * float(0x100000),
-      float(0x80000000U)}}};
-XMGLOBALCONST XMVECTORF32 g_XMNormalizeA2B10G10R10 = {
-    {{1.0f / 511.0f, 1.0f / (511.0f * float(0x400)),
-      1.0f / (511.0f * float(0x100000)), 1.0f / (3.0f * float(0x40000000))}}};
-XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16 = {
-    {{0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000}}};
-XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16 = {
-    {{0x00008000, 0x00000000, 0x00000000, 0x00000000}}};
-XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16 = {{{-32768.0f, 0.0f, 0.0f, 0.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16 = {
-    {{1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f}}};
-XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16Z16W16 = {
-    {{0x0000FFFF, 0x0000FFFF, 0xFFFF0000, 0xFFFF0000}}};
-XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16Z16W16 = {
-    {{0x00008000, 0x00008000, 0x00000000, 0x00000000}}};
-XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16Z16W16 = {
-    {{-32768.0f, -32768.0f, 0.0f, 0.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16Z16W16 = {
-    {{1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f),
-      1.0f / (32767.0f * 65536.0f)}}};
-XMGLOBALCONST XMVECTORF32 g_XMNoFraction = {
-    {{8388608.0f, 8388608.0f, 8388608.0f, 8388608.0f}}};
-XMGLOBALCONST XMVECTORI32 g_XMMaskByte = {
-    {{0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}}};
-XMGLOBALCONST XMVECTORF32 g_XMNegateX = {{{-1.0f, 1.0f, 1.0f, 1.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMNegateY = {{{1.0f, -1.0f, 1.0f, 1.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMNegateZ = {{{1.0f, 1.0f, -1.0f, 1.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMNegateW = {{{1.0f, 1.0f, 1.0f, -1.0f}}};
-XMGLOBALCONST XMVECTORU32 g_XMSelect0101 = {
-    {{XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1}}};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1010 = {
-    {{XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0}}};
-XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = {
-    {{0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD}}};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1000 = {
-    {{XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0}}};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1100 = {
-    {{XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0}}};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1110 = {
-    {{XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0}}};
-XMGLOBALCONST XMVECTORU32 g_XMSelect1011 = {
-    {{XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1}}};
-XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = {
-    {{1.0f, 1.0f / 65536.0f, 0.0f, 0.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = {
-    {{1.0f, 1.0f, 1.0f / 65536.0f, 1.0f / 65536.0f}}};
-XMGLOBALCONST XMVECTORU32 g_XMFlipY = {{{0, 0x80000000, 0, 0}}};
-XMGLOBALCONST XMVECTORU32 g_XMFlipZ = {{{0, 0, 0x80000000, 0}}};
-XMGLOBALCONST XMVECTORU32 g_XMFlipW = {{{0, 0, 0, 0x80000000}}};
-XMGLOBALCONST XMVECTORU32 g_XMFlipYZ = {{{0, 0x80000000, 0x80000000, 0}}};
-XMGLOBALCONST XMVECTORU32 g_XMFlipZW = {{{0, 0, 0x80000000, 0x80000000}}};
-XMGLOBALCONST XMVECTORU32 g_XMFlipYW = {{{0, 0x80000000, 0, 0x80000000}}};
-XMGLOBALCONST XMVECTORI32 g_XMMaskDec4 = {
-    {{0x3FF, 0x3FF << 10, 0x3FF << 20, static_cast<int>(0xC0000000)}}};
-XMGLOBALCONST XMVECTORI32 g_XMXorDec4 = {
-    {{0x200, 0x200 << 10, 0x200 << 20, 0}}};
-XMGLOBALCONST XMVECTORF32 g_XMAddUDec4 = {{{0, 0, 0, 32768.0f * 65536.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMAddDec4 = {
-    {{-512.0f, -512.0f * 1024.0f, -512.0f * 1024.0f * 1024.0f, 0}}};
-XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = {
-    {{1.0f, 1.0f / 1024.0f, 1.0f / (1024.0f * 1024.0f),
-      1.0f / (1024.0f * 1024.0f * 1024.0f)}}};
-XMGLOBALCONST XMVECTORU32 g_XMMaskByte4 = {
-    {{0xFF, 0xFF00, 0xFF0000, 0xFF000000}}};
-XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = {
-    {{0x80, 0x8000, 0x800000, 0x00000000}}};
-XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = {
-    {{-128.0f, -128.0f * 256.0f, -128.0f * 65536.0f, 0}}};
-XMGLOBALCONST XMVECTORF32 g_XMFixUnsigned = {
-    {{32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f,
-      32768.0f * 65536.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMMaxInt = {
-    {{65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f,
-      65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMMaxUInt = {
-    {{65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f,
-      65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMUnsignedFix = {
-    {{32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f,
-      32768.0f * 65536.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMsrgbScale = {{{12.92f, 12.92f, 12.92f, 1.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMsrgbA = {{{0.055f, 0.055f, 0.055f, 0.0f}}};
-XMGLOBALCONST XMVECTORF32 g_XMsrgbA1 = {{{1.055f, 1.055f, 1.055f, 1.0f}}};
-XMGLOBALCONST XMVECTORI32 g_XMExponentBias = {{{127, 127, 127, 127}}};
-XMGLOBALCONST XMVECTORI32 g_XMSubnormalExponent = {{{-126, -126, -126, -126}}};
-XMGLOBALCONST XMVECTORI32 g_XMNumTrailing = {{{23, 23, 23, 23}}};
-XMGLOBALCONST XMVECTORI32 g_XMMinNormal = {
-    {{0x00800000, 0x00800000, 0x00800000, 0x00800000}}};
-XMGLOBALCONST XMVECTORU32 g_XMNegInfinity = {
-    {{0xFF800000, 0xFF800000, 0xFF800000, 0xFF800000}}};
-XMGLOBALCONST XMVECTORU32 g_XMNegQNaN = {
-    {{0xFFC00000, 0xFFC00000, 0xFFC00000, 0xFFC00000}}};
-XMGLOBALCONST XMVECTORI32 g_XMBin128 = {
-    {{0x43000000, 0x43000000, 0x43000000, 0x43000000}}};
-XMGLOBALCONST XMVECTORU32 g_XMBinNeg150 = {
-    {{0xC3160000, 0xC3160000, 0xC3160000, 0xC3160000}}};
-XMGLOBALCONST XMVECTORI32 g_XM253 = {{{253, 253, 253, 253}}};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst1 = {
-    {{-6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f}}};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst2 = {
-    {{+2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f}}};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst3 = {
-    {{-5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f}}};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst4 = {
-    {{+9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f}}};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst5 = {
-    {{-1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f}}};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst6 = {
-    {{+1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f}}};
-XMGLOBALCONST XMVECTORF32 g_XMExpEst7 = {
-    {{-1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f}}};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst0 = {
-    {{+1.442693f, +1.442693f, +1.442693f, +1.442693f}}};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst1 = {
-    {{-0.721242f, -0.721242f, -0.721242f, -0.721242f}}};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst2 = {
-    {{+0.479384f, +0.479384f, +0.479384f, +0.479384f}}};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst3 = {
-    {{-0.350295f, -0.350295f, -0.350295f, -0.350295f}}};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst4 = {
-    {{+0.248590f, +0.248590f, +0.248590f, +0.248590f}}};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst5 = {
-    {{-0.145700f, -0.145700f, -0.145700f, -0.145700f}}};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst6 = {
-    {{+0.057148f, +0.057148f, +0.057148f, +0.057148f}}};
-XMGLOBALCONST XMVECTORF32 g_XMLogEst7 = {
-    {{-0.010578f, -0.010578f, -0.010578f, -0.010578f}}};
-XMGLOBALCONST XMVECTORF32 g_XMLgE = {
-    {{+1.442695f, +1.442695f, +1.442695f, +1.442695f}}};
-XMGLOBALCONST XMVECTORF32 g_XMInvLgE = {
-    {{+6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f}}};
-XMGLOBALCONST XMVECTORF32 g_XMLg10 = {
-    {{+3.321928f, +3.321928f, +3.321928f, +3.321928f}}};
-XMGLOBALCONST XMVECTORF32 g_XMInvLg10 = {
-    {{+3.010299956e-1f, +3.010299956e-1f, +3.010299956e-1f, +3.010299956e-1f}}};
-XMGLOBALCONST XMVECTORF32 g_UByteMax = {{{255.0f, 255.0f, 255.0f, 255.0f}}};
-XMGLOBALCONST XMVECTORF32 g_ByteMin = {{{-127.0f, -127.0f, -127.0f, -127.0f}}};
-XMGLOBALCONST XMVECTORF32 g_ByteMax = {{{127.0f, 127.0f, 127.0f, 127.0f}}};
-XMGLOBALCONST XMVECTORF32 g_ShortMin = {
-    {{-32767.0f, -32767.0f, -32767.0f, -32767.0f}}};
-XMGLOBALCONST XMVECTORF32 g_ShortMax = {
-    {{32767.0f, 32767.0f, 32767.0f, 32767.0f}}};
-XMGLOBALCONST XMVECTORF32 g_UShortMax = {
-    {{65535.0f, 65535.0f, 65535.0f, 65535.0f}}};
-
-/****************************************************************************
- *
- * Implementation
- *
- ****************************************************************************/
-
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4068 4214 4204 4365 4616 4640 6001 6101)
-// C4068/4616: ignore unknown pragmas
-// C4214/4204: nonstandard extension used
-// C4365/4640: Off by default noise
-// C6001/6101: False positives
-#endif
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
-#pragma prefast(disable : 26495, "Union initialization confuses /analyze")
-#endif
-
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wfloat-equal"
-#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
-#pragma clang diagnostic ignored "-Wunknown-warning-option"
-#pragma clang diagnostic ignored "-Wunsafe-buffer-usage"
-#endif
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1,
-                                                      uint32_t C2,
-                                                      uint32_t C3) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 vResult;
-    vResult.u[0] = (0 - (C0 & 1)) & 0x3F800000;
-    vResult.u[1] = (0 - (C1 & 1)) & 0x3F800000;
-    vResult.u[2] = (0 - (C2 & 1)) & 0x3F800000;
-    vResult.u[3] = (0 - (C3 & 1)) & 0x3F800000;
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTORU32 vResult;
-    vResult.u[0] = (0 - (C0 & 1)) & 0x3F800000;
-    vResult.u[1] = (0 - (C1 & 1)) & 0x3F800000;
-    vResult.u[2] = (0 - (C2 & 1)) & 0x3F800000;
-    vResult.u[3] = (0 - (C3 & 1)) & 0x3F800000;
-    return vResult.v;
-#else  // XM_SSE_INTRINSICS_
-    static const XMVECTORU32 g_vMask1 = {{{1, 1, 1, 1}}};
-    // Move the parms to a vector
-    __m128i vTemp = _mm_set_epi32(static_cast<int>(C3), static_cast<int>(C2),
-                                  static_cast<int>(C1), static_cast<int>(C0));
-    // Mask off the low bits
-    vTemp = _mm_and_si128(vTemp, g_vMask1);
-    // 0xFFFFFFFF on true bits
-    vTemp = _mm_cmpeq_epi32(vTemp, g_vMask1);
-    // 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f
-    vTemp = _mm_and_si128(vTemp, g_XMOne);
-    return _mm_castsi128_ps(vTemp);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) noexcept {
-    assert(IntConstant >= -16 && IntConstant <= 15);
-    assert(DivExponent < 32);
-#if defined(_XM_NO_INTRINSICS_)
-
-    using DirectX::XMConvertVectorIntToFloat;
-
-    XMVECTORI32 V = {{{IntConstant, IntConstant, IntConstant, IntConstant}}};
-    return XMConvertVectorIntToFloat(V.v, DivExponent);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Splat the int
-    int32x4_t vScale = vdupq_n_s32(IntConstant);
-    // Convert to a float
-    XMVECTOR vResult = vcvtq_f32_s32(vScale);
-    // Convert DivExponent into 1.0f/(1<<DivExponent)
-    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
-    // Splat the scalar value (It's really a float)
-    vScale = vreinterpretq_s32_u32(vdupq_n_u32(uScale));
-    // Multiply by the reciprocal (Perform a right shift by DivExponent)
-    vResult =
-        vmulq_f32(vResult, reinterpret_cast<const float32x4_t*>(&vScale)[0]);
-    return vResult;
-#else  // XM_SSE_INTRINSICS_
-       // Splat the int
-    __m128i vScale = _mm_set1_epi32(IntConstant);
-    // Convert to a float
-    XMVECTOR vResult = _mm_cvtepi32_ps(vScale);
-    // Convert DivExponent into 1.0f/(1<<DivExponent)
-    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
-    // Splat the scalar value (It's really a float)
-    vScale = _mm_set1_epi32(static_cast<int>(uScale));
-    // Multiply by the reciprocal (Perform a right shift by DivExponent)
-    vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(vScale));
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMVectorSplatConstantInt(int32_t IntConstant) noexcept {
-    assert(IntConstant >= -16 && IntConstant <= 15);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORI32 V = {{{IntConstant, IntConstant, IntConstant, IntConstant}}};
-    return V.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x4_t V = vdupq_n_s32(IntConstant);
-    return reinterpret_cast<float32x4_t*>(&V)[0];
-#else  // XM_SSE_INTRINSICS_
-    __m128i V = _mm_set1_epi32(IntConstant);
-    return _mm_castsi128_ps(V);
-#endif
-}
-
-#include "DirectXMathConvert.inl"
-#include "DirectXMathMatrix.inl"
-#include "DirectXMathMisc.inl"
-#include "DirectXMathVector.inl"
-
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-}  // namespace DirectX
diff --git a/targets/app/linux/Stubs/DirectXMath/DirectXMathConvert.inl b/targets/app/linux/Stubs/DirectXMath/DirectXMathConvert.inl
deleted file mode 100644
index b68857896..000000000
--- a/targets/app/linux/Stubs/DirectXMath/DirectXMathConvert.inl
+++ /dev/null
@@ -1,2057 +0,0 @@
-//-------------------------------------------------------------------------------------
-// DirectXMathConvert.inl -- SIMD C++ Math library
-//
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT License.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#pragma once
-
-/****************************************************************************
- *
- * Data conversion
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4701)
-// C4701: false positives
-#endif
-
-inline XMVECTOR XM_CALLCONV
-XMConvertVectorIntToFloat(FXMVECTOR VInt, uint32_t DivExponent) noexcept {
-    assert(DivExponent < 32);
-#if defined(_XM_NO_INTRINSICS_)
-    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
-    uint32_t ElementIndex = 0;
-    XMVECTOR Result;
-    do {
-        auto iTemp = static_cast<int32_t>(VInt.vector4_u32[ElementIndex]);
-        Result.vector4_f32[ElementIndex] = static_cast<float>(iTemp) * fScale;
-    } while (++ElementIndex < 4);
-    return Result;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
-    float32x4_t vResult = vcvtq_f32_s32(vreinterpretq_s32_f32(VInt));
-    return vmulq_n_f32(vResult, fScale);
-#else  // _XM_SSE_INTRINSICS_
-    // Convert to floats
-    XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
-    // Convert DivExponent into 1.0f/(1<<DivExponent)
-    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
-    // Splat the scalar value
-    __m128i vScale = _mm_set1_epi32(static_cast<int>(uScale));
-    vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(vScale));
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMConvertVectorFloatToInt(FXMVECTOR VFloat, uint32_t MulExponent) noexcept {
-    assert(MulExponent < 32);
-#if defined(_XM_NO_INTRINSICS_)
-    // Get the scalar factor.
-    auto fScale = static_cast<float>(1U << MulExponent);
-    uint32_t ElementIndex = 0;
-    XMVECTOR Result;
-    do {
-        int32_t iResult;
-        float fTemp = VFloat.vector4_f32[ElementIndex] * fScale;
-        if (fTemp <= -(65536.0f * 32768.0f)) {
-            iResult = (-0x7FFFFFFF) - 1;
-        } else if (fTemp > (65536.0f * 32768.0f) - 128.0f) {
-            iResult = 0x7FFFFFFF;
-        } else {
-            iResult = static_cast<int32_t>(fTemp);
-        }
-        Result.vector4_u32[ElementIndex] = static_cast<uint32_t>(iResult);
-    } while (++ElementIndex < 4);
-    return Result;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult =
-        vmulq_n_f32(VFloat, static_cast<float>(1U << MulExponent));
-    // In case of positive overflow, detect it
-    uint32x4_t vOverflow = vcgtq_f32(vResult, g_XMMaxInt);
-    // Float to int conversion
-    int32x4_t vResulti = vcvtq_s32_f32(vResult);
-    // If there was positive overflow, set to 0x7FFFFFFF
-    vResult = vreinterpretq_f32_u32(vandq_u32(vOverflow, g_XMAbsMask));
-    vOverflow = vbicq_u32(vreinterpretq_u32_s32(vResulti), vOverflow);
-    vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
-    return vreinterpretq_f32_u32(vOverflow);
-#else  // _XM_SSE_INTRINSICS_
-    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
-    vResult = _mm_mul_ps(vResult, VFloat);
-    // In case of positive overflow, detect it
-    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxInt);
-    // Float to int conversion
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // If there was positive overflow, set to 0x7FFFFFFF
-    vResult = _mm_and_ps(vOverflow, g_XMAbsMask);
-    vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti));
-    vOverflow = _mm_or_ps(vOverflow, vResult);
-    return vOverflow;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMConvertVectorUIntToFloat(FXMVECTOR VUInt, uint32_t DivExponent) noexcept {
-    assert(DivExponent < 32);
-#if defined(_XM_NO_INTRINSICS_)
-    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
-    uint32_t ElementIndex = 0;
-    XMVECTOR Result;
-    do {
-        Result.vector4_f32[ElementIndex] =
-            static_cast<float>(VUInt.vector4_u32[ElementIndex]) * fScale;
-    } while (++ElementIndex < 4);
-    return Result;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float fScale = 1.0f / static_cast<float>(1U << DivExponent);
-    float32x4_t vResult = vcvtq_f32_u32(vreinterpretq_u32_f32(VUInt));
-    return vmulq_n_f32(vResult, fScale);
-#else  // _XM_SSE_INTRINSICS_
-    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
-    // Determine which ones need the fix.
-    XMVECTOR vMask = _mm_and_ps(VUInt, g_XMNegativeZero);
-    // Force all values positive
-    XMVECTOR vResult = _mm_xor_ps(VUInt, vMask);
-    // Convert to floats
-    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
-    // Convert 0x80000000 -> 0xFFFFFFFF
-    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31);
-    // For only the ones that are too big, add the fixup
-    vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned);
-    vResult = _mm_add_ps(vResult, vMask);
-    // Convert DivExponent into 1.0f/(1<<DivExponent)
-    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
-    // Splat
-    iMask = _mm_set1_epi32(static_cast<int>(uScale));
-    vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(iMask));
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMConvertVectorFloatToUInt(FXMVECTOR VFloat, uint32_t MulExponent) noexcept {
-    assert(MulExponent < 32);
-#if defined(_XM_NO_INTRINSICS_)
-    // Get the scalar factor.
-    auto fScale = static_cast<float>(1U << MulExponent);
-    uint32_t ElementIndex = 0;
-    XMVECTOR Result;
-    do {
-        uint32_t uResult;
-        float fTemp = VFloat.vector4_f32[ElementIndex] * fScale;
-        if (fTemp <= 0.0f) {
-            uResult = 0;
-        } else if (fTemp >= (65536.0f * 65536.0f)) {
-            uResult = 0xFFFFFFFFU;
-        } else {
-            uResult = static_cast<uint32_t>(fTemp);
-        }
-        Result.vector4_u32[ElementIndex] = uResult;
-    } while (++ElementIndex < 4);
-    return Result;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult =
-        vmulq_n_f32(VFloat, static_cast<float>(1U << MulExponent));
-    // In case of overflow, detect it
-    uint32x4_t vOverflow = vcgtq_f32(vResult, g_XMMaxUInt);
-    // Float to int conversion
-    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
-    // If there was overflow, set to 0xFFFFFFFFU
-    vResult = vreinterpretq_f32_u32(vbicq_u32(vResulti, vOverflow));
-    vOverflow = vorrq_u32(vOverflow, vreinterpretq_u32_f32(vResult));
-    return vreinterpretq_f32_u32(vOverflow);
-#else  // _XM_SSE_INTRINSICS_
-    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
-    vResult = _mm_mul_ps(vResult, VFloat);
-    // Clamp to >=0
-    vResult = _mm_max_ps(vResult, g_XMZero);
-    // Any numbers that are too big, set to 0xFFFFFFFFU
-    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt);
-    XMVECTOR vValue = g_XMUnsignedFix;
-    // Too large for a signed integer?
-    XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue);
-    // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise
-    vValue = _mm_and_ps(vValue, vMask);
-    // Perform fixup only on numbers too large (Keeps low bit precision)
-    vResult = _mm_sub_ps(vResult, vValue);
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Convert from signed to unsigned pnly if greater than 0x80000000
-    vMask = _mm_and_ps(vMask, g_XMNegativeZero);
-    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask);
-    // On those that are too large, set to 0xFFFFFFFF
-    vResult = _mm_or_ps(vResult, vOverflow);
-    return vResult;
-#endif
-}
-
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-/****************************************************************************
- *
- * Vector and matrix load operations
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadInt(const uint32_t* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_u32[0] = *pSource;
-    V.vector4_u32[1] = 0;
-    V.vector4_u32[2] = 0;
-    V.vector4_u32[3] = 0;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t zero = vdupq_n_u32(0);
-    return vreinterpretq_f32_u32(vld1q_lane_u32(pSource, zero, 0));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_load_ss(reinterpret_cast<const float*>(pSource));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadFloat(const float* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_f32[0] = *pSource;
-    V.vector4_f32[1] = 0.f;
-    V.vector4_f32[2] = 0.f;
-    V.vector4_f32[3] = 0.f;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t zero = vdupq_n_f32(0);
-    return vld1q_lane_f32(pSource, zero, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_load_ss(pSource);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadInt2(const uint32_t* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_u32[0] = pSource[0];
-    V.vector4_u32[1] = pSource[1];
-    V.vector4_u32[2] = 0;
-    V.vector4_u32[3] = 0;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t x = vld1_u32(pSource);
-    uint32x2_t zero = vdup_n_u32(0);
-    return vreinterpretq_f32_u32(vcombine_u32(x, zero));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadInt2A(const uint32_t* pSource) noexcept {
-    assert(pSource);
-    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_u32[0] = pSource[0];
-    V.vector4_u32[1] = pSource[1];
-    V.vector4_u32[2] = 0;
-    V.vector4_u32[3] = 0;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    uint32x2_t x = vld1_u32_ex(pSource, 64);
-#else
-    uint32x2_t x = vld1_u32(pSource);
-#endif
-    uint32x2_t zero = vdup_n_u32(0);
-    return vreinterpretq_f32_u32(vcombine_u32(x, zero));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadFloat2(const XMFLOAT2* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_f32[0] = pSource->x;
-    V.vector4_f32[1] = pSource->y;
-    V.vector4_f32[2] = 0.f;
-    V.vector4_f32[3] = 0.f;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t x = vld1_f32(reinterpret_cast<const float*>(pSource));
-    float32x2_t zero = vdup_n_f32(0);
-    return vcombine_f32(x, zero);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadFloat2A(const XMFLOAT2A* pSource) noexcept {
-    assert(pSource);
-    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_f32[0] = pSource->x;
-    V.vector4_f32[1] = pSource->y;
-    V.vector4_f32[2] = 0.f;
-    V.vector4_f32[3] = 0.f;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    float32x2_t x = vld1_f32_ex(reinterpret_cast<const float*>(pSource), 64);
-#else
-    float32x2_t x = vld1_f32(reinterpret_cast<const float*>(pSource));
-#endif
-    float32x2_t zero = vdup_n_f32(0);
-    return vcombine_f32(x, zero);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadSInt2(const XMINT2* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_f32[0] = static_cast<float>(pSource->x);
-    V.vector4_f32[1] = static_cast<float>(pSource->y);
-    V.vector4_f32[2] = 0.f;
-    V.vector4_f32[3] = 0.f;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x2_t x = vld1_s32(reinterpret_cast<const int32_t*>(pSource));
-    float32x2_t v = vcvt_f32_s32(x);
-    float32x2_t zero = vdup_n_f32(0);
-    return vcombine_f32(v, zero);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 V =
-        _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
-    return _mm_cvtepi32_ps(_mm_castps_si128(V));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUInt2(const XMUINT2* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_f32[0] = static_cast<float>(pSource->x);
-    V.vector4_f32[1] = static_cast<float>(pSource->y);
-    V.vector4_f32[2] = 0.f;
-    V.vector4_f32[3] = 0.f;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t x = vld1_u32(reinterpret_cast<const uint32_t*>(pSource));
-    float32x2_t v = vcvt_f32_u32(x);
-    float32x2_t zero = vdup_n_f32(0);
-    return vcombine_f32(v, zero);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 V =
-        _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
-    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
-    // Determine which ones need the fix.
-    XMVECTOR vMask = _mm_and_ps(V, g_XMNegativeZero);
-    // Force all values positive
-    XMVECTOR vResult = _mm_xor_ps(V, vMask);
-    // Convert to floats
-    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
-    // Convert 0x80000000 -> 0xFFFFFFFF
-    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31);
-    // For only the ones that are too big, add the fixup
-    vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned);
-    vResult = _mm_add_ps(vResult, vMask);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadInt3(const uint32_t* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_u32[0] = pSource[0];
-    V.vector4_u32[1] = pSource[1];
-    V.vector4_u32[2] = pSource[2];
-    V.vector4_u32[3] = 0;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t x = vld1_u32(pSource);
-    uint32x2_t zero = vdup_n_u32(0);
-    uint32x2_t y = vld1_lane_u32(pSource + 2, zero, 0);
-    return vreinterpretq_f32_u32(vcombine_u32(x, y));
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128 xy =
-        _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
-    __m128 z = _mm_load_ss(reinterpret_cast<const float*>(pSource + 2));
-    return _mm_insert_ps(xy, z, 0x20);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 xy =
-        _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
-    __m128 z = _mm_load_ss(reinterpret_cast<const float*>(pSource + 2));
-    return _mm_movelh_ps(xy, z);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadInt3A(const uint32_t* pSource) noexcept {
-    assert(pSource);
-    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_u32[0] = pSource[0];
-    V.vector4_u32[1] = pSource[1];
-    V.vector4_u32[2] = pSource[2];
-    V.vector4_u32[3] = 0;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Reads an extra integer which is zero'd
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    uint32x4_t V = vld1q_u32_ex(pSource, 128);
-#else
-    uint32x4_t V = vld1q_u32(pSource);
-#endif
-    return vreinterpretq_f32_u32(vsetq_lane_u32(0, V, 3));
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128 xy =
-        _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
-    __m128 z = _mm_load_ss(reinterpret_cast<const float*>(pSource + 2));
-    return _mm_insert_ps(xy, z, 0x20);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 xy =
-        _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
-    __m128 z = _mm_load_ss(reinterpret_cast<const float*>(pSource + 2));
-    return _mm_movelh_ps(xy, z);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadFloat3(const XMFLOAT3* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_f32[0] = pSource->x;
-    V.vector4_f32[1] = pSource->y;
-    V.vector4_f32[2] = pSource->z;
-    V.vector4_f32[3] = 0.f;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t x = vld1_f32(reinterpret_cast<const float*>(pSource));
-    float32x2_t zero = vdup_n_f32(0);
-    float32x2_t y =
-        vld1_lane_f32(reinterpret_cast<const float*>(pSource) + 2, zero, 0);
-    return vcombine_f32(x, y);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128 xy =
-        _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
-    __m128 z = _mm_load_ss(&pSource->z);
-    return _mm_insert_ps(xy, z, 0x20);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 xy =
-        _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
-    __m128 z = _mm_load_ss(&pSource->z);
-    return _mm_movelh_ps(xy, z);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadFloat3A(const XMFLOAT3A* pSource) noexcept {
-    assert(pSource);
-    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_f32[0] = pSource->x;
-    V.vector4_f32[1] = pSource->y;
-    V.vector4_f32[2] = pSource->z;
-    V.vector4_f32[3] = 0.f;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Reads an extra float which is zero'd
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    float32x4_t V = vld1q_f32_ex(reinterpret_cast<const float*>(pSource), 128);
-#else
-    float32x4_t V = vld1q_f32(reinterpret_cast<const float*>(pSource));
-#endif
-    return vsetq_lane_f32(0, V, 3);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Reads an extra float which is zero'd
-    __m128 V = _mm_load_ps(&pSource->x);
-    return _mm_and_ps(V, g_XMMask3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadSInt3(const XMINT3* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR V;
-    V.vector4_f32[0] = static_cast<float>(pSource->x);
-    V.vector4_f32[1] = static_cast<float>(pSource->y);
-    V.vector4_f32[2] = static_cast<float>(pSource->z);
-    V.vector4_f32[3] = 0.f;
-    return V;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x2_t x = vld1_s32(reinterpret_cast<const int32_t*>(pSource));
-    int32x2_t zero = vdup_n_s32(0);
-    int32x2_t y =
-        vld1_lane_s32(reinterpret_cast<const int32_t*>(pSource) + 2, zero, 0);
-    int32x4_t v = vcombine_s32(x, y);
-    return vcvtq_f32_s32(v);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 xy =
-        _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
-    __m128 z = _mm_load_ss(reinterpret_cast<const float*>(&pSource->z));
-    __m128 V = _mm_movelh_ps(xy, z);
-    return _mm_cvtepi32_ps(_mm_castps_si128(V));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUInt3(const XMUINT3* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_f32[0] = static_cast<float>(pSource->x);
-    V.vector4_f32[1] = static_cast<float>(pSource->y);
-    V.vector4_f32[2] = static_cast<float>(pSource->z);
-    V.vector4_f32[3] = 0.f;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t x = vld1_u32(reinterpret_cast<const uint32_t*>(pSource));
-    uint32x2_t zero = vdup_n_u32(0);
-    uint32x2_t y =
-        vld1_lane_u32(reinterpret_cast<const uint32_t*>(pSource) + 2, zero, 0);
-    uint32x4_t v = vcombine_u32(x, y);
-    return vcvtq_f32_u32(v);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 xy =
-        _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pSource)));
-    __m128 z = _mm_load_ss(reinterpret_cast<const float*>(&pSource->z));
-    __m128 V = _mm_movelh_ps(xy, z);
-    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
-    // Determine which ones need the fix.
-    XMVECTOR vMask = _mm_and_ps(V, g_XMNegativeZero);
-    // Force all values positive
-    XMVECTOR vResult = _mm_xor_ps(V, vMask);
-    // Convert to floats
-    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
-    // Convert 0x80000000 -> 0xFFFFFFFF
-    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31);
-    // For only the ones that are too big, add the fixup
-    vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned);
-    vResult = _mm_add_ps(vResult, vMask);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadInt4(const uint32_t* pSource) noexcept {
-    assert(pSource);
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_u32[0] = pSource[0];
-    V.vector4_u32[1] = pSource[1];
-    V.vector4_u32[2] = pSource[2];
-    V.vector4_u32[3] = pSource[3];
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vld1q_u32(pSource));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSource));
-    return _mm_castsi128_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadInt4A(const uint32_t* pSource) noexcept {
-    assert(pSource);
-    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_u32[0] = pSource[0];
-    V.vector4_u32[1] = pSource[1];
-    V.vector4_u32[2] = pSource[2];
-    V.vector4_u32[3] = pSource[3];
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    return vld1q_u32_ex(pSource, 128);
-#else
-    return vreinterpretq_f32_u32(vld1q_u32(pSource));
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_load_si128(reinterpret_cast<const __m128i*>(pSource));
-    return _mm_castsi128_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadFloat4(const XMFLOAT4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_f32[0] = pSource->x;
-    V.vector4_f32[1] = pSource->y;
-    V.vector4_f32[2] = pSource->z;
-    V.vector4_f32[3] = pSource->w;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vld1q_f32(reinterpret_cast<const float*>(pSource));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_loadu_ps(&pSource->x);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadFloat4A(const XMFLOAT4A* pSource) noexcept {
-    assert(pSource);
-    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_f32[0] = pSource->x;
-    V.vector4_f32[1] = pSource->y;
-    V.vector4_f32[2] = pSource->z;
-    V.vector4_f32[3] = pSource->w;
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    return vld1q_f32_ex(reinterpret_cast<const float*>(pSource), 128);
-#else
-    return vld1q_f32(reinterpret_cast<const float*>(pSource));
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_load_ps(&pSource->x);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadSInt4(const XMINT4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR V;
-    V.vector4_f32[0] = static_cast<float>(pSource->x);
-    V.vector4_f32[1] = static_cast<float>(pSource->y);
-    V.vector4_f32[2] = static_cast<float>(pSource->z);
-    V.vector4_f32[3] = static_cast<float>(pSource->w);
-    return V;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x4_t v = vld1q_s32(reinterpret_cast<const int32_t*>(pSource));
-    return vcvtq_f32_s32(v);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSource));
-    return _mm_cvtepi32_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUInt4(const XMUINT4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR V;
-    V.vector4_f32[0] = static_cast<float>(pSource->x);
-    V.vector4_f32[1] = static_cast<float>(pSource->y);
-    V.vector4_f32[2] = static_cast<float>(pSource->z);
-    V.vector4_f32[3] = static_cast<float>(pSource->w);
-    return V;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t v = vld1q_u32(reinterpret_cast<const uint32_t*>(pSource));
-    return vcvtq_f32_u32(v);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSource));
-    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
-    // Determine which ones need the fix.
-    XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V), g_XMNegativeZero);
-    // Force all values positive
-    XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V), vMask);
-    // Convert to floats
-    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
-    // Convert 0x80000000 -> 0xFFFFFFFF
-    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask), 31);
-    // For only the ones that are too big, add the fixup
-    vMask = _mm_and_ps(_mm_castsi128_ps(iMask), g_XMFixUnsigned);
-    vResult = _mm_add_ps(vResult, vMask);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMMATRIX XM_CALLCONV
-XMLoadFloat3x3(const XMFLOAT3X3* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = 0.0f;
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = 0.0f;
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = 0.0f;
-    M.r[3].vector4_f32[0] = 0.0f;
-    M.r[3].vector4_f32[1] = 0.0f;
-    M.r[3].vector4_f32[2] = 0.0f;
-    M.r[3].vector4_f32[3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t v0 = vld1q_f32(&pSource->m[0][0]);
-    float32x4_t v1 = vld1q_f32(&pSource->m[1][1]);
-    float32x2_t v2 = vcreate_f32(static_cast<uint64_t>(
-        *reinterpret_cast<const uint32_t*>(&pSource->m[2][2])));
-    float32x4_t T = vextq_f32(v0, v1, 3);
-
-    XMMATRIX M;
-    M.r[0] =
-        vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
-    M.r[1] =
-        vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMask3));
-    M.r[2] = vcombine_f32(vget_high_f32(v1), v2);
-    M.r[3] = g_XMIdentityR3;
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 Z = _mm_setzero_ps();
-
-    __m128 V1 = _mm_loadu_ps(&pSource->m[0][0]);
-    __m128 V2 = _mm_loadu_ps(&pSource->m[1][1]);
-    __m128 V3 = _mm_load_ss(&pSource->m[2][2]);
-
-    __m128 T1 = _mm_unpackhi_ps(V1, Z);
-    __m128 T2 = _mm_unpacklo_ps(V2, Z);
-    __m128 T3 = _mm_shuffle_ps(V3, T2, _MM_SHUFFLE(0, 1, 0, 0));
-    __m128 T4 = _mm_movehl_ps(T2, T3);
-    __m128 T5 = _mm_movehl_ps(Z, T1);
-
-    XMMATRIX M;
-    M.r[0] = _mm_movelh_ps(V1, T1);
-    M.r[1] = _mm_add_ps(T4, T5);
-    M.r[2] = _mm_shuffle_ps(V2, V3, _MM_SHUFFLE(1, 0, 3, 2));
-    M.r[3] = g_XMIdentityR3;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMMATRIX XM_CALLCONV
-XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = 0.0f;
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = 0.0f;
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = 0.0f;
-
-    M.r[3].vector4_f32[0] = pSource->m[3][0];
-    M.r[3].vector4_f32[1] = pSource->m[3][1];
-    M.r[3].vector4_f32[2] = pSource->m[3][2];
-    M.r[3].vector4_f32[3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t v0 = vld1q_f32(&pSource->m[0][0]);
-    float32x4_t v1 = vld1q_f32(&pSource->m[1][1]);
-    float32x4_t v2 = vld1q_f32(&pSource->m[2][2]);
-
-    float32x4_t T1 = vextq_f32(v0, v1, 3);
-    float32x4_t T2 = vcombine_f32(vget_high_f32(v1), vget_low_f32(v2));
-    float32x4_t T3 = vextq_f32(v2, v2, 1);
-
-    XMMATRIX M;
-    M.r[0] =
-        vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
-    M.r[1] =
-        vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
-    M.r[2] =
-        vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
-    M.r[3] = vsetq_lane_f32(1.f, T3, 3);
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Use unaligned load instructions to
-    // load the 12 floats
-    // vTemp1 = x1,y1,z1,x2
-    XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
-    // vTemp2 = y2,z2,x3,y3
-    XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
-    // vTemp4 = z3,x4,y4,z4
-    XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
-    // vTemp3 = x3,y3,z3,z3
-    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2));
-    // vTemp2 = y2,z2,x2,x2
-    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0));
-    // vTemp2 = x2,y2,z2,z2
-    vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2));
-    // vTemp1 = x1,y1,z1,0
-    vTemp1 = _mm_and_ps(vTemp1, g_XMMask3);
-    // vTemp2 = x2,y2,z2,0
-    vTemp2 = _mm_and_ps(vTemp2, g_XMMask3);
-    // vTemp3 = x3,y3,z3,0
-    vTemp3 = _mm_and_ps(vTemp3, g_XMMask3);
-    // vTemp4i = x4,y4,z4,0
-    __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8);
-    // vTemp4i = x4,y4,z4,1.0f
-    vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3);
-    XMMATRIX M(vTemp1, vTemp2, vTemp3, _mm_castsi128_ps(vTemp4i));
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMMATRIX XM_CALLCONV
-XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept {
-    assert(pSource);
-    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = 0.0f;
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = 0.0f;
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = 0.0f;
-
-    M.r[3].vector4_f32[0] = pSource->m[3][0];
-    M.r[3].vector4_f32[1] = pSource->m[3][1];
-    M.r[3].vector4_f32[2] = pSource->m[3][2];
-    M.r[3].vector4_f32[3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    float32x4_t v0 = vld1q_f32_ex(&pSource->m[0][0], 128);
-    float32x4_t v1 = vld1q_f32_ex(&pSource->m[1][1], 128);
-    float32x4_t v2 = vld1q_f32_ex(&pSource->m[2][2], 128);
-#else
-    float32x4_t v0 = vld1q_f32(&pSource->m[0][0]);
-    float32x4_t v1 = vld1q_f32(&pSource->m[1][1]);
-    float32x4_t v2 = vld1q_f32(&pSource->m[2][2]);
-#endif
-
-    float32x4_t T1 = vextq_f32(v0, v1, 3);
-    float32x4_t T2 = vcombine_f32(vget_high_f32(v1), vget_low_f32(v2));
-    float32x4_t T3 = vextq_f32(v2, v2, 1);
-
-    XMMATRIX M;
-    M.r[0] =
-        vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
-    M.r[1] =
-        vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
-    M.r[2] =
-        vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
-    M.r[3] = vsetq_lane_f32(1.f, T3, 3);
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Use aligned load instructions to
-    // load the 12 floats
-    // vTemp1 = x1,y1,z1,x2
-    XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
-    // vTemp2 = y2,z2,x3,y3
-    XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
-    // vTemp4 = z3,x4,y4,z4
-    XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
-    // vTemp3 = x3,y3,z3,z3
-    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2));
-    // vTemp2 = y2,z2,x2,x2
-    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0));
-    // vTemp2 = x2,y2,z2,z2
-    vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2));
-    // vTemp1 = x1,y1,z1,0
-    vTemp1 = _mm_and_ps(vTemp1, g_XMMask3);
-    // vTemp2 = x2,y2,z2,0
-    vTemp2 = _mm_and_ps(vTemp2, g_XMMask3);
-    // vTemp3 = x3,y3,z3,0
-    vTemp3 = _mm_and_ps(vTemp3, g_XMMask3);
-    // vTemp4i = x4,y4,z4,0
-    __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8);
-    // vTemp4i = x4,y4,z4,1.0f
-    vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3);
-    XMMATRIX M(vTemp1, vTemp2, vTemp3, _mm_castsi128_ps(vTemp4i));
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMMATRIX XM_CALLCONV
-XMLoadFloat3x4(const XMFLOAT3X4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[1][0];
-    M.r[0].vector4_f32[2] = pSource->m[2][0];
-    M.r[0].vector4_f32[3] = 0.0f;
-
-    M.r[1].vector4_f32[0] = pSource->m[0][1];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[2][1];
-    M.r[1].vector4_f32[3] = 0.0f;
-
-    M.r[2].vector4_f32[0] = pSource->m[0][2];
-    M.r[2].vector4_f32[1] = pSource->m[1][2];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = 0.0f;
-
-    M.r[3].vector4_f32[0] = pSource->m[0][3];
-    M.r[3].vector4_f32[1] = pSource->m[1][3];
-    M.r[3].vector4_f32[2] = pSource->m[2][3];
-    M.r[3].vector4_f32[3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2x4_t vTemp0 = vld4_f32(&pSource->_11);
-    float32x4_t vTemp1 = vld1q_f32(&pSource->_31);
-
-    float32x2_t l = vget_low_f32(vTemp1);
-    float32x4_t T0 = vcombine_f32(vTemp0.val[0], l);
-    float32x2_t rl = vrev64_f32(l);
-    float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl);
-
-    float32x2_t h = vget_high_f32(vTemp1);
-    float32x4_t T2 = vcombine_f32(vTemp0.val[2], h);
-    float32x2_t rh = vrev64_f32(h);
-    float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh);
-
-    XMMATRIX M = {};
-    M.r[0] =
-        vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3));
-    M.r[1] =
-        vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
-    M.r[2] =
-        vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
-    M.r[3] = vsetq_lane_f32(1.f, T3, 3);
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = _mm_loadu_ps(&pSource->_11);
-    M.r[1] = _mm_loadu_ps(&pSource->_21);
-    M.r[2] = _mm_loadu_ps(&pSource->_31);
-    M.r[3] = g_XMIdentityR3;
-
-    // x.x,x.y,y.x,y.y
-    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
-    // x.z,x.w,y.z,y.w
-    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
-    // z.x,z.y,w.x,w.y
-    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
-    // z.z,z.w,w.z,w.w
-    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
-    XMMATRIX mResult;
-
-    // x.x,y.x,z.x,w.x
-    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
-    // x.y,y.y,z.y,w.y
-    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
-    // x.z,y.z,z.z,w.z
-    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
-    // x.w,y.w,z.w,w.w
-    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
-    return mResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMMATRIX XM_CALLCONV
-XMLoadFloat3x4A(const XMFLOAT3X4A* pSource) noexcept {
-    assert(pSource);
-    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[1][0];
-    M.r[0].vector4_f32[2] = pSource->m[2][0];
-    M.r[0].vector4_f32[3] = 0.0f;
-
-    M.r[1].vector4_f32[0] = pSource->m[0][1];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[2][1];
-    M.r[1].vector4_f32[3] = 0.0f;
-
-    M.r[2].vector4_f32[0] = pSource->m[0][2];
-    M.r[2].vector4_f32[1] = pSource->m[1][2];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = 0.0f;
-
-    M.r[3].vector4_f32[0] = pSource->m[0][3];
-    M.r[3].vector4_f32[1] = pSource->m[1][3];
-    M.r[3].vector4_f32[2] = pSource->m[2][3];
-    M.r[3].vector4_f32[3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    float32x2x4_t vTemp0 = vld4_f32_ex(&pSource->_11, 128);
-    float32x4_t vTemp1 = vld1q_f32_ex(&pSource->_31, 128);
-#else
-    float32x2x4_t vTemp0 = vld4_f32(&pSource->_11);
-    float32x4_t vTemp1 = vld1q_f32(&pSource->_31);
-#endif
-
-    float32x2_t l = vget_low_f32(vTemp1);
-    float32x4_t T0 = vcombine_f32(vTemp0.val[0], l);
-    float32x2_t rl = vrev64_f32(l);
-    float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl);
-
-    float32x2_t h = vget_high_f32(vTemp1);
-    float32x4_t T2 = vcombine_f32(vTemp0.val[2], h);
-    float32x2_t rh = vrev64_f32(h);
-    float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh);
-
-    XMMATRIX M = {};
-    M.r[0] =
-        vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3));
-    M.r[1] =
-        vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
-    M.r[2] =
-        vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
-    M.r[3] = vsetq_lane_f32(1.f, T3, 3);
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = _mm_load_ps(&pSource->_11);
-    M.r[1] = _mm_load_ps(&pSource->_21);
-    M.r[2] = _mm_load_ps(&pSource->_31);
-    M.r[3] = g_XMIdentityR3;
-
-    // x.x,x.y,y.x,y.y
-    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
-    // x.z,x.w,y.z,y.w
-    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
-    // z.x,z.y,w.x,w.y
-    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
-    // z.z,z.w,w.z,w.w
-    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
-    XMMATRIX mResult;
-
-    // x.x,y.x,z.x,w.x
-    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
-    // x.y,y.y,z.y,w.y
-    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
-    // x.z,y.z,z.z,w.z
-    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
-    // x.w,y.w,z.w,w.w
-    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
-    return mResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMMATRIX XM_CALLCONV
-XMLoadFloat4x4(const XMFLOAT4X4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = pSource->m[0][3];
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = pSource->m[1][3];
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = pSource->m[2][3];
-
-    M.r[3].vector4_f32[0] = pSource->m[3][0];
-    M.r[3].vector4_f32[1] = pSource->m[3][1];
-    M.r[3].vector4_f32[2] = pSource->m[3][2];
-    M.r[3].vector4_f32[3] = pSource->m[3][3];
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_11));
-    M.r[1] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_21));
-    M.r[2] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_31));
-    M.r[3] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_41));
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = _mm_loadu_ps(&pSource->_11);
-    M.r[1] = _mm_loadu_ps(&pSource->_21);
-    M.r[2] = _mm_loadu_ps(&pSource->_31);
-    M.r[3] = _mm_loadu_ps(&pSource->_41);
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMMATRIX XM_CALLCONV
-XMLoadFloat4x4A(const XMFLOAT4X4A* pSource) noexcept {
-    assert(pSource);
-    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.r[0].vector4_f32[0] = pSource->m[0][0];
-    M.r[0].vector4_f32[1] = pSource->m[0][1];
-    M.r[0].vector4_f32[2] = pSource->m[0][2];
-    M.r[0].vector4_f32[3] = pSource->m[0][3];
-
-    M.r[1].vector4_f32[0] = pSource->m[1][0];
-    M.r[1].vector4_f32[1] = pSource->m[1][1];
-    M.r[1].vector4_f32[2] = pSource->m[1][2];
-    M.r[1].vector4_f32[3] = pSource->m[1][3];
-
-    M.r[2].vector4_f32[0] = pSource->m[2][0];
-    M.r[2].vector4_f32[1] = pSource->m[2][1];
-    M.r[2].vector4_f32[2] = pSource->m[2][2];
-    M.r[2].vector4_f32[3] = pSource->m[2][3];
-
-    M.r[3].vector4_f32[0] = pSource->m[3][0];
-    M.r[3].vector4_f32[1] = pSource->m[3][1];
-    M.r[3].vector4_f32[2] = pSource->m[3][2];
-    M.r[3].vector4_f32[3] = pSource->m[3][3];
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMMATRIX M;
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    M.r[0] = vld1q_f32_ex(reinterpret_cast<const float*>(&pSource->_11), 128);
-    M.r[1] = vld1q_f32_ex(reinterpret_cast<const float*>(&pSource->_21), 128);
-    M.r[2] = vld1q_f32_ex(reinterpret_cast<const float*>(&pSource->_31), 128);
-    M.r[3] = vld1q_f32_ex(reinterpret_cast<const float*>(&pSource->_41), 128);
-#else
-    M.r[0] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_11));
-    M.r[1] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_21));
-    M.r[2] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_31));
-    M.r[3] = vld1q_f32(reinterpret_cast<const float*>(&pSource->_41));
-#endif
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = _mm_load_ps(&pSource->_11);
-    M.r[1] = _mm_load_ps(&pSource->_21);
-    M.r[2] = _mm_load_ps(&pSource->_31);
-    M.r[3] = _mm_load_ps(&pSource->_41);
-    return M;
-#endif
-}
-
-/****************************************************************************
- *
- * Vector and matrix store operations
- *
- ****************************************************************************/
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreInt(uint32_t* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    *pDestination = XMVectorGetIntX(V);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32(pDestination, *reinterpret_cast<const uint32x4_t*>(&V), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ss(reinterpret_cast<float*>(pDestination), V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat(float* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    *pDestination = XMVectorGetX(V);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_f32(pDestination, V, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ss(pDestination, V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreInt2(uint32_t* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
-    vst1_u32(pDestination, VL);
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreInt2A(uint32_t* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    vst1_u32_ex(pDestination, VL, 64);
-#else
-    vst1_u32(pDestination, VL);
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat2(XMFLOAT2* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    vst1_f32(reinterpret_cast<float*>(pDestination), VL);
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat2A(XMFLOAT2A* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    vst1_f32_ex(reinterpret_cast<float*>(pDestination), VL, 64);
-#else
-    vst1_f32(reinterpret_cast<float*>(pDestination), VL);
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreSInt2(XMINT2* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = static_cast<int32_t>(V.vector4_f32[0]);
-    pDestination->y = static_cast<int32_t>(V.vector4_f32[1]);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t v = vget_low_f32(V);
-    int32x2_t iv = vcvt_s32_f32(v);
-    vst1_s32(reinterpret_cast<int32_t*>(pDestination), iv);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // In case of positive overflow, detect it
-    XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt);
-    // Float to int conversion
-    __m128i vResulti = _mm_cvttps_epi32(V);
-    // If there was positive overflow, set to 0x7FFFFFFF
-    XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask);
-    vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti));
-    vOverflow = _mm_or_ps(vOverflow, vResult);
-    // Write two ints
-    _mm_store_sd(reinterpret_cast<double*>(pDestination),
-                 _mm_castps_pd(vOverflow));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUInt2(XMUINT2* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = static_cast<uint32_t>(V.vector4_f32[0]);
-    pDestination->y = static_cast<uint32_t>(V.vector4_f32[1]);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t v = vget_low_f32(V);
-    uint32x2_t iv = vcvt_u32_f32(v);
-    vst1_u32(reinterpret_cast<uint32_t*>(pDestination), iv);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Clamp to >=0
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    // Any numbers that are too big, set to 0xFFFFFFFFU
-    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt);
-    XMVECTOR vValue = g_XMUnsignedFix;
-    // Too large for a signed integer?
-    XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue);
-    // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise
-    vValue = _mm_and_ps(vValue, vMask);
-    // Perform fixup only on numbers too large (Keeps low bit precision)
-    vResult = _mm_sub_ps(vResult, vValue);
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Convert from signed to unsigned pnly if greater than 0x80000000
-    vMask = _mm_and_ps(vMask, g_XMNegativeZero);
-    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask);
-    // On those that are too large, set to 0xFFFFFFFF
-    vResult = _mm_or_ps(vResult, vOverflow);
-    // Write two uints
-    _mm_store_sd(reinterpret_cast<double*>(pDestination),
-                 _mm_castps_pd(vResult));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreInt3(uint32_t* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-    pDestination[2] = V.vector4_u32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
-    vst1_u32(pDestination, VL);
-    vst1q_lane_u32(pDestination + 2, *reinterpret_cast<const uint32x4_t*>(&V),
-                   2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
-    __m128 z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination[2]), z);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreInt3A(uint32_t* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-    pDestination[2] = V.vector4_u32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t VL = vget_low_u32(vreinterpretq_u32_f32(V));
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    vst1_u32_ex(pDestination, VL, 64);
-#else
-    vst1_u32(pDestination, VL);
-#endif
-    vst1q_lane_u32(pDestination + 2, *reinterpret_cast<const uint32x4_t*>(&V),
-                   2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
-    __m128 z = _mm_movehl_ps(V, V);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination[2]), z);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat3(XMFLOAT3* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-    pDestination->z = V.vector4_f32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    vst1_f32(reinterpret_cast<float*>(pDestination), VL);
-    vst1q_lane_f32(reinterpret_cast<float*>(pDestination) + 2, V, 2);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    *reinterpret_cast<int*>(&pDestination->x) = _mm_extract_ps(V, 0);
-    *reinterpret_cast<int*>(&pDestination->y) = _mm_extract_ps(V, 1);
-    *reinterpret_cast<int*>(&pDestination->z) = _mm_extract_ps(V, 2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
-    __m128 z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-    _mm_store_ss(&pDestination->z, z);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat3A(XMFLOAT3A* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-    pDestination->z = V.vector4_f32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    vst1_f32_ex(reinterpret_cast<float*>(pDestination), VL, 64);
-#else
-    vst1_f32(reinterpret_cast<float*>(pDestination), VL);
-#endif
-    vst1q_lane_f32(reinterpret_cast<float*>(pDestination) + 2, V, 2);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
-    *reinterpret_cast<int*>(&pDestination->z) = _mm_extract_ps(V, 2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_sd(reinterpret_cast<double*>(pDestination), _mm_castps_pd(V));
-    __m128 z = _mm_movehl_ps(V, V);
-    _mm_store_ss(&pDestination->z, z);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreSInt3(XMINT3* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = static_cast<int32_t>(V.vector4_f32[0]);
-    pDestination->y = static_cast<int32_t>(V.vector4_f32[1]);
-    pDestination->z = static_cast<int32_t>(V.vector4_f32[2]);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x4_t v = vcvtq_s32_f32(V);
-    int32x2_t vL = vget_low_s32(v);
-    vst1_s32(reinterpret_cast<int32_t*>(pDestination), vL);
-    vst1q_lane_s32(reinterpret_cast<int32_t*>(pDestination) + 2, v, 2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // In case of positive overflow, detect it
-    XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt);
-    // Float to int conversion
-    __m128i vResulti = _mm_cvttps_epi32(V);
-    // If there was positive overflow, set to 0x7FFFFFFF
-    XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask);
-    vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti));
-    vOverflow = _mm_or_ps(vOverflow, vResult);
-    // Write 3 uints
-    _mm_store_sd(reinterpret_cast<double*>(pDestination),
-                 _mm_castps_pd(vOverflow));
-    __m128 z = XM_PERMUTE_PS(vOverflow, _MM_SHUFFLE(2, 2, 2, 2));
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->z), z);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUInt3(XMUINT3* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = static_cast<uint32_t>(V.vector4_f32[0]);
-    pDestination->y = static_cast<uint32_t>(V.vector4_f32[1]);
-    pDestination->z = static_cast<uint32_t>(V.vector4_f32[2]);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t v = vcvtq_u32_f32(V);
-    uint32x2_t vL = vget_low_u32(v);
-    vst1_u32(reinterpret_cast<uint32_t*>(pDestination), vL);
-    vst1q_lane_u32(reinterpret_cast<uint32_t*>(pDestination) + 2, v, 2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Clamp to >=0
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    // Any numbers that are too big, set to 0xFFFFFFFFU
-    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt);
-    XMVECTOR vValue = g_XMUnsignedFix;
-    // Too large for a signed integer?
-    XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue);
-    // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise
-    vValue = _mm_and_ps(vValue, vMask);
-    // Perform fixup only on numbers too large (Keeps low bit precision)
-    vResult = _mm_sub_ps(vResult, vValue);
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Convert from signed to unsigned pnly if greater than 0x80000000
-    vMask = _mm_and_ps(vMask, g_XMNegativeZero);
-    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask);
-    // On those that are too large, set to 0xFFFFFFFF
-    vResult = _mm_or_ps(vResult, vOverflow);
-    // Write 3 uints
-    _mm_store_sd(reinterpret_cast<double*>(pDestination),
-                 _mm_castps_pd(vResult));
-    __m128 z = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(2, 2, 2, 2));
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->z), z);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreInt4(uint32_t* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-    pDestination[2] = V.vector4_u32[2];
-    pDestination[3] = V.vector4_u32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_u32(pDestination, vreinterpretq_u32_f32(V));
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination),
-                     _mm_castps_si128(V));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreInt4A(uint32_t* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination[0] = V.vector4_u32[0];
-    pDestination[1] = V.vector4_u32[1];
-    pDestination[2] = V.vector4_u32[2];
-    pDestination[3] = V.vector4_u32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    vst1q_u32_ex(pDestination, V, 128);
-#else
-    vst1q_u32(pDestination, vreinterpretq_u32_f32(V));
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_si128(reinterpret_cast<__m128i*>(pDestination),
-                    _mm_castps_si128(V));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat4(XMFLOAT4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-    pDestination->z = V.vector4_f32[2];
-    pDestination->w = V.vector4_f32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_f32(reinterpret_cast<float*>(pDestination), V);
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storeu_ps(&pDestination->x, V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat4A(XMFLOAT4A* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = V.vector4_f32[0];
-    pDestination->y = V.vector4_f32[1];
-    pDestination->z = V.vector4_f32[2];
-    pDestination->w = V.vector4_f32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    vst1q_f32_ex(reinterpret_cast<float*>(pDestination), V, 128);
-#else
-    vst1q_f32(reinterpret_cast<float*>(pDestination), V);
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ps(&pDestination->x, V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreSInt4(XMINT4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = static_cast<int32_t>(V.vector4_f32[0]);
-    pDestination->y = static_cast<int32_t>(V.vector4_f32[1]);
-    pDestination->z = static_cast<int32_t>(V.vector4_f32[2]);
-    pDestination->w = static_cast<int32_t>(V.vector4_f32[3]);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x4_t v = vcvtq_s32_f32(V);
-    vst1q_s32(reinterpret_cast<int32_t*>(pDestination), v);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // In case of positive overflow, detect it
-    XMVECTOR vOverflow = _mm_cmpgt_ps(V, g_XMMaxInt);
-    // Float to int conversion
-    __m128i vResulti = _mm_cvttps_epi32(V);
-    // If there was positive overflow, set to 0x7FFFFFFF
-    XMVECTOR vResult = _mm_and_ps(vOverflow, g_XMAbsMask);
-    vOverflow = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vResulti));
-    vOverflow = _mm_or_ps(vOverflow, vResult);
-    _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination),
-                     _mm_castps_si128(vOverflow));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUInt4(XMUINT4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-    pDestination->x = static_cast<uint32_t>(V.vector4_f32[0]);
-    pDestination->y = static_cast<uint32_t>(V.vector4_f32[1]);
-    pDestination->z = static_cast<uint32_t>(V.vector4_f32[2]);
-    pDestination->w = static_cast<uint32_t>(V.vector4_f32[3]);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t v = vcvtq_u32_f32(V);
-    vst1q_u32(reinterpret_cast<uint32_t*>(pDestination), v);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Clamp to >=0
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    // Any numbers that are too big, set to 0xFFFFFFFFU
-    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult, g_XMMaxUInt);
-    XMVECTOR vValue = g_XMUnsignedFix;
-    // Too large for a signed integer?
-    XMVECTOR vMask = _mm_cmpge_ps(vResult, vValue);
-    // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise
-    vValue = _mm_and_ps(vValue, vMask);
-    // Perform fixup only on numbers too large (Keeps low bit precision)
-    vResult = _mm_sub_ps(vResult, vValue);
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Convert from signed to unsigned pnly if greater than 0x80000000
-    vMask = _mm_and_ps(vMask, g_XMNegativeZero);
-    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti), vMask);
-    // On those that are too large, set to 0xFFFFFFFF
-    vResult = _mm_or_ps(vResult, vOverflow);
-    _mm_storeu_si128(reinterpret_cast<__m128i*>(pDestination),
-                     _mm_castps_si128(vResult));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat3x3(XMFLOAT3X3* pDestination, FXMMATRIX M) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1);
-    float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1);
-    vst1q_f32(&pDestination->m[0][0], T2);
-
-    T1 = vextq_f32(M.r[1], M.r[1], 1);
-    T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2]));
-    vst1q_f32(&pDestination->m[1][1], T2);
-
-    vst1q_lane_f32(&pDestination->m[2][2], M.r[2], 2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp1 = M.r[0];
-    XMVECTOR vTemp2 = M.r[1];
-    XMVECTOR vTemp3 = M.r[2];
-    XMVECTOR vWork = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(0, 0, 2, 2));
-    vTemp1 = _mm_shuffle_ps(vTemp1, vWork, _MM_SHUFFLE(2, 0, 1, 0));
-    _mm_storeu_ps(&pDestination->m[0][0], vTemp1);
-    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1));
-    _mm_storeu_ps(&pDestination->m[1][1], vTemp2);
-    vTemp3 = XM_PERMUTE_PS(vTemp3, _MM_SHUFFLE(2, 2, 2, 2));
-    _mm_store_ss(&pDestination->m[2][2], vTemp3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat4x3(XMFLOAT4X3* pDestination, FXMMATRIX M) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-
-    pDestination->m[3][0] = M.r[3].vector4_f32[0];
-    pDestination->m[3][1] = M.r[3].vector4_f32[1];
-    pDestination->m[3][2] = M.r[3].vector4_f32[2];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1);
-    float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1);
-    vst1q_f32(&pDestination->m[0][0], T2);
-
-    T1 = vextq_f32(M.r[1], M.r[1], 1);
-    T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2]));
-    vst1q_f32(&pDestination->m[1][1], T2);
-
-    T1 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0);
-    T2 = vextq_f32(T1, M.r[3], 3);
-    vst1q_f32(&pDestination->m[2][2], T2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp1 = M.r[0];
-    XMVECTOR vTemp2 = M.r[1];
-    XMVECTOR vTemp3 = M.r[2];
-    XMVECTOR vTemp4 = M.r[3];
-    XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1));
-    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(2, 2, 0, 0));
-    vTemp1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(0, 2, 1, 0));
-    vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(0, 0, 2, 2));
-    vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 1, 2, 0));
-    _mm_storeu_ps(&pDestination->m[0][0], vTemp1);
-    _mm_storeu_ps(&pDestination->m[1][1], vTemp2x);
-    _mm_storeu_ps(&pDestination->m[2][2], vTemp3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat4x3A(XMFLOAT4X3A* pDestination, FXMMATRIX M) noexcept {
-    assert(pDestination);
-    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-
-    pDestination->m[3][0] = M.r[3].vector4_f32[0];
-    pDestination->m[3][1] = M.r[3].vector4_f32[1];
-    pDestination->m[3][2] = M.r[3].vector4_f32[2];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1);
-    float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1);
-    vst1q_f32_ex(&pDestination->m[0][0], T2, 128);
-
-    T1 = vextq_f32(M.r[1], M.r[1], 1);
-    T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2]));
-    vst1q_f32_ex(&pDestination->m[1][1], T2, 128);
-
-    T1 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0);
-    T2 = vextq_f32(T1, M.r[3], 3);
-    vst1q_f32_ex(&pDestination->m[2][2], T2, 128);
-#else
-    float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1);
-    float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1);
-    vst1q_f32(&pDestination->m[0][0], T2);
-
-    T1 = vextq_f32(M.r[1], M.r[1], 1);
-    T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2]));
-    vst1q_f32(&pDestination->m[1][1], T2);
-
-    T1 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0);
-    T2 = vextq_f32(T1, M.r[3], 3);
-    vst1q_f32(&pDestination->m[2][2], T2);
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-    // x1,y1,z1,w1
-    XMVECTOR vTemp1 = M.r[0];
-    // x2,y2,z2,w2
-    XMVECTOR vTemp2 = M.r[1];
-    // x3,y3,z3,w3
-    XMVECTOR vTemp3 = M.r[2];
-    // x4,y4,z4,w4
-    XMVECTOR vTemp4 = M.r[3];
-    // z1,z1,x2,y2
-    XMVECTOR vTemp = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(1, 0, 2, 2));
-    // y2,z2,x3,y3 (Final)
-    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1));
-    // x1,y1,z1,x2 (Final)
-    vTemp1 = _mm_shuffle_ps(vTemp1, vTemp, _MM_SHUFFLE(2, 0, 1, 0));
-    // z3,z3,x4,x4
-    vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(0, 0, 2, 2));
-    // z3,x4,y4,z4 (Final)
-    vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 1, 2, 0));
-    // Store in 3 operations
-    _mm_store_ps(&pDestination->m[0][0], vTemp1);
-    _mm_store_ps(&pDestination->m[1][1], vTemp2);
-    _mm_store_ps(&pDestination->m[2][2], vTemp3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat3x4(XMFLOAT3X4* pDestination, FXMMATRIX M) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[1].vector4_f32[0];
-    pDestination->m[0][2] = M.r[2].vector4_f32[0];
-    pDestination->m[0][3] = M.r[3].vector4_f32[0];
-
-    pDestination->m[1][0] = M.r[0].vector4_f32[1];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[2].vector4_f32[1];
-    pDestination->m[1][3] = M.r[3].vector4_f32[1];
-
-    pDestination->m[2][0] = M.r[0].vector4_f32[2];
-    pDestination->m[2][1] = M.r[1].vector4_f32[2];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-    pDestination->m[2][3] = M.r[3].vector4_f32[2];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]);
-    float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]);
-
-    float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
-    float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
-
-    vst1q_f32(&pDestination->m[0][0], T0.val[0]);
-    vst1q_f32(&pDestination->m[1][0], T0.val[1]);
-    vst1q_f32(&pDestination->m[2][0], T1.val[0]);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // x.x,x.y,y.x,y.y
-    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
-    // x.z,x.w,y.z,y.w
-    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
-    // z.x,z.y,w.x,w.y
-    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
-    // z.z,z.w,w.z,w.w
-    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
-
-    // x.x,y.x,z.x,w.x
-    XMVECTOR r0 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
-    // x.y,y.y,z.y,w.y
-    XMVECTOR r1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
-    // x.z,y.z,z.z,w.z
-    XMVECTOR r2 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
-
-    _mm_storeu_ps(&pDestination->m[0][0], r0);
-    _mm_storeu_ps(&pDestination->m[1][0], r1);
-    _mm_storeu_ps(&pDestination->m[2][0], r2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat3x4A(XMFLOAT3X4A* pDestination, FXMMATRIX M) noexcept {
-    assert(pDestination);
-    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[1].vector4_f32[0];
-    pDestination->m[0][2] = M.r[2].vector4_f32[0];
-    pDestination->m[0][3] = M.r[3].vector4_f32[0];
-
-    pDestination->m[1][0] = M.r[0].vector4_f32[1];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[2].vector4_f32[1];
-    pDestination->m[1][3] = M.r[3].vector4_f32[1];
-
-    pDestination->m[2][0] = M.r[0].vector4_f32[2];
-    pDestination->m[2][1] = M.r[1].vector4_f32[2];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-    pDestination->m[2][3] = M.r[3].vector4_f32[2];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]);
-    float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]);
-
-    float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
-    float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
-
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    vst1q_f32_ex(&pDestination->m[0][0], T0.val[0], 128);
-    vst1q_f32_ex(&pDestination->m[1][0], T0.val[1], 128);
-    vst1q_f32_ex(&pDestination->m[2][0], T1.val[0], 128);
-#else
-    vst1q_f32(&pDestination->m[0][0], T0.val[0]);
-    vst1q_f32(&pDestination->m[1][0], T0.val[1]);
-    vst1q_f32(&pDestination->m[2][0], T1.val[0]);
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-    // x.x,x.y,y.x,y.y
-    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
-    // x.z,x.w,y.z,y.w
-    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
-    // z.x,z.y,w.x,w.y
-    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
-    // z.z,z.w,w.z,w.w
-    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
-
-    // x.x,y.x,z.x,w.x
-    XMVECTOR r0 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
-    // x.y,y.y,z.y,w.y
-    XMVECTOR r1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
-    // x.z,y.z,z.z,w.z
-    XMVECTOR r2 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
-
-    _mm_store_ps(&pDestination->m[0][0], r0);
-    _mm_store_ps(&pDestination->m[1][0], r1);
-    _mm_store_ps(&pDestination->m[2][0], r2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat4x4(XMFLOAT4X4* pDestination, FXMMATRIX M) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-    pDestination->m[0][3] = M.r[0].vector4_f32[3];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-    pDestination->m[1][3] = M.r[1].vector4_f32[3];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-    pDestination->m[2][3] = M.r[2].vector4_f32[3];
-
-    pDestination->m[3][0] = M.r[3].vector4_f32[0];
-    pDestination->m[3][1] = M.r[3].vector4_f32[1];
-    pDestination->m[3][2] = M.r[3].vector4_f32[2];
-    pDestination->m[3][3] = M.r[3].vector4_f32[3];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_f32(reinterpret_cast<float*>(&pDestination->_11), M.r[0]);
-    vst1q_f32(reinterpret_cast<float*>(&pDestination->_21), M.r[1]);
-    vst1q_f32(reinterpret_cast<float*>(&pDestination->_31), M.r[2]);
-    vst1q_f32(reinterpret_cast<float*>(&pDestination->_41), M.r[3]);
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_storeu_ps(&pDestination->_11, M.r[0]);
-    _mm_storeu_ps(&pDestination->_21, M.r[1]);
-    _mm_storeu_ps(&pDestination->_31, M.r[2]);
-    _mm_storeu_ps(&pDestination->_41, M.r[3]);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat4x4A(XMFLOAT4X4A* pDestination, FXMMATRIX M) noexcept {
-    assert(pDestination);
-    assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
-#if defined(_XM_NO_INTRINSICS_)
-
-    pDestination->m[0][0] = M.r[0].vector4_f32[0];
-    pDestination->m[0][1] = M.r[0].vector4_f32[1];
-    pDestination->m[0][2] = M.r[0].vector4_f32[2];
-    pDestination->m[0][3] = M.r[0].vector4_f32[3];
-
-    pDestination->m[1][0] = M.r[1].vector4_f32[0];
-    pDestination->m[1][1] = M.r[1].vector4_f32[1];
-    pDestination->m[1][2] = M.r[1].vector4_f32[2];
-    pDestination->m[1][3] = M.r[1].vector4_f32[3];
-
-    pDestination->m[2][0] = M.r[2].vector4_f32[0];
-    pDestination->m[2][1] = M.r[2].vector4_f32[1];
-    pDestination->m[2][2] = M.r[2].vector4_f32[2];
-    pDestination->m[2][3] = M.r[2].vector4_f32[3];
-
-    pDestination->m[3][0] = M.r[3].vector4_f32[0];
-    pDestination->m[3][1] = M.r[3].vector4_f32[1];
-    pDestination->m[3][2] = M.r[3].vector4_f32[2];
-    pDestination->m[3][3] = M.r[3].vector4_f32[3];
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    vst1q_f32_ex(reinterpret_cast<float*>(&pDestination->_11), M.r[0], 128);
-    vst1q_f32_ex(reinterpret_cast<float*>(&pDestination->_21), M.r[1], 128);
-    vst1q_f32_ex(reinterpret_cast<float*>(&pDestination->_31), M.r[2], 128);
-    vst1q_f32_ex(reinterpret_cast<float*>(&pDestination->_41), M.r[3], 128);
-#else
-    vst1q_f32(reinterpret_cast<float*>(&pDestination->_11), M.r[0]);
-    vst1q_f32(reinterpret_cast<float*>(&pDestination->_21), M.r[1]);
-    vst1q_f32(reinterpret_cast<float*>(&pDestination->_31), M.r[2]);
-    vst1q_f32(reinterpret_cast<float*>(&pDestination->_41), M.r[3]);
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ps(&pDestination->_11, M.r[0]);
-    _mm_store_ps(&pDestination->_21, M.r[1]);
-    _mm_store_ps(&pDestination->_31, M.r[2]);
-    _mm_store_ps(&pDestination->_41, M.r[3]);
-#endif
-}
diff --git a/targets/app/linux/Stubs/DirectXMath/DirectXMathMatrix.inl b/targets/app/linux/Stubs/DirectXMath/DirectXMathMatrix.inl
deleted file mode 100644
index d4ff70e09..000000000
--- a/targets/app/linux/Stubs/DirectXMath/DirectXMathMatrix.inl
+++ /dev/null
@@ -1,3484 +0,0 @@
-//-------------------------------------------------------------------------------------
-// DirectXMathMatrix.inl -- SIMD C++ Math library
-//
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT License.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#pragma once
-
-/****************************************************************************
- *
- * Matrix
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-// Comparison operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && \
-    !defined(__INTEL_COMPILER)
-#pragma float_control(push)
-#pragma float_control(precise, on)
-#endif
-
-// Return true if any entry in the matrix is NaN
-inline bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    size_t i = 16;
-    auto pWork = reinterpret_cast<const uint32_t*>(&M.m[0][0]);
-    do {
-        // Fetch value into integer unit
-        uint32_t uTest = pWork[0];
-        // Remove sign
-        uTest &= 0x7FFFFFFFU;
-        // NaN is 0x7F800001 through 0x7FFFFFFF inclusive
-        uTest -= 0x7F800001U;
-        if (uTest < 0x007FFFFFU) {
-            break;  // NaN found
-        }
-        ++pWork;  // Next entry
-    } while (--i);
-    return (i != 0);  // i == 0 if nothing matched
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Load in registers
-    float32x4_t vX = M.r[0];
-    float32x4_t vY = M.r[1];
-    float32x4_t vZ = M.r[2];
-    float32x4_t vW = M.r[3];
-    // Test themselves to check for NaN
-    uint32x4_t xmask = vmvnq_u32(vceqq_f32(vX, vX));
-    uint32x4_t ymask = vmvnq_u32(vceqq_f32(vY, vY));
-    uint32x4_t zmask = vmvnq_u32(vceqq_f32(vZ, vZ));
-    uint32x4_t wmask = vmvnq_u32(vceqq_f32(vW, vW));
-    // Or all the results
-    xmask = vorrq_u32(xmask, zmask);
-    ymask = vorrq_u32(ymask, wmask);
-    xmask = vorrq_u32(xmask, ymask);
-    // If any tested true, return true
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(xmask)),
-                                vget_high_u8(vreinterpretq_u8_u32(xmask)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
-    return (r != 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Load in registers
-    XMVECTOR vX = M.r[0];
-    XMVECTOR vY = M.r[1];
-    XMVECTOR vZ = M.r[2];
-    XMVECTOR vW = M.r[3];
-    // Test themselves to check for NaN
-    vX = _mm_cmpneq_ps(vX, vX);
-    vY = _mm_cmpneq_ps(vY, vY);
-    vZ = _mm_cmpneq_ps(vZ, vZ);
-    vW = _mm_cmpneq_ps(vW, vW);
-    // Or all the results
-    vX = _mm_or_ps(vX, vZ);
-    vY = _mm_or_ps(vY, vW);
-    vX = _mm_or_ps(vX, vY);
-    // If any tested true, return true
-    return (_mm_movemask_ps(vX) != 0);
-#else
-#endif
-}
-
-#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && \
-    !defined(__INTEL_COMPILER)
-#pragma float_control(pop)
-#endif
-
-//------------------------------------------------------------------------------
-
-// Return true if any entry in the matrix is +/-INF
-inline bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    size_t i = 16;
-    auto pWork = reinterpret_cast<const uint32_t*>(&M.m[0][0]);
-    do {
-        // Fetch value into integer unit
-        uint32_t uTest = pWork[0];
-        // Remove sign
-        uTest &= 0x7FFFFFFFU;
-        // INF is 0x7F800000
-        if (uTest == 0x7F800000U) {
-            break;  // INF found
-        }
-        ++pWork;  // Next entry
-    } while (--i);
-    return (i != 0);  // i == 0 if nothing matched
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Load in registers
-    float32x4_t vX = M.r[0];
-    float32x4_t vY = M.r[1];
-    float32x4_t vZ = M.r[2];
-    float32x4_t vW = M.r[3];
-    // Mask off the sign bits
-    vX = vreinterpretq_f32_u32(
-        vandq_u32(vreinterpretq_u32_f32(vX), g_XMAbsMask));
-    vY = vreinterpretq_f32_u32(
-        vandq_u32(vreinterpretq_u32_f32(vY), g_XMAbsMask));
-    vZ = vreinterpretq_f32_u32(
-        vandq_u32(vreinterpretq_u32_f32(vZ), g_XMAbsMask));
-    vW = vreinterpretq_f32_u32(
-        vandq_u32(vreinterpretq_u32_f32(vW), g_XMAbsMask));
-    // Compare to infinity
-    uint32x4_t xmask = vceqq_f32(vX, g_XMInfinity);
-    uint32x4_t ymask = vceqq_f32(vY, g_XMInfinity);
-    uint32x4_t zmask = vceqq_f32(vZ, g_XMInfinity);
-    uint32x4_t wmask = vceqq_f32(vW, g_XMInfinity);
-    // Or the answers together
-    xmask = vorrq_u32(xmask, zmask);
-    ymask = vorrq_u32(ymask, wmask);
-    xmask = vorrq_u32(xmask, ymask);
-    // If any tested true, return true
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(xmask)),
-                                vget_high_u8(vreinterpretq_u8_u32(xmask)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
-    return (r != 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Mask off the sign bits
-    XMVECTOR vTemp1 = _mm_and_ps(M.r[0], g_XMAbsMask);
-    XMVECTOR vTemp2 = _mm_and_ps(M.r[1], g_XMAbsMask);
-    XMVECTOR vTemp3 = _mm_and_ps(M.r[2], g_XMAbsMask);
-    XMVECTOR vTemp4 = _mm_and_ps(M.r[3], g_XMAbsMask);
-    // Compare to infinity
-    vTemp1 = _mm_cmpeq_ps(vTemp1, g_XMInfinity);
-    vTemp2 = _mm_cmpeq_ps(vTemp2, g_XMInfinity);
-    vTemp3 = _mm_cmpeq_ps(vTemp3, g_XMInfinity);
-    vTemp4 = _mm_cmpeq_ps(vTemp4, g_XMInfinity);
-    // Or the answers together
-    vTemp1 = _mm_or_ps(vTemp1, vTemp2);
-    vTemp3 = _mm_or_ps(vTemp3, vTemp4);
-    vTemp1 = _mm_or_ps(vTemp1, vTemp3);
-    // If any are infinity, the signs are true.
-    return (_mm_movemask_ps(vTemp1) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Return true if the XMMatrix is equal to identity
-inline bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    // Use the integer pipeline to reduce branching to a minimum
-    auto pWork = reinterpret_cast<const uint32_t*>(&M.m[0][0]);
-    // Convert 1.0f to zero and or them together
-    uint32_t uOne = pWork[0] ^ 0x3F800000U;
-    // Or all the 0.0f entries together
-    uint32_t uZero = pWork[1];
-    uZero |= pWork[2];
-    uZero |= pWork[3];
-    // 2nd row
-    uZero |= pWork[4];
-    uOne |= pWork[5] ^ 0x3F800000U;
-    uZero |= pWork[6];
-    uZero |= pWork[7];
-    // 3rd row
-    uZero |= pWork[8];
-    uZero |= pWork[9];
-    uOne |= pWork[10] ^ 0x3F800000U;
-    uZero |= pWork[11];
-    // 4th row
-    uZero |= pWork[12];
-    uZero |= pWork[13];
-    uZero |= pWork[14];
-    uOne |= pWork[15] ^ 0x3F800000U;
-    // If all zero entries are zero, the uZero==0
-    uZero &= 0x7FFFFFFF;  // Allow -0.0f
-    // If all 1.0f entries are 1.0f, then uOne==0
-    uOne |= uZero;
-    return (uOne == 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t xmask = vceqq_f32(M.r[0], g_XMIdentityR0);
-    uint32x4_t ymask = vceqq_f32(M.r[1], g_XMIdentityR1);
-    uint32x4_t zmask = vceqq_f32(M.r[2], g_XMIdentityR2);
-    uint32x4_t wmask = vceqq_f32(M.r[3], g_XMIdentityR3);
-    xmask = vandq_u32(xmask, zmask);
-    ymask = vandq_u32(ymask, wmask);
-    xmask = vandq_u32(xmask, ymask);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(xmask)),
-                                vget_high_u8(vreinterpretq_u8_u32(xmask)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
-    return (r == 0xFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0], g_XMIdentityR0);
-    XMVECTOR vTemp2 = _mm_cmpeq_ps(M.r[1], g_XMIdentityR1);
-    XMVECTOR vTemp3 = _mm_cmpeq_ps(M.r[2], g_XMIdentityR2);
-    XMVECTOR vTemp4 = _mm_cmpeq_ps(M.r[3], g_XMIdentityR3);
-    vTemp1 = _mm_and_ps(vTemp1, vTemp2);
-    vTemp3 = _mm_and_ps(vTemp3, vTemp4);
-    vTemp1 = _mm_and_ps(vTemp1, vTemp3);
-    return (_mm_movemask_ps(vTemp1) == 0x0f);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Computation operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-// Perform a 4x4 matrix multiply by a 4x4 matrix
-inline XMMATRIX XM_CALLCONV XMMatrixMultiply(FXMMATRIX M1,
-                                             CXMMATRIX M2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMMATRIX mResult;
-    // Cache the invariants in registers
-    float x = M1.m[0][0];
-    float y = M1.m[0][1];
-    float z = M1.m[0][2];
-    float w = M1.m[0][3];
-    // Perform the operation on the first row
-    mResult.m[0][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) +
-                      (M2.m[3][0] * w);
-    mResult.m[0][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) +
-                      (M2.m[3][1] * w);
-    mResult.m[0][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) +
-                      (M2.m[3][2] * w);
-    mResult.m[0][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) +
-                      (M2.m[3][3] * w);
-    // Repeat for all the other rows
-    x = M1.m[1][0];
-    y = M1.m[1][1];
-    z = M1.m[1][2];
-    w = M1.m[1][3];
-    mResult.m[1][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) +
-                      (M2.m[3][0] * w);
-    mResult.m[1][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) +
-                      (M2.m[3][1] * w);
-    mResult.m[1][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) +
-                      (M2.m[3][2] * w);
-    mResult.m[1][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) +
-                      (M2.m[3][3] * w);
-    x = M1.m[2][0];
-    y = M1.m[2][1];
-    z = M1.m[2][2];
-    w = M1.m[2][3];
-    mResult.m[2][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) +
-                      (M2.m[3][0] * w);
-    mResult.m[2][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) +
-                      (M2.m[3][1] * w);
-    mResult.m[2][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) +
-                      (M2.m[3][2] * w);
-    mResult.m[2][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) +
-                      (M2.m[3][3] * w);
-    x = M1.m[3][0];
-    y = M1.m[3][1];
-    z = M1.m[3][2];
-    w = M1.m[3][3];
-    mResult.m[3][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) +
-                      (M2.m[3][0] * w);
-    mResult.m[3][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) +
-                      (M2.m[3][1] * w);
-    mResult.m[3][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) +
-                      (M2.m[3][2] * w);
-    mResult.m[3][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) +
-                      (M2.m[3][3] * w);
-    return mResult;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMMATRIX mResult;
-    float32x2_t VL = vget_low_f32(M1.r[0]);
-    float32x2_t VH = vget_high_f32(M1.r[0]);
-    // Perform the operation on the first row
-    float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0);
-    float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1);
-    float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
-    float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
-    mResult.r[0] = vaddq_f32(vZ, vW);
-    // Repeat for the other 3 rows
-    VL = vget_low_f32(M1.r[1]);
-    VH = vget_high_f32(M1.r[1]);
-    vX = vmulq_lane_f32(M2.r[0], VL, 0);
-    vY = vmulq_lane_f32(M2.r[1], VL, 1);
-    vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
-    vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
-    mResult.r[1] = vaddq_f32(vZ, vW);
-    VL = vget_low_f32(M1.r[2]);
-    VH = vget_high_f32(M1.r[2]);
-    vX = vmulq_lane_f32(M2.r[0], VL, 0);
-    vY = vmulq_lane_f32(M2.r[1], VL, 1);
-    vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
-    vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
-    mResult.r[2] = vaddq_f32(vZ, vW);
-    VL = vget_low_f32(M1.r[3]);
-    VH = vget_high_f32(M1.r[3]);
-    vX = vmulq_lane_f32(M2.r[0], VL, 0);
-    vY = vmulq_lane_f32(M2.r[1], VL, 1);
-    vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
-    vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
-    mResult.r[3] = vaddq_f32(vZ, vW);
-    return mResult;
-#elif defined(_XM_AVX2_INTRINSICS_)
-    __m256 t0 = _mm256_castps128_ps256(M1.r[0]);
-    t0 = _mm256_insertf128_ps(t0, M1.r[1], 1);
-    __m256 t1 = _mm256_castps128_ps256(M1.r[2]);
-    t1 = _mm256_insertf128_ps(t1, M1.r[3], 1);
-
-    __m256 u0 = _mm256_castps128_ps256(M2.r[0]);
-    u0 = _mm256_insertf128_ps(u0, M2.r[1], 1);
-    __m256 u1 = _mm256_castps128_ps256(M2.r[2]);
-    u1 = _mm256_insertf128_ps(u1, M2.r[3], 1);
-
-    __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0));
-    __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
-    __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00);
-    __m256 c0 = _mm256_mul_ps(a0, b0);
-    __m256 c1 = _mm256_mul_ps(a1, b0);
-
-    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1));
-    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
-    b0 = _mm256_permute2f128_ps(u0, u0, 0x11);
-    __m256 c2 = _mm256_fmadd_ps(a0, b0, c0);
-    __m256 c3 = _mm256_fmadd_ps(a1, b0, c1);
-
-    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2));
-    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
-    __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00);
-    __m256 c4 = _mm256_mul_ps(a0, b1);
-    __m256 c5 = _mm256_mul_ps(a1, b1);
-
-    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3));
-    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
-    b1 = _mm256_permute2f128_ps(u1, u1, 0x11);
-    __m256 c6 = _mm256_fmadd_ps(a0, b1, c4);
-    __m256 c7 = _mm256_fmadd_ps(a1, b1, c5);
-
-    t0 = _mm256_add_ps(c2, c6);
-    t1 = _mm256_add_ps(c3, c7);
-
-    XMMATRIX mResult;
-    mResult.r[0] = _mm256_castps256_ps128(t0);
-    mResult.r[1] = _mm256_extractf128_ps(t0, 1);
-    mResult.r[2] = _mm256_castps256_ps128(t1);
-    mResult.r[3] = _mm256_extractf128_ps(t1, 1);
-    return mResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX mResult;
-    // Splat the component X,Y,Z then W
-#if defined(_XM_AVX_INTRINSICS_)
-    XMVECTOR vX =
-        _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 0);
-    XMVECTOR vY =
-        _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 1);
-    XMVECTOR vZ =
-        _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 2);
-    XMVECTOR vW =
-        _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 3);
-#else
-    // Use vW to hold the original row
-    XMVECTOR vW = M1.r[0];
-    XMVECTOR vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
-    XMVECTOR vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
-    XMVECTOR vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
-    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
-#endif
-    // Perform the operation on the first row
-    vX = _mm_mul_ps(vX, M2.r[0]);
-    vY = _mm_mul_ps(vY, M2.r[1]);
-    vZ = _mm_mul_ps(vZ, M2.r[2]);
-    vW = _mm_mul_ps(vW, M2.r[3]);
-    // Perform a binary add to reduce cumulative errors
-    vX = _mm_add_ps(vX, vZ);
-    vY = _mm_add_ps(vY, vW);
-    vX = _mm_add_ps(vX, vY);
-    mResult.r[0] = vX;
-    // Repeat for the other 3 rows
-#if defined(_XM_AVX_INTRINSICS_)
-    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 0);
-    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 1);
-    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 2);
-    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 3);
-#else
-    vW = M1.r[1];
-    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
-    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
-    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
-    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
-#endif
-    vX = _mm_mul_ps(vX, M2.r[0]);
-    vY = _mm_mul_ps(vY, M2.r[1]);
-    vZ = _mm_mul_ps(vZ, M2.r[2]);
-    vW = _mm_mul_ps(vW, M2.r[3]);
-    vX = _mm_add_ps(vX, vZ);
-    vY = _mm_add_ps(vY, vW);
-    vX = _mm_add_ps(vX, vY);
-    mResult.r[1] = vX;
-#if defined(_XM_AVX_INTRINSICS_)
-    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 0);
-    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 1);
-    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 2);
-    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 3);
-#else
-    vW = M1.r[2];
-    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
-    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
-    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
-    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
-#endif
-    vX = _mm_mul_ps(vX, M2.r[0]);
-    vY = _mm_mul_ps(vY, M2.r[1]);
-    vZ = _mm_mul_ps(vZ, M2.r[2]);
-    vW = _mm_mul_ps(vW, M2.r[3]);
-    vX = _mm_add_ps(vX, vZ);
-    vY = _mm_add_ps(vY, vW);
-    vX = _mm_add_ps(vX, vY);
-    mResult.r[2] = vX;
-#if defined(_XM_AVX_INTRINSICS_)
-    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 0);
-    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 1);
-    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 2);
-    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 3);
-#else
-    vW = M1.r[3];
-    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
-    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
-    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
-    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
-#endif
-    vX = _mm_mul_ps(vX, M2.r[0]);
-    vY = _mm_mul_ps(vY, M2.r[1]);
-    vZ = _mm_mul_ps(vZ, M2.r[2]);
-    vW = _mm_mul_ps(vW, M2.r[3]);
-    vX = _mm_add_ps(vX, vZ);
-    vY = _mm_add_ps(vY, vW);
-    vX = _mm_add_ps(vX, vY);
-    mResult.r[3] = vX;
-    return mResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose(FXMMATRIX M1,
-                                                      CXMMATRIX M2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMMATRIX mResult;
-    // Cache the invariants in registers
-    float x = M2.m[0][0];
-    float y = M2.m[1][0];
-    float z = M2.m[2][0];
-    float w = M2.m[3][0];
-    // Perform the operation on the first row
-    mResult.m[0][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) +
-                      (M1.m[0][3] * w);
-    mResult.m[0][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) +
-                      (M1.m[1][3] * w);
-    mResult.m[0][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) +
-                      (M1.m[2][3] * w);
-    mResult.m[0][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) +
-                      (M1.m[3][3] * w);
-    // Repeat for all the other rows
-    x = M2.m[0][1];
-    y = M2.m[1][1];
-    z = M2.m[2][1];
-    w = M2.m[3][1];
-    mResult.m[1][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) +
-                      (M1.m[0][3] * w);
-    mResult.m[1][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) +
-                      (M1.m[1][3] * w);
-    mResult.m[1][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) +
-                      (M1.m[2][3] * w);
-    mResult.m[1][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) +
-                      (M1.m[3][3] * w);
-    x = M2.m[0][2];
-    y = M2.m[1][2];
-    z = M2.m[2][2];
-    w = M2.m[3][2];
-    mResult.m[2][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) +
-                      (M1.m[0][3] * w);
-    mResult.m[2][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) +
-                      (M1.m[1][3] * w);
-    mResult.m[2][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) +
-                      (M1.m[2][3] * w);
-    mResult.m[2][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) +
-                      (M1.m[3][3] * w);
-    x = M2.m[0][3];
-    y = M2.m[1][3];
-    z = M2.m[2][3];
-    w = M2.m[3][3];
-    mResult.m[3][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) +
-                      (M1.m[0][3] * w);
-    mResult.m[3][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) +
-                      (M1.m[1][3] * w);
-    mResult.m[3][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) +
-                      (M1.m[2][3] * w);
-    mResult.m[3][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) +
-                      (M1.m[3][3] * w);
-    return mResult;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(M1.r[0]);
-    float32x2_t VH = vget_high_f32(M1.r[0]);
-    // Perform the operation on the first row
-    float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0);
-    float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1);
-    float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
-    float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
-    float32x4_t r0 = vaddq_f32(vZ, vW);
-    // Repeat for the other 3 rows
-    VL = vget_low_f32(M1.r[1]);
-    VH = vget_high_f32(M1.r[1]);
-    vX = vmulq_lane_f32(M2.r[0], VL, 0);
-    vY = vmulq_lane_f32(M2.r[1], VL, 1);
-    vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
-    vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
-    float32x4_t r1 = vaddq_f32(vZ, vW);
-    VL = vget_low_f32(M1.r[2]);
-    VH = vget_high_f32(M1.r[2]);
-    vX = vmulq_lane_f32(M2.r[0], VL, 0);
-    vY = vmulq_lane_f32(M2.r[1], VL, 1);
-    vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
-    vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
-    float32x4_t r2 = vaddq_f32(vZ, vW);
-    VL = vget_low_f32(M1.r[3]);
-    VH = vget_high_f32(M1.r[3]);
-    vX = vmulq_lane_f32(M2.r[0], VL, 0);
-    vY = vmulq_lane_f32(M2.r[1], VL, 1);
-    vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
-    vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
-    float32x4_t r3 = vaddq_f32(vZ, vW);
-
-    // Transpose result
-    float32x4x2_t P0 = vzipq_f32(r0, r2);
-    float32x4x2_t P1 = vzipq_f32(r1, r3);
-
-    float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
-    float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
-
-    XMMATRIX mResult;
-    mResult.r[0] = T0.val[0];
-    mResult.r[1] = T0.val[1];
-    mResult.r[2] = T1.val[0];
-    mResult.r[3] = T1.val[1];
-    return mResult;
-#elif defined(_XM_AVX2_INTRINSICS_)
-    __m256 t0 = _mm256_castps128_ps256(M1.r[0]);
-    t0 = _mm256_insertf128_ps(t0, M1.r[1], 1);
-    __m256 t1 = _mm256_castps128_ps256(M1.r[2]);
-    t1 = _mm256_insertf128_ps(t1, M1.r[3], 1);
-
-    __m256 u0 = _mm256_castps128_ps256(M2.r[0]);
-    u0 = _mm256_insertf128_ps(u0, M2.r[1], 1);
-    __m256 u1 = _mm256_castps128_ps256(M2.r[2]);
-    u1 = _mm256_insertf128_ps(u1, M2.r[3], 1);
-
-    __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0));
-    __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
-    __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00);
-    __m256 c0 = _mm256_mul_ps(a0, b0);
-    __m256 c1 = _mm256_mul_ps(a1, b0);
-
-    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1));
-    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
-    b0 = _mm256_permute2f128_ps(u0, u0, 0x11);
-    __m256 c2 = _mm256_fmadd_ps(a0, b0, c0);
-    __m256 c3 = _mm256_fmadd_ps(a1, b0, c1);
-
-    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2));
-    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
-    __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00);
-    __m256 c4 = _mm256_mul_ps(a0, b1);
-    __m256 c5 = _mm256_mul_ps(a1, b1);
-
-    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3));
-    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
-    b1 = _mm256_permute2f128_ps(u1, u1, 0x11);
-    __m256 c6 = _mm256_fmadd_ps(a0, b1, c4);
-    __m256 c7 = _mm256_fmadd_ps(a1, b1, c5);
-
-    t0 = _mm256_add_ps(c2, c6);
-    t1 = _mm256_add_ps(c3, c7);
-
-    // Transpose result
-    __m256 vTemp = _mm256_unpacklo_ps(t0, t1);
-    __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1);
-    __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
-    __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);
-    vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4);
-    vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4);
-    t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
-    t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);
-
-    XMMATRIX mResult;
-    mResult.r[0] = _mm256_castps256_ps128(t0);
-    mResult.r[1] = _mm256_extractf128_ps(t0, 1);
-    mResult.r[2] = _mm256_castps256_ps128(t1);
-    mResult.r[3] = _mm256_extractf128_ps(t1, 1);
-    return mResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Splat the component X,Y,Z then W
-#if defined(_XM_AVX_INTRINSICS_)
-    XMVECTOR vX =
-        _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 0);
-    XMVECTOR vY =
-        _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 1);
-    XMVECTOR vZ =
-        _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 2);
-    XMVECTOR vW =
-        _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 3);
-#else
-    // Use vW to hold the original row
-    XMVECTOR vW = M1.r[0];
-    XMVECTOR vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
-    XMVECTOR vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
-    XMVECTOR vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
-    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
-#endif
-    // Perform the operation on the first row
-    vX = _mm_mul_ps(vX, M2.r[0]);
-    vY = _mm_mul_ps(vY, M2.r[1]);
-    vZ = _mm_mul_ps(vZ, M2.r[2]);
-    vW = _mm_mul_ps(vW, M2.r[3]);
-    // Perform a binary add to reduce cumulative errors
-    vX = _mm_add_ps(vX, vZ);
-    vY = _mm_add_ps(vY, vW);
-    vX = _mm_add_ps(vX, vY);
-    XMVECTOR r0 = vX;
-    // Repeat for the other 3 rows
-#if defined(_XM_AVX_INTRINSICS_)
-    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 0);
-    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 1);
-    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 2);
-    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 3);
-#else
-    vW = M1.r[1];
-    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
-    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
-    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
-    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
-#endif
-    vX = _mm_mul_ps(vX, M2.r[0]);
-    vY = _mm_mul_ps(vY, M2.r[1]);
-    vZ = _mm_mul_ps(vZ, M2.r[2]);
-    vW = _mm_mul_ps(vW, M2.r[3]);
-    vX = _mm_add_ps(vX, vZ);
-    vY = _mm_add_ps(vY, vW);
-    vX = _mm_add_ps(vX, vY);
-    XMVECTOR r1 = vX;
-#if defined(_XM_AVX_INTRINSICS_)
-    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 0);
-    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 1);
-    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 2);
-    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 3);
-#else
-    vW = M1.r[2];
-    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
-    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
-    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
-    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
-#endif
-    vX = _mm_mul_ps(vX, M2.r[0]);
-    vY = _mm_mul_ps(vY, M2.r[1]);
-    vZ = _mm_mul_ps(vZ, M2.r[2]);
-    vW = _mm_mul_ps(vW, M2.r[3]);
-    vX = _mm_add_ps(vX, vZ);
-    vY = _mm_add_ps(vY, vW);
-    vX = _mm_add_ps(vX, vY);
-    XMVECTOR r2 = vX;
-#if defined(_XM_AVX_INTRINSICS_)
-    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 0);
-    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 1);
-    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 2);
-    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 3);
-#else
-    vW = M1.r[3];
-    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
-    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
-    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
-    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
-#endif
-    vX = _mm_mul_ps(vX, M2.r[0]);
-    vY = _mm_mul_ps(vY, M2.r[1]);
-    vZ = _mm_mul_ps(vZ, M2.r[2]);
-    vW = _mm_mul_ps(vW, M2.r[3]);
-    vX = _mm_add_ps(vX, vZ);
-    vY = _mm_add_ps(vY, vW);
-    vX = _mm_add_ps(vX, vY);
-    XMVECTOR r3 = vX;
-
-    // Transpose result
-    // x.x,x.y,y.x,y.y
-    XMVECTOR vTemp1 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 0, 1, 0));
-    // x.z,x.w,y.z,y.w
-    XMVECTOR vTemp3 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(3, 2, 3, 2));
-    // z.x,z.y,w.x,w.y
-    XMVECTOR vTemp2 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(1, 0, 1, 0));
-    // z.z,z.w,w.z,w.w
-    XMVECTOR vTemp4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2));
-
-    XMMATRIX mResult;
-    // x.x,y.x,z.x,w.x
-    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
-    // x.y,y.y,z.y,w.y
-    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
-    // x.z,y.z,z.z,w.z
-    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
-    // x.w,y.w,z.w,w.w
-    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
-    return mResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    // Original matrix:
-    //
-    //     m00m01m02m03
-    //     m10m11m12m13
-    //     m20m21m22m23
-    //     m30m31m32m33
-
-    XMMATRIX P;
-    P.r[0] = XMVectorMergeXY(M.r[0], M.r[2]);  // m00m20m01m21
-    P.r[1] = XMVectorMergeXY(M.r[1], M.r[3]);  // m10m30m11m31
-    P.r[2] = XMVectorMergeZW(M.r[0], M.r[2]);  // m02m22m03m23
-    P.r[3] = XMVectorMergeZW(M.r[1], M.r[3]);  // m12m32m13m33
-
-    XMMATRIX MT;
-    MT.r[0] = XMVectorMergeXY(P.r[0], P.r[1]);  // m00m10m20m30
-    MT.r[1] = XMVectorMergeZW(P.r[0], P.r[1]);  // m01m11m21m31
-    MT.r[2] = XMVectorMergeXY(P.r[2], P.r[3]);  // m02m12m22m32
-    MT.r[3] = XMVectorMergeZW(P.r[2], P.r[3]);  // m03m13m23m33
-    return MT;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4x2_t P0 = vzipq_f32(M.r[0], M.r[2]);
-    float32x4x2_t P1 = vzipq_f32(M.r[1], M.r[3]);
-
-    float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
-    float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);
-
-    XMMATRIX mResult;
-    mResult.r[0] = T0.val[0];
-    mResult.r[1] = T0.val[1];
-    mResult.r[2] = T1.val[0];
-    mResult.r[3] = T1.val[1];
-    return mResult;
-#elif defined(_XM_AVX2_INTRINSICS_)
-    __m256 t0 = _mm256_castps128_ps256(M.r[0]);
-    t0 = _mm256_insertf128_ps(t0, M.r[1], 1);
-    __m256 t1 = _mm256_castps128_ps256(M.r[2]);
-    t1 = _mm256_insertf128_ps(t1, M.r[3], 1);
-
-    __m256 vTemp = _mm256_unpacklo_ps(t0, t1);
-    __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1);
-    __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
-    __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);
-    vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4);
-    vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4);
-    t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
-    t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);
-
-    XMMATRIX mResult;
-    mResult.r[0] = _mm256_castps256_ps128(t0);
-    mResult.r[1] = _mm256_extractf128_ps(t0, 1);
-    mResult.r[2] = _mm256_castps256_ps128(t1);
-    mResult.r[3] = _mm256_extractf128_ps(t1, 1);
-    return mResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // x.x,x.y,y.x,y.y
-    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
-    // x.z,x.w,y.z,y.w
-    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
-    // z.x,z.y,w.x,w.y
-    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
-    // z.z,z.w,w.z,w.w
-    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
-
-    XMMATRIX mResult;
-    // x.x,y.x,z.x,w.x
-    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
-    // x.y,y.y,z.y,w.y
-    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
-    // x.z,y.z,z.z,w.z
-    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
-    // x.w,y.w,z.w,w.w
-    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
-    return mResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Return the inverse and the determinant of a 4x4 matrix
-_Use_decl_annotations_ inline XMMATRIX XM_CALLCONV
-XMMatrixInverse(XMVECTOR* pDeterminant, FXMMATRIX M) noexcept {
-#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
-
-    XMMATRIX MT = XMMatrixTranspose(M);
-
-    XMVECTOR V0[4], V1[4];
-    V0[0] =
-        XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(
-            MT.r[2]);
-    V1[0] =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(
-            MT.r[3]);
-    V0[1] =
-        XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(
-            MT.r[0]);
-    V1[1] =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(
-            MT.r[1]);
-    V0[2] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X,
-                            XM_PERMUTE_1Z>(MT.r[2], MT.r[0]);
-    V1[2] = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_1Y,
-                            XM_PERMUTE_1W>(MT.r[3], MT.r[1]);
-
-    XMVECTOR D0 = XMVectorMultiply(V0[0], V1[0]);
-    XMVECTOR D1 = XMVectorMultiply(V0[1], V1[1]);
-    XMVECTOR D2 = XMVectorMultiply(V0[2], V1[2]);
-
-    V0[0] =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(
-            MT.r[2]);
-    V1[0] =
-        XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(
-            MT.r[3]);
-    V0[1] =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(
-            MT.r[0]);
-    V1[1] =
-        XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(
-            MT.r[1]);
-    V0[2] = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_1Y,
-                            XM_PERMUTE_1W>(MT.r[2], MT.r[0]);
-    V1[2] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X,
-                            XM_PERMUTE_1Z>(MT.r[3], MT.r[1]);
-
-    D0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], D0);
-    D1 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], D1);
-    D2 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], D2);
-
-    V0[0] =
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y>(
-            MT.r[1]);
-    V1[0] = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_0W,
-                            XM_PERMUTE_0X>(D0, D2);
-    V0[1] =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_X>(
-            MT.r[0]);
-    V1[1] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Y, XM_PERMUTE_0Y,
-                            XM_PERMUTE_0Z>(D0, D2);
-    V0[2] =
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y>(
-            MT.r[3]);
-    V1[2] = XMVectorPermute<XM_PERMUTE_1W, XM_PERMUTE_0Y, XM_PERMUTE_0W,
-                            XM_PERMUTE_0X>(D1, D2);
-    V0[3] =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_X>(
-            MT.r[2]);
-    V1[3] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1W, XM_PERMUTE_0Y,
-                            XM_PERMUTE_0Z>(D1, D2);
-
-    XMVECTOR C0 = XMVectorMultiply(V0[0], V1[0]);
-    XMVECTOR C2 = XMVectorMultiply(V0[1], V1[1]);
-    XMVECTOR C4 = XMVectorMultiply(V0[2], V1[2]);
-    XMVECTOR C6 = XMVectorMultiply(V0[3], V1[3]);
-
-    V0[0] =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y, XM_SWIZZLE_Z>(
-            MT.r[1]);
-    V1[0] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0Y,
-                            XM_PERMUTE_1X>(D0, D2);
-    V0[1] =
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y>(
-            MT.r[0]);
-    V1[1] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1X,
-                            XM_PERMUTE_0X>(D0, D2);
-    V0[2] =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y, XM_SWIZZLE_Z>(
-            MT.r[3]);
-    V1[2] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0Y,
-                            XM_PERMUTE_1Z>(D1, D2);
-    V0[3] =
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y>(
-            MT.r[2]);
-    V1[3] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1Z,
-                            XM_PERMUTE_0X>(D1, D2);
-
-    C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0);
-    C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2);
-    C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4);
-    C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6);
-
-    V0[0] =
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_X>(
-            MT.r[1]);
-    V1[0] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1X,
-                            XM_PERMUTE_0Z>(D0, D2);
-    V0[1] =
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Z>(
-            MT.r[0]);
-    V1[1] = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W,
-                            XM_PERMUTE_1X>(D0, D2);
-    V0[2] =
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_X>(
-            MT.r[3]);
-    V1[2] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1W, XM_PERMUTE_1Z,
-                            XM_PERMUTE_0Z>(D1, D2);
-    V0[3] =
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Z>(
-            MT.r[2]);
-    V1[3] = XMVectorPermute<XM_PERMUTE_1W, XM_PERMUTE_0X, XM_PERMUTE_0W,
-                            XM_PERMUTE_1Z>(D1, D2);
-
-    XMVECTOR C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0);
-    C0 = XMVectorMultiplyAdd(V0[0], V1[0], C0);
-    XMVECTOR C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2);
-    C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2);
-    XMVECTOR C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4);
-    C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4);
-    XMVECTOR C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6);
-    C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6);
-
-    XMMATRIX R;
-    R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v);
-    R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v);
-    R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v);
-    R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v);
-
-    XMVECTOR Determinant = XMVector4Dot(R.r[0], MT.r[0]);
-
-    if (pDeterminant != nullptr) *pDeterminant = Determinant;
-
-    XMVECTOR Reciprocal = XMVectorReciprocal(Determinant);
-
-    XMMATRIX Result;
-    Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal);
-    Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal);
-    Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal);
-    Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal);
-    return Result;
-
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Transpose matrix
-    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
-    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
-    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
-    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
-
-    XMMATRIX MT;
-    MT.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
-    MT.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
-    MT.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
-    MT.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
-
-    XMVECTOR V00 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(1, 1, 0, 0));
-    XMVECTOR V10 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(3, 2, 3, 2));
-    XMVECTOR V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1, 1, 0, 0));
-    XMVECTOR V11 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(3, 2, 3, 2));
-    XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0], _MM_SHUFFLE(2, 0, 2, 0));
-    XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1], _MM_SHUFFLE(3, 1, 3, 1));
-
-    XMVECTOR D0 = _mm_mul_ps(V00, V10);
-    XMVECTOR D1 = _mm_mul_ps(V01, V11);
-    XMVECTOR D2 = _mm_mul_ps(V02, V12);
-
-    V00 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(3, 2, 3, 2));
-    V10 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1, 1, 0, 0));
-    V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(3, 2, 3, 2));
-    V11 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1, 1, 0, 0));
-    V02 = _mm_shuffle_ps(MT.r[2], MT.r[0], _MM_SHUFFLE(3, 1, 3, 1));
-    V12 = _mm_shuffle_ps(MT.r[3], MT.r[1], _MM_SHUFFLE(2, 0, 2, 0));
-
-    D0 = XM_FNMADD_PS(V00, V10, D0);
-    D1 = XM_FNMADD_PS(V01, V11, D1);
-    D2 = XM_FNMADD_PS(V02, V12, D2);
-    // V11 = D0Y,D0W,D2Y,D2Y
-    V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 1, 3, 1));
-    V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1, 0, 2, 1));
-    V10 = _mm_shuffle_ps(V11, D0, _MM_SHUFFLE(0, 3, 0, 2));
-    V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(0, 1, 0, 2));
-    V11 = _mm_shuffle_ps(V11, D0, _MM_SHUFFLE(2, 1, 2, 1));
-    // V13 = D1Y,D1W,D2W,D2W
-    XMVECTOR V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 3, 3, 1));
-    V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1, 0, 2, 1));
-    V12 = _mm_shuffle_ps(V13, D1, _MM_SHUFFLE(0, 3, 0, 2));
-    XMVECTOR V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(0, 1, 0, 2));
-    V13 = _mm_shuffle_ps(V13, D1, _MM_SHUFFLE(2, 1, 2, 1));
-
-    XMVECTOR C0 = _mm_mul_ps(V00, V10);
-    XMVECTOR C2 = _mm_mul_ps(V01, V11);
-    XMVECTOR C4 = _mm_mul_ps(V02, V12);
-    XMVECTOR C6 = _mm_mul_ps(V03, V13);
-
-    // V11 = D0X,D0Y,D2X,D2X
-    V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(0, 0, 1, 0));
-    V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(2, 1, 3, 2));
-    V10 = _mm_shuffle_ps(D0, V11, _MM_SHUFFLE(2, 1, 0, 3));
-    V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1, 3, 2, 3));
-    V11 = _mm_shuffle_ps(D0, V11, _MM_SHUFFLE(0, 2, 1, 2));
-    // V13 = D1X,D1Y,D2Z,D2Z
-    V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(2, 2, 1, 0));
-    V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(2, 1, 3, 2));
-    V12 = _mm_shuffle_ps(D1, V13, _MM_SHUFFLE(2, 1, 0, 3));
-    V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(1, 3, 2, 3));
-    V13 = _mm_shuffle_ps(D1, V13, _MM_SHUFFLE(0, 2, 1, 2));
-
-    C0 = XM_FNMADD_PS(V00, V10, C0);
-    C2 = XM_FNMADD_PS(V01, V11, C2);
-    C4 = XM_FNMADD_PS(V02, V12, C4);
-    C6 = XM_FNMADD_PS(V03, V13, C6);
-
-    V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(0, 3, 0, 3));
-    // V10 = D0Z,D0Z,D2X,D2Y
-    V10 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 0, 2, 2));
-    V10 = XM_PERMUTE_PS(V10, _MM_SHUFFLE(0, 2, 3, 0));
-    V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(2, 0, 3, 1));
-    // V11 = D0X,D0W,D2X,D2Y
-    V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 0, 3, 0));
-    V11 = XM_PERMUTE_PS(V11, _MM_SHUFFLE(2, 1, 0, 3));
-    V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(0, 3, 0, 3));
-    // V12 = D1Z,D1Z,D2Z,D2W
-    V12 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 2, 2, 2));
-    V12 = XM_PERMUTE_PS(V12, _MM_SHUFFLE(0, 2, 3, 0));
-    V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(2, 0, 3, 1));
-    // V13 = D1X,D1W,D2Z,D2W
-    V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 2, 3, 0));
-    V13 = XM_PERMUTE_PS(V13, _MM_SHUFFLE(2, 1, 0, 3));
-
-    V00 = _mm_mul_ps(V00, V10);
-    V01 = _mm_mul_ps(V01, V11);
-    V02 = _mm_mul_ps(V02, V12);
-    V03 = _mm_mul_ps(V03, V13);
-    XMVECTOR C1 = _mm_sub_ps(C0, V00);
-    C0 = _mm_add_ps(C0, V00);
-    XMVECTOR C3 = _mm_add_ps(C2, V01);
-    C2 = _mm_sub_ps(C2, V01);
-    XMVECTOR C5 = _mm_sub_ps(C4, V02);
-    C4 = _mm_add_ps(C4, V02);
-    XMVECTOR C7 = _mm_add_ps(C6, V03);
-    C6 = _mm_sub_ps(C6, V03);
-
-    C0 = _mm_shuffle_ps(C0, C1, _MM_SHUFFLE(3, 1, 2, 0));
-    C2 = _mm_shuffle_ps(C2, C3, _MM_SHUFFLE(3, 1, 2, 0));
-    C4 = _mm_shuffle_ps(C4, C5, _MM_SHUFFLE(3, 1, 2, 0));
-    C6 = _mm_shuffle_ps(C6, C7, _MM_SHUFFLE(3, 1, 2, 0));
-    C0 = XM_PERMUTE_PS(C0, _MM_SHUFFLE(3, 1, 2, 0));
-    C2 = XM_PERMUTE_PS(C2, _MM_SHUFFLE(3, 1, 2, 0));
-    C4 = XM_PERMUTE_PS(C4, _MM_SHUFFLE(3, 1, 2, 0));
-    C6 = XM_PERMUTE_PS(C6, _MM_SHUFFLE(3, 1, 2, 0));
-    // Get the determinant
-    XMVECTOR vTemp = XMVector4Dot(C0, MT.r[0]);
-    if (pDeterminant != nullptr) *pDeterminant = vTemp;
-    vTemp = _mm_div_ps(g_XMOne, vTemp);
-    XMMATRIX mResult;
-    mResult.r[0] = _mm_mul_ps(C0, vTemp);
-    mResult.r[1] = _mm_mul_ps(C2, vTemp);
-    mResult.r[2] = _mm_mul_ps(C4, vTemp);
-    mResult.r[3] = _mm_mul_ps(C6, vTemp);
-    return mResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixVectorTensorProduct(FXMVECTOR V1,
-                                                        FXMVECTOR V2) noexcept {
-    XMMATRIX mResult;
-    mResult.r[0] = XMVectorMultiply(XMVectorSwizzle<0, 0, 0, 0>(V1), V2);
-    mResult.r[1] = XMVectorMultiply(XMVectorSwizzle<1, 1, 1, 1>(V1), V2);
-    mResult.r[2] = XMVectorMultiply(XMVectorSwizzle<2, 2, 2, 2>(V1), V2);
-    mResult.r[3] = XMVectorMultiply(XMVectorSwizzle<3, 3, 3, 3>(V1), V2);
-    return mResult;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M) noexcept {
-    static const XMVECTORF32 Sign = {{{1.0f, -1.0f, 1.0f, -1.0f}}};
-
-    XMVECTOR V0 =
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(
-            M.r[2]);
-    XMVECTOR V1 =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(
-            M.r[3]);
-    XMVECTOR V2 =
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(
-            M.r[2]);
-    XMVECTOR V3 =
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(
-            M.r[3]);
-    XMVECTOR V4 =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(
-            M.r[2]);
-    XMVECTOR V5 =
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(
-            M.r[3]);
-
-    XMVECTOR P0 = XMVectorMultiply(V0, V1);
-    XMVECTOR P1 = XMVectorMultiply(V2, V3);
-    XMVECTOR P2 = XMVectorMultiply(V4, V5);
-
-    V0 =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(
-            M.r[2]);
-    V1 =
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(
-            M.r[3]);
-    V2 =
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(
-            M.r[2]);
-    V3 =
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(
-            M.r[3]);
-    V4 =
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(
-            M.r[2]);
-    V5 =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(
-            M.r[3]);
-
-    P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0);
-    P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1);
-    P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2);
-
-    V0 =
-        XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(
-            M.r[1]);
-    V1 =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(
-            M.r[1]);
-    V2 =
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(
-            M.r[1]);
-
-    XMVECTOR S = XMVectorMultiply(M.r[0], Sign.v);
-    XMVECTOR R = XMVectorMultiply(V0, P0);
-    R = XMVectorNegativeMultiplySubtract(V1, P1, R);
-    R = XMVectorMultiplyAdd(V2, P2, R);
-
-    return XMVector4Dot(S, R);
-}
-
-#define XM3RANKDECOMPOSE(a, b, c, x, y, z) \
-    if ((x) < (y)) {                       \
-        if ((y) < (z)) {                   \
-            (a) = 2;                       \
-            (b) = 1;                       \
-            (c) = 0;                       \
-        } else {                           \
-            (a) = 1;                       \
-                                           \
-            if ((x) < (z)) {               \
-                (b) = 2;                   \
-                (c) = 0;                   \
-            } else {                       \
-                (b) = 0;                   \
-                (c) = 2;                   \
-            }                              \
-        }                                  \
-    } else {                               \
-        if ((x) < (z)) {                   \
-            (a) = 2;                       \
-            (b) = 0;                       \
-            (c) = 1;                       \
-        } else {                           \
-            (a) = 0;                       \
-                                           \
-            if ((y) < (z)) {               \
-                (b) = 2;                   \
-                (c) = 1;                   \
-            } else {                       \
-                (b) = 1;                   \
-                (c) = 2;                   \
-            }                              \
-        }                                  \
-    }
-
-#define XM3_DECOMP_EPSILON 0.0001f
-
-_Use_decl_annotations_ inline bool XM_CALLCONV
-XMMatrixDecompose(XMVECTOR* outScale, XMVECTOR* outRotQuat, XMVECTOR* outTrans,
-                  FXMMATRIX M) noexcept {
-    static const XMVECTOR* pvCanonicalBasis[3] = {
-        &g_XMIdentityR0.v, &g_XMIdentityR1.v, &g_XMIdentityR2.v};
-
-    assert(outScale != nullptr);
-    assert(outRotQuat != nullptr);
-    assert(outTrans != nullptr);
-
-    // Get the translation
-    outTrans[0] = M.r[3];
-
-    XMVECTOR* ppvBasis[3];
-    XMMATRIX matTemp;
-    ppvBasis[0] = &matTemp.r[0];
-    ppvBasis[1] = &matTemp.r[1];
-    ppvBasis[2] = &matTemp.r[2];
-
-    matTemp.r[0] = M.r[0];
-    matTemp.r[1] = M.r[1];
-    matTemp.r[2] = M.r[2];
-    matTemp.r[3] = g_XMIdentityR3.v;
-
-    auto pfScales = reinterpret_cast<float*>(outScale);
-
-    size_t a, b, c;
-    XMVectorGetXPtr(&pfScales[0], XMVector3Length(ppvBasis[0][0]));
-    XMVectorGetXPtr(&pfScales[1], XMVector3Length(ppvBasis[1][0]));
-    XMVectorGetXPtr(&pfScales[2], XMVector3Length(ppvBasis[2][0]));
-    pfScales[3] = 0.f;
-
-    XM3RANKDECOMPOSE(a, b, c, pfScales[0], pfScales[1], pfScales[2])
-
-    if (pfScales[a] < XM3_DECOMP_EPSILON) {
-        ppvBasis[a][0] = pvCanonicalBasis[a][0];
-    }
-    ppvBasis[a][0] = XMVector3Normalize(ppvBasis[a][0]);
-
-    if (pfScales[b] < XM3_DECOMP_EPSILON) {
-        size_t aa, bb, cc;
-        float fAbsX, fAbsY, fAbsZ;
-
-        fAbsX = fabsf(XMVectorGetX(ppvBasis[a][0]));
-        fAbsY = fabsf(XMVectorGetY(ppvBasis[a][0]));
-        fAbsZ = fabsf(XMVectorGetZ(ppvBasis[a][0]));
-
-        XM3RANKDECOMPOSE(aa, bb, cc, fAbsX, fAbsY, fAbsZ)
-
-        ppvBasis[b][0] =
-            XMVector3Cross(ppvBasis[a][0], pvCanonicalBasis[cc][0]);
-    }
-
-    ppvBasis[b][0] = XMVector3Normalize(ppvBasis[b][0]);
-
-    if (pfScales[c] < XM3_DECOMP_EPSILON) {
-        ppvBasis[c][0] = XMVector3Cross(ppvBasis[a][0], ppvBasis[b][0]);
-    }
-
-    ppvBasis[c][0] = XMVector3Normalize(ppvBasis[c][0]);
-
-    float fDet = XMVectorGetX(XMMatrixDeterminant(matTemp));
-
-    // use Kramer's rule to check for handedness of coordinate system
-    if (fDet < 0.0f) {
-        // switch coordinate system by negating the scale and inverting the
-        // basis vector on the x-axis
-        pfScales[a] = -pfScales[a];
-        ppvBasis[a][0] = XMVectorNegate(ppvBasis[a][0]);
-
-        fDet = -fDet;
-    }
-
-    fDet -= 1.0f;
-    fDet *= fDet;
-
-    if (XM3_DECOMP_EPSILON < fDet) {
-        // Non-SRT matrix encountered
-        return false;
-    }
-
-    // generate the quaternion from the matrix
-    outRotQuat[0] = XMQuaternionRotationMatrix(matTemp);
-    return true;
-}
-
-#undef XM3_DECOMP_EPSILON
-#undef XM3RANKDECOMPOSE
-
-//------------------------------------------------------------------------------
-// Transformation operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixIdentity() noexcept {
-    XMMATRIX M;
-    M.r[0] = g_XMIdentityR0.v;
-    M.r[1] = g_XMIdentityR1.v;
-    M.r[2] = g_XMIdentityR2.v;
-    M.r[3] = g_XMIdentityR3.v;
-    return M;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixSet(float m00, float m01, float m02,
-                                        float m03, float m10, float m11,
-                                        float m12, float m13, float m20,
-                                        float m21, float m22, float m23,
-                                        float m30, float m31, float m32,
-                                        float m33) noexcept {
-    XMMATRIX M;
-#if defined(_XM_NO_INTRINSICS_)
-    M.m[0][0] = m00;
-    M.m[0][1] = m01;
-    M.m[0][2] = m02;
-    M.m[0][3] = m03;
-    M.m[1][0] = m10;
-    M.m[1][1] = m11;
-    M.m[1][2] = m12;
-    M.m[1][3] = m13;
-    M.m[2][0] = m20;
-    M.m[2][1] = m21;
-    M.m[2][2] = m22;
-    M.m[2][3] = m23;
-    M.m[3][0] = m30;
-    M.m[3][1] = m31;
-    M.m[3][2] = m32;
-    M.m[3][3] = m33;
-#else
-    M.r[0] = XMVectorSet(m00, m01, m02, m03);
-    M.r[1] = XMVectorSet(m10, m11, m12, m13);
-    M.r[2] = XMVectorSet(m20, m21, m22, m23);
-    M.r[3] = XMVectorSet(m30, m31, m32, m33);
-#endif
-    return M;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixTranslation(float OffsetX, float OffsetY,
-                                                float OffsetZ) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.m[0][0] = 1.0f;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = 1.0f;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = 0.0f;
-    M.m[2][1] = 0.0f;
-    M.m[2][2] = 1.0f;
-    M.m[2][3] = 0.0f;
-
-    M.m[3][0] = OffsetX;
-    M.m[3][1] = OffsetY;
-    M.m[3][2] = OffsetZ;
-    M.m[3][3] = 1.0f;
-    return M;
-
-#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = g_XMIdentityR0.v;
-    M.r[1] = g_XMIdentityR1.v;
-    M.r[2] = g_XMIdentityR2.v;
-    M.r[3] = XMVectorSet(OffsetX, OffsetY, OffsetZ, 1.f);
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV
-XMMatrixTranslationFromVector(FXMVECTOR Offset) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.m[0][0] = 1.0f;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = 1.0f;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = 0.0f;
-    M.m[2][1] = 0.0f;
-    M.m[2][2] = 1.0f;
-    M.m[2][3] = 0.0f;
-
-    M.m[3][0] = Offset.vector4_f32[0];
-    M.m[3][1] = Offset.vector4_f32[1];
-    M.m[3][2] = Offset.vector4_f32[2];
-    M.m[3][3] = 1.0f;
-    return M;
-
-#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = g_XMIdentityR0.v;
-    M.r[1] = g_XMIdentityR1.v;
-    M.r[2] = g_XMIdentityR2.v;
-    M.r[3] = XMVectorSelect(g_XMIdentityR3.v, Offset, g_XMSelect1110.v);
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixScaling(float ScaleX, float ScaleY,
-                                            float ScaleZ) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.m[0][0] = ScaleX;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = ScaleY;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = 0.0f;
-    M.m[2][1] = 0.0f;
-    M.m[2][2] = ScaleZ;
-    M.m[2][3] = 0.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = 0.0f;
-    M.m[3][3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    const XMVECTOR Zero = vdupq_n_f32(0);
-    XMMATRIX M;
-    M.r[0] = vsetq_lane_f32(ScaleX, Zero, 0);
-    M.r[1] = vsetq_lane_f32(ScaleY, Zero, 1);
-    M.r[2] = vsetq_lane_f32(ScaleZ, Zero, 2);
-    M.r[3] = g_XMIdentityR3.v;
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = _mm_set_ps(0, 0, 0, ScaleX);
-    M.r[1] = _mm_set_ps(0, 0, ScaleY, 0);
-    M.r[2] = _mm_set_ps(0, ScaleZ, 0, 0);
-    M.r[3] = g_XMIdentityR3.v;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV
-XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMMATRIX M;
-    M.m[0][0] = Scale.vector4_f32[0];
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = Scale.vector4_f32[1];
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = 0.0f;
-    M.m[2][1] = 0.0f;
-    M.m[2][2] = Scale.vector4_f32[2];
-    M.m[2][3] = 0.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = 0.0f;
-    M.m[3][3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = vreinterpretq_f32_u32(
-        vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskX));
-    M.r[1] = vreinterpretq_f32_u32(
-        vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskY));
-    M.r[2] = vreinterpretq_f32_u32(
-        vandq_u32(vreinterpretq_u32_f32(Scale), g_XMMaskZ));
-    M.r[3] = g_XMIdentityR3.v;
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    M.r[0] = _mm_and_ps(Scale, g_XMMaskX);
-    M.r[1] = _mm_and_ps(Scale, g_XMMaskY);
-    M.r[2] = _mm_and_ps(Scale, g_XMMaskZ);
-    M.r[3] = g_XMIdentityR3.v;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    float fSinAngle;
-    float fCosAngle;
-    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
-
-    XMMATRIX M;
-    M.m[0][0] = 1.0f;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = fCosAngle;
-    M.m[1][2] = fSinAngle;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = 0.0f;
-    M.m[2][1] = -fSinAngle;
-    M.m[2][2] = fCosAngle;
-    M.m[2][3] = 0.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = 0.0f;
-    M.m[3][3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float fSinAngle;
-    float fCosAngle;
-    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
-
-    const float32x4_t Zero = vdupq_n_f32(0);
-
-    float32x4_t T1 = vsetq_lane_f32(fCosAngle, Zero, 1);
-    T1 = vsetq_lane_f32(fSinAngle, T1, 2);
-
-    float32x4_t T2 = vsetq_lane_f32(-fSinAngle, Zero, 1);
-    T2 = vsetq_lane_f32(fCosAngle, T2, 2);
-
-    XMMATRIX M;
-    M.r[0] = g_XMIdentityR0.v;
-    M.r[1] = T1;
-    M.r[2] = T2;
-    M.r[3] = g_XMIdentityR3.v;
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    float SinAngle;
-    float CosAngle;
-    XMScalarSinCos(&SinAngle, &CosAngle, Angle);
-
-    XMVECTOR vSin = _mm_set_ss(SinAngle);
-    XMVECTOR vCos = _mm_set_ss(CosAngle);
-    // x = 0,y = cos,z = sin, w = 0
-    vCos = _mm_shuffle_ps(vCos, vSin, _MM_SHUFFLE(3, 0, 0, 3));
-    XMMATRIX M;
-    M.r[0] = g_XMIdentityR0;
-    M.r[1] = vCos;
-    // x = 0,y = sin,z = cos, w = 0
-    vCos = XM_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 1, 2, 0));
-    // x = 0,y = -sin,z = cos, w = 0
-    vCos = _mm_mul_ps(vCos, g_XMNegateY);
-    M.r[2] = vCos;
-    M.r[3] = g_XMIdentityR3;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    float fSinAngle;
-    float fCosAngle;
-    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
-
-    XMMATRIX M;
-    M.m[0][0] = fCosAngle;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = -fSinAngle;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = 1.0f;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = fSinAngle;
-    M.m[2][1] = 0.0f;
-    M.m[2][2] = fCosAngle;
-    M.m[2][3] = 0.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = 0.0f;
-    M.m[3][3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float fSinAngle;
-    float fCosAngle;
-    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
-
-    const float32x4_t Zero = vdupq_n_f32(0);
-
-    float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
-    T0 = vsetq_lane_f32(-fSinAngle, T0, 2);
-
-    float32x4_t T2 = vsetq_lane_f32(fSinAngle, Zero, 0);
-    T2 = vsetq_lane_f32(fCosAngle, T2, 2);
-
-    XMMATRIX M;
-    M.r[0] = T0;
-    M.r[1] = g_XMIdentityR1.v;
-    M.r[2] = T2;
-    M.r[3] = g_XMIdentityR3.v;
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    float SinAngle;
-    float CosAngle;
-    XMScalarSinCos(&SinAngle, &CosAngle, Angle);
-
-    XMVECTOR vSin = _mm_set_ss(SinAngle);
-    XMVECTOR vCos = _mm_set_ss(CosAngle);
-    // x = sin,y = 0,z = cos, w = 0
-    vSin = _mm_shuffle_ps(vSin, vCos, _MM_SHUFFLE(3, 0, 3, 0));
-    XMMATRIX M;
-    M.r[2] = vSin;
-    M.r[1] = g_XMIdentityR1;
-    // x = cos,y = 0,z = sin, w = 0
-    vSin = XM_PERMUTE_PS(vSin, _MM_SHUFFLE(3, 0, 1, 2));
-    // x = cos,y = 0,z = -sin, w = 0
-    vSin = _mm_mul_ps(vSin, g_XMNegateZ);
-    M.r[0] = vSin;
-    M.r[3] = g_XMIdentityR3;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    float fSinAngle;
-    float fCosAngle;
-    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
-
-    XMMATRIX M;
-    M.m[0][0] = fCosAngle;
-    M.m[0][1] = fSinAngle;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = -fSinAngle;
-    M.m[1][1] = fCosAngle;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = 0.0f;
-    M.m[2][1] = 0.0f;
-    M.m[2][2] = 1.0f;
-    M.m[2][3] = 0.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = 0.0f;
-    M.m[3][3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float fSinAngle;
-    float fCosAngle;
-    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
-
-    const float32x4_t Zero = vdupq_n_f32(0);
-
-    float32x4_t T0 = vsetq_lane_f32(fCosAngle, Zero, 0);
-    T0 = vsetq_lane_f32(fSinAngle, T0, 1);
-
-    float32x4_t T1 = vsetq_lane_f32(-fSinAngle, Zero, 0);
-    T1 = vsetq_lane_f32(fCosAngle, T1, 1);
-
-    XMMATRIX M;
-    M.r[0] = T0;
-    M.r[1] = T1;
-    M.r[2] = g_XMIdentityR2.v;
-    M.r[3] = g_XMIdentityR3.v;
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    float SinAngle;
-    float CosAngle;
-    XMScalarSinCos(&SinAngle, &CosAngle, Angle);
-
-    XMVECTOR vSin = _mm_set_ss(SinAngle);
-    XMVECTOR vCos = _mm_set_ss(CosAngle);
-    // x = cos,y = sin,z = 0, w = 0
-    vCos = _mm_unpacklo_ps(vCos, vSin);
-    XMMATRIX M;
-    M.r[0] = vCos;
-    // x = sin,y = cos,z = 0, w = 0
-    vCos = XM_PERMUTE_PS(vCos, _MM_SHUFFLE(3, 2, 0, 1));
-    // x = cos,y = -sin,z = 0, w = 0
-    vCos = _mm_mul_ps(vCos, g_XMNegateX);
-    M.r[1] = vCos;
-    M.r[2] = g_XMIdentityR2;
-    M.r[3] = g_XMIdentityR3;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw(float Pitch, float Yaw,
-                                                         float Roll) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    float cp = cosf(Pitch);
-    float sp = sinf(Pitch);
-
-    float cy = cosf(Yaw);
-    float sy = sinf(Yaw);
-
-    float cr = cosf(Roll);
-    float sr = sinf(Roll);
-
-    XMMATRIX M;
-    M.m[0][0] = cr * cy + sr * sp * sy;
-    M.m[0][1] = sr * cp;
-    M.m[0][2] = sr * sp * cy - cr * sy;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = cr * sp * sy - sr * cy;
-    M.m[1][1] = cr * cp;
-    M.m[1][2] = sr * sy + cr * sp * cy;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = cp * sy;
-    M.m[2][1] = -sp;
-    M.m[2][2] = cp * cy;
-    M.m[2][3] = 0.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = 0.0f;
-    M.m[3][3] = 1.0f;
-    return M;
-#else
-    XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
-    return XMMatrixRotationRollPitchYawFromVector(Angles);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector(
-    FXMVECTOR Angles  // <Pitch, Yaw, Roll, undefined>
-    ) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    float cp = cosf(Angles.vector4_f32[0]);
-    float sp = sinf(Angles.vector4_f32[0]);
-
-    float cy = cosf(Angles.vector4_f32[1]);
-    float sy = sinf(Angles.vector4_f32[1]);
-
-    float cr = cosf(Angles.vector4_f32[2]);
-    float sr = sinf(Angles.vector4_f32[2]);
-
-    XMMATRIX M;
-    M.m[0][0] = cr * cy + sr * sp * sy;
-    M.m[0][1] = sr * cp;
-    M.m[0][2] = sr * sp * cy - cr * sy;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = cr * sp * sy - sr * cy;
-    M.m[1][1] = cr * cp;
-    M.m[1][2] = sr * sy + cr * sp * cy;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = cp * sy;
-    M.m[2][1] = -sp;
-    M.m[2][2] = cp * cy;
-    M.m[2][3] = 0.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = 0.0f;
-    M.m[3][3] = 1.0f;
-    return M;
-#else
-    static const XMVECTORF32 Sign = {{{1.0f, -1.0f, -1.0f, 1.0f}}};
-
-    XMVECTOR SinAngles, CosAngles;
-    XMVectorSinCos(&SinAngles, &CosAngles, Angles);
-
-    XMVECTOR P0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0Z, XM_PERMUTE_1Z,
-                                  XM_PERMUTE_1X>(SinAngles, CosAngles);
-    XMVECTOR Y0 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1X,
-                                  XM_PERMUTE_1Y>(SinAngles, CosAngles);
-    XMVECTOR P1 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z,
-                                  XM_PERMUTE_0Z>(SinAngles, CosAngles);
-    XMVECTOR Y1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_1Y, XM_PERMUTE_0Y,
-                                  XM_PERMUTE_0Y>(SinAngles, CosAngles);
-    XMVECTOR P2 = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z,
-                                  XM_PERMUTE_1Z>(SinAngles, CosAngles);
-    XMVECTOR P3 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y,
-                                  XM_PERMUTE_1Y>(SinAngles, CosAngles);
-    XMVECTOR Y2 = XMVectorSplatX(SinAngles);
-    XMVECTOR NS = XMVectorNegate(SinAngles);
-
-    XMVECTOR Q0 = XMVectorMultiply(P0, Y0);
-    XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v);
-    Q1 = XMVectorMultiply(Q1, Y1);
-    XMVECTOR Q2 = XMVectorMultiply(P2, Y2);
-    Q2 = XMVectorMultiplyAdd(Q2, P3, Q1);
-
-    XMVECTOR V0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1Z,
-                                  XM_PERMUTE_0W>(Q0, Q2);
-    XMVECTOR V1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_1W,
-                                  XM_PERMUTE_0W>(Q0, Q2);
-    XMVECTOR V2 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_0W,
-                                  XM_PERMUTE_0W>(Q0, NS);
-
-    XMMATRIX M;
-    M.r[0] = XMVectorSelect(g_XMZero, V0, g_XMSelect1110.v);
-    M.r[1] = XMVectorSelect(g_XMZero, V1, g_XMSelect1110.v);
-    M.r[2] = XMVectorSelect(g_XMZero, V2, g_XMSelect1110.v);
-    M.r[3] = g_XMIdentityR3;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixRotationNormal(FXMVECTOR NormalAxis,
-                                                   float Angle) noexcept {
-#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
-
-    float fSinAngle;
-    float fCosAngle;
-    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
-
-    XMVECTOR A = XMVectorSet(fSinAngle, fCosAngle, 1.0f - fCosAngle, 0.0f);
-
-    XMVECTOR C2 = XMVectorSplatZ(A);
-    XMVECTOR C1 = XMVectorSplatY(A);
-    XMVECTOR C0 = XMVectorSplatX(A);
-
-    XMVECTOR N0 =
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_W>(
-            NormalAxis);
-    XMVECTOR N1 =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(
-            NormalAxis);
-
-    XMVECTOR V0 = XMVectorMultiply(C2, N0);
-    V0 = XMVectorMultiply(V0, N1);
-
-    XMVECTOR R0 = XMVectorMultiply(C2, NormalAxis);
-    R0 = XMVectorMultiplyAdd(R0, NormalAxis, C1);
-
-    XMVECTOR R1 = XMVectorMultiplyAdd(C0, NormalAxis, V0);
-    XMVECTOR R2 = XMVectorNegativeMultiplySubtract(C0, NormalAxis, V0);
-
-    V0 = XMVectorSelect(A, R0, g_XMSelect1110.v);
-    XMVECTOR V1 = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1Z,
-                                  XM_PERMUTE_0X>(R1, R2);
-    XMVECTOR V2 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_0Y,
-                                  XM_PERMUTE_1X>(R1, R2);
-
-    XMMATRIX M;
-    M.r[0] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y,
-                             XM_PERMUTE_0W>(V0, V1);
-    M.r[1] = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W,
-                             XM_PERMUTE_0W>(V0, V1);
-    M.r[2] = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z,
-                             XM_PERMUTE_0W>(V0, V2);
-    M.r[3] = g_XMIdentityR3.v;
-    return M;
-
-#elif defined(_XM_SSE_INTRINSICS_)
-    float fSinAngle;
-    float fCosAngle;
-    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);
-
-    XMVECTOR C2 = _mm_set_ps1(1.0f - fCosAngle);
-    XMVECTOR C1 = _mm_set_ps1(fCosAngle);
-    XMVECTOR C0 = _mm_set_ps1(fSinAngle);
-
-    XMVECTOR N0 = XM_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 0, 2, 1));
-    XMVECTOR N1 = XM_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 1, 0, 2));
-
-    XMVECTOR V0 = _mm_mul_ps(C2, N0);
-    V0 = _mm_mul_ps(V0, N1);
-
-    XMVECTOR R0 = _mm_mul_ps(C2, NormalAxis);
-    R0 = _mm_mul_ps(R0, NormalAxis);
-    R0 = _mm_add_ps(R0, C1);
-
-    XMVECTOR R1 = _mm_mul_ps(C0, NormalAxis);
-    R1 = _mm_add_ps(R1, V0);
-    XMVECTOR R2 = _mm_mul_ps(C0, NormalAxis);
-    R2 = _mm_sub_ps(V0, R2);
-
-    V0 = _mm_and_ps(R0, g_XMMask3);
-    XMVECTOR V1 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(2, 1, 2, 0));
-    V1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 3, 2, 1));
-    XMVECTOR V2 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(0, 0, 1, 1));
-    V2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 0, 2, 0));
-
-    R2 = _mm_shuffle_ps(V0, V1, _MM_SHUFFLE(1, 0, 3, 0));
-    R2 = XM_PERMUTE_PS(R2, _MM_SHUFFLE(1, 3, 2, 0));
-
-    XMMATRIX M;
-    M.r[0] = R2;
-
-    R2 = _mm_shuffle_ps(V0, V1, _MM_SHUFFLE(3, 2, 3, 1));
-    R2 = XM_PERMUTE_PS(R2, _MM_SHUFFLE(1, 3, 0, 2));
-    M.r[1] = R2;
-
-    V2 = _mm_shuffle_ps(V2, V0, _MM_SHUFFLE(3, 2, 1, 0));
-    M.r[2] = V2;
-    M.r[3] = g_XMIdentityR3.v;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixRotationAxis(FXMVECTOR Axis,
-                                                 float Angle) noexcept {
-    assert(!XMVector3Equal(Axis, XMVectorZero()));
-    assert(!XMVector3IsInfinite(Axis));
-
-    XMVECTOR Normal = XMVector3Normalize(Axis);
-    return XMMatrixRotationNormal(Normal, Angle);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV
-XMMatrixRotationQuaternion(FXMVECTOR Quaternion) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    float qx = Quaternion.vector4_f32[0];
-    float qxx = qx * qx;
-
-    float qy = Quaternion.vector4_f32[1];
-    float qyy = qy * qy;
-
-    float qz = Quaternion.vector4_f32[2];
-    float qzz = qz * qz;
-
-    float qw = Quaternion.vector4_f32[3];
-
-    XMMATRIX M;
-    M.m[0][0] = 1.f - 2.f * qyy - 2.f * qzz;
-    M.m[0][1] = 2.f * qx * qy + 2.f * qz * qw;
-    M.m[0][2] = 2.f * qx * qz - 2.f * qy * qw;
-    M.m[0][3] = 0.f;
-
-    M.m[1][0] = 2.f * qx * qy - 2.f * qz * qw;
-    M.m[1][1] = 1.f - 2.f * qxx - 2.f * qzz;
-    M.m[1][2] = 2.f * qy * qz + 2.f * qx * qw;
-    M.m[1][3] = 0.f;
-
-    M.m[2][0] = 2.f * qx * qz + 2.f * qy * qw;
-    M.m[2][1] = 2.f * qy * qz - 2.f * qx * qw;
-    M.m[2][2] = 1.f - 2.f * qxx - 2.f * qyy;
-    M.m[2][3] = 0.f;
-
-    M.m[3][0] = 0.f;
-    M.m[3][1] = 0.f;
-    M.m[3][2] = 0.f;
-    M.m[3][3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 Constant1110 = {{{1.0f, 1.0f, 1.0f, 0.0f}}};
-
-    XMVECTOR Q0 = XMVectorAdd(Quaternion, Quaternion);
-    XMVECTOR Q1 = XMVectorMultiply(Quaternion, Q0);
-
-    XMVECTOR V0 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0X,
-                                  XM_PERMUTE_1W>(Q1, Constant1110.v);
-    XMVECTOR V1 = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Y,
-                                  XM_PERMUTE_1W>(Q1, Constant1110.v);
-    XMVECTOR R0 = XMVectorSubtract(Constant1110, V0);
-    R0 = XMVectorSubtract(R0, V1);
-
-    V0 =
-        XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(
-            Quaternion);
-    V1 =
-        XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_W>(
-            Q0);
-    V0 = XMVectorMultiply(V0, V1);
-
-    V1 = XMVectorSplatW(Quaternion);
-    XMVECTOR V2 =
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_W>(
-            Q0);
-    V1 = XMVectorMultiply(V1, V2);
-
-    XMVECTOR R1 = XMVectorAdd(V0, V1);
-    XMVECTOR R2 = XMVectorSubtract(V0, V1);
-
-    V0 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1Y,
-                         XM_PERMUTE_0Z>(R1, R2);
-    V1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1Z, XM_PERMUTE_0X,
-                         XM_PERMUTE_1Z>(R1, R2);
-
-    XMMATRIX M;
-    M.r[0] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y,
-                             XM_PERMUTE_0W>(R0, V0);
-    M.r[1] = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W,
-                             XM_PERMUTE_0W>(R0, V0);
-    M.r[2] = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z,
-                             XM_PERMUTE_0W>(R0, V1);
-    M.r[3] = g_XMIdentityR3.v;
-    return M;
-
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 Constant1110 = {{{1.0f, 1.0f, 1.0f, 0.0f}}};
-
-    XMVECTOR Q0 = _mm_add_ps(Quaternion, Quaternion);
-    XMVECTOR Q1 = _mm_mul_ps(Quaternion, Q0);
-
-    XMVECTOR V0 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(3, 0, 0, 1));
-    V0 = _mm_and_ps(V0, g_XMMask3);
-    XMVECTOR V1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(3, 1, 2, 2));
-    V1 = _mm_and_ps(V1, g_XMMask3);
-    XMVECTOR R0 = _mm_sub_ps(Constant1110, V0);
-    R0 = _mm_sub_ps(R0, V1);
-
-    V0 = XM_PERMUTE_PS(Quaternion, _MM_SHUFFLE(3, 1, 0, 0));
-    V1 = XM_PERMUTE_PS(Q0, _MM_SHUFFLE(3, 2, 1, 2));
-    V0 = _mm_mul_ps(V0, V1);
-
-    V1 = XM_PERMUTE_PS(Quaternion, _MM_SHUFFLE(3, 3, 3, 3));
-    XMVECTOR V2 = XM_PERMUTE_PS(Q0, _MM_SHUFFLE(3, 0, 2, 1));
-    V1 = _mm_mul_ps(V1, V2);
-
-    XMVECTOR R1 = _mm_add_ps(V0, V1);
-    XMVECTOR R2 = _mm_sub_ps(V0, V1);
-
-    V0 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(1, 0, 2, 1));
-    V0 = XM_PERMUTE_PS(V0, _MM_SHUFFLE(1, 3, 2, 0));
-    V1 = _mm_shuffle_ps(R1, R2, _MM_SHUFFLE(2, 2, 0, 0));
-    V1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 0, 2, 0));
-
-    Q1 = _mm_shuffle_ps(R0, V0, _MM_SHUFFLE(1, 0, 3, 0));
-    Q1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(1, 3, 2, 0));
-
-    XMMATRIX M;
-    M.r[0] = Q1;
-
-    Q1 = _mm_shuffle_ps(R0, V0, _MM_SHUFFLE(3, 2, 3, 1));
-    Q1 = XM_PERMUTE_PS(Q1, _MM_SHUFFLE(1, 3, 0, 2));
-    M.r[1] = Q1;
-
-    Q1 = _mm_shuffle_ps(V1, R0, _MM_SHUFFLE(3, 2, 1, 0));
-    M.r[2] = Q1;
-    M.r[3] = g_XMIdentityR3;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixTransformation2D(
-    FXMVECTOR ScalingOrigin, float ScalingOrientation, FXMVECTOR Scaling,
-    FXMVECTOR RotationOrigin, float Rotation, GXMVECTOR Translation) noexcept {
-    // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling *
-    // MScalingOrientation *
-    //         MScalingOrigin * Inverse(MRotationOrigin) * MRotation *
-    //         MRotationOrigin * MTranslation;
-
-    XMVECTOR VScalingOrigin =
-        XMVectorSelect(g_XMSelect1100.v, ScalingOrigin, g_XMSelect1100.v);
-    XMVECTOR NegScalingOrigin = XMVectorNegate(VScalingOrigin);
-
-    XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin);
-    XMMATRIX MScalingOrientation = XMMatrixRotationZ(ScalingOrientation);
-    XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation);
-    XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v);
-    XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling);
-    XMVECTOR VRotationOrigin =
-        XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v);
-    XMMATRIX MRotation = XMMatrixRotationZ(Rotation);
-    XMVECTOR VTranslation =
-        XMVectorSelect(g_XMSelect1100.v, Translation, g_XMSelect1100.v);
-
-    XMMATRIX M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT);
-    M = XMMatrixMultiply(M, MScaling);
-    M = XMMatrixMultiply(M, MScalingOrientation);
-    M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin);
-    M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin);
-    M = XMMatrixMultiply(M, MRotation);
-    M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin);
-    M.r[3] = XMVectorAdd(M.r[3], VTranslation);
-
-    return M;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixTransformation(
-    FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion,
-    FXMVECTOR Scaling, GXMVECTOR RotationOrigin, HXMVECTOR RotationQuaternion,
-    HXMVECTOR Translation) noexcept {
-    // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling *
-    // MScalingOrientation *
-    //         MScalingOrigin * Inverse(MRotationOrigin) * MRotation *
-    //         MRotationOrigin * MTranslation;
-
-    XMVECTOR VScalingOrigin =
-        XMVectorSelect(g_XMSelect1110.v, ScalingOrigin, g_XMSelect1110.v);
-    XMVECTOR NegScalingOrigin = XMVectorNegate(ScalingOrigin);
-
-    XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin);
-    XMMATRIX MScalingOrientation =
-        XMMatrixRotationQuaternion(ScalingOrientationQuaternion);
-    XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation);
-    XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling);
-    XMVECTOR VRotationOrigin =
-        XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v);
-    XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion);
-    XMVECTOR VTranslation =
-        XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v);
-
-    XMMATRIX M;
-    M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT);
-    M = XMMatrixMultiply(M, MScaling);
-    M = XMMatrixMultiply(M, MScalingOrientation);
-    M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin);
-    M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin);
-    M = XMMatrixMultiply(M, MRotation);
-    M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin);
-    M.r[3] = XMVectorAdd(M.r[3], VTranslation);
-    return M;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV
-XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR RotationOrigin,
-                               float Rotation, FXMVECTOR Translation) noexcept {
-    // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin *
-    // MTranslation;
-
-    XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v);
-    XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling);
-    XMVECTOR VRotationOrigin =
-        XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v);
-    XMMATRIX MRotation = XMMatrixRotationZ(Rotation);
-    XMVECTOR VTranslation =
-        XMVectorSelect(g_XMSelect1100.v, Translation, g_XMSelect1100.v);
-
-    XMMATRIX M;
-    M = MScaling;
-    M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin);
-    M = XMMatrixMultiply(M, MRotation);
-    M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin);
-    M.r[3] = XMVectorAdd(M.r[3], VTranslation);
-    return M;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation(
-    FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion,
-    GXMVECTOR Translation) noexcept {
-    // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin *
-    // MTranslation;
-
-    XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling);
-    XMVECTOR VRotationOrigin =
-        XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v);
-    XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion);
-    XMVECTOR VTranslation =
-        XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v);
-
-    XMMATRIX M;
-    M = MScaling;
-    M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin);
-    M = XMMatrixMultiply(M, MRotation);
-    M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin);
-    M.r[3] = XMVectorAdd(M.r[3], VTranslation);
-    return M;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV
-XMMatrixReflect(FXMVECTOR ReflectionPlane) noexcept {
-    assert(!XMVector3Equal(ReflectionPlane, XMVectorZero()));
-    assert(!XMPlaneIsInfinite(ReflectionPlane));
-
-    static const XMVECTORF32 NegativeTwo = {{{-2.0f, -2.0f, -2.0f, 0.0f}}};
-
-    XMVECTOR P = XMPlaneNormalize(ReflectionPlane);
-    XMVECTOR S = XMVectorMultiply(P, NegativeTwo);
-
-    XMVECTOR A = XMVectorSplatX(P);
-    XMVECTOR B = XMVectorSplatY(P);
-    XMVECTOR C = XMVectorSplatZ(P);
-    XMVECTOR D = XMVectorSplatW(P);
-
-    XMMATRIX M;
-    M.r[0] = XMVectorMultiplyAdd(A, S, g_XMIdentityR0.v);
-    M.r[1] = XMVectorMultiplyAdd(B, S, g_XMIdentityR1.v);
-    M.r[2] = XMVectorMultiplyAdd(C, S, g_XMIdentityR2.v);
-    M.r[3] = XMVectorMultiplyAdd(D, S, g_XMIdentityR3.v);
-    return M;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixShadow(FXMVECTOR ShadowPlane,
-                                           FXMVECTOR LightPosition) noexcept {
-    static const XMVECTORU32 Select0001 = {
-        {{XM_SELECT_0, XM_SELECT_0, XM_SELECT_0, XM_SELECT_1}}};
-
-    assert(!XMVector3Equal(ShadowPlane, XMVectorZero()));
-    assert(!XMPlaneIsInfinite(ShadowPlane));
-
-    XMVECTOR P = XMPlaneNormalize(ShadowPlane);
-    XMVECTOR Dot = XMPlaneDot(P, LightPosition);
-    P = XMVectorNegate(P);
-    XMVECTOR D = XMVectorSplatW(P);
-    XMVECTOR C = XMVectorSplatZ(P);
-    XMVECTOR B = XMVectorSplatY(P);
-    XMVECTOR A = XMVectorSplatX(P);
-    Dot = XMVectorSelect(Select0001.v, Dot, Select0001.v);
-
-    XMMATRIX M;
-    M.r[3] = XMVectorMultiplyAdd(D, LightPosition, Dot);
-    Dot = XMVectorRotateLeft(Dot, 1);
-    M.r[2] = XMVectorMultiplyAdd(C, LightPosition, Dot);
-    Dot = XMVectorRotateLeft(Dot, 1);
-    M.r[1] = XMVectorMultiplyAdd(B, LightPosition, Dot);
-    Dot = XMVectorRotateLeft(Dot, 1);
-    M.r[0] = XMVectorMultiplyAdd(A, LightPosition, Dot);
-    return M;
-}
-
-//------------------------------------------------------------------------------
-// View and projection initialization operations
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixLookAtLH(FXMVECTOR EyePosition,
-                                             FXMVECTOR FocusPosition,
-                                             FXMVECTOR UpDirection) noexcept {
-    XMVECTOR EyeDirection = XMVectorSubtract(FocusPosition, EyePosition);
-    return XMMatrixLookToLH(EyePosition, EyeDirection, UpDirection);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixLookAtRH(FXMVECTOR EyePosition,
-                                             FXMVECTOR FocusPosition,
-                                             FXMVECTOR UpDirection) noexcept {
-    XMVECTOR NegEyeDirection = XMVectorSubtract(EyePosition, FocusPosition);
-    return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixLookToLH(FXMVECTOR EyePosition,
-                                             FXMVECTOR EyeDirection,
-                                             FXMVECTOR UpDirection) noexcept {
-    assert(!XMVector3Equal(EyeDirection, XMVectorZero()));
-    assert(!XMVector3IsInfinite(EyeDirection));
-    assert(!XMVector3Equal(UpDirection, XMVectorZero()));
-    assert(!XMVector3IsInfinite(UpDirection));
-
-    XMVECTOR R2 = XMVector3Normalize(EyeDirection);
-
-    XMVECTOR R0 = XMVector3Cross(UpDirection, R2);
-    R0 = XMVector3Normalize(R0);
-
-    XMVECTOR R1 = XMVector3Cross(R2, R0);
-
-    XMVECTOR NegEyePosition = XMVectorNegate(EyePosition);
-
-    XMVECTOR D0 = XMVector3Dot(R0, NegEyePosition);
-    XMVECTOR D1 = XMVector3Dot(R1, NegEyePosition);
-    XMVECTOR D2 = XMVector3Dot(R2, NegEyePosition);
-
-    XMMATRIX M;
-    M.r[0] = XMVectorSelect(D0, R0, g_XMSelect1110.v);
-    M.r[1] = XMVectorSelect(D1, R1, g_XMSelect1110.v);
-    M.r[2] = XMVectorSelect(D2, R2, g_XMSelect1110.v);
-    M.r[3] = g_XMIdentityR3.v;
-
-    M = XMMatrixTranspose(M);
-
-    return M;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixLookToRH(FXMVECTOR EyePosition,
-                                             FXMVECTOR EyeDirection,
-                                             FXMVECTOR UpDirection) noexcept {
-    XMVECTOR NegEyeDirection = XMVectorNegate(EyeDirection);
-    return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection);
-}
-
-//------------------------------------------------------------------------------
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 28931, "PREfast noise: Esp:1266")
-#endif
-
-inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH(float ViewWidth,
-                                                  float ViewHeight, float NearZ,
-                                                  float FarZ) noexcept {
-    assert(NearZ > 0.f && FarZ > 0.f);
-    assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
-    assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
-    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    float TwoNearZ = NearZ + NearZ;
-    float fRange = FarZ / (FarZ - NearZ);
-
-    XMMATRIX M;
-    M.m[0][0] = TwoNearZ / ViewWidth;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = TwoNearZ / ViewHeight;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = 0.0f;
-    M.m[2][1] = 0.0f;
-    M.m[2][2] = fRange;
-    M.m[2][3] = 1.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = -fRange * NearZ;
-    M.m[3][3] = 0.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float TwoNearZ = NearZ + NearZ;
-    float fRange = FarZ / (FarZ - NearZ);
-    const float32x4_t Zero = vdupq_n_f32(0);
-    XMMATRIX M;
-    M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0);
-    M.r[1] = vsetq_lane_f32(TwoNearZ / ViewHeight, Zero, 1);
-    M.r[2] = vsetq_lane_f32(fRange, g_XMIdentityR3.v, 2);
-    M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2);
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    float TwoNearZ = NearZ + NearZ;
-    float fRange = FarZ / (FarZ - NearZ);
-    // Note: This is recorded on the stack
-    XMVECTOR rMem = {TwoNearZ / ViewWidth, TwoNearZ / ViewHeight, fRange,
-                     -fRange * NearZ};
-    // Copy from memory to SSE register
-    XMVECTOR vValues = rMem;
-    XMVECTOR vTemp = _mm_setzero_ps();
-    // Copy x only
-    vTemp = _mm_move_ss(vTemp, vValues);
-    // TwoNearZ / ViewWidth,0,0,0
-    M.r[0] = vTemp;
-    // 0,TwoNearZ / ViewHeight,0,0
-    vTemp = vValues;
-    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
-    M.r[1] = vTemp;
-    // x=fRange,y=-fRange * NearZ,0,1.0f
-    vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
-    // 0,0,fRange,1.0f
-    vTemp = _mm_setzero_ps();
-    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0));
-    M.r[2] = vTemp;
-    // 0,0,-fRange * NearZ,0
-    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0));
-    M.r[3] = vTemp;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH(float ViewWidth,
-                                                  float ViewHeight, float NearZ,
-                                                  float FarZ) noexcept {
-    assert(NearZ > 0.f && FarZ > 0.f);
-    assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
-    assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
-    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    float TwoNearZ = NearZ + NearZ;
-    float fRange = FarZ / (NearZ - FarZ);
-
-    XMMATRIX M;
-    M.m[0][0] = TwoNearZ / ViewWidth;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = TwoNearZ / ViewHeight;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = 0.0f;
-    M.m[2][1] = 0.0f;
-    M.m[2][2] = fRange;
-    M.m[2][3] = -1.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = fRange * NearZ;
-    M.m[3][3] = 0.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float TwoNearZ = NearZ + NearZ;
-    float fRange = FarZ / (NearZ - FarZ);
-    const float32x4_t Zero = vdupq_n_f32(0);
-
-    XMMATRIX M;
-    M.r[0] = vsetq_lane_f32(TwoNearZ / ViewWidth, Zero, 0);
-    M.r[1] = vsetq_lane_f32(TwoNearZ / ViewHeight, Zero, 1);
-    M.r[2] = vsetq_lane_f32(fRange, g_XMNegIdentityR3.v, 2);
-    M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2);
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    float TwoNearZ = NearZ + NearZ;
-    float fRange = FarZ / (NearZ - FarZ);
-    // Note: This is recorded on the stack
-    XMVECTOR rMem = {TwoNearZ / ViewWidth, TwoNearZ / ViewHeight, fRange,
-                     fRange * NearZ};
-    // Copy from memory to SSE register
-    XMVECTOR vValues = rMem;
-    XMVECTOR vTemp = _mm_setzero_ps();
-    // Copy x only
-    vTemp = _mm_move_ss(vTemp, vValues);
-    // TwoNearZ / ViewWidth,0,0,0
-    M.r[0] = vTemp;
-    // 0,TwoNearZ / ViewHeight,0,0
-    vTemp = vValues;
-    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
-    M.r[1] = vTemp;
-    // x=fRange,y=-fRange * NearZ,0,-1.0f
-    vValues =
-        _mm_shuffle_ps(vValues, g_XMNegIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
-    // 0,0,fRange,-1.0f
-    vTemp = _mm_setzero_ps();
-    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0));
-    M.r[2] = vTemp;
-    // 0,0,-fRange * NearZ,0
-    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0));
-    M.r[3] = vTemp;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH(float FovAngleY,
-                                                     float AspectRatio,
-                                                     float NearZ,
-                                                     float FarZ) noexcept {
-    assert(NearZ > 0.f && FarZ > 0.f);
-    assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
-    assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f));
-    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    float SinFov;
-    float CosFov;
-    XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);
-
-    float Height = CosFov / SinFov;
-    float Width = Height / AspectRatio;
-    float fRange = FarZ / (FarZ - NearZ);
-
-    XMMATRIX M;
-    M.m[0][0] = Width;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = Height;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = 0.0f;
-    M.m[2][1] = 0.0f;
-    M.m[2][2] = fRange;
-    M.m[2][3] = 1.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = -fRange * NearZ;
-    M.m[3][3] = 0.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float SinFov;
-    float CosFov;
-    XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);
-
-    float fRange = FarZ / (FarZ - NearZ);
-    float Height = CosFov / SinFov;
-    float Width = Height / AspectRatio;
-    const float32x4_t Zero = vdupq_n_f32(0);
-
-    XMMATRIX M;
-    M.r[0] = vsetq_lane_f32(Width, Zero, 0);
-    M.r[1] = vsetq_lane_f32(Height, Zero, 1);
-    M.r[2] = vsetq_lane_f32(fRange, g_XMIdentityR3.v, 2);
-    M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2);
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    float SinFov;
-    float CosFov;
-    XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);
-
-    float fRange = FarZ / (FarZ - NearZ);
-    // Note: This is recorded on the stack
-    float Height = CosFov / SinFov;
-    XMVECTOR rMem = {Height / AspectRatio, Height, fRange, -fRange * NearZ};
-    // Copy from memory to SSE register
-    XMVECTOR vValues = rMem;
-    XMVECTOR vTemp = _mm_setzero_ps();
-    // Copy x only
-    vTemp = _mm_move_ss(vTemp, vValues);
-    // Height / AspectRatio,0,0,0
-    XMMATRIX M;
-    M.r[0] = vTemp;
-    // 0,Height,0,0
-    vTemp = vValues;
-    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
-    M.r[1] = vTemp;
-    // x=fRange,y=-fRange * NearZ,0,1.0f
-    vTemp = _mm_setzero_ps();
-    vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
-    // 0,0,fRange,1.0f
-    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0));
-    M.r[2] = vTemp;
-    // 0,0,-fRange * NearZ,0.0f
-    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0));
-    M.r[3] = vTemp;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH(float FovAngleY,
-                                                     float AspectRatio,
-                                                     float NearZ,
-                                                     float FarZ) noexcept {
-    assert(NearZ > 0.f && FarZ > 0.f);
-    assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
-    assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f));
-    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    float SinFov;
-    float CosFov;
-    XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);
-
-    float Height = CosFov / SinFov;
-    float Width = Height / AspectRatio;
-    float fRange = FarZ / (NearZ - FarZ);
-
-    XMMATRIX M;
-    M.m[0][0] = Width;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = Height;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = 0.0f;
-    M.m[2][1] = 0.0f;
-    M.m[2][2] = fRange;
-    M.m[2][3] = -1.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = fRange * NearZ;
-    M.m[3][3] = 0.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float SinFov;
-    float CosFov;
-    XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);
-    float fRange = FarZ / (NearZ - FarZ);
-    float Height = CosFov / SinFov;
-    float Width = Height / AspectRatio;
-    const float32x4_t Zero = vdupq_n_f32(0);
-
-    XMMATRIX M;
-    M.r[0] = vsetq_lane_f32(Width, Zero, 0);
-    M.r[1] = vsetq_lane_f32(Height, Zero, 1);
-    M.r[2] = vsetq_lane_f32(fRange, g_XMNegIdentityR3.v, 2);
-    M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2);
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    float SinFov;
-    float CosFov;
-    XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);
-    float fRange = FarZ / (NearZ - FarZ);
-    // Note: This is recorded on the stack
-    float Height = CosFov / SinFov;
-    XMVECTOR rMem = {Height / AspectRatio, Height, fRange, fRange * NearZ};
-    // Copy from memory to SSE register
-    XMVECTOR vValues = rMem;
-    XMVECTOR vTemp = _mm_setzero_ps();
-    // Copy x only
-    vTemp = _mm_move_ss(vTemp, vValues);
-    // Height / AspectRatio,0,0,0
-    XMMATRIX M;
-    M.r[0] = vTemp;
-    // 0,Height,0,0
-    vTemp = vValues;
-    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
-    M.r[1] = vTemp;
-    // x=fRange,y=-fRange * NearZ,0,-1.0f
-    vTemp = _mm_setzero_ps();
-    vValues =
-        _mm_shuffle_ps(vValues, g_XMNegIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
-    // 0,0,fRange,-1.0f
-    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0));
-    M.r[2] = vTemp;
-    // 0,0,fRange * NearZ,0.0f
-    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0));
-    M.r[3] = vTemp;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH(
-    float ViewLeft, float ViewRight, float ViewBottom, float ViewTop,
-    float NearZ, float FarZ) noexcept {
-    assert(NearZ > 0.f && FarZ > 0.f);
-    assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f));
-    assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f));
-    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    float TwoNearZ = NearZ + NearZ;
-    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
-    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
-    float fRange = FarZ / (FarZ - NearZ);
-
-    XMMATRIX M;
-    M.m[0][0] = TwoNearZ * ReciprocalWidth;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = TwoNearZ * ReciprocalHeight;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = -(ViewLeft + ViewRight) * ReciprocalWidth;
-    M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight;
-    M.m[2][2] = fRange;
-    M.m[2][3] = 1.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = -fRange * NearZ;
-    M.m[3][3] = 0.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float TwoNearZ = NearZ + NearZ;
-    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
-    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
-    float fRange = FarZ / (FarZ - NearZ);
-    const float32x4_t Zero = vdupq_n_f32(0);
-
-    XMMATRIX M;
-    M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0);
-    M.r[1] = vsetq_lane_f32(TwoNearZ * ReciprocalHeight, Zero, 1);
-    M.r[2] =
-        XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth,
-                    -(ViewTop + ViewBottom) * ReciprocalHeight, fRange, 1.0f);
-    M.r[3] = vsetq_lane_f32(-fRange * NearZ, Zero, 2);
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    float TwoNearZ = NearZ + NearZ;
-    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
-    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
-    float fRange = FarZ / (FarZ - NearZ);
-    // Note: This is recorded on the stack
-    XMVECTOR rMem = {TwoNearZ * ReciprocalWidth, TwoNearZ * ReciprocalHeight,
-                     -fRange * NearZ, 0};
-    // Copy from memory to SSE register
-    XMVECTOR vValues = rMem;
-    XMVECTOR vTemp = _mm_setzero_ps();
-    // Copy x only
-    vTemp = _mm_move_ss(vTemp, vValues);
-    // TwoNearZ*ReciprocalWidth,0,0,0
-    M.r[0] = vTemp;
-    // 0,TwoNearZ*ReciprocalHeight,0,0
-    vTemp = vValues;
-    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
-    M.r[1] = vTemp;
-    // 0,0,fRange,1.0f
-    M.r[2] =
-        XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth,
-                    -(ViewTop + ViewBottom) * ReciprocalHeight, fRange, 1.0f);
-    // 0,0,-fRange * NearZ,0.0f
-    vValues = _mm_and_ps(vValues, g_XMMaskZ);
-    M.r[3] = vValues;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH(
-    float ViewLeft, float ViewRight, float ViewBottom, float ViewTop,
-    float NearZ, float FarZ) noexcept {
-    assert(NearZ > 0.f && FarZ > 0.f);
-    assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f));
-    assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f));
-    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    float TwoNearZ = NearZ + NearZ;
-    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
-    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
-    float fRange = FarZ / (NearZ - FarZ);
-
-    XMMATRIX M;
-    M.m[0][0] = TwoNearZ * ReciprocalWidth;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = TwoNearZ * ReciprocalHeight;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth;
-    M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight;
-    M.m[2][2] = fRange;
-    M.m[2][3] = -1.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = fRange * NearZ;
-    M.m[3][3] = 0.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float TwoNearZ = NearZ + NearZ;
-    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
-    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
-    float fRange = FarZ / (NearZ - FarZ);
-    const float32x4_t Zero = vdupq_n_f32(0);
-
-    XMMATRIX M;
-    M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0);
-    M.r[1] = vsetq_lane_f32(TwoNearZ * ReciprocalHeight, Zero, 1);
-    M.r[2] =
-        XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth,
-                    (ViewTop + ViewBottom) * ReciprocalHeight, fRange, -1.0f);
-    M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2);
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    float TwoNearZ = NearZ + NearZ;
-    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
-    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
-    float fRange = FarZ / (NearZ - FarZ);
-    // Note: This is recorded on the stack
-    XMVECTOR rMem = {TwoNearZ * ReciprocalWidth, TwoNearZ * ReciprocalHeight,
-                     fRange * NearZ, 0};
-    // Copy from memory to SSE register
-    XMVECTOR vValues = rMem;
-    XMVECTOR vTemp = _mm_setzero_ps();
-    // Copy x only
-    vTemp = _mm_move_ss(vTemp, vValues);
-    // TwoNearZ*ReciprocalWidth,0,0,0
-    M.r[0] = vTemp;
-    // 0,TwoNearZ*ReciprocalHeight,0,0
-    vTemp = vValues;
-    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
-    M.r[1] = vTemp;
-    // 0,0,fRange,1.0f
-    M.r[2] =
-        XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth,
-                    (ViewTop + ViewBottom) * ReciprocalHeight, fRange, -1.0f);
-    // 0,0,-fRange * NearZ,0.0f
-    vValues = _mm_and_ps(vValues, g_XMMaskZ);
-    M.r[3] = vValues;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixOrthographicLH(float ViewWidth,
-                                                   float ViewHeight,
-                                                   float NearZ,
-                                                   float FarZ) noexcept {
-    assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
-    assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
-    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    float fRange = 1.0f / (FarZ - NearZ);
-
-    XMMATRIX M;
-    M.m[0][0] = 2.0f / ViewWidth;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = 2.0f / ViewHeight;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = 0.0f;
-    M.m[2][1] = 0.0f;
-    M.m[2][2] = fRange;
-    M.m[2][3] = 0.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = -fRange * NearZ;
-    M.m[3][3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float fRange = 1.0f / (FarZ - NearZ);
-
-    const float32x4_t Zero = vdupq_n_f32(0);
-    XMMATRIX M;
-    M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0);
-    M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1);
-    M.r[2] = vsetq_lane_f32(fRange, Zero, 2);
-    M.r[3] = vsetq_lane_f32(-fRange * NearZ, g_XMIdentityR3.v, 2);
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    float fRange = 1.0f / (FarZ - NearZ);
-    // Note: This is recorded on the stack
-    XMVECTOR rMem = {2.0f / ViewWidth, 2.0f / ViewHeight, fRange,
-                     -fRange * NearZ};
-    // Copy from memory to SSE register
-    XMVECTOR vValues = rMem;
-    XMVECTOR vTemp = _mm_setzero_ps();
-    // Copy x only
-    vTemp = _mm_move_ss(vTemp, vValues);
-    // 2.0f / ViewWidth,0,0,0
-    M.r[0] = vTemp;
-    // 0,2.0f / ViewHeight,0,0
-    vTemp = vValues;
-    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
-    M.r[1] = vTemp;
-    // x=fRange,y=-fRange * NearZ,0,1.0f
-    vTemp = _mm_setzero_ps();
-    vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
-    // 0,0,fRange,0.0f
-    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 0, 0, 0));
-    M.r[2] = vTemp;
-    // 0,0,-fRange * NearZ,1.0f
-    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 1, 0, 0));
-    M.r[3] = vTemp;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixOrthographicRH(float ViewWidth,
-                                                   float ViewHeight,
-                                                   float NearZ,
-                                                   float FarZ) noexcept {
-    assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
-    assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
-    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    float fRange = 1.0f / (NearZ - FarZ);
-
-    XMMATRIX M;
-    M.m[0][0] = 2.0f / ViewWidth;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = 2.0f / ViewHeight;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = 0.0f;
-    M.m[2][1] = 0.0f;
-    M.m[2][2] = fRange;
-    M.m[2][3] = 0.0f;
-
-    M.m[3][0] = 0.0f;
-    M.m[3][1] = 0.0f;
-    M.m[3][2] = fRange * NearZ;
-    M.m[3][3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float fRange = 1.0f / (NearZ - FarZ);
-
-    const float32x4_t Zero = vdupq_n_f32(0);
-    XMMATRIX M;
-    M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0);
-    M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1);
-    M.r[2] = vsetq_lane_f32(fRange, Zero, 2);
-    M.r[3] = vsetq_lane_f32(fRange * NearZ, g_XMIdentityR3.v, 2);
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    float fRange = 1.0f / (NearZ - FarZ);
-    // Note: This is recorded on the stack
-    XMVECTOR rMem = {2.0f / ViewWidth, 2.0f / ViewHeight, fRange,
-                     fRange * NearZ};
-    // Copy from memory to SSE register
-    XMVECTOR vValues = rMem;
-    XMVECTOR vTemp = _mm_setzero_ps();
-    // Copy x only
-    vTemp = _mm_move_ss(vTemp, vValues);
-    // 2.0f / ViewWidth,0,0,0
-    M.r[0] = vTemp;
-    // 0,2.0f / ViewHeight,0,0
-    vTemp = vValues;
-    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
-    M.r[1] = vTemp;
-    // x=fRange,y=fRange * NearZ,0,1.0f
-    vTemp = _mm_setzero_ps();
-    vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
-    // 0,0,fRange,0.0f
-    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 0, 0, 0));
-    M.r[2] = vTemp;
-    // 0,0,fRange * NearZ,1.0f
-    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 1, 0, 0));
-    M.r[3] = vTemp;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH(
-    float ViewLeft, float ViewRight, float ViewBottom, float ViewTop,
-    float NearZ, float FarZ) noexcept {
-    assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f));
-    assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f));
-    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
-    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
-    float fRange = 1.0f / (FarZ - NearZ);
-
-    XMMATRIX M;
-    M.m[0][0] = ReciprocalWidth + ReciprocalWidth;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = ReciprocalHeight + ReciprocalHeight;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = 0.0f;
-    M.m[2][1] = 0.0f;
-    M.m[2][2] = fRange;
-    M.m[2][3] = 0.0f;
-
-    M.m[3][0] = -(ViewLeft + ViewRight) * ReciprocalWidth;
-    M.m[3][1] = -(ViewTop + ViewBottom) * ReciprocalHeight;
-    M.m[3][2] = -fRange * NearZ;
-    M.m[3][3] = 1.0f;
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
-    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
-    float fRange = 1.0f / (FarZ - NearZ);
-    const float32x4_t Zero = vdupq_n_f32(0);
-    XMMATRIX M;
-    M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0);
-    M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1);
-    M.r[2] = vsetq_lane_f32(fRange, Zero, 2);
-    M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth,
-                         -(ViewTop + ViewBottom) * ReciprocalHeight,
-                         -fRange * NearZ, 1.0f);
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
-    float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
-    float fRange = 1.0f / (FarZ - NearZ);
-    // Note: This is recorded on the stack
-    XMVECTOR rMem = {fReciprocalWidth, fReciprocalHeight, fRange, 1.0f};
-    XMVECTOR rMem2 = {-(ViewLeft + ViewRight), -(ViewTop + ViewBottom), -NearZ,
-                      1.0f};
-    // Copy from memory to SSE register
-    XMVECTOR vValues = rMem;
-    XMVECTOR vTemp = _mm_setzero_ps();
-    // Copy x only
-    vTemp = _mm_move_ss(vTemp, vValues);
-    // fReciprocalWidth*2,0,0,0
-    vTemp = _mm_add_ss(vTemp, vTemp);
-    M.r[0] = vTemp;
-    // 0,fReciprocalHeight*2,0,0
-    vTemp = vValues;
-    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
-    vTemp = _mm_add_ps(vTemp, vTemp);
-    M.r[1] = vTemp;
-    // 0,0,fRange,0.0f
-    vTemp = vValues;
-    vTemp = _mm_and_ps(vTemp, g_XMMaskZ);
-    M.r[2] = vTemp;
-    // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop +
-    // ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f
-    vValues = _mm_mul_ps(vValues, rMem2);
-    M.r[3] = vValues;
-    return M;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH(
-    float ViewLeft, float ViewRight, float ViewBottom, float ViewTop,
-    float NearZ, float FarZ) noexcept {
-    assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f));
-    assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f));
-    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
-    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
-    float fRange = 1.0f / (NearZ - FarZ);
-
-    XMMATRIX M;
-    M.m[0][0] = ReciprocalWidth + ReciprocalWidth;
-    M.m[0][1] = 0.0f;
-    M.m[0][2] = 0.0f;
-    M.m[0][3] = 0.0f;
-
-    M.m[1][0] = 0.0f;
-    M.m[1][1] = ReciprocalHeight + ReciprocalHeight;
-    M.m[1][2] = 0.0f;
-    M.m[1][3] = 0.0f;
-
-    M.m[2][0] = 0.0f;
-    M.m[2][1] = 0.0f;
-    M.m[2][2] = fRange;
-    M.m[2][3] = 0.0f;
-
-    M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth,
-                         -(ViewTop + ViewBottom) * ReciprocalHeight,
-                         fRange * NearZ, 1.0f);
-    return M;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
-    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
-    float fRange = 1.0f / (NearZ - FarZ);
-    const float32x4_t Zero = vdupq_n_f32(0);
-    XMMATRIX M;
-    M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0);
-    M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1);
-    M.r[2] = vsetq_lane_f32(fRange, Zero, 2);
-    M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth,
-                         -(ViewTop + ViewBottom) * ReciprocalHeight,
-                         fRange * NearZ, 1.0f);
-    return M;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMMATRIX M;
-    float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
-    float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
-    float fRange = 1.0f / (NearZ - FarZ);
-    // Note: This is recorded on the stack
-    XMVECTOR rMem = {fReciprocalWidth, fReciprocalHeight, fRange, 1.0f};
-    XMVECTOR rMem2 = {-(ViewLeft + ViewRight), -(ViewTop + ViewBottom), NearZ,
-                      1.0f};
-    // Copy from memory to SSE register
-    XMVECTOR vValues = rMem;
-    XMVECTOR vTemp = _mm_setzero_ps();
-    // Copy x only
-    vTemp = _mm_move_ss(vTemp, vValues);
-    // fReciprocalWidth*2,0,0,0
-    vTemp = _mm_add_ss(vTemp, vTemp);
-    M.r[0] = vTemp;
-    // 0,fReciprocalHeight*2,0,0
-    vTemp = vValues;
-    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
-    vTemp = _mm_add_ps(vTemp, vTemp);
-    M.r[1] = vTemp;
-    // 0,0,fRange,0.0f
-    vTemp = vValues;
-    vTemp = _mm_and_ps(vTemp, g_XMMaskZ);
-    M.r[2] = vTemp;
-    // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop +
-    // ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f
-    vValues = _mm_mul_ps(vValues, rMem2);
-    M.r[3] = vValues;
-    return M;
-#endif
-}
-
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-
-/****************************************************************************
- *
- * XMMATRIX operators and methods
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX::XMMATRIX(float m00, float m01, float m02, float m03, float m10,
-                          float m11, float m12, float m13, float m20, float m21,
-                          float m22, float m23, float m30, float m31, float m32,
-                          float m33) noexcept {
-    r[0] = XMVectorSet(m00, m01, m02, m03);
-    r[1] = XMVectorSet(m10, m11, m12, m13);
-    r[2] = XMVectorSet(m20, m21, m22, m23);
-    r[3] = XMVectorSet(m30, m31, m32, m33);
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMMATRIX::XMMATRIX(const float* pArray) noexcept {
-    assert(pArray != nullptr);
-    r[0] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray));
-    r[1] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 4));
-    r[2] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 8));
-    r[3] = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray + 12));
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XMMATRIX::operator-() const noexcept {
-    XMMATRIX R;
-    R.r[0] = XMVectorNegate(r[0]);
-    R.r[1] = XMVectorNegate(r[1]);
-    R.r[2] = XMVectorNegate(r[2]);
-    R.r[3] = XMVectorNegate(r[3]);
-    return R;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX& XM_CALLCONV XMMATRIX::operator+=(FXMMATRIX M) noexcept {
-    r[0] = XMVectorAdd(r[0], M.r[0]);
-    r[1] = XMVectorAdd(r[1], M.r[1]);
-    r[2] = XMVectorAdd(r[2], M.r[2]);
-    r[3] = XMVectorAdd(r[3], M.r[3]);
-    return *this;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX& XM_CALLCONV XMMATRIX::operator-=(FXMMATRIX M) noexcept {
-    r[0] = XMVectorSubtract(r[0], M.r[0]);
-    r[1] = XMVectorSubtract(r[1], M.r[1]);
-    r[2] = XMVectorSubtract(r[2], M.r[2]);
-    r[3] = XMVectorSubtract(r[3], M.r[3]);
-    return *this;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX& XM_CALLCONV XMMATRIX::operator*=(FXMMATRIX M) noexcept {
-    *this = XMMatrixMultiply(*this, M);
-    return *this;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX& XMMATRIX::operator*=(float S) noexcept {
-    r[0] = XMVectorScale(r[0], S);
-    r[1] = XMVectorScale(r[1], S);
-    r[2] = XMVectorScale(r[2], S);
-    r[3] = XMVectorScale(r[3], S);
-    return *this;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX& XMMATRIX::operator/=(float S) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR vS = XMVectorReplicate(S);
-    r[0] = XMVectorDivide(r[0], vS);
-    r[1] = XMVectorDivide(r[1], vS);
-    r[2] = XMVectorDivide(r[2], vS);
-    r[3] = XMVectorDivide(r[3], vS);
-    return *this;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-    float32x4_t vS = vdupq_n_f32(S);
-    r[0] = vdivq_f32(r[0], vS);
-    r[1] = vdivq_f32(r[1], vS);
-    r[2] = vdivq_f32(r[2], vS);
-    r[3] = vdivq_f32(r[3], vS);
-#else
-    // 2 iterations of Newton-Raphson refinement of reciprocal
-    float32x2_t vS = vdup_n_f32(S);
-    float32x2_t R0 = vrecpe_f32(vS);
-    float32x2_t S0 = vrecps_f32(R0, vS);
-    R0 = vmul_f32(S0, R0);
-    S0 = vrecps_f32(R0, vS);
-    R0 = vmul_f32(S0, R0);
-    float32x4_t Reciprocal = vcombine_f32(R0, R0);
-    r[0] = vmulq_f32(r[0], Reciprocal);
-    r[1] = vmulq_f32(r[1], Reciprocal);
-    r[2] = vmulq_f32(r[2], Reciprocal);
-    r[3] = vmulq_f32(r[3], Reciprocal);
-#endif
-    return *this;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 vS = _mm_set_ps1(S);
-    r[0] = _mm_div_ps(r[0], vS);
-    r[1] = _mm_div_ps(r[1], vS);
-    r[2] = _mm_div_ps(r[2], vS);
-    r[3] = _mm_div_ps(r[3], vS);
-    return *this;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMATRIX::operator+(FXMMATRIX M) const noexcept {
-    XMMATRIX R;
-    R.r[0] = XMVectorAdd(r[0], M.r[0]);
-    R.r[1] = XMVectorAdd(r[1], M.r[1]);
-    R.r[2] = XMVectorAdd(r[2], M.r[2]);
-    R.r[3] = XMVectorAdd(r[3], M.r[3]);
-    return R;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMATRIX::operator-(FXMMATRIX M) const noexcept {
-    XMMATRIX R;
-    R.r[0] = XMVectorSubtract(r[0], M.r[0]);
-    R.r[1] = XMVectorSubtract(r[1], M.r[1]);
-    R.r[2] = XMVectorSubtract(r[2], M.r[2]);
-    R.r[3] = XMVectorSubtract(r[3], M.r[3]);
-    return R;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV XMMATRIX::operator*(FXMMATRIX M) const noexcept {
-    return XMMatrixMultiply(*this, M);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XMMATRIX::operator*(float S) const noexcept {
-    XMMATRIX R;
-    R.r[0] = XMVectorScale(r[0], S);
-    R.r[1] = XMVectorScale(r[1], S);
-    R.r[2] = XMVectorScale(r[2], S);
-    R.r[3] = XMVectorScale(r[3], S);
-    return R;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XMMATRIX::operator/(float S) const noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR vS = XMVectorReplicate(S);
-    XMMATRIX R;
-    R.r[0] = XMVectorDivide(r[0], vS);
-    R.r[1] = XMVectorDivide(r[1], vS);
-    R.r[2] = XMVectorDivide(r[2], vS);
-    R.r[3] = XMVectorDivide(r[3], vS);
-    return R;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-    float32x4_t vS = vdupq_n_f32(S);
-    XMMATRIX R;
-    R.r[0] = vdivq_f32(r[0], vS);
-    R.r[1] = vdivq_f32(r[1], vS);
-    R.r[2] = vdivq_f32(r[2], vS);
-    R.r[3] = vdivq_f32(r[3], vS);
-#else
-    // 2 iterations of Newton-Raphson refinement of reciprocal
-    float32x2_t vS = vdup_n_f32(S);
-    float32x2_t R0 = vrecpe_f32(vS);
-    float32x2_t S0 = vrecps_f32(R0, vS);
-    R0 = vmul_f32(S0, R0);
-    S0 = vrecps_f32(R0, vS);
-    R0 = vmul_f32(S0, R0);
-    float32x4_t Reciprocal = vcombine_f32(R0, R0);
-    XMMATRIX R;
-    R.r[0] = vmulq_f32(r[0], Reciprocal);
-    R.r[1] = vmulq_f32(r[1], Reciprocal);
-    R.r[2] = vmulq_f32(r[2], Reciprocal);
-    R.r[3] = vmulq_f32(r[3], Reciprocal);
-#endif
-    return R;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 vS = _mm_set_ps1(S);
-    XMMATRIX R;
-    R.r[0] = _mm_div_ps(r[0], vS);
-    R.r[1] = _mm_div_ps(r[1], vS);
-    R.r[2] = _mm_div_ps(r[2], vS);
-    R.r[3] = _mm_div_ps(r[3], vS);
-    return R;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMMATRIX XM_CALLCONV operator*(float S, FXMMATRIX M) noexcept {
-    XMMATRIX R;
-    R.r[0] = XMVectorScale(M.r[0], S);
-    R.r[1] = XMVectorScale(M.r[1], S);
-    R.r[2] = XMVectorScale(M.r[2], S);
-    R.r[3] = XMVectorScale(M.r[3], S);
-    return R;
-}
-
-/****************************************************************************
- *
- * XMFLOAT3X3 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMFLOAT3X3::XMFLOAT3X3(
-    const float* pArray) noexcept {
-    assert(pArray != nullptr);
-    for (size_t Row = 0; Row < 3; Row++) {
-        for (size_t Column = 0; Column < 3; Column++) {
-            m[Row][Column] = pArray[Row * 3 + Column];
-        }
-    }
-}
-
-/****************************************************************************
- *
- * XMFLOAT4X3 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMFLOAT4X3::XMFLOAT4X3(
-    const float* pArray) noexcept {
-    assert(pArray != nullptr);
-
-    m[0][0] = pArray[0];
-    m[0][1] = pArray[1];
-    m[0][2] = pArray[2];
-
-    m[1][0] = pArray[3];
-    m[1][1] = pArray[4];
-    m[1][2] = pArray[5];
-
-    m[2][0] = pArray[6];
-    m[2][1] = pArray[7];
-    m[2][2] = pArray[8];
-
-    m[3][0] = pArray[9];
-    m[3][1] = pArray[10];
-    m[3][2] = pArray[11];
-}
-
-/****************************************************************************
- *
- * XMFLOAT3X4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMFLOAT3X4::XMFLOAT3X4(
-    const float* pArray) noexcept {
-    assert(pArray != nullptr);
-
-    m[0][0] = pArray[0];
-    m[0][1] = pArray[1];
-    m[0][2] = pArray[2];
-    m[0][3] = pArray[3];
-
-    m[1][0] = pArray[4];
-    m[1][1] = pArray[5];
-    m[1][2] = pArray[6];
-    m[1][3] = pArray[7];
-
-    m[2][0] = pArray[8];
-    m[2][1] = pArray[9];
-    m[2][2] = pArray[10];
-    m[2][3] = pArray[11];
-}
-
-/****************************************************************************
- *
- * XMFLOAT4X4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMFLOAT4X4::XMFLOAT4X4(
-    const float* pArray) noexcept {
-    assert(pArray != nullptr);
-
-    m[0][0] = pArray[0];
-    m[0][1] = pArray[1];
-    m[0][2] = pArray[2];
-    m[0][3] = pArray[3];
-
-    m[1][0] = pArray[4];
-    m[1][1] = pArray[5];
-    m[1][2] = pArray[6];
-    m[1][3] = pArray[7];
-
-    m[2][0] = pArray[8];
-    m[2][1] = pArray[9];
-    m[2][2] = pArray[10];
-    m[2][3] = pArray[11];
-
-    m[3][0] = pArray[12];
-    m[3][1] = pArray[13];
-    m[3][2] = pArray[14];
-    m[3][3] = pArray[15];
-}
diff --git a/targets/app/linux/Stubs/DirectXMath/DirectXMathMisc.inl b/targets/app/linux/Stubs/DirectXMath/DirectXMathMisc.inl
deleted file mode 100644
index 1e2869428..000000000
--- a/targets/app/linux/Stubs/DirectXMath/DirectXMathMisc.inl
+++ /dev/null
@@ -1,2261 +0,0 @@
-//-------------------------------------------------------------------------------------
-// DirectXMathMisc.inl -- SIMD C++ Math library
-//
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT License.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#pragma once
-
-/****************************************************************************
- *
- * Quaternion
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-// Comparison operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2) noexcept {
-    return XMVector4Equal(Q1, Q2);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMQuaternionNotEqual(FXMVECTOR Q1,
-                                             FXMVECTOR Q2) noexcept {
-    return XMVector4NotEqual(Q1, Q2);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q) noexcept {
-    return XMVector4IsNaN(Q);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q) noexcept {
-    return XMVector4IsInfinite(Q);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q) noexcept {
-    return XMVector4Equal(Q, g_XMIdentityR3.v);
-}
-
-//------------------------------------------------------------------------------
-// Computation operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionDot(FXMVECTOR Q1,
-                                            FXMVECTOR Q2) noexcept {
-    return XMVector4Dot(Q1, Q2);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionMultiply(FXMVECTOR Q1,
-                                                 FXMVECTOR Q2) noexcept {
-    // Returns the product Q2*Q1 (which is the concatenation of a rotation Q1
-    // followed by the rotation Q2)
-
-    // [ (Q2.w * Q1.x) + (Q2.x * Q1.w) + (Q2.y * Q1.z) - (Q2.z * Q1.y),
-    //   (Q2.w * Q1.y) - (Q2.x * Q1.z) + (Q2.y * Q1.w) + (Q2.z * Q1.x),
-    //   (Q2.w * Q1.z) + (Q2.x * Q1.y) - (Q2.y * Q1.x) + (Q2.z * Q1.w),
-    //   (Q2.w * Q1.w) - (Q2.x * Q1.x) - (Q2.y * Q1.y) - (Q2.z * Q1.z) ]
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{(Q2.vector4_f32[3] * Q1.vector4_f32[0]) +
-                                (Q2.vector4_f32[0] * Q1.vector4_f32[3]) +
-                                (Q2.vector4_f32[1] * Q1.vector4_f32[2]) -
-                                (Q2.vector4_f32[2] * Q1.vector4_f32[1]),
-                            (Q2.vector4_f32[3] * Q1.vector4_f32[1]) -
-                                (Q2.vector4_f32[0] * Q1.vector4_f32[2]) +
-                                (Q2.vector4_f32[1] * Q1.vector4_f32[3]) +
-                                (Q2.vector4_f32[2] * Q1.vector4_f32[0]),
-                            (Q2.vector4_f32[3] * Q1.vector4_f32[2]) +
-                                (Q2.vector4_f32[0] * Q1.vector4_f32[1]) -
-                                (Q2.vector4_f32[1] * Q1.vector4_f32[0]) +
-                                (Q2.vector4_f32[2] * Q1.vector4_f32[3]),
-                            (Q2.vector4_f32[3] * Q1.vector4_f32[3]) -
-                                (Q2.vector4_f32[0] * Q1.vector4_f32[0]) -
-                                (Q2.vector4_f32[1] * Q1.vector4_f32[1]) -
-                                (Q2.vector4_f32[2] * Q1.vector4_f32[2])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 ControlWZYX = {{{1.0f, -1.0f, 1.0f, -1.0f}}};
-    static const XMVECTORF32 ControlZWXY = {{{1.0f, 1.0f, -1.0f, -1.0f}}};
-    static const XMVECTORF32 ControlYXWZ = {{{-1.0f, 1.0f, 1.0f, -1.0f}}};
-
-    float32x2_t Q2L = vget_low_f32(Q2);
-    float32x2_t Q2H = vget_high_f32(Q2);
-
-    float32x4_t Q2X = vdupq_lane_f32(Q2L, 0);
-    float32x4_t Q2Y = vdupq_lane_f32(Q2L, 1);
-    float32x4_t Q2Z = vdupq_lane_f32(Q2H, 0);
-    XMVECTOR vResult = vmulq_lane_f32(Q1, Q2H, 1);
-
-    // Mul by Q1WZYX
-    float32x4_t vTemp = vrev64q_f32(Q1);
-    vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp));
-    Q2X = vmulq_f32(Q2X, vTemp);
-    vResult = vmlaq_f32(vResult, Q2X, ControlWZYX);
-
-    // Mul by Q1ZWXY
-    vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp)));
-    Q2Y = vmulq_f32(Q2Y, vTemp);
-    vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY);
-
-    // Mul by Q1YXWZ
-    vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp)));
-    vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp));
-    Q2Z = vmulq_f32(Q2Z, vTemp);
-    vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ);
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 ControlWZYX = {{{1.0f, -1.0f, 1.0f, -1.0f}}};
-    static const XMVECTORF32 ControlZWXY = {{{1.0f, 1.0f, -1.0f, -1.0f}}};
-    static const XMVECTORF32 ControlYXWZ = {{{-1.0f, 1.0f, 1.0f, -1.0f}}};
-    // Copy to SSE registers and use as few as possible for x86
-    XMVECTOR Q2X = Q2;
-    XMVECTOR Q2Y = Q2;
-    XMVECTOR Q2Z = Q2;
-    XMVECTOR vResult = Q2;
-    // Splat with one instruction
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 3, 3, 3));
-    Q2X = XM_PERMUTE_PS(Q2X, _MM_SHUFFLE(0, 0, 0, 0));
-    Q2Y = XM_PERMUTE_PS(Q2Y, _MM_SHUFFLE(1, 1, 1, 1));
-    Q2Z = XM_PERMUTE_PS(Q2Z, _MM_SHUFFLE(2, 2, 2, 2));
-    // Retire Q1 and perform Q1*Q2W
-    vResult = _mm_mul_ps(vResult, Q1);
-    XMVECTOR Q1Shuffle = Q1;
-    // Shuffle the copies of Q1
-    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(0, 1, 2, 3));
-    // Mul by Q1WZYX
-    Q2X = _mm_mul_ps(Q2X, Q1Shuffle);
-    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(2, 3, 0, 1));
-    // Flip the signs on y and z
-    vResult = XM_FMADD_PS(Q2X, ControlWZYX, vResult);
-    // Mul by Q1ZWXY
-    Q2Y = _mm_mul_ps(Q2Y, Q1Shuffle);
-    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(0, 1, 2, 3));
-    // Flip the signs on z and w
-    Q2Y = _mm_mul_ps(Q2Y, ControlZWXY);
-    // Mul by Q1YXWZ
-    Q2Z = _mm_mul_ps(Q2Z, Q1Shuffle);
-    // Flip the signs on x and w
-    Q2Y = XM_FMADD_PS(Q2Z, ControlYXWZ, Q2Y);
-    vResult = _mm_add_ps(vResult, Q2Y);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q) noexcept {
-    return XMVector4LengthSq(Q);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q) noexcept {
-    return XMVector4ReciprocalLength(Q);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q) noexcept {
-    return XMVector4Length(Q);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q) noexcept {
-    return XMVector4NormalizeEst(Q);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q) noexcept {
-    return XMVector4Normalize(Q);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{-Q.vector4_f32[0], -Q.vector4_f32[1],
-                            -Q.vector4_f32[2], Q.vector4_f32[3]}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 NegativeOne3 = {{{-1.0f, -1.0f, -1.0f, 1.0f}}};
-    return vmulq_f32(Q, NegativeOne3.v);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 NegativeOne3 = {{{-1.0f, -1.0f, -1.0f, 1.0f}}};
-    return _mm_mul_ps(Q, NegativeOne3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q) noexcept {
-    XMVECTOR L = XMVector4LengthSq(Q);
-    XMVECTOR Conjugate = XMQuaternionConjugate(Q);
-
-    XMVECTOR Control = XMVectorLessOrEqual(L, g_XMEpsilon.v);
-
-    XMVECTOR Result = XMVectorDivide(Conjugate, L);
-
-    Result = XMVectorSelect(Result, g_XMZero, Control);
-
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q) noexcept {
-    static const XMVECTORF32 OneMinusEpsilon = {
-        {{1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}}};
-
-    XMVECTOR QW = XMVectorSplatW(Q);
-    XMVECTOR Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v);
-
-    XMVECTOR ControlW = XMVectorInBounds(QW, OneMinusEpsilon.v);
-
-    XMVECTOR Theta = XMVectorACos(QW);
-    XMVECTOR SinTheta = XMVectorSin(Theta);
-
-    XMVECTOR S = XMVectorDivide(Theta, SinTheta);
-
-    XMVECTOR Result = XMVectorMultiply(Q0, S);
-    Result = XMVectorSelect(Q0, Result, ControlW);
-
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q) noexcept {
-    XMVECTOR Theta = XMVector3Length(Q);
-
-    XMVECTOR SinTheta, CosTheta;
-    XMVectorSinCos(&SinTheta, &CosTheta, Theta);
-
-    XMVECTOR S = XMVectorDivide(SinTheta, Theta);
-
-    XMVECTOR Result = XMVectorMultiply(Q, S);
-
-    const XMVECTOR Zero = XMVectorZero();
-    XMVECTOR Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v);
-    Result = XMVectorSelect(Result, Q, Control);
-
-    Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v);
-
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1,
-                                              float t) noexcept {
-    XMVECTOR T = XMVectorReplicate(t);
-    return XMQuaternionSlerpV(Q0, Q1, T);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1,
-                                               FXMVECTOR T) noexcept {
-    assert((XMVectorGetY(T) == XMVectorGetX(T)) &&
-           (XMVectorGetZ(T) == XMVectorGetX(T)) &&
-           (XMVectorGetW(T) == XMVectorGetX(T)));
-
-    // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) /
-    // sin(Omega)
-
-#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
-
-    const XMVECTORF32 OneMinusEpsilon = {
-        {{1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}}};
-
-    XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1);
-
-    const XMVECTOR Zero = XMVectorZero();
-    XMVECTOR Control = XMVectorLess(CosOmega, Zero);
-    XMVECTOR Sign = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Control);
-
-    CosOmega = XMVectorMultiply(CosOmega, Sign);
-
-    Control = XMVectorLess(CosOmega, OneMinusEpsilon);
-
-    XMVECTOR SinOmega =
-        XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v);
-    SinOmega = XMVectorSqrt(SinOmega);
-
-    XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega);
-
-    XMVECTOR SignMask = XMVectorSplatSignMask();
-    XMVECTOR V01 = XMVectorShiftLeft(T, Zero, 2);
-    SignMask = XMVectorShiftLeft(SignMask, Zero, 3);
-    V01 = XMVectorXorInt(V01, SignMask);
-    V01 = XMVectorAdd(g_XMIdentityR0.v, V01);
-
-    XMVECTOR InvSinOmega = XMVectorReciprocal(SinOmega);
-
-    XMVECTOR S0 = XMVectorMultiply(V01, Omega);
-    S0 = XMVectorSin(S0);
-    S0 = XMVectorMultiply(S0, InvSinOmega);
-
-    S0 = XMVectorSelect(V01, S0, Control);
-
-    XMVECTOR S1 = XMVectorSplatY(S0);
-    S0 = XMVectorSplatX(S0);
-
-    S1 = XMVectorMultiply(S1, Sign);
-
-    XMVECTOR Result = XMVectorMultiply(Q0, S0);
-    Result = XMVectorMultiplyAdd(Q1, S1, Result);
-
-    return Result;
-
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 OneMinusEpsilon = {
-        {{1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f}}};
-    static const XMVECTORU32 SignMask2 = {
-        {{0x80000000, 0x00000000, 0x00000000, 0x00000000}}};
-
-    XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1);
-
-    const XMVECTOR Zero = XMVectorZero();
-    XMVECTOR Control = XMVectorLess(CosOmega, Zero);
-    XMVECTOR Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control);
-
-    CosOmega = _mm_mul_ps(CosOmega, Sign);
-
-    Control = XMVectorLess(CosOmega, OneMinusEpsilon);
-
-    XMVECTOR SinOmega = _mm_mul_ps(CosOmega, CosOmega);
-    SinOmega = _mm_sub_ps(g_XMOne, SinOmega);
-    SinOmega = _mm_sqrt_ps(SinOmega);
-
-    XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega);
-
-    XMVECTOR V01 = XM_PERMUTE_PS(T, _MM_SHUFFLE(2, 3, 0, 1));
-    V01 = _mm_and_ps(V01, g_XMMaskXY);
-    V01 = _mm_xor_ps(V01, SignMask2);
-    V01 = _mm_add_ps(g_XMIdentityR0, V01);
-
-    XMVECTOR S0 = _mm_mul_ps(V01, Omega);
-    S0 = XMVectorSin(S0);
-    S0 = _mm_div_ps(S0, SinOmega);
-
-    S0 = XMVectorSelect(V01, S0, Control);
-
-    XMVECTOR S1 = XMVectorSplatY(S0);
-    S0 = XMVectorSplatX(S0);
-
-    S1 = _mm_mul_ps(S1, Sign);
-    XMVECTOR Result = _mm_mul_ps(Q0, S0);
-    S1 = _mm_mul_ps(S1, Q1);
-    Result = _mm_add_ps(Result, S1);
-    return Result;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1,
-                                              FXMVECTOR Q2, GXMVECTOR Q3,
-                                              float t) noexcept {
-    XMVECTOR T = XMVectorReplicate(t);
-    return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1,
-                                               FXMVECTOR Q2, GXMVECTOR Q3,
-                                               HXMVECTOR T) noexcept {
-    assert((XMVectorGetY(T) == XMVectorGetX(T)) &&
-           (XMVectorGetZ(T) == XMVectorGetX(T)) &&
-           (XMVectorGetW(T) == XMVectorGetX(T)));
-
-    XMVECTOR TP = T;
-    const XMVECTOR Two = XMVectorSplatConstant(2, 0);
-
-    XMVECTOR Q03 = XMQuaternionSlerpV(Q0, Q3, T);
-    XMVECTOR Q12 = XMQuaternionSlerpV(Q1, Q2, T);
-
-    TP = XMVectorNegativeMultiplySubtract(TP, TP, TP);
-    TP = XMVectorMultiply(TP, Two);
-
-    XMVECTOR Result = XMQuaternionSlerpV(Q03, Q12, TP);
-
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMQuaternionSquadSetup(XMVECTOR* pA, XMVECTOR* pB, XMVECTOR* pC, FXMVECTOR Q0,
-                       FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3) noexcept {
-    assert(pA);
-    assert(pB);
-    assert(pC);
-
-    XMVECTOR LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2));
-    XMVECTOR LD12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2));
-    XMVECTOR SQ2 = XMVectorNegate(Q2);
-
-    XMVECTOR Control1 = XMVectorLess(LS12, LD12);
-    SQ2 = XMVectorSelect(Q2, SQ2, Control1);
-
-    XMVECTOR LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1));
-    XMVECTOR LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1));
-    XMVECTOR SQ0 = XMVectorNegate(Q0);
-
-    XMVECTOR LS23 = XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3));
-    XMVECTOR LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3));
-    XMVECTOR SQ3 = XMVectorNegate(Q3);
-
-    XMVECTOR Control0 = XMVectorLess(LS01, LD01);
-    XMVECTOR Control2 = XMVectorLess(LS23, LD23);
-
-    SQ0 = XMVectorSelect(Q0, SQ0, Control0);
-    SQ3 = XMVectorSelect(Q3, SQ3, Control2);
-
-    XMVECTOR InvQ1 = XMQuaternionInverse(Q1);
-    XMVECTOR InvQ2 = XMQuaternionInverse(SQ2);
-
-    XMVECTOR LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0));
-    XMVECTOR LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2));
-    XMVECTOR LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1));
-    XMVECTOR LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3));
-
-    const XMVECTOR NegativeOneQuarter = XMVectorSplatConstant(-1, 2);
-
-    XMVECTOR ExpQ02 =
-        XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter);
-    XMVECTOR ExpQ13 =
-        XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter);
-    ExpQ02 = XMQuaternionExp(ExpQ02);
-    ExpQ13 = XMQuaternionExp(ExpQ13);
-
-    *pA = XMQuaternionMultiply(Q1, ExpQ02);
-    *pB = XMQuaternionMultiply(SQ2, ExpQ13);
-    *pC = SQ2;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1,
-                                                    FXMVECTOR Q2, float f,
-                                                    float g) noexcept {
-    float s = f + g;
-
-    XMVECTOR Result;
-    if ((s < 0.00001f) && (s > -0.00001f)) {
-        Result = Q0;
-    } else {
-        XMVECTOR Q01 = XMQuaternionSlerp(Q0, Q1, s);
-        XMVECTOR Q02 = XMQuaternionSlerp(Q0, Q2, s);
-
-        Result = XMQuaternionSlerp(Q01, Q02, g / s);
-    }
-
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1,
-                                                     FXMVECTOR Q2, GXMVECTOR F,
-                                                     HXMVECTOR G) noexcept {
-    assert((XMVectorGetY(F) == XMVectorGetX(F)) &&
-           (XMVectorGetZ(F) == XMVectorGetX(F)) &&
-           (XMVectorGetW(F) == XMVectorGetX(F)));
-    assert((XMVectorGetY(G) == XMVectorGetX(G)) &&
-           (XMVectorGetZ(G) == XMVectorGetX(G)) &&
-           (XMVectorGetW(G) == XMVectorGetX(G)));
-
-    const XMVECTOR Epsilon = XMVectorSplatConstant(1, 16);
-
-    XMVECTOR S = XMVectorAdd(F, G);
-
-    XMVECTOR Result;
-    if (XMVector4InBounds(S, Epsilon)) {
-        Result = Q0;
-    } else {
-        XMVECTOR Q01 = XMQuaternionSlerpV(Q0, Q1, S);
-        XMVECTOR Q02 = XMQuaternionSlerpV(Q0, Q2, S);
-        XMVECTOR GS = XMVectorReciprocal(S);
-        GS = XMVectorMultiply(G, GS);
-
-        Result = XMQuaternionSlerpV(Q01, Q02, GS);
-    }
-
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-// Transformation operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionIdentity() noexcept {
-    return g_XMIdentityR3.v;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMQuaternionRotationRollPitchYaw(float Pitch, float Yaw, float Roll) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    const float halfpitch = Pitch * 0.5f;
-    float cp = cosf(halfpitch);
-    float sp = sinf(halfpitch);
-
-    const float halfyaw = Yaw * 0.5f;
-    float cy = cosf(halfyaw);
-    float sy = sinf(halfyaw);
-
-    const float halfroll = Roll * 0.5f;
-    float cr = cosf(halfroll);
-    float sr = sinf(halfroll);
-
-    XMVECTORF32 vResult = {
-        {{cr * sp * cy + sr * cp * sy, cr * cp * sy - sr * sp * cy,
-          sr * cp * cy - cr * sp * sy, cr * cp * cy + sr * sp * sy}}};
-    return vResult;
-#else
-    XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
-    return XMQuaternionRotationRollPitchYawFromVector(Angles);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector(
-    FXMVECTOR Angles  // <Pitch, Yaw, Roll, 0>
-    ) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    const float halfpitch = Angles.vector4_f32[0] * 0.5f;
-    float cp = cosf(halfpitch);
-    float sp = sinf(halfpitch);
-
-    const float halfyaw = Angles.vector4_f32[1] * 0.5f;
-    float cy = cosf(halfyaw);
-    float sy = sinf(halfyaw);
-
-    const float halfroll = Angles.vector4_f32[2] * 0.5f;
-    float cr = cosf(halfroll);
-    float sr = sinf(halfroll);
-
-    XMVECTORF32 vResult = {
-        {{cr * sp * cy + sr * cp * sy, cr * cp * sy - sr * sp * cy,
-          sr * cp * cy - cr * sp * sy, cr * cp * cy + sr * sp * sy}}};
-    return vResult;
-#else
-    static const XMVECTORF32 Sign = {{{1.0f, -1.0f, -1.0f, 1.0f}}};
-
-    XMVECTOR HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);
-
-    XMVECTOR SinAngles, CosAngles;
-    XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);
-
-    XMVECTOR P0 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X,
-                                  XM_PERMUTE_1X>(SinAngles, CosAngles);
-    XMVECTOR Y0 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y,
-                                  XM_PERMUTE_1Y>(SinAngles, CosAngles);
-    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z,
-                                  XM_PERMUTE_1Z>(SinAngles, CosAngles);
-    XMVECTOR P1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X,
-                                  XM_PERMUTE_1X>(CosAngles, SinAngles);
-    XMVECTOR Y1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y,
-                                  XM_PERMUTE_1Y>(CosAngles, SinAngles);
-    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z,
-                                  XM_PERMUTE_1Z>(CosAngles, SinAngles);
-
-    XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v);
-    XMVECTOR Q0 = XMVectorMultiply(P0, Y0);
-    Q1 = XMVectorMultiply(Q1, Y1);
-    Q0 = XMVectorMultiply(Q0, R0);
-    XMVECTOR Q = XMVectorMultiplyAdd(Q1, R1, Q0);
-
-    return Q;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionRotationNormal(FXMVECTOR NormalAxis,
-                                                       float Angle) noexcept {
-#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
-
-    XMVECTOR N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);
-
-    float SinV, CosV;
-    XMScalarSinCos(&SinV, &CosV, 0.5f * Angle);
-
-    XMVECTOR Scale = XMVectorSet(SinV, SinV, SinV, CosV);
-    return XMVectorMultiply(N, Scale);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR N = _mm_and_ps(NormalAxis, g_XMMask3);
-    N = _mm_or_ps(N, g_XMIdentityR3);
-    XMVECTOR Scale = _mm_set_ps1(0.5f * Angle);
-    XMVECTOR vSine;
-    XMVECTOR vCosine;
-    XMVectorSinCos(&vSine, &vCosine, Scale);
-    Scale = _mm_and_ps(vSine, g_XMMask3);
-    vCosine = _mm_and_ps(vCosine, g_XMMaskW);
-    Scale = _mm_or_ps(Scale, vCosine);
-    N = _mm_mul_ps(N, Scale);
-    return N;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionRotationAxis(FXMVECTOR Axis,
-                                                     float Angle) noexcept {
-    assert(!XMVector3Equal(Axis, XMVectorZero()));
-    assert(!XMVector3IsInfinite(Axis));
-
-    XMVECTOR Normal = XMVector3Normalize(Axis);
-    XMVECTOR Q = XMQuaternionRotationNormal(Normal, Angle);
-    return Q;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 q;
-    float r22 = M.m[2][2];
-    if (r22 <= 0.f)  // x^2 + y^2 >= z^2 + w^2
-    {
-        float dif10 = M.m[1][1] - M.m[0][0];
-        float omr22 = 1.f - r22;
-        if (dif10 <= 0.f)  // x^2 >= y^2
-        {
-            float fourXSqr = omr22 - dif10;
-            float inv4x = 0.5f / sqrtf(fourXSqr);
-            q.f[0] = fourXSqr * inv4x;
-            q.f[1] = (M.m[0][1] + M.m[1][0]) * inv4x;
-            q.f[2] = (M.m[0][2] + M.m[2][0]) * inv4x;
-            q.f[3] = (M.m[1][2] - M.m[2][1]) * inv4x;
-        } else  // y^2 >= x^2
-        {
-            float fourYSqr = omr22 + dif10;
-            float inv4y = 0.5f / sqrtf(fourYSqr);
-            q.f[0] = (M.m[0][1] + M.m[1][0]) * inv4y;
-            q.f[1] = fourYSqr * inv4y;
-            q.f[2] = (M.m[1][2] + M.m[2][1]) * inv4y;
-            q.f[3] = (M.m[2][0] - M.m[0][2]) * inv4y;
-        }
-    } else  // z^2 + w^2 >= x^2 + y^2
-    {
-        float sum10 = M.m[1][1] + M.m[0][0];
-        float opr22 = 1.f + r22;
-        if (sum10 <= 0.f)  // z^2 >= w^2
-        {
-            float fourZSqr = opr22 - sum10;
-            float inv4z = 0.5f / sqrtf(fourZSqr);
-            q.f[0] = (M.m[0][2] + M.m[2][0]) * inv4z;
-            q.f[1] = (M.m[1][2] + M.m[2][1]) * inv4z;
-            q.f[2] = fourZSqr * inv4z;
-            q.f[3] = (M.m[0][1] - M.m[1][0]) * inv4z;
-        } else  // w^2 >= z^2
-        {
-            float fourWSqr = opr22 + sum10;
-            float inv4w = 0.5f / sqrtf(fourWSqr);
-            q.f[0] = (M.m[1][2] - M.m[2][1]) * inv4w;
-            q.f[1] = (M.m[2][0] - M.m[0][2]) * inv4w;
-            q.f[2] = (M.m[0][1] - M.m[1][0]) * inv4w;
-            q.f[3] = fourWSqr * inv4w;
-        }
-    }
-    return q.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 XMPMMP = {{{+1.0f, -1.0f, -1.0f, +1.0f}}};
-    static const XMVECTORF32 XMMPMP = {{{-1.0f, +1.0f, -1.0f, +1.0f}}};
-    static const XMVECTORF32 XMMMPP = {{{-1.0f, -1.0f, +1.0f, +1.0f}}};
-    static const XMVECTORU32 Select0110 = {
-        {{XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0}}};
-    static const XMVECTORU32 Select0010 = {
-        {{XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0}}};
-
-    float32x4_t r0 = M.r[0];
-    float32x4_t r1 = M.r[1];
-    float32x4_t r2 = M.r[2];
-
-    float32x4_t r00 = vdupq_lane_f32(vget_low_f32(r0), 0);
-    float32x4_t r11 = vdupq_lane_f32(vget_low_f32(r1), 1);
-    float32x4_t r22 = vdupq_lane_f32(vget_high_f32(r2), 0);
-
-    // x^2 >= y^2 equivalent to r11 - r00 <= 0
-    float32x4_t r11mr00 = vsubq_f32(r11, r00);
-    uint32x4_t x2gey2 = vcleq_f32(r11mr00, g_XMZero);
-
-    // z^2 >= w^2 equivalent to r11 + r00 <= 0
-    float32x4_t r11pr00 = vaddq_f32(r11, r00);
-    uint32x4_t z2gew2 = vcleq_f32(r11pr00, g_XMZero);
-
-    // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0
-    uint32x4_t x2py2gez2pw2 = vcleq_f32(r22, g_XMZero);
-
-    // (4*x^2, 4*y^2, 4*z^2, 4*w^2)
-    float32x4_t t0 = vmulq_f32(XMPMMP, r00);
-    float32x4_t x2y2z2w2 = vmlaq_f32(t0, XMMPMP, r11);
-    x2y2z2w2 = vmlaq_f32(x2y2z2w2, XMMMPP, r22);
-    x2y2z2w2 = vaddq_f32(x2y2z2w2, g_XMOne);
-
-    // (r01, r02, r12, r11)
-    t0 = vextq_f32(r0, r0, 1);
-    float32x4_t t1 = vextq_f32(r1, r1, 1);
-    t0 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_low_f32(t1)));
-
-    // (r10, r20, r21, r10)
-    t1 = vextq_f32(r2, r2, 3);
-    float32x4_t r10 = vdupq_lane_f32(vget_low_f32(r1), 0);
-    t1 = vbslq_f32(Select0110, t1, r10);
-
-    // (4*x*y, 4*x*z, 4*y*z, unused)
-    float32x4_t xyxzyz = vaddq_f32(t0, t1);
-
-    // (r21, r20, r10, r10)
-    t0 = vcombine_f32(vrev64_f32(vget_low_f32(r2)), vget_low_f32(r10));
-
-    // (r12, r02, r01, r12)
-    float32x4_t t2 = vcombine_f32(vrev64_f32(vget_high_f32(r0)),
-                                  vrev64_f32(vget_low_f32(r0)));
-    float32x4_t t3 = vdupq_lane_f32(vget_high_f32(r1), 0);
-    t1 = vbslq_f32(Select0110, t2, t3);
-
-    // (4*x*w, 4*y*w, 4*z*w, unused)
-    float32x4_t xwywzw = vsubq_f32(t0, t1);
-    xwywzw = vmulq_f32(XMMPMP, xwywzw);
-
-    // (4*x*x, 4*x*y, 4*x*z, 4*x*w)
-    t0 = vextq_f32(xyxzyz, xyxzyz, 3);
-    t1 = vbslq_f32(Select0110, t0, x2y2z2w2);
-    t2 = vdupq_lane_f32(vget_low_f32(xwywzw), 0);
-    float32x4_t tensor0 = vbslq_f32(g_XMSelect1110, t1, t2);
-
-    // (4*y*x, 4*y*y, 4*y*z, 4*y*w)
-    t0 = vbslq_f32(g_XMSelect1011, xyxzyz, x2y2z2w2);
-    t1 = vdupq_lane_f32(vget_low_f32(xwywzw), 1);
-    float32x4_t tensor1 = vbslq_f32(g_XMSelect1110, t0, t1);
-
-    // (4*z*x, 4*z*y, 4*z*z, 4*z*w)
-    t0 = vextq_f32(xyxzyz, xyxzyz, 1);
-    t1 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_high_f32(xwywzw)));
-    float32x4_t tensor2 = vbslq_f32(Select0010, x2y2z2w2, t1);
-
-    // (4*w*x, 4*w*y, 4*w*z, 4*w*w)
-    float32x4_t tensor3 = vbslq_f32(g_XMSelect1110, xwywzw, x2y2z2w2);
-
-    // Select the row of the tensor-product matrix that has the largest
-    // magnitude.
-    t0 = vbslq_f32(x2gey2, tensor0, tensor1);
-    t1 = vbslq_f32(z2gew2, tensor2, tensor3);
-    t2 = vbslq_f32(x2py2gez2pw2, t0, t1);
-
-    // Normalize the row.  No division by zero is possible because the
-    // quaternion is unit-length (and the row is a nonzero multiple of
-    // the quaternion).
-    t0 = XMVector4Length(t2);
-    return XMVectorDivide(t2, t0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 XMPMMP = {{{+1.0f, -1.0f, -1.0f, +1.0f}}};
-    static const XMVECTORF32 XMMPMP = {{{-1.0f, +1.0f, -1.0f, +1.0f}}};
-    static const XMVECTORF32 XMMMPP = {{{-1.0f, -1.0f, +1.0f, +1.0f}}};
-
-    XMVECTOR r0 = M.r[0];  // (r00, r01, r02, 0)
-    XMVECTOR r1 = M.r[1];  // (r10, r11, r12, 0)
-    XMVECTOR r2 = M.r[2];  // (r20, r21, r22, 0)
-
-    // (r00, r00, r00, r00)
-    XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0, 0, 0, 0));
-    // (r11, r11, r11, r11)
-    XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1, 1, 1, 1));
-    // (r22, r22, r22, r22)
-    XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2, 2, 2, 2));
-
-    // x^2 >= y^2 equivalent to r11 - r00 <= 0
-    // (r11 - r00, r11 - r00, r11 - r00, r11 - r00)
-    XMVECTOR r11mr00 = _mm_sub_ps(r11, r00);
-    XMVECTOR x2gey2 = _mm_cmple_ps(r11mr00, g_XMZero);
-
-    // z^2 >= w^2 equivalent to r11 + r00 <= 0
-    // (r11 + r00, r11 + r00, r11 + r00, r11 + r00)
-    XMVECTOR r11pr00 = _mm_add_ps(r11, r00);
-    XMVECTOR z2gew2 = _mm_cmple_ps(r11pr00, g_XMZero);
-
-    // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0
-    XMVECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, g_XMZero);
-
-    // (4*x^2, 4*y^2, 4*z^2, 4*w^2)
-    XMVECTOR t0 = XM_FMADD_PS(XMPMMP, r00, g_XMOne);
-    XMVECTOR t1 = _mm_mul_ps(XMMPMP, r11);
-    XMVECTOR t2 = XM_FMADD_PS(XMMMPP, r22, t0);
-    XMVECTOR x2y2z2w2 = _mm_add_ps(t1, t2);
-
-    // (r01, r02, r12, r11)
-    t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 2, 2, 1));
-    // (r10, r10, r20, r21)
-    t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1, 0, 0, 0));
-    // (r10, r20, r21, r10)
-    t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0));
-    // (4*x*y, 4*x*z, 4*y*z, unused)
-    XMVECTOR xyxzyz = _mm_add_ps(t0, t1);
-
-    // (r21, r20, r10, r10)
-    t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 1));
-    // (r12, r12, r02, r01)
-    t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1, 2, 2, 2));
-    // (r12, r02, r01, r12)
-    t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0));
-    // (4*x*w, 4*y*w, 4*z*w, unused)
-    XMVECTOR xwywzw = _mm_sub_ps(t0, t1);
-    xwywzw = _mm_mul_ps(XMMPMP, xwywzw);
-
-    // (4*x^2, 4*y^2, 4*x*y, unused)
-    t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0, 0, 1, 0));
-    // (4*z^2, 4*w^2, 4*z*w, unused)
-    t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0, 2, 3, 2));
-    // (4*x*z, 4*y*z, 4*x*w, 4*y*w)
-    t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1, 0, 2, 1));
-
-    // (4*x*x, 4*x*y, 4*x*z, 4*x*w)
-    XMVECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2, 0, 2, 0));
-    // (4*y*x, 4*y*y, 4*y*z, 4*y*w)
-    XMVECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 1, 1, 2));
-    // (4*z*x, 4*z*y, 4*z*z, 4*z*w)
-    XMVECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2, 0, 1, 0));
-    // (4*w*x, 4*w*y, 4*w*z, 4*w*w)
-    XMVECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1, 2, 3, 2));
-
-    // Select the row of the tensor-product matrix that has the largest
-    // magnitude.
-    t0 = _mm_and_ps(x2gey2, tensor0);
-    t1 = _mm_andnot_ps(x2gey2, tensor1);
-    t0 = _mm_or_ps(t0, t1);
-    t1 = _mm_and_ps(z2gew2, tensor2);
-    t2 = _mm_andnot_ps(z2gew2, tensor3);
-    t1 = _mm_or_ps(t1, t2);
-    t0 = _mm_and_ps(x2py2gez2pw2, t0);
-    t1 = _mm_andnot_ps(x2py2gez2pw2, t1);
-    t2 = _mm_or_ps(t0, t1);
-
-    // Normalize the row.  No division by zero is possible because the
-    // quaternion is unit-length (and the row is a nonzero multiple of
-    // the quaternion).
-    t0 = XMVector4Length(t2);
-    return _mm_div_ps(t2, t0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Conversion operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMQuaternionToAxisAngle(XMVECTOR* pAxis, float* pAngle, FXMVECTOR Q) noexcept {
-    assert(pAxis);
-    assert(pAngle);
-
-    *pAxis = Q;
-
-    *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q));
-}
-
-/****************************************************************************
- *
- * Plane
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-// Comparison operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2) noexcept {
-    return XMVector4Equal(P1, P2);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2,
-                                         FXMVECTOR Epsilon) noexcept {
-    XMVECTOR NP1 = XMPlaneNormalize(P1);
-    XMVECTOR NP2 = XMPlaneNormalize(P2);
-    return XMVector4NearEqual(NP1, NP2, Epsilon);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2) noexcept {
-    return XMVector4NotEqual(P1, P2);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P) noexcept {
-    return XMVector4IsNaN(P);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P) noexcept {
-    return XMVector4IsInfinite(P);
-}
-
-//------------------------------------------------------------------------------
-// Computation operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMPlaneDot(FXMVECTOR P, FXMVECTOR V) noexcept {
-    return XMVector4Dot(P, V);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V) noexcept {
-    // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3]
-
-    XMVECTOR V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v);
-    XMVECTOR Result = XMVector4Dot(P, V3);
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMPlaneDotNormal(FXMVECTOR P,
-                                             FXMVECTOR V) noexcept {
-    return XMVector3Dot(P, V);
-}
-
-//------------------------------------------------------------------------------
-// XMPlaneNormalizeEst uses a reciprocal estimate and
-// returns QNaN on zero and infinite vectors.
-
-inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P) noexcept {
-#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
-
-    XMVECTOR Result = XMVector3ReciprocalLengthEst(P);
-    return XMVectorMultiply(P, Result);
-
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(P, P, 0x7f);
-    XMVECTOR vResult = _mm_rsqrt_ps(vTemp);
-    return _mm_mul_ps(vResult, P);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product
-    XMVECTOR vDot = _mm_mul_ps(P, P);
-    // x=Dot.y, y=Dot.z
-    XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1));
-    // Result.x = x+y
-    vDot = _mm_add_ss(vDot, vTemp);
-    // x=Dot.z
-    vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
-    // Result.x = (x+y)+z
-    vDot = _mm_add_ss(vDot, vTemp);
-    // Splat x
-    vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0));
-    // Get the reciprocal
-    vDot = _mm_rsqrt_ps(vDot);
-    // Get the reciprocal
-    vDot = _mm_mul_ps(vDot, P);
-    return vDot;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    float fLengthSq = sqrtf((P.vector4_f32[0] * P.vector4_f32[0]) +
-                            (P.vector4_f32[1] * P.vector4_f32[1]) +
-                            (P.vector4_f32[2] * P.vector4_f32[2]));
-    // Prevent divide by zero
-    if (fLengthSq > 0) {
-        fLengthSq = 1.0f / fLengthSq;
-    }
-    XMVECTORF32 vResult = {
-        {{P.vector4_f32[0] * fLengthSq, P.vector4_f32[1] * fLengthSq,
-          P.vector4_f32[2] * fLengthSq, P.vector4_f32[3] * fLengthSq}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTOR vLength = XMVector3ReciprocalLength(P);
-    return XMVectorMultiply(P, vLength);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_dp_ps(P, P, 0x7f);
-    // Prepare for the division
-    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
-    // Failsafe on zero (Or epsilon) length planes
-    // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
-    // Reciprocal mul to perform the normalization
-    vResult = _mm_div_ps(P, vResult);
-    // Any that are infinity, set to zero
-    vResult = _mm_and_ps(vResult, vLengthSq);
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x,y and z only
-    XMVECTOR vLengthSq = _mm_mul_ps(P, P);
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1));
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    // Prepare for the division
-    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
-    // Failsafe on zero (Or epsilon) length planes
-    // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
-    // Reciprocal mul to perform the normalization
-    vResult = _mm_div_ps(P, vResult);
-    // Any that are infinity, set to zero
-    vResult = _mm_and_ps(vResult, vLengthSq);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMPlaneIntersectLine(
-    FXMVECTOR P, FXMVECTOR LinePoint1, FXMVECTOR LinePoint2) noexcept {
-    XMVECTOR V1 = XMVector3Dot(P, LinePoint1);
-    XMVECTOR V2 = XMVector3Dot(P, LinePoint2);
-    XMVECTOR D = XMVectorSubtract(V1, V2);
-
-    XMVECTOR VT = XMPlaneDotCoord(P, LinePoint1);
-    VT = XMVectorDivide(VT, D);
-
-    XMVECTOR Point = XMVectorSubtract(LinePoint2, LinePoint1);
-    Point = XMVectorMultiplyAdd(Point, VT, LinePoint1);
-
-    const XMVECTOR Zero = XMVectorZero();
-    XMVECTOR Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v);
-
-    return XMVectorSelect(Point, g_XMQNaN.v, Control);
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMPlaneIntersectPlane(XMVECTOR* pLinePoint1, XMVECTOR* pLinePoint2,
-                      FXMVECTOR P1, FXMVECTOR P2) noexcept {
-    assert(pLinePoint1);
-    assert(pLinePoint2);
-
-    XMVECTOR V1 = XMVector3Cross(P2, P1);
-
-    XMVECTOR LengthSq = XMVector3LengthSq(V1);
-
-    XMVECTOR V2 = XMVector3Cross(P2, V1);
-
-    XMVECTOR P1W = XMVectorSplatW(P1);
-    XMVECTOR Point = XMVectorMultiply(V2, P1W);
-
-    XMVECTOR V3 = XMVector3Cross(V1, P1);
-
-    XMVECTOR P2W = XMVectorSplatW(P2);
-    Point = XMVectorMultiplyAdd(V3, P2W, Point);
-
-    XMVECTOR LinePoint1 = XMVectorDivide(Point, LengthSq);
-
-    XMVECTOR LinePoint2 = XMVectorAdd(LinePoint1, V1);
-
-    XMVECTOR Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v);
-    *pLinePoint1 = XMVectorSelect(LinePoint1, g_XMQNaN.v, Control);
-    *pLinePoint2 = XMVectorSelect(LinePoint2, g_XMQNaN.v, Control);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMPlaneTransform(FXMVECTOR P,
-                                             FXMMATRIX ITM) noexcept {
-    XMVECTOR W = XMVectorSplatW(P);
-    XMVECTOR Z = XMVectorSplatZ(P);
-    XMVECTOR Y = XMVectorSplatY(P);
-    XMVECTOR X = XMVectorSplatX(P);
-
-    XMVECTOR Result = XMVectorMultiply(W, ITM.r[3]);
-    Result = XMVectorMultiplyAdd(Z, ITM.r[2], Result);
-    Result = XMVectorMultiplyAdd(Y, ITM.r[1], Result);
-    Result = XMVectorMultiplyAdd(X, ITM.r[0], Result);
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMFLOAT4* XM_CALLCONV XMPlaneTransformStream(
-    XMFLOAT4* pOutputStream, size_t OutputStride, const XMFLOAT4* pInputStream,
-    size_t InputStride, size_t PlaneCount, FXMMATRIX ITM) noexcept {
-    return XMVector4TransformStream(pOutputStream, OutputStride, pInputStream,
-                                    InputStride, PlaneCount, ITM);
-}
-
-//------------------------------------------------------------------------------
-// Conversion operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMPlaneFromPointNormal(FXMVECTOR Point,
-                                                   FXMVECTOR Normal) noexcept {
-    XMVECTOR W = XMVector3Dot(Point, Normal);
-    W = XMVectorNegate(W);
-    return XMVectorSelect(W, Normal, g_XMSelect1110.v);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMPlaneFromPoints(FXMVECTOR Point1,
-                                              FXMVECTOR Point2,
-                                              FXMVECTOR Point3) noexcept {
-    XMVECTOR V21 = XMVectorSubtract(Point1, Point2);
-    XMVECTOR V31 = XMVectorSubtract(Point1, Point3);
-
-    XMVECTOR N = XMVector3Cross(V21, V31);
-    N = XMVector3Normalize(N);
-
-    XMVECTOR D = XMPlaneDotNormal(N, Point1);
-    D = XMVectorNegate(D);
-
-    XMVECTOR Result = XMVectorSelect(D, N, g_XMSelect1110.v);
-
-    return Result;
-}
-
-/****************************************************************************
- *
- * Color
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-// Comparison operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMColorEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept {
-    return XMVector4Equal(C1, C2);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMColorNotEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept {
-    return XMVector4NotEqual(C1, C2);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMColorGreater(FXMVECTOR C1, FXMVECTOR C2) noexcept {
-    return XMVector4Greater(C1, C2);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMColorGreaterOrEqual(FXMVECTOR C1,
-                                              FXMVECTOR C2) noexcept {
-    return XMVector4GreaterOrEqual(C1, C2);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMColorLess(FXMVECTOR C1, FXMVECTOR C2) noexcept {
-    return XMVector4Less(C1, C2);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMColorLessOrEqual(FXMVECTOR C1,
-                                           FXMVECTOR C2) noexcept {
-    return XMVector4LessOrEqual(C1, C2);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C) noexcept {
-    return XMVector4IsNaN(C);
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C) noexcept {
-    return XMVector4IsInfinite(C);
-}
-
-//------------------------------------------------------------------------------
-// Computation operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR vColor) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{1.0f - vColor.vector4_f32[0], 1.0f - vColor.vector4_f32[1],
-          1.0f - vColor.vector4_f32[2], vColor.vector4_f32[3]}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vTemp = veorq_u32(vreinterpretq_u32_f32(vColor), g_XMNegate3);
-    return vaddq_f32(vreinterpretq_f32_u32(vTemp), g_XMOne3);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Negate only x,y and z.
-    XMVECTOR vTemp = _mm_xor_ps(vColor, g_XMNegate3);
-    // Add 1,1,1,0 to -x,-y,-z,w
-    return _mm_add_ps(vTemp, g_XMOne3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorModulate(FXMVECTOR C1,
-                                            FXMVECTOR C2) noexcept {
-    return XMVectorMultiply(C1, C2);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMColorAdjustSaturation(FXMVECTOR vColor, float fSaturation) noexcept {
-    // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2];
-    // Result = (C - Luminance) * Saturation + Luminance;
-
-    const XMVECTORF32 gvLuminance = {{{0.2125f, 0.7154f, 0.0721f, 0.0f}}};
-#if defined(_XM_NO_INTRINSICS_)
-    float fLuminance = (vColor.vector4_f32[0] * gvLuminance.f[0]) +
-                       (vColor.vector4_f32[1] * gvLuminance.f[1]) +
-                       (vColor.vector4_f32[2] * gvLuminance.f[2]);
-    XMVECTOR vResult;
-    vResult.vector4_f32[0] =
-        ((vColor.vector4_f32[0] - fLuminance) * fSaturation) + fLuminance;
-    vResult.vector4_f32[1] =
-        ((vColor.vector4_f32[1] - fLuminance) * fSaturation) + fLuminance;
-    vResult.vector4_f32[2] =
-        ((vColor.vector4_f32[2] - fLuminance) * fSaturation) + fLuminance;
-    vResult.vector4_f32[3] = vColor.vector4_f32[3];
-    return vResult;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTOR vLuminance = XMVector3Dot(vColor, gvLuminance);
-    XMVECTOR vResult = vsubq_f32(vColor, vLuminance);
-    vResult = vmlaq_n_f32(vLuminance, vResult, fSaturation);
-    return vbslq_f32(g_XMSelect1110, vResult, vColor);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vLuminance = XMVector3Dot(vColor, gvLuminance);
-    // Splat fSaturation
-    XMVECTOR vSaturation = _mm_set_ps1(fSaturation);
-    // vResult = ((vColor-vLuminance)*vSaturation)+vLuminance;
-    XMVECTOR vResult = _mm_sub_ps(vColor, vLuminance);
-    vResult = XM_FMADD_PS(vResult, vSaturation, vLuminance);
-    // Retain w from the source color
-    vLuminance = _mm_shuffle_ps(
-        vResult, vColor,
-        _MM_SHUFFLE(3, 2, 2,
-                    2));  // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w
-    vResult = _mm_shuffle_ps(
-        vResult, vLuminance,
-        _MM_SHUFFLE(
-            3, 0, 1,
-            0));  // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorAdjustContrast(FXMVECTOR vColor,
-                                                  float fContrast) noexcept {
-    // Result = (vColor - 0.5f) * fContrast + 0.5f;
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {{{
-        ((vColor.vector4_f32[0] - 0.5f) * fContrast) + 0.5f,
-        ((vColor.vector4_f32[1] - 0.5f) * fContrast) + 0.5f,
-        ((vColor.vector4_f32[2] - 0.5f) * fContrast) + 0.5f,
-        vColor.vector4_f32[3]  // Leave W untouched
-    }}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTOR vResult = vsubq_f32(vColor, g_XMOneHalf.v);
-    vResult = vmlaq_n_f32(g_XMOneHalf.v, vResult, fContrast);
-    return vbslq_f32(g_XMSelect1110, vResult, vColor);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vScale = _mm_set_ps1(fContrast);  // Splat the scale
-    XMVECTOR vResult = _mm_sub_ps(
-        vColor, g_XMOneHalf);  // Subtract 0.5f from the source (Saving source)
-    vResult = XM_FMADD_PS(vResult, vScale, g_XMOneHalf);
-    // Retain w from the source color
-    vScale = _mm_shuffle_ps(
-        vResult, vColor,
-        _MM_SHUFFLE(3, 2, 2,
-                    2));  // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w
-    vResult = _mm_shuffle_ps(
-        vResult, vScale,
-        _MM_SHUFFLE(
-            3, 0, 1,
-            0));  // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorRGBToHSL(FXMVECTOR rgb) noexcept {
-    XMVECTOR r = XMVectorSplatX(rgb);
-    XMVECTOR g = XMVectorSplatY(rgb);
-    XMVECTOR b = XMVectorSplatZ(rgb);
-
-    XMVECTOR min = XMVectorMin(r, XMVectorMin(g, b));
-    XMVECTOR max = XMVectorMax(r, XMVectorMax(g, b));
-
-    XMVECTOR l = XMVectorMultiply(XMVectorAdd(min, max), g_XMOneHalf);
-
-    XMVECTOR d = XMVectorSubtract(max, min);
-
-    XMVECTOR la = XMVectorSelect(rgb, l, g_XMSelect1110);
-
-    if (XMVector3Less(d, g_XMEpsilon)) {
-        // Achromatic, assume H and S of 0
-        return XMVectorSelect(la, g_XMZero, g_XMSelect1100);
-    } else {
-        XMVECTOR s, h;
-
-        XMVECTOR d2 = XMVectorAdd(min, max);
-
-        if (XMVector3Greater(l, g_XMOneHalf)) {
-            // d / (2-max-min)
-            s = XMVectorDivide(d, XMVectorSubtract(g_XMTwo, d2));
-        } else {
-            // d / (max+min)
-            s = XMVectorDivide(d, d2);
-        }
-
-        if (XMVector3Equal(r, max)) {
-            // Red is max
-            h = XMVectorDivide(XMVectorSubtract(g, b), d);
-        } else if (XMVector3Equal(g, max)) {
-            // Green is max
-            h = XMVectorDivide(XMVectorSubtract(b, r), d);
-            h = XMVectorAdd(h, g_XMTwo);
-        } else {
-            // Blue is max
-            h = XMVectorDivide(XMVectorSubtract(r, g), d);
-            h = XMVectorAdd(h, g_XMFour);
-        }
-
-        h = XMVectorDivide(h, g_XMSix);
-
-        if (XMVector3Less(h, g_XMZero)) h = XMVectorAdd(h, g_XMOne);
-
-        XMVECTOR lha = XMVectorSelect(la, h, g_XMSelect1100);
-        return XMVectorSelect(s, lha, g_XMSelect1011);
-    }
-}
-
-//------------------------------------------------------------------------------
-
-namespace MathInternal {
-
-inline XMVECTOR XM_CALLCONV XMColorHue2Clr(FXMVECTOR p, FXMVECTOR q,
-                                           FXMVECTOR h) noexcept {
-    static const XMVECTORF32 oneSixth = {
-        {{1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f}}};
-    static const XMVECTORF32 twoThirds = {
-        {{2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f}}};
-
-    XMVECTOR t = h;
-
-    if (XMVector3Less(t, g_XMZero)) t = XMVectorAdd(t, g_XMOne);
-
-    if (XMVector3Greater(t, g_XMOne)) t = XMVectorSubtract(t, g_XMOne);
-
-    if (XMVector3Less(t, oneSixth)) {
-        // p + (q - p) * 6 * t
-        XMVECTOR t1 = XMVectorSubtract(q, p);
-        XMVECTOR t2 = XMVectorMultiply(g_XMSix, t);
-        return XMVectorMultiplyAdd(t1, t2, p);
-    }
-
-    if (XMVector3Less(t, g_XMOneHalf)) return q;
-
-    if (XMVector3Less(t, twoThirds)) {
-        // p + (q - p) * 6 * (2/3 - t)
-        XMVECTOR t1 = XMVectorSubtract(q, p);
-        XMVECTOR t2 = XMVectorMultiply(g_XMSix, XMVectorSubtract(twoThirds, t));
-        return XMVectorMultiplyAdd(t1, t2, p);
-    }
-
-    return p;
-}
-
-}  // namespace MathInternal
-
-inline XMVECTOR XM_CALLCONV XMColorHSLToRGB(FXMVECTOR hsl) noexcept {
-    static const XMVECTORF32 oneThird = {
-        {{1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f}}};
-
-    XMVECTOR s = XMVectorSplatY(hsl);
-    XMVECTOR l = XMVectorSplatZ(hsl);
-
-    if (XMVector3NearEqual(s, g_XMZero, g_XMEpsilon)) {
-        // Achromatic
-        return XMVectorSelect(hsl, l, g_XMSelect1110);
-    } else {
-        XMVECTOR h = XMVectorSplatX(hsl);
-
-        XMVECTOR q;
-        if (XMVector3Less(l, g_XMOneHalf)) {
-            q = XMVectorMultiply(l, XMVectorAdd(g_XMOne, s));
-        } else {
-            q = XMVectorSubtract(XMVectorAdd(l, s), XMVectorMultiply(l, s));
-        }
-
-        XMVECTOR p = XMVectorSubtract(XMVectorMultiply(g_XMTwo, l), q);
-
-        XMVECTOR r = DirectX::MathInternal::XMColorHue2Clr(
-            p, q, XMVectorAdd(h, oneThird));
-        XMVECTOR g = DirectX::MathInternal::XMColorHue2Clr(p, q, h);
-        XMVECTOR b = DirectX::MathInternal::XMColorHue2Clr(
-            p, q, XMVectorSubtract(h, oneThird));
-
-        XMVECTOR rg = XMVectorSelect(g, r, g_XMSelect1000);
-        XMVECTOR ba = XMVectorSelect(hsl, b, g_XMSelect1110);
-
-        return XMVectorSelect(ba, rg, g_XMSelect1100);
-    }
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorRGBToHSV(FXMVECTOR rgb) noexcept {
-    XMVECTOR r = XMVectorSplatX(rgb);
-    XMVECTOR g = XMVectorSplatY(rgb);
-    XMVECTOR b = XMVectorSplatZ(rgb);
-
-    XMVECTOR min = XMVectorMin(r, XMVectorMin(g, b));
-    XMVECTOR v = XMVectorMax(r, XMVectorMax(g, b));
-
-    XMVECTOR d = XMVectorSubtract(v, min);
-
-    XMVECTOR s = (XMVector3NearEqual(v, g_XMZero, g_XMEpsilon))
-                     ? g_XMZero
-                     : XMVectorDivide(d, v);
-
-    if (XMVector3Less(d, g_XMEpsilon)) {
-        // Achromatic, assume H of 0
-        XMVECTOR hv = XMVectorSelect(v, g_XMZero, g_XMSelect1000);
-        XMVECTOR hva = XMVectorSelect(rgb, hv, g_XMSelect1110);
-        return XMVectorSelect(s, hva, g_XMSelect1011);
-    } else {
-        XMVECTOR h;
-
-        if (XMVector3Equal(r, v)) {
-            // Red is max
-            h = XMVectorDivide(XMVectorSubtract(g, b), d);
-
-            if (XMVector3Less(g, b)) h = XMVectorAdd(h, g_XMSix);
-        } else if (XMVector3Equal(g, v)) {
-            // Green is max
-            h = XMVectorDivide(XMVectorSubtract(b, r), d);
-            h = XMVectorAdd(h, g_XMTwo);
-        } else {
-            // Blue is max
-            h = XMVectorDivide(XMVectorSubtract(r, g), d);
-            h = XMVectorAdd(h, g_XMFour);
-        }
-
-        h = XMVectorDivide(h, g_XMSix);
-
-        XMVECTOR hv = XMVectorSelect(v, h, g_XMSelect1000);
-        XMVECTOR hva = XMVectorSelect(rgb, hv, g_XMSelect1110);
-        return XMVectorSelect(s, hva, g_XMSelect1011);
-    }
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorHSVToRGB(FXMVECTOR hsv) noexcept {
-    XMVECTOR h = XMVectorSplatX(hsv);
-    XMVECTOR s = XMVectorSplatY(hsv);
-    XMVECTOR v = XMVectorSplatZ(hsv);
-
-    XMVECTOR h6 = XMVectorMultiply(h, g_XMSix);
-
-    XMVECTOR i = XMVectorFloor(h6);
-    XMVECTOR f = XMVectorSubtract(h6, i);
-
-    // p = v* (1-s)
-    XMVECTOR p = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, s));
-
-    // q = v*(1-f*s)
-    XMVECTOR q =
-        XMVectorMultiply(v, XMVectorSubtract(g_XMOne, XMVectorMultiply(f, s)));
-
-    // t = v*(1 - (1-f)*s)
-    XMVECTOR t = XMVectorMultiply(
-        v, XMVectorSubtract(g_XMOne,
-                            XMVectorMultiply(XMVectorSubtract(g_XMOne, f), s)));
-
-    auto ii = static_cast<int>(XMVectorGetX(XMVectorMod(i, g_XMSix)));
-
-    XMVECTOR _rgb;
-
-    switch (ii) {
-        case 0:  // rgb = vtp
-        {
-            XMVECTOR vt = XMVectorSelect(t, v, g_XMSelect1000);
-            _rgb = XMVectorSelect(p, vt, g_XMSelect1100);
-        } break;
-        case 1:  // rgb = qvp
-        {
-            XMVECTOR qv = XMVectorSelect(v, q, g_XMSelect1000);
-            _rgb = XMVectorSelect(p, qv, g_XMSelect1100);
-        } break;
-        case 2:  // rgb = pvt
-        {
-            XMVECTOR pv = XMVectorSelect(v, p, g_XMSelect1000);
-            _rgb = XMVectorSelect(t, pv, g_XMSelect1100);
-        } break;
-        case 3:  // rgb = pqv
-        {
-            XMVECTOR pq = XMVectorSelect(q, p, g_XMSelect1000);
-            _rgb = XMVectorSelect(v, pq, g_XMSelect1100);
-        } break;
-        case 4:  // rgb = tpv
-        {
-            XMVECTOR tp = XMVectorSelect(p, t, g_XMSelect1000);
-            _rgb = XMVectorSelect(v, tp, g_XMSelect1100);
-        } break;
-        default:  // rgb = vpq
-        {
-            XMVECTOR vp = XMVectorSelect(p, v, g_XMSelect1000);
-            _rgb = XMVectorSelect(q, vp, g_XMSelect1100);
-        } break;
-    }
-
-    return XMVectorSelect(hsv, _rgb, g_XMSelect1110);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorRGBToYUV(FXMVECTOR rgb) noexcept {
-    static const XMVECTORF32 Scale0 = {{{0.299f, -0.147f, 0.615f, 0.0f}}};
-    static const XMVECTORF32 Scale1 = {{{0.587f, -0.289f, -0.515f, 0.0f}}};
-    static const XMVECTORF32 Scale2 = {{{0.114f, 0.436f, -0.100f, 0.0f}}};
-
-    XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero);
-    XMVECTOR clr = XMVector3Transform(rgb, M);
-
-    return XMVectorSelect(rgb, clr, g_XMSelect1110);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorYUVToRGB(FXMVECTOR yuv) noexcept {
-    static const XMVECTORF32 Scale1 = {{{0.0f, -0.395f, 2.032f, 0.0f}}};
-    static const XMVECTORF32 Scale2 = {{{1.140f, -0.581f, 0.0f, 0.0f}}};
-
-    XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero);
-    XMVECTOR clr = XMVector3Transform(yuv, M);
-
-    return XMVectorSelect(yuv, clr, g_XMSelect1110);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD(FXMVECTOR rgb) noexcept {
-    static const XMVECTORF32 Scale0 = {{{0.2126f, -0.0997f, 0.6150f, 0.0f}}};
-    static const XMVECTORF32 Scale1 = {{{0.7152f, -0.3354f, -0.5586f, 0.0f}}};
-    static const XMVECTORF32 Scale2 = {{{0.0722f, 0.4351f, -0.0564f, 0.0f}}};
-
-    XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero);
-    XMVECTOR clr = XMVector3Transform(rgb, M);
-
-    return XMVectorSelect(rgb, clr, g_XMSelect1110);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD(FXMVECTOR yuv) noexcept {
-    static const XMVECTORF32 Scale1 = {{{0.0f, -0.2153f, 2.1324f, 0.0f}}};
-    static const XMVECTORF32 Scale2 = {{{1.2803f, -0.3806f, 0.0f, 0.0f}}};
-
-    XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero);
-    XMVECTOR clr = XMVector3Transform(yuv, M);
-
-    return XMVectorSelect(yuv, clr, g_XMSelect1110);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_UHD(FXMVECTOR rgb) noexcept {
-    static const XMVECTORF32 Scale0 = {{{0.2627f, -0.1215f, 0.6150f, 0.0f}}};
-    static const XMVECTORF32 Scale1 = {{{0.6780f, -0.3136f, -0.5655f, 0.0f}}};
-    static const XMVECTORF32 Scale2 = {{{0.0593f, 0.4351f, -0.0495f, 0.0f}}};
-
-    XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero);
-    XMVECTOR clr = XMVector3Transform(rgb, M);
-
-    return XMVectorSelect(rgb, clr, g_XMSelect1110);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_UHD(FXMVECTOR yuv) noexcept {
-    static const XMVECTORF32 Scale1 = {{{0.0f, -0.1891f, 2.1620f, 0.0f}}};
-    static const XMVECTORF32 Scale2 = {{{1.1989f, -0.4645f, 0.0f, 0.0f}}};
-
-    XMMATRIX M(g_XMOne, Scale1, Scale2, g_XMZero);
-    XMVECTOR clr = XMVector3Transform(yuv, M);
-
-    return XMVectorSelect(yuv, clr, g_XMSelect1110);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorRGBToXYZ(FXMVECTOR rgb) noexcept {
-    static const XMVECTORF32 Scale0 = {
-        {{0.4887180f, 0.1762044f, 0.0000000f, 0.0f}}};
-    static const XMVECTORF32 Scale1 = {
-        {{0.3106803f, 0.8129847f, 0.0102048f, 0.0f}}};
-    static const XMVECTORF32 Scale2 = {
-        {{0.2006017f, 0.0108109f, 0.9897952f, 0.0f}}};
-    static const XMVECTORF32 Scale = {
-        {{1.f / 0.17697f, 1.f / 0.17697f, 1.f / 0.17697f, 0.0f}}};
-
-    XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero);
-    XMVECTOR clr = XMVectorMultiply(XMVector3Transform(rgb, M), Scale);
-
-    return XMVectorSelect(rgb, clr, g_XMSelect1110);
-}
-
-inline XMVECTOR XM_CALLCONV XMColorXYZToRGB(FXMVECTOR xyz) noexcept {
-    static const XMVECTORF32 Scale0 = {
-        {{2.3706743f, -0.5138850f, 0.0052982f, 0.0f}}};
-    static const XMVECTORF32 Scale1 = {
-        {{-0.9000405f, 1.4253036f, -0.0146949f, 0.0f}}};
-    static const XMVECTORF32 Scale2 = {
-        {{-0.4706338f, 0.0885814f, 1.0093968f, 0.0f}}};
-    static const XMVECTORF32 Scale = {{{0.17697f, 0.17697f, 0.17697f, 0.0f}}};
-
-    XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero);
-    XMVECTOR clr = XMVector3Transform(XMVectorMultiply(xyz, Scale), M);
-
-    return XMVectorSelect(xyz, clr, g_XMSelect1110);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorXYZToSRGB(FXMVECTOR xyz) noexcept {
-    static const XMVECTORF32 Scale0 = {{{3.2406f, -0.9689f, 0.0557f, 0.0f}}};
-    static const XMVECTORF32 Scale1 = {{{-1.5372f, 1.8758f, -0.2040f, 0.0f}}};
-    static const XMVECTORF32 Scale2 = {{{-0.4986f, 0.0415f, 1.0570f, 0.0f}}};
-    static const XMVECTORF32 Cutoff = {
-        {{0.0031308f, 0.0031308f, 0.0031308f, 0.0f}}};
-    static const XMVECTORF32 Exp = {
-        {{1.0f / 2.4f, 1.0f / 2.4f, 1.0f / 2.4f, 1.0f}}};
-
-    XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero);
-    XMVECTOR lclr = XMVector3Transform(xyz, M);
-
-    XMVECTOR sel = XMVectorGreater(lclr, Cutoff);
-
-    // clr = 12.92 * lclr for lclr <= 0.0031308f
-    XMVECTOR smallC = XMVectorMultiply(lclr, g_XMsrgbScale);
-
-    // clr = (1+a)*pow(lclr, 1/2.4) - a for lclr > 0.0031308 (where a = 0.055)
-    XMVECTOR largeC = XMVectorSubtract(
-        XMVectorMultiply(g_XMsrgbA1, XMVectorPow(lclr, Exp)), g_XMsrgbA);
-
-    XMVECTOR clr = XMVectorSelect(smallC, largeC, sel);
-
-    return XMVectorSelect(xyz, clr, g_XMSelect1110);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorSRGBToXYZ(FXMVECTOR srgb) noexcept {
-    static const XMVECTORF32 Scale0 = {{{0.4124f, 0.2126f, 0.0193f, 0.0f}}};
-    static const XMVECTORF32 Scale1 = {{{0.3576f, 0.7152f, 0.1192f, 0.0f}}};
-    static const XMVECTORF32 Scale2 = {{{0.1805f, 0.0722f, 0.9505f, 0.0f}}};
-    static const XMVECTORF32 Cutoff = {{{0.04045f, 0.04045f, 0.04045f, 0.0f}}};
-    static const XMVECTORF32 Exp = {{{2.4f, 2.4f, 2.4f, 1.0f}}};
-
-    XMVECTOR sel = XMVectorGreater(srgb, Cutoff);
-
-    // lclr = clr / 12.92
-    XMVECTOR smallC = XMVectorDivide(srgb, g_XMsrgbScale);
-
-    // lclr = pow( (clr + a) / (1+a), 2.4 )
-    XMVECTOR largeC = XMVectorPow(
-        XMVectorDivide(XMVectorAdd(srgb, g_XMsrgbA), g_XMsrgbA1), Exp);
-
-    XMVECTOR lclr = XMVectorSelect(smallC, largeC, sel);
-
-    XMMATRIX M(Scale0, Scale1, Scale2, g_XMZero);
-    XMVECTOR clr = XMVector3Transform(lclr, M);
-
-    return XMVectorSelect(srgb, clr, g_XMSelect1110);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorRGBToSRGB(FXMVECTOR rgb) noexcept {
-    static const XMVECTORF32 Cutoff = {
-        {{0.0031308f, 0.0031308f, 0.0031308f, 1.f}}};
-    static const XMVECTORF32 Linear = {{{12.92f, 12.92f, 12.92f, 1.f}}};
-    static const XMVECTORF32 Scale = {{{1.055f, 1.055f, 1.055f, 1.f}}};
-    static const XMVECTORF32 Bias = {{{0.055f, 0.055f, 0.055f, 0.f}}};
-    static const XMVECTORF32 InvGamma = {
-        {{1.0f / 2.4f, 1.0f / 2.4f, 1.0f / 2.4f, 1.f}}};
-
-    XMVECTOR V = XMVectorSaturate(rgb);
-    XMVECTOR V0 = XMVectorMultiply(V, Linear);
-    XMVECTOR V1 = XMVectorSubtract(
-        XMVectorMultiply(Scale, XMVectorPow(V, InvGamma)), Bias);
-    XMVECTOR select = XMVectorLess(V, Cutoff);
-    V = XMVectorSelect(V1, V0, select);
-    return XMVectorSelect(rgb, V, g_XMSelect1110);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB(FXMVECTOR srgb) noexcept {
-    static const XMVECTORF32 Cutoff = {{{0.04045f, 0.04045f, 0.04045f, 1.f}}};
-    static const XMVECTORF32 ILinear = {
-        {{1.f / 12.92f, 1.f / 12.92f, 1.f / 12.92f, 1.f}}};
-    static const XMVECTORF32 Scale = {
-        {{1.f / 1.055f, 1.f / 1.055f, 1.f / 1.055f, 1.f}}};
-    static const XMVECTORF32 Bias = {{{0.055f, 0.055f, 0.055f, 0.f}}};
-    static const XMVECTORF32 Gamma = {{{2.4f, 2.4f, 2.4f, 1.f}}};
-
-    XMVECTOR V = XMVectorSaturate(srgb);
-    XMVECTOR V0 = XMVectorMultiply(V, ILinear);
-    XMVECTOR V1 =
-        XMVectorPow(XMVectorMultiply(XMVectorAdd(V, Bias), Scale), Gamma);
-    XMVECTOR select = XMVectorGreater(V, Cutoff);
-    V = XMVectorSelect(V0, V1, select);
-    return XMVectorSelect(srgb, V, g_XMSelect1110);
-}
-
-/****************************************************************************
- *
- * Miscellaneous
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline bool XMVerifyCPUSupport() noexcept {
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    int CPUInfo[4] = {-1};
-#if (defined(__clang__) || defined(__GNUC__)) && defined(__cpuid)
-    __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
-#else
-    __cpuid(CPUInfo, 0);
-#endif
-
-#ifdef __AVX2__
-    if (CPUInfo[0] < 7) return false;
-#else
-    if (CPUInfo[0] < 1) return false;
-#endif
-
-#if (defined(__clang__) || defined(__GNUC__)) && defined(__cpuid)
-    __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
-#else
-    __cpuid(CPUInfo, 1);
-#endif
-
-#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_)
-    // The compiler can emit FMA3 instructions even without explicit intrinsics
-    // use
-    if ((CPUInfo[2] & 0x38081001) != 0x38081001)
-        return false;  // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
-#elif defined(_XM_FMA3_INTRINSICS_) && defined(_XM_F16C_INTRINSICS_)
-    if ((CPUInfo[2] & 0x38081001) != 0x38081001)
-        return false;  // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
-#elif defined(_XM_FMA3_INTRINSICS_)
-    if ((CPUInfo[2] & 0x18081001) != 0x18081001)
-        return false;  // No AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
-#elif defined(_XM_F16C_INTRINSICS_)
-    if ((CPUInfo[2] & 0x38080001) != 0x38080001)
-        return false;  // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support
-#elif defined(__AVX__) || defined(_XM_AVX_INTRINSICS_)
-    if ((CPUInfo[2] & 0x18080001) != 0x18080001)
-        return false;  // No AVX/OSXSAVE/SSE4.1/SSE3 support
-#elif defined(_XM_SSE4_INTRINSICS_)
-    if ((CPUInfo[2] & 0x80001) != 0x80001)
-        return false;  // No SSE3/SSE4.1 support
-#elif defined(_XM_SSE3_INTRINSICS_)
-    if (!(CPUInfo[2] & 0x1)) return false;  // No SSE3 support
-#endif
-
-    // The x64 processor model requires SSE2 support, but no harm in checking
-    if ((CPUInfo[3] & 0x6000000) != 0x6000000)
-        return false;  // No SSE2/SSE support
-
-#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_)
-#if defined(__clang__) || defined(__GNUC__)
-    __cpuid_count(7, 0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
-#else
-    __cpuidex(CPUInfo, 7, 0);
-#endif
-    if (!(CPUInfo[1] & 0x20)) return false;  // No AVX2 support
-#endif
-
-    return true;
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    // ARM-NEON support is required for the Windows on ARM platform
-    return true;
-#else
-    // No intrinsics path always supported
-    return true;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMFresnelTerm(FXMVECTOR CosIncidentAngle,
-                                          FXMVECTOR RefractionIndex) noexcept {
-    assert(!XMVector4IsInfinite(CosIncidentAngle));
-
-    // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g -
-    // c) + 1)^2 + 1) where c = CosIncidentAngle g = sqrt(c^2 +
-    // RefractionIndex^2 - 1)
-
-#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
-
-    XMVECTOR G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex,
-                                     g_XMNegativeOne.v);
-    G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G);
-    G = XMVectorAbs(G);
-    G = XMVectorSqrt(G);
-
-    XMVECTOR S = XMVectorAdd(G, CosIncidentAngle);
-    XMVECTOR D = XMVectorSubtract(G, CosIncidentAngle);
-
-    XMVECTOR V0 = XMVectorMultiply(D, D);
-    XMVECTOR V1 = XMVectorMultiply(S, S);
-    V1 = XMVectorReciprocal(V1);
-    V0 = XMVectorMultiply(g_XMOneHalf.v, V0);
-    V0 = XMVectorMultiply(V0, V1);
-
-    XMVECTOR V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v);
-    XMVECTOR V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v);
-    V2 = XMVectorMultiply(V2, V2);
-    V3 = XMVectorMultiply(V3, V3);
-    V3 = XMVectorReciprocal(V3);
-    V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v);
-
-    XMVECTOR Result = XMVectorMultiply(V0, V2);
-
-    Result = XMVectorSaturate(Result);
-
-    return Result;
-
-#elif defined(_XM_SSE_INTRINSICS_)
-    // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2))
-    XMVECTOR G = _mm_mul_ps(RefractionIndex, RefractionIndex);
-    XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle, CosIncidentAngle);
-    G = _mm_sub_ps(G, g_XMOne);
-    vTemp = _mm_add_ps(vTemp, G);
-    // max((0-vTemp),vTemp) == abs(vTemp)
-    // The abs is needed to deal with refraction and cosine being zero
-    G = _mm_setzero_ps();
-    G = _mm_sub_ps(G, vTemp);
-    G = _mm_max_ps(G, vTemp);
-    // Last operation, the sqrt()
-    G = _mm_sqrt_ps(G);
-
-    // Calc G-C and G+C
-    XMVECTOR GAddC = _mm_add_ps(G, CosIncidentAngle);
-    XMVECTOR GSubC = _mm_sub_ps(G, CosIncidentAngle);
-    // Perform the term (0.5f *(g - c)^2) / (g + c)^2
-    XMVECTOR vResult = _mm_mul_ps(GSubC, GSubC);
-    vTemp = _mm_mul_ps(GAddC, GAddC);
-    vResult = _mm_mul_ps(vResult, g_XMOneHalf);
-    vResult = _mm_div_ps(vResult, vTemp);
-    // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1)
-    GAddC = _mm_mul_ps(GAddC, CosIncidentAngle);
-    GSubC = _mm_mul_ps(GSubC, CosIncidentAngle);
-    GAddC = _mm_sub_ps(GAddC, g_XMOne);
-    GSubC = _mm_add_ps(GSubC, g_XMOne);
-    GAddC = _mm_mul_ps(GAddC, GAddC);
-    GSubC = _mm_mul_ps(GSubC, GSubC);
-    GAddC = _mm_div_ps(GAddC, GSubC);
-    GAddC = _mm_add_ps(GAddC, g_XMOne);
-    // Multiply the two term parts
-    vResult = _mm_mul_ps(vResult, GAddC);
-    // Clamp to 0.0 - 1.0f
-    vResult = _mm_max_ps(vResult, g_XMZero);
-    vResult = _mm_min_ps(vResult, g_XMOne);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XMScalarNearEqual(float S1, float S2, float Epsilon) noexcept {
-    float Delta = S1 - S2;
-    return (fabsf(Delta) <= Epsilon);
-}
-
-//------------------------------------------------------------------------------
-// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI
-inline float XMScalarModAngle(float Angle) noexcept {
-    // Note: The modulo is performed with unsigned math only to work
-    // around a precision error on numbers that are close to PI
-
-    // Normalize the range from 0.0f to XM_2PI
-    Angle = Angle + XM_PI;
-    // Perform the modulo, unsigned
-    float fTemp = fabsf(Angle);
-    fTemp = fTemp -
-            (XM_2PI * static_cast<float>(static_cast<int32_t>(fTemp / XM_2PI)));
-    // Restore the number to the range of -XM_PI to XM_PI-epsilon
-    fTemp = fTemp - XM_PI;
-    // If the modulo'd value was negative, restore negation
-    if (Angle < 0.0f) {
-        fTemp = -fTemp;
-    }
-    return fTemp;
-}
-
-//------------------------------------------------------------------------------
-
-inline float XMScalarSin(float Value) noexcept {
-    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
-    float quotient = XM_1DIV2PI * Value;
-    if (Value >= 0.0f) {
-        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
-    } else {
-        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
-    }
-    float y = Value - XM_2PI * quotient;
-
-    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
-    if (y > XM_PIDIV2) {
-        y = XM_PI - y;
-    } else if (y < -XM_PIDIV2) {
-        y = -XM_PI - y;
-    }
-
-    // 11-degree minimax approximation
-    float y2 = y * y;
-    return (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) *
-                  y2 +
-              0.0083333310f) *
-                 y2 -
-             0.16666667f) *
-                y2 +
-            1.0f) *
-           y;
-}
-
-//------------------------------------------------------------------------------
-
-inline float XMScalarSinEst(float Value) noexcept {
-    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
-    float quotient = XM_1DIV2PI * Value;
-    if (Value >= 0.0f) {
-        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
-    } else {
-        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
-    }
-    float y = Value - XM_2PI * quotient;
-
-    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
-    if (y > XM_PIDIV2) {
-        y = XM_PI - y;
-    } else if (y < -XM_PIDIV2) {
-        y = -XM_PI - y;
-    }
-
-    // 7-degree minimax approximation
-    float y2 = y * y;
-    return (((-0.00018524670f * y2 + 0.0083139502f) * y2 - 0.16665852f) * y2 +
-            1.0f) *
-           y;
-}
-
-//------------------------------------------------------------------------------
-
-inline float XMScalarCos(float Value) noexcept {
-    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
-    float quotient = XM_1DIV2PI * Value;
-    if (Value >= 0.0f) {
-        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
-    } else {
-        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
-    }
-    float y = Value - XM_2PI * quotient;
-
-    // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x).
-    float sign;
-    if (y > XM_PIDIV2) {
-        y = XM_PI - y;
-        sign = -1.0f;
-    } else if (y < -XM_PIDIV2) {
-        y = -XM_PI - y;
-        sign = -1.0f;
-    } else {
-        sign = +1.0f;
-    }
-
-    // 10-degree minimax approximation
-    float y2 = y * y;
-    float p =
-        ((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 +
-          0.041666638f) *
-             y2 -
-         0.5f) *
-            y2 +
-        1.0f;
-    return sign * p;
-}
-
-//------------------------------------------------------------------------------
-
-inline float XMScalarCosEst(float Value) noexcept {
-    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
-    float quotient = XM_1DIV2PI * Value;
-    if (Value >= 0.0f) {
-        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
-    } else {
-        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
-    }
-    float y = Value - XM_2PI * quotient;
-
-    // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x).
-    float sign;
-    if (y > XM_PIDIV2) {
-        y = XM_PI - y;
-        sign = -1.0f;
-    } else if (y < -XM_PIDIV2) {
-        y = -XM_PI - y;
-        sign = -1.0f;
-    } else {
-        sign = +1.0f;
-    }
-
-    // 6-degree minimax approximation
-    float y2 = y * y;
-    float p =
-        ((-0.0012712436f * y2 + 0.041493919f) * y2 - 0.49992746f) * y2 + 1.0f;
-    return sign * p;
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_ inline void XMScalarSinCos(float* pSin, float* pCos,
-                                                  float Value) noexcept {
-    assert(pSin);
-    assert(pCos);
-
-    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
-    float quotient = XM_1DIV2PI * Value;
-    if (Value >= 0.0f) {
-        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
-    } else {
-        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
-    }
-    float y = Value - XM_2PI * quotient;
-
-    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
-    float sign;
-    if (y > XM_PIDIV2) {
-        y = XM_PI - y;
-        sign = -1.0f;
-    } else if (y < -XM_PIDIV2) {
-        y = -XM_PI - y;
-        sign = -1.0f;
-    } else {
-        sign = +1.0f;
-    }
-
-    float y2 = y * y;
-
-    // 11-degree minimax approximation
-    *pSin = (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) *
-                   y2 +
-               0.0083333310f) *
-                  y2 -
-              0.16666667f) *
-                 y2 +
-             1.0f) *
-            y;
-
-    // 10-degree minimax approximation
-    float p =
-        ((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 +
-          0.041666638f) *
-             y2 -
-         0.5f) *
-            y2 +
-        1.0f;
-    *pCos = sign * p;
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_ inline void XMScalarSinCosEst(float* pSin, float* pCos,
-                                                     float Value) noexcept {
-    assert(pSin);
-    assert(pCos);
-
-    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
-    float quotient = XM_1DIV2PI * Value;
-    if (Value >= 0.0f) {
-        quotient = static_cast<float>(static_cast<int>(quotient + 0.5f));
-    } else {
-        quotient = static_cast<float>(static_cast<int>(quotient - 0.5f));
-    }
-    float y = Value - XM_2PI * quotient;
-
-    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
-    float sign;
-    if (y > XM_PIDIV2) {
-        y = XM_PI - y;
-        sign = -1.0f;
-    } else if (y < -XM_PIDIV2) {
-        y = -XM_PI - y;
-        sign = -1.0f;
-    } else {
-        sign = +1.0f;
-    }
-
-    float y2 = y * y;
-
-    // 7-degree minimax approximation
-    *pSin = (((-0.00018524670f * y2 + 0.0083139502f) * y2 - 0.16665852f) * y2 +
-             1.0f) *
-            y;
-
-    // 6-degree minimax approximation
-    float p =
-        ((-0.0012712436f * y2 + 0.041493919f) * y2 - 0.49992746f) * y2 + 1.0f;
-    *pCos = sign * p;
-}
-
-//------------------------------------------------------------------------------
-
-inline float XMScalarASin(float Value) noexcept {
-    // Clamp input to [-1,1].
-    bool nonnegative = (Value >= 0.0f);
-    float x = fabsf(Value);
-    float omx = 1.0f - x;
-    if (omx < 0.0f) {
-        omx = 0.0f;
-    }
-    float root = sqrtf(omx);
-
-    // 7-degree minimax approximation
-    float result =
-        ((((((-0.0012624911f * x + 0.0066700901f) * x - 0.0170881256f) * x +
-            0.0308918810f) *
-               x -
-           0.0501743046f) *
-              x +
-          0.0889789874f) *
-             x -
-         0.2145988016f) *
-            x +
-        1.5707963050f;
-    result *= root;  // acos(|x|)
-
-    // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x)
-    return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2);
-}
-
-//------------------------------------------------------------------------------
-
-inline float XMScalarASinEst(float Value) noexcept {
-    // Clamp input to [-1,1].
-    bool nonnegative = (Value >= 0.0f);
-    float x = fabsf(Value);
-    float omx = 1.0f - x;
-    if (omx < 0.0f) {
-        omx = 0.0f;
-    }
-    float root = sqrtf(omx);
-
-    // 3-degree minimax approximation
-    float result =
-        ((-0.0187293f * x + 0.0742610f) * x - 0.2121144f) * x + 1.5707288f;
-    result *= root;  // acos(|x|)
-
-    // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x)
-    return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2);
-}
-
-//------------------------------------------------------------------------------
-
-inline float XMScalarACos(float Value) noexcept {
-    // Clamp input to [-1,1].
-    bool nonnegative = (Value >= 0.0f);
-    float x = fabsf(Value);
-    float omx = 1.0f - x;
-    if (omx < 0.0f) {
-        omx = 0.0f;
-    }
-    float root = sqrtf(omx);
-
-    // 7-degree minimax approximation
-    float result =
-        ((((((-0.0012624911f * x + 0.0066700901f) * x - 0.0170881256f) * x +
-            0.0308918810f) *
-               x -
-           0.0501743046f) *
-              x +
-          0.0889789874f) *
-             x -
-         0.2145988016f) *
-            x +
-        1.5707963050f;
-    result *= root;
-
-    // acos(x) = pi - acos(-x) when x < 0
-    return (nonnegative ? result : XM_PI - result);
-}
-
-//------------------------------------------------------------------------------
-
-inline float XMScalarACosEst(float Value) noexcept {
-    // Clamp input to [-1,1].
-    bool nonnegative = (Value >= 0.0f);
-    float x = fabsf(Value);
-    float omx = 1.0f - x;
-    if (omx < 0.0f) {
-        omx = 0.0f;
-    }
-    float root = sqrtf(omx);
-
-    // 3-degree minimax approximation
-    float result =
-        ((-0.0187293f * x + 0.0742610f) * x - 0.2121144f) * x + 1.5707288f;
-    result *= root;
-
-    // acos(x) = pi - acos(-x) when x < 0
-    return (nonnegative ? result : XM_PI - result);
-}
diff --git a/targets/app/linux/Stubs/DirectXMath/DirectXMathVector.inl b/targets/app/linux/Stubs/DirectXMath/DirectXMathVector.inl
deleted file mode 100644
index be289e5a6..000000000
--- a/targets/app/linux/Stubs/DirectXMath/DirectXMathVector.inl
+++ /dev/null
@@ -1,14000 +0,0 @@
-//-------------------------------------------------------------------------------------
-// DirectXMathVector.inl -- SIMD C++ Math library
-//
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT License.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#pragma once
-
-#if defined(_XM_NO_INTRINSICS_)
-#define XMISNAN(x) isnan(x)
-#define XMISINF(x) isinf(x)
-#endif
-
-#if defined(_XM_SSE_INTRINSICS_)
-
-#define XM3UNPACK3INTO4(l1, l2, l3)                                \
-    XMVECTOR V3 = _mm_shuffle_ps(l2, l3, _MM_SHUFFLE(0, 0, 3, 2)); \
-    XMVECTOR V2 = _mm_shuffle_ps(l2, l1, _MM_SHUFFLE(3, 3, 1, 0)); \
-    V2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 0, 2));               \
-    XMVECTOR V4 = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(L3), 32 / 8))
-
-#define XM3PACK4INTO3(v2x)                                 \
-    v2x = _mm_shuffle_ps(V2, V3, _MM_SHUFFLE(1, 0, 2, 1)); \
-    V2 = _mm_shuffle_ps(V2, V1, _MM_SHUFFLE(2, 2, 0, 0));  \
-    V1 = _mm_shuffle_ps(V1, V2, _MM_SHUFFLE(0, 2, 1, 0));  \
-    V3 = _mm_shuffle_ps(V3, V4, _MM_SHUFFLE(0, 0, 2, 2));  \
-    V3 = _mm_shuffle_ps(V3, V4, _MM_SHUFFLE(2, 1, 2, 0))
-
-#endif
-
-/****************************************************************************
- *
- * General Vector
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-// Assignment operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-// Return a vector with all elements equaling zero
-inline XMVECTOR XM_CALLCONV XMVectorZero() noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {{{0.0f, 0.0f, 0.0f, 0.0f}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vdupq_n_f32(0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_setzero_ps();
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Initialize a vector with four floating point values
-inline XMVECTOR XM_CALLCONV XMVectorSet(float x, float y, float z,
-                                        float w) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {{{x, y, z, w}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t V0 = vcreate_f32(
-        static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&x)) |
-        (static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&y)) << 32));
-    float32x2_t V1 = vcreate_f32(
-        static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&z)) |
-        (static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&w)) << 32));
-    return vcombine_f32(V0, V1);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_set_ps(w, z, y, x);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Initialize a vector with four integer values
-inline XMVECTOR XM_CALLCONV XMVectorSetInt(uint32_t x, uint32_t y, uint32_t z,
-                                           uint32_t w) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 vResult = {{{x, y, z, w}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t V0 = vcreate_u32(static_cast<uint64_t>(x) |
-                                (static_cast<uint64_t>(y) << 32));
-    uint32x2_t V1 = vcreate_u32(static_cast<uint64_t>(z) |
-                                (static_cast<uint64_t>(w) << 32));
-    return vreinterpretq_f32_u32(vcombine_u32(V0, V1));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_set_epi32(static_cast<int>(w), static_cast<int>(z),
-                              static_cast<int>(y), static_cast<int>(x));
-    return _mm_castsi128_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Initialize a vector with a replicated floating point value
-inline XMVECTOR XM_CALLCONV XMVectorReplicate(float Value) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult;
-    vResult.f[0] = vResult.f[1] = vResult.f[2] = vResult.f[3] = Value;
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vdupq_n_f32(Value);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_set_ps1(Value);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Initialize a vector with a replicated floating point value passed by pointer
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorReplicatePtr(const float* pValue) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    float Value = pValue[0];
-    XMVECTORF32 vResult;
-    vResult.f[0] = vResult.f[1] = vResult.f[2] = vResult.f[3] = Value;
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vld1q_dup_f32(pValue);
-#elif defined(_XM_AVX_INTRINSICS_)
-    return _mm_broadcast_ss(pValue);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_load_ps1(pValue);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Initialize a vector with a replicated integer value
-inline XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 vResult;
-    vResult.u[0] = vResult.u[1] = vResult.u[2] = vResult.u[3] = Value;
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vdupq_n_u32(Value));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_set1_epi32(static_cast<int>(Value));
-    return _mm_castsi128_ps(vTemp);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Initialize a vector with a replicated integer value passed by pointer
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorReplicateIntPtr(const uint32_t* pValue) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    uint32_t Value = pValue[0];
-    XMVECTORU32 vResult;
-    vResult.u[0] = vResult.u[1] = vResult.u[2] = vResult.u[3] = Value;
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vld1q_dup_u32(pValue));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_load_ps1(reinterpret_cast<const float*>(pValue));
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Initialize a vector with all bits set (true mask)
-inline XMVECTOR XM_CALLCONV XMVectorTrueInt() noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 vResult = {
-        {{0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_s32(vdupq_n_s32(-1));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_set1_epi32(-1);
-    return _mm_castsi128_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Initialize a vector with all bits clear (false mask)
-inline XMVECTOR XM_CALLCONV XMVectorFalseInt() noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {{{0.0f, 0.0f, 0.0f, 0.0f}}};
-    return vResult;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vdupq_n_u32(0));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_setzero_ps();
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Replicate the x component of the vector
-inline XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult;
-    vResult.f[0] = vResult.f[1] = vResult.f[2] = vResult.f[3] =
-        V.vector4_f32[0];
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vdupq_lane_f32(vget_low_f32(V), 0);
-#elif defined(_XM_AVX2_INTRINSICS_) && defined(_XM_FAVOR_INTEL_)
-    return _mm_broadcastss_ps(V);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Replicate the y component of the vector
-inline XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult;
-    vResult.f[0] = vResult.f[1] = vResult.f[2] = vResult.f[3] =
-        V.vector4_f32[1];
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vdupq_lane_f32(vget_low_f32(V), 1);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Replicate the z component of the vector
-inline XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult;
-    vResult.f[0] = vResult.f[1] = vResult.f[2] = vResult.f[3] =
-        V.vector4_f32[2];
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vdupq_lane_f32(vget_high_f32(V), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Replicate the w component of the vector
-inline XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult;
-    vResult.f[0] = vResult.f[1] = vResult.f[2] = vResult.f[3] =
-        V.vector4_f32[3];
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vdupq_lane_f32(vget_high_f32(V), 1);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Return a vector of 1.0f,1.0f,1.0f,1.0f
-inline XMVECTOR XM_CALLCONV XMVectorSplatOne() noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult;
-    vResult.f[0] = vResult.f[1] = vResult.f[2] = vResult.f[3] = 1.0f;
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vdupq_n_f32(1.0f);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return g_XMOne;
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Return a vector of INF,INF,INF,INF
-inline XMVECTOR XM_CALLCONV XMVectorSplatInfinity() noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 vResult;
-    vResult.u[0] = vResult.u[1] = vResult.u[2] = vResult.u[3] = 0x7F800000;
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vdupq_n_u32(0x7F800000));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return g_XMInfinity;
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Return a vector of Q_NAN,Q_NAN,Q_NAN,Q_NAN
-inline XMVECTOR XM_CALLCONV XMVectorSplatQNaN() noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 vResult;
-    vResult.u[0] = vResult.u[1] = vResult.u[2] = vResult.u[3] = 0x7FC00000;
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vdupq_n_u32(0x7FC00000));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return g_XMQNaN;
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Return a vector
-// of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f
-inline XMVECTOR XM_CALLCONV XMVectorSplatEpsilon() noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 vResult;
-    vResult.u[0] = vResult.u[1] = vResult.u[2] = vResult.u[3] = 0x34000000;
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vdupq_n_u32(0x34000000));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return g_XMEpsilon;
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f
-inline XMVECTOR XM_CALLCONV XMVectorSplatSignMask() noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 vResult;
-    vResult.u[0] = vResult.u[1] = vResult.u[2] = vResult.u[3] = 0x80000000U;
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vdupq_n_u32(0x80000000U));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_set1_epi32(static_cast<int>(0x80000000));
-    return _mm_castsi128_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Return a floating point value via an index. This is not a recommended
-// function to use due to performance loss.
-inline float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) noexcept {
-    assert(i < 4);
-    _Analysis_assume_(i < 4);
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_f32[i];
-#else
-    XMVECTORF32 U;
-    U.v = V;
-    return U.f[i];
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Return the X component in an FPU register.
-inline float XM_CALLCONV XMVectorGetX(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_f32[0];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_f32(V, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_cvtss_f32(V);
-#endif
-}
-
-// Return the Y component in an FPU register.
-inline float XM_CALLCONV XMVectorGetY(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_f32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_f32(V, 1);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-    return _mm_cvtss_f32(vTemp);
-#endif
-}
-
-// Return the Z component in an FPU register.
-inline float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_f32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_f32(V, 2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-    return _mm_cvtss_f32(vTemp);
-#endif
-}
-
-// Return the W component in an FPU register.
-inline float XM_CALLCONV XMVectorGetW(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_f32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_f32(V, 3);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-    return _mm_cvtss_f32(vTemp);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Store a component indexed by i into a 32 bit float location in memory.
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMVectorGetByIndexPtr(float* f, FXMVECTOR V, size_t i) noexcept {
-    assert(f != nullptr);
-    assert(i < 4);
-    _Analysis_assume_(i < 4);
-#if defined(_XM_NO_INTRINSICS_)
-    *f = V.vector4_f32[i];
-#else
-    XMVECTORF32 U;
-    U.v = V;
-    *f = U.f[i];
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Store the X component into a 32 bit float location in memory.
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMVectorGetXPtr(float* x, FXMVECTOR V) noexcept {
-    assert(x != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    *x = V.vector4_f32[0];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_f32(x, V, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ss(x, V);
-#endif
-}
-
-// Store the Y component into a 32 bit float location in memory.
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMVectorGetYPtr(float* y, FXMVECTOR V) noexcept {
-    assert(y != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    *y = V.vector4_f32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_f32(y, V, 1);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    *(reinterpret_cast<int*>(y)) = _mm_extract_ps(V, 1);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-    _mm_store_ss(y, vResult);
-#endif
-}
-
-// Store the Z component into a 32 bit float location in memory.
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMVectorGetZPtr(float* z, FXMVECTOR V) noexcept {
-    assert(z != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    *z = V.vector4_f32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_f32(z, V, 2);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    *(reinterpret_cast<int*>(z)) = _mm_extract_ps(V, 2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-    _mm_store_ss(z, vResult);
-#endif
-}
-
-// Store the W component into a 32 bit float location in memory.
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMVectorGetWPtr(float* w, FXMVECTOR V) noexcept {
-    assert(w != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    *w = V.vector4_f32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_f32(w, V, 3);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    *(reinterpret_cast<int*>(w)) = _mm_extract_ps(V, 3);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-    _mm_store_ss(w, vResult);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Return an integer value via an index. This is not a recommended
-// function to use due to performance loss.
-inline uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V,
-                                                  size_t i) noexcept {
-    assert(i < 4);
-    _Analysis_assume_(i < 4);
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_u32[i];
-#else
-    XMVECTORU32 U;
-    U.v = V;
-    return U.u[i];
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Return the X component in an integer register.
-inline uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_u32[0];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(V)));
-#endif
-}
-
-// Return the Y component in an integer register.
-inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_u32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 1);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128(V);
-    return static_cast<uint32_t>(_mm_extract_epi32(V1, 1));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vResulti =
-        _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(1, 1, 1, 1));
-    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
-#endif
-}
-
-// Return the Z component in an integer register.
-inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_u32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 2);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128(V);
-    return static_cast<uint32_t>(_mm_extract_epi32(V1, 2));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vResulti =
-        _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(2, 2, 2, 2));
-    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
-#endif
-}
-
-// Return the W component in an integer register.
-inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return V.vector4_u32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 3);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128(V);
-    return static_cast<uint32_t>(_mm_extract_epi32(V1, 3));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vResulti =
-        _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(3, 3, 3, 3));
-    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Store a component indexed by i into a 32 bit integer location in memory.
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMVectorGetIntByIndexPtr(uint32_t* x, FXMVECTOR V, size_t i) noexcept {
-    assert(x != nullptr);
-    assert(i < 4);
-    _Analysis_assume_(i < 4);
-#if defined(_XM_NO_INTRINSICS_)
-    *x = V.vector4_u32[i];
-#else
-    XMVECTORU32 U;
-    U.v = V;
-    *x = U.u[i];
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Store the X component into a 32 bit integer location in memory.
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMVectorGetIntXPtr(uint32_t* x, FXMVECTOR V) noexcept {
-    assert(x != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    *x = V.vector4_u32[0];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32(x, *reinterpret_cast<const uint32x4_t*>(&V), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    _mm_store_ss(reinterpret_cast<float*>(x), V);
-#endif
-}
-
-// Store the Y component into a 32 bit integer location in memory.
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMVectorGetIntYPtr(uint32_t* y, FXMVECTOR V) noexcept {
-    assert(y != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    *y = V.vector4_u32[1];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32(y, *reinterpret_cast<const uint32x4_t*>(&V), 1);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128(V);
-    *y = static_cast<uint32_t>(_mm_extract_epi32(V1, 1));
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-    _mm_store_ss(reinterpret_cast<float*>(y), vResult);
-#endif
-}
-
-// Store the Z component into a 32 bit integer locaCantion in memory.
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMVectorGetIntZPtr(uint32_t* z, FXMVECTOR V) noexcept {
-    assert(z != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    *z = V.vector4_u32[2];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32(z, *reinterpret_cast<const uint32x4_t*>(&V), 2);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128(V);
-    *z = static_cast<uint32_t>(_mm_extract_epi32(V1, 2));
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-    _mm_store_ss(reinterpret_cast<float*>(z), vResult);
-#endif
-}
-
-// Store the W component into a 32 bit integer location in memory.
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMVectorGetIntWPtr(uint32_t* w, FXMVECTOR V) noexcept {
-    assert(w != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    *w = V.vector4_u32[3];
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    vst1q_lane_u32(w, *reinterpret_cast<const uint32x4_t*>(&V), 3);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i V1 = _mm_castps_si128(V);
-    *w = static_cast<uint32_t>(_mm_extract_epi32(V1, 3));
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-    _mm_store_ss(reinterpret_cast<float*>(w), vResult);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Set a single indexed floating point component
-inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f,
-                                               size_t i) noexcept {
-    assert(i < 4);
-    _Analysis_assume_(i < 4);
-    XMVECTORF32 U;
-    U.v = V;
-    U.f[i] = f;
-    return U.v;
-}
-
-//------------------------------------------------------------------------------
-
-// Sets the X component of a vector to a passed floating point value
-inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 U = {
-        {{x, V.vector4_f32[1], V.vector4_f32[2], V.vector4_f32[3]}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vsetq_lane_f32(x, V, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = _mm_set_ss(x);
-    vResult = _mm_move_ss(V, vResult);
-    return vResult;
-#endif
-}
-
-// Sets the Y component of a vector to a passed floating point value
-inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 U = {
-        {{V.vector4_f32[0], y, V.vector4_f32[2], V.vector4_f32[3]}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vsetq_lane_f32(y, V, 1);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vResult = _mm_set_ss(y);
-    vResult = _mm_insert_ps(V, vResult, 0x10);
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap y and x
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
-    // Convert input to vector
-    XMVECTOR vTemp = _mm_set_ss(y);
-    // Replace the x component
-    vResult = _mm_move_ss(vResult, vTemp);
-    // Swap y and x again
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
-    return vResult;
-#endif
-}
-// Sets the Z component of a vector to a passed floating point value
-inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 U = {
-        {{V.vector4_f32[0], V.vector4_f32[1], z, V.vector4_f32[3]}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vsetq_lane_f32(z, V, 2);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vResult = _mm_set_ss(z);
-    vResult = _mm_insert_ps(V, vResult, 0x20);
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap z and x
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
-    // Convert input to vector
-    XMVECTOR vTemp = _mm_set_ss(z);
-    // Replace the x component
-    vResult = _mm_move_ss(vResult, vTemp);
-    // Swap z and x again
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
-    return vResult;
-#endif
-}
-
-// Sets the W component of a vector to a passed floating point value
-inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 U = {
-        {{V.vector4_f32[0], V.vector4_f32[1], V.vector4_f32[2], w}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vsetq_lane_f32(w, V, 3);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vResult = _mm_set_ss(w);
-    vResult = _mm_insert_ps(V, vResult, 0x30);
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap w and x
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
-    // Convert input to vector
-    XMVECTOR vTemp = _mm_set_ss(w);
-    // Replace the x component
-    vResult = _mm_move_ss(vResult, vTemp);
-    // Swap w and x again
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Sets a component of a vector to a floating point value passed by pointer
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorSetByIndexPtr(FXMVECTOR V, const float* f, size_t i) noexcept {
-    assert(f != nullptr);
-    assert(i < 4);
-    _Analysis_assume_(i < 4);
-    XMVECTORF32 U;
-    U.v = V;
-    U.f[i] = *f;
-    return U.v;
-}
-
-//------------------------------------------------------------------------------
-
-// Sets the X component of a vector to a floating point value passed by pointer
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorSetXPtr(FXMVECTOR V, const float* x) noexcept {
-    assert(x != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 U = {
-        {{*x, V.vector4_f32[1], V.vector4_f32[2], V.vector4_f32[3]}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vld1q_lane_f32(x, V, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = _mm_load_ss(x);
-    vResult = _mm_move_ss(V, vResult);
-    return vResult;
-#endif
-}
-
-// Sets the Y component of a vector to a floating point value passed by pointer
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorSetYPtr(FXMVECTOR V, const float* y) noexcept {
-    assert(y != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 U = {
-        {{V.vector4_f32[0], *y, V.vector4_f32[2], V.vector4_f32[3]}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vld1q_lane_f32(y, V, 1);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap y and x
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
-    // Convert input to vector
-    XMVECTOR vTemp = _mm_load_ss(y);
-    // Replace the x component
-    vResult = _mm_move_ss(vResult, vTemp);
-    // Swap y and x again
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
-    return vResult;
-#endif
-}
-
-// Sets the Z component of a vector to a floating point value passed by pointer
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorSetZPtr(FXMVECTOR V, const float* z) noexcept {
-    assert(z != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 U = {
-        {{V.vector4_f32[0], V.vector4_f32[1], *z, V.vector4_f32[3]}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vld1q_lane_f32(z, V, 2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap z and x
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
-    // Convert input to vector
-    XMVECTOR vTemp = _mm_load_ss(z);
-    // Replace the x component
-    vResult = _mm_move_ss(vResult, vTemp);
-    // Swap z and x again
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
-    return vResult;
-#endif
-}
-
-// Sets the W component of a vector to a floating point value passed by pointer
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorSetWPtr(FXMVECTOR V, const float* w) noexcept {
-    assert(w != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 U = {
-        {{V.vector4_f32[0], V.vector4_f32[1], V.vector4_f32[2], *w}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vld1q_lane_f32(w, V, 3);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap w and x
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
-    // Convert input to vector
-    XMVECTOR vTemp = _mm_load_ss(w);
-    // Replace the x component
-    vResult = _mm_move_ss(vResult, vTemp);
-    // Swap w and x again
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Sets a component of a vector to an integer passed by value
-inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x,
-                                                  size_t i) noexcept {
-    assert(i < 4);
-    _Analysis_assume_(i < 4);
-    XMVECTORU32 tmp;
-    tmp.v = V;
-    tmp.u[i] = x;
-    return tmp;
-}
-
-//------------------------------------------------------------------------------
-
-// Sets the X component of a vector to an integer passed by value
-inline XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 U = {
-        {{x, V.vector4_u32[1], V.vector4_u32[2], V.vector4_u32[3]}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(
-        vsetq_lane_u32(x, vreinterpretq_u32_f32(V), 0));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_cvtsi32_si128(static_cast<int>(x));
-    XMVECTOR vResult = _mm_move_ss(V, _mm_castsi128_ps(vTemp));
-    return vResult;
-#endif
-}
-
-// Sets the Y component of a vector to an integer passed by value
-inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 U = {
-        {{V.vector4_u32[0], y, V.vector4_u32[2], V.vector4_u32[3]}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(
-        vsetq_lane_u32(y, vreinterpretq_u32_f32(V), 1));
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i vResult = _mm_castps_si128(V);
-    vResult = _mm_insert_epi32(vResult, static_cast<int>(y), 1);
-    return _mm_castsi128_ps(vResult);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap y and x
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
-    // Convert input to vector
-    __m128i vTemp = _mm_cvtsi32_si128(static_cast<int>(y));
-    // Replace the x component
-    vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp));
-    // Swap y and x again
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
-    return vResult;
-#endif
-}
-
-// Sets the Z component of a vector to an integer passed by value
-inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 U = {
-        {{V.vector4_u32[0], V.vector4_u32[1], z, V.vector4_u32[3]}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(
-        vsetq_lane_u32(z, vreinterpretq_u32_f32(V), 2));
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i vResult = _mm_castps_si128(V);
-    vResult = _mm_insert_epi32(vResult, static_cast<int>(z), 2);
-    return _mm_castsi128_ps(vResult);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap z and x
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
-    // Convert input to vector
-    __m128i vTemp = _mm_cvtsi32_si128(static_cast<int>(z));
-    // Replace the x component
-    vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp));
-    // Swap z and x again
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
-    return vResult;
-#endif
-}
-
-// Sets the W component of a vector to an integer passed by value
-inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 U = {
-        {{V.vector4_u32[0], V.vector4_u32[1], V.vector4_u32[2], w}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(
-        vsetq_lane_u32(w, vreinterpretq_u32_f32(V), 3));
-#elif defined(_XM_SSE4_INTRINSICS_)
-    __m128i vResult = _mm_castps_si128(V);
-    vResult = _mm_insert_epi32(vResult, static_cast<int>(w), 3);
-    return _mm_castsi128_ps(vResult);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap w and x
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
-    // Convert input to vector
-    __m128i vTemp = _mm_cvtsi32_si128(static_cast<int>(w));
-    // Replace the x component
-    vResult = _mm_move_ss(vResult, _mm_castsi128_ps(vTemp));
-    // Swap w and x again
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-// Sets a component of a vector to an integer value passed by pointer
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t* x, size_t i) noexcept {
-    assert(x != nullptr);
-    assert(i < 4);
-    _Analysis_assume_(i < 4);
-    XMVECTORU32 tmp;
-    tmp.v = V;
-    tmp.u[i] = *x;
-    return tmp;
-}
-
-//------------------------------------------------------------------------------
-
-// Sets the X component of a vector to an integer value passed by pointer
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t* x) noexcept {
-    assert(x != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 U = {
-        {{*x, V.vector4_u32[1], V.vector4_u32[2], V.vector4_u32[3]}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(
-        vld1q_lane_u32(x, *reinterpret_cast<const uint32x4_t*>(&V), 0));
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(x));
-    XMVECTOR vResult = _mm_move_ss(V, vTemp);
-    return vResult;
-#endif
-}
-
-// Sets the Y component of a vector to an integer value passed by pointer
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t* y) noexcept {
-    assert(y != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 U = {
-        {{V.vector4_u32[0], *y, V.vector4_u32[2], V.vector4_u32[3]}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(
-        vld1q_lane_u32(y, *reinterpret_cast<const uint32x4_t*>(&V), 1));
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap y and x
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
-    // Convert input to vector
-    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(y));
-    // Replace the x component
-    vResult = _mm_move_ss(vResult, vTemp);
-    // Swap y and x again
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 2, 0, 1));
-    return vResult;
-#endif
-}
-
-// Sets the Z component of a vector to an integer value passed by pointer
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t* z) noexcept {
-    assert(z != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 U = {
-        {{V.vector4_u32[0], V.vector4_u32[1], *z, V.vector4_u32[3]}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(
-        vld1q_lane_u32(z, *reinterpret_cast<const uint32x4_t*>(&V), 2));
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap z and x
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
-    // Convert input to vector
-    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(z));
-    // Replace the x component
-    vResult = _mm_move_ss(vResult, vTemp);
-    // Swap z and x again
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
-    return vResult;
-#endif
-}
-
-// Sets the W component of a vector to an integer value passed by pointer
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t* w) noexcept {
-    assert(w != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORU32 U = {
-        {{V.vector4_u32[0], V.vector4_u32[1], V.vector4_u32[2], *w}}};
-    return U.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(
-        vld1q_lane_u32(w, *reinterpret_cast<const uint32x4_t*>(&V), 3));
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap w and x
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
-    // Convert input to vector
-    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float*>(w));
-    // Replace the x component
-    vResult = _mm_move_ss(vResult, vTemp);
-    // Swap w and x again
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 2, 1, 3));
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V, uint32_t E0,
-                                            uint32_t E1, uint32_t E2,
-                                            uint32_t E3) noexcept {
-    assert((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
-    _Analysis_assume_((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {{{V.vector4_f32[E0], V.vector4_f32[E1],
-                            V.vector4_f32[E2], V.vector4_f32[E3]}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const uint32_t ControlElement[4] = {
-        0x03020100,  // XM_SWIZZLE_X
-        0x07060504,  // XM_SWIZZLE_Y
-        0x0B0A0908,  // XM_SWIZZLE_Z
-        0x0F0E0D0C,  // XM_SWIZZLE_W
-    };
-
-    uint8x8x2_t tbl;
-    tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V));
-    tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V));
-
-    uint32x2_t idx =
-        vcreate_u32(static_cast<uint64_t>(ControlElement[E0]) |
-                    (static_cast<uint64_t>(ControlElement[E1]) << 32));
-    const uint8x8_t rL = vtbl2_u8(tbl, vreinterpret_u8_u32(idx));
-
-    idx = vcreate_u32(static_cast<uint64_t>(ControlElement[E2]) |
-                      (static_cast<uint64_t>(ControlElement[E3]) << 32));
-    const uint8x8_t rH = vtbl2_u8(tbl, vreinterpret_u8_u32(idx));
-
-    return vcombine_f32(vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH));
-#elif defined(_XM_AVX_INTRINSICS_)
-    unsigned int elem[4] = {E0, E1, E2, E3};
-    __m128i vControl =
-        _mm_loadu_si128(reinterpret_cast<const __m128i*>(&elem[0]));
-    return _mm_permutevar_ps(V, vControl);
-#else
-    auto aPtr = reinterpret_cast<const uint32_t*>(&V);
-
-    XMVECTOR Result;
-    auto pWork = reinterpret_cast<uint32_t*>(&Result);
-
-    pWork[0] = aPtr[E0];
-    pWork[1] = aPtr[E1];
-    pWork[2] = aPtr[E2];
-    pWork[3] = aPtr[E3];
-
-    return Result;
-#endif
-}
-
-//------------------------------------------------------------------------------
-inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2,
-                                            uint32_t PermuteX,
-                                            uint32_t PermuteY,
-                                            uint32_t PermuteZ,
-                                            uint32_t PermuteW) noexcept {
-    assert(PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7);
-    _Analysis_assume_(PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 &&
-                      PermuteW <= 7);
-
-#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    static const uint32_t ControlElement[8] = {
-        0x03020100,  // XM_PERMUTE_0X
-        0x07060504,  // XM_PERMUTE_0Y
-        0x0B0A0908,  // XM_PERMUTE_0Z
-        0x0F0E0D0C,  // XM_PERMUTE_0W
-        0x13121110,  // XM_PERMUTE_1X
-        0x17161514,  // XM_PERMUTE_1Y
-        0x1B1A1918,  // XM_PERMUTE_1Z
-        0x1F1E1D1C,  // XM_PERMUTE_1W
-    };
-
-    uint8x8x4_t tbl;
-    tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V1));
-    tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V1));
-    tbl.val[2] = vreinterpret_u8_f32(vget_low_f32(V2));
-    tbl.val[3] = vreinterpret_u8_f32(vget_high_f32(V2));
-
-    uint32x2_t idx =
-        vcreate_u32(static_cast<uint64_t>(ControlElement[PermuteX]) |
-                    (static_cast<uint64_t>(ControlElement[PermuteY]) << 32));
-    const uint8x8_t rL = vtbl4_u8(tbl, vreinterpret_u8_u32(idx));
-
-    idx = vcreate_u32(static_cast<uint64_t>(ControlElement[PermuteZ]) |
-                      (static_cast<uint64_t>(ControlElement[PermuteW]) << 32));
-    const uint8x8_t rH = vtbl4_u8(tbl, vreinterpret_u8_u32(idx));
-
-    return vcombine_f32(vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH));
-#elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    static const XMVECTORU32 three = {{{3, 3, 3, 3}}};
-
-    XM_ALIGNED_DATA(16)
-    unsigned int elem[4] = {PermuteX, PermuteY, PermuteZ, PermuteW};
-    __m128i vControl =
-        _mm_load_si128(reinterpret_cast<const __m128i*>(&elem[0]));
-
-    __m128i vSelect = _mm_cmpgt_epi32(vControl, three);
-    vControl = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(vControl), three));
-
-    __m128 shuffled1 = _mm_permutevar_ps(V1, vControl);
-    __m128 shuffled2 = _mm_permutevar_ps(V2, vControl);
-
-    __m128 masked1 = _mm_andnot_ps(_mm_castsi128_ps(vSelect), shuffled1);
-    __m128 masked2 = _mm_and_ps(_mm_castsi128_ps(vSelect), shuffled2);
-
-    return _mm_or_ps(masked1, masked2);
-#else
-
-    const uint32_t* aPtr[2];
-    aPtr[0] = reinterpret_cast<const uint32_t*>(&V1);
-    aPtr[1] = reinterpret_cast<const uint32_t*>(&V2);
-
-    XMVECTOR Result;
-    auto pWork = reinterpret_cast<uint32_t*>(&Result);
-
-    const uint32_t i0 = PermuteX & 3;
-    const uint32_t vi0 = PermuteX >> 2;
-    pWork[0] = aPtr[vi0][i0];
-
-    const uint32_t i1 = PermuteY & 3;
-    const uint32_t vi1 = PermuteY >> 2;
-    pWork[1] = aPtr[vi1][i1];
-
-    const uint32_t i2 = PermuteZ & 3;
-    const uint32_t vi2 = PermuteZ >> 2;
-    pWork[2] = aPtr[vi2][i2];
-
-    const uint32_t i3 = PermuteW & 3;
-    const uint32_t vi3 = PermuteW >> 2;
-    pWork[3] = aPtr[vi3][i3];
-
-    return Result;
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Define a control vector to be used in XMVectorSelect
-// operations.  The four integers specified in XMVectorSelectControl
-// serve as indices to select between components in two vectors.
-// The first index controls selection for the first component of
-// the vectors involved in a select operation, the second index
-// controls selection for the second component etc.  A value of
-// zero for an index causes the corresponding component from the first
-// vector to be selected whereas a one causes the component from the
-// second vector to be selected instead.
-
-inline XMVECTOR XM_CALLCONV
-XMVectorSelectControl(uint32_t VectorIndex0, uint32_t VectorIndex1,
-                      uint32_t VectorIndex2, uint32_t VectorIndex3) noexcept {
-#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    // x=Index0,y=Index1,z=Index2,w=Index3
-    __m128i vTemp = _mm_set_epi32(
-        static_cast<int>(VectorIndex3), static_cast<int>(VectorIndex2),
-        static_cast<int>(VectorIndex1), static_cast<int>(VectorIndex0));
-    // Any non-zero entries become 0xFFFFFFFF else 0
-    vTemp = _mm_cmpgt_epi32(vTemp, g_XMZero);
-    return _mm_castsi128_ps(vTemp);
-#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    int32x2_t V0 = vcreate_s32(static_cast<uint64_t>(VectorIndex0) |
-                               (static_cast<uint64_t>(VectorIndex1) << 32));
-    int32x2_t V1 = vcreate_s32(static_cast<uint64_t>(VectorIndex2) |
-                               (static_cast<uint64_t>(VectorIndex3) << 32));
-    int32x4_t vTemp = vcombine_s32(V0, V1);
-    // Any non-zero entries become 0xFFFFFFFF else 0
-    return vreinterpretq_f32_u32(vcgtq_s32(vTemp, g_XMZero));
-#else
-    XMVECTOR ControlVector;
-    const uint32_t ControlElement[] = {XM_SELECT_0, XM_SELECT_1};
-
-    assert(VectorIndex0 < 2);
-    assert(VectorIndex1 < 2);
-    assert(VectorIndex2 < 2);
-    assert(VectorIndex3 < 2);
-    _Analysis_assume_(VectorIndex0 < 2);
-    _Analysis_assume_(VectorIndex1 < 2);
-    _Analysis_assume_(VectorIndex2 < 2);
-    _Analysis_assume_(VectorIndex3 < 2);
-
-    ControlVector.vector4_u32[0] = ControlElement[VectorIndex0];
-    ControlVector.vector4_u32[1] = ControlElement[VectorIndex1];
-    ControlVector.vector4_u32[2] = ControlElement[VectorIndex2];
-    ControlVector.vector4_u32[3] = ControlElement[VectorIndex3];
-
-    return ControlVector;
-
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2,
-                                           FXMVECTOR Control) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Result = {{{
-        (V1.vector4_u32[0] & ~Control.vector4_u32[0]) |
-            (V2.vector4_u32[0] & Control.vector4_u32[0]),
-        (V1.vector4_u32[1] & ~Control.vector4_u32[1]) |
-            (V2.vector4_u32[1] & Control.vector4_u32[1]),
-        (V1.vector4_u32[2] & ~Control.vector4_u32[2]) |
-            (V2.vector4_u32[2] & Control.vector4_u32[2]),
-        (V1.vector4_u32[3] & ~Control.vector4_u32[3]) |
-            (V2.vector4_u32[3] & Control.vector4_u32[3]),
-    }}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vbslq_f32(vreinterpretq_u32_f32(Control), V2, V1);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp1 = _mm_andnot_ps(Control, V1);
-    XMVECTOR vTemp2 = _mm_and_ps(V2, Control);
-    return _mm_or_ps(vTemp1, vTemp2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorMergeXY(FXMVECTOR V1,
-                                            FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Result = {{{
-        V1.vector4_u32[0],
-        V2.vector4_u32[0],
-        V1.vector4_u32[1],
-        V2.vector4_u32[1],
-    }}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vzipq_f32(V1, V2).val[0];
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_unpacklo_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorMergeZW(FXMVECTOR V1,
-                                            FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Result = {{{V1.vector4_u32[2], V2.vector4_u32[2],
-                            V1.vector4_u32[3], V2.vector4_u32[3]}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vzipq_f32(V1, V2).val[1];
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_unpackhi_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2,
-                                              uint32_t Elements) noexcept {
-    assert(Elements < 4);
-    _Analysis_assume_(Elements < 4);
-    return XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2),
-                           ((Elements) + 3));
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V,
-                                               uint32_t Elements) noexcept {
-    assert(Elements < 4);
-    _Analysis_assume_(Elements < 4);
-    return XMVectorSwizzle(V, Elements & 3, (Elements + 1) & 3,
-                           (Elements + 2) & 3, (Elements + 3) & 3);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V,
-                                                uint32_t Elements) noexcept {
-    assert(Elements < 4);
-    _Analysis_assume_(Elements < 4);
-    return XMVectorSwizzle(V, (4 - (Elements)) & 3, (5 - (Elements)) & 3,
-                           (6 - (Elements)) & 3, (7 - (Elements)) & 3);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS,
-                                           uint32_t VSLeftRotateElements,
-                                           uint32_t Select0, uint32_t Select1,
-                                           uint32_t Select2,
-                                           uint32_t Select3) noexcept {
-    XMVECTOR Control = XMVectorSelectControl(Select0 & 1, Select1 & 1,
-                                             Select2 & 1, Select3 & 1);
-    return XMVectorSelect(VD, XMVectorRotateLeft(VS, VSLeftRotateElements),
-                          Control);
-}
-
-//------------------------------------------------------------------------------
-// Comparison operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Control = {{{
-        (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
-        (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
-        (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
-        (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0,
-    }}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vceqq_f32(V1, V2));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_cmpeq_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorEqualR(uint32_t* pCR, FXMVECTOR V1, FXMVECTOR V2) noexcept {
-    assert(pCR != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-    uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
-    uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
-    uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
-    uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
-    uint32_t CR = 0;
-    if (ux & uy & uz & uw) {
-        // All elements are greater
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!(ux | uy | uz | uw)) {
-        // All elements are not greater
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    *pCR = CR;
-
-    XMVECTORU32 Control = {{{ux, uy, uz, uw}}};
-    return Control;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vceqq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vreinterpret_u8_u32(vget_low_u32(vResult)),
-                                vreinterpret_u8_u32(vget_high_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFFFU) {
-        // All elements are equal
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        // All elements are not equal
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    *pCR = CR;
-    return vreinterpretq_f32_u32(vResult);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
-    uint32_t CR = 0;
-    int iTest = _mm_movemask_ps(vTemp);
-    if (iTest == 0xf) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!iTest) {
-        // All elements are not greater
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    *pCR = CR;
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Treat the components of the vectors as unsigned integers and
-// compare individual bits between the two.  This is useful for
-// comparing control vectors and result vectors returned from
-// other comparison operations.
-
-inline XMVECTOR XM_CALLCONV XMVectorEqualInt(FXMVECTOR V1,
-                                             FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Control = {{{
-        (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0,
-        (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0,
-        (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0,
-        (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 0xFFFFFFFF : 0,
-    }}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(
-        vceqq_s32(vreinterpretq_s32_f32(V1), vreinterpretq_s32_f32(V2)));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    return _mm_castsi128_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorEqualIntR(uint32_t* pCR, FXMVECTOR V1, FXMVECTOR V2) noexcept {
-    assert(pCR != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Control = XMVectorEqualInt(V1, V2);
-
-    *pCR = 0;
-    if (XMVector4EqualInt(Control, XMVectorTrueInt())) {
-        // All elements are equal
-        *pCR |= XM_CRMASK_CR6TRUE;
-    } else if (XMVector4EqualInt(Control, XMVectorFalseInt())) {
-        // All elements are not equal
-        *pCR |= XM_CRMASK_CR6FALSE;
-    }
-    return Control;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult =
-        vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFFFU) {
-        // All elements are equal
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        // All elements are not equal
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    *pCR = CR;
-    return vreinterpretq_f32_u32(vResult);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    int iTemp = _mm_movemask_ps(_mm_castsi128_ps(V));
-    uint32_t CR = 0;
-    if (iTemp == 0x0F) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!iTemp) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    *pCR = CR;
-    return _mm_castsi128_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2,
-                                              FXMVECTOR Epsilon) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    float fDeltax = V1.vector4_f32[0] - V2.vector4_f32[0];
-    float fDeltay = V1.vector4_f32[1] - V2.vector4_f32[1];
-    float fDeltaz = V1.vector4_f32[2] - V2.vector4_f32[2];
-    float fDeltaw = V1.vector4_f32[3] - V2.vector4_f32[3];
-
-    fDeltax = fabsf(fDeltax);
-    fDeltay = fabsf(fDeltay);
-    fDeltaz = fabsf(fDeltaz);
-    fDeltaw = fabsf(fDeltaw);
-
-    XMVECTORU32 Control = {{{
-        (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0,
-        (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0,
-        (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0,
-        (fDeltaw <= Epsilon.vector4_f32[3]) ? 0xFFFFFFFFU : 0,
-    }}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vDelta = vsubq_f32(V1, V2);
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    return vacleq_f32(vDelta, Epsilon);
-#else
-    return vreinterpretq_f32_u32(vcleq_f32(vabsq_f32(vDelta), Epsilon));
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Get the difference
-    XMVECTOR vDelta = _mm_sub_ps(V1, V2);
-    // Get the absolute value of the difference
-    XMVECTOR vTemp = _mm_setzero_ps();
-    vTemp = _mm_sub_ps(vTemp, vDelta);
-    vTemp = _mm_max_ps(vTemp, vDelta);
-    vTemp = _mm_cmple_ps(vTemp, Epsilon);
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorNotEqual(FXMVECTOR V1,
-                                             FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Control = {{{
-        (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
-        (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
-        (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
-        (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0,
-    }}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(V1, V2)));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_cmpneq_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorNotEqualInt(FXMVECTOR V1,
-                                                FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Control = {
-        {{(V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0,
-          (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0,
-          (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0,
-          (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0}}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vmvnq_u32(
-        vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2))));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    return _mm_xor_ps(_mm_castsi128_ps(V), g_XMNegOneMask);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorGreater(FXMVECTOR V1,
-                                            FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Control = {
-        {{(V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
-          (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
-          (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
-          (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0}}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vcgtq_f32(V1, V2));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_cmpgt_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorGreaterR(uint32_t* pCR, FXMVECTOR V1, FXMVECTOR V2) noexcept {
-    assert(pCR != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-
-    uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
-    uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
-    uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
-    uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
-    uint32_t CR = 0;
-    if (ux & uy & uz & uw) {
-        // All elements are greater
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!(ux | uy | uz | uw)) {
-        // All elements are not greater
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    *pCR = CR;
-
-    XMVECTORU32 Control = {{{ux, uy, uz, uw}}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vcgtq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFFFU) {
-        // All elements are greater
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        // All elements are not greater
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    *pCR = CR;
-    return vreinterpretq_f32_u32(vResult);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2);
-    uint32_t CR = 0;
-    int iTest = _mm_movemask_ps(vTemp);
-    if (iTest == 0xf) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!iTest) {
-        // All elements are not greater
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    *pCR = CR;
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual(FXMVECTOR V1,
-                                                   FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Control = {
-        {{(V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
-          (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
-          (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
-          (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0}}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vcgeq_f32(V1, V2));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_cmpge_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorGreaterOrEqualR(uint32_t* pCR, FXMVECTOR V1, FXMVECTOR V2) noexcept {
-    assert(pCR != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-
-    uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
-    uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
-    uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
-    uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
-    uint32_t CR = 0;
-    if (ux & uy & uz & uw) {
-        // All elements are greater
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!(ux | uy | uz | uw)) {
-        // All elements are not greater
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    *pCR = CR;
-
-    XMVECTORU32 Control = {{{ux, uy, uz, uw}}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vcgeq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFFFU) {
-        // All elements are greater or equal
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        // All elements are not greater or equal
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    *pCR = CR;
-    return vreinterpretq_f32_u32(vResult);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpge_ps(V1, V2);
-    uint32_t CR = 0;
-    int iTest = _mm_movemask_ps(vTemp);
-    if (iTest == 0xf) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!iTest) {
-        // All elements are not greater
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    *pCR = CR;
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorLess(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Control = {
-        {{(V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
-          (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
-          (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
-          (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0}}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vcltq_f32(V1, V2));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_cmplt_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorLessOrEqual(FXMVECTOR V1,
-                                                FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Control = {
-        {{(V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0,
-          (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0,
-          (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0,
-          (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0}}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(vcleq_f32(V1, V2));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_cmple_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorInBounds(FXMVECTOR V,
-                                             FXMVECTOR Bounds) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Control = {{{(V.vector4_f32[0] <= Bounds.vector4_f32[0] &&
-                              V.vector4_f32[0] >= -Bounds.vector4_f32[0])
-                                 ? 0xFFFFFFFF
-                                 : 0,
-                             (V.vector4_f32[1] <= Bounds.vector4_f32[1] &&
-                              V.vector4_f32[1] >= -Bounds.vector4_f32[1])
-                                 ? 0xFFFFFFFF
-                                 : 0,
-                             (V.vector4_f32[2] <= Bounds.vector4_f32[2] &&
-                              V.vector4_f32[2] >= -Bounds.vector4_f32[2])
-                                 ? 0xFFFFFFFF
-                                 : 0,
-                             (V.vector4_f32[3] <= Bounds.vector4_f32[3] &&
-                              V.vector4_f32[3] >= -Bounds.vector4_f32[3])
-                                 ? 0xFFFFFFFF
-                                 : 0}}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Test if less than or equal
-    uint32x4_t vTemp1 = vcleq_f32(V, Bounds);
-    // Negate the bounds
-    uint32x4_t vTemp2 = vreinterpretq_u32_f32(vnegq_f32(Bounds));
-    // Test if greater or equal (Reversed)
-    vTemp2 = vcleq_f32(vreinterpretq_f32_u32(vTemp2), V);
-    // Blend answers
-    vTemp1 = vandq_u32(vTemp1, vTemp2);
-    return vreinterpretq_f32_u32(vTemp1);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Test if less than or equal
-    XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds);
-    // Negate the bounds
-    XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne);
-    // Test if greater or equal (Reversed)
-    vTemp2 = _mm_cmple_ps(vTemp2, V);
-    // Blend answers
-    vTemp1 = _mm_and_ps(vTemp1, vTemp2);
-    return vTemp1;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMVectorInBoundsR(uint32_t* pCR, FXMVECTOR V, FXMVECTOR Bounds) noexcept {
-    assert(pCR != nullptr);
-#if defined(_XM_NO_INTRINSICS_)
-
-    uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] &&
-                   V.vector4_f32[0] >= -Bounds.vector4_f32[0])
-                      ? 0xFFFFFFFFU
-                      : 0;
-    uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] &&
-                   V.vector4_f32[1] >= -Bounds.vector4_f32[1])
-                      ? 0xFFFFFFFFU
-                      : 0;
-    uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] &&
-                   V.vector4_f32[2] >= -Bounds.vector4_f32[2])
-                      ? 0xFFFFFFFFU
-                      : 0;
-    uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] &&
-                   V.vector4_f32[3] >= -Bounds.vector4_f32[3])
-                      ? 0xFFFFFFFFU
-                      : 0;
-
-    uint32_t CR = 0;
-    if (ux & uy & uz & uw) {
-        // All elements are in bounds
-        CR = XM_CRMASK_CR6BOUNDS;
-    }
-    *pCR = CR;
-
-    XMVECTORU32 Control = {{{ux, uy, uz, uw}}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Test if less than or equal
-    uint32x4_t vTemp1 = vcleq_f32(V, Bounds);
-    // Negate the bounds
-    uint32x4_t vTemp2 = vreinterpretq_u32_f32(vnegq_f32(Bounds));
-    // Test if greater or equal (Reversed)
-    vTemp2 = vcleq_f32(vreinterpretq_f32_u32(vTemp2), V);
-    // Blend answers
-    vTemp1 = vandq_u32(vTemp1, vTemp2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTemp1)),
-                                vget_high_u8(vreinterpretq_u8_u32(vTemp1)));
-    uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1);
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFFFU) {
-        // All elements are in bounds
-        CR = XM_CRMASK_CR6BOUNDS;
-    }
-    *pCR = CR;
-    return vreinterpretq_f32_u32(vTemp1);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Test if less than or equal
-    XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds);
-    // Negate the bounds
-    XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne);
-    // Test if greater or equal (Reversed)
-    vTemp2 = _mm_cmple_ps(vTemp2, V);
-    // Blend answers
-    vTemp1 = _mm_and_ps(vTemp1, vTemp2);
-
-    uint32_t CR = 0;
-    if (_mm_movemask_ps(vTemp1) == 0xf) {
-        // All elements are in bounds
-        CR = XM_CRMASK_CR6BOUNDS;
-    }
-    *pCR = CR;
-    return vTemp1;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && \
-    !defined(__INTEL_COMPILER)
-#pragma float_control(push)
-#pragma float_control(precise, on)
-#endif
-
-inline XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Control = {{{XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0,
-                             XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0,
-                             XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0,
-                             XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0}}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(__clang__) && defined(__FINITE_MATH_ONLY__)
-    XMVECTORU32 vResult = {{{isnan(vgetq_lane_f32(V, 0)) ? 0xFFFFFFFFU : 0,
-                             isnan(vgetq_lane_f32(V, 1)) ? 0xFFFFFFFFU : 0,
-                             isnan(vgetq_lane_f32(V, 2)) ? 0xFFFFFFFFU : 0,
-                             isnan(vgetq_lane_f32(V, 3)) ? 0xFFFFFFFFU : 0}}};
-    return vResult.v;
-#else
-    // Test against itself. NaN is always not equal
-    uint32x4_t vTempNan = vceqq_f32(V, V);
-    // Flip results
-    return vreinterpretq_f32_u32(vmvnq_u32(vTempNan));
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-#if defined(__clang__) && defined(__FINITE_MATH_ONLY__)
-    XM_ALIGNED_DATA(16) float tmp[4];
-    _mm_store_ps(tmp, V);
-    XMVECTORU32 vResult = {
-        {{isnan(tmp[0]) ? 0xFFFFFFFFU : 0, isnan(tmp[1]) ? 0xFFFFFFFFU : 0,
-          isnan(tmp[2]) ? 0xFFFFFFFFU : 0, isnan(tmp[3]) ? 0xFFFFFFFFU : 0}}};
-    return vResult.v;
-#else
-    // Test against itself. NaN is always not equal
-    return _mm_cmpneq_ps(V, V);
-#endif
-#endif
-}
-
-#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && \
-    !defined(__INTEL_COMPILER)
-#pragma float_control(pop)
-#endif
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Control = {{{XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0,
-                             XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0,
-                             XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0,
-                             XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0}}};
-    return Control.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Mask off the sign bit
-    uint32x4_t vTemp = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
-    // Compare to infinity
-    vTemp = vceqq_f32(vreinterpretq_f32_u32(vTemp), g_XMInfinity);
-    // If any are infinity, the signs are true.
-    return vreinterpretq_f32_u32(vTemp);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Mask off the sign bit
-    __m128 vTemp = _mm_and_ps(V, g_XMAbsMask);
-    // Compare to infinity
-    vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity);
-    // If any are infinity, the signs are true.
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Rounding and clamping operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorMin(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {
-        {{(V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0]
-                                                  : V2.vector4_f32[0],
-          (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1]
-                                                  : V2.vector4_f32[1],
-          (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2]
-                                                  : V2.vector4_f32[2],
-          (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3]
-                                                  : V2.vector4_f32[3]}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vminq_f32(V1, V2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_min_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorMax(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {
-        {{(V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0]
-                                                  : V2.vector4_f32[0],
-          (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1]
-                                                  : V2.vector4_f32[1],
-          (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2]
-                                                  : V2.vector4_f32[2],
-          (V1.vector4_f32[3] > V2.vector4_f32[3]) ? V1.vector4_f32[3]
-                                                  : V2.vector4_f32[3]}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vmaxq_f32(V1, V2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_max_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-namespace MathInternal {
-// Round to nearest (even) a.k.a. banker's rounding
-inline float round_to_nearest(float x) noexcept {
-    float i = floorf(x);
-    x -= i;
-    if (x < 0.5f) return i;
-    if (x > 0.5f) return i + 1.f;
-
-    float int_part;
-    (void)modff(i / 2.f, &int_part);
-    if ((2.f * int_part) == i) {
-        return i;
-    }
-
-    return i + 1.f;
-}
-}  // namespace MathInternal
-
-#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && \
-    !defined(__INTEL_COMPILER)
-#pragma float_control(push)
-#pragma float_control(precise, on)
-#endif
-
-inline XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {{{MathInternal::round_to_nearest(V.vector4_f32[0]),
-                            MathInternal::round_to_nearest(V.vector4_f32[1]),
-                            MathInternal::round_to_nearest(V.vector4_f32[2]),
-                            MathInternal::round_to_nearest(V.vector4_f32[3])}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-    return vrndnq_f32(V);
-#else
-    uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(V), g_XMNegativeZero);
-    float32x4_t sMagic = vreinterpretq_f32_u32(vorrq_u32(g_XMNoFraction, sign));
-    float32x4_t R1 = vaddq_f32(V, sMagic);
-    R1 = vsubq_f32(R1, sMagic);
-    float32x4_t R2 = vabsq_f32(V);
-    uint32x4_t mask = vcleq_f32(R2, g_XMNoFraction);
-    return vbslq_f32(mask, R1, V);
-#endif
-#elif defined(_XM_SSE4_INTRINSICS_)
-    return _mm_round_ps(V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 sign = _mm_and_ps(V, g_XMNegativeZero);
-    __m128 sMagic = _mm_or_ps(g_XMNoFraction, sign);
-    __m128 R1 = _mm_add_ps(V, sMagic);
-    R1 = _mm_sub_ps(R1, sMagic);
-    __m128 R2 = _mm_and_ps(V, g_XMAbsMask);
-    __m128 mask = _mm_cmple_ps(R2, g_XMNoFraction);
-    R2 = _mm_andnot_ps(mask, V);
-    R1 = _mm_and_ps(R1, mask);
-    XMVECTOR vResult = _mm_xor_ps(R1, R2);
-    return vResult;
-#endif
-}
-
-#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && \
-    !defined(__INTEL_COMPILER)
-#pragma float_control(pop)
-#endif
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR Result;
-    uint32_t i;
-
-    // Avoid C4701
-    Result.vector4_f32[0] = 0.0f;
-
-    for (i = 0; i < 4; i++) {
-        if (XMISNAN(V.vector4_f32[i])) {
-            Result.vector4_u32[i] = 0x7FC00000;
-        } else if (fabsf(V.vector4_f32[i]) < 8388608.0f) {
-            Result.vector4_f32[i] =
-                static_cast<float>(static_cast<int32_t>(V.vector4_f32[i]));
-        } else {
-            Result.vector4_f32[i] = V.vector4_f32[i];
-        }
-    }
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-    return vrndq_f32(V);
-#else
-    float32x4_t vTest = vabsq_f32(V);
-    vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction));
-
-    int32x4_t vInt = vcvtq_s32_f32(V);
-    float32x4_t vResult = vcvtq_f32_s32(vInt);
-
-    // All numbers less than 8388608 will use the round to int
-    // All others, use the ORIGINAL value
-    return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V);
-#endif
-#elif defined(_XM_SSE4_INTRINSICS_)
-    return _mm_round_ps(V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // To handle NAN, INF and numbers greater than 8388608, use masking
-    // Get the abs value
-    __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
-    // Test for greater than 8388608 (All floats with NO fractionals, NAN and
-    // INF
-    vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction);
-    // Convert to int and back to float for rounding with truncation
-    __m128i vInt = _mm_cvttps_epi32(V);
-    // Convert back to floats
-    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
-    // All numbers less than 8388608 will use the round to int
-    vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest));
-    // All others, use the ORIGINAL value
-    vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V));
-    vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest));
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {
-        {{floorf(V.vector4_f32[0]), floorf(V.vector4_f32[1]),
-          floorf(V.vector4_f32[2]), floorf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-    return vrndmq_f32(V);
-#else
-    float32x4_t vTest = vabsq_f32(V);
-    vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction));
-    // Truncate
-    int32x4_t vInt = vcvtq_s32_f32(V);
-    float32x4_t vResult = vcvtq_f32_s32(vInt);
-    uint32x4_t vLargerMask = vcgtq_f32(vResult, V);
-    // 0 -> 0, 0xffffffff -> -1.0f
-    float32x4_t vLarger = vcvtq_f32_s32(vreinterpretq_s32_u32(vLargerMask));
-    vResult = vaddq_f32(vResult, vLarger);
-    // All numbers less than 8388608 will use the round to int
-    // All others, use the ORIGINAL value
-    return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V);
-#endif
-#elif defined(_XM_SSE4_INTRINSICS_)
-    return _mm_floor_ps(V);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // To handle NAN, INF and numbers greater than 8388608, use masking
-    __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
-    vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction);
-    // Truncate
-    __m128i vInt = _mm_cvttps_epi32(V);
-    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
-    __m128 vLarger = _mm_cmpgt_ps(vResult, V);
-    // 0 -> 0, 0xffffffff -> -1.0f
-    vLarger = _mm_cvtepi32_ps(_mm_castps_si128(vLarger));
-    vResult = _mm_add_ps(vResult, vLarger);
-    // All numbers less than 8388608 will use the round to int
-    vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest));
-    // All others, use the ORIGINAL value
-    vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V));
-    vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest));
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{ceilf(V.vector4_f32[0]), ceilf(V.vector4_f32[1]),
-                            ceilf(V.vector4_f32[2]), ceilf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-    return vrndpq_f32(V);
-#else
-    float32x4_t vTest = vabsq_f32(V);
-    vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction));
-    // Truncate
-    int32x4_t vInt = vcvtq_s32_f32(V);
-    float32x4_t vResult = vcvtq_f32_s32(vInt);
-    uint32x4_t vSmallerMask = vcltq_f32(vResult, V);
-    // 0 -> 0, 0xffffffff -> -1.0f
-    float32x4_t vSmaller = vcvtq_f32_s32(vreinterpretq_s32_u32(vSmallerMask));
-    vResult = vsubq_f32(vResult, vSmaller);
-    // All numbers less than 8388608 will use the round to int
-    // All others, use the ORIGINAL value
-    return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V);
-#endif
-#elif defined(_XM_SSE4_INTRINSICS_)
-    return _mm_ceil_ps(V);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // To handle NAN, INF and numbers greater than 8388608, use masking
-    __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
-    vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction);
-    // Truncate
-    __m128i vInt = _mm_cvttps_epi32(V);
-    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
-    __m128 vSmaller = _mm_cmplt_ps(vResult, V);
-    // 0 -> 0, 0xffffffff -> -1.0f
-    vSmaller = _mm_cvtepi32_ps(_mm_castps_si128(vSmaller));
-    vResult = _mm_sub_ps(vResult, vSmaller);
-    // All numbers less than 8388608 will use the round to int
-    vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest));
-    // All others, use the ORIGINAL value
-    vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V));
-    vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest));
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorClamp(FXMVECTOR V, FXMVECTOR Min,
-                                          FXMVECTOR Max) noexcept {
-    assert(XMVector4LessOrEqual(Min, Max));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-    Result = XMVectorMax(Min, V);
-    Result = XMVectorMin(Max, Result);
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult = vmaxq_f32(Min, V);
-    vResult = vminq_f32(Max, vResult);
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult;
-    vResult = _mm_max_ps(Min, V);
-    vResult = _mm_min_ps(Max, vResult);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    const XMVECTOR Zero = XMVectorZero();
-
-    return XMVectorClamp(V, Zero, g_XMOne.v);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Set <0 to 0
-    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
-    // Set>1 to 1
-    return vminq_f32(vResult, vdupq_n_f32(1.0f));
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Set <0 to 0
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    // Set>1 to 1
-    return _mm_min_ps(vResult, g_XMOne);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Bitwise logical operations
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorAndInt(FXMVECTOR V1,
-                                           FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Result = {{{V1.vector4_u32[0] & V2.vector4_u32[0],
-                            V1.vector4_u32[1] & V2.vector4_u32[1],
-                            V1.vector4_u32[2] & V2.vector4_u32[2],
-                            V1.vector4_u32[3] & V2.vector4_u32[3]}}};
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(
-        vandq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)));
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_and_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorAndCInt(FXMVECTOR V1,
-                                            FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Result = {{{V1.vector4_u32[0] & ~V2.vector4_u32[0],
-                            V1.vector4_u32[1] & ~V2.vector4_u32[1],
-                            V1.vector4_u32[2] & ~V2.vector4_u32[2],
-                            V1.vector4_u32[3] & ~V2.vector4_u32[3]}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(
-        vbicq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_andnot_si128(_mm_castps_si128(V2), _mm_castps_si128(V1));
-    return _mm_castsi128_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Result = {{{V1.vector4_u32[0] | V2.vector4_u32[0],
-                            V1.vector4_u32[1] | V2.vector4_u32[1],
-                            V1.vector4_u32[2] | V2.vector4_u32[2],
-                            V1.vector4_u32[3] | V2.vector4_u32[3]}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(
-        vorrq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_or_si128(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    return _mm_castsi128_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorNorInt(FXMVECTOR V1,
-                                           FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Result = {{{~(V1.vector4_u32[0] | V2.vector4_u32[0]),
-                            ~(V1.vector4_u32[1] | V2.vector4_u32[1]),
-                            ~(V1.vector4_u32[2] | V2.vector4_u32[2]),
-                            ~(V1.vector4_u32[3] | V2.vector4_u32[3])}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t Result =
-        vorrq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
-    return vreinterpretq_f32_u32(vbicq_u32(g_XMNegOneMask, Result));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i Result;
-    Result = _mm_or_si128(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    Result = _mm_andnot_si128(Result, g_XMNegOneMask);
-    return _mm_castsi128_ps(Result);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorXorInt(FXMVECTOR V1,
-                                           FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORU32 Result = {{{V1.vector4_u32[0] ^ V2.vector4_u32[0],
-                            V1.vector4_u32[1] ^ V2.vector4_u32[1],
-                            V1.vector4_u32[2] ^ V2.vector4_u32[2],
-                            V1.vector4_u32[3] ^ V2.vector4_u32[3]}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vreinterpretq_f32_u32(
-        veorq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2)));
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i V = _mm_xor_si128(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    return _mm_castsi128_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Computation operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {{{-V.vector4_f32[0], -V.vector4_f32[1],
-                            -V.vector4_f32[2], -V.vector4_f32[3]}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vnegq_f32(V);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR Z;
-
-    Z = _mm_setzero_ps();
-
-    return _mm_sub_ps(Z, V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {{{V1.vector4_f32[0] + V2.vector4_f32[0],
-                            V1.vector4_f32[1] + V2.vector4_f32[1],
-                            V1.vector4_f32[2] + V2.vector4_f32[2],
-                            V1.vector4_f32[3] + V2.vector4_f32[3]}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vaddq_f32(V1, V2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_add_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result;
-    Result.f[0] = Result.f[1] = Result.f[2] = Result.f[3] =
-        V.vector4_f32[0] + V.vector4_f32[1] + V.vector4_f32[2] +
-        V.vector4_f32[3];
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-    float32x4_t vTemp = vpaddq_f32(V, V);
-    return vpaddq_f32(vTemp, vTemp);
-#else
-    float32x2_t v1 = vget_low_f32(V);
-    float32x2_t v2 = vget_high_f32(V);
-    v1 = vadd_f32(v1, v2);
-    v1 = vpadd_f32(v1, v1);
-    return vcombine_f32(v1, v1);
-#endif
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vTemp = _mm_hadd_ps(V, V);
-    return _mm_hadd_ps(vTemp, vTemp);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 3, 0, 1));
-    XMVECTOR vTemp2 = _mm_add_ps(V, vTemp);
-    vTemp = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 0, 3, 2));
-    return _mm_add_ps(vTemp, vTemp2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorAddAngles(FXMVECTOR V1,
-                                              FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    const XMVECTOR Zero = XMVectorZero();
-
-    // Add the given angles together.  If the range of V1 is such
-    // that -Pi <= V1 < Pi and the range of V2 is such that
-    // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
-    // will be -Pi <= Result < Pi.
-    XMVECTOR Result = XMVectorAdd(V1, V2);
-
-    XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
-    XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);
-
-    Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
-    Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);
-
-    Result = XMVectorAdd(Result, Offset);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Adjust the angles
-    float32x4_t vResult = vaddq_f32(V1, V2);
-    // Less than Pi?
-    uint32x4_t vOffset = vcltq_f32(vResult, g_XMNegativePi);
-    vOffset = vandq_u32(vOffset, g_XMTwoPi);
-    // Add 2Pi to all entries less than -Pi
-    vResult = vaddq_f32(vResult, vreinterpretq_f32_u32(vOffset));
-    // Greater than or equal to Pi?
-    vOffset = vcgeq_f32(vResult, g_XMPi);
-    vOffset = vandq_u32(vOffset, g_XMTwoPi);
-    // Sub 2Pi to all entries greater than Pi
-    vResult = vsubq_f32(vResult, vreinterpretq_f32_u32(vOffset));
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Adjust the angles
-    XMVECTOR vResult = _mm_add_ps(V1, V2);
-    // Less than Pi?
-    XMVECTOR vOffset = _mm_cmplt_ps(vResult, g_XMNegativePi);
-    vOffset = _mm_and_ps(vOffset, g_XMTwoPi);
-    // Add 2Pi to all entries less than -Pi
-    vResult = _mm_add_ps(vResult, vOffset);
-    // Greater than or equal to Pi?
-    vOffset = _mm_cmpge_ps(vResult, g_XMPi);
-    vOffset = _mm_and_ps(vOffset, g_XMTwoPi);
-    // Sub 2Pi to all entries greater than Pi
-    vResult = _mm_sub_ps(vResult, vOffset);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSubtract(FXMVECTOR V1,
-                                             FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {{{V1.vector4_f32[0] - V2.vector4_f32[0],
-                            V1.vector4_f32[1] - V2.vector4_f32[1],
-                            V1.vector4_f32[2] - V2.vector4_f32[2],
-                            V1.vector4_f32[3] - V2.vector4_f32[3]}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vsubq_f32(V1, V2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_sub_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSubtractAngles(FXMVECTOR V1,
-                                                   FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    const XMVECTOR Zero = XMVectorZero();
-
-    // Subtract the given angles.  If the range of V1 is such
-    // that -Pi <= V1 < Pi and the range of V2 is such that
-    // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
-    // will be -Pi <= Result < Pi.
-    XMVECTOR Result = XMVectorSubtract(V1, V2);
-
-    XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
-    XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);
-
-    Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
-    Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);
-
-    Result = XMVectorAdd(Result, Offset);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Adjust the angles
-    XMVECTOR vResult = vsubq_f32(V1, V2);
-    // Less than Pi?
-    uint32x4_t vOffset = vcltq_f32(vResult, g_XMNegativePi);
-    vOffset = vandq_u32(vOffset, g_XMTwoPi);
-    // Add 2Pi to all entries less than -Pi
-    vResult = vaddq_f32(vResult, vreinterpretq_f32_u32(vOffset));
-    // Greater than or equal to Pi?
-    vOffset = vcgeq_f32(vResult, g_XMPi);
-    vOffset = vandq_u32(vOffset, g_XMTwoPi);
-    // Sub 2Pi to all entries greater than Pi
-    vResult = vsubq_f32(vResult, vreinterpretq_f32_u32(vOffset));
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Adjust the angles
-    XMVECTOR vResult = _mm_sub_ps(V1, V2);
-    // Less than Pi?
-    XMVECTOR vOffset = _mm_cmplt_ps(vResult, g_XMNegativePi);
-    vOffset = _mm_and_ps(vOffset, g_XMTwoPi);
-    // Add 2Pi to all entries less than -Pi
-    vResult = _mm_add_ps(vResult, vOffset);
-    // Greater than or equal to Pi?
-    vOffset = _mm_cmpge_ps(vResult, g_XMPi);
-    vOffset = _mm_and_ps(vOffset, g_XMTwoPi);
-    // Sub 2Pi to all entries greater than Pi
-    vResult = _mm_sub_ps(vResult, vOffset);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorMultiply(FXMVECTOR V1,
-                                             FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{V1.vector4_f32[0] * V2.vector4_f32[0],
-                            V1.vector4_f32[1] * V2.vector4_f32[1],
-                            V1.vector4_f32[2] * V2.vector4_f32[2],
-                            V1.vector4_f32[3] * V2.vector4_f32[3]}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vmulq_f32(V1, V2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_mul_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2,
-                                                FXMVECTOR V3) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {
-        {{V1.vector4_f32[0] * V2.vector4_f32[0] + V3.vector4_f32[0],
-          V1.vector4_f32[1] * V2.vector4_f32[1] + V3.vector4_f32[1],
-          V1.vector4_f32[2] * V2.vector4_f32[2] + V3.vector4_f32[2],
-          V1.vector4_f32[3] * V2.vector4_f32[3] + V3.vector4_f32[3]}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-    return vfmaq_f32(V3, V1, V2);
-#else
-    return vmlaq_f32(V3, V1, V2);
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-    return XM_FMADD_PS(V1, V2, V3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorDivide(FXMVECTOR V1,
-                                           FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{V1.vector4_f32[0] / V2.vector4_f32[0],
-                            V1.vector4_f32[1] / V2.vector4_f32[1],
-                            V1.vector4_f32[2] / V2.vector4_f32[2],
-                            V1.vector4_f32[3] / V2.vector4_f32[3]}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-    return vdivq_f32(V1, V2);
-#else
-    // 2 iterations of Newton-Raphson refinement of reciprocal
-    float32x4_t Reciprocal = vrecpeq_f32(V2);
-    float32x4_t S = vrecpsq_f32(Reciprocal, V2);
-    Reciprocal = vmulq_f32(S, Reciprocal);
-    S = vrecpsq_f32(Reciprocal, V2);
-    Reciprocal = vmulq_f32(S, Reciprocal);
-    return vmulq_f32(V1, Reciprocal);
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_div_ps(V1, V2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract(
-    FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {
-        {{V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]),
-          V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]),
-          V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]),
-          V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3])}}};
-    return Result;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-    return vfmsq_f32(V3, V1, V2);
-#else
-    return vmlsq_f32(V3, V1, V2);
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-    return XM_FNMADD_PS(V1, V2, V3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorScale(FXMVECTOR V,
-                                          float ScaleFactor) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {
-        {{V.vector4_f32[0] * ScaleFactor, V.vector4_f32[1] * ScaleFactor,
-          V.vector4_f32[2] * ScaleFactor, V.vector4_f32[3] * ScaleFactor}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vmulq_n_f32(V, ScaleFactor);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = _mm_set_ps1(ScaleFactor);
-    return _mm_mul_ps(vResult, V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{1.f / V.vector4_f32[0], 1.f / V.vector4_f32[1],
-                            1.f / V.vector4_f32[2], 1.f / V.vector4_f32[3]}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vrecpeq_f32(V);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_rcp_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{1.f / V.vector4_f32[0], 1.f / V.vector4_f32[1],
-                            1.f / V.vector4_f32[2], 1.f / V.vector4_f32[3]}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-    float32x4_t one = vdupq_n_f32(1.0f);
-    return vdivq_f32(one, V);
-#else
-    // 2 iterations of Newton-Raphson refinement
-    float32x4_t Reciprocal = vrecpeq_f32(V);
-    float32x4_t S = vrecpsq_f32(Reciprocal, V);
-    Reciprocal = vmulq_f32(S, Reciprocal);
-    S = vrecpsq_f32(Reciprocal, V);
-    return vmulq_f32(S, Reciprocal);
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_div_ps(g_XMOne, V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Return an estimated square root
-inline XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{sqrtf(V.vector4_f32[0]), sqrtf(V.vector4_f32[1]),
-                            sqrtf(V.vector4_f32[2]), sqrtf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // 1 iteration of Newton-Raphson refinment of sqrt
-    float32x4_t S0 = vrsqrteq_f32(V);
-    float32x4_t P0 = vmulq_f32(V, S0);
-    float32x4_t R0 = vrsqrtsq_f32(P0, S0);
-    float32x4_t S1 = vmulq_f32(S0, R0);
-
-    XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v);
-    XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0));
-    XMVECTOR Result = vmulq_f32(V, S1);
-    XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
-    return XMVectorSelect(V, Result, Select);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_sqrt_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{sqrtf(V.vector4_f32[0]), sqrtf(V.vector4_f32[1]),
-                            sqrtf(V.vector4_f32[2]), sqrtf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // 3 iterations of Newton-Raphson refinment of sqrt
-    float32x4_t S0 = vrsqrteq_f32(V);
-    float32x4_t P0 = vmulq_f32(V, S0);
-    float32x4_t R0 = vrsqrtsq_f32(P0, S0);
-    float32x4_t S1 = vmulq_f32(S0, R0);
-    float32x4_t P1 = vmulq_f32(V, S1);
-    float32x4_t R1 = vrsqrtsq_f32(P1, S1);
-    float32x4_t S2 = vmulq_f32(S1, R1);
-    float32x4_t P2 = vmulq_f32(V, S2);
-    float32x4_t R2 = vrsqrtsq_f32(P2, S2);
-    float32x4_t S3 = vmulq_f32(S2, R2);
-
-    XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v);
-    XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0));
-    XMVECTOR Result = vmulq_f32(V, S3);
-    XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
-    return XMVectorSelect(V, Result, Select);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_sqrt_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {
-        {{1.f / sqrtf(V.vector4_f32[0]), 1.f / sqrtf(V.vector4_f32[1]),
-          1.f / sqrtf(V.vector4_f32[2]), 1.f / sqrtf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vrsqrteq_f32(V);
-#elif defined(_XM_SSE_INTRINSICS_)
-    return _mm_rsqrt_ps(V);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {
-        {{1.f / sqrtf(V.vector4_f32[0]), 1.f / sqrtf(V.vector4_f32[1]),
-          1.f / sqrtf(V.vector4_f32[2]), 1.f / sqrtf(V.vector4_f32[3])}}};
-    return Result;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // 2 iterations of Newton-Raphson refinement of reciprocal
-    float32x4_t S0 = vrsqrteq_f32(V);
-
-    float32x4_t P0 = vmulq_f32(V, S0);
-    float32x4_t R0 = vrsqrtsq_f32(P0, S0);
-
-    float32x4_t S1 = vmulq_f32(S0, R0);
-    float32x4_t P1 = vmulq_f32(V, S1);
-    float32x4_t R1 = vrsqrtsq_f32(P1, S1);
-
-    return vmulq_f32(S1, R1);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = _mm_sqrt_ps(V);
-    vResult = _mm_div_ps(g_XMOne, vResult);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{exp2f(V.vector4_f32[0]), exp2f(V.vector4_f32[1]),
-                            exp2f(V.vector4_f32[2]), exp2f(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x4_t itrunc = vcvtq_s32_f32(V);
-    float32x4_t ftrunc = vcvtq_f32_s32(itrunc);
-    float32x4_t y = vsubq_f32(V, ftrunc);
-
-    float32x4_t poly = vmlaq_f32(g_XMExpEst6, g_XMExpEst7, y);
-    poly = vmlaq_f32(g_XMExpEst5, poly, y);
-    poly = vmlaq_f32(g_XMExpEst4, poly, y);
-    poly = vmlaq_f32(g_XMExpEst3, poly, y);
-    poly = vmlaq_f32(g_XMExpEst2, poly, y);
-    poly = vmlaq_f32(g_XMExpEst1, poly, y);
-    poly = vmlaq_f32(g_XMOne, poly, y);
-
-    int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias);
-    biased = vshlq_n_s32(biased, 23);
-    float32x4_t result0 = XMVectorDivide(vreinterpretq_f32_s32(biased), poly);
-
-    biased = vaddq_s32(itrunc, g_XM253);
-    biased = vshlq_n_s32(biased, 23);
-    float32x4_t result1 = XMVectorDivide(vreinterpretq_f32_s32(biased), poly);
-    result1 = vmulq_f32(g_XMMinNormal.v, result1);
-
-    // Use selection to handle the cases
-    //  if (V is NaN) -> QNaN;
-    //  else if (V sign bit set)
-    //      if (V > -150)
-    //         if (V.exponent < -126) -> result1
-    //         else -> result0
-    //      else -> +0
-    //  else
-    //      if (V < 128) -> result0
-    //      else -> +inf
-
-    uint32x4_t comp = vcltq_s32(vreinterpretq_s32_f32(V), g_XMBin128);
-    float32x4_t result2 = vbslq_f32(comp, result0, g_XMInfinity);
-
-    comp = vcltq_s32(itrunc, g_XMSubnormalExponent);
-    float32x4_t result3 = vbslq_f32(comp, result1, result0);
-
-    comp = vcltq_s32(vreinterpretq_s32_f32(V), g_XMBinNeg150);
-    float32x4_t result4 = vbslq_f32(comp, result3, g_XMZero);
-
-    int32x4_t sign = vandq_s32(vreinterpretq_s32_f32(V), g_XMNegativeZero);
-    comp = vceqq_s32(sign, g_XMNegativeZero);
-    float32x4_t result5 = vbslq_f32(comp, result4, result2);
-
-    int32x4_t t0 = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest);
-    int32x4_t t1 = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity);
-    t0 = vreinterpretq_s32_u32(vceqq_s32(t0, g_XMZero));
-    t1 = vreinterpretq_s32_u32(vceqq_s32(t1, g_XMInfinity));
-    int32x4_t isNaN = vbicq_s32(t1, t0);
-
-    float32x4_t vResult =
-        vbslq_f32(vreinterpretq_u32_s32(isNaN), g_XMQNaN, result5);
-    return vResult;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_exp2_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i itrunc = _mm_cvttps_epi32(V);
-    __m128 ftrunc = _mm_cvtepi32_ps(itrunc);
-    __m128 y = _mm_sub_ps(V, ftrunc);
-
-    __m128 poly = XM_FMADD_PS(g_XMExpEst7, y, g_XMExpEst6);
-    poly = XM_FMADD_PS(poly, y, g_XMExpEst5);
-    poly = XM_FMADD_PS(poly, y, g_XMExpEst4);
-    poly = XM_FMADD_PS(poly, y, g_XMExpEst3);
-    poly = XM_FMADD_PS(poly, y, g_XMExpEst2);
-    poly = XM_FMADD_PS(poly, y, g_XMExpEst1);
-    poly = XM_FMADD_PS(poly, y, g_XMOne);
-
-    __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias);
-    biased = _mm_slli_epi32(biased, 23);
-    __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly);
-
-    biased = _mm_add_epi32(itrunc, g_XM253);
-    biased = _mm_slli_epi32(biased, 23);
-    __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly);
-    result1 = _mm_mul_ps(g_XMMinNormal.v, result1);
-
-    // Use selection to handle the cases
-    //  if (V is NaN) -> QNaN;
-    //  else if (V sign bit set)
-    //      if (V > -150)
-    //         if (V.exponent < -126) -> result1
-    //         else -> result0
-    //      else -> +0
-    //  else
-    //      if (V < 128) -> result0
-    //      else -> +inf
-
-    __m128i comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBin128);
-    __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0));
-    __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity);
-    __m128i result2 = _mm_or_si128(select0, select1);
-
-    comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent);
-    select1 = _mm_and_si128(comp, _mm_castps_si128(result1));
-    select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0));
-    __m128i result3 = _mm_or_si128(select0, select1);
-
-    comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBinNeg150);
-    select0 = _mm_and_si128(comp, result3);
-    select1 = _mm_andnot_si128(comp, g_XMZero);
-    __m128i result4 = _mm_or_si128(select0, select1);
-
-    __m128i sign = _mm_and_si128(_mm_castps_si128(V), g_XMNegativeZero);
-    comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero);
-    select0 = _mm_and_si128(comp, result4);
-    select1 = _mm_andnot_si128(comp, result2);
-    __m128i result5 = _mm_or_si128(select0, select1);
-
-    __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
-    __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
-    t0 = _mm_cmpeq_epi32(t0, g_XMZero);
-    t1 = _mm_cmpeq_epi32(t1, g_XMInfinity);
-    __m128i isNaN = _mm_andnot_si128(t0, t1);
-
-    select0 = _mm_and_si128(isNaN, g_XMQNaN);
-    select1 = _mm_andnot_si128(isNaN, result5);
-    __m128i vResult = _mm_or_si128(select0, select1);
-
-    return _mm_castsi128_ps(vResult);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorExp10(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {
-        {{powf(10.0f, V.vector4_f32[0]), powf(10.0f, V.vector4_f32[1]),
-          powf(10.0f, V.vector4_f32[2]), powf(10.0f, V.vector4_f32[3])}}};
-    return Result.v;
-
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_exp10_ps(V);
-    return Result;
-#else
-    // exp10(V) = exp2(vin*log2(10))
-    XMVECTOR Vten = XMVectorMultiply(g_XMLg10, V);
-    return XMVectorExp2(Vten);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {{{expf(V.vector4_f32[0]), expf(V.vector4_f32[1]),
-                            expf(V.vector4_f32[2]), expf(V.vector4_f32[3])}}};
-    return Result.v;
-
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_exp_ps(V);
-    return Result;
-#else
-    // expE(V) = exp2(vin*log2(e))
-    XMVECTOR Ve = XMVectorMultiply(g_XMLgE, V);
-    return XMVectorExp2(Ve);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V) noexcept {
-    return XMVectorExp2(V);
-}
-
-//------------------------------------------------------------------------------
-
-#if defined(_XM_SSE_INTRINSICS_)
-
-namespace MathInternal {
-inline __m128i multi_sll_epi32(__m128i value, __m128i count) noexcept {
-    __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 0, 0, 0));
-    __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0, 0, 0, 0));
-    c = _mm_and_si128(c, g_XMMaskX);
-    __m128i r0 = _mm_sll_epi32(v, c);
-
-    v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1));
-    c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 1, 1, 1));
-    c = _mm_and_si128(c, g_XMMaskX);
-    __m128i r1 = _mm_sll_epi32(v, c);
-
-    v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 2, 2, 2));
-    c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 2, 2, 2));
-    c = _mm_and_si128(c, g_XMMaskX);
-    __m128i r2 = _mm_sll_epi32(v, c);
-
-    v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 3, 3, 3));
-    c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3, 3, 3, 3));
-    c = _mm_and_si128(c, g_XMMaskX);
-    __m128i r3 = _mm_sll_epi32(v, c);
-
-    // (r0,r0,r1,r1)
-    __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1),
-                                _MM_SHUFFLE(0, 0, 0, 0));
-    // (r2,r2,r3,r3)
-    __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3),
-                                _MM_SHUFFLE(0, 0, 0, 0));
-    // (r0,r1,r2,r3)
-    __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2, 0, 2, 0));
-    return _mm_castps_si128(result);
-}
-
-inline __m128i multi_srl_epi32(__m128i value, __m128i count) noexcept {
-    __m128i v = _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 0, 0, 0));
-    __m128i c = _mm_shuffle_epi32(count, _MM_SHUFFLE(0, 0, 0, 0));
-    c = _mm_and_si128(c, g_XMMaskX);
-    __m128i r0 = _mm_srl_epi32(v, c);
-
-    v = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1));
-    c = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 1, 1, 1));
-    c = _mm_and_si128(c, g_XMMaskX);
-    __m128i r1 = _mm_srl_epi32(v, c);
-
-    v = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 2, 2, 2));
-    c = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 2, 2, 2));
-    c = _mm_and_si128(c, g_XMMaskX);
-    __m128i r2 = _mm_srl_epi32(v, c);
-
-    v = _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 3, 3, 3));
-    c = _mm_shuffle_epi32(count, _MM_SHUFFLE(3, 3, 3, 3));
-    c = _mm_and_si128(c, g_XMMaskX);
-    __m128i r3 = _mm_srl_epi32(v, c);
-
-    // (r0,r0,r1,r1)
-    __m128 r01 = _mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1),
-                                _MM_SHUFFLE(0, 0, 0, 0));
-    // (r2,r2,r3,r3)
-    __m128 r23 = _mm_shuffle_ps(_mm_castsi128_ps(r2), _mm_castsi128_ps(r3),
-                                _MM_SHUFFLE(0, 0, 0, 0));
-    // (r0,r1,r2,r3)
-    __m128 result = _mm_shuffle_ps(r01, r23, _MM_SHUFFLE(2, 0, 2, 0));
-    return _mm_castps_si128(result);
-}
-
-inline __m128i GetLeadingBit(const __m128i value) noexcept {
-    static const XMVECTORI32 g_XM0000FFFF = {
-        {{0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF}}};
-    static const XMVECTORI32 g_XM000000FF = {
-        {{0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}}};
-    static const XMVECTORI32 g_XM0000000F = {
-        {{0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F}}};
-    static const XMVECTORI32 g_XM00000003 = {
-        {{0x00000003, 0x00000003, 0x00000003, 0x00000003}}};
-
-    __m128i v = value, r, c, b, s;
-
-    c = _mm_cmpgt_epi32(v, g_XM0000FFFF);  // c = (v > 0xFFFF)
-    b = _mm_srli_epi32(c, 31);             // b = (c ? 1 : 0)
-    r = _mm_slli_epi32(b, 4);              // r = (b << 4)
-    v = multi_srl_epi32(v, r);             // v = (v >> r)
-
-    c = _mm_cmpgt_epi32(v, g_XM000000FF);  // c = (v > 0xFF)
-    b = _mm_srli_epi32(c, 31);             // b = (c ? 1 : 0)
-    s = _mm_slli_epi32(b, 3);              // s = (b << 3)
-    v = multi_srl_epi32(v, s);             // v = (v >> s)
-    r = _mm_or_si128(r, s);                // r = (r | s)
-
-    c = _mm_cmpgt_epi32(v, g_XM0000000F);  // c = (v > 0xF)
-    b = _mm_srli_epi32(c, 31);             // b = (c ? 1 : 0)
-    s = _mm_slli_epi32(b, 2);              // s = (b << 2)
-    v = multi_srl_epi32(v, s);             // v = (v >> s)
-    r = _mm_or_si128(r, s);                // r = (r | s)
-
-    c = _mm_cmpgt_epi32(v, g_XM00000003);  // c = (v > 0x3)
-    b = _mm_srli_epi32(c, 31);             // b = (c ? 1 : 0)
-    s = _mm_slli_epi32(b, 1);              // s = (b << 1)
-    v = multi_srl_epi32(v, s);             // v = (v >> s)
-    r = _mm_or_si128(r, s);                // r = (r | s)
-
-    s = _mm_srli_epi32(v, 1);
-    r = _mm_or_si128(r, s);
-    return r;
-}
-}  // namespace MathInternal
-
-#endif  // _XM_SSE_INTRINSICS_
-
-#if defined(_XM_ARM_NEON_INTRINSICS_)
-
-namespace MathInternal {
-inline int32x4_t GetLeadingBit(const int32x4_t value) noexcept {
-    static const XMVECTORI32 g_XM0000FFFF = {
-        {{0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF}}};
-    static const XMVECTORI32 g_XM000000FF = {
-        {{0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}}};
-    static const XMVECTORI32 g_XM0000000F = {
-        {{0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F}}};
-    static const XMVECTORI32 g_XM00000003 = {
-        {{0x00000003, 0x00000003, 0x00000003, 0x00000003}}};
-
-    uint32x4_t c = vcgtq_s32(value, g_XM0000FFFF);  // c = (v > 0xFFFF)
-    int32x4_t b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31);  // b = (c ? 1 : 0)
-    int32x4_t r = vshlq_n_s32(b, 4);                          // r = (b << 4)
-    r = vnegq_s32(r);
-    int32x4_t v = vshlq_s32(value, r);  // v = (v >> r)
-
-    c = vcgtq_s32(v, g_XM000000FF);                 // c = (v > 0xFF)
-    b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31);  // b = (c ? 1 : 0)
-    int32x4_t s = vshlq_n_s32(b, 3);                // s = (b << 3)
-    s = vnegq_s32(s);
-    v = vshlq_s32(v, s);  // v = (v >> s)
-    r = vorrq_s32(r, s);  // r = (r | s)
-
-    c = vcgtq_s32(v, g_XM0000000F);                 // c = (v > 0xF)
-    b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31);  // b = (c ? 1 : 0)
-    s = vshlq_n_s32(b, 2);                          // s = (b << 2)
-    s = vnegq_s32(s);
-    v = vshlq_s32(v, s);  // v = (v >> s)
-    r = vorrq_s32(r, s);  // r = (r | s)
-
-    c = vcgtq_s32(v, g_XM00000003);                 // c = (v > 0x3)
-    b = vshrq_n_s32(vreinterpretq_s32_u32(c), 31);  // b = (c ? 1 : 0)
-    s = vshlq_n_s32(b, 1);                          // s = (b << 1)
-    s = vnegq_s32(s);
-    v = vshlq_s32(v, s);  // v = (v >> s)
-    r = vorrq_s32(r, s);  // r = (r | s)
-
-    s = vshrq_n_s32(v, 1);
-    r = vorrq_s32(r, s);
-    return r;
-}
-
-}  // namespace MathInternal
-
-#endif
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{log2f(V.vector4_f32[0]), log2f(V.vector4_f32[1]),
-                            log2f(V.vector4_f32[2]), log2f(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity);
-    int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest);
-    uint32x4_t isExponentZero =
-        vceqq_s32(vreinterpretq_s32_f32(g_XMZero), rawBiased);
-
-    // Compute exponent and significand for normals.
-    int32x4_t biased = vshrq_n_s32(rawBiased, 23);
-    int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias);
-    int32x4_t trailingNor = trailing;
-
-    // Compute exponent and significand for subnormals.
-    int32x4_t leading = MathInternal::GetLeadingBit(trailing);
-    int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading);
-    int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift);
-    int32x4_t trailingSub = vshlq_s32(trailing, shift);
-    trailingSub = vandq_s32(trailingSub, g_XMQNaNTest);
-    int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor);
-    int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor);
-
-    // Compute the approximation.
-    int32x4_t tmp = vorrq_s32(vreinterpretq_s32_f32(g_XMOne), t);
-    float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne);
-
-    float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y);
-    log2 = vmlaq_f32(g_XMLogEst5, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst4, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst3, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst2, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst1, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst0, log2, y);
-    log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y);
-
-    //  if (x is NaN) -> QNaN
-    //  else if (V is positive)
-    //      if (V is infinite) -> +inf
-    //      else -> log2(V)
-    //  else
-    //      if (V is zero) -> -inf
-    //      else -> -QNaN
-
-    uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
-    isInfinite = vceqq_u32(isInfinite, g_XMInfinity);
-
-    uint32x4_t isGreaterZero = vcgtq_f32(V, g_XMZero);
-    uint32x4_t isNotFinite = vcgtq_f32(V, g_XMInfinity);
-    uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite);
-
-    uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
-    isZero = vceqq_u32(isZero, g_XMZero);
-
-    uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest);
-    uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity);
-    t0 = vceqq_u32(t0, g_XMZero);
-    t1 = vceqq_u32(t1, g_XMInfinity);
-    uint32x4_t isNaN = vbicq_u32(t1, t0);
-
-    float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2);
-    float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN);
-    result = vbslq_f32(isPositive, result, tmp2);
-    result = vbslq_f32(isNaN, g_XMQNaN, result);
-    return result;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_log2_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
-    __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
-    __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased);
-
-    // Compute exponent and significand for normals.
-    __m128i biased = _mm_srli_epi32(rawBiased, 23);
-    __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias);
-    __m128i trailingNor = trailing;
-
-    // Compute exponent and significand for subnormals.
-    __m128i leading = MathInternal::GetLeadingBit(trailing);
-    __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading);
-    __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift);
-    __m128i trailingSub = MathInternal::multi_sll_epi32(trailing, shift);
-    trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest);
-
-    __m128i select0 = _mm_and_si128(isExponentZero, exponentSub);
-    __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor);
-    __m128i e = _mm_or_si128(select0, select1);
-
-    select0 = _mm_and_si128(isExponentZero, trailingSub);
-    select1 = _mm_andnot_si128(isExponentZero, trailingNor);
-    __m128i t = _mm_or_si128(select0, select1);
-
-    // Compute the approximation.
-    __m128i tmp = _mm_or_si128(g_XMOne, t);
-    __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne);
-
-    __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst5);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst4);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst3);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst2);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst1);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst0);
-    log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e));
-
-    //  if (x is NaN) -> QNaN
-    //  else if (V is positive)
-    //      if (V is infinite) -> +inf
-    //      else -> log2(V)
-    //  else
-    //      if (V is zero) -> -inf
-    //      else -> -QNaN
-
-    __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
-    isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity);
-
-    __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero);
-    __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity);
-    __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero);
-
-    __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
-    isZero = _mm_cmpeq_epi32(isZero, g_XMZero);
-
-    __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
-    __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
-    t0 = _mm_cmpeq_epi32(t0, g_XMZero);
-    t1 = _mm_cmpeq_epi32(t1, g_XMInfinity);
-    __m128i isNaN = _mm_andnot_si128(t0, t1);
-
-    select0 = _mm_and_si128(isInfinite, g_XMInfinity);
-    select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2));
-    __m128i result = _mm_or_si128(select0, select1);
-
-    select0 = _mm_and_si128(isZero, g_XMNegInfinity);
-    select1 = _mm_andnot_si128(isZero, g_XMNegQNaN);
-    tmp = _mm_or_si128(select0, select1);
-
-    select0 = _mm_and_si128(isPositive, result);
-    select1 = _mm_andnot_si128(isPositive, tmp);
-    result = _mm_or_si128(select0, select1);
-
-    select0 = _mm_and_si128(isNaN, g_XMQNaN);
-    select1 = _mm_andnot_si128(isNaN, result);
-    result = _mm_or_si128(select0, select1);
-
-    return _mm_castsi128_ps(result);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorLog10(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {
-        {{log10f(V.vector4_f32[0]), log10f(V.vector4_f32[1]),
-          log10f(V.vector4_f32[2]), log10f(V.vector4_f32[3])}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity);
-    int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest);
-    uint32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased);
-
-    // Compute exponent and significand for normals.
-    int32x4_t biased = vshrq_n_s32(rawBiased, 23);
-    int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias);
-    int32x4_t trailingNor = trailing;
-
-    // Compute exponent and significand for subnormals.
-    int32x4_t leading = MathInternal::GetLeadingBit(trailing);
-    int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading);
-    int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift);
-    int32x4_t trailingSub = vshlq_s32(trailing, shift);
-    trailingSub = vandq_s32(trailingSub, g_XMQNaNTest);
-    int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor);
-    int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor);
-
-    // Compute the approximation.
-    int32x4_t tmp = vorrq_s32(g_XMOne, t);
-    float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne);
-
-    float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y);
-    log2 = vmlaq_f32(g_XMLogEst5, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst4, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst3, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst2, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst1, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst0, log2, y);
-    log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y);
-
-    log2 = vmulq_f32(g_XMInvLg10, log2);
-
-    //  if (x is NaN) -> QNaN
-    //  else if (V is positive)
-    //      if (V is infinite) -> +inf
-    //      else -> log2(V)
-    //  else
-    //      if (V is zero) -> -inf
-    //      else -> -QNaN
-
-    uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
-    isInfinite = vceqq_u32(isInfinite, g_XMInfinity);
-
-    uint32x4_t isGreaterZero = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMZero);
-    uint32x4_t isNotFinite = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMInfinity);
-    uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite);
-
-    uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
-    isZero = vceqq_u32(isZero, g_XMZero);
-
-    uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest);
-    uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity);
-    t0 = vceqq_u32(t0, g_XMZero);
-    t1 = vceqq_u32(t1, g_XMInfinity);
-    uint32x4_t isNaN = vbicq_u32(t1, t0);
-
-    float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2);
-    float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN);
-    result = vbslq_f32(isPositive, result, tmp2);
-    result = vbslq_f32(isNaN, g_XMQNaN, result);
-    return result;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_log10_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
-    __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
-    __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased);
-
-    // Compute exponent and significand for normals.
-    __m128i biased = _mm_srli_epi32(rawBiased, 23);
-    __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias);
-    __m128i trailingNor = trailing;
-
-    // Compute exponent and significand for subnormals.
-    __m128i leading = MathInternal::GetLeadingBit(trailing);
-    __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading);
-    __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift);
-    __m128i trailingSub = MathInternal::multi_sll_epi32(trailing, shift);
-    trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest);
-
-    __m128i select0 = _mm_and_si128(isExponentZero, exponentSub);
-    __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor);
-    __m128i e = _mm_or_si128(select0, select1);
-
-    select0 = _mm_and_si128(isExponentZero, trailingSub);
-    select1 = _mm_andnot_si128(isExponentZero, trailingNor);
-    __m128i t = _mm_or_si128(select0, select1);
-
-    // Compute the approximation.
-    __m128i tmp = _mm_or_si128(g_XMOne, t);
-    __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne);
-
-    __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst5);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst4);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst3);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst2);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst1);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst0);
-    log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e));
-
-    log2 = _mm_mul_ps(g_XMInvLg10, log2);
-
-    //  if (x is NaN) -> QNaN
-    //  else if (V is positive)
-    //      if (V is infinite) -> +inf
-    //      else -> log2(V)
-    //  else
-    //      if (V is zero) -> -inf
-    //      else -> -QNaN
-
-    __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
-    isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity);
-
-    __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero);
-    __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity);
-    __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero);
-
-    __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
-    isZero = _mm_cmpeq_epi32(isZero, g_XMZero);
-
-    __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
-    __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
-    t0 = _mm_cmpeq_epi32(t0, g_XMZero);
-    t1 = _mm_cmpeq_epi32(t1, g_XMInfinity);
-    __m128i isNaN = _mm_andnot_si128(t0, t1);
-
-    select0 = _mm_and_si128(isInfinite, g_XMInfinity);
-    select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2));
-    __m128i result = _mm_or_si128(select0, select1);
-
-    select0 = _mm_and_si128(isZero, g_XMNegInfinity);
-    select1 = _mm_andnot_si128(isZero, g_XMNegQNaN);
-    tmp = _mm_or_si128(select0, select1);
-
-    select0 = _mm_and_si128(isPositive, result);
-    select1 = _mm_andnot_si128(isPositive, tmp);
-    result = _mm_or_si128(select0, select1);
-
-    select0 = _mm_and_si128(isNaN, g_XMQNaN);
-    select1 = _mm_andnot_si128(isNaN, result);
-    result = _mm_or_si128(select0, select1);
-
-    return _mm_castsi128_ps(result);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {{{logf(V.vector4_f32[0]), logf(V.vector4_f32[1]),
-                            logf(V.vector4_f32[2]), logf(V.vector4_f32[3])}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity);
-    int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest);
-    uint32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased);
-
-    // Compute exponent and significand for normals.
-    int32x4_t biased = vshrq_n_s32(rawBiased, 23);
-    int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias);
-    int32x4_t trailingNor = trailing;
-
-    // Compute exponent and significand for subnormals.
-    int32x4_t leading = MathInternal::GetLeadingBit(trailing);
-    int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading);
-    int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift);
-    int32x4_t trailingSub = vshlq_s32(trailing, shift);
-    trailingSub = vandq_s32(trailingSub, g_XMQNaNTest);
-    int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor);
-    int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor);
-
-    // Compute the approximation.
-    int32x4_t tmp = vorrq_s32(g_XMOne, t);
-    float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne);
-
-    float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y);
-    log2 = vmlaq_f32(g_XMLogEst5, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst4, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst3, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst2, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst1, log2, y);
-    log2 = vmlaq_f32(g_XMLogEst0, log2, y);
-    log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y);
-
-    log2 = vmulq_f32(g_XMInvLgE, log2);
-
-    //  if (x is NaN) -> QNaN
-    //  else if (V is positive)
-    //      if (V is infinite) -> +inf
-    //      else -> log2(V)
-    //  else
-    //      if (V is zero) -> -inf
-    //      else -> -QNaN
-
-    uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
-    isInfinite = vceqq_u32(isInfinite, g_XMInfinity);
-
-    uint32x4_t isGreaterZero = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMZero);
-    uint32x4_t isNotFinite = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMInfinity);
-    uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite);
-
-    uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
-    isZero = vceqq_u32(isZero, g_XMZero);
-
-    uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest);
-    uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity);
-    t0 = vceqq_u32(t0, g_XMZero);
-    t1 = vceqq_u32(t1, g_XMInfinity);
-    uint32x4_t isNaN = vbicq_u32(t1, t0);
-
-    float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2);
-    float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN);
-    result = vbslq_f32(isPositive, result, tmp2);
-    result = vbslq_f32(isNaN, g_XMQNaN, result);
-    return result;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_log_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
-    __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
-    __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased);
-
-    // Compute exponent and significand for normals.
-    __m128i biased = _mm_srli_epi32(rawBiased, 23);
-    __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias);
-    __m128i trailingNor = trailing;
-
-    // Compute exponent and significand for subnormals.
-    __m128i leading = MathInternal::GetLeadingBit(trailing);
-    __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading);
-    __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift);
-    __m128i trailingSub = MathInternal::multi_sll_epi32(trailing, shift);
-    trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest);
-
-    __m128i select0 = _mm_and_si128(isExponentZero, exponentSub);
-    __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor);
-    __m128i e = _mm_or_si128(select0, select1);
-
-    select0 = _mm_and_si128(isExponentZero, trailingSub);
-    select1 = _mm_andnot_si128(isExponentZero, trailingNor);
-    __m128i t = _mm_or_si128(select0, select1);
-
-    // Compute the approximation.
-    __m128i tmp = _mm_or_si128(g_XMOne, t);
-    __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne);
-
-    __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst5);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst4);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst3);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst2);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst1);
-    log2 = XM_FMADD_PS(log2, y, g_XMLogEst0);
-    log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e));
-
-    log2 = _mm_mul_ps(g_XMInvLgE, log2);
-
-    //  if (x is NaN) -> QNaN
-    //  else if (V is positive)
-    //      if (V is infinite) -> +inf
-    //      else -> log2(V)
-    //  else
-    //      if (V is zero) -> -inf
-    //      else -> -QNaN
-
-    __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
-    isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity);
-
-    __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero);
-    __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity);
-    __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero);
-
-    __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
-    isZero = _mm_cmpeq_epi32(isZero, g_XMZero);
-
-    __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
-    __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
-    t0 = _mm_cmpeq_epi32(t0, g_XMZero);
-    t1 = _mm_cmpeq_epi32(t1, g_XMInfinity);
-    __m128i isNaN = _mm_andnot_si128(t0, t1);
-
-    select0 = _mm_and_si128(isInfinite, g_XMInfinity);
-    select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2));
-    __m128i result = _mm_or_si128(select0, select1);
-
-    select0 = _mm_and_si128(isZero, g_XMNegInfinity);
-    select1 = _mm_andnot_si128(isZero, g_XMNegQNaN);
-    tmp = _mm_or_si128(select0, select1);
-
-    select0 = _mm_and_si128(isPositive, result);
-    select1 = _mm_andnot_si128(isPositive, tmp);
-    result = _mm_or_si128(select0, select1);
-
-    select0 = _mm_and_si128(isNaN, g_XMQNaN);
-    select1 = _mm_andnot_si128(isNaN, result);
-    result = _mm_or_si128(select0, select1);
-
-    return _mm_castsi128_ps(result);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V) noexcept {
-    return XMVectorLog2(V);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorPow(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {{{powf(V1.vector4_f32[0], V2.vector4_f32[0]),
-                            powf(V1.vector4_f32[1], V2.vector4_f32[1]),
-                            powf(V1.vector4_f32[2], V2.vector4_f32[2]),
-                            powf(V1.vector4_f32[3], V2.vector4_f32[3])}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)),
-          powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)),
-          powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)),
-          powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3))}}};
-    return vResult.v;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_pow_ps(V1, V2);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XM_ALIGNED_DATA(16) float a[4];
-    XM_ALIGNED_DATA(16) float b[4];
-    _mm_store_ps(a, V1);
-    _mm_store_ps(b, V2);
-    XMVECTOR vResult = _mm_setr_ps(powf(a[0], b[0]), powf(a[1], b[1]),
-                                   powf(a[2], b[2]), powf(a[3], b[3]));
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{fabsf(V.vector4_f32[0]), fabsf(V.vector4_f32[1]),
-          fabsf(V.vector4_f32[2]), fabsf(V.vector4_f32[3])}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    return vabsq_f32(V);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = _mm_setzero_ps();
-    vResult = _mm_sub_ps(vResult, V);
-    vResult = _mm_max_ps(vResult, V);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorMod(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-    // V1 % V2 = V1 - V2 * truncate(V1 / V2)
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Quotient = XMVectorDivide(V1, V2);
-    Quotient = XMVectorTruncate(Quotient);
-    XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1);
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTOR vResult = XMVectorDivide(V1, V2);
-    vResult = XMVectorTruncate(vResult);
-    return vmlsq_f32(V1, vResult, V2);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = _mm_div_ps(V1, V2);
-    vResult = XMVectorTruncate(vResult);
-    return XM_FNMADD_PS(vResult, V2, V1);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR V;
-    XMVECTOR Result;
-
-    // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI
-    V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v);
-    V = XMVectorRound(V);
-    Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles);
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI
-    XMVECTOR vResult = vmulq_f32(Angles, g_XMReciprocalTwoPi);
-    // Use the inline function due to complexity for rounding
-    vResult = XMVectorRound(vResult);
-    return vmlsq_f32(Angles, vResult, g_XMTwoPi);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI
-    XMVECTOR vResult = _mm_mul_ps(Angles, g_XMReciprocalTwoPi);
-    // Use the inline function due to complexity for rounding
-    vResult = XMVectorRound(vResult);
-    return XM_FNMADD_PS(vResult, g_XMTwoPi, Angles);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V) noexcept {
-    // 11-degree minimax approximation
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{sinf(V.vector4_f32[0]), sinf(V.vector4_f32[1]),
-                            sinf(V.vector4_f32[2]), sinf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Force the value within the bounds of pi
-    XMVECTOR x = XMVectorModAngles(V);
-
-    // Map in [-pi/2,pi/2] with sin(y) = sin(x).
-    uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero);
-    uint32x4_t c = vorrq_u32(g_XMPi, sign);  // pi when x >= 0, -pi when x < 0
-    float32x4_t absx = vabsq_f32(x);
-    float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x);
-    uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi);
-    x = vbslq_f32(comp, x, rflx);
-
-    float32x4_t x2 = vmulq_f32(x, x);
-
-    // Compute polynomial approximation
-    const XMVECTOR SC1 = g_XMSinCoefficients1;
-    const XMVECTOR SC0 = g_XMSinCoefficients0;
-    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1);
-    XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0);
-
-    vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    Result = vmlaq_f32(g_XMOne, Result, x2);
-    Result = vmulq_f32(Result, x);
-    return Result;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_sin_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Force the value within the bounds of pi
-    XMVECTOR x = XMVectorModAngles(V);
-
-    // Map in [-pi/2,pi/2] with sin(y) = sin(x).
-    __m128 sign = _mm_and_ps(x, g_XMNegativeZero);
-    __m128 c = _mm_or_ps(g_XMPi, sign);    // pi when x >= 0, -pi when x < 0
-    __m128 absx = _mm_andnot_ps(sign, x);  // |x|
-    __m128 rflx = _mm_sub_ps(c, x);
-    __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
-    __m128 select0 = _mm_and_ps(comp, x);
-    __m128 select1 = _mm_andnot_ps(comp, rflx);
-    x = _mm_or_ps(select0, select1);
-
-    __m128 x2 = _mm_mul_ps(x, x);
-
-    // Compute polynomial approximation
-    const XMVECTOR SC1 = g_XMSinCoefficients1;
-    __m128 vConstantsB = XM_PERMUTE_PS(SC1, _MM_SHUFFLE(0, 0, 0, 0));
-    const XMVECTOR SC0 = g_XMSinCoefficients0;
-    __m128 vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(3, 3, 3, 3));
-    __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(2, 2, 2, 2));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(1, 1, 1, 1));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(0, 0, 0, 0));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    Result = XM_FMADD_PS(Result, x2, g_XMOne);
-    Result = _mm_mul_ps(Result, x);
-    return Result;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V) noexcept {
-    // 10-degree minimax approximation
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{cosf(V.vector4_f32[0]), cosf(V.vector4_f32[1]),
-                            cosf(V.vector4_f32[2]), cosf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Map V to x in [-pi,pi].
-    XMVECTOR x = XMVectorModAngles(V);
-
-    // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
-    uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero);
-    uint32x4_t c = vorrq_u32(g_XMPi, sign);  // pi when x >= 0, -pi when x < 0
-    float32x4_t absx = vabsq_f32(x);
-    float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x);
-    uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi);
-    x = vbslq_f32(comp, x, rflx);
-    float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne);
-
-    float32x4_t x2 = vmulq_f32(x, x);
-
-    // Compute polynomial approximation
-    const XMVECTOR CC1 = g_XMCosCoefficients1;
-    const XMVECTOR CC0 = g_XMCosCoefficients0;
-    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1);
-    XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0);
-
-    vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    Result = vmlaq_f32(g_XMOne, Result, x2);
-    Result = vmulq_f32(Result, fsign);
-    return Result;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_cos_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Map V to x in [-pi,pi].
-    XMVECTOR x = XMVectorModAngles(V);
-
-    // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
-    XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero);
-    __m128 c = _mm_or_ps(g_XMPi, sign);    // pi when x >= 0, -pi when x < 0
-    __m128 absx = _mm_andnot_ps(sign, x);  // |x|
-    __m128 rflx = _mm_sub_ps(c, x);
-    __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
-    __m128 select0 = _mm_and_ps(comp, x);
-    __m128 select1 = _mm_andnot_ps(comp, rflx);
-    x = _mm_or_ps(select0, select1);
-    select0 = _mm_and_ps(comp, g_XMOne);
-    select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
-    sign = _mm_or_ps(select0, select1);
-
-    __m128 x2 = _mm_mul_ps(x, x);
-
-    // Compute polynomial approximation
-    const XMVECTOR CC1 = g_XMCosCoefficients1;
-    __m128 vConstantsB = XM_PERMUTE_PS(CC1, _MM_SHUFFLE(0, 0, 0, 0));
-    const XMVECTOR CC0 = g_XMCosCoefficients0;
-    __m128 vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(3, 3, 3, 3));
-    __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(2, 2, 2, 2));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(1, 1, 1, 1));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(0, 0, 0, 0));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    Result = XM_FMADD_PS(Result, x2, g_XMOne);
-    Result = _mm_mul_ps(Result, sign);
-    return Result;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMVectorSinCos(XMVECTOR* pSin, XMVECTOR* pCos, FXMVECTOR V) noexcept {
-    assert(pSin != nullptr);
-    assert(pCos != nullptr);
-
-    // 11/10-degree minimax approximation
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Sin = {{{sinf(V.vector4_f32[0]), sinf(V.vector4_f32[1]),
-                         sinf(V.vector4_f32[2]), sinf(V.vector4_f32[3])}}};
-
-    XMVECTORF32 Cos = {{{cosf(V.vector4_f32[0]), cosf(V.vector4_f32[1]),
-                         cosf(V.vector4_f32[2]), cosf(V.vector4_f32[3])}}};
-
-    *pSin = Sin.v;
-    *pCos = Cos.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Force the value within the bounds of pi
-    XMVECTOR x = XMVectorModAngles(V);
-
-    // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
-    uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero);
-    uint32x4_t c = vorrq_u32(g_XMPi, sign);  // pi when x >= 0, -pi when x < 0
-    float32x4_t absx = vabsq_f32(x);
-    float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x);
-    uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi);
-    x = vbslq_f32(comp, x, rflx);
-    float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne);
-
-    float32x4_t x2 = vmulq_f32(x, x);
-
-    // Compute polynomial approximation for sine
-    const XMVECTOR SC1 = g_XMSinCoefficients1;
-    const XMVECTOR SC0 = g_XMSinCoefficients0;
-    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1);
-    XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0);
-
-    vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    Result = vmlaq_f32(g_XMOne, Result, x2);
-    *pSin = vmulq_f32(Result, x);
-
-    // Compute polynomial approximation for cosine
-    const XMVECTOR CC1 = g_XMCosCoefficients1;
-    const XMVECTOR CC0 = g_XMCosCoefficients0;
-    vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1);
-    Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0);
-
-    vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    Result = vmlaq_f32(g_XMOne, Result, x2);
-    *pCos = vmulq_f32(Result, fsign);
-#elif defined(_XM_SVML_INTRINSICS_)
-    *pSin = _mm_sincos_ps(pCos, V);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Force the value within the bounds of pi
-    XMVECTOR x = XMVectorModAngles(V);
-
-    // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x).
-    XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero);
-    __m128 c = _mm_or_ps(g_XMPi, sign);    // pi when x >= 0, -pi when x < 0
-    __m128 absx = _mm_andnot_ps(sign, x);  // |x|
-    __m128 rflx = _mm_sub_ps(c, x);
-    __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
-    __m128 select0 = _mm_and_ps(comp, x);
-    __m128 select1 = _mm_andnot_ps(comp, rflx);
-    x = _mm_or_ps(select0, select1);
-    select0 = _mm_and_ps(comp, g_XMOne);
-    select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
-    sign = _mm_or_ps(select0, select1);
-
-    __m128 x2 = _mm_mul_ps(x, x);
-
-    // Compute polynomial approximation of sine
-    const XMVECTOR SC1 = g_XMSinCoefficients1;
-    __m128 vConstantsB = XM_PERMUTE_PS(SC1, _MM_SHUFFLE(0, 0, 0, 0));
-    const XMVECTOR SC0 = g_XMSinCoefficients0;
-    __m128 vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(3, 3, 3, 3));
-    __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(2, 2, 2, 2));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(1, 1, 1, 1));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(0, 0, 0, 0));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    Result = XM_FMADD_PS(Result, x2, g_XMOne);
-    Result = _mm_mul_ps(Result, x);
-    *pSin = Result;
-
-    // Compute polynomial approximation of cosine
-    const XMVECTOR CC1 = g_XMCosCoefficients1;
-    vConstantsB = XM_PERMUTE_PS(CC1, _MM_SHUFFLE(0, 0, 0, 0));
-    const XMVECTOR CC0 = g_XMCosCoefficients0;
-    vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(3, 3, 3, 3));
-    Result = XM_FMADD_PS(vConstantsB, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(2, 2, 2, 2));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(1, 1, 1, 1));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(0, 0, 0, 0));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    Result = XM_FMADD_PS(Result, x2, g_XMOne);
-    Result = _mm_mul_ps(Result, sign);
-    *pCos = Result;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V) noexcept {
-    // Cody and Waite algorithm to compute tangent.
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{tanf(V.vector4_f32[0]), tanf(V.vector4_f32[1]),
-                            tanf(V.vector4_f32[2]), tanf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_tan_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
-
-    static const XMVECTORF32 TanCoefficients0 = {
-        {{1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f}}};
-    static const XMVECTORF32 TanCoefficients1 = {
-        {{4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f,
-          -1.786170734e-5f}}};
-    static const XMVECTORF32 TanConstants = {
-        {{1.570796371f, 6.077100628e-11f, 0.000244140625f,
-          0.63661977228f /*2 / Pi*/}}};
-    static const XMVECTORU32 Mask = {{{0x1, 0x1, 0x1, 0x1}}};
-
-    XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v);
-
-    XMVECTOR Zero = XMVectorZero();
-
-    XMVECTOR C0 = XMVectorSplatX(TanConstants.v);
-    XMVECTOR C1 = XMVectorSplatY(TanConstants.v);
-    XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v);
-
-    XMVECTOR VA = XMVectorMultiply(V, TwoDivPi);
-
-    VA = XMVectorRound(VA);
-
-    XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V);
-
-    XMVECTOR VB = XMVectorAbs(VA);
-
-    VC = XMVectorNegativeMultiplySubtract(VA, C1, VC);
-
-#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    VB = vreinterpretq_f32_u32(vcvtq_u32_f32(VB));
-#elif defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    reinterpret_cast<__m128i*>(&VB)[0] = _mm_cvttps_epi32(VB);
-#else
-    for (size_t i = 0; i < 4; i++) {
-        VB.vector4_u32[i] = static_cast<uint32_t>(VB.vector4_f32[i]);
-    }
-#endif
-
-    XMVECTOR VC2 = XMVectorMultiply(VC, VC);
-
-    XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v);
-    XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v);
-    XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v);
-    XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v);
-    XMVECTOR T5 = XMVectorSplatY(TanCoefficients1.v);
-    XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v);
-    XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v);
-    XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v);
-
-    XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v);
-    VBIsEven = XMVectorEqualInt(VBIsEven, Zero);
-
-    XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6);
-    XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3);
-    N = XMVectorMultiplyAdd(VC2, N, T5);
-    D = XMVectorMultiplyAdd(VC2, D, T2);
-    N = XMVectorMultiply(VC2, N);
-    D = XMVectorMultiplyAdd(VC2, D, T1);
-    N = XMVectorMultiplyAdd(VC, N, VC);
-    XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon);
-    D = XMVectorMultiplyAdd(VC2, D, T0);
-
-    N = XMVectorSelect(N, VC, VCNearZero);
-    D = XMVectorSelect(D, g_XMOne.v, VCNearZero);
-
-    XMVECTOR R0 = XMVectorNegate(N);
-    XMVECTOR R1 = XMVectorDivide(N, D);
-    R0 = XMVectorDivide(D, R0);
-
-    XMVECTOR VIsZero = XMVectorEqual(V, Zero);
-
-    XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven);
-
-    Result = XMVectorSelect(Result, Zero, VIsZero);
-
-    return Result;
-
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{sinhf(V.vector4_f32[0]), sinhf(V.vector4_f32[1]),
-                            sinhf(V.vector4_f32[2]), sinhf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 Scale = {
-        {{1.442695040888963f, 1.442695040888963f, 1.442695040888963f,
-          1.442695040888963f}}};  // 1.0f / ln(2.0f)
-
-    XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v);
-    XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v);
-    XMVECTOR E1 = XMVectorExp(V1);
-    XMVECTOR E2 = XMVectorExp(V2);
-
-    return vsubq_f32(E1, E2);
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_sinh_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 Scale = {
-        {{1.442695040888963f, 1.442695040888963f, 1.442695040888963f,
-          1.442695040888963f}}};  // 1.0f / ln(2.0f)
-
-    XMVECTOR V1 = XM_FMADD_PS(V, Scale, g_XMNegativeOne);
-    XMVECTOR V2 = XM_FNMADD_PS(V, Scale, g_XMNegativeOne);
-    XMVECTOR E1 = XMVectorExp(V1);
-    XMVECTOR E2 = XMVectorExp(V2);
-
-    return _mm_sub_ps(E1, E2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{coshf(V.vector4_f32[0]), coshf(V.vector4_f32[1]),
-                            coshf(V.vector4_f32[2]), coshf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 Scale = {
-        {{1.442695040888963f, 1.442695040888963f, 1.442695040888963f,
-          1.442695040888963f}}};  // 1.0f / ln(2.0f)
-
-    XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v);
-    XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v);
-    XMVECTOR E1 = XMVectorExp(V1);
-    XMVECTOR E2 = XMVectorExp(V2);
-    return vaddq_f32(E1, E2);
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_cosh_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 Scale = {
-        {{1.442695040888963f, 1.442695040888963f, 1.442695040888963f,
-          1.442695040888963f}}};  // 1.0f / ln(2.0f)
-
-    XMVECTOR V1 = XM_FMADD_PS(V, Scale.v, g_XMNegativeOne.v);
-    XMVECTOR V2 = XM_FNMADD_PS(V, Scale.v, g_XMNegativeOne.v);
-    XMVECTOR E1 = XMVectorExp(V1);
-    XMVECTOR E2 = XMVectorExp(V2);
-    return _mm_add_ps(E1, E2);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{tanhf(V.vector4_f32[0]), tanhf(V.vector4_f32[1]),
-                            tanhf(V.vector4_f32[2]), tanhf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 Scale = {
-        {{2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f,
-          2.8853900817779268f}}};  // 2.0f / ln(2.0f)
-
-    XMVECTOR E = vmulq_f32(V, Scale.v);
-    E = XMVectorExp(E);
-    E = vmlaq_f32(g_XMOneHalf.v, E, g_XMOneHalf.v);
-    E = XMVectorReciprocal(E);
-    return vsubq_f32(g_XMOne.v, E);
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_tanh_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 Scale = {
-        {{2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f,
-          2.8853900817779268f}}};  // 2.0f / ln(2.0f)
-
-    XMVECTOR E = _mm_mul_ps(V, Scale.v);
-    E = XMVectorExp(E);
-    E = XM_FMADD_PS(E, g_XMOneHalf.v, g_XMOneHalf.v);
-    E = _mm_div_ps(g_XMOne.v, E);
-    return _mm_sub_ps(g_XMOne.v, E);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V) noexcept {
-    // 7-degree minimax approximation
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{asinf(V.vector4_f32[0]), asinf(V.vector4_f32[1]),
-                            asinf(V.vector4_f32[2]), asinf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero);
-    float32x4_t x = vabsq_f32(V);
-
-    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
-    float32x4_t oneMValue = vsubq_f32(g_XMOne, x);
-    float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
-    float32x4_t root = XMVectorSqrt(clampOneMValue);
-
-    // Compute polynomial approximation
-    const XMVECTOR AC1 = g_XMArcCoefficients1;
-    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0);
-    XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AC1), 1);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1);
-    t0 = vmlaq_f32(vConstants, t0, x);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0);
-    t0 = vmlaq_f32(vConstants, t0, x);
-
-    const XMVECTOR AC0 = g_XMArcCoefficients0;
-    vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1);
-    t0 = vmlaq_f32(vConstants, t0, x);
-
-    vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0);
-    t0 = vmlaq_f32(vConstants, t0, x);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1);
-    t0 = vmlaq_f32(vConstants, t0, x);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0);
-    t0 = vmlaq_f32(vConstants, t0, x);
-    t0 = vmulq_f32(t0, root);
-
-    float32x4_t t1 = vsubq_f32(g_XMPi, t0);
-    t0 = vbslq_f32(nonnegative, t0, t1);
-    t0 = vsubq_f32(g_XMHalfPi, t0);
-    return t0;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_asin_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
-    __m128 mvalue = _mm_sub_ps(g_XMZero, V);
-    __m128 x = _mm_max_ps(V, mvalue);  // |V|
-
-    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
-    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
-    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
-    __m128 root = _mm_sqrt_ps(clampOneMValue);  // sqrt(1-|V|)
-
-    // Compute polynomial approximation
-    const XMVECTOR AC1 = g_XMArcCoefficients1;
-    __m128 vConstantsB = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(3, 3, 3, 3));
-    __m128 vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(2, 2, 2, 2));
-    __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(1, 1, 1, 1));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(0, 0, 0, 0));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-
-    const XMVECTOR AC0 = g_XMArcCoefficients0;
-    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(3, 3, 3, 3));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(2, 2, 2, 2));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(1, 1, 1, 1));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(0, 0, 0, 0));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-    t0 = _mm_mul_ps(t0, root);
-
-    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
-    t0 = _mm_and_ps(nonnegative, t0);
-    t1 = _mm_andnot_ps(nonnegative, t1);
-    t0 = _mm_or_ps(t0, t1);
-    t0 = _mm_sub_ps(g_XMHalfPi, t0);
-    return t0;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V) noexcept {
-    // 7-degree minimax approximation
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{acosf(V.vector4_f32[0]), acosf(V.vector4_f32[1]),
-                            acosf(V.vector4_f32[2]), acosf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero);
-    float32x4_t x = vabsq_f32(V);
-
-    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
-    float32x4_t oneMValue = vsubq_f32(g_XMOne, x);
-    float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
-    float32x4_t root = XMVectorSqrt(clampOneMValue);
-
-    // Compute polynomial approximation
-    const XMVECTOR AC1 = g_XMArcCoefficients1;
-    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0);
-    XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AC1), 1);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1);
-    t0 = vmlaq_f32(vConstants, t0, x);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0);
-    t0 = vmlaq_f32(vConstants, t0, x);
-
-    const XMVECTOR AC0 = g_XMArcCoefficients0;
-    vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1);
-    t0 = vmlaq_f32(vConstants, t0, x);
-
-    vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0);
-    t0 = vmlaq_f32(vConstants, t0, x);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1);
-    t0 = vmlaq_f32(vConstants, t0, x);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0);
-    t0 = vmlaq_f32(vConstants, t0, x);
-    t0 = vmulq_f32(t0, root);
-
-    float32x4_t t1 = vsubq_f32(g_XMPi, t0);
-    t0 = vbslq_f32(nonnegative, t0, t1);
-    return t0;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_acos_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
-    __m128 mvalue = _mm_sub_ps(g_XMZero, V);
-    __m128 x = _mm_max_ps(V, mvalue);  // |V|
-
-    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
-    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
-    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
-    __m128 root = _mm_sqrt_ps(clampOneMValue);  // sqrt(1-|V|)
-
-    // Compute polynomial approximation
-    const XMVECTOR AC1 = g_XMArcCoefficients1;
-    __m128 vConstantsB = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(3, 3, 3, 3));
-    __m128 vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(2, 2, 2, 2));
-    __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(1, 1, 1, 1));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(0, 0, 0, 0));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-
-    const XMVECTOR AC0 = g_XMArcCoefficients0;
-    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(3, 3, 3, 3));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(2, 2, 2, 2));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(1, 1, 1, 1));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(0, 0, 0, 0));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-    t0 = _mm_mul_ps(t0, root);
-
-    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
-    t0 = _mm_and_ps(nonnegative, t0);
-    t1 = _mm_andnot_ps(nonnegative, t1);
-    t0 = _mm_or_ps(t0, t1);
-    return t0;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V) noexcept {
-    // 17-degree minimax approximation
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{atanf(V.vector4_f32[0]), atanf(V.vector4_f32[1]),
-                            atanf(V.vector4_f32[2]), atanf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t absV = vabsq_f32(V);
-    float32x4_t invV = XMVectorReciprocal(V);
-    uint32x4_t comp = vcgtq_f32(V, g_XMOne);
-    float32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne);
-    comp = vcleq_f32(absV, g_XMOne);
-    sign = vbslq_f32(comp, g_XMZero, sign);
-    float32x4_t x = vbslq_f32(comp, V, invV);
-
-    float32x4_t x2 = vmulq_f32(x, x);
-
-    // Compute polynomial approximation
-    const XMVECTOR TC1 = g_XMATanCoefficients1;
-    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0);
-    XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(TC1), 1);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    const XMVECTOR TC0 = g_XMATanCoefficients0;
-    vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    Result = vmlaq_f32(g_XMOne, Result, x2);
-    Result = vmulq_f32(Result, x);
-
-    float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi);
-    result1 = vsubq_f32(result1, Result);
-
-    comp = vceqq_f32(sign, g_XMZero);
-    Result = vbslq_f32(comp, Result, result1);
-    return Result;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_atan_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 absV = XMVectorAbs(V);
-    __m128 invV = _mm_div_ps(g_XMOne, V);
-    __m128 comp = _mm_cmpgt_ps(V, g_XMOne);
-    __m128 select0 = _mm_and_ps(comp, g_XMOne);
-    __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
-    __m128 sign = _mm_or_ps(select0, select1);
-    comp = _mm_cmple_ps(absV, g_XMOne);
-    select0 = _mm_and_ps(comp, g_XMZero);
-    select1 = _mm_andnot_ps(comp, sign);
-    sign = _mm_or_ps(select0, select1);
-    select0 = _mm_and_ps(comp, V);
-    select1 = _mm_andnot_ps(comp, invV);
-    __m128 x = _mm_or_ps(select0, select1);
-
-    __m128 x2 = _mm_mul_ps(x, x);
-
-    // Compute polynomial approximation
-    const XMVECTOR TC1 = g_XMATanCoefficients1;
-    __m128 vConstantsB = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(3, 3, 3, 3));
-    __m128 vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(2, 2, 2, 2));
-    __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(1, 1, 1, 1));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(0, 0, 0, 0));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    const XMVECTOR TC0 = g_XMATanCoefficients0;
-    vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(3, 3, 3, 3));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(2, 2, 2, 2));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(1, 1, 1, 1));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(0, 0, 0, 0));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    Result = XM_FMADD_PS(Result, x2, g_XMOne);
-
-    Result = _mm_mul_ps(Result, x);
-    __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi);
-    result1 = _mm_sub_ps(result1, Result);
-
-    comp = _mm_cmpeq_ps(sign, g_XMZero);
-    select0 = _mm_and_ps(comp, Result);
-    select1 = _mm_andnot_ps(comp, result1);
-    Result = _mm_or_ps(select0, select1);
-    return Result;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorATan2(FXMVECTOR Y, FXMVECTOR X) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{atan2f(Y.vector4_f32[0], X.vector4_f32[0]),
-                            atan2f(Y.vector4_f32[1], X.vector4_f32[1]),
-                            atan2f(Y.vector4_f32[2], X.vector4_f32[2]),
-                            atan2f(Y.vector4_f32[3], X.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_atan2_ps(Y, X);
-    return Result;
-#else
-
-    // Return the inverse tangent of Y / X in the range of -Pi to Pi with the
-    // following exceptions:
-
-    //     Y == 0 and X is Negative         -> Pi with the sign of Y
-    //     y == 0 and x is positive         -> 0 with the sign of y
-    //     Y != 0 and X == 0                -> Pi / 2 with the sign of Y
-    //     Y != 0 and X is Negative         -> atan(y/x) + (PI with the sign of
-    //     Y) X == -Infinity and Finite Y      -> Pi with the sign of Y X ==
-    //     +Infinity and Finite Y      -> 0 with the sign of Y Y == Infinity and
-    //     X is Finite    -> Pi / 2 with the sign of Y Y == Infinity and X ==
-    //     -Infinity -> 3Pi / 4 with the sign of Y Y == Infinity and X ==
-    //     +Infinity -> Pi / 4 with the sign of Y
-
-    static const XMVECTORF32 ATan2Constants = {
-        {{XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f}}};
-
-    XMVECTOR Zero = XMVectorZero();
-    XMVECTOR ATanResultValid = XMVectorTrueInt();
-
-    XMVECTOR Pi = XMVectorSplatX(ATan2Constants);
-    XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants);
-    XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants);
-    XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants);
-
-    XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero);
-    XMVECTOR XEqualsZero = XMVectorEqual(X, Zero);
-    XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v);
-    XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
-    XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
-    XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);
-
-    XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
-    Pi = XMVectorOrInt(Pi, YSign);
-    PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
-    PiOverFour = XMVectorOrInt(PiOverFour, YSign);
-    ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);
-
-    XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive);
-    XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
-    XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero);
-    XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
-    XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
-    XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity);
-    ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);
-
-    XMVECTOR V = XMVectorDivide(Y, X);
-
-    XMVECTOR R0 = XMVectorATan(V);
-
-    R1 = XMVectorSelect(Pi, g_XMNegativeZero, XIsPositive);
-    R2 = XMVectorAdd(R0, R1);
-
-    return XMVectorSelect(Result, R2, ATanResultValid);
-
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V) noexcept {
-    // 7-degree minimax approximation
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{sinf(V.vector4_f32[0]), sinf(V.vector4_f32[1]),
-                            sinf(V.vector4_f32[2]), sinf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Force the value within the bounds of pi
-    XMVECTOR x = XMVectorModAngles(V);
-
-    // Map in [-pi/2,pi/2] with sin(y) = sin(x).
-    uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero);
-    uint32x4_t c = vorrq_u32(g_XMPi, sign);  // pi when x >= 0, -pi when x < 0
-    float32x4_t absx = vabsq_f32(x);
-    float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x);
-    uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi);
-    x = vbslq_f32(comp, x, rflx);
-
-    float32x4_t x2 = vmulq_f32(x, x);
-
-    // Compute polynomial approximation
-    const XMVECTOR SEC = g_XMSinCoefficients1;
-    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0);
-    XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    Result = vmlaq_f32(g_XMOne, Result, x2);
-    Result = vmulq_f32(Result, x);
-    return Result;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_sin_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Force the value within the bounds of pi
-    XMVECTOR x = XMVectorModAngles(V);
-
-    // Map in [-pi/2,pi/2] with sin(y) = sin(x).
-    __m128 sign = _mm_and_ps(x, g_XMNegativeZero);
-    __m128 c = _mm_or_ps(g_XMPi, sign);    // pi when x >= 0, -pi when x < 0
-    __m128 absx = _mm_andnot_ps(sign, x);  // |x|
-    __m128 rflx = _mm_sub_ps(c, x);
-    __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
-    __m128 select0 = _mm_and_ps(comp, x);
-    __m128 select1 = _mm_andnot_ps(comp, rflx);
-    x = _mm_or_ps(select0, select1);
-
-    __m128 x2 = _mm_mul_ps(x, x);
-
-    // Compute polynomial approximation
-    const XMVECTOR SEC = g_XMSinCoefficients1;
-    __m128 vConstantsB = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(3, 3, 3, 3));
-    __m128 vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(2, 2, 2, 2));
-    __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(1, 1, 1, 1));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-    Result = XM_FMADD_PS(Result, x2, g_XMOne);
-    Result = _mm_mul_ps(Result, x);
-    return Result;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V) noexcept {
-    // 6-degree minimax approximation
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{cosf(V.vector4_f32[0]), cosf(V.vector4_f32[1]),
-                            cosf(V.vector4_f32[2]), cosf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Map V to x in [-pi,pi].
-    XMVECTOR x = XMVectorModAngles(V);
-
-    // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
-    uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero);
-    uint32x4_t c = vorrq_u32(g_XMPi, sign);  // pi when x >= 0, -pi when x < 0
-    float32x4_t absx = vabsq_f32(x);
-    float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x);
-    uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi);
-    x = vbslq_f32(comp, x, rflx);
-    float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne);
-
-    float32x4_t x2 = vmulq_f32(x, x);
-
-    // Compute polynomial approximation
-    const XMVECTOR CEC = g_XMCosCoefficients1;
-    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0);
-    XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    Result = vmlaq_f32(g_XMOne, Result, x2);
-    Result = vmulq_f32(Result, fsign);
-    return Result;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_cos_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Map V to x in [-pi,pi].
-    XMVECTOR x = XMVectorModAngles(V);
-
-    // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
-    XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero);
-    __m128 c = _mm_or_ps(g_XMPi, sign);    // pi when x >= 0, -pi when x < 0
-    __m128 absx = _mm_andnot_ps(sign, x);  // |x|
-    __m128 rflx = _mm_sub_ps(c, x);
-    __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
-    __m128 select0 = _mm_and_ps(comp, x);
-    __m128 select1 = _mm_andnot_ps(comp, rflx);
-    x = _mm_or_ps(select0, select1);
-    select0 = _mm_and_ps(comp, g_XMOne);
-    select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
-    sign = _mm_or_ps(select0, select1);
-
-    __m128 x2 = _mm_mul_ps(x, x);
-
-    // Compute polynomial approximation
-    const XMVECTOR CEC = g_XMCosCoefficients1;
-    __m128 vConstantsB = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(3, 3, 3, 3));
-    __m128 vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(2, 2, 2, 2));
-    __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(1, 1, 1, 1));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-    Result = XM_FMADD_PS(Result, x2, g_XMOne);
-    Result = _mm_mul_ps(Result, sign);
-    return Result;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMVectorSinCosEst(XMVECTOR* pSin, XMVECTOR* pCos, FXMVECTOR V) noexcept {
-    assert(pSin != nullptr);
-    assert(pCos != nullptr);
-
-    // 7/6-degree minimax approximation
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Sin = {{{sinf(V.vector4_f32[0]), sinf(V.vector4_f32[1]),
-                         sinf(V.vector4_f32[2]), sinf(V.vector4_f32[3])}}};
-
-    XMVECTORF32 Cos = {{{cosf(V.vector4_f32[0]), cosf(V.vector4_f32[1]),
-                         cosf(V.vector4_f32[2]), cosf(V.vector4_f32[3])}}};
-
-    *pSin = Sin.v;
-    *pCos = Cos.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Force the value within the bounds of pi
-    XMVECTOR x = XMVectorModAngles(V);
-
-    // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
-    uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero);
-    uint32x4_t c = vorrq_u32(g_XMPi, sign);  // pi when x >= 0, -pi when x < 0
-    float32x4_t absx = vabsq_f32(x);
-    float32x4_t rflx = vsubq_f32(vreinterpretq_f32_u32(c), x);
-    uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi);
-    x = vbslq_f32(comp, x, rflx);
-    float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne);
-
-    float32x4_t x2 = vmulq_f32(x, x);
-
-    // Compute polynomial approximation for sine
-    const XMVECTOR SEC = g_XMSinCoefficients1;
-    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0);
-    XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(SEC), 1);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    Result = vmlaq_f32(g_XMOne, Result, x2);
-    *pSin = vmulq_f32(Result, x);
-
-    // Compute polynomial approximation
-    const XMVECTOR CEC = g_XMCosCoefficients1;
-    vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0);
-    Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(CEC), 1);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    Result = vmlaq_f32(g_XMOne, Result, x2);
-    *pCos = vmulq_f32(Result, fsign);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Force the value within the bounds of pi
-    XMVECTOR x = XMVectorModAngles(V);
-
-    // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x).
-    XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero);
-    __m128 c = _mm_or_ps(g_XMPi, sign);    // pi when x >= 0, -pi when x < 0
-    __m128 absx = _mm_andnot_ps(sign, x);  // |x|
-    __m128 rflx = _mm_sub_ps(c, x);
-    __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
-    __m128 select0 = _mm_and_ps(comp, x);
-    __m128 select1 = _mm_andnot_ps(comp, rflx);
-    x = _mm_or_ps(select0, select1);
-    select0 = _mm_and_ps(comp, g_XMOne);
-    select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
-    sign = _mm_or_ps(select0, select1);
-
-    __m128 x2 = _mm_mul_ps(x, x);
-
-    // Compute polynomial approximation for sine
-    const XMVECTOR SEC = g_XMSinCoefficients1;
-    __m128 vConstantsB = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(3, 3, 3, 3));
-    __m128 vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(2, 2, 2, 2));
-    __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(SEC, _MM_SHUFFLE(1, 1, 1, 1));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-    Result = XM_FMADD_PS(Result, x2, g_XMOne);
-    Result = _mm_mul_ps(Result, x);
-    *pSin = Result;
-
-    // Compute polynomial approximation for cosine
-    const XMVECTOR CEC = g_XMCosCoefficients1;
-    vConstantsB = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(3, 3, 3, 3));
-    vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(2, 2, 2, 2));
-    Result = XM_FMADD_PS(vConstantsB, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(CEC, _MM_SHUFFLE(1, 1, 1, 1));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-    Result = XM_FMADD_PS(Result, x2, g_XMOne);
-    Result = _mm_mul_ps(Result, sign);
-    *pCos = Result;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{tanf(V.vector4_f32[0]), tanf(V.vector4_f32[1]),
-                            tanf(V.vector4_f32[2]), tanf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_tan_ps(V);
-    return Result;
-#else
-
-    XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v);
-
-    XMVECTOR V1 = XMVectorMultiply(V, OneOverPi);
-    V1 = XMVectorRound(V1);
-
-    V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V);
-
-    XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v);
-    XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v);
-    XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v);
-
-    XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2);
-    XMVECTOR V2 = XMVectorMultiply(V1, V1);
-    XMVECTOR V1T0 = XMVectorMultiply(V1, T0);
-    XMVECTOR V1T1 = XMVectorMultiply(V1, T1);
-
-    XMVECTOR D = XMVectorReciprocalEst(V2T2);
-    XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0);
-
-    return XMVectorMultiply(N, D);
-
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V) noexcept {
-    // 3-degree minimax approximation
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result;
-    Result.f[0] = asinf(V.vector4_f32[0]);
-    Result.f[1] = asinf(V.vector4_f32[1]);
-    Result.f[2] = asinf(V.vector4_f32[2]);
-    Result.f[3] = asinf(V.vector4_f32[3]);
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero);
-    float32x4_t x = vabsq_f32(V);
-
-    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
-    float32x4_t oneMValue = vsubq_f32(g_XMOne, x);
-    float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
-    float32x4_t root = XMVectorSqrt(clampOneMValue);
-
-    // Compute polynomial approximation
-    const XMVECTOR AEC = g_XMArcEstCoefficients;
-    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0);
-    XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AEC), 1);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1);
-    t0 = vmlaq_f32(vConstants, t0, x);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0);
-    t0 = vmlaq_f32(vConstants, t0, x);
-    t0 = vmulq_f32(t0, root);
-
-    float32x4_t t1 = vsubq_f32(g_XMPi, t0);
-    t0 = vbslq_f32(nonnegative, t0, t1);
-    t0 = vsubq_f32(g_XMHalfPi, t0);
-    return t0;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_asin_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
-    __m128 mvalue = _mm_sub_ps(g_XMZero, V);
-    __m128 x = _mm_max_ps(V, mvalue);  // |V|
-
-    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
-    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
-    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
-    __m128 root = _mm_sqrt_ps(clampOneMValue);  // sqrt(1-|V|)
-
-    // Compute polynomial approximation
-    const XMVECTOR AEC = g_XMArcEstCoefficients;
-    __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3));
-    __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2));
-    __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-    t0 = _mm_mul_ps(t0, root);
-
-    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
-    t0 = _mm_and_ps(nonnegative, t0);
-    t1 = _mm_andnot_ps(nonnegative, t1);
-    t0 = _mm_or_ps(t0, t1);
-    t0 = _mm_sub_ps(g_XMHalfPi, t0);
-    return t0;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V) noexcept {
-    // 3-degree minimax approximation
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{acosf(V.vector4_f32[0]), acosf(V.vector4_f32[1]),
-                            acosf(V.vector4_f32[2]), acosf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero);
-    float32x4_t x = vabsq_f32(V);
-
-    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
-    float32x4_t oneMValue = vsubq_f32(g_XMOne, x);
-    float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
-    float32x4_t root = XMVectorSqrt(clampOneMValue);
-
-    // Compute polynomial approximation
-    const XMVECTOR AEC = g_XMArcEstCoefficients;
-    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0);
-    XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AEC), 1);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1);
-    t0 = vmlaq_f32(vConstants, t0, x);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0);
-    t0 = vmlaq_f32(vConstants, t0, x);
-    t0 = vmulq_f32(t0, root);
-
-    float32x4_t t1 = vsubq_f32(g_XMPi, t0);
-    t0 = vbslq_f32(nonnegative, t0, t1);
-    return t0;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_acos_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
-    __m128 mvalue = _mm_sub_ps(g_XMZero, V);
-    __m128 x = _mm_max_ps(V, mvalue);  // |V|
-
-    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
-    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
-    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
-    __m128 root = _mm_sqrt_ps(clampOneMValue);  // sqrt(1-|V|)
-
-    // Compute polynomial approximation
-    const XMVECTOR AEC = g_XMArcEstCoefficients;
-    __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3));
-    __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2));
-    __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0));
-    t0 = XM_FMADD_PS(t0, x, vConstants);
-    t0 = _mm_mul_ps(t0, root);
-
-    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
-    t0 = _mm_and_ps(nonnegative, t0);
-    t1 = _mm_andnot_ps(nonnegative, t1);
-    t0 = _mm_or_ps(t0, t1);
-    return t0;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V) noexcept {
-    // 9-degree minimax approximation
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{atanf(V.vector4_f32[0]), atanf(V.vector4_f32[1]),
-                            atanf(V.vector4_f32[2]), atanf(V.vector4_f32[3])}}};
-    return Result.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t absV = vabsq_f32(V);
-    float32x4_t invV = XMVectorReciprocalEst(V);
-    uint32x4_t comp = vcgtq_f32(V, g_XMOne);
-    float32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne);
-    comp = vcleq_f32(absV, g_XMOne);
-    sign = vbslq_f32(comp, g_XMZero, sign);
-    float32x4_t x = vbslq_f32(comp, V, invV);
-
-    float32x4_t x2 = vmulq_f32(x, x);
-
-    // Compute polynomial approximation
-    const XMVECTOR AEC = g_XMATanEstCoefficients1;
-    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0);
-    XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(AEC), 1);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0);
-    Result = vmlaq_f32(vConstants, Result, x2);
-
-    // ATanEstCoefficients0 is already splatted
-    Result = vmlaq_f32(g_XMATanEstCoefficients0, Result, x2);
-    Result = vmulq_f32(Result, x);
-
-    float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi);
-    result1 = vsubq_f32(result1, Result);
-
-    comp = vceqq_f32(sign, g_XMZero);
-    Result = vbslq_f32(comp, Result, result1);
-    return Result;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_atan_ps(V);
-    return Result;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128 absV = XMVectorAbs(V);
-    __m128 invV = _mm_div_ps(g_XMOne, V);
-    __m128 comp = _mm_cmpgt_ps(V, g_XMOne);
-    __m128 select0 = _mm_and_ps(comp, g_XMOne);
-    __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
-    __m128 sign = _mm_or_ps(select0, select1);
-    comp = _mm_cmple_ps(absV, g_XMOne);
-    select0 = _mm_and_ps(comp, g_XMZero);
-    select1 = _mm_andnot_ps(comp, sign);
-    sign = _mm_or_ps(select0, select1);
-    select0 = _mm_and_ps(comp, V);
-    select1 = _mm_andnot_ps(comp, invV);
-    __m128 x = _mm_or_ps(select0, select1);
-
-    __m128 x2 = _mm_mul_ps(x, x);
-
-    // Compute polynomial approximation
-    const XMVECTOR AEC = g_XMATanEstCoefficients1;
-    __m128 vConstantsB = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(3, 3, 3, 3));
-    __m128 vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(2, 2, 2, 2));
-    __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(1, 1, 1, 1));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-
-    vConstants = XM_PERMUTE_PS(AEC, _MM_SHUFFLE(0, 0, 0, 0));
-    Result = XM_FMADD_PS(Result, x2, vConstants);
-    // ATanEstCoefficients0 is already splatted
-    Result = XM_FMADD_PS(Result, x2, g_XMATanEstCoefficients0);
-    Result = _mm_mul_ps(Result, x);
-    __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi);
-    result1 = _mm_sub_ps(result1, Result);
-
-    comp = _mm_cmpeq_ps(sign, g_XMZero);
-    select0 = _mm_and_ps(comp, Result);
-    select1 = _mm_andnot_ps(comp, result1);
-    Result = _mm_or_ps(select0, select1);
-    return Result;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorATan2Est(FXMVECTOR Y,
-                                             FXMVECTOR X) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 Result = {{{
-        atan2f(Y.vector4_f32[0], X.vector4_f32[0]),
-        atan2f(Y.vector4_f32[1], X.vector4_f32[1]),
-        atan2f(Y.vector4_f32[2], X.vector4_f32[2]),
-        atan2f(Y.vector4_f32[3], X.vector4_f32[3]),
-    }}};
-    return Result.v;
-#elif defined(_XM_SVML_INTRINSICS_)
-    XMVECTOR Result = _mm_atan2_ps(Y, X);
-    return Result;
-#else
-
-    static const XMVECTORF32 ATan2Constants = {
-        {{XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */}}};
-
-    const XMVECTOR Zero = XMVectorZero();
-    XMVECTOR ATanResultValid = XMVectorTrueInt();
-
-    XMVECTOR Pi = XMVectorSplatX(ATan2Constants);
-    XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants);
-    XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants);
-    XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants);
-
-    XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero);
-    XMVECTOR XEqualsZero = XMVectorEqual(X, Zero);
-    XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v);
-    XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
-    XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
-    XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);
-
-    XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
-    Pi = XMVectorOrInt(Pi, YSign);
-    PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
-    PiOverFour = XMVectorOrInt(PiOverFour, YSign);
-    ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);
-
-    XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive);
-    XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
-    XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero);
-    XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
-    XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
-    XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity);
-    ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);
-
-    XMVECTOR Reciprocal = XMVectorReciprocalEst(X);
-    XMVECTOR V = XMVectorMultiply(Y, Reciprocal);
-    XMVECTOR R0 = XMVectorATanEst(V);
-
-    R1 = XMVectorSelect(Pi, g_XMNegativeZero, XIsPositive);
-    R2 = XMVectorAdd(R0, R1);
-
-    Result = XMVectorSelect(Result, R2, ATanResultValid);
-
-    return Result;
-
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1,
-                                         float t) noexcept {
-    // V0 + t * (V1 - V0)
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Scale = XMVectorReplicate(t);
-    XMVECTOR Length = XMVectorSubtract(V1, V0);
-    return XMVectorMultiplyAdd(Length, Scale, V0);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTOR L = vsubq_f32(V1, V0);
-    return vmlaq_n_f32(V0, L, t);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR L = _mm_sub_ps(V1, V0);
-    XMVECTOR S = _mm_set_ps1(t);
-    return XM_FMADD_PS(L, S, V0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1,
-                                          FXMVECTOR T) noexcept {
-    // V0 + T * (V1 - V0)
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Length = XMVectorSubtract(V1, V0);
-    return XMVectorMultiplyAdd(Length, T, V0);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTOR L = vsubq_f32(V1, V0);
-    return vmlaq_f32(V0, L, T);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR Length = _mm_sub_ps(V1, V0);
-    return XM_FMADD_PS(Length, T, V0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorHermite(FXMVECTOR Position0,
-                                            FXMVECTOR Tangent0,
-                                            FXMVECTOR Position1,
-                                            GXMVECTOR Tangent1,
-                                            float t) noexcept {
-    // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 +
-    //          (t^3 - 2 * t^2 + t) * Tangent0 +
-    //          (-2 * t^3 + 3 * t^2) * Position1 +
-    //          (t^3 - t^2) * Tangent1
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    float t2 = t * t;
-    float t3 = t * t2;
-
-    XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f);
-    XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t);
-    XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2);
-    XMVECTOR T1 = XMVectorReplicate(t3 - t2);
-
-    XMVECTOR Result = XMVectorMultiply(P0, Position0);
-    Result = XMVectorMultiplyAdd(T0, Tangent0, Result);
-    Result = XMVectorMultiplyAdd(P1, Position1, Result);
-    Result = XMVectorMultiplyAdd(T1, Tangent1, Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float t2 = t * t;
-    float t3 = t * t2;
-
-    float p0 = 2.0f * t3 - 3.0f * t2 + 1.0f;
-    float t0 = t3 - 2.0f * t2 + t;
-    float p1 = -2.0f * t3 + 3.0f * t2;
-    float t1 = t3 - t2;
-
-    XMVECTOR vResult = vmulq_n_f32(Position0, p0);
-    vResult = vmlaq_n_f32(vResult, Tangent0, t0);
-    vResult = vmlaq_n_f32(vResult, Position1, p1);
-    vResult = vmlaq_n_f32(vResult, Tangent1, t1);
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    float t2 = t * t;
-    float t3 = t * t2;
-
-    XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f);
-    XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t);
-    XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2);
-    XMVECTOR T1 = _mm_set_ps1(t3 - t2);
-
-    XMVECTOR vResult = _mm_mul_ps(P0, Position0);
-    vResult = XM_FMADD_PS(Tangent0, T0, vResult);
-    vResult = XM_FMADD_PS(Position1, P1, vResult);
-    vResult = XM_FMADD_PS(Tangent1, T1, vResult);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorHermiteV(FXMVECTOR Position0,
-                                             FXMVECTOR Tangent0,
-                                             FXMVECTOR Position1,
-                                             GXMVECTOR Tangent1,
-                                             HXMVECTOR T) noexcept {
-    // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 +
-    //          (t^3 - 2 * t^2 + t) * Tangent0 +
-    //          (-2 * t^3 + 3 * t^2) * Position1 +
-    //          (t^3 - t^2) * Tangent1
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR T2 = XMVectorMultiply(T, T);
-    XMVECTOR T3 = XMVectorMultiply(T, T2);
-
-    XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] -
-                                    3.0f * T2.vector4_f32[0] + 1.0f);
-    XMVECTOR T0 = XMVectorReplicate(
-        T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]);
-    XMVECTOR P1 =
-        XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]);
-    XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]);
-
-    XMVECTOR Result = XMVectorMultiply(P0, Position0);
-    Result = XMVectorMultiplyAdd(T0, Tangent0, Result);
-    Result = XMVectorMultiplyAdd(P1, Position1, Result);
-    Result = XMVectorMultiplyAdd(T1, Tangent1, Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 CatMulT2 = {{{-3.0f, -2.0f, 3.0f, -1.0f}}};
-    static const XMVECTORF32 CatMulT3 = {{{2.0f, 1.0f, -2.0f, 1.0f}}};
-
-    XMVECTOR T2 = vmulq_f32(T, T);
-    XMVECTOR T3 = vmulq_f32(T, T2);
-    // Mul by the constants against t^2
-    T2 = vmulq_f32(T2, CatMulT2);
-    // Mul by the constants against t^3
-    T3 = vmlaq_f32(T2, T3, CatMulT3);
-    // T3 now has the pre-result.
-    // I need to add t.y only
-    T2 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMaskY));
-    T3 = vaddq_f32(T3, T2);
-    // Add 1.0f to x
-    T3 = vaddq_f32(T3, g_XMIdentityR0);
-    // Now, I have the constants created
-    // Mul the x constant to Position0
-    XMVECTOR vResult = vmulq_lane_f32(Position0, vget_low_f32(T3), 0);  // T3[0]
-    // Mul the y constant to Tangent0
-    vResult = vmlaq_lane_f32(vResult, Tangent0, vget_low_f32(T3), 1);  // T3[1]
-    // Mul the z constant to Position1
-    vResult =
-        vmlaq_lane_f32(vResult, Position1, vget_high_f32(T3), 0);  // T3[2]
-    // Mul the w constant to Tangent1
-    vResult = vmlaq_lane_f32(vResult, Tangent1, vget_high_f32(T3), 1);  // T3[3]
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 CatMulT2 = {{{-3.0f, -2.0f, 3.0f, -1.0f}}};
-    static const XMVECTORF32 CatMulT3 = {{{2.0f, 1.0f, -2.0f, 1.0f}}};
-
-    XMVECTOR T2 = _mm_mul_ps(T, T);
-    XMVECTOR T3 = _mm_mul_ps(T, T2);
-    // Mul by the constants against t^2
-    T2 = _mm_mul_ps(T2, CatMulT2);
-    // Mul by the constants against t^3
-    T3 = XM_FMADD_PS(T3, CatMulT3, T2);
-    // T3 now has the pre-result.
-    // I need to add t.y only
-    T2 = _mm_and_ps(T, g_XMMaskY);
-    T3 = _mm_add_ps(T3, T2);
-    // Add 1.0f to x
-    T3 = _mm_add_ps(T3, g_XMIdentityR0);
-    // Now, I have the constants created
-    // Mul the x constant to Position0
-    XMVECTOR vResult = XM_PERMUTE_PS(T3, _MM_SHUFFLE(0, 0, 0, 0));
-    vResult = _mm_mul_ps(vResult, Position0);
-    // Mul the y constant to Tangent0
-    T2 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(1, 1, 1, 1));
-    vResult = XM_FMADD_PS(T2, Tangent0, vResult);
-    // Mul the z constant to Position1
-    T2 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(2, 2, 2, 2));
-    vResult = XM_FMADD_PS(T2, Position1, vResult);
-    // Mul the w constant to Tangent1
-    T3 = XM_PERMUTE_PS(T3, _MM_SHUFFLE(3, 3, 3, 3));
-    vResult = XM_FMADD_PS(T3, Tangent1, vResult);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorCatmullRom(FXMVECTOR Position0,
-                                               FXMVECTOR Position1,
-                                               FXMVECTOR Position2,
-                                               GXMVECTOR Position3,
-                                               float t) noexcept {
-    // Result = ((-t^3 + 2 * t^2 - t) * Position0 +
-    //           (3 * t^3 - 5 * t^2 + 2) * Position1 +
-    //           (-3 * t^3 + 4 * t^2 + t) * Position2 +
-    //           (t^3 - t^2) * Position3) * 0.5
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    float t2 = t * t;
-    float t3 = t * t2;
-
-    XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f);
-    XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
-    XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
-    XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f);
-
-    XMVECTOR Result = XMVectorMultiply(P0, Position0);
-    Result = XMVectorMultiplyAdd(P1, Position1, Result);
-    Result = XMVectorMultiplyAdd(P2, Position2, Result);
-    Result = XMVectorMultiplyAdd(P3, Position3, Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float t2 = t * t;
-    float t3 = t * t2;
-
-    float p0 = (-t3 + 2.0f * t2 - t) * 0.5f;
-    float p1 = (3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f;
-    float p2 = (-3.0f * t3 + 4.0f * t2 + t) * 0.5f;
-    float p3 = (t3 - t2) * 0.5f;
-
-    XMVECTOR P1 = vmulq_n_f32(Position1, p1);
-    XMVECTOR P0 = vmlaq_n_f32(P1, Position0, p0);
-    XMVECTOR P3 = vmulq_n_f32(Position3, p3);
-    XMVECTOR P2 = vmlaq_n_f32(P3, Position2, p2);
-    P0 = vaddq_f32(P0, P2);
-    return P0;
-#elif defined(_XM_SSE_INTRINSICS_)
-    float t2 = t * t;
-    float t3 = t * t2;
-
-    XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f);
-    XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
-    XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
-    XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f);
-
-    P1 = _mm_mul_ps(Position1, P1);
-    P0 = XM_FMADD_PS(Position0, P0, P1);
-    P3 = _mm_mul_ps(Position3, P3);
-    P2 = XM_FMADD_PS(Position2, P2, P3);
-    P0 = _mm_add_ps(P0, P2);
-    return P0;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorCatmullRomV(FXMVECTOR Position0,
-                                                FXMVECTOR Position1,
-                                                FXMVECTOR Position2,
-                                                GXMVECTOR Position3,
-                                                HXMVECTOR T) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    float fx = T.vector4_f32[0];
-    float fy = T.vector4_f32[1];
-    float fz = T.vector4_f32[2];
-    float fw = T.vector4_f32[3];
-    XMVECTORF32 vResult = {
-        {{0.5f *
-              ((-fx * fx * fx + 2 * fx * fx - fx) * Position0.vector4_f32[0] +
-               (3 * fx * fx * fx - 5 * fx * fx + 2) * Position1.vector4_f32[0] +
-               (-3 * fx * fx * fx + 4 * fx * fx + fx) *
-                   Position2.vector4_f32[0] +
-               (fx * fx * fx - fx * fx) * Position3.vector4_f32[0]),
-
-          0.5f *
-              ((-fy * fy * fy + 2 * fy * fy - fy) * Position0.vector4_f32[1] +
-               (3 * fy * fy * fy - 5 * fy * fy + 2) * Position1.vector4_f32[1] +
-               (-3 * fy * fy * fy + 4 * fy * fy + fy) *
-                   Position2.vector4_f32[1] +
-               (fy * fy * fy - fy * fy) * Position3.vector4_f32[1]),
-
-          0.5f *
-              ((-fz * fz * fz + 2 * fz * fz - fz) * Position0.vector4_f32[2] +
-               (3 * fz * fz * fz - 5 * fz * fz + 2) * Position1.vector4_f32[2] +
-               (-3 * fz * fz * fz + 4 * fz * fz + fz) *
-                   Position2.vector4_f32[2] +
-               (fz * fz * fz - fz * fz) * Position3.vector4_f32[2]),
-
-          0.5f *
-              ((-fw * fw * fw + 2 * fw * fw - fw) * Position0.vector4_f32[3] +
-               (3 * fw * fw * fw - 5 * fw * fw + 2) * Position1.vector4_f32[3] +
-               (-3 * fw * fw * fw + 4 * fw * fw + fw) *
-                   Position2.vector4_f32[3] +
-               (fw * fw * fw - fw * fw) * Position3.vector4_f32[3])}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 Catmul2 = {{{2.0f, 2.0f, 2.0f, 2.0f}}};
-    static const XMVECTORF32 Catmul3 = {{{3.0f, 3.0f, 3.0f, 3.0f}}};
-    static const XMVECTORF32 Catmul4 = {{{4.0f, 4.0f, 4.0f, 4.0f}}};
-    static const XMVECTORF32 Catmul5 = {{{5.0f, 5.0f, 5.0f, 5.0f}}};
-    // Cache T^2 and T^3
-    XMVECTOR T2 = vmulq_f32(T, T);
-    XMVECTOR T3 = vmulq_f32(T, T2);
-    // Perform the Position0 term
-    XMVECTOR vResult = vaddq_f32(T2, T2);
-    vResult = vsubq_f32(vResult, T);
-    vResult = vsubq_f32(vResult, T3);
-    vResult = vmulq_f32(vResult, Position0);
-    // Perform the Position1 term and add
-    XMVECTOR vTemp = vmulq_f32(T3, Catmul3);
-    vTemp = vmlsq_f32(vTemp, T2, Catmul5);
-    vTemp = vaddq_f32(vTemp, Catmul2);
-    vResult = vmlaq_f32(vResult, vTemp, Position1);
-    // Perform the Position2 term and add
-    vTemp = vmulq_f32(T2, Catmul4);
-    vTemp = vmlsq_f32(vTemp, T3, Catmul3);
-    vTemp = vaddq_f32(vTemp, T);
-    vResult = vmlaq_f32(vResult, vTemp, Position2);
-    // Position3 is the last term
-    T3 = vsubq_f32(T3, T2);
-    vResult = vmlaq_f32(vResult, T3, Position3);
-    // Multiply by 0.5f and exit
-    vResult = vmulq_f32(vResult, g_XMOneHalf);
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 Catmul2 = {{{2.0f, 2.0f, 2.0f, 2.0f}}};
-    static const XMVECTORF32 Catmul3 = {{{3.0f, 3.0f, 3.0f, 3.0f}}};
-    static const XMVECTORF32 Catmul4 = {{{4.0f, 4.0f, 4.0f, 4.0f}}};
-    static const XMVECTORF32 Catmul5 = {{{5.0f, 5.0f, 5.0f, 5.0f}}};
-    // Cache T^2 and T^3
-    XMVECTOR T2 = _mm_mul_ps(T, T);
-    XMVECTOR T3 = _mm_mul_ps(T, T2);
-    // Perform the Position0 term
-    XMVECTOR vResult = _mm_add_ps(T2, T2);
-    vResult = _mm_sub_ps(vResult, T);
-    vResult = _mm_sub_ps(vResult, T3);
-    vResult = _mm_mul_ps(vResult, Position0);
-    // Perform the Position1 term and add
-    XMVECTOR vTemp = _mm_mul_ps(T3, Catmul3);
-    vTemp = XM_FNMADD_PS(T2, Catmul5, vTemp);
-    vTemp = _mm_add_ps(vTemp, Catmul2);
-    vResult = XM_FMADD_PS(vTemp, Position1, vResult);
-    // Perform the Position2 term and add
-    vTemp = _mm_mul_ps(T2, Catmul4);
-    vTemp = XM_FNMADD_PS(T3, Catmul3, vTemp);
-    vTemp = _mm_add_ps(vTemp, T);
-    vResult = XM_FMADD_PS(vTemp, Position2, vResult);
-    // Position3 is the last term
-    T3 = _mm_sub_ps(T3, T2);
-    vResult = XM_FMADD_PS(T3, Position3, vResult);
-    // Multiply by 0.5f and exit
-    vResult = _mm_mul_ps(vResult, g_XMOneHalf);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorBaryCentric(FXMVECTOR Position0,
-                                                FXMVECTOR Position1,
-                                                FXMVECTOR Position2, float f,
-                                                float g) noexcept {
-    // Result = Position0 + f * (Position1 - Position0) + g * (Position2 -
-    // Position0)
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR P10 = XMVectorSubtract(Position1, Position0);
-    XMVECTOR ScaleF = XMVectorReplicate(f);
-
-    XMVECTOR P20 = XMVectorSubtract(Position2, Position0);
-    XMVECTOR ScaleG = XMVectorReplicate(g);
-
-    XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0);
-    Result = XMVectorMultiplyAdd(P20, ScaleG, Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTOR R1 = vsubq_f32(Position1, Position0);
-    XMVECTOR R2 = vsubq_f32(Position2, Position0);
-    R1 = vmlaq_n_f32(Position0, R1, f);
-    return vmlaq_n_f32(R1, R2, g);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR R1 = _mm_sub_ps(Position1, Position0);
-    XMVECTOR R2 = _mm_sub_ps(Position2, Position0);
-    XMVECTOR SF = _mm_set_ps1(f);
-    R1 = XM_FMADD_PS(R1, SF, Position0);
-    XMVECTOR SG = _mm_set_ps1(g);
-    return XM_FMADD_PS(R2, SG, R1);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVectorBaryCentricV(FXMVECTOR Position0,
-                                                 FXMVECTOR Position1,
-                                                 FXMVECTOR Position2,
-                                                 GXMVECTOR F,
-                                                 HXMVECTOR G) noexcept {
-    // Result = Position0 + f * (Position1 - Position0) + g * (Position2 -
-    // Position0)
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR P10 = XMVectorSubtract(Position1, Position0);
-    XMVECTOR P20 = XMVectorSubtract(Position2, Position0);
-
-    XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0);
-    Result = XMVectorMultiplyAdd(P20, G, Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTOR R1 = vsubq_f32(Position1, Position0);
-    XMVECTOR R2 = vsubq_f32(Position2, Position0);
-    R1 = vmlaq_f32(Position0, R1, F);
-    return vmlaq_f32(R1, R2, G);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR R1 = _mm_sub_ps(Position1, Position0);
-    XMVECTOR R2 = _mm_sub_ps(Position2, Position0);
-    R1 = XM_FMADD_PS(R1, F, Position0);
-    return XM_FMADD_PS(R2, G, R1);
-#endif
-}
-
-/****************************************************************************
- *
- * 2D Vector
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-// Comparison operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2));
-    return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) ==
-            0xFFFFFFFFFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
-    // z and w are don't care
-    return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline uint32_t XM_CALLCONV XMVector2EqualR(FXMVECTOR V1,
-                                            FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    uint32_t CR = 0;
-    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
-        (V1.vector4_f32[1] == V2.vector4_f32[1])) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
-               (V1.vector4_f32[1] != V2.vector4_f32[1])) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2));
-    uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0);
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFFFFFFFFFFFU) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
-    // z and w are don't care
-    int iTest = _mm_movemask_ps(vTemp) & 3;
-    uint32_t CR = 0;
-    if (iTest == 3) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!iTest) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_u32[0] == V2.vector4_u32[0]) &&
-             (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)),
-                                vget_low_u32(vreinterpretq_u32_f32(V2)));
-    return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) ==
-            0xFFFFFFFFFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3) == 3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline uint32_t XM_CALLCONV XMVector2EqualIntR(FXMVECTOR V1,
-                                               FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    uint32_t CR = 0;
-    if ((V1.vector4_u32[0] == V2.vector4_u32[0]) &&
-        (V1.vector4_u32[1] == V2.vector4_u32[1])) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) &&
-               (V1.vector4_u32[1] != V2.vector4_u32[1])) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)),
-                                vget_low_u32(vreinterpretq_u32_f32(V2)));
-    uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0);
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFFFFFFFFFFFU) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3;
-    uint32_t CR = 0;
-    if (iTest == 3) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!iTest) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2,
-                                           FXMVECTOR Epsilon) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    float dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]);
-    float dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]);
-    return ((dx <= Epsilon.vector4_f32[0]) && (dy <= Epsilon.vector4_f32[1]));
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t vDelta = vsub_f32(vget_low_f32(V1), vget_low_f32(V2));
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    uint32x2_t vTemp = vacle_f32(vDelta, vget_low_u32(Epsilon));
-#else
-    uint32x2_t vTemp = vcle_f32(vabs_f32(vDelta), vget_low_f32(Epsilon));
-#endif
-    uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0);
-    return (r == 0xFFFFFFFFFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Get the difference
-    XMVECTOR vDelta = _mm_sub_ps(V1, V2);
-    // Get the absolute value of the difference
-    XMVECTOR vTemp = _mm_setzero_ps();
-    vTemp = _mm_sub_ps(vTemp, vDelta);
-    vTemp = _mm_max_ps(vTemp, vDelta);
-    vTemp = _mm_cmple_ps(vTemp, Epsilon);
-    // z and w are don't care
-    return (((_mm_movemask_ps(vTemp) & 3) == 0x3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] != V2.vector4_f32[0]) ||
-             (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vceq_f32(vget_low_f32(V1), vget_low_f32(V2));
-    return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) !=
-            0xFFFFFFFFFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
-    // z and w are don't care
-    return (((_mm_movemask_ps(vTemp) & 3) != 3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2NotEqualInt(FXMVECTOR V1,
-                                             FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_u32[0] != V2.vector4_u32[0]) ||
-             (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)),
-                                vget_low_u32(vreinterpretq_u32_f32(V2)));
-    return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) !=
-            0xFFFFFFFFFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 3) != 3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] > V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vcgt_f32(vget_low_f32(V1), vget_low_f32(V2));
-    return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) ==
-            0xFFFFFFFFFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2);
-    // z and w are don't care
-    return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline uint32_t XM_CALLCONV XMVector2GreaterR(FXMVECTOR V1,
-                                              FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    uint32_t CR = 0;
-    if ((V1.vector4_f32[0] > V2.vector4_f32[0]) &&
-        (V1.vector4_f32[1] > V2.vector4_f32[1])) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) &&
-               (V1.vector4_f32[1] <= V2.vector4_f32[1])) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vcgt_f32(vget_low_f32(V1), vget_low_f32(V2));
-    uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0);
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFFFFFFFFFFFU) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2);
-    int iTest = _mm_movemask_ps(vTemp) & 3;
-    uint32_t CR = 0;
-    if (iTest == 3) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!iTest) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2GreaterOrEqual(FXMVECTOR V1,
-                                                FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vcge_f32(vget_low_f32(V1), vget_low_f32(V2));
-    return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) ==
-            0xFFFFFFFFFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpge_ps(V1, V2);
-    return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline uint32_t XM_CALLCONV XMVector2GreaterOrEqualR(FXMVECTOR V1,
-                                                     FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    uint32_t CR = 0;
-    if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
-        (V1.vector4_f32[1] >= V2.vector4_f32[1])) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
-               (V1.vector4_f32[1] < V2.vector4_f32[1])) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vcge_f32(vget_low_f32(V1), vget_low_f32(V2));
-    uint64_t r = vget_lane_u64(vreinterpret_u64_u32(vTemp), 0);
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFFFFFFFFFFFU) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpge_ps(V1, V2);
-    int iTest = _mm_movemask_ps(vTemp) & 3;
-    uint32_t CR = 0;
-    if (iTest == 3) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!iTest) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2Less(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vclt_f32(vget_low_f32(V1), vget_low_f32(V2));
-    return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) ==
-            0xFFFFFFFFFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmplt_ps(V1, V2);
-    return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2LessOrEqual(FXMVECTOR V1,
-                                             FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vTemp = vcle_f32(vget_low_f32(V1), vget_low_f32(V2));
-    return (vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) ==
-            0xFFFFFFFFFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmple_ps(V1, V2);
-    return (((_mm_movemask_ps(vTemp) & 3) == 3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2InBounds(FXMVECTOR V,
-                                          FXMVECTOR Bounds) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] &&
-              V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
-             (V.vector4_f32[1] <= Bounds.vector4_f32[1] &&
-              V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    float32x2_t B = vget_low_f32(Bounds);
-    // Test if less than or equal
-    uint32x2_t ivTemp1 = vcle_f32(VL, B);
-    // Negate the bounds
-    float32x2_t vTemp2 = vneg_f32(B);
-    // Test if greater or equal (Reversed)
-    uint32x2_t ivTemp2 = vcle_f32(vTemp2, VL);
-    // Blend answers
-    ivTemp1 = vand_u32(ivTemp1, ivTemp2);
-    // x and y in bounds?
-    return (vget_lane_u64(vreinterpret_u64_u32(ivTemp1), 0) ==
-            0xFFFFFFFFFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Test if less than or equal
-    XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds);
-    // Negate the bounds
-    XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne);
-    // Test if greater or equal (Reversed)
-    vTemp2 = _mm_cmple_ps(vTemp2, V);
-    // Blend answers
-    vTemp1 = _mm_and_ps(vTemp1, vTemp2);
-    // x and y in bounds? (z and w are don't care)
-    return (((_mm_movemask_ps(vTemp1) & 0x3) == 0x3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && \
-    !defined(__INTEL_COMPILER)
-#pragma float_control(push)
-#pragma float_control(precise, on)
-#endif
-
-inline bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (XMISNAN(V.vector4_f32[0]) || XMISNAN(V.vector4_f32[1]));
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(__clang__) && defined(__FINITE_MATH_ONLY__)
-    return isnan(vgetq_lane_f32(V, 0)) || isnan(vgetq_lane_f32(V, 1));
-#else
-    float32x2_t VL = vget_low_f32(V);
-    // Test against itself. NaN is always not equal
-    uint32x2_t vTempNan = vceq_f32(VL, VL);
-    // If x or y are NaN, the mask is zero
-    return (vget_lane_u64(vreinterpret_u64_u32(vTempNan), 0) !=
-            0xFFFFFFFFFFFFFFFFU);
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-#if defined(__clang__) && defined(__FINITE_MATH_ONLY__)
-    XM_ALIGNED_DATA(16) float tmp[4];
-    _mm_store_ps(tmp, V);
-    return isnan(tmp[0]) || isnan(tmp[1]);
-#else
-    // Test against itself. NaN is always not equal
-    XMVECTOR vTempNan = _mm_cmpneq_ps(V, V);
-    // If x or y are NaN, the mask is non-zero
-    return ((_mm_movemask_ps(vTempNan) & 3) != 0);
-#endif
-#endif
-}
-
-#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && \
-    !defined(__INTEL_COMPILER)
-#pragma float_control(pop)
-#endif
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    return (XMISINF(V.vector4_f32[0]) || XMISINF(V.vector4_f32[1]));
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Mask off the sign bit
-    uint32x2_t vTemp = vand_u32(vget_low_u32(vreinterpretq_u32_f32(V)),
-                                vget_low_u32(g_XMAbsMask));
-    // Compare to infinity
-    vTemp = vceq_f32(vreinterpret_f32_u32(vTemp), vget_low_f32(g_XMInfinity));
-    // If any are infinity, the signs are true.
-    return vget_lane_u64(vreinterpret_u64_u32(vTemp), 0) != 0;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Mask off the sign bit
-    __m128 vTemp = _mm_and_ps(V, g_XMAbsMask);
-    // Compare to infinity
-    vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity);
-    // If x or z are infinity, the signs are true.
-    return ((_mm_movemask_ps(vTemp) & 3) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Computation operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result;
-    Result.f[0] = Result.f[1] = Result.f[2] = Result.f[3] =
-        V1.vector4_f32[0] * V2.vector4_f32[0] +
-        V1.vector4_f32[1] * V2.vector4_f32[1];
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Perform the dot product on x and y
-    float32x2_t vTemp = vmul_f32(vget_low_f32(V1), vget_low_f32(V2));
-    vTemp = vpadd_f32(vTemp, vTemp);
-    return vcombine_f32(vTemp, vTemp);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    return _mm_dp_ps(V1, V2, 0x3f);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vDot = _mm_mul_ps(V1, V2);
-    vDot = _mm_hadd_ps(vDot, vDot);
-    vDot = _mm_moveldup_ps(vDot);
-    return vDot;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x and y
-    XMVECTOR vLengthSq = _mm_mul_ps(V1, V2);
-    // vTemp has y splatted
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
-    // x+y
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    return vLengthSq;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Cross(FXMVECTOR V1,
-                                           FXMVECTOR V2) noexcept {
-    // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ]
-
-#if defined(_XM_NO_INTRINSICS_)
-    float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) -
-                   (V1.vector4_f32[1] * V2.vector4_f32[0]);
-    XMVECTORF32 vResult;
-    vResult.f[0] = vResult.f[1] = vResult.f[2] = vResult.f[3] = fCross;
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 Negate = {{{1.f, -1.f, 0, 0}}};
-
-    float32x2_t vTemp =
-        vmul_f32(vget_low_f32(V1), vrev64_f32(vget_low_f32(V2)));
-    vTemp = vmul_f32(vTemp, vget_low_f32(Negate));
-    vTemp = vpadd_f32(vTemp, vTemp);
-    return vcombine_f32(vTemp, vTemp);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Swap x and y
-    XMVECTOR vResult = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 1, 0, 1));
-    // Perform the muls
-    vResult = _mm_mul_ps(vResult, V1);
-    // Splat y
-    XMVECTOR vTemp = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(1, 1, 1, 1));
-    // Sub the values
-    vResult = _mm_sub_ss(vResult, vTemp);
-    // Splat the cross product
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 0, 0, 0));
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V) noexcept {
-    return XMVector2Dot(V, V);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-    Result = XMVector2LengthSq(V);
-    Result = XMVectorReciprocalSqrtEst(Result);
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    // Dot2
-    float32x2_t vTemp = vmul_f32(VL, VL);
-    vTemp = vpadd_f32(vTemp, vTemp);
-    // Reciprocal sqrt (estimate)
-    vTemp = vrsqrte_f32(vTemp);
-    return vcombine_f32(vTemp, vTemp);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f);
-    return _mm_rsqrt_ps(vTemp);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_rsqrt_ss(vTemp);
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    return vLengthSq;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x and y
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    // vTemp has y splatted
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
-    // x+y
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    vLengthSq = _mm_rsqrt_ss(vLengthSq);
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    return vLengthSq;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-    Result = XMVector2LengthSq(V);
-    Result = XMVectorReciprocalSqrt(Result);
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    // Dot2
-    float32x2_t vTemp = vmul_f32(VL, VL);
-    vTemp = vpadd_f32(vTemp, vTemp);
-    // Reciprocal sqrt
-    float32x2_t S0 = vrsqrte_f32(vTemp);
-    float32x2_t P0 = vmul_f32(vTemp, S0);
-    float32x2_t R0 = vrsqrts_f32(P0, S0);
-    float32x2_t S1 = vmul_f32(S0, R0);
-    float32x2_t P1 = vmul_f32(vTemp, S1);
-    float32x2_t R1 = vrsqrts_f32(P1, S1);
-    float32x2_t Result = vmul_f32(S1, R1);
-    return vcombine_f32(Result, Result);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f);
-    XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp);
-    return _mm_div_ps(g_XMOne, vLengthSq);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_sqrt_ss(vTemp);
-    vLengthSq = _mm_div_ss(g_XMOne, vLengthSq);
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    return vLengthSq;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x and y
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    // vTemp has y splatted
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
-    // x+y
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    vLengthSq = _mm_sqrt_ss(vLengthSq);
-    vLengthSq = _mm_div_ss(g_XMOne, vLengthSq);
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    return vLengthSq;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-    Result = XMVector2LengthSq(V);
-    Result = XMVectorSqrtEst(Result);
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    // Dot2
-    float32x2_t vTemp = vmul_f32(VL, VL);
-    vTemp = vpadd_f32(vTemp, vTemp);
-    const float32x2_t zero = vdup_n_f32(0);
-    uint32x2_t VEqualsZero = vceq_f32(vTemp, zero);
-    // Sqrt (estimate)
-    float32x2_t Result = vrsqrte_f32(vTemp);
-    Result = vmul_f32(vTemp, Result);
-    Result = vbsl_f32(VEqualsZero, zero, Result);
-    return vcombine_f32(Result, Result);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f);
-    return _mm_sqrt_ps(vTemp);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_sqrt_ss(vTemp);
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    return vLengthSq;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x and y
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    // vTemp has y splatted
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
-    // x+y
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    vLengthSq = _mm_sqrt_ss(vLengthSq);
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    return vLengthSq;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-    Result = XMVector2LengthSq(V);
-    Result = XMVectorSqrt(Result);
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    // Dot2
-    float32x2_t vTemp = vmul_f32(VL, VL);
-    vTemp = vpadd_f32(vTemp, vTemp);
-    const float32x2_t zero = vdup_n_f32(0);
-    uint32x2_t VEqualsZero = vceq_f32(vTemp, zero);
-    // Sqrt
-    float32x2_t S0 = vrsqrte_f32(vTemp);
-    float32x2_t P0 = vmul_f32(vTemp, S0);
-    float32x2_t R0 = vrsqrts_f32(P0, S0);
-    float32x2_t S1 = vmul_f32(S0, R0);
-    float32x2_t P1 = vmul_f32(vTemp, S1);
-    float32x2_t R1 = vrsqrts_f32(P1, S1);
-    float32x2_t Result = vmul_f32(S1, R1);
-    Result = vmul_f32(vTemp, Result);
-    Result = vbsl_f32(VEqualsZero, zero, Result);
-    return vcombine_f32(Result, Result);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f);
-    return _mm_sqrt_ps(vTemp);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    XMVECTOR vTemp = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_sqrt_ss(vTemp);
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    return vLengthSq;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x and y
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    // vTemp has y splatted
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
-    // x+y
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    vLengthSq = _mm_sqrt_ps(vLengthSq);
-    return vLengthSq;
-#endif
-}
-
-//------------------------------------------------------------------------------
-// XMVector2NormalizeEst uses a reciprocal estimate and
-// returns QNaN on zero and infinite vectors.
-
-inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-    Result = XMVector2ReciprocalLength(V);
-    Result = XMVectorMultiply(V, Result);
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    // Dot2
-    float32x2_t vTemp = vmul_f32(VL, VL);
-    vTemp = vpadd_f32(vTemp, vTemp);
-    // Reciprocal sqrt (estimate)
-    vTemp = vrsqrte_f32(vTemp);
-    // Normalize
-    float32x2_t Result = vmul_f32(VL, vTemp);
-    return vcombine_f32(Result, Result);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0x3f);
-    XMVECTOR vResult = _mm_rsqrt_ps(vTemp);
-    return _mm_mul_ps(vResult, V);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_rsqrt_ss(vLengthSq);
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    vLengthSq = _mm_mul_ps(vLengthSq, V);
-    return vLengthSq;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x and y
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    // vTemp has y splatted
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
-    // x+y
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    vLengthSq = _mm_rsqrt_ss(vLengthSq);
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    vLengthSq = _mm_mul_ps(vLengthSq, V);
-    return vLengthSq;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR vResult = XMVector2Length(V);
-    float fLength = vResult.vector4_f32[0];
-
-    // Prevent divide by zero
-    if (fLength > 0) {
-        fLength = 1.0f / fLength;
-    }
-
-    vResult.vector4_f32[0] = V.vector4_f32[0] * fLength;
-    vResult.vector4_f32[1] = V.vector4_f32[1] * fLength;
-    vResult.vector4_f32[2] = V.vector4_f32[2] * fLength;
-    vResult.vector4_f32[3] = V.vector4_f32[3] * fLength;
-    return vResult;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    // Dot2
-    float32x2_t vTemp = vmul_f32(VL, VL);
-    vTemp = vpadd_f32(vTemp, vTemp);
-    uint32x2_t VEqualsZero = vceq_f32(vTemp, vdup_n_f32(0));
-    uint32x2_t VEqualsInf = vceq_f32(vTemp, vget_low_f32(g_XMInfinity));
-    // Reciprocal sqrt (2 iterations of Newton-Raphson)
-    float32x2_t S0 = vrsqrte_f32(vTemp);
-    float32x2_t P0 = vmul_f32(vTemp, S0);
-    float32x2_t R0 = vrsqrts_f32(P0, S0);
-    float32x2_t S1 = vmul_f32(S0, R0);
-    float32x2_t P1 = vmul_f32(vTemp, S1);
-    float32x2_t R1 = vrsqrts_f32(P1, S1);
-    vTemp = vmul_f32(S1, R1);
-    // Normalize
-    float32x2_t Result = vmul_f32(VL, vTemp);
-    Result = vbsl_f32(VEqualsZero, vdup_n_f32(0), Result);
-    Result = vbsl_f32(VEqualsInf, vget_low_f32(g_XMQNaN), Result);
-    return vcombine_f32(Result, Result);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f);
-    // Prepare for the division
-    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
-    // Create zero with a single instruction
-    XMVECTOR vZeroMask = _mm_setzero_ps();
-    // Test for a divide by zero (Must be FP to detect -0.0)
-    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
-    // Failsafe on zero (Or epsilon) length planes
-    // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
-    // Reciprocal mul to perform the normalization
-    vResult = _mm_div_ps(V, vResult);
-    // Any that are infinity, set to zero
-    vResult = _mm_and_ps(vResult, vZeroMask);
-    // Select qnan or result based on infinite length
-    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
-    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
-    vResult = _mm_or_ps(vTemp1, vTemp2);
-    return vResult;
-#elif defined(_XM_SSE3_INTRINSICS_)
-    // Perform the dot product on x and y only
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_moveldup_ps(vLengthSq);
-    // Prepare for the division
-    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
-    // Create zero with a single instruction
-    XMVECTOR vZeroMask = _mm_setzero_ps();
-    // Test for a divide by zero (Must be FP to detect -0.0)
-    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
-    // Failsafe on zero (Or epsilon) length planes
-    // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
-    // Reciprocal mul to perform the normalization
-    vResult = _mm_div_ps(V, vResult);
-    // Any that are infinity, set to zero
-    vResult = _mm_and_ps(vResult, vZeroMask);
-    // Select qnan or result based on infinite length
-    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
-    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
-    vResult = _mm_or_ps(vTemp1, vTemp2);
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x and y only
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    // Prepare for the division
-    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
-    // Create zero with a single instruction
-    XMVECTOR vZeroMask = _mm_setzero_ps();
-    // Test for a divide by zero (Must be FP to detect -0.0)
-    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
-    // Failsafe on zero (Or epsilon) length planes
-    // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
-    // Reciprocal mul to perform the normalization
-    vResult = _mm_div_ps(V, vResult);
-    // Any that are infinity, set to zero
-    vResult = _mm_and_ps(vResult, vZeroMask);
-    // Select qnan or result based on infinite length
-    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
-    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
-    vResult = _mm_or_ps(vTemp1, vTemp2);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2ClampLength(FXMVECTOR V, float LengthMin,
-                                                 float LengthMax) noexcept {
-    XMVECTOR ClampMax = XMVectorReplicate(LengthMax);
-    XMVECTOR ClampMin = XMVectorReplicate(LengthMin);
-    return XMVector2ClampLengthV(V, ClampMin, ClampMax);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2ClampLengthV(
-    FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept {
-    assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)));
-    assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)));
-    assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero));
-    assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero));
-    assert(XMVector2GreaterOrEqual(LengthMax, LengthMin));
-
-    XMVECTOR LengthSq = XMVector2LengthSq(V);
-
-    const XMVECTOR Zero = XMVectorZero();
-
-    XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq);
-
-    XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
-    XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero);
-
-    XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength);
-
-    XMVECTOR Normal = XMVectorMultiply(V, RcpLength);
-
-    XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
-    Length = XMVectorSelect(LengthSq, Length, Select);
-    Normal = XMVectorSelect(LengthSq, Normal, Select);
-
-    XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax);
-    XMVECTOR ControlMin = XMVectorLess(Length, LengthMin);
-
-    XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
-    ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
-
-    XMVECTOR Result = XMVectorMultiply(Normal, ClampLength);
-
-    // Preserve the original vector (with no precision loss) if the length falls
-    // within the given range
-    XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin);
-    Result = XMVectorSelect(Result, V, Control);
-
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Reflect(FXMVECTOR Incident,
-                                             FXMVECTOR Normal) noexcept {
-    // Result = Incident - (2 * dot(Incident, Normal)) * Normal
-
-    XMVECTOR Result;
-    Result = XMVector2Dot(Incident, Normal);
-    Result = XMVectorAdd(Result, Result);
-    Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident);
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Refract(FXMVECTOR Incident,
-                                             FXMVECTOR Normal,
-                                             float RefractionIndex) noexcept {
-    XMVECTOR Index = XMVectorReplicate(RefractionIndex);
-    return XMVector2RefractV(Incident, Normal, Index);
-}
-
-//------------------------------------------------------------------------------
-
-// Return the refraction of a 2D vector
-inline XMVECTOR XM_CALLCONV XMVector2RefractV(
-    FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept {
-    // Result = RefractionIndex * Incident - Normal * (RefractionIndex *
-    // dot(Incident, Normal) + sqrt(1 - RefractionIndex * RefractionIndex * (1 -
-    // dot(Incident, Normal) * dot(Incident, Normal))))
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    float IDotN = (Incident.vector4_f32[0] * Normal.vector4_f32[0]) +
-                  (Incident.vector4_f32[1] * Normal.vector4_f32[1]);
-    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
-    float RY = 1.0f - (IDotN * IDotN);
-    float RX = 1.0f - (RY * RefractionIndex.vector4_f32[0] *
-                       RefractionIndex.vector4_f32[0]);
-    RY = 1.0f -
-         (RY * RefractionIndex.vector4_f32[1] * RefractionIndex.vector4_f32[1]);
-    if (RX >= 0.0f) {
-        RX = (RefractionIndex.vector4_f32[0] * Incident.vector4_f32[0]) -
-             (Normal.vector4_f32[0] *
-              ((RefractionIndex.vector4_f32[0] * IDotN) + sqrtf(RX)));
-    } else {
-        RX = 0.0f;
-    }
-    if (RY >= 0.0f) {
-        RY = (RefractionIndex.vector4_f32[1] * Incident.vector4_f32[1]) -
-             (Normal.vector4_f32[1] *
-              ((RefractionIndex.vector4_f32[1] * IDotN) + sqrtf(RY)));
-    } else {
-        RY = 0.0f;
-    }
-
-    XMVECTOR vResult;
-    vResult.vector4_f32[0] = RX;
-    vResult.vector4_f32[1] = RY;
-    vResult.vector4_f32[2] = 0.0f;
-    vResult.vector4_f32[3] = 0.0f;
-    return vResult;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t IL = vget_low_f32(Incident);
-    float32x2_t NL = vget_low_f32(Normal);
-    float32x2_t RIL = vget_low_f32(RefractionIndex);
-    // Get the 2D Dot product of Incident-Normal
-    float32x2_t vTemp = vmul_f32(IL, NL);
-    float32x2_t IDotN = vpadd_f32(vTemp, vTemp);
-    // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
-    vTemp = vmls_f32(vget_low_f32(g_XMOne), IDotN, IDotN);
-    vTemp = vmul_f32(vTemp, RIL);
-    vTemp = vmls_f32(vget_low_f32(g_XMOne), vTemp, RIL);
-    // If any terms are <=0, sqrt() will fail, punt to zero
-    uint32x2_t vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero));
-    // Sqrt(vTemp)
-    float32x2_t S0 = vrsqrte_f32(vTemp);
-    float32x2_t P0 = vmul_f32(vTemp, S0);
-    float32x2_t R0 = vrsqrts_f32(P0, S0);
-    float32x2_t S1 = vmul_f32(S0, R0);
-    float32x2_t P1 = vmul_f32(vTemp, S1);
-    float32x2_t R1 = vrsqrts_f32(P1, S1);
-    float32x2_t S2 = vmul_f32(S1, R1);
-    vTemp = vmul_f32(vTemp, S2);
-    // R = RefractionIndex * IDotN + sqrt(R)
-    vTemp = vmla_f32(vTemp, RIL, IDotN);
-    // Result = RefractionIndex * Incident - Normal * R
-    float32x2_t vResult = vmul_f32(RIL, IL);
-    vResult = vmls_f32(vResult, vTemp, NL);
-    vResult =
-        vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(vResult), vMask));
-    return vcombine_f32(vResult, vResult);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Result = RefractionIndex * Incident - Normal * (RefractionIndex *
-    // dot(Incident, Normal) + sqrt(1 - RefractionIndex * RefractionIndex * (1 -
-    // dot(Incident, Normal) * dot(Incident, Normal)))) Get the 2D Dot product
-    // of Incident-Normal
-    XMVECTOR IDotN = XMVector2Dot(Incident, Normal);
-    // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
-    XMVECTOR vTemp = XM_FNMADD_PS(IDotN, IDotN, g_XMOne);
-    vTemp = _mm_mul_ps(vTemp, RefractionIndex);
-    vTemp = XM_FNMADD_PS(vTemp, RefractionIndex, g_XMOne);
-    // If any terms are <=0, sqrt() will fail, punt to zero
-    XMVECTOR vMask = _mm_cmpgt_ps(vTemp, g_XMZero);
-    // R = RefractionIndex * IDotN + sqrt(R)
-    vTemp = _mm_sqrt_ps(vTemp);
-    vTemp = XM_FMADD_PS(RefractionIndex, IDotN, vTemp);
-    // Result = RefractionIndex * Incident - Normal * R
-    XMVECTOR vResult = _mm_mul_ps(RefractionIndex, Incident);
-    vResult = XM_FNMADD_PS(vTemp, Normal, vResult);
-    vResult = _mm_and_ps(vResult, vMask);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {{{-V.vector4_f32[1], V.vector4_f32[0], 0.f, 0.f}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 Negate = {{{-1.f, 1.f, 0, 0}}};
-    const float32x2_t zero = vdup_n_f32(0);
-
-    float32x2_t VL = vget_low_f32(V);
-    float32x2_t Result = vmul_f32(vrev64_f32(VL), vget_low_f32(Negate));
-    return vcombine_f32(Result, zero);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
-    vResult = _mm_mul_ps(vResult, g_XMNegateX);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMVector2AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept {
-    XMVECTOR Result = XMVector2Dot(N1, N2);
-    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
-    Result = XMVectorACosEst(Result);
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMVector2AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept {
-    XMVECTOR Result = XMVector2Dot(N1, N2);
-    Result = XMVectorClamp(Result, g_XMNegativeOne, g_XMOne);
-    Result = XMVectorACos(Result);
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMVector2AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-    XMVECTOR L1 = XMVector2ReciprocalLength(V1);
-    XMVECTOR L2 = XMVector2ReciprocalLength(V2);
-
-    XMVECTOR Dot = XMVector2Dot(V1, V2);
-
-    L1 = XMVectorMultiply(L1, L2);
-
-    XMVECTOR CosAngle = XMVectorMultiply(Dot, L1);
-    CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v);
-
-    return XMVectorACos(CosAngle);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2LinePointDistance(
-    FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point) noexcept {
-    // Given a vector PointVector from LinePoint1 to Point and a vector
-    // LineVector from LinePoint1 to LinePoint2, the scaled distance
-    // PointProjectionScale from LinePoint1 to the perpendicular projection
-    // of PointVector onto the line is defined as:
-    //
-    //     PointProjectionScale = dot(PointVector, LineVector) /
-    //     LengthSq(LineVector)
-
-    XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1);
-    XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1);
-
-    XMVECTOR LengthSq = XMVector2LengthSq(LineVector);
-
-    XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector);
-    PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq);
-
-    XMVECTOR DistanceVector =
-        XMVectorMultiply(LineVector, PointProjectionScale);
-    DistanceVector = XMVectorSubtract(PointVector, DistanceVector);
-
-    return XMVector2Length(DistanceVector);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMVector2IntersectLine(FXMVECTOR Line1Point1, FXMVECTOR Line1Point2,
-                       FXMVECTOR Line2Point1, GXMVECTOR Line2Point2) noexcept {
-#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
-
-    XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1);
-    XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1);
-    XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1);
-
-    XMVECTOR C1 = XMVector2Cross(V1, V2);
-    XMVECTOR C2 = XMVector2Cross(V2, V3);
-
-    XMVECTOR Result;
-    const XMVECTOR Zero = XMVectorZero();
-    if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v)) {
-        if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v)) {
-            // Coincident
-            Result = g_XMInfinity.v;
-        } else {
-            // Parallel
-            Result = g_XMQNaN.v;
-        }
-    } else {
-        // Intersection point = Line1Point1 + V1 * (C2 / C1)
-        XMVECTOR Scale = XMVectorReciprocal(C1);
-        Scale = XMVectorMultiply(C2, Scale);
-        Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1);
-    }
-
-    return Result;
-
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1);
-    XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1);
-    XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1);
-    // Generate the cross products
-    XMVECTOR C1 = XMVector2Cross(V1, V2);
-    XMVECTOR C2 = XMVector2Cross(V2, V3);
-    // If C1 is not close to epsilon, use the calculated value
-    XMVECTOR vResultMask = _mm_setzero_ps();
-    vResultMask = _mm_sub_ps(vResultMask, C1);
-    vResultMask = _mm_max_ps(vResultMask, C1);
-    // 0xFFFFFFFF if the calculated value is to be used
-    vResultMask = _mm_cmpgt_ps(vResultMask, g_XMEpsilon);
-    // If C1 is close to epsilon, which fail type is it? INFINITY or NAN?
-    XMVECTOR vFailMask = _mm_setzero_ps();
-    vFailMask = _mm_sub_ps(vFailMask, C2);
-    vFailMask = _mm_max_ps(vFailMask, C2);
-    vFailMask = _mm_cmple_ps(vFailMask, g_XMEpsilon);
-    XMVECTOR vFail = _mm_and_ps(vFailMask, g_XMInfinity);
-    vFailMask = _mm_andnot_ps(vFailMask, g_XMQNaN);
-    // vFail is NAN or INF
-    vFail = _mm_or_ps(vFail, vFailMask);
-    // Intersection point = Line1Point1 + V1 * (C2 / C1)
-    XMVECTOR vResult = _mm_div_ps(C2, C1);
-    vResult = XM_FMADD_PS(vResult, V1, Line1Point1);
-    // Use result, or failure value
-    vResult = _mm_and_ps(vResult, vResultMask);
-    vResultMask = _mm_andnot_ps(vResultMask, vFail);
-    vResult = _mm_or_ps(vResult, vResultMask);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2Transform(FXMVECTOR V,
-                                               FXMMATRIX M) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Y = XMVectorSplatY(V);
-    XMVECTOR X = XMVectorSplatX(V);
-
-    XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
-    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    float32x4_t Result = vmlaq_lane_f32(M.r[3], M.r[1], VL, 1);  // Y
-    return vmlaq_lane_f32(Result, M.r[0], VL, 0);                // X
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));  // Y
-    vResult = XM_FMADD_PS(vResult, M.r[1], M.r[3]);
-    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));  // X
-    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_ inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream(
-    XMFLOAT4* pOutputStream, size_t OutputStride, const XMFLOAT2* pInputStream,
-    size_t InputStride, size_t VectorCount, FXMMATRIX M) noexcept {
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
-
-    assert(OutputStride >= sizeof(XMFLOAT4));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    for (size_t i = 0; i < VectorCount; i++) {
-        XMVECTOR V =
-            XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pInputVector));
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 26015, "PREfast noise: Esp:1307")
-#endif
-
-        XMStoreFloat4(reinterpret_cast<XMFLOAT4*>(pOutputVector), Result);
-
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        if ((InputStride == sizeof(XMFLOAT2)) &&
-            (OutputStride == sizeof(XMFLOAT4))) {
-            for (size_t j = 0; j < four; ++j) {
-                float32x4x2_t V =
-                    vld2q_f32(reinterpret_cast<const float*>(pInputVector));
-                pInputVector += sizeof(XMFLOAT2) * 4;
-
-                float32x2_t r3 = vget_low_f32(row3);
-                float32x2_t r = vget_low_f32(row0);
-                XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0),
-                                                   V.val[0], r, 0);  // Ax+M
-                XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1),
-                                                   V.val[0], r, 1);  // Bx+N
-
-                XM_PREFETCH(pInputVector);
-
-                r3 = vget_high_f32(row3);
-                r = vget_high_f32(row0);
-                XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0),
-                                                   V.val[0], r, 0);  // Cx+O
-                XMVECTOR vResult3 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1),
-                                                   V.val[0], r, 1);  // Dx+P
-
-                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
-
-                r = vget_low_f32(row1);
-                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0);  // Ax+Ey+M
-                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1);  // Bx+Fy+N
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
-
-                r = vget_high_f32(row1);
-                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0);  // Cx+Gy+O
-                vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1);  // Dx+Hy+P
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
-
-                float32x4x4_t R;
-                R.val[0] = vResult0;
-                R.val[1] = vResult1;
-                R.val[2] = vResult2;
-                R.val[3] = vResult3;
-
-                vst4q_f32(reinterpret_cast<float*>(pOutputVector), R);
-                pOutputVector += sizeof(XMFLOAT4) * 4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++) {
-        float32x2_t V = vld1_f32(reinterpret_cast<const float*>(pInputVector));
-        pInputVector += InputStride;
-
-        XMVECTOR vResult = vmlaq_lane_f32(row3, row0, V, 0);  // X
-        vResult = vmlaq_lane_f32(vResult, row1, V, 1);        // Y
-
-        vst1q_f32(reinterpret_cast<float*>(pOutputVector), vResult);
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_AVX2_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
-        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);
-        __m256 row3 = _mm256_broadcast_ps(&M.r[3]);
-
-        if (InputStride == sizeof(XMFLOAT2)) {
-            if (OutputStride == sizeof(XMFLOAT4)) {
-                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F)) {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m256 VV = _mm256_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        pInputVector += sizeof(XMFLOAT2) * 4;
-
-                        __m256 Y2 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
-                        __m256 X2 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
-                        __m256 Y1 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
-                        __m256 X1 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
-                        __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
-                        __m256 vTempA = _mm256_mul_ps(X1, row0);
-                        __m256 vTempA2 = _mm256_mul_ps(X2, row0);
-                        vTempA = _mm256_add_ps(vTempA, vTempB);
-                        vTempA2 = _mm256_add_ps(vTempA2, vTempB2);
-
-                        X1 = _mm256_insertf128_ps(
-                            vTempA, _mm256_castps256_ps128(vTempA2), 1);
-                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                        X1);
-                        pOutputVector += sizeof(XMFLOAT4) * 2;
-
-                        X2 = _mm256_insertf128_ps(
-                            vTempA2, _mm256_extractf128_ps(vTempA, 1), 0);
-                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                        X2);
-                        pOutputVector += sizeof(XMFLOAT4) * 2;
-
-                        i += 4;
-                    }
-                } else {
-                    // Packed input, packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m256 VV = _mm256_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        pInputVector += sizeof(XMFLOAT2) * 4;
-
-                        __m256 Y2 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
-                        __m256 X2 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
-                        __m256 Y1 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
-                        __m256 X1 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
-                        __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
-                        __m256 vTempA = _mm256_mul_ps(X1, row0);
-                        __m256 vTempA2 = _mm256_mul_ps(X2, row0);
-                        vTempA = _mm256_add_ps(vTempA, vTempB);
-                        vTempA2 = _mm256_add_ps(vTempA2, vTempB2);
-
-                        X1 = _mm256_insertf128_ps(
-                            vTempA, _mm256_castps256_ps128(vTempA2), 1);
-                        _mm256_storeu_ps(
-                            reinterpret_cast<float*>(pOutputVector), X1);
-                        pOutputVector += sizeof(XMFLOAT4) * 2;
-
-                        X2 = _mm256_insertf128_ps(
-                            vTempA2, _mm256_extractf128_ps(vTempA, 1), 0);
-                        _mm256_storeu_ps(
-                            reinterpret_cast<float*>(pOutputVector), X2);
-                        pOutputVector += sizeof(XMFLOAT4) * 2;
-
-                        i += 4;
-                    }
-                }
-            } else {
-                // Packed input, unpacked output
-                for (size_t j = 0; j < four; ++j) {
-                    __m256 VV = _mm256_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector));
-                    pInputVector += sizeof(XMFLOAT2) * 4;
-
-                    __m256 Y2 =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
-                    __m256 X2 =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
-                    __m256 Y1 =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
-                    __m256 X1 =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
-                    __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
-                    __m256 vTempA = _mm256_mul_ps(X1, row0);
-                    __m256 vTempA2 = _mm256_mul_ps(X2, row0);
-                    vTempA = _mm256_add_ps(vTempA, vTempB);
-                    vTempA2 = _mm256_add_ps(vTempA2, vTempB2);
-
-                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                  _mm256_castps256_ps128(vTempA));
-                    pOutputVector += OutputStride;
-
-                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                  _mm256_castps256_ps128(vTempA2));
-                    pOutputVector += OutputStride;
-
-                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                  _mm256_extractf128_ps(vTempA, 1));
-                    pOutputVector += OutputStride;
-
-                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                  _mm256_extractf128_ps(vTempA2, 1));
-                    pOutputVector += OutputStride;
-
-                    i += 4;
-                }
-            }
-        }
-    }
-
-    if (i < VectorCount) {
-        const XMVECTOR row0 = M.r[0];
-        const XMVECTOR row1 = M.r[1];
-        const XMVECTOR row3 = M.r[3];
-
-        for (; i < VectorCount; i++) {
-            __m128 xy = _mm_castpd_ps(
-                _mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
-            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
-
-            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
-            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
-            vTemp = _mm_add_ps(vTemp, vTemp2);
-
-            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
-            pOutputVector += OutputStride;
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t two = VectorCount >> 1;
-    if (two > 0) {
-        if (InputStride == sizeof(XMFLOAT2)) {
-            if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) &&
-                !(OutputStride & 0xF)) {
-                // Packed input, aligned output
-                for (size_t j = 0; j < two; ++j) {
-                    XMVECTOR V = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector));
-                    pInputVector += sizeof(XMFLOAT2) * 2;
-
-                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
-                    XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-
-                    XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                 vTemp);
-                    pOutputVector += OutputStride;
-
-                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-
-                    vTemp = XM_FMADD_PS(Y, row1, row3);
-                    vTemp2 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-
-                    XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                 vTemp);
-                    pOutputVector += OutputStride;
-
-                    i += 2;
-                }
-            } else {
-                // Packed input, unaligned output
-                for (size_t j = 0; j < two; ++j) {
-                    XMVECTOR V = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector));
-                    pInputVector += sizeof(XMFLOAT2) * 2;
-
-                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
-                    XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-
-                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-
-                    vTemp = XM_FMADD_PS(Y, row1, row3);
-                    vTemp2 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-
-                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    i += 2;
-                }
-            }
-        }
-    }
-
-    if (!(reinterpret_cast<uintptr_t>(pInputVector) & 0xF) &&
-        !(InputStride & 0xF)) {
-        if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) &&
-            !(OutputStride & 0xF)) {
-            // Aligned input, aligned output
-            for (; i < VectorCount; i++) {
-                XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(
-                    reinterpret_cast<const __m128i*>(pInputVector)));
-                pInputVector += InputStride;
-
-                XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-                XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-                XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
-                XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
-                vTemp = _mm_add_ps(vTemp, vTemp2);
-
-                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
-                pOutputVector += OutputStride;
-            }
-        } else {
-            // Aligned input, unaligned output
-            for (; i < VectorCount; i++) {
-                XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(
-                    reinterpret_cast<const __m128i*>(pInputVector)));
-                pInputVector += InputStride;
-
-                XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-                XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-                XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
-                XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
-                vTemp = _mm_add_ps(vTemp, vTemp2);
-
-                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
-                pOutputVector += OutputStride;
-            }
-        }
-    } else {
-        // Unaligned input
-        for (; i < VectorCount; i++) {
-            __m128 xy = _mm_castpd_ps(
-                _mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
-            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
-
-            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
-            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
-            vTemp = _mm_add_ps(vTemp, vTemp2);
-
-            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
-            pOutputVector += OutputStride;
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformCoord(FXMVECTOR V,
-                                                    FXMMATRIX M) noexcept {
-    XMVECTOR Y = XMVectorSplatY(V);
-    XMVECTOR X = XMVectorSplatX(V);
-
-    XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
-    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
-
-    XMVECTOR W = XMVectorSplatW(Result);
-    return XMVectorDivide(Result, W);
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_ inline XMFLOAT2* XM_CALLCONV
-XMVector2TransformCoordStream(XMFLOAT2* pOutputStream, size_t OutputStride,
-                              const XMFLOAT2* pInputStream, size_t InputStride,
-                              size_t VectorCount, FXMMATRIX M) noexcept {
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
-
-    assert(OutputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    for (size_t i = 0; i < VectorCount; i++) {
-        XMVECTOR V =
-            XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pInputVector));
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-        XMVECTOR W = XMVectorSplatW(Result);
-
-        Result = XMVectorDivide(Result, W);
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 26015, "PREfast noise: Esp:1307")
-#endif
-
-        XMStoreFloat2(reinterpret_cast<XMFLOAT2*>(pOutputVector), Result);
-
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        if ((InputStride == sizeof(XMFLOAT2)) &&
-            (OutputStride == sizeof(XMFLOAT2))) {
-            for (size_t j = 0; j < four; ++j) {
-                float32x4x2_t V =
-                    vld2q_f32(reinterpret_cast<const float*>(pInputVector));
-                pInputVector += sizeof(XMFLOAT2) * 4;
-
-                float32x2_t r3 = vget_low_f32(row3);
-                float32x2_t r = vget_low_f32(row0);
-                XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0),
-                                                   V.val[0], r, 0);  // Ax+M
-                XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1),
-                                                   V.val[0], r, 1);  // Bx+N
-
-                XM_PREFETCH(pInputVector);
-
-                r3 = vget_high_f32(row3);
-                r = vget_high_f32(row0);
-                XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r,
-                                            1);  // Dx+P
-
-                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
-
-                r = vget_low_f32(row1);
-                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0);  // Ax+Ey+M
-                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1);  // Bx+Fy+N
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
-
-                r = vget_high_f32(row1);
-                W = vmlaq_lane_f32(W, V.val[1], r, 1);  // Dx+Hy+P
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
-
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-                V.val[0] = vdivq_f32(vResult0, W);
-                V.val[1] = vdivq_f32(vResult1, W);
-#else
-                // 2 iterations of Newton-Raphson refinement of reciprocal
-                float32x4_t Reciprocal = vrecpeq_f32(W);
-                float32x4_t S = vrecpsq_f32(Reciprocal, W);
-                Reciprocal = vmulq_f32(S, Reciprocal);
-                S = vrecpsq_f32(Reciprocal, W);
-                Reciprocal = vmulq_f32(S, Reciprocal);
-
-                V.val[0] = vmulq_f32(vResult0, Reciprocal);
-                V.val[1] = vmulq_f32(vResult1, Reciprocal);
-#endif
-
-                vst2q_f32(reinterpret_cast<float*>(pOutputVector), V);
-                pOutputVector += sizeof(XMFLOAT2) * 4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++) {
-        float32x2_t V = vld1_f32(reinterpret_cast<const float*>(pInputVector));
-        pInputVector += InputStride;
-
-        XMVECTOR vResult = vmlaq_lane_f32(row3, row0, V, 0);  // X
-        vResult = vmlaq_lane_f32(vResult, row1, V, 1);        // Y
-
-        V = vget_high_f32(vResult);
-        float32x2_t W = vdup_lane_f32(V, 1);
-
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-        V = vget_low_f32(vResult);
-        V = vdiv_f32(V, W);
-#else
-        // 2 iterations of Newton-Raphson refinement of reciprocal for W
-        float32x2_t Reciprocal = vrecpe_f32(W);
-        float32x2_t S = vrecps_f32(Reciprocal, W);
-        Reciprocal = vmul_f32(S, Reciprocal);
-        S = vrecps_f32(Reciprocal, W);
-        Reciprocal = vmul_f32(S, Reciprocal);
-
-        V = vget_low_f32(vResult);
-        V = vmul_f32(V, Reciprocal);
-#endif
-
-        vst1_f32(reinterpret_cast<float*>(pOutputVector), V);
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_AVX2_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
-        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);
-        __m256 row3 = _mm256_broadcast_ps(&M.r[3]);
-
-        if (InputStride == sizeof(XMFLOAT2)) {
-            if (OutputStride == sizeof(XMFLOAT2)) {
-                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F)) {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m256 VV = _mm256_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        pInputVector += sizeof(XMFLOAT2) * 4;
-
-                        __m256 Y2 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
-                        __m256 X2 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
-                        __m256 Y1 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
-                        __m256 X1 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
-                        __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
-                        __m256 vTempA = _mm256_mul_ps(X1, row0);
-                        __m256 vTempA2 = _mm256_mul_ps(X2, row0);
-                        vTempA = _mm256_add_ps(vTempA, vTempB);
-                        vTempA2 = _mm256_add_ps(vTempA2, vTempB2);
-
-                        __m256 W = _mm256_shuffle_ps(vTempA, vTempA,
-                                                     _MM_SHUFFLE(3, 3, 3, 3));
-                        vTempA = _mm256_div_ps(vTempA, W);
-
-                        W = _mm256_shuffle_ps(vTempA2, vTempA2,
-                                              _MM_SHUFFLE(3, 3, 3, 3));
-                        vTempA2 = _mm256_div_ps(vTempA2, W);
-
-                        X1 = _mm256_shuffle_ps(vTempA, vTempA2, 0x44);
-                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                        X1);
-                        pOutputVector += sizeof(XMFLOAT2) * 4;
-
-                        i += 4;
-                    }
-                } else {
-                    // Packed input, packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m256 VV = _mm256_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        pInputVector += sizeof(XMFLOAT2) * 4;
-
-                        __m256 Y2 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
-                        __m256 X2 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
-                        __m256 Y1 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
-                        __m256 X1 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
-                        __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
-                        __m256 vTempA = _mm256_mul_ps(X1, row0);
-                        __m256 vTempA2 = _mm256_mul_ps(X2, row0);
-                        vTempA = _mm256_add_ps(vTempA, vTempB);
-                        vTempA2 = _mm256_add_ps(vTempA2, vTempB2);
-
-                        __m256 W = _mm256_shuffle_ps(vTempA, vTempA,
-                                                     _MM_SHUFFLE(3, 3, 3, 3));
-                        vTempA = _mm256_div_ps(vTempA, W);
-
-                        W = _mm256_shuffle_ps(vTempA2, vTempA2,
-                                              _MM_SHUFFLE(3, 3, 3, 3));
-                        vTempA2 = _mm256_div_ps(vTempA2, W);
-
-                        X1 = _mm256_shuffle_ps(vTempA, vTempA2, 0x44);
-                        _mm256_storeu_ps(
-                            reinterpret_cast<float*>(pOutputVector), X1);
-                        pOutputVector += sizeof(XMFLOAT2) * 4;
-
-                        i += 4;
-                    }
-                }
-            } else {
-                // Packed input, unpacked output
-                for (size_t j = 0; j < four; ++j) {
-                    __m256 VV = _mm256_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector));
-                    pInputVector += sizeof(XMFLOAT2) * 4;
-
-                    __m256 Y2 =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
-                    __m256 X2 =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
-                    __m256 Y1 =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
-                    __m256 X1 =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
-                    __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
-                    __m256 vTempA = _mm256_mul_ps(X1, row0);
-                    __m256 vTempA2 = _mm256_mul_ps(X2, row0);
-                    vTempA = _mm256_add_ps(vTempA, vTempB);
-                    vTempA2 = _mm256_add_ps(vTempA2, vTempB2);
-
-                    __m256 W = _mm256_shuffle_ps(vTempA, vTempA,
-                                                 _MM_SHUFFLE(3, 3, 3, 3));
-                    vTempA = _mm256_div_ps(vTempA, W);
-
-                    W = _mm256_shuffle_ps(vTempA2, vTempA2,
-                                          _MM_SHUFFLE(3, 3, 3, 3));
-                    vTempA2 = _mm256_div_ps(vTempA2, W);
-
-                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
-                                 _mm_castps_pd(_mm256_castps256_ps128(vTempA)));
-                    pOutputVector += OutputStride;
-
-                    _mm_store_sd(
-                        reinterpret_cast<double*>(pOutputVector),
-                        _mm_castps_pd(_mm256_castps256_ps128(vTempA2)));
-                    pOutputVector += OutputStride;
-
-                    _mm_store_sd(
-                        reinterpret_cast<double*>(pOutputVector),
-                        _mm_castps_pd(_mm256_extractf128_ps(vTempA, 1)));
-                    pOutputVector += OutputStride;
-
-                    _mm_store_sd(
-                        reinterpret_cast<double*>(pOutputVector),
-                        _mm_castps_pd(_mm256_extractf128_ps(vTempA2, 1)));
-                    pOutputVector += OutputStride;
-
-                    i += 4;
-                }
-            }
-        }
-    }
-
-    if (i < VectorCount) {
-        const XMVECTOR row0 = M.r[0];
-        const XMVECTOR row1 = M.r[1];
-        const XMVECTOR row3 = M.r[3];
-
-        for (; i < VectorCount; i++) {
-            __m128 xy = _mm_castpd_ps(
-                _mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
-            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
-
-            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
-            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
-            vTemp = _mm_add_ps(vTemp, vTemp2);
-
-            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-            vTemp = _mm_div_ps(vTemp, W);
-
-            _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
-                         _mm_castps_pd(vTemp));
-            pOutputVector += OutputStride;
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t two = VectorCount >> 1;
-    if (two > 0) {
-        if (InputStride == sizeof(XMFLOAT2)) {
-            if (OutputStride == sizeof(XMFLOAT2)) {
-                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF)) {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < two; ++j) {
-                        XMVECTOR V = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        pInputVector += sizeof(XMFLOAT2) * 2;
-
-                        // Result 1
-                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
-                        XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-
-                        XMVECTOR W =
-                            XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                        XMVECTOR V1 = _mm_div_ps(vTemp, W);
-
-                        // Result 2
-                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-
-                        vTemp = XM_FMADD_PS(Y, row1, row3);
-                        vTemp2 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                        XMVECTOR V2 = _mm_div_ps(vTemp, W);
-
-                        vTemp = _mm_movelh_ps(V1, V2);
-
-                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                     vTemp);
-                        pOutputVector += sizeof(XMFLOAT2) * 2;
-
-                        i += 2;
-                    }
-                } else {
-                    // Packed input, unaligned & packed output
-                    for (size_t j = 0; j < two; ++j) {
-                        XMVECTOR V = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        pInputVector += sizeof(XMFLOAT2) * 2;
-
-                        // Result 1
-                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
-                        XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-
-                        XMVECTOR W =
-                            XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                        XMVECTOR V1 = _mm_div_ps(vTemp, W);
-
-                        // Result 2
-                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-
-                        vTemp = XM_FMADD_PS(Y, row1, row3);
-                        vTemp2 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                        XMVECTOR V2 = _mm_div_ps(vTemp, W);
-
-                        vTemp = _mm_movelh_ps(V1, V2);
-
-                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                      vTemp);
-                        pOutputVector += sizeof(XMFLOAT2) * 2;
-
-                        i += 2;
-                    }
-                }
-            } else {
-                // Packed input, unpacked output
-                for (size_t j = 0; j < two; ++j) {
-                    XMVECTOR V = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector));
-                    pInputVector += sizeof(XMFLOAT2) * 2;
-
-                    // Result 1
-                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
-                    XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-
-                    XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                    vTemp = _mm_div_ps(vTemp, W);
-
-                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
-                                 _mm_castps_pd(vTemp));
-                    pOutputVector += OutputStride;
-
-                    // Result 2
-                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-
-                    vTemp = XM_FMADD_PS(Y, row1, row3);
-                    vTemp2 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-
-                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                    vTemp = _mm_div_ps(vTemp, W);
-
-                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
-                                 _mm_castps_pd(vTemp));
-                    pOutputVector += OutputStride;
-
-                    i += 2;
-                }
-            }
-        }
-    }
-
-    if (!(reinterpret_cast<uintptr_t>(pInputVector) & 0xF) &&
-        !(InputStride & 0xF)) {
-        // Aligned input
-        for (; i < VectorCount; i++) {
-            XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(
-                reinterpret_cast<const __m128i*>(pInputVector)));
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
-            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
-            vTemp = _mm_add_ps(vTemp, vTemp2);
-
-            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-            vTemp = _mm_div_ps(vTemp, W);
-
-            _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
-                         _mm_castps_pd(vTemp));
-            pOutputVector += OutputStride;
-        }
-    } else {
-        // Unaligned input
-        for (; i < VectorCount; i++) {
-            __m128 xy = _mm_castpd_ps(
-                _mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
-            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
-
-            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
-            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
-            vTemp = _mm_add_ps(vTemp, vTemp2);
-
-            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-            vTemp = _mm_div_ps(vTemp, W);
-
-            _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
-                         _mm_castps_pd(vTemp));
-            pOutputVector += OutputStride;
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector2TransformNormal(FXMVECTOR V,
-                                                     FXMMATRIX M) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Y = XMVectorSplatY(V);
-    XMVECTOR X = XMVectorSplatX(V);
-
-    XMVECTOR Result = XMVectorMultiply(Y, M.r[1]);
-    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    float32x4_t Result = vmulq_lane_f32(M.r[1], VL, 1);  // Y
-    return vmlaq_lane_f32(Result, M.r[0], VL, 0);        // X
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));  // Y
-    vResult = _mm_mul_ps(vResult, M.r[1]);
-    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));  // X
-    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_ inline XMFLOAT2* XM_CALLCONV
-XMVector2TransformNormalStream(XMFLOAT2* pOutputStream, size_t OutputStride,
-                               const XMFLOAT2* pInputStream, size_t InputStride,
-                               size_t VectorCount, FXMMATRIX M) noexcept {
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT2));
-
-    assert(OutputStride >= sizeof(XMFLOAT2));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT2));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-
-    for (size_t i = 0; i < VectorCount; i++) {
-        XMVECTOR V =
-            XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pInputVector));
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiply(Y, row1);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 26015, "PREfast noise: Esp:1307")
-#endif
-
-        XMStoreFloat2(reinterpret_cast<XMFLOAT2*>(pOutputVector), Result);
-
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        if ((InputStride == sizeof(XMFLOAT2)) &&
-            (OutputStride == sizeof(XMFLOAT2))) {
-            for (size_t j = 0; j < four; ++j) {
-                float32x4x2_t V =
-                    vld2q_f32(reinterpret_cast<const float*>(pInputVector));
-                pInputVector += sizeof(XMFLOAT2) * 4;
-
-                float32x2_t r = vget_low_f32(row0);
-                XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0);  // Ax
-                XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1);  // Bx
-
-                XM_PREFETCH(pInputVector);
-                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
-
-                r = vget_low_f32(row1);
-                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0);  // Ax+Ey
-                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1);  // Bx+Fy
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
-
-                V.val[0] = vResult0;
-                V.val[1] = vResult1;
-
-                vst2q_f32(reinterpret_cast<float*>(pOutputVector), V);
-                pOutputVector += sizeof(XMFLOAT2) * 4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++) {
-        float32x2_t V = vld1_f32(reinterpret_cast<const float*>(pInputVector));
-        pInputVector += InputStride;
-
-        XMVECTOR vResult = vmulq_lane_f32(row0, V, 0);  // X
-        vResult = vmlaq_lane_f32(vResult, row1, V, 1);  // Y
-
-        V = vget_low_f32(vResult);
-        vst1_f32(reinterpret_cast<float*>(pOutputVector), V);
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_AVX2_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
-        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);
-
-        if (InputStride == sizeof(XMFLOAT2)) {
-            if (OutputStride == sizeof(XMFLOAT2)) {
-                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F)) {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m256 VV = _mm256_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        pInputVector += sizeof(XMFLOAT2) * 4;
-
-                        __m256 Y2 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
-                        __m256 X2 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
-                        __m256 Y1 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
-                        __m256 X1 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        __m256 vTempA = _mm256_mul_ps(Y1, row1);
-                        __m256 vTempB = _mm256_mul_ps(Y2, row1);
-                        vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
-                        vTempB = _mm256_fmadd_ps(X2, row0, vTempB);
-
-                        X1 = _mm256_shuffle_ps(vTempA, vTempB, 0x44);
-                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                        X1);
-                        pOutputVector += sizeof(XMFLOAT2) * 4;
-
-                        i += 4;
-                    }
-                } else {
-                    // Packed input, packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m256 VV = _mm256_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        pInputVector += sizeof(XMFLOAT2) * 4;
-
-                        __m256 Y2 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
-                        __m256 X2 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
-                        __m256 Y1 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
-                        __m256 X1 =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        __m256 vTempA = _mm256_mul_ps(Y1, row1);
-                        __m256 vTempB = _mm256_mul_ps(Y2, row1);
-                        vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
-                        vTempB = _mm256_fmadd_ps(X2, row0, vTempB);
-
-                        X1 = _mm256_shuffle_ps(vTempA, vTempB, 0x44);
-                        _mm256_storeu_ps(
-                            reinterpret_cast<float*>(pOutputVector), X1);
-                        pOutputVector += sizeof(XMFLOAT2) * 4;
-
-                        i += 4;
-                    }
-                }
-            } else {
-                // Packed input, unpacked output
-                for (size_t j = 0; j < four; ++j) {
-                    __m256 VV = _mm256_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector));
-                    pInputVector += sizeof(XMFLOAT2) * 4;
-
-                    __m256 Y2 =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
-                    __m256 X2 =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
-                    __m256 Y1 =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
-                    __m256 X1 =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    __m256 vTempA = _mm256_mul_ps(Y1, row1);
-                    __m256 vTempB = _mm256_mul_ps(Y2, row1);
-                    vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
-                    vTempB = _mm256_fmadd_ps(X2, row0, vTempB);
-
-                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
-                                 _mm_castps_pd(_mm256_castps256_ps128(vTempA)));
-                    pOutputVector += OutputStride;
-
-                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
-                                 _mm_castps_pd(_mm256_castps256_ps128(vTempB)));
-                    pOutputVector += OutputStride;
-
-                    _mm_store_sd(
-                        reinterpret_cast<double*>(pOutputVector),
-                        _mm_castps_pd(_mm256_extractf128_ps(vTempA, 1)));
-                    pOutputVector += OutputStride;
-
-                    _mm_store_sd(
-                        reinterpret_cast<double*>(pOutputVector),
-                        _mm_castps_pd(_mm256_extractf128_ps(vTempB, 1)));
-                    pOutputVector += OutputStride;
-
-                    i += 4;
-                }
-            }
-        }
-    }
-
-    if (i < VectorCount) {
-        const XMVECTOR row0 = M.r[0];
-        const XMVECTOR row1 = M.r[1];
-
-        for (; i < VectorCount; i++) {
-            __m128 xy = _mm_castpd_ps(
-                _mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
-            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
-
-            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
-            vTemp = XM_FMADD_PS(X, row0, vTemp);
-
-            _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
-                         _mm_castps_pd(vTemp));
-            pOutputVector += OutputStride;
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-
-    size_t i = 0;
-    size_t two = VectorCount >> 1;
-    if (two > 0) {
-        if (InputStride == sizeof(XMFLOAT2)) {
-            if (OutputStride == sizeof(XMFLOAT2)) {
-                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF)) {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < two; ++j) {
-                        XMVECTOR V = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        pInputVector += sizeof(XMFLOAT2) * 2;
-
-                        // Result 1
-                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        XMVECTOR vTemp = _mm_mul_ps(Y, row1);
-                        XMVECTOR V1 = XM_FMADD_PS(X, row0, vTemp);
-
-                        // Result 2
-                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-
-                        vTemp = _mm_mul_ps(Y, row1);
-                        XMVECTOR V2 = XM_FMADD_PS(X, row0, vTemp);
-
-                        vTemp = _mm_movelh_ps(V1, V2);
-
-                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                     vTemp);
-                        pOutputVector += sizeof(XMFLOAT2) * 2;
-
-                        i += 2;
-                    }
-                } else {
-                    // Packed input, unaligned & packed output
-                    for (size_t j = 0; j < two; ++j) {
-                        XMVECTOR V = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        pInputVector += sizeof(XMFLOAT2) * 2;
-
-                        // Result 1
-                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        XMVECTOR vTemp = _mm_mul_ps(Y, row1);
-                        XMVECTOR V1 = XM_FMADD_PS(X, row0, vTemp);
-
-                        // Result 2
-                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-
-                        vTemp = _mm_mul_ps(Y, row1);
-                        XMVECTOR V2 = XM_FMADD_PS(X, row0, vTemp);
-
-                        vTemp = _mm_movelh_ps(V1, V2);
-
-                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                      vTemp);
-                        pOutputVector += sizeof(XMFLOAT2) * 2;
-
-                        i += 2;
-                    }
-                }
-            } else {
-                // Packed input, unpacked output
-                for (size_t j = 0; j < two; ++j) {
-                    XMVECTOR V = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector));
-                    pInputVector += sizeof(XMFLOAT2) * 2;
-
-                    // Result 1
-                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    XMVECTOR vTemp = _mm_mul_ps(Y, row1);
-                    vTemp = XM_FMADD_PS(X, row0, vTemp);
-
-                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
-                                 _mm_castps_pd(vTemp));
-                    pOutputVector += OutputStride;
-
-                    // Result 2
-                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-
-                    vTemp = _mm_mul_ps(Y, row1);
-                    vTemp = XM_FMADD_PS(X, row0, vTemp);
-
-                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
-                                 _mm_castps_pd(vTemp));
-                    pOutputVector += OutputStride;
-
-                    i += 2;
-                }
-            }
-        }
-    }
-
-    if (!(reinterpret_cast<uintptr_t>(pInputVector) & 0xF) &&
-        !(InputStride & 0xF)) {
-        // Aligned input
-        for (; i < VectorCount; i++) {
-            XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(
-                reinterpret_cast<const __m128i*>(pInputVector)));
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
-            vTemp = XM_FMADD_PS(X, row0, vTemp);
-
-            _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
-                         _mm_castps_pd(vTemp));
-            pOutputVector += OutputStride;
-        }
-    } else {
-        // Unaligned input
-        for (; i < VectorCount; i++) {
-            __m128 xy = _mm_castpd_ps(
-                _mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
-            pInputVector += InputStride;
-
-            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
-            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));
-
-            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
-            vTemp = XM_FMADD_PS(X, row0, vTemp);
-
-            _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
-                         _mm_castps_pd(vTemp));
-            pOutputVector += OutputStride;
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-/****************************************************************************
- *
- * 3D Vector
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-// Comparison operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
-             (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vceqq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) &
-             0xFFFFFFU) == 0xFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
-    return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline uint32_t XM_CALLCONV XMVector3EqualR(FXMVECTOR V1,
-                                            FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    uint32_t CR = 0;
-    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
-        (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
-        (V1.vector4_f32[2] == V2.vector4_f32[2])) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
-               (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
-               (V1.vector4_f32[2] != V2.vector4_f32[2])) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vceqq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r =
-        vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU;
-
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFU) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
-    int iTest = _mm_movemask_ps(vTemp) & 7;
-    uint32_t CR = 0;
-    if (iTest == 7) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!iTest) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_u32[0] == V2.vector4_u32[0]) &&
-             (V1.vector4_u32[1] == V2.vector4_u32[1]) &&
-             (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult =
-        vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) &
-             0xFFFFFFU) == 0xFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7) == 7) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline uint32_t XM_CALLCONV XMVector3EqualIntR(FXMVECTOR V1,
-                                               FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    uint32_t CR = 0;
-    if ((V1.vector4_u32[0] == V2.vector4_u32[0]) &&
-        (V1.vector4_u32[1] == V2.vector4_u32[1]) &&
-        (V1.vector4_u32[2] == V2.vector4_u32[2])) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) &&
-               (V1.vector4_u32[1] != V2.vector4_u32[1]) &&
-               (V1.vector4_u32[2] != V2.vector4_u32[2])) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult =
-        vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r =
-        vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU;
-
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFU) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7;
-    uint32_t CR = 0;
-    if (iTemp == 7) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!iTemp) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2,
-                                           FXMVECTOR Epsilon) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    float dx, dy, dz;
-
-    dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]);
-    dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]);
-    dz = fabsf(V1.vector4_f32[2] - V2.vector4_f32[2]);
-    return (((dx <= Epsilon.vector4_f32[0]) && (dy <= Epsilon.vector4_f32[1]) &&
-             (dz <= Epsilon.vector4_f32[2])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vDelta = vsubq_f32(V1, V2);
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    uint32x4_t vResult = vacleq_f32(vDelta, Epsilon);
-#else
-    uint32x4_t vResult = vcleq_f32(vabsq_f32(vDelta), Epsilon);
-#endif
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) &
-             0xFFFFFFU) == 0xFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Get the difference
-    XMVECTOR vDelta = _mm_sub_ps(V1, V2);
-    // Get the absolute value of the difference
-    XMVECTOR vTemp = _mm_setzero_ps();
-    vTemp = _mm_sub_ps(vTemp, vDelta);
-    vTemp = _mm_max_ps(vTemp, vDelta);
-    vTemp = _mm_cmple_ps(vTemp, Epsilon);
-    // w is don't care
-    return (((_mm_movemask_ps(vTemp) & 7) == 0x7) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] != V2.vector4_f32[0]) ||
-             (V1.vector4_f32[1] != V2.vector4_f32[1]) ||
-             (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vceqq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) &
-             0xFFFFFFU) != 0xFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
-    return (((_mm_movemask_ps(vTemp) & 7) != 7) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector3NotEqualInt(FXMVECTOR V1,
-                                             FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_u32[0] != V2.vector4_u32[0]) ||
-             (V1.vector4_u32[1] != V2.vector4_u32[1]) ||
-             (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult =
-        vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) &
-             0xFFFFFFU) != 0xFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) & 7) != 7) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] > V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] > V2.vector4_f32[1]) &&
-             (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vcgtq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) &
-             0xFFFFFFU) == 0xFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2);
-    return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline uint32_t XM_CALLCONV XMVector3GreaterR(FXMVECTOR V1,
-                                              FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    uint32_t CR = 0;
-    if ((V1.vector4_f32[0] > V2.vector4_f32[0]) &&
-        (V1.vector4_f32[1] > V2.vector4_f32[1]) &&
-        (V1.vector4_f32[2] > V2.vector4_f32[2])) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) &&
-               (V1.vector4_f32[1] <= V2.vector4_f32[1]) &&
-               (V1.vector4_f32[2] <= V2.vector4_f32[2])) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vcgtq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r =
-        vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU;
-
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFU) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2);
-    uint32_t CR = 0;
-    int iTest = _mm_movemask_ps(vTemp) & 7;
-    if (iTest == 7) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!iTest) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector3GreaterOrEqual(FXMVECTOR V1,
-                                                FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] >= V2.vector4_f32[1]) &&
-             (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vcgeq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) &
-             0xFFFFFFU) == 0xFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpge_ps(V1, V2);
-    return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline uint32_t XM_CALLCONV XMVector3GreaterOrEqualR(FXMVECTOR V1,
-                                                     FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    uint32_t CR = 0;
-    if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
-        (V1.vector4_f32[1] >= V2.vector4_f32[1]) &&
-        (V1.vector4_f32[2] >= V2.vector4_f32[2])) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
-               (V1.vector4_f32[1] < V2.vector4_f32[1]) &&
-               (V1.vector4_f32[2] < V2.vector4_f32[2])) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vcgeq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r =
-        vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) & 0xFFFFFFU;
-
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFU) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpge_ps(V1, V2);
-    uint32_t CR = 0;
-    int iTest = _mm_movemask_ps(vTemp) & 7;
-    if (iTest == 7) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!iTest) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector3Less(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] < V2.vector4_f32[1]) &&
-             (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vcltq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) &
-             0xFFFFFFU) == 0xFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmplt_ps(V1, V2);
-    return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector3LessOrEqual(FXMVECTOR V1,
-                                             FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] <= V2.vector4_f32[1]) &&
-             (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vcleq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) &
-             0xFFFFFFU) == 0xFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmple_ps(V1, V2);
-    return (((_mm_movemask_ps(vTemp) & 7) == 7) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector3InBounds(FXMVECTOR V,
-                                          FXMVECTOR Bounds) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] &&
-              V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
-             (V.vector4_f32[1] <= Bounds.vector4_f32[1] &&
-              V.vector4_f32[1] >= -Bounds.vector4_f32[1]) &&
-             (V.vector4_f32[2] <= Bounds.vector4_f32[2] &&
-              V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Test if less than or equal
-    uint32x4_t ivTemp1 = vcleq_f32(V, Bounds);
-    // Negate the bounds
-    float32x4_t vTemp2 = vnegq_f32(Bounds);
-    // Test if greater or equal (Reversed)
-    uint32x4_t ivTemp2 = vcleq_f32(vTemp2, V);
-    // Blend answers
-    ivTemp1 = vandq_u32(ivTemp1, ivTemp2);
-    // in bounds?
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(ivTemp1)),
-                                vget_high_u8(vreinterpretq_u8_u32(ivTemp1)));
-    uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1) &
-             0xFFFFFFU) == 0xFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Test if less than or equal
-    XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds);
-    // Negate the bounds
-    XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne);
-    // Test if greater or equal (Reversed)
-    vTemp2 = _mm_cmple_ps(vTemp2, V);
-    // Blend answers
-    vTemp1 = _mm_and_ps(vTemp1, vTemp2);
-    // x,y and z in bounds? (w is don't care)
-    return (((_mm_movemask_ps(vTemp1) & 0x7) == 0x7) != 0);
-#else
-    return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && \
-    !defined(__INTEL_COMPILER)
-#pragma float_control(push)
-#pragma float_control(precise, on)
-#endif
-
-inline bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    return (XMISNAN(V.vector4_f32[0]) || XMISNAN(V.vector4_f32[1]) ||
-            XMISNAN(V.vector4_f32[2]));
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(__clang__) && defined(__FINITE_MATH_ONLY__)
-    return isnan(vgetq_lane_f32(V, 0)) || isnan(vgetq_lane_f32(V, 1)) ||
-           isnan(vgetq_lane_f32(V, 2));
-#else
-    // Test against itself. NaN is always not equal
-    uint32x4_t vTempNan = vceqq_f32(V, V);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempNan)),
-                                vget_high_u8(vreinterpretq_u8_u32(vTempNan)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    // If x or y or z are NaN, the mask is zero
-    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) &
-             0xFFFFFFU) != 0xFFFFFFU);
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-#if defined(__clang__) && defined(__FINITE_MATH_ONLY__)
-    XM_ALIGNED_DATA(16) float tmp[4];
-    _mm_store_ps(tmp, V);
-    return isnan(tmp[0]) || isnan(tmp[1]) || isnan(tmp[2]);
-#else
-    // Test against itself. NaN is always not equal
-    XMVECTOR vTempNan = _mm_cmpneq_ps(V, V);
-    // If x or y or z are NaN, the mask is non-zero
-    return ((_mm_movemask_ps(vTempNan) & 7) != 0);
-#endif
-#endif
-}
-
-#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && \
-    !defined(__INTEL_COMPILER)
-#pragma float_control(pop)
-#endif
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (XMISINF(V.vector4_f32[0]) || XMISINF(V.vector4_f32[1]) ||
-            XMISINF(V.vector4_f32[2]));
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Mask off the sign bit
-    uint32x4_t vTempInf = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
-    // Compare to infinity
-    vTempInf = vceqq_f32(vreinterpretq_f32_u32(vTempInf), g_XMInfinity);
-    // If any are infinity, the signs are true.
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempInf)),
-                                vget_high_u8(vreinterpretq_u8_u32(vTempInf)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return ((vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) &
-             0xFFFFFFU) != 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Mask off the sign bit
-    __m128 vTemp = _mm_and_ps(V, g_XMAbsMask);
-    // Compare to infinity
-    vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity);
-    // If x,y or z are infinity, the signs are true.
-    return ((_mm_movemask_ps(vTemp) & 7) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Computation operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] +
-                   V1.vector4_f32[1] * V2.vector4_f32[1] +
-                   V1.vector4_f32[2] * V2.vector4_f32[2];
-    XMVECTORF32 vResult;
-    vResult.f[0] = vResult.f[1] = vResult.f[2] = vResult.f[3] = fValue;
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vTemp = vmulq_f32(V1, V2);
-    float32x2_t v1 = vget_low_f32(vTemp);
-    float32x2_t v2 = vget_high_f32(vTemp);
-    v1 = vpadd_f32(v1, v1);
-    v2 = vdup_lane_f32(v2, 0);
-    v1 = vadd_f32(v1, v2);
-    return vcombine_f32(v1, v1);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    return _mm_dp_ps(V1, V2, 0x7f);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vTemp = _mm_mul_ps(V1, V2);
-    vTemp = _mm_and_ps(vTemp, g_XMMask3);
-    vTemp = _mm_hadd_ps(vTemp, vTemp);
-    return _mm_hadd_ps(vTemp, vTemp);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product
-    XMVECTOR vDot = _mm_mul_ps(V1, V2);
-    // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2]
-    XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1));
-    // Result.vector4_f32[0] = x+y
-    vDot = _mm_add_ss(vDot, vTemp);
-    // x=Dot.vector4_f32[2]
-    vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
-    // Result.vector4_f32[0] = (x+y)+z
-    vDot = _mm_add_ss(vDot, vTemp);
-    // Splat x
-    return XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Cross(FXMVECTOR V1,
-                                           FXMVECTOR V2) noexcept {
-    // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ]
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {{{(V1.vector4_f32[1] * V2.vector4_f32[2]) -
-                                 (V1.vector4_f32[2] * V2.vector4_f32[1]),
-                             (V1.vector4_f32[2] * V2.vector4_f32[0]) -
-                                 (V1.vector4_f32[0] * V2.vector4_f32[2]),
-                             (V1.vector4_f32[0] * V2.vector4_f32[1]) -
-                                 (V1.vector4_f32[1] * V2.vector4_f32[0]),
-                             0.0f}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t v1xy = vget_low_f32(V1);
-    float32x2_t v2xy = vget_low_f32(V2);
-
-    float32x2_t v1yx = vrev64_f32(v1xy);
-    float32x2_t v2yx = vrev64_f32(v2xy);
-
-    float32x2_t v1zz = vdup_lane_f32(vget_high_f32(V1), 0);
-    float32x2_t v2zz = vdup_lane_f32(vget_high_f32(V2), 0);
-
-    XMVECTOR vResult =
-        vmulq_f32(vcombine_f32(v1yx, v1xy), vcombine_f32(v2zz, v2yx));
-    vResult =
-        vmlsq_f32(vResult, vcombine_f32(v1zz, v1yx), vcombine_f32(v2yx, v2xy));
-    vResult = vreinterpretq_f32_u32(
-        veorq_u32(vreinterpretq_u32_f32(vResult), g_XMFlipY));
-    return vreinterpretq_f32_u32(
-        vandq_u32(vreinterpretq_u32_f32(vResult), g_XMMask3));
-#elif defined(_XM_SSE_INTRINSICS_)
-    // y1,z1,x1,w1
-    XMVECTOR vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(3, 0, 2, 1));
-    // z2,x2,y2,w2
-    XMVECTOR vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(3, 1, 0, 2));
-    // Perform the left operation
-    XMVECTOR vResult = _mm_mul_ps(vTemp1, vTemp2);
-    // z1,x1,y1,w1
-    vTemp1 = XM_PERMUTE_PS(vTemp1, _MM_SHUFFLE(3, 0, 2, 1));
-    // y2,z2,x2,w2
-    vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(3, 1, 0, 2));
-    // Perform the right operation
-    vResult = XM_FNMADD_PS(vTemp1, vTemp2, vResult);
-    // Set w to zero
-    return _mm_and_ps(vResult, g_XMMask3);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V) noexcept {
-    return XMVector3Dot(V, V);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-
-    Result = XMVector3LengthSq(V);
-    Result = XMVectorReciprocalSqrtEst(Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Dot3
-    float32x4_t vTemp = vmulq_f32(V, V);
-    float32x2_t v1 = vget_low_f32(vTemp);
-    float32x2_t v2 = vget_high_f32(vTemp);
-    v1 = vpadd_f32(v1, v1);
-    v2 = vdup_lane_f32(v2, 0);
-    v1 = vadd_f32(v1, v2);
-    // Reciprocal sqrt (estimate)
-    v2 = vrsqrte_f32(v1);
-    return vcombine_f32(v2, v2);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f);
-    return _mm_rsqrt_ps(vTemp);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_rsqrt_ps(vLengthSq);
-    return vLengthSq;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x,y and z
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    // vTemp has z and y
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2));
-    // x+z, y
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    // y,y,y,y
-    vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
-    // x+z+y,??,??,??
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    // Splat the length squared
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    // Get the reciprocal
-    vLengthSq = _mm_rsqrt_ps(vLengthSq);
-    return vLengthSq;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-
-    Result = XMVector3LengthSq(V);
-    Result = XMVectorReciprocalSqrt(Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Dot3
-    float32x4_t vTemp = vmulq_f32(V, V);
-    float32x2_t v1 = vget_low_f32(vTemp);
-    float32x2_t v2 = vget_high_f32(vTemp);
-    v1 = vpadd_f32(v1, v1);
-    v2 = vdup_lane_f32(v2, 0);
-    v1 = vadd_f32(v1, v2);
-    // Reciprocal sqrt
-    float32x2_t S0 = vrsqrte_f32(v1);
-    float32x2_t P0 = vmul_f32(v1, S0);
-    float32x2_t R0 = vrsqrts_f32(P0, S0);
-    float32x2_t S1 = vmul_f32(S0, R0);
-    float32x2_t P1 = vmul_f32(v1, S1);
-    float32x2_t R1 = vrsqrts_f32(P1, S1);
-    float32x2_t Result = vmul_f32(S1, R1);
-    return vcombine_f32(Result, Result);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f);
-    XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp);
-    return _mm_div_ps(g_XMOne, vLengthSq);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vDot = _mm_mul_ps(V, V);
-    vDot = _mm_and_ps(vDot, g_XMMask3);
-    vDot = _mm_hadd_ps(vDot, vDot);
-    vDot = _mm_hadd_ps(vDot, vDot);
-    vDot = _mm_sqrt_ps(vDot);
-    vDot = _mm_div_ps(g_XMOne, vDot);
-    return vDot;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product
-    XMVECTOR vDot = _mm_mul_ps(V, V);
-    // x=Dot.y, y=Dot.z
-    XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1));
-    // Result.x = x+y
-    vDot = _mm_add_ss(vDot, vTemp);
-    // x=Dot.z
-    vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
-    // Result.x = (x+y)+z
-    vDot = _mm_add_ss(vDot, vTemp);
-    // Splat x
-    vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0));
-    // Get the reciprocal
-    vDot = _mm_sqrt_ps(vDot);
-    // Get the reciprocal
-    vDot = _mm_div_ps(g_XMOne, vDot);
-    return vDot;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-
-    Result = XMVector3LengthSq(V);
-    Result = XMVectorSqrtEst(Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Dot3
-    float32x4_t vTemp = vmulq_f32(V, V);
-    float32x2_t v1 = vget_low_f32(vTemp);
-    float32x2_t v2 = vget_high_f32(vTemp);
-    v1 = vpadd_f32(v1, v1);
-    v2 = vdup_lane_f32(v2, 0);
-    v1 = vadd_f32(v1, v2);
-    const float32x2_t zero = vdup_n_f32(0);
-    uint32x2_t VEqualsZero = vceq_f32(v1, zero);
-    // Sqrt (estimate)
-    float32x2_t Result = vrsqrte_f32(v1);
-    Result = vmul_f32(v1, Result);
-    Result = vbsl_f32(VEqualsZero, zero, Result);
-    return vcombine_f32(Result, Result);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f);
-    return _mm_sqrt_ps(vTemp);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_sqrt_ps(vLengthSq);
-    return vLengthSq;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x,y and z
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    // vTemp has z and y
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2));
-    // x+z, y
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    // y,y,y,y
-    vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
-    // x+z+y,??,??,??
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    // Splat the length squared
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    // Get the length
-    vLengthSq = _mm_sqrt_ps(vLengthSq);
-    return vLengthSq;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-
-    Result = XMVector3LengthSq(V);
-    Result = XMVectorSqrt(Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Dot3
-    float32x4_t vTemp = vmulq_f32(V, V);
-    float32x2_t v1 = vget_low_f32(vTemp);
-    float32x2_t v2 = vget_high_f32(vTemp);
-    v1 = vpadd_f32(v1, v1);
-    v2 = vdup_lane_f32(v2, 0);
-    v1 = vadd_f32(v1, v2);
-    const float32x2_t zero = vdup_n_f32(0);
-    uint32x2_t VEqualsZero = vceq_f32(v1, zero);
-    // Sqrt
-    float32x2_t S0 = vrsqrte_f32(v1);
-    float32x2_t P0 = vmul_f32(v1, S0);
-    float32x2_t R0 = vrsqrts_f32(P0, S0);
-    float32x2_t S1 = vmul_f32(S0, R0);
-    float32x2_t P1 = vmul_f32(v1, S1);
-    float32x2_t R1 = vrsqrts_f32(P1, S1);
-    float32x2_t Result = vmul_f32(S1, R1);
-    Result = vmul_f32(v1, Result);
-    Result = vbsl_f32(VEqualsZero, zero, Result);
-    return vcombine_f32(Result, Result);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f);
-    return _mm_sqrt_ps(vTemp);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_sqrt_ps(vLengthSq);
-    return vLengthSq;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x,y and z
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    // vTemp has z and y
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 2, 1, 2));
-    // x+z, y
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    // y,y,y,y
-    vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
-    // x+z+y,??,??,??
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    // Splat the length squared
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    // Get the length
-    vLengthSq = _mm_sqrt_ps(vLengthSq);
-    return vLengthSq;
-#endif
-}
-
-//------------------------------------------------------------------------------
-// XMVector3NormalizeEst uses a reciprocal estimate and
-// returns QNaN on zero and infinite vectors.
-
-inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-    Result = XMVector3ReciprocalLength(V);
-    Result = XMVectorMultiply(V, Result);
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Dot3
-    float32x4_t vTemp = vmulq_f32(V, V);
-    float32x2_t v1 = vget_low_f32(vTemp);
-    float32x2_t v2 = vget_high_f32(vTemp);
-    v1 = vpadd_f32(v1, v1);
-    v2 = vdup_lane_f32(v2, 0);
-    v1 = vadd_f32(v1, v2);
-    // Reciprocal sqrt (estimate)
-    v2 = vrsqrte_f32(v1);
-    // Normalize
-    return vmulq_f32(V, vcombine_f32(v2, v2));
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0x7f);
-    XMVECTOR vResult = _mm_rsqrt_ps(vTemp);
-    return _mm_mul_ps(vResult, V);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vDot = _mm_mul_ps(V, V);
-    vDot = _mm_and_ps(vDot, g_XMMask3);
-    vDot = _mm_hadd_ps(vDot, vDot);
-    vDot = _mm_hadd_ps(vDot, vDot);
-    vDot = _mm_rsqrt_ps(vDot);
-    vDot = _mm_mul_ps(vDot, V);
-    return vDot;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product
-    XMVECTOR vDot = _mm_mul_ps(V, V);
-    // x=Dot.y, y=Dot.z
-    XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1));
-    // Result.x = x+y
-    vDot = _mm_add_ss(vDot, vTemp);
-    // x=Dot.z
-    vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
-    // Result.x = (x+y)+z
-    vDot = _mm_add_ss(vDot, vTemp);
-    // Splat x
-    vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0));
-    // Get the reciprocal
-    vDot = _mm_rsqrt_ps(vDot);
-    // Perform the normalization
-    vDot = _mm_mul_ps(vDot, V);
-    return vDot;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    float fLength;
-    XMVECTOR vResult;
-
-    vResult = XMVector3Length(V);
-    fLength = vResult.vector4_f32[0];
-
-    // Prevent divide by zero
-    if (fLength > 0) {
-        fLength = 1.0f / fLength;
-    }
-
-    vResult.vector4_f32[0] = V.vector4_f32[0] * fLength;
-    vResult.vector4_f32[1] = V.vector4_f32[1] * fLength;
-    vResult.vector4_f32[2] = V.vector4_f32[2] * fLength;
-    vResult.vector4_f32[3] = V.vector4_f32[3] * fLength;
-    return vResult;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Dot3
-    float32x4_t vTemp = vmulq_f32(V, V);
-    float32x2_t v1 = vget_low_f32(vTemp);
-    float32x2_t v2 = vget_high_f32(vTemp);
-    v1 = vpadd_f32(v1, v1);
-    v2 = vdup_lane_f32(v2, 0);
-    v1 = vadd_f32(v1, v2);
-    uint32x2_t VEqualsZero = vceq_f32(v1, vdup_n_f32(0));
-    uint32x2_t VEqualsInf = vceq_f32(v1, vget_low_f32(g_XMInfinity));
-    // Reciprocal sqrt (2 iterations of Newton-Raphson)
-    float32x2_t S0 = vrsqrte_f32(v1);
-    float32x2_t P0 = vmul_f32(v1, S0);
-    float32x2_t R0 = vrsqrts_f32(P0, S0);
-    float32x2_t S1 = vmul_f32(S0, R0);
-    float32x2_t P1 = vmul_f32(v1, S1);
-    float32x2_t R1 = vrsqrts_f32(P1, S1);
-    v2 = vmul_f32(S1, R1);
-    // Normalize
-    XMVECTOR vResult = vmulq_f32(V, vcombine_f32(v2, v2));
-    vResult = vbslq_f32(vcombine_u32(VEqualsZero, VEqualsZero), vdupq_n_f32(0),
-                        vResult);
-    return vbslq_f32(vcombine_u32(VEqualsInf, VEqualsInf), g_XMQNaN, vResult);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f);
-    // Prepare for the division
-    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
-    // Create zero with a single instruction
-    XMVECTOR vZeroMask = _mm_setzero_ps();
-    // Test for a divide by zero (Must be FP to detect -0.0)
-    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
-    // Failsafe on zero (Or epsilon) length planes
-    // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
-    // Divide to perform the normalization
-    vResult = _mm_div_ps(V, vResult);
-    // Any that are infinity, set to zero
-    vResult = _mm_and_ps(vResult, vZeroMask);
-    // Select qnan or result based on infinite length
-    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
-    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
-    vResult = _mm_or_ps(vTemp1, vTemp2);
-    return vResult;
-#elif defined(_XM_SSE3_INTRINSICS_)
-    // Perform the dot product on x,y and z only
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    vLengthSq = _mm_and_ps(vLengthSq, g_XMMask3);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    // Prepare for the division
-    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
-    // Create zero with a single instruction
-    XMVECTOR vZeroMask = _mm_setzero_ps();
-    // Test for a divide by zero (Must be FP to detect -0.0)
-    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
-    // Failsafe on zero (Or epsilon) length planes
-    // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
-    // Divide to perform the normalization
-    vResult = _mm_div_ps(V, vResult);
-    // Any that are infinity, set to zero
-    vResult = _mm_and_ps(vResult, vZeroMask);
-    // Select qnan or result based on infinite length
-    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
-    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
-    vResult = _mm_or_ps(vTemp1, vTemp2);
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x,y and z only
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1));
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
-    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
-    // Prepare for the division
-    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
-    // Create zero with a single instruction
-    XMVECTOR vZeroMask = _mm_setzero_ps();
-    // Test for a divide by zero (Must be FP to detect -0.0)
-    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
-    // Failsafe on zero (Or epsilon) length planes
-    // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
-    // Divide to perform the normalization
-    vResult = _mm_div_ps(V, vResult);
-    // Any that are infinity, set to zero
-    vResult = _mm_and_ps(vResult, vZeroMask);
-    // Select qnan or result based on infinite length
-    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
-    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
-    vResult = _mm_or_ps(vTemp1, vTemp2);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3ClampLength(FXMVECTOR V, float LengthMin,
-                                                 float LengthMax) noexcept {
-    XMVECTOR ClampMax = XMVectorReplicate(LengthMax);
-    XMVECTOR ClampMin = XMVectorReplicate(LengthMin);
-
-    return XMVector3ClampLengthV(V, ClampMin, ClampMax);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3ClampLengthV(
-    FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept {
-    assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) &&
-           (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)));
-    assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) &&
-           (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)));
-    assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero()));
-    assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero()));
-    assert(XMVector3GreaterOrEqual(LengthMax, LengthMin));
-
-    XMVECTOR LengthSq = XMVector3LengthSq(V);
-
-    const XMVECTOR Zero = XMVectorZero();
-
-    XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq);
-
-    XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
-    XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero);
-
-    XMVECTOR Normal = XMVectorMultiply(V, RcpLength);
-
-    XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength);
-
-    XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
-    Length = XMVectorSelect(LengthSq, Length, Select);
-    Normal = XMVectorSelect(LengthSq, Normal, Select);
-
-    XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax);
-    XMVECTOR ControlMin = XMVectorLess(Length, LengthMin);
-
-    XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
-    ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
-
-    XMVECTOR Result = XMVectorMultiply(Normal, ClampLength);
-
-    // Preserve the original vector (with no precision loss) if the length falls
-    // within the given range
-    XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin);
-    Result = XMVectorSelect(Result, V, Control);
-
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Reflect(FXMVECTOR Incident,
-                                             FXMVECTOR Normal) noexcept {
-    // Result = Incident - (2 * dot(Incident, Normal)) * Normal
-
-    XMVECTOR Result = XMVector3Dot(Incident, Normal);
-    Result = XMVectorAdd(Result, Result);
-    Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident);
-
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Refract(FXMVECTOR Incident,
-                                             FXMVECTOR Normal,
-                                             float RefractionIndex) noexcept {
-    XMVECTOR Index = XMVectorReplicate(RefractionIndex);
-    return XMVector3RefractV(Incident, Normal, Index);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3RefractV(
-    FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept {
-    // Result = RefractionIndex * Incident - Normal * (RefractionIndex *
-    // dot(Incident, Normal) + sqrt(1 - RefractionIndex * RefractionIndex * (1 -
-    // dot(Incident, Normal) * dot(Incident, Normal))))
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    const XMVECTOR Zero = XMVectorZero();
-
-    XMVECTOR IDotN = XMVector3Dot(Incident, Normal);
-
-    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
-    XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v);
-    R = XMVectorMultiply(R, RefractionIndex);
-    R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v);
-
-    if (XMVector4LessOrEqual(R, Zero)) {
-        // Total internal reflection
-        return Zero;
-    } else {
-        // R = RefractionIndex * IDotN + sqrt(R)
-        R = XMVectorSqrt(R);
-        R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R);
-
-        // Result = RefractionIndex * Incident - Normal * R
-        XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident);
-        Result = XMVectorNegativeMultiplySubtract(Normal, R, Result);
-
-        return Result;
-    }
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTOR IDotN = XMVector3Dot(Incident, Normal);
-
-    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
-    float32x4_t R = vmlsq_f32(g_XMOne, IDotN, IDotN);
-    R = vmulq_f32(R, RefractionIndex);
-    R = vmlsq_f32(g_XMOne, R, RefractionIndex);
-
-    uint32x4_t isrzero = vcleq_f32(R, g_XMZero);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(isrzero)),
-                                vget_high_u8(vreinterpretq_u8_u32(isrzero)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-
-    float32x4_t vResult;
-    if (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU) {
-        // Total internal reflection
-        vResult = g_XMZero;
-    } else {
-        // Sqrt(R)
-        float32x4_t S0 = vrsqrteq_f32(R);
-        float32x4_t P0 = vmulq_f32(R, S0);
-        float32x4_t R0 = vrsqrtsq_f32(P0, S0);
-        float32x4_t S1 = vmulq_f32(S0, R0);
-        float32x4_t P1 = vmulq_f32(R, S1);
-        float32x4_t R1 = vrsqrtsq_f32(P1, S1);
-        float32x4_t S2 = vmulq_f32(S1, R1);
-        R = vmulq_f32(R, S2);
-        // R = RefractionIndex * IDotN + sqrt(R)
-        R = vmlaq_f32(R, RefractionIndex, IDotN);
-        // Result = RefractionIndex * Incident - Normal * R
-        vResult = vmulq_f32(RefractionIndex, Incident);
-        vResult = vmlsq_f32(vResult, R, Normal);
-    }
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Result = RefractionIndex * Incident - Normal * (RefractionIndex *
-    // dot(Incident, Normal) + sqrt(1 - RefractionIndex * RefractionIndex * (1 -
-    // dot(Incident, Normal) * dot(Incident, Normal))))
-    XMVECTOR IDotN = XMVector3Dot(Incident, Normal);
-    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
-    XMVECTOR R = XM_FNMADD_PS(IDotN, IDotN, g_XMOne);
-    XMVECTOR R2 = _mm_mul_ps(RefractionIndex, RefractionIndex);
-    R = XM_FNMADD_PS(R, R2, g_XMOne);
-
-    XMVECTOR vResult = _mm_cmple_ps(R, g_XMZero);
-    if (_mm_movemask_ps(vResult) == 0x0f) {
-        // Total internal reflection
-        vResult = g_XMZero;
-    } else {
-        // R = RefractionIndex * IDotN + sqrt(R)
-        R = _mm_sqrt_ps(R);
-        R = XM_FMADD_PS(RefractionIndex, IDotN, R);
-        // Result = RefractionIndex * Incident - Normal * R
-        vResult = _mm_mul_ps(RefractionIndex, Incident);
-        vResult = XM_FNMADD_PS(R, Normal, vResult);
-    }
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V) noexcept {
-    XMVECTOR Zero = XMVectorZero();
-    XMVECTOR Z = XMVectorSplatZ(V);
-    XMVECTOR YZYY =
-        XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(
-            V);
-
-    XMVECTOR NegativeV = XMVectorSubtract(Zero, V);
-
-    XMVECTOR ZIsNegative = XMVectorLess(Z, Zero);
-    XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero);
-
-    XMVECTOR S = XMVectorAdd(YZYY, Z);
-    XMVECTOR D = XMVectorSubtract(YZYY, Z);
-
-    XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative);
-
-    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X,
-                                  XM_PERMUTE_0X>(NegativeV, S);
-    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X,
-                                  XM_PERMUTE_0X>(V, D);
-
-    return XMVectorSelect(R1, R0, Select);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMVector3AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept {
-    XMVECTOR Result = XMVector3Dot(N1, N2);
-    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
-    Result = XMVectorACosEst(Result);
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMVector3AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept {
-    XMVECTOR Result = XMVector3Dot(N1, N2);
-    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
-    Result = XMVectorACos(Result);
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-    XMVECTOR L1 = XMVector3ReciprocalLength(V1);
-    XMVECTOR L2 = XMVector3ReciprocalLength(V2);
-
-    XMVECTOR Dot = XMVector3Dot(V1, V2);
-
-    L1 = XMVectorMultiply(L1, L2);
-
-    XMVECTOR CosAngle = XMVectorMultiply(Dot, L1);
-    CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v);
-
-    return XMVectorACos(CosAngle);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3LinePointDistance(
-    FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point) noexcept {
-    // Given a vector PointVector from LinePoint1 to Point and a vector
-    // LineVector from LinePoint1 to LinePoint2, the scaled distance
-    // PointProjectionScale from LinePoint1 to the perpendicular projection
-    // of PointVector onto the line is defined as:
-    //
-    //     PointProjectionScale = dot(PointVector, LineVector) /
-    //     LengthSq(LineVector)
-
-    XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1);
-    XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1);
-
-    XMVECTOR LengthSq = XMVector3LengthSq(LineVector);
-
-    XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector);
-    PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq);
-
-    XMVECTOR DistanceVector =
-        XMVectorMultiply(LineVector, PointProjectionScale);
-    DistanceVector = XMVectorSubtract(PointVector, DistanceVector);
-
-    return XMVector3Length(DistanceVector);
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMVector3ComponentsFromNormal(XMVECTOR* pParallel, XMVECTOR* pPerpendicular,
-                              FXMVECTOR V, FXMVECTOR Normal) noexcept {
-    assert(pParallel != nullptr);
-    assert(pPerpendicular != nullptr);
-
-    XMVECTOR Scale = XMVector3Dot(V, Normal);
-
-    XMVECTOR Parallel = XMVectorMultiply(Normal, Scale);
-
-    *pParallel = Parallel;
-    *pPerpendicular = XMVectorSubtract(V, Parallel);
-}
-
-//------------------------------------------------------------------------------
-// Transform a vector using a rotation expressed as a unit quaternion
-
-inline XMVECTOR XM_CALLCONV
-XMVector3Rotate(FXMVECTOR V, FXMVECTOR RotationQuaternion) noexcept {
-    XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v);
-    XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion);
-    XMVECTOR Result = XMQuaternionMultiply(Q, A);
-    return XMQuaternionMultiply(Result, RotationQuaternion);
-}
-
-//------------------------------------------------------------------------------
-// Transform a vector using the inverse of a rotation expressed as a unit
-// quaternion
-
-inline XMVECTOR XM_CALLCONV
-XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion) noexcept {
-    XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v);
-    XMVECTOR Result = XMQuaternionMultiply(RotationQuaternion, A);
-    XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion);
-    return XMQuaternionMultiply(Result, Q);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Transform(FXMVECTOR V,
-                                               FXMMATRIX M) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Z = XMVectorSplatZ(V);
-    XMVECTOR Y = XMVectorSplatY(V);
-    XMVECTOR X = XMVectorSplatX(V);
-
-    XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]);
-    Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
-    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    XMVECTOR vResult = vmlaq_lane_f32(M.r[3], M.r[0], VL, 0);     // X
-    vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1);             // Y
-    return vmlaq_lane_f32(vResult, M.r[2], vget_high_f32(V), 0);  // Z
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));  // Z
-    vResult = XM_FMADD_PS(vResult, M.r[2], M.r[3]);
-    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));  // Y
-    vResult = XM_FMADD_PS(vTemp, M.r[1], vResult);
-    vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));  // X
-    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307")
-#endif
-
-_Use_decl_annotations_ inline XMFLOAT4* XM_CALLCONV XMVector3TransformStream(
-    XMFLOAT4* pOutputStream, size_t OutputStride, const XMFLOAT3* pInputStream,
-    size_t InputStride, size_t VectorCount, FXMMATRIX M) noexcept {
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT3));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
-
-    assert(OutputStride >= sizeof(XMFLOAT4));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    for (size_t i = 0; i < VectorCount; i++) {
-        XMVECTOR V =
-            XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
-        XMVECTOR Z = XMVectorSplatZ(V);
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3);
-        Result = XMVectorMultiplyAdd(Y, row1, Result);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-        XMStoreFloat4(reinterpret_cast<XMFLOAT4*>(pOutputVector), Result);
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        if ((InputStride == sizeof(XMFLOAT3)) &&
-            (OutputStride == sizeof(XMFLOAT4))) {
-            for (size_t j = 0; j < four; ++j) {
-                float32x4x3_t V =
-                    vld3q_f32(reinterpret_cast<const float*>(pInputVector));
-                pInputVector += sizeof(XMFLOAT3) * 4;
-
-                float32x2_t r3 = vget_low_f32(row3);
-                float32x2_t r = vget_low_f32(row0);
-                XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0),
-                                                   V.val[0], r, 0);  // Ax+M
-                XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1),
-                                                   V.val[0], r, 1);  // Bx+N
-
-                XM_PREFETCH(pInputVector);
-
-                r3 = vget_high_f32(row3);
-                r = vget_high_f32(row0);
-                XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0),
-                                                   V.val[0], r, 0);  // Cx+O
-                XMVECTOR vResult3 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1),
-                                                   V.val[0], r, 1);  // Dx+P
-
-                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
-
-                r = vget_low_f32(row1);
-                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0);  // Ax+Ey+M
-                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1);  // Bx+Fy+N
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
-
-                r = vget_high_f32(row1);
-                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0);  // Cx+Gy+O
-                vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1);  // Dx+Hy+P
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
-
-                r = vget_low_f32(row2);
-                vResult0 =
-                    vmlaq_lane_f32(vResult0, V.val[2], r, 0);  // Ax+Ey+Iz+M
-                vResult1 =
-                    vmlaq_lane_f32(vResult1, V.val[2], r, 1);  // Bx+Fy+Jz+N
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4));
-
-                r = vget_high_f32(row2);
-                vResult2 =
-                    vmlaq_lane_f32(vResult2, V.val[2], r, 0);  // Cx+Gy+Kz+O
-                vResult3 =
-                    vmlaq_lane_f32(vResult3, V.val[2], r, 1);  // Dx+Hy+Lz+P
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5));
-
-                float32x4x4_t R;
-                R.val[0] = vResult0;
-                R.val[1] = vResult1;
-                R.val[2] = vResult2;
-                R.val[3] = vResult3;
-
-                vst4q_f32(reinterpret_cast<float*>(pOutputVector), R);
-                pOutputVector += sizeof(XMFLOAT4) * 4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++) {
-        float32x2_t VL = vld1_f32(reinterpret_cast<const float*>(pInputVector));
-        float32x2_t zero = vdup_n_f32(0);
-        float32x2_t VH = vld1_lane_f32(
-            reinterpret_cast<const float*>(pInputVector) + 2, zero, 0);
-        pInputVector += InputStride;
-
-        XMVECTOR vResult = vmlaq_lane_f32(row3, row0, VL, 0);  // X
-        vResult = vmlaq_lane_f32(vResult, row1, VL, 1);        // Y
-        vResult = vmlaq_lane_f32(vResult, row2, VH, 0);        // Z
-
-        vst1q_f32(reinterpret_cast<float*>(pOutputVector), vResult);
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        if (InputStride == sizeof(XMFLOAT3)) {
-            if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) &&
-                !(OutputStride & 0xF)) {
-                // Packed input, aligned output
-                for (size_t j = 0; j < four; ++j) {
-                    __m128 V1 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector));
-                    __m128 L2 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector + 16));
-                    __m128 L3 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector + 32));
-                    pInputVector += sizeof(XMFLOAT3) * 4;
-
-                    // Unpack the 4 vectors (.w components are junk)
-                    XM3UNPACK3INTO4(V1, L2, L3);
-
-                    // Result 1
-                    XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
-                    XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
-                    XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
-                    XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
-                    XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-                    XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                 vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 2
-                    Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, row2, row3);
-                    vTemp2 = _mm_mul_ps(Y, row1);
-                    vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-                    XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                 vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 3
-                    Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, row2, row3);
-                    vTemp2 = _mm_mul_ps(Y, row1);
-                    vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-                    XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                 vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 4
-                    Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, row2, row3);
-                    vTemp2 = _mm_mul_ps(Y, row1);
-                    vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-                    XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                 vTemp);
-                    pOutputVector += OutputStride;
-
-                    i += 4;
-                }
-            } else {
-                // Packed input, unaligned output
-                for (size_t j = 0; j < four; ++j) {
-                    __m128 V1 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector));
-                    __m128 L2 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector + 16));
-                    __m128 L3 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector + 32));
-                    pInputVector += sizeof(XMFLOAT3) * 4;
-
-                    // Unpack the 4 vectors (.w components are junk)
-                    XM3UNPACK3INTO4(V1, L2, L3);
-
-                    // Result 1
-                    XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
-                    XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
-                    XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
-                    XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
-                    XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 2
-                    Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, row2, row3);
-                    vTemp2 = _mm_mul_ps(Y, row1);
-                    vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 3
-                    Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, row2, row3);
-                    vTemp2 = _mm_mul_ps(Y, row1);
-                    vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 4
-                    Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, row2, row3);
-                    vTemp2 = _mm_mul_ps(Y, row1);
-                    vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    i += 4;
-                }
-            }
-        }
-    }
-
-    if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) &&
-        !(OutputStride & 0xF)) {
-        // Aligned output
-        for (; i < VectorCount; ++i) {
-            XMVECTOR V =
-                XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
-            pInputVector += InputStride;
-
-            XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-            XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
-            XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
-            XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
-            vTemp = _mm_add_ps(vTemp, vTemp2);
-            vTemp = _mm_add_ps(vTemp, vTemp3);
-
-            XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
-            pOutputVector += OutputStride;
-        }
-    } else {
-        // Unaligned output
-        for (; i < VectorCount; ++i) {
-            XMVECTOR V =
-                XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
-            pInputVector += InputStride;
-
-            XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-            XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
-            XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
-            XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
-            vTemp = _mm_add_ps(vTemp, vTemp2);
-            vTemp = _mm_add_ps(vTemp, vTemp3);
-
-            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
-            pOutputVector += OutputStride;
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3TransformCoord(FXMVECTOR V,
-                                                    FXMMATRIX M) noexcept {
-    XMVECTOR Z = XMVectorSplatZ(V);
-    XMVECTOR Y = XMVectorSplatY(V);
-    XMVECTOR X = XMVectorSplatX(V);
-
-    XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]);
-    Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
-    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
-
-    XMVECTOR W = XMVectorSplatW(Result);
-    return XMVectorDivide(Result, W);
-}
-
-//------------------------------------------------------------------------------
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307")
-#endif
-
-_Use_decl_annotations_ inline XMFLOAT3* XM_CALLCONV
-XMVector3TransformCoordStream(XMFLOAT3* pOutputStream, size_t OutputStride,
-                              const XMFLOAT3* pInputStream, size_t InputStride,
-                              size_t VectorCount, FXMMATRIX M) noexcept {
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT3));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
-
-    assert(OutputStride >= sizeof(XMFLOAT3));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    for (size_t i = 0; i < VectorCount; i++) {
-        XMVECTOR V =
-            XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
-        XMVECTOR Z = XMVectorSplatZ(V);
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3);
-        Result = XMVectorMultiplyAdd(Y, row1, Result);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-        XMVECTOR W = XMVectorSplatW(Result);
-
-        Result = XMVectorDivide(Result, W);
-
-        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), Result);
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        if ((InputStride == sizeof(XMFLOAT3)) &&
-            (OutputStride == sizeof(XMFLOAT3))) {
-            for (size_t j = 0; j < four; ++j) {
-                float32x4x3_t V =
-                    vld3q_f32(reinterpret_cast<const float*>(pInputVector));
-                pInputVector += sizeof(XMFLOAT3) * 4;
-
-                float32x2_t r3 = vget_low_f32(row3);
-                float32x2_t r = vget_low_f32(row0);
-                XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0),
-                                                   V.val[0], r, 0);  // Ax+M
-                XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1),
-                                                   V.val[0], r, 1);  // Bx+N
-
-                XM_PREFETCH(pInputVector);
-
-                r3 = vget_high_f32(row3);
-                r = vget_high_f32(row0);
-                XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0),
-                                                   V.val[0], r, 0);  // Cx+O
-                XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r,
-                                            1);  // Dx+P
-
-                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
-
-                r = vget_low_f32(row1);
-                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0);  // Ax+Ey+M
-                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1);  // Bx+Fy+N
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
-
-                r = vget_high_f32(row1);
-                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0);  // Cx+Gy+O
-                W = vmlaq_lane_f32(W, V.val[1], r, 1);                // Dx+Hy+P
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
-
-                r = vget_low_f32(row2);
-                vResult0 =
-                    vmlaq_lane_f32(vResult0, V.val[2], r, 0);  // Ax+Ey+Iz+M
-                vResult1 =
-                    vmlaq_lane_f32(vResult1, V.val[2], r, 1);  // Bx+Fy+Jz+N
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4));
-
-                r = vget_high_f32(row2);
-                vResult2 =
-                    vmlaq_lane_f32(vResult2, V.val[2], r, 0);  // Cx+Gy+Kz+O
-                W = vmlaq_lane_f32(W, V.val[2], r, 1);         // Dx+Hy+Lz+P
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5));
-
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-                V.val[0] = vdivq_f32(vResult0, W);
-                V.val[1] = vdivq_f32(vResult1, W);
-                V.val[2] = vdivq_f32(vResult2, W);
-#else
-                // 2 iterations of Newton-Raphson refinement of reciprocal
-                float32x4_t Reciprocal = vrecpeq_f32(W);
-                float32x4_t S = vrecpsq_f32(Reciprocal, W);
-                Reciprocal = vmulq_f32(S, Reciprocal);
-                S = vrecpsq_f32(Reciprocal, W);
-                Reciprocal = vmulq_f32(S, Reciprocal);
-
-                V.val[0] = vmulq_f32(vResult0, Reciprocal);
-                V.val[1] = vmulq_f32(vResult1, Reciprocal);
-                V.val[2] = vmulq_f32(vResult2, Reciprocal);
-#endif
-
-                vst3q_f32(reinterpret_cast<float*>(pOutputVector), V);
-                pOutputVector += sizeof(XMFLOAT3) * 4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++) {
-        float32x2_t VL = vld1_f32(reinterpret_cast<const float*>(pInputVector));
-        float32x2_t zero = vdup_n_f32(0);
-        float32x2_t VH = vld1_lane_f32(
-            reinterpret_cast<const float*>(pInputVector) + 2, zero, 0);
-        pInputVector += InputStride;
-
-        XMVECTOR vResult = vmlaq_lane_f32(row3, row0, VL, 0);  // X
-        vResult = vmlaq_lane_f32(vResult, row1, VL, 1);        // Y
-        vResult = vmlaq_lane_f32(vResult, row2, VH, 0);        // Z
-
-        VH = vget_high_f32(vResult);
-        XMVECTOR W = vdupq_lane_f32(VH, 1);
-
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-        vResult = vdivq_f32(vResult, W);
-#else
-        // 2 iterations of Newton-Raphson refinement of reciprocal for W
-        float32x4_t Reciprocal = vrecpeq_f32(W);
-        float32x4_t S = vrecpsq_f32(Reciprocal, W);
-        Reciprocal = vmulq_f32(S, Reciprocal);
-        S = vrecpsq_f32(Reciprocal, W);
-        Reciprocal = vmulq_f32(S, Reciprocal);
-
-        vResult = vmulq_f32(vResult, Reciprocal);
-#endif
-
-        VL = vget_low_f32(vResult);
-        vst1_f32(reinterpret_cast<float*>(pOutputVector), VL);
-        vst1q_lane_f32(reinterpret_cast<float*>(pOutputVector) + 2, vResult, 2);
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        if (InputStride == sizeof(XMFLOAT3)) {
-            if (OutputStride == sizeof(XMFLOAT3)) {
-                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF)) {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m128 V1 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        __m128 L2 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 16));
-                        __m128 L3 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 32));
-                        pInputVector += sizeof(XMFLOAT3) * 4;
-
-                        // Unpack the 4 vectors (.w components are junk)
-                        XM3UNPACK3INTO4(V1, L2, L3);
-
-                        // Result 1
-                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
-                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
-                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
-                        XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
-                        XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        XMVECTOR W =
-                            XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                        V1 = _mm_div_ps(vTemp, W);
-
-                        // Result 2
-                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, row2, row3);
-                        vTemp2 = _mm_mul_ps(Y, row1);
-                        vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                        V2 = _mm_div_ps(vTemp, W);
-
-                        // Result 3
-                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, row2, row3);
-                        vTemp2 = _mm_mul_ps(Y, row1);
-                        vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                        V3 = _mm_div_ps(vTemp, W);
-
-                        // Result 4
-                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, row2, row3);
-                        vTemp2 = _mm_mul_ps(Y, row1);
-                        vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                        V4 = _mm_div_ps(vTemp, W);
-
-                        // Pack and store the vectors
-                        XM3PACK4INTO3(vTemp);
-                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                     V1);
-                        XM_STREAM_PS(
-                            reinterpret_cast<float*>(pOutputVector + 16),
-                            vTemp);
-                        XM_STREAM_PS(
-                            reinterpret_cast<float*>(pOutputVector + 32), V3);
-                        pOutputVector += sizeof(XMFLOAT3) * 4;
-                        i += 4;
-                    }
-                } else {
-                    // Packed input, unaligned & packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m128 V1 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        __m128 L2 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 16));
-                        __m128 L3 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 32));
-                        pInputVector += sizeof(XMFLOAT3) * 4;
-
-                        // Unpack the 4 vectors (.w components are junk)
-                        XM3UNPACK3INTO4(V1, L2, L3);
-
-                        // Result 1
-                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
-                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
-                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
-                        XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
-                        XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        XMVECTOR W =
-                            XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                        V1 = _mm_div_ps(vTemp, W);
-
-                        // Result 2
-                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, row2, row3);
-                        vTemp2 = _mm_mul_ps(Y, row1);
-                        vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                        V2 = _mm_div_ps(vTemp, W);
-
-                        // Result 3
-                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, row2, row3);
-                        vTemp2 = _mm_mul_ps(Y, row1);
-                        vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                        V3 = _mm_div_ps(vTemp, W);
-
-                        // Result 4
-                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, row2, row3);
-                        vTemp2 = _mm_mul_ps(Y, row1);
-                        vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                        V4 = _mm_div_ps(vTemp, W);
-
-                        // Pack and store the vectors
-                        XM3PACK4INTO3(vTemp);
-                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                      V1);
-                        _mm_storeu_ps(
-                            reinterpret_cast<float*>(pOutputVector + 16),
-                            vTemp);
-                        _mm_storeu_ps(
-                            reinterpret_cast<float*>(pOutputVector + 32), V3);
-                        pOutputVector += sizeof(XMFLOAT3) * 4;
-                        i += 4;
-                    }
-                }
-            } else {
-                // Packed input, unpacked output
-                for (size_t j = 0; j < four; ++j) {
-                    __m128 V1 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector));
-                    __m128 L2 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector + 16));
-                    __m128 L3 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector + 32));
-                    pInputVector += sizeof(XMFLOAT3) * 4;
-
-                    // Unpack the 4 vectors (.w components are junk)
-                    XM3UNPACK3INTO4(V1, L2, L3);
-
-                    // Result 1
-                    XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
-                    XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
-                    XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
-                    XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
-                    XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                    vTemp = _mm_div_ps(vTemp, W);
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 2
-                    Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, row2, row3);
-                    vTemp2 = _mm_mul_ps(Y, row1);
-                    vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                    vTemp = _mm_div_ps(vTemp, W);
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 3
-                    Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, row2, row3);
-                    vTemp2 = _mm_mul_ps(Y, row1);
-                    vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                    vTemp = _mm_div_ps(vTemp, W);
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 4
-                    Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, row2, row3);
-                    vTemp2 = _mm_mul_ps(Y, row1);
-                    vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-                    vTemp = _mm_div_ps(vTemp, W);
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    i += 4;
-                }
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++) {
-        XMVECTOR V =
-            XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
-        pInputVector += InputStride;
-
-        XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-        XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
-        XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
-        XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
-        vTemp = _mm_add_ps(vTemp, vTemp2);
-        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-
-        vTemp = _mm_div_ps(vTemp, W);
-
-        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
-        pOutputVector += OutputStride;
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3TransformNormal(FXMVECTOR V,
-                                                     FXMMATRIX M) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Z = XMVectorSplatZ(V);
-    XMVECTOR Y = XMVectorSplatY(V);
-    XMVECTOR X = XMVectorSplatX(V);
-
-    XMVECTOR Result = XMVectorMultiply(Z, M.r[2]);
-    Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
-    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    XMVECTOR vResult = vmulq_lane_f32(M.r[0], VL, 0);             // X
-    vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1);             // Y
-    return vmlaq_lane_f32(vResult, M.r[2], vget_high_f32(V), 0);  // Z
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));  // Z
-    vResult = _mm_mul_ps(vResult, M.r[2]);
-    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));  // Y
-    vResult = XM_FMADD_PS(vTemp, M.r[1], vResult);
-    vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));  // X
-    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307")
-#endif
-
-_Use_decl_annotations_ inline XMFLOAT3* XM_CALLCONV
-XMVector3TransformNormalStream(XMFLOAT3* pOutputStream, size_t OutputStride,
-                               const XMFLOAT3* pInputStream, size_t InputStride,
-                               size_t VectorCount, FXMMATRIX M) noexcept {
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT3));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
-
-    assert(OutputStride >= sizeof(XMFLOAT3));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-
-    for (size_t i = 0; i < VectorCount; i++) {
-        XMVECTOR V =
-            XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
-        XMVECTOR Z = XMVectorSplatZ(V);
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiply(Z, row2);
-        Result = XMVectorMultiplyAdd(Y, row1, Result);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), Result);
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        if ((InputStride == sizeof(XMFLOAT3)) &&
-            (OutputStride == sizeof(XMFLOAT3))) {
-            for (size_t j = 0; j < four; ++j) {
-                float32x4x3_t V =
-                    vld3q_f32(reinterpret_cast<const float*>(pInputVector));
-                pInputVector += sizeof(XMFLOAT3) * 4;
-
-                float32x2_t r = vget_low_f32(row0);
-                XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0);  // Ax
-                XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1);  // Bx
-
-                XM_PREFETCH(pInputVector);
-
-                r = vget_high_f32(row0);
-                XMVECTOR vResult2 = vmulq_lane_f32(V.val[0], r, 0);  // Cx
-
-                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
-
-                r = vget_low_f32(row1);
-                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0);  // Ax+Ey
-                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1);  // Bx+Fy
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
-
-                r = vget_high_f32(row1);
-                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0);  // Cx+Gy
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
-
-                r = vget_low_f32(row2);
-                vResult0 =
-                    vmlaq_lane_f32(vResult0, V.val[2], r, 0);  // Ax+Ey+Iz
-                vResult1 =
-                    vmlaq_lane_f32(vResult1, V.val[2], r, 1);  // Bx+Fy+Jz
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4));
-
-                r = vget_high_f32(row2);
-                vResult2 =
-                    vmlaq_lane_f32(vResult2, V.val[2], r, 0);  // Cx+Gy+Kz
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5));
-
-                V.val[0] = vResult0;
-                V.val[1] = vResult1;
-                V.val[2] = vResult2;
-
-                vst3q_f32(reinterpret_cast<float*>(pOutputVector), V);
-                pOutputVector += sizeof(XMFLOAT3) * 4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++) {
-        float32x2_t VL = vld1_f32(reinterpret_cast<const float*>(pInputVector));
-        float32x2_t zero = vdup_n_f32(0);
-        float32x2_t VH = vld1_lane_f32(
-            reinterpret_cast<const float*>(pInputVector) + 2, zero, 0);
-        pInputVector += InputStride;
-
-        XMVECTOR vResult = vmulq_lane_f32(row0, VL, 0);  // X
-        vResult = vmlaq_lane_f32(vResult, row1, VL, 1);  // Y
-        vResult = vmlaq_lane_f32(vResult, row2, VH, 0);  // Z
-
-        VL = vget_low_f32(vResult);
-        vst1_f32(reinterpret_cast<float*>(pOutputVector), VL);
-        vst1q_lane_f32(reinterpret_cast<float*>(pOutputVector) + 2, vResult, 2);
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        if (InputStride == sizeof(XMFLOAT3)) {
-            if (OutputStride == sizeof(XMFLOAT3)) {
-                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF)) {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m128 V1 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        __m128 L2 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 16));
-                        __m128 L3 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 32));
-                        pInputVector += sizeof(XMFLOAT3) * 4;
-
-                        // Unpack the 4 vectors (.w components are junk)
-                        XM3UNPACK3INTO4(V1, L2, L3);
-
-                        // Result 1
-                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
-                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
-                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        XMVECTOR vTemp = _mm_mul_ps(Z, row2);
-                        XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
-                        XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        V1 = _mm_add_ps(vTemp, vTemp3);
-
-                        // Result 2
-                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = _mm_mul_ps(Z, row2);
-                        vTemp2 = _mm_mul_ps(Y, row1);
-                        vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        V2 = _mm_add_ps(vTemp, vTemp3);
-
-                        // Result 3
-                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = _mm_mul_ps(Z, row2);
-                        vTemp2 = _mm_mul_ps(Y, row1);
-                        vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        V3 = _mm_add_ps(vTemp, vTemp3);
-
-                        // Result 4
-                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = _mm_mul_ps(Z, row2);
-                        vTemp2 = _mm_mul_ps(Y, row1);
-                        vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        V4 = _mm_add_ps(vTemp, vTemp3);
-
-                        // Pack and store the vectors
-                        XM3PACK4INTO3(vTemp);
-                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                     V1);
-                        XM_STREAM_PS(
-                            reinterpret_cast<float*>(pOutputVector + 16),
-                            vTemp);
-                        XM_STREAM_PS(
-                            reinterpret_cast<float*>(pOutputVector + 32), V3);
-                        pOutputVector += sizeof(XMFLOAT3) * 4;
-                        i += 4;
-                    }
-                } else {
-                    // Packed input, unaligned & packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m128 V1 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        __m128 L2 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 16));
-                        __m128 L3 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 32));
-                        pInputVector += sizeof(XMFLOAT3) * 4;
-
-                        // Unpack the 4 vectors (.w components are junk)
-                        XM3UNPACK3INTO4(V1, L2, L3);
-
-                        // Result 1
-                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
-                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
-                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        XMVECTOR vTemp = _mm_mul_ps(Z, row2);
-                        XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
-                        XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        V1 = _mm_add_ps(vTemp, vTemp3);
-
-                        // Result 2
-                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = _mm_mul_ps(Z, row2);
-                        vTemp2 = _mm_mul_ps(Y, row1);
-                        vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        V2 = _mm_add_ps(vTemp, vTemp3);
-
-                        // Result 3
-                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = _mm_mul_ps(Z, row2);
-                        vTemp2 = _mm_mul_ps(Y, row1);
-                        vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        V3 = _mm_add_ps(vTemp, vTemp3);
-
-                        // Result 4
-                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = _mm_mul_ps(Z, row2);
-                        vTemp2 = _mm_mul_ps(Y, row1);
-                        vTemp3 = _mm_mul_ps(X, row0);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        V4 = _mm_add_ps(vTemp, vTemp3);
-
-                        // Pack and store the vectors
-                        XM3PACK4INTO3(vTemp);
-                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                      V1);
-                        _mm_storeu_ps(
-                            reinterpret_cast<float*>(pOutputVector + 16),
-                            vTemp);
-                        _mm_storeu_ps(
-                            reinterpret_cast<float*>(pOutputVector + 32), V3);
-                        pOutputVector += sizeof(XMFLOAT3) * 4;
-                        i += 4;
-                    }
-                }
-            } else {
-                // Packed input, unpacked output
-                for (size_t j = 0; j < four; ++j) {
-                    __m128 V1 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector));
-                    __m128 L2 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector + 16));
-                    __m128 L3 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector + 32));
-                    pInputVector += sizeof(XMFLOAT3) * 4;
-
-                    // Unpack the 4 vectors (.w components are junk)
-                    XM3UNPACK3INTO4(V1, L2, L3);
-
-                    // Result 1
-                    XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
-                    XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
-                    XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    XMVECTOR vTemp = _mm_mul_ps(Z, row2);
-                    XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
-                    XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 2
-                    Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = _mm_mul_ps(Z, row2);
-                    vTemp2 = _mm_mul_ps(Y, row1);
-                    vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 3
-                    Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = _mm_mul_ps(Z, row2);
-                    vTemp2 = _mm_mul_ps(Y, row1);
-                    vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 4
-                    Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = _mm_mul_ps(Z, row2);
-                    vTemp2 = _mm_mul_ps(Y, row1);
-                    vTemp3 = _mm_mul_ps(X, row0);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    i += 4;
-                }
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++) {
-        XMVECTOR V =
-            XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
-        pInputVector += InputStride;
-
-        XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-        XMVECTOR vTemp = _mm_mul_ps(Z, row2);
-        XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
-        XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
-        vTemp = _mm_add_ps(vTemp, vTemp2);
-        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
-        pOutputVector += OutputStride;
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Project(
-    FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth,
-    float ViewportHeight, float ViewportMinZ, float ViewportMaxZ,
-    FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World) noexcept {
-    const float HalfViewportWidth = ViewportWidth * 0.5f;
-    const float HalfViewportHeight = ViewportHeight * 0.5f;
-
-    XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight,
-                                 ViewportMaxZ - ViewportMinZ, 0.0f);
-    XMVECTOR Offset =
-        XMVectorSet(ViewportX + HalfViewportWidth,
-                    ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);
-
-    XMMATRIX Transform = XMMatrixMultiply(World, View);
-    Transform = XMMatrixMultiply(Transform, Projection);
-
-    XMVECTOR Result = XMVector3TransformCoord(V, Transform);
-
-    Result = XMVectorMultiplyAdd(Result, Scale, Offset);
-
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307")
-#endif
-
-_Use_decl_annotations_ inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream(
-    XMFLOAT3* pOutputStream, size_t OutputStride, const XMFLOAT3* pInputStream,
-    size_t InputStride, size_t VectorCount, float ViewportX, float ViewportY,
-    float ViewportWidth, float ViewportHeight, float ViewportMinZ,
-    float ViewportMaxZ, FXMMATRIX Projection, CXMMATRIX View,
-    CXMMATRIX World) noexcept {
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT3));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
-
-    assert(OutputStride >= sizeof(XMFLOAT3));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    const float HalfViewportWidth = ViewportWidth * 0.5f;
-    const float HalfViewportHeight = ViewportHeight * 0.5f;
-
-    XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight,
-                                 ViewportMaxZ - ViewportMinZ, 1.0f);
-    XMVECTOR Offset =
-        XMVectorSet(ViewportX + HalfViewportWidth,
-                    ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);
-
-    XMMATRIX Transform = XMMatrixMultiply(World, View);
-    Transform = XMMatrixMultiply(Transform, Projection);
-
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    for (size_t i = 0; i < VectorCount; i++) {
-        XMVECTOR V =
-            XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
-
-        XMVECTOR Result = XMVector3TransformCoord(V, Transform);
-        Result = XMVectorMultiplyAdd(Result, Scale, Offset);
-
-        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), Result);
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    const float HalfViewportWidth = ViewportWidth * 0.5f;
-    const float HalfViewportHeight = ViewportHeight * 0.5f;
-
-    XMMATRIX Transform = XMMatrixMultiply(World, View);
-    Transform = XMMatrixMultiply(Transform, Projection);
-
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        if ((InputStride == sizeof(XMFLOAT3)) &&
-            (OutputStride == sizeof(XMFLOAT3))) {
-            XMVECTOR ScaleX = vdupq_n_f32(HalfViewportWidth);
-            XMVECTOR ScaleY = vdupq_n_f32(-HalfViewportHeight);
-            XMVECTOR ScaleZ = vdupq_n_f32(ViewportMaxZ - ViewportMinZ);
-
-            XMVECTOR OffsetX = vdupq_n_f32(ViewportX + HalfViewportWidth);
-            XMVECTOR OffsetY = vdupq_n_f32(ViewportY + HalfViewportHeight);
-            XMVECTOR OffsetZ = vdupq_n_f32(ViewportMinZ);
-
-            for (size_t j = 0; j < four; ++j) {
-                float32x4x3_t V =
-                    vld3q_f32(reinterpret_cast<const float*>(pInputVector));
-                pInputVector += sizeof(XMFLOAT3) * 4;
-
-                float32x2_t r3 = vget_low_f32(Transform.r[3]);
-                float32x2_t r = vget_low_f32(Transform.r[0]);
-                XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0),
-                                                   V.val[0], r, 0);  // Ax+M
-                XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1),
-                                                   V.val[0], r, 1);  // Bx+N
-
-                XM_PREFETCH(pInputVector);
-
-                r3 = vget_high_f32(Transform.r[3]);
-                r = vget_high_f32(Transform.r[0]);
-                XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0),
-                                                   V.val[0], r, 0);  // Cx+O
-                XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r,
-                                            1);  // Dx+P
-
-                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
-
-                r = vget_low_f32(Transform.r[1]);
-                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0);  // Ax+Ey+M
-                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1);  // Bx+Fy+N
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
-
-                r = vget_high_f32(Transform.r[1]);
-                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0);  // Cx+Gy+O
-                W = vmlaq_lane_f32(W, V.val[1], r, 1);                // Dx+Hy+P
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
-
-                r = vget_low_f32(Transform.r[2]);
-                vResult0 =
-                    vmlaq_lane_f32(vResult0, V.val[2], r, 0);  // Ax+Ey+Iz+M
-                vResult1 =
-                    vmlaq_lane_f32(vResult1, V.val[2], r, 1);  // Bx+Fy+Jz+N
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4));
-
-                r = vget_high_f32(Transform.r[2]);
-                vResult2 =
-                    vmlaq_lane_f32(vResult2, V.val[2], r, 0);  // Cx+Gy+Kz+O
-                W = vmlaq_lane_f32(W, V.val[2], r, 1);         // Dx+Hy+Lz+P
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5));
-
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-                vResult0 = vdivq_f32(vResult0, W);
-                vResult1 = vdivq_f32(vResult1, W);
-                vResult2 = vdivq_f32(vResult2, W);
-#else
-                // 2 iterations of Newton-Raphson refinement of reciprocal
-                float32x4_t Reciprocal = vrecpeq_f32(W);
-                float32x4_t S = vrecpsq_f32(Reciprocal, W);
-                Reciprocal = vmulq_f32(S, Reciprocal);
-                S = vrecpsq_f32(Reciprocal, W);
-                Reciprocal = vmulq_f32(S, Reciprocal);
-
-                vResult0 = vmulq_f32(vResult0, Reciprocal);
-                vResult1 = vmulq_f32(vResult1, Reciprocal);
-                vResult2 = vmulq_f32(vResult2, Reciprocal);
-#endif
-
-                V.val[0] = vmlaq_f32(OffsetX, vResult0, ScaleX);
-                V.val[1] = vmlaq_f32(OffsetY, vResult1, ScaleY);
-                V.val[2] = vmlaq_f32(OffsetZ, vResult2, ScaleZ);
-
-                vst3q_f32(reinterpret_cast<float*>(pOutputVector), V);
-                pOutputVector += sizeof(XMFLOAT3) * 4;
-
-                i += 4;
-            }
-        }
-    }
-
-    if (i < VectorCount) {
-        XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight,
-                                     ViewportMaxZ - ViewportMinZ, 1.0f);
-        XMVECTOR Offset =
-            XMVectorSet(ViewportX + HalfViewportWidth,
-                        ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);
-
-        for (; i < VectorCount; i++) {
-            float32x2_t VL =
-                vld1_f32(reinterpret_cast<const float*>(pInputVector));
-            float32x2_t zero = vdup_n_f32(0);
-            float32x2_t VH = vld1_lane_f32(
-                reinterpret_cast<const float*>(pInputVector) + 2, zero, 0);
-            pInputVector += InputStride;
-
-            XMVECTOR vResult =
-                vmlaq_lane_f32(Transform.r[3], Transform.r[0], VL, 0);  // X
-            vResult = vmlaq_lane_f32(vResult, Transform.r[1], VL, 1);   // Y
-            vResult = vmlaq_lane_f32(vResult, Transform.r[2], VH, 0);   // Z
-
-            VH = vget_high_f32(vResult);
-            XMVECTOR W = vdupq_lane_f32(VH, 1);
-
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-            vResult = vdivq_f32(vResult, W);
-#else
-            // 2 iterations of Newton-Raphson refinement of reciprocal for W
-            float32x4_t Reciprocal = vrecpeq_f32(W);
-            float32x4_t S = vrecpsq_f32(Reciprocal, W);
-            Reciprocal = vmulq_f32(S, Reciprocal);
-            S = vrecpsq_f32(Reciprocal, W);
-            Reciprocal = vmulq_f32(S, Reciprocal);
-
-            vResult = vmulq_f32(vResult, Reciprocal);
-#endif
-
-            vResult = vmlaq_f32(Offset, vResult, Scale);
-
-            VL = vget_low_f32(vResult);
-            vst1_f32(reinterpret_cast<float*>(pOutputVector), VL);
-            vst1q_lane_f32(reinterpret_cast<float*>(pOutputVector) + 2, vResult,
-                           2);
-            pOutputVector += OutputStride;
-        }
-    }
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    const float HalfViewportWidth = ViewportWidth * 0.5f;
-    const float HalfViewportHeight = ViewportHeight * 0.5f;
-
-    XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight,
-                                 ViewportMaxZ - ViewportMinZ, 1.0f);
-    XMVECTOR Offset =
-        XMVectorSet(ViewportX + HalfViewportWidth,
-                    ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);
-
-    XMMATRIX Transform = XMMatrixMultiply(World, View);
-    Transform = XMMatrixMultiply(Transform, Projection);
-
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        if (InputStride == sizeof(XMFLOAT3)) {
-            if (OutputStride == sizeof(XMFLOAT3)) {
-                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF)) {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m128 V1 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        __m128 L2 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 16));
-                        __m128 L3 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 32));
-                        pInputVector += sizeof(XMFLOAT3) * 4;
-
-                        // Unpack the 4 vectors (.w components are junk)
-                        XM3UNPACK3INTO4(V1, L2, L3);
-
-                        // Result 1
-                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
-                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
-                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        XMVECTOR vTemp =
-                            XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        XMVECTOR W =
-                            XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        vTemp = _mm_div_ps(vTemp, W);
-                        V1 = XM_FMADD_PS(vTemp, Scale, Offset);
-
-                        // Result 2
-                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        vTemp = _mm_div_ps(vTemp, W);
-                        V2 = XM_FMADD_PS(vTemp, Scale, Offset);
-
-                        // Result 3
-                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        vTemp = _mm_div_ps(vTemp, W);
-                        V3 = XM_FMADD_PS(vTemp, Scale, Offset);
-
-                        // Result 4
-                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        vTemp = _mm_div_ps(vTemp, W);
-                        V4 = XM_FMADD_PS(vTemp, Scale, Offset);
-
-                        // Pack and store the vectors
-                        XM3PACK4INTO3(vTemp);
-                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                     V1);
-                        XM_STREAM_PS(
-                            reinterpret_cast<float*>(pOutputVector + 16),
-                            vTemp);
-                        XM_STREAM_PS(
-                            reinterpret_cast<float*>(pOutputVector + 32), V3);
-                        pOutputVector += sizeof(XMFLOAT3) * 4;
-                        i += 4;
-                    }
-                } else {
-                    // Packed input, unaligned & packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m128 V1 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        __m128 L2 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 16));
-                        __m128 L3 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 32));
-                        pInputVector += sizeof(XMFLOAT3) * 4;
-
-                        // Unpack the 4 vectors (.w components are junk)
-                        XM3UNPACK3INTO4(V1, L2, L3);
-
-                        // Result 1
-                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
-                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
-                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        XMVECTOR vTemp =
-                            XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        XMVECTOR W =
-                            XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        vTemp = _mm_div_ps(vTemp, W);
-                        V1 = XM_FMADD_PS(vTemp, Scale, Offset);
-
-                        // Result 2
-                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        vTemp = _mm_div_ps(vTemp, W);
-                        V2 = XM_FMADD_PS(vTemp, Scale, Offset);
-
-                        // Result 3
-                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        vTemp = _mm_div_ps(vTemp, W);
-                        V3 = XM_FMADD_PS(vTemp, Scale, Offset);
-
-                        // Result 4
-                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        vTemp = _mm_div_ps(vTemp, W);
-                        V4 = XM_FMADD_PS(vTemp, Scale, Offset);
-
-                        // Pack and store the vectors
-                        XM3PACK4INTO3(vTemp);
-                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                      V1);
-                        _mm_storeu_ps(
-                            reinterpret_cast<float*>(pOutputVector + 16),
-                            vTemp);
-                        _mm_storeu_ps(
-                            reinterpret_cast<float*>(pOutputVector + 32), V3);
-                        pOutputVector += sizeof(XMFLOAT3) * 4;
-                        i += 4;
-                    }
-                }
-            } else {
-                // Packed input, unpacked output
-                for (size_t j = 0; j < four; ++j) {
-                    __m128 V1 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector));
-                    __m128 L2 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector + 16));
-                    __m128 L3 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector + 32));
-                    pInputVector += sizeof(XMFLOAT3) * 4;
-
-                    // Unpack the 4 vectors (.w components are junk)
-                    XM3UNPACK3INTO4(V1, L2, L3);
-
-                    // Result 1
-                    XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
-                    XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
-                    XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    XMVECTOR vTemp =
-                        XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                    XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                    XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                    vTemp = _mm_div_ps(vTemp, W);
-                    vTemp = XM_FMADD_PS(vTemp, Scale, Offset);
-
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 2
-                    Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                    vTemp = _mm_div_ps(vTemp, W);
-                    vTemp = XM_FMADD_PS(vTemp, Scale, Offset);
-
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 3
-                    Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                    vTemp = _mm_div_ps(vTemp, W);
-                    vTemp = XM_FMADD_PS(vTemp, Scale, Offset);
-
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 4
-                    Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                    vTemp = _mm_div_ps(vTemp, W);
-                    vTemp = XM_FMADD_PS(vTemp, Scale, Offset);
-
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    i += 4;
-                }
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++) {
-        XMVECTOR V =
-            XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
-        pInputVector += InputStride;
-
-        XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-        XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-        XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-        XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-        vTemp = _mm_add_ps(vTemp, vTemp2);
-        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-        vTemp = _mm_div_ps(vTemp, W);
-        vTemp = XM_FMADD_PS(vTemp, Scale, Offset);
-
-        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
-        pOutputVector += OutputStride;
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector3Unproject(
-    FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth,
-    float ViewportHeight, float ViewportMinZ, float ViewportMaxZ,
-    FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World) noexcept {
-    static const XMVECTORF32 D = {{{-1.0f, 1.0f, 0.0f, 0.0f}}};
-
-    XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f,
-                                 ViewportMaxZ - ViewportMinZ, 1.0f);
-    Scale = XMVectorReciprocal(Scale);
-
-    XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
-    Offset = XMVectorMultiplyAdd(Scale, Offset, D.v);
-
-    XMMATRIX Transform = XMMatrixMultiply(World, View);
-    Transform = XMMatrixMultiply(Transform, Projection);
-    Transform = XMMatrixInverse(nullptr, Transform);
-
-    XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset);
-
-    return XMVector3TransformCoord(Result, Transform);
-}
-
-//------------------------------------------------------------------------------
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307")
-#endif
-
-_Use_decl_annotations_ inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream(
-    XMFLOAT3* pOutputStream, size_t OutputStride, const XMFLOAT3* pInputStream,
-    size_t InputStride, size_t VectorCount, float ViewportX, float ViewportY,
-    float ViewportWidth, float ViewportHeight, float ViewportMinZ,
-    float ViewportMaxZ, FXMMATRIX Projection, CXMMATRIX View,
-    CXMMATRIX World) noexcept {
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT3));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT3));
-
-    assert(OutputStride >= sizeof(XMFLOAT3));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT3));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    static const XMVECTORF32 D = {{{-1.0f, 1.0f, 0.0f, 0.0f}}};
-
-    XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f,
-                                 ViewportMaxZ - ViewportMinZ, 1.0f);
-    Scale = XMVectorReciprocal(Scale);
-
-    XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
-    Offset = XMVectorMultiplyAdd(Scale, Offset, D.v);
-
-    XMMATRIX Transform = XMMatrixMultiply(World, View);
-    Transform = XMMatrixMultiply(Transform, Projection);
-    Transform = XMMatrixInverse(nullptr, Transform);
-
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    for (size_t i = 0; i < VectorCount; i++) {
-        XMVECTOR V =
-            XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
-
-        XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset);
-
-        Result = XMVector3TransformCoord(Result, Transform);
-
-        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), Result);
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMMATRIX Transform = XMMatrixMultiply(World, View);
-    Transform = XMMatrixMultiply(Transform, Projection);
-    Transform = XMMatrixInverse(nullptr, Transform);
-
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    float sx = 1.f / (ViewportWidth * 0.5f);
-    float sy = 1.f / (-ViewportHeight * 0.5f);
-    float sz = 1.f / (ViewportMaxZ - ViewportMinZ);
-
-    float ox = (-ViewportX * sx) - 1.f;
-    float oy = (-ViewportY * sy) + 1.f;
-    float oz = (-ViewportMinZ * sz);
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        if ((InputStride == sizeof(XMFLOAT3)) &&
-            (OutputStride == sizeof(XMFLOAT3))) {
-            for (size_t j = 0; j < four; ++j) {
-                float32x4x3_t V =
-                    vld3q_f32(reinterpret_cast<const float*>(pInputVector));
-                pInputVector += sizeof(XMFLOAT3) * 4;
-
-                XMVECTOR ScaleX = vdupq_n_f32(sx);
-                XMVECTOR OffsetX = vdupq_n_f32(ox);
-                XMVECTOR VX = vmlaq_f32(OffsetX, ScaleX, V.val[0]);
-
-                float32x2_t r3 = vget_low_f32(Transform.r[3]);
-                float32x2_t r = vget_low_f32(Transform.r[0]);
-                XMVECTOR vResult0 =
-                    vmlaq_lane_f32(vdupq_lane_f32(r3, 0), VX, r, 0);  // Ax+M
-                XMVECTOR vResult1 =
-                    vmlaq_lane_f32(vdupq_lane_f32(r3, 1), VX, r, 1);  // Bx+N
-
-                XM_PREFETCH(pInputVector);
-
-                r3 = vget_high_f32(Transform.r[3]);
-                r = vget_high_f32(Transform.r[0]);
-                XMVECTOR vResult2 =
-                    vmlaq_lane_f32(vdupq_lane_f32(r3, 0), VX, r, 0);  // Cx+O
-                XMVECTOR W =
-                    vmlaq_lane_f32(vdupq_lane_f32(r3, 1), VX, r, 1);  // Dx+P
-
-                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
-
-                XMVECTOR ScaleY = vdupq_n_f32(sy);
-                XMVECTOR OffsetY = vdupq_n_f32(oy);
-                XMVECTOR VY = vmlaq_f32(OffsetY, ScaleY, V.val[1]);
-
-                r = vget_low_f32(Transform.r[1]);
-                vResult0 = vmlaq_lane_f32(vResult0, VY, r, 0);  // Ax+Ey+M
-                vResult1 = vmlaq_lane_f32(vResult1, VY, r, 1);  // Bx+Fy+N
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
-
-                r = vget_high_f32(Transform.r[1]);
-                vResult2 = vmlaq_lane_f32(vResult2, VY, r, 0);  // Cx+Gy+O
-                W = vmlaq_lane_f32(W, VY, r, 1);                // Dx+Hy+P
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
-
-                XMVECTOR ScaleZ = vdupq_n_f32(sz);
-                XMVECTOR OffsetZ = vdupq_n_f32(oz);
-                XMVECTOR VZ = vmlaq_f32(OffsetZ, ScaleZ, V.val[2]);
-
-                r = vget_low_f32(Transform.r[2]);
-                vResult0 = vmlaq_lane_f32(vResult0, VZ, r, 0);  // Ax+Ey+Iz+M
-                vResult1 = vmlaq_lane_f32(vResult1, VZ, r, 1);  // Bx+Fy+Jz+N
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4));
-
-                r = vget_high_f32(Transform.r[2]);
-                vResult2 = vmlaq_lane_f32(vResult2, VZ, r, 0);  // Cx+Gy+Kz+O
-                W = vmlaq_lane_f32(W, VZ, r, 1);                // Dx+Hy+Lz+P
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5));
-
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-                V.val[0] = vdivq_f32(vResult0, W);
-                V.val[1] = vdivq_f32(vResult1, W);
-                V.val[2] = vdivq_f32(vResult2, W);
-#else
-                // 2 iterations of Newton-Raphson refinement of reciprocal
-                float32x4_t Reciprocal = vrecpeq_f32(W);
-                float32x4_t S = vrecpsq_f32(Reciprocal, W);
-                Reciprocal = vmulq_f32(S, Reciprocal);
-                S = vrecpsq_f32(Reciprocal, W);
-                Reciprocal = vmulq_f32(S, Reciprocal);
-
-                V.val[0] = vmulq_f32(vResult0, Reciprocal);
-                V.val[1] = vmulq_f32(vResult1, Reciprocal);
-                V.val[2] = vmulq_f32(vResult2, Reciprocal);
-#endif
-
-                vst3q_f32(reinterpret_cast<float*>(pOutputVector), V);
-                pOutputVector += sizeof(XMFLOAT3) * 4;
-
-                i += 4;
-            }
-        }
-    }
-
-    if (i < VectorCount) {
-        float32x2_t ScaleL = vcreate_f32(
-            static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&sx)) |
-            (static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&sy))
-             << 32));
-        float32x2_t ScaleH = vcreate_f32(
-            static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&sz)));
-
-        float32x2_t OffsetL = vcreate_f32(
-            static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&ox)) |
-            (static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&oy))
-             << 32));
-        float32x2_t OffsetH = vcreate_f32(
-            static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&oz)));
-
-        for (; i < VectorCount; i++) {
-            float32x2_t VL =
-                vld1_f32(reinterpret_cast<const float*>(pInputVector));
-            float32x2_t zero = vdup_n_f32(0);
-            float32x2_t VH = vld1_lane_f32(
-                reinterpret_cast<const float*>(pInputVector) + 2, zero, 0);
-            pInputVector += InputStride;
-
-            VL = vmla_f32(OffsetL, VL, ScaleL);
-            VH = vmla_f32(OffsetH, VH, ScaleH);
-
-            XMVECTOR vResult =
-                vmlaq_lane_f32(Transform.r[3], Transform.r[0], VL, 0);  // X
-            vResult = vmlaq_lane_f32(vResult, Transform.r[1], VL, 1);   // Y
-            vResult = vmlaq_lane_f32(vResult, Transform.r[2], VH, 0);   // Z
-
-            VH = vget_high_f32(vResult);
-            XMVECTOR W = vdupq_lane_f32(VH, 1);
-
-#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-    defined(_M_ARM64EC) || __aarch64__
-            vResult = vdivq_f32(vResult, W);
-#else
-            // 2 iterations of Newton-Raphson refinement of reciprocal for W
-            float32x4_t Reciprocal = vrecpeq_f32(W);
-            float32x4_t S = vrecpsq_f32(Reciprocal, W);
-            Reciprocal = vmulq_f32(S, Reciprocal);
-            S = vrecpsq_f32(Reciprocal, W);
-            Reciprocal = vmulq_f32(S, Reciprocal);
-
-            vResult = vmulq_f32(vResult, Reciprocal);
-#endif
-
-            VL = vget_low_f32(vResult);
-            vst1_f32(reinterpret_cast<float*>(pOutputVector), VL);
-            vst1q_lane_f32(reinterpret_cast<float*>(pOutputVector) + 2, vResult,
-                           2);
-            pOutputVector += OutputStride;
-        }
-    }
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 D = {{{-1.0f, 1.0f, 0.0f, 0.0f}}};
-
-    XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f,
-                                 ViewportMaxZ - ViewportMinZ, 1.0f);
-    Scale = XMVectorReciprocal(Scale);
-
-    XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
-    Offset = _mm_mul_ps(Scale, Offset);
-    Offset = _mm_add_ps(Offset, D);
-
-    XMMATRIX Transform = XMMatrixMultiply(World, View);
-    Transform = XMMatrixMultiply(Transform, Projection);
-    Transform = XMMatrixInverse(nullptr, Transform);
-
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        if (InputStride == sizeof(XMFLOAT3)) {
-            if (OutputStride == sizeof(XMFLOAT3)) {
-                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF)) {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m128 V1 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        __m128 L2 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 16));
-                        __m128 L3 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 32));
-                        pInputVector += sizeof(XMFLOAT3) * 4;
-
-                        // Unpack the 4 vectors (.w components are junk)
-                        XM3UNPACK3INTO4(V1, L2, L3);
-
-                        // Result 1
-                        V1 = XM_FMADD_PS(V1, Scale, Offset);
-
-                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
-                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
-                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        XMVECTOR vTemp =
-                            XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        XMVECTOR W =
-                            XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        V1 = _mm_div_ps(vTemp, W);
-
-                        // Result 2
-                        V2 = XM_FMADD_PS(V2, Scale, Offset);
-
-                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        V2 = _mm_div_ps(vTemp, W);
-
-                        // Result 3
-                        V3 = XM_FMADD_PS(V3, Scale, Offset);
-
-                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        V3 = _mm_div_ps(vTemp, W);
-
-                        // Result 4
-                        V4 = XM_FMADD_PS(V4, Scale, Offset);
-
-                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        V4 = _mm_div_ps(vTemp, W);
-
-                        // Pack and store the vectors
-                        XM3PACK4INTO3(vTemp);
-                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                     V1);
-                        XM_STREAM_PS(
-                            reinterpret_cast<float*>(pOutputVector + 16),
-                            vTemp);
-                        XM_STREAM_PS(
-                            reinterpret_cast<float*>(pOutputVector + 32), V3);
-                        pOutputVector += sizeof(XMFLOAT3) * 4;
-                        i += 4;
-                    }
-                } else {
-                    // Packed input, unaligned & packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m128 V1 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        __m128 L2 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 16));
-                        __m128 L3 = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector + 32));
-                        pInputVector += sizeof(XMFLOAT3) * 4;
-
-                        // Unpack the 4 vectors (.w components are junk)
-                        XM3UNPACK3INTO4(V1, L2, L3);
-
-                        // Result 1
-                        V1 = XM_FMADD_PS(V1, Scale, Offset);
-
-                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
-                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
-                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        XMVECTOR vTemp =
-                            XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        XMVECTOR W =
-                            XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        V1 = _mm_div_ps(vTemp, W);
-
-                        // Result 2
-                        V2 = XM_FMADD_PS(V2, Scale, Offset);
-
-                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        V2 = _mm_div_ps(vTemp, W);
-
-                        // Result 3
-                        V3 = XM_FMADD_PS(V3, Scale, Offset);
-
-                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        V3 = _mm_div_ps(vTemp, W);
-
-                        // Result 4
-                        V4 = XM_FMADD_PS(V4, Scale, Offset);
-
-                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
-                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
-                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
-
-                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                        vTemp = _mm_add_ps(vTemp, vTemp2);
-                        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                        V4 = _mm_div_ps(vTemp, W);
-
-                        // Pack and store the vectors
-                        XM3PACK4INTO3(vTemp);
-                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                      V1);
-                        _mm_storeu_ps(
-                            reinterpret_cast<float*>(pOutputVector + 16),
-                            vTemp);
-                        _mm_storeu_ps(
-                            reinterpret_cast<float*>(pOutputVector + 32), V3);
-                        pOutputVector += sizeof(XMFLOAT3) * 4;
-                        i += 4;
-                    }
-                }
-            } else {
-                // Packed input, unpacked output
-                for (size_t j = 0; j < four; ++j) {
-                    __m128 V1 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector));
-                    __m128 L2 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector + 16));
-                    __m128 L3 = _mm_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector + 32));
-                    pInputVector += sizeof(XMFLOAT3) * 4;
-
-                    // Unpack the 4 vectors (.w components are junk)
-                    XM3UNPACK3INTO4(V1, L2, L3);
-
-                    // Result 1
-                    V1 = XM_FMADD_PS(V1, Scale, Offset);
-
-                    XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
-                    XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
-                    XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    XMVECTOR vTemp =
-                        XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                    XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                    XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                    vTemp = _mm_div_ps(vTemp, W);
-
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 2
-                    V2 = XM_FMADD_PS(V2, Scale, Offset);
-
-                    Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                    vTemp = _mm_div_ps(vTemp, W);
-
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 3
-                    V3 = XM_FMADD_PS(V3, Scale, Offset);
-
-                    Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                    vTemp = _mm_div_ps(vTemp, W);
-
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    // Result 4
-                    V4 = XM_FMADD_PS(V4, Scale, Offset);
-
-                    Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
-                    Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
-                    X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));
-
-                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-                    vTemp = _mm_add_ps(vTemp, vTemp2);
-                    vTemp = _mm_add_ps(vTemp, vTemp3);
-
-                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-                    vTemp = _mm_div_ps(vTemp, W);
-
-                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector),
-                                  vTemp);
-                    pOutputVector += OutputStride;
-
-                    i += 4;
-                }
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++) {
-        XMVECTOR V =
-            XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
-        pInputVector += InputStride;
-
-        V = _mm_mul_ps(V, Scale);
-        V = _mm_add_ps(V, Offset);
-
-        XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-
-        XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
-        XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
-        XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
-        vTemp = _mm_add_ps(vTemp, vTemp2);
-        vTemp = _mm_add_ps(vTemp, vTemp3);
-
-        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
-        vTemp = _mm_div_ps(vTemp, W);
-
-        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
-        pOutputVector += OutputStride;
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-
-/****************************************************************************
- *
- * 4D Vector
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-// Comparison operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
-             (V1.vector4_f32[2] == V2.vector4_f32[2]) &&
-             (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vceqq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) ==
-            0xFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
-    return ((_mm_movemask_ps(vTemp) == 0x0f) != 0);
-#else
-    return XMComparisonAllTrue(XMVector4EqualR(V1, V2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline uint32_t XM_CALLCONV XMVector4EqualR(FXMVECTOR V1,
-                                            FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    uint32_t CR = 0;
-
-    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
-        (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
-        (V1.vector4_f32[2] == V2.vector4_f32[2]) &&
-        (V1.vector4_f32[3] == V2.vector4_f32[3])) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
-               (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
-               (V1.vector4_f32[2] != V2.vector4_f32[2]) &&
-               (V1.vector4_f32[3] != V2.vector4_f32[3])) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vceqq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
-
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFFFU) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpeq_ps(V1, V2);
-    int iTest = _mm_movemask_ps(vTemp);
-    uint32_t CR = 0;
-    if (iTest == 0xf)  // All equal?
-    {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (iTest == 0)  // All not equal?
-    {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_u32[0] == V2.vector4_u32[0]) &&
-             (V1.vector4_u32[1] == V2.vector4_u32[1]) &&
-             (V1.vector4_u32[2] == V2.vector4_u32[2]) &&
-             (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult =
-        vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) ==
-            0xFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) == 0xf) != 0);
-#else
-    return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline uint32_t XM_CALLCONV XMVector4EqualIntR(FXMVECTOR V1,
-                                               FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    uint32_t CR = 0;
-    if (V1.vector4_u32[0] == V2.vector4_u32[0] &&
-        V1.vector4_u32[1] == V2.vector4_u32[1] &&
-        V1.vector4_u32[2] == V2.vector4_u32[2] &&
-        V1.vector4_u32[3] == V2.vector4_u32[3]) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (V1.vector4_u32[0] != V2.vector4_u32[0] &&
-               V1.vector4_u32[1] != V2.vector4_u32[1] &&
-               V1.vector4_u32[2] != V2.vector4_u32[2] &&
-               V1.vector4_u32[3] != V2.vector4_u32[3]) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult =
-        vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
-
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFFFU) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp));
-    uint32_t CR = 0;
-    if (iTest == 0xf)  // All equal?
-    {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (iTest == 0)  // All not equal?
-    {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#endif
-}
-
-inline bool XM_CALLCONV XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2,
-                                           FXMVECTOR Epsilon) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    float dx, dy, dz, dw;
-
-    dx = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]);
-    dy = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]);
-    dz = fabsf(V1.vector4_f32[2] - V2.vector4_f32[2]);
-    dw = fabsf(V1.vector4_f32[3] - V2.vector4_f32[3]);
-    return (((dx <= Epsilon.vector4_f32[0]) && (dy <= Epsilon.vector4_f32[1]) &&
-             (dz <= Epsilon.vector4_f32[2]) &&
-             (dw <= Epsilon.vector4_f32[3])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vDelta = vsubq_f32(V1, V2);
-#if defined(_MSC_VER) && !defined(__clang__) && \
-    !defined(_ARM64_DISTINCT_NEON_TYPES)
-    uint32x4_t vResult = vacleq_f32(vDelta, Epsilon);
-#else
-    uint32x4_t vResult = vcleq_f32(vabsq_f32(vDelta), Epsilon);
-#endif
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) ==
-            0xFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Get the difference
-    XMVECTOR vDelta = _mm_sub_ps(V1, V2);
-    // Get the absolute value of the difference
-    XMVECTOR vTemp = _mm_setzero_ps();
-    vTemp = _mm_sub_ps(vTemp, vDelta);
-    vTemp = _mm_max_ps(vTemp, vDelta);
-    vTemp = _mm_cmple_ps(vTemp, Epsilon);
-    return ((_mm_movemask_ps(vTemp) == 0xf) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] != V2.vector4_f32[0]) ||
-             (V1.vector4_f32[1] != V2.vector4_f32[1]) ||
-             (V1.vector4_f32[2] != V2.vector4_f32[2]) ||
-             (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vceqq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) !=
-            0xFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpneq_ps(V1, V2);
-    return ((_mm_movemask_ps(vTemp)) != 0);
-#else
-    return XMComparisonAnyFalse(XMVector4EqualR(V1, V2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector4NotEqualInt(FXMVECTOR V1,
-                                             FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_u32[0] != V2.vector4_u32[0]) ||
-             (V1.vector4_u32[1] != V2.vector4_u32[1]) ||
-             (V1.vector4_u32[2] != V2.vector4_u32[2]) ||
-             (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult =
-        vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) !=
-            0xFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
-    return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp)) != 0xF) != 0);
-#else
-    return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] > V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] > V2.vector4_f32[1]) &&
-             (V1.vector4_f32[2] > V2.vector4_f32[2]) &&
-             (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vcgtq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) ==
-            0xFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2);
-    return ((_mm_movemask_ps(vTemp) == 0x0f) != 0);
-#else
-    return XMComparisonAllTrue(XMVector4GreaterR(V1, V2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline uint32_t XM_CALLCONV XMVector4GreaterR(FXMVECTOR V1,
-                                              FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    uint32_t CR = 0;
-    if (V1.vector4_f32[0] > V2.vector4_f32[0] &&
-        V1.vector4_f32[1] > V2.vector4_f32[1] &&
-        V1.vector4_f32[2] > V2.vector4_f32[2] &&
-        V1.vector4_f32[3] > V2.vector4_f32[3]) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (V1.vector4_f32[0] <= V2.vector4_f32[0] &&
-               V1.vector4_f32[1] <= V2.vector4_f32[1] &&
-               V1.vector4_f32[2] <= V2.vector4_f32[2] &&
-               V1.vector4_f32[3] <= V2.vector4_f32[3]) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vcgtq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
-
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFFFU) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    uint32_t CR = 0;
-    XMVECTOR vTemp = _mm_cmpgt_ps(V1, V2);
-    int iTest = _mm_movemask_ps(vTemp);
-    if (iTest == 0xf) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!iTest) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector4GreaterOrEqual(FXMVECTOR V1,
-                                                FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] >= V2.vector4_f32[1]) &&
-             (V1.vector4_f32[2] >= V2.vector4_f32[2]) &&
-             (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vcgeq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) ==
-            0xFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmpge_ps(V1, V2);
-    return ((_mm_movemask_ps(vTemp) == 0x0f) != 0);
-#else
-    return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline uint32_t XM_CALLCONV XMVector4GreaterOrEqualR(FXMVECTOR V1,
-                                                     FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    uint32_t CR = 0;
-    if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
-        (V1.vector4_f32[1] >= V2.vector4_f32[1]) &&
-        (V1.vector4_f32[2] >= V2.vector4_f32[2]) &&
-        (V1.vector4_f32[3] >= V2.vector4_f32[3])) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
-               (V1.vector4_f32[1] < V2.vector4_f32[1]) &&
-               (V1.vector4_f32[2] < V2.vector4_f32[2]) &&
-               (V1.vector4_f32[3] < V2.vector4_f32[3])) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vcgeq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);
-
-    uint32_t CR = 0;
-    if (r == 0xFFFFFFFFU) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!r) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#elif defined(_XM_SSE_INTRINSICS_)
-    uint32_t CR = 0;
-    XMVECTOR vTemp = _mm_cmpge_ps(V1, V2);
-    int iTest = _mm_movemask_ps(vTemp);
-    if (iTest == 0x0f) {
-        CR = XM_CRMASK_CR6TRUE;
-    } else if (!iTest) {
-        CR = XM_CRMASK_CR6FALSE;
-    }
-    return CR;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector4Less(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] < V2.vector4_f32[1]) &&
-             (V1.vector4_f32[2] < V2.vector4_f32[2]) &&
-             (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vcltq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) ==
-            0xFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmplt_ps(V1, V2);
-    return ((_mm_movemask_ps(vTemp) == 0x0f) != 0);
-#else
-    return XMComparisonAllTrue(XMVector4GreaterR(V2, V1));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector4LessOrEqual(FXMVECTOR V1,
-                                             FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) &&
-             (V1.vector4_f32[1] <= V2.vector4_f32[1]) &&
-             (V1.vector4_f32[2] <= V2.vector4_f32[2]) &&
-             (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vResult = vcleq_f32(V1, V2);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)),
-                                vget_high_u8(vreinterpretq_u8_u32(vResult)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) ==
-            0xFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp = _mm_cmple_ps(V1, V2);
-    return ((_mm_movemask_ps(vTemp) == 0x0f) != 0);
-#else
-    return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector4InBounds(FXMVECTOR V,
-                                          FXMVECTOR Bounds) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] &&
-              V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
-             (V.vector4_f32[1] <= Bounds.vector4_f32[1] &&
-              V.vector4_f32[1] >= -Bounds.vector4_f32[1]) &&
-             (V.vector4_f32[2] <= Bounds.vector4_f32[2] &&
-              V.vector4_f32[2] >= -Bounds.vector4_f32[2]) &&
-             (V.vector4_f32[3] <= Bounds.vector4_f32[3] &&
-              V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0);
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Test if less than or equal
-    uint32x4_t ivTemp1 = vcleq_f32(V, Bounds);
-    // Negate the bounds
-    float32x4_t vTemp2 = vnegq_f32(Bounds);
-    // Test if greater or equal (Reversed)
-    uint32x4_t ivTemp2 = vcleq_f32(vTemp2, V);
-    // Blend answers
-    ivTemp1 = vandq_u32(ivTemp1, ivTemp2);
-    // in bounds?
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(ivTemp1)),
-                                vget_high_u8(vreinterpretq_u8_u32(ivTemp1)));
-    uint16x4x2_t vTemp3 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return (vget_lane_u32(vreinterpret_u32_u16(vTemp3.val[1]), 1) ==
-            0xFFFFFFFFU);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Test if less than or equal
-    XMVECTOR vTemp1 = _mm_cmple_ps(V, Bounds);
-    // Negate the bounds
-    XMVECTOR vTemp2 = _mm_mul_ps(Bounds, g_XMNegativeOne);
-    // Test if greater or equal (Reversed)
-    vTemp2 = _mm_cmple_ps(vTemp2, V);
-    // Blend answers
-    vTemp1 = _mm_and_ps(vTemp1, vTemp2);
-    // All in bounds?
-    return ((_mm_movemask_ps(vTemp1) == 0x0f) != 0);
-#else
-    return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds));
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && \
-    !defined(__INTEL_COMPILER)
-#pragma float_control(push)
-#pragma float_control(precise, on)
-#endif
-
-inline bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    return (XMISNAN(V.vector4_f32[0]) || XMISNAN(V.vector4_f32[1]) ||
-            XMISNAN(V.vector4_f32[2]) || XMISNAN(V.vector4_f32[3]));
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-#if defined(__clang__) && defined(__FINITE_MATH_ONLY__)
-    return isnan(vgetq_lane_f32(V, 0)) || isnan(vgetq_lane_f32(V, 1)) ||
-           isnan(vgetq_lane_f32(V, 2)) || isnan(vgetq_lane_f32(V, 3));
-#else
-    // Test against itself. NaN is always not equal
-    uint32x4_t vTempNan = vceqq_f32(V, V);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempNan)),
-                                vget_high_u8(vreinterpretq_u8_u32(vTempNan)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    // If any are NaN, the mask is zero
-    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) !=
-            0xFFFFFFFFU);
-#endif
-#elif defined(_XM_SSE_INTRINSICS_)
-#if defined(__clang__) && defined(__FINITE_MATH_ONLY__)
-    XM_ALIGNED_DATA(16) float tmp[4];
-    _mm_store_ps(tmp, V);
-    return isnan(tmp[0]) || isnan(tmp[1]) || isnan(tmp[2]) || isnan(tmp[3]);
-#else
-    // Test against itself. NaN is always not equal
-    XMVECTOR vTempNan = _mm_cmpneq_ps(V, V);
-    // If any are NaN, the mask is non-zero
-    return (_mm_movemask_ps(vTempNan) != 0);
-#endif
-#endif
-}
-
-#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && \
-    !defined(__INTEL_COMPILER)
-#pragma float_control(pop)
-#endif
-
-//------------------------------------------------------------------------------
-
-inline bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    return (XMISINF(V.vector4_f32[0]) || XMISINF(V.vector4_f32[1]) ||
-            XMISINF(V.vector4_f32[2]) || XMISINF(V.vector4_f32[3]));
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Mask off the sign bit
-    uint32x4_t vTempInf = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
-    // Compare to infinity
-    vTempInf = vceqq_f32(vreinterpretq_f32_u32(vTempInf), g_XMInfinity);
-    // If any are infinity, the signs are true.
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vTempInf)),
-                                vget_high_u8(vreinterpretq_u8_u32(vTempInf)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-    return (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) != 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Mask off the sign bit
-    XMVECTOR vTemp = _mm_and_ps(V, g_XMAbsMask);
-    // Compare to infinity
-    vTemp = _mm_cmpeq_ps(vTemp, g_XMInfinity);
-    // If any are infinity, the signs are true.
-    return (_mm_movemask_ps(vTemp) != 0);
-#endif
-}
-
-//------------------------------------------------------------------------------
-// Computation operations
-//------------------------------------------------------------------------------
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result;
-    Result.f[0] = Result.f[1] = Result.f[2] = Result.f[3] =
-        V1.vector4_f32[0] * V2.vector4_f32[0] +
-        V1.vector4_f32[1] * V2.vector4_f32[1] +
-        V1.vector4_f32[2] * V2.vector4_f32[2] +
-        V1.vector4_f32[3] * V2.vector4_f32[3];
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vTemp = vmulq_f32(V1, V2);
-    float32x2_t v1 = vget_low_f32(vTemp);
-    float32x2_t v2 = vget_high_f32(vTemp);
-    v1 = vadd_f32(v1, v2);
-    v1 = vpadd_f32(v1, v1);
-    return vcombine_f32(v1, v1);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    return _mm_dp_ps(V1, V2, 0xff);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vTemp = _mm_mul_ps(V1, V2);
-    vTemp = _mm_hadd_ps(vTemp, vTemp);
-    return _mm_hadd_ps(vTemp, vTemp);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vTemp2 = V2;
-    XMVECTOR vTemp = _mm_mul_ps(V1, vTemp2);
-    vTemp2 = _mm_shuffle_ps(
-        vTemp2, vTemp,
-        _MM_SHUFFLE(1, 0, 0,
-                    0));  // Copy X to the Z position and Y to the W position
-    vTemp2 = _mm_add_ps(vTemp2, vTemp);  // Add Z = X+Z; W = Y+W;
-    vTemp = _mm_shuffle_ps(
-        vTemp, vTemp2, _MM_SHUFFLE(0, 3, 0, 0));  // Copy W to the Z position
-    vTemp = _mm_add_ps(vTemp, vTemp2);            // Add Z and W together
-    return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(2, 2, 2, 2));  // Splat Z and return
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2,
-                                           FXMVECTOR V3) noexcept {
-    // [
-    // ((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w),
-    //   ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w),
-    //   ((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w),
-    //   ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z)
-    //   ]
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {{{
-        (((V2.vector4_f32[2] * V3.vector4_f32[3]) -
-          (V2.vector4_f32[3] * V3.vector4_f32[2])) *
-         V1.vector4_f32[1]) -
-            (((V2.vector4_f32[1] * V3.vector4_f32[3]) -
-              (V2.vector4_f32[3] * V3.vector4_f32[1])) *
-             V1.vector4_f32[2]) +
-            (((V2.vector4_f32[1] * V3.vector4_f32[2]) -
-              (V2.vector4_f32[2] * V3.vector4_f32[1])) *
-             V1.vector4_f32[3]),
-        (((V2.vector4_f32[3] * V3.vector4_f32[2]) -
-          (V2.vector4_f32[2] * V3.vector4_f32[3])) *
-         V1.vector4_f32[0]) -
-            (((V2.vector4_f32[3] * V3.vector4_f32[0]) -
-              (V2.vector4_f32[0] * V3.vector4_f32[3])) *
-             V1.vector4_f32[2]) +
-            (((V2.vector4_f32[2] * V3.vector4_f32[0]) -
-              (V2.vector4_f32[0] * V3.vector4_f32[2])) *
-             V1.vector4_f32[3]),
-        (((V2.vector4_f32[1] * V3.vector4_f32[3]) -
-          (V2.vector4_f32[3] * V3.vector4_f32[1])) *
-         V1.vector4_f32[0]) -
-            (((V2.vector4_f32[0] * V3.vector4_f32[3]) -
-              (V2.vector4_f32[3] * V3.vector4_f32[0])) *
-             V1.vector4_f32[1]) +
-            (((V2.vector4_f32[0] * V3.vector4_f32[1]) -
-              (V2.vector4_f32[1] * V3.vector4_f32[0])) *
-             V1.vector4_f32[3]),
-        (((V2.vector4_f32[2] * V3.vector4_f32[1]) -
-          (V2.vector4_f32[1] * V3.vector4_f32[2])) *
-         V1.vector4_f32[0]) -
-            (((V2.vector4_f32[2] * V3.vector4_f32[0]) -
-              (V2.vector4_f32[0] * V3.vector4_f32[2])) *
-             V1.vector4_f32[1]) +
-            (((V2.vector4_f32[1] * V3.vector4_f32[0]) -
-              (V2.vector4_f32[0] * V3.vector4_f32[1])) *
-             V1.vector4_f32[2]),
-    }}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    const uint32x2_t select = vget_low_u32(g_XMMaskX);
-
-    // Term1: V2zwyz * V3wzwy
-    const float32x2_t v2xy = vget_low_f32(V2);
-    const float32x2_t v2zw = vget_high_f32(V2);
-    const float32x2_t v2yx = vrev64_f32(v2xy);
-    const float32x2_t v2wz = vrev64_f32(v2zw);
-    const float32x2_t v2yz = vbsl_f32(select, v2yx, v2wz);
-
-    const float32x2_t v3zw = vget_high_f32(V3);
-    const float32x2_t v3wz = vrev64_f32(v3zw);
-    const float32x2_t v3xy = vget_low_f32(V3);
-    const float32x2_t v3wy = vbsl_f32(select, v3wz, v3xy);
-
-    float32x4_t vTemp1 = vcombine_f32(v2zw, v2yz);
-    float32x4_t vTemp2 = vcombine_f32(v3wz, v3wy);
-    XMVECTOR vResult = vmulq_f32(vTemp1, vTemp2);
-
-    // - V2wzwy * V3zwyz
-    const float32x2_t v2wy = vbsl_f32(select, v2wz, v2xy);
-
-    const float32x2_t v3yx = vrev64_f32(v3xy);
-    const float32x2_t v3yz = vbsl_f32(select, v3yx, v3wz);
-
-    vTemp1 = vcombine_f32(v2wz, v2wy);
-    vTemp2 = vcombine_f32(v3zw, v3yz);
-    vResult = vmlsq_f32(vResult, vTemp1, vTemp2);
-
-    // term1 * V1yxxx
-    const float32x2_t v1xy = vget_low_f32(V1);
-    const float32x2_t v1yx = vrev64_f32(v1xy);
-
-    vTemp1 = vcombine_f32(v1yx, vdup_lane_f32(v1yx, 1));
-    vResult = vmulq_f32(vResult, vTemp1);
-
-    // Term2: V2ywxz * V3wxwx
-    const float32x2_t v2yw = vrev64_f32(v2wy);
-    const float32x2_t v2xz = vbsl_f32(select, v2xy, v2wz);
-
-    const float32x2_t v3wx = vbsl_f32(select, v3wz, v3yx);
-
-    vTemp1 = vcombine_f32(v2yw, v2xz);
-    vTemp2 = vcombine_f32(v3wx, v3wx);
-    float32x4_t vTerm = vmulq_f32(vTemp1, vTemp2);
-
-    // - V2wxwx * V3ywxz
-    const float32x2_t v2wx = vbsl_f32(select, v2wz, v2yx);
-
-    const float32x2_t v3yw = vrev64_f32(v3wy);
-    const float32x2_t v3xz = vbsl_f32(select, v3xy, v3wz);
-
-    vTemp1 = vcombine_f32(v2wx, v2wx);
-    vTemp2 = vcombine_f32(v3yw, v3xz);
-    vTerm = vmlsq_f32(vTerm, vTemp1, vTemp2);
-
-    // vResult - term2 * V1zzyy
-    const float32x2_t v1zw = vget_high_f32(V1);
-
-    vTemp1 = vcombine_f32(vdup_lane_f32(v1zw, 0), vdup_lane_f32(v1yx, 0));
-    vResult = vmlsq_f32(vResult, vTerm, vTemp1);
-
-    // Term3: V2yzxy * V3zxyx
-    const float32x2_t v3zx = vrev64_f32(v3xz);
-
-    vTemp1 = vcombine_f32(v2yz, v2xy);
-    vTemp2 = vcombine_f32(v3zx, v3yx);
-    vTerm = vmulq_f32(vTemp1, vTemp2);
-
-    // - V2zxyx * V3yzxy
-    const float32x2_t v2zx = vrev64_f32(v2xz);
-
-    vTemp1 = vcombine_f32(v2zx, v2yx);
-    vTemp2 = vcombine_f32(v3yz, v3xy);
-    vTerm = vmlsq_f32(vTerm, vTemp1, vTemp2);
-
-    // vResult + term3 * V1wwwz
-    const float32x2_t v1wz = vrev64_f32(v1zw);
-
-    vTemp1 = vcombine_f32(vdup_lane_f32(v1wz, 0), v1wz);
-    return vmlaq_f32(vResult, vTerm, vTemp1);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // V2zwyz * V3wzwy
-    XMVECTOR vResult = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 1, 3, 2));
-    XMVECTOR vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 3, 2, 3));
-    vResult = _mm_mul_ps(vResult, vTemp3);
-    // - V2wzwy * V3zwyz
-    XMVECTOR vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 3, 2, 3));
-    vTemp3 = XM_PERMUTE_PS(vTemp3, _MM_SHUFFLE(1, 3, 0, 1));
-    vResult = XM_FNMADD_PS(vTemp2, vTemp3, vResult);
-    // term1 * V1yxxx
-    XMVECTOR vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 1));
-    vResult = _mm_mul_ps(vResult, vTemp1);
-
-    // V2ywxz * V3wxwx
-    vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 0, 3, 1));
-    vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 3, 0, 3));
-    vTemp3 = _mm_mul_ps(vTemp3, vTemp2);
-    // - V2wxwx * V3ywxz
-    vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(2, 1, 2, 1));
-    vTemp1 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 0, 3, 1));
-    vTemp3 = XM_FNMADD_PS(vTemp2, vTemp1, vTemp3);
-    // vResult - temp * V1zzyy
-    vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 2, 2));
-    vResult = XM_FNMADD_PS(vTemp1, vTemp3, vResult);
-
-    // V2yzxy * V3zxyx
-    vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 0, 2, 1));
-    vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 1, 0, 2));
-    vTemp3 = _mm_mul_ps(vTemp3, vTemp2);
-    // - V2zxyx * V3yzxy
-    vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(2, 0, 2, 1));
-    vTemp1 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 0, 2, 1));
-    vTemp3 = XM_FNMADD_PS(vTemp1, vTemp2, vTemp3);
-    // vResult + term * V1wwwz
-    vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 3, 3, 3));
-    vResult = XM_FMADD_PS(vTemp3, vTemp1, vResult);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V) noexcept {
-    return XMVector4Dot(V, V);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-
-    Result = XMVector4LengthSq(V);
-    Result = XMVectorReciprocalSqrtEst(Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Dot4
-    float32x4_t vTemp = vmulq_f32(V, V);
-    float32x2_t v1 = vget_low_f32(vTemp);
-    float32x2_t v2 = vget_high_f32(vTemp);
-    v1 = vadd_f32(v1, v2);
-    v1 = vpadd_f32(v1, v1);
-    // Reciprocal sqrt (estimate)
-    v2 = vrsqrte_f32(v1);
-    return vcombine_f32(v2, v2);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff);
-    return _mm_rsqrt_ps(vTemp);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_rsqrt_ps(vLengthSq);
-    return vLengthSq;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x,y,z and w
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    // vTemp has z and w
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2));
-    // x+z, y+w
-    vLengthSq = _mm_add_ps(vLengthSq, vTemp);
-    // x+z,x+z,x+z,y+w
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0));
-    // ??,??,y+w,y+w
-    vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0));
-    // ??,??,x+z+y+w,??
-    vLengthSq = _mm_add_ps(vLengthSq, vTemp);
-    // Splat the length
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2));
-    // Get the reciprocal
-    vLengthSq = _mm_rsqrt_ps(vLengthSq);
-    return vLengthSq;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-
-    Result = XMVector4LengthSq(V);
-    Result = XMVectorReciprocalSqrt(Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Dot4
-    float32x4_t vTemp = vmulq_f32(V, V);
-    float32x2_t v1 = vget_low_f32(vTemp);
-    float32x2_t v2 = vget_high_f32(vTemp);
-    v1 = vadd_f32(v1, v2);
-    v1 = vpadd_f32(v1, v1);
-    // Reciprocal sqrt
-    float32x2_t S0 = vrsqrte_f32(v1);
-    float32x2_t P0 = vmul_f32(v1, S0);
-    float32x2_t R0 = vrsqrts_f32(P0, S0);
-    float32x2_t S1 = vmul_f32(S0, R0);
-    float32x2_t P1 = vmul_f32(v1, S1);
-    float32x2_t R1 = vrsqrts_f32(P1, S1);
-    float32x2_t Result = vmul_f32(S1, R1);
-    return vcombine_f32(Result, Result);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff);
-    XMVECTOR vLengthSq = _mm_sqrt_ps(vTemp);
-    return _mm_div_ps(g_XMOne, vLengthSq);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_sqrt_ps(vLengthSq);
-    vLengthSq = _mm_div_ps(g_XMOne, vLengthSq);
-    return vLengthSq;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x,y,z and w
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    // vTemp has z and w
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2));
-    // x+z, y+w
-    vLengthSq = _mm_add_ps(vLengthSq, vTemp);
-    // x+z,x+z,x+z,y+w
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0));
-    // ??,??,y+w,y+w
-    vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0));
-    // ??,??,x+z+y+w,??
-    vLengthSq = _mm_add_ps(vLengthSq, vTemp);
-    // Splat the length
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2));
-    // Get the reciprocal
-    vLengthSq = _mm_sqrt_ps(vLengthSq);
-    // Accurate!
-    vLengthSq = _mm_div_ps(g_XMOne, vLengthSq);
-    return vLengthSq;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-
-    Result = XMVector4LengthSq(V);
-    Result = XMVectorSqrtEst(Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Dot4
-    float32x4_t vTemp = vmulq_f32(V, V);
-    float32x2_t v1 = vget_low_f32(vTemp);
-    float32x2_t v2 = vget_high_f32(vTemp);
-    v1 = vadd_f32(v1, v2);
-    v1 = vpadd_f32(v1, v1);
-    const float32x2_t zero = vdup_n_f32(0);
-    uint32x2_t VEqualsZero = vceq_f32(v1, zero);
-    // Sqrt (estimate)
-    float32x2_t Result = vrsqrte_f32(v1);
-    Result = vmul_f32(v1, Result);
-    Result = vbsl_f32(VEqualsZero, zero, Result);
-    return vcombine_f32(Result, Result);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff);
-    return _mm_sqrt_ps(vTemp);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_sqrt_ps(vLengthSq);
-    return vLengthSq;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x,y,z and w
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    // vTemp has z and w
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2));
-    // x+z, y+w
-    vLengthSq = _mm_add_ps(vLengthSq, vTemp);
-    // x+z,x+z,x+z,y+w
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0));
-    // ??,??,y+w,y+w
-    vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0));
-    // ??,??,x+z+y+w,??
-    vLengthSq = _mm_add_ps(vLengthSq, vTemp);
-    // Splat the length
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2));
-    // Get the length
-    vLengthSq = _mm_sqrt_ps(vLengthSq);
-    return vLengthSq;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-
-    Result = XMVector4LengthSq(V);
-    Result = XMVectorSqrt(Result);
-
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Dot4
-    float32x4_t vTemp = vmulq_f32(V, V);
-    float32x2_t v1 = vget_low_f32(vTemp);
-    float32x2_t v2 = vget_high_f32(vTemp);
-    v1 = vadd_f32(v1, v2);
-    v1 = vpadd_f32(v1, v1);
-    const float32x2_t zero = vdup_n_f32(0);
-    uint32x2_t VEqualsZero = vceq_f32(v1, zero);
-    // Sqrt
-    float32x2_t S0 = vrsqrte_f32(v1);
-    float32x2_t P0 = vmul_f32(v1, S0);
-    float32x2_t R0 = vrsqrts_f32(P0, S0);
-    float32x2_t S1 = vmul_f32(S0, R0);
-    float32x2_t P1 = vmul_f32(v1, S1);
-    float32x2_t R1 = vrsqrts_f32(P1, S1);
-    float32x2_t Result = vmul_f32(S1, R1);
-    Result = vmul_f32(v1, Result);
-    Result = vbsl_f32(VEqualsZero, zero, Result);
-    return vcombine_f32(Result, Result);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff);
-    return _mm_sqrt_ps(vTemp);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_sqrt_ps(vLengthSq);
-    return vLengthSq;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x,y,z and w
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    // vTemp has z and w
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2));
-    // x+z, y+w
-    vLengthSq = _mm_add_ps(vLengthSq, vTemp);
-    // x+z,x+z,x+z,y+w
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0));
-    // ??,??,y+w,y+w
-    vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0));
-    // ??,??,x+z+y+w,??
-    vLengthSq = _mm_add_ps(vLengthSq, vTemp);
-    // Splat the length
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2));
-    // Get the length
-    vLengthSq = _mm_sqrt_ps(vLengthSq);
-    return vLengthSq;
-#endif
-}
-
-//------------------------------------------------------------------------------
-// XMVector4NormalizeEst uses a reciprocal estimate and
-// returns QNaN on zero and infinite vectors.
-
-inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR Result;
-    Result = XMVector4ReciprocalLength(V);
-    Result = XMVectorMultiply(V, Result);
-    return Result;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Dot4
-    float32x4_t vTemp = vmulq_f32(V, V);
-    float32x2_t v1 = vget_low_f32(vTemp);
-    float32x2_t v2 = vget_high_f32(vTemp);
-    v1 = vadd_f32(v1, v2);
-    v1 = vpadd_f32(v1, v1);
-    // Reciprocal sqrt (estimate)
-    v2 = vrsqrte_f32(v1);
-    // Normalize
-    return vmulq_f32(V, vcombine_f32(v2, v2));
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp = _mm_dp_ps(V, V, 0xff);
-    XMVECTOR vResult = _mm_rsqrt_ps(vTemp);
-    return _mm_mul_ps(vResult, V);
-#elif defined(_XM_SSE3_INTRINSICS_)
-    XMVECTOR vDot = _mm_mul_ps(V, V);
-    vDot = _mm_hadd_ps(vDot, vDot);
-    vDot = _mm_hadd_ps(vDot, vDot);
-    vDot = _mm_rsqrt_ps(vDot);
-    vDot = _mm_mul_ps(vDot, V);
-    return vDot;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x,y,z and w
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    // vTemp has z and w
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2));
-    // x+z, y+w
-    vLengthSq = _mm_add_ps(vLengthSq, vTemp);
-    // x+z,x+z,x+z,y+w
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0));
-    // ??,??,y+w,y+w
-    vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0));
-    // ??,??,x+z+y+w,??
-    vLengthSq = _mm_add_ps(vLengthSq, vTemp);
-    // Splat the length
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2));
-    // Get the reciprocal
-    XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq);
-    // Reciprocal mul to perform the normalization
-    vResult = _mm_mul_ps(vResult, V);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-    float fLength;
-    XMVECTOR vResult;
-
-    vResult = XMVector4Length(V);
-    fLength = vResult.vector4_f32[0];
-
-    // Prevent divide by zero
-    if (fLength > 0) {
-        fLength = 1.0f / fLength;
-    }
-
-    vResult.vector4_f32[0] = V.vector4_f32[0] * fLength;
-    vResult.vector4_f32[1] = V.vector4_f32[1] * fLength;
-    vResult.vector4_f32[2] = V.vector4_f32[2] * fLength;
-    vResult.vector4_f32[3] = V.vector4_f32[3] * fLength;
-    return vResult;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    // Dot4
-    float32x4_t vTemp = vmulq_f32(V, V);
-    float32x2_t v1 = vget_low_f32(vTemp);
-    float32x2_t v2 = vget_high_f32(vTemp);
-    v1 = vadd_f32(v1, v2);
-    v1 = vpadd_f32(v1, v1);
-    uint32x2_t VEqualsZero = vceq_f32(v1, vdup_n_f32(0));
-    uint32x2_t VEqualsInf = vceq_f32(v1, vget_low_f32(g_XMInfinity));
-    // Reciprocal sqrt (2 iterations of Newton-Raphson)
-    float32x2_t S0 = vrsqrte_f32(v1);
-    float32x2_t P0 = vmul_f32(v1, S0);
-    float32x2_t R0 = vrsqrts_f32(P0, S0);
-    float32x2_t S1 = vmul_f32(S0, R0);
-    float32x2_t P1 = vmul_f32(v1, S1);
-    float32x2_t R1 = vrsqrts_f32(P1, S1);
-    v2 = vmul_f32(S1, R1);
-    // Normalize
-    XMVECTOR vResult = vmulq_f32(V, vcombine_f32(v2, v2));
-    vResult = vbslq_f32(vcombine_u32(VEqualsZero, VEqualsZero), vdupq_n_f32(0),
-                        vResult);
-    return vbslq_f32(vcombine_u32(VEqualsInf, VEqualsInf), g_XMQNaN, vResult);
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0xff);
-    // Prepare for the division
-    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
-    // Create zero with a single instruction
-    XMVECTOR vZeroMask = _mm_setzero_ps();
-    // Test for a divide by zero (Must be FP to detect -0.0)
-    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
-    // Failsafe on zero (Or epsilon) length planes
-    // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
-    // Divide to perform the normalization
-    vResult = _mm_div_ps(V, vResult);
-    // Any that are infinity, set to zero
-    vResult = _mm_and_ps(vResult, vZeroMask);
-    // Select qnan or result based on infinite length
-    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
-    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
-    vResult = _mm_or_ps(vTemp1, vTemp2);
-    return vResult;
-#elif defined(_XM_SSE3_INTRINSICS_)
-    // Perform the dot product on x,y,z and w
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
-    // Prepare for the division
-    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
-    // Create zero with a single instruction
-    XMVECTOR vZeroMask = _mm_setzero_ps();
-    // Test for a divide by zero (Must be FP to detect -0.0)
-    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
-    // Failsafe on zero (Or epsilon) length planes
-    // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
-    // Divide to perform the normalization
-    vResult = _mm_div_ps(V, vResult);
-    // Any that are infinity, set to zero
-    vResult = _mm_and_ps(vResult, vZeroMask);
-    // Select qnan or result based on infinite length
-    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
-    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
-    vResult = _mm_or_ps(vTemp1, vTemp2);
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Perform the dot product on x,y,z and w
-    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
-    // vTemp has z and w
-    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2));
-    // x+z, y+w
-    vLengthSq = _mm_add_ps(vLengthSq, vTemp);
-    // x+z,x+z,x+z,y+w
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0));
-    // ??,??,y+w,y+w
-    vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0));
-    // ??,??,x+z+y+w,??
-    vLengthSq = _mm_add_ps(vLengthSq, vTemp);
-    // Splat the length
-    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2));
-    // Prepare for the division
-    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
-    // Create zero with a single instruction
-    XMVECTOR vZeroMask = _mm_setzero_ps();
-    // Test for a divide by zero (Must be FP to detect -0.0)
-    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
-    // Failsafe on zero (Or epsilon) length planes
-    // If the length is infinity, set the elements to zero
-    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
-    // Divide to perform the normalization
-    vResult = _mm_div_ps(V, vResult);
-    // Any that are infinity, set to zero
-    vResult = _mm_and_ps(vResult, vZeroMask);
-    // Select qnan or result based on infinite length
-    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
-    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
-    vResult = _mm_or_ps(vTemp1, vTemp2);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4ClampLength(FXMVECTOR V, float LengthMin,
-                                                 float LengthMax) noexcept {
-    XMVECTOR ClampMax = XMVectorReplicate(LengthMax);
-    XMVECTOR ClampMin = XMVectorReplicate(LengthMin);
-
-    return XMVector4ClampLengthV(V, ClampMin, ClampMax);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4ClampLengthV(
-    FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept {
-    assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) &&
-           (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) &&
-           (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin)));
-    assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) &&
-           (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) &&
-           (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax)));
-    assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero()));
-    assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero()));
-    assert(XMVector4GreaterOrEqual(LengthMax, LengthMin));
-
-    XMVECTOR LengthSq = XMVector4LengthSq(V);
-
-    const XMVECTOR Zero = XMVectorZero();
-
-    XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq);
-
-    XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
-    XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero);
-
-    XMVECTOR Normal = XMVectorMultiply(V, RcpLength);
-
-    XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength);
-
-    XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
-    Length = XMVectorSelect(LengthSq, Length, Select);
-    Normal = XMVectorSelect(LengthSq, Normal, Select);
-
-    XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax);
-    XMVECTOR ControlMin = XMVectorLess(Length, LengthMin);
-
-    XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
-    ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
-
-    XMVECTOR Result = XMVectorMultiply(Normal, ClampLength);
-
-    // Preserve the original vector (with no precision loss) if the length falls
-    // within the given range
-    XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin);
-    Result = XMVectorSelect(Result, V, Control);
-
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4Reflect(FXMVECTOR Incident,
-                                             FXMVECTOR Normal) noexcept {
-    // Result = Incident - (2 * dot(Incident, Normal)) * Normal
-
-    XMVECTOR Result = XMVector4Dot(Incident, Normal);
-    Result = XMVectorAdd(Result, Result);
-    Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident);
-
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4Refract(FXMVECTOR Incident,
-                                             FXMVECTOR Normal,
-                                             float RefractionIndex) noexcept {
-    XMVECTOR Index = XMVectorReplicate(RefractionIndex);
-    return XMVector4RefractV(Incident, Normal, Index);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4RefractV(
-    FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR IDotN;
-    XMVECTOR R;
-    const XMVECTOR Zero = XMVectorZero();
-
-    // Result = RefractionIndex * Incident - Normal * (RefractionIndex *
-    // dot(Incident, Normal) + sqrt(1 - RefractionIndex * RefractionIndex * (1 -
-    // dot(Incident, Normal) * dot(Incident, Normal))))
-
-    IDotN = XMVector4Dot(Incident, Normal);
-
-    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
-    R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v);
-    R = XMVectorMultiply(R, RefractionIndex);
-    R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v);
-
-    if (XMVector4LessOrEqual(R, Zero)) {
-        // Total internal reflection
-        return Zero;
-    } else {
-        XMVECTOR Result;
-
-        // R = RefractionIndex * IDotN + sqrt(R)
-        R = XMVectorSqrt(R);
-        R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R);
-
-        // Result = RefractionIndex * Incident - Normal * R
-        Result = XMVectorMultiply(RefractionIndex, Incident);
-        Result = XMVectorNegativeMultiplySubtract(Normal, R, Result);
-
-        return Result;
-    }
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    XMVECTOR IDotN = XMVector4Dot(Incident, Normal);
-
-    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
-    float32x4_t R = vmlsq_f32(g_XMOne, IDotN, IDotN);
-    R = vmulq_f32(R, RefractionIndex);
-    R = vmlsq_f32(g_XMOne, R, RefractionIndex);
-
-    uint32x4_t isrzero = vcleq_f32(R, g_XMZero);
-    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(isrzero)),
-                                vget_high_u8(vreinterpretq_u8_u32(isrzero)));
-    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]),
-                                   vreinterpret_u16_u8(vTemp.val[1]));
-
-    float32x4_t vResult;
-    if (vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1) == 0xFFFFFFFFU) {
-        // Total internal reflection
-        vResult = g_XMZero;
-    } else {
-        // Sqrt(R)
-        float32x4_t S0 = vrsqrteq_f32(R);
-        float32x4_t P0 = vmulq_f32(R, S0);
-        float32x4_t R0 = vrsqrtsq_f32(P0, S0);
-        float32x4_t S1 = vmulq_f32(S0, R0);
-        float32x4_t P1 = vmulq_f32(R, S1);
-        float32x4_t R1 = vrsqrtsq_f32(P1, S1);
-        float32x4_t S2 = vmulq_f32(S1, R1);
-        R = vmulq_f32(R, S2);
-        // R = RefractionIndex * IDotN + sqrt(R)
-        R = vmlaq_f32(R, RefractionIndex, IDotN);
-        // Result = RefractionIndex * Incident - Normal * R
-        vResult = vmulq_f32(RefractionIndex, Incident);
-        vResult = vmlsq_f32(vResult, R, Normal);
-    }
-    return vResult;
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR IDotN = XMVector4Dot(Incident, Normal);
-
-    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
-    XMVECTOR R = XM_FNMADD_PS(IDotN, IDotN, g_XMOne);
-    XMVECTOR R2 = _mm_mul_ps(RefractionIndex, RefractionIndex);
-    R = XM_FNMADD_PS(R, R2, g_XMOne);
-
-    XMVECTOR vResult = _mm_cmple_ps(R, g_XMZero);
-    if (_mm_movemask_ps(vResult) == 0x0f) {
-        // Total internal reflection
-        vResult = g_XMZero;
-    } else {
-        // R = RefractionIndex * IDotN + sqrt(R)
-        R = _mm_sqrt_ps(R);
-        R = XM_FMADD_PS(RefractionIndex, IDotN, R);
-        // Result = RefractionIndex * Incident - Normal * R
-        vResult = _mm_mul_ps(RefractionIndex, Incident);
-        vResult = XM_FNMADD_PS(R, Normal, vResult);
-    }
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTORF32 Result = {{{V.vector4_f32[2], V.vector4_f32[3],
-                            -V.vector4_f32[0], -V.vector4_f32[1]}}};
-    return Result.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 Negate = {{{1.f, 1.f, -1.f, -1.f}}};
-
-    float32x4_t Result = vcombine_f32(vget_high_f32(V), vget_low_f32(V));
-    return vmulq_f32(Result, Negate);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 FlipZW = {{{1.0f, 1.0f, -1.0f, -1.0f}}};
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 0, 3, 2));
-    vResult = _mm_mul_ps(vResult, FlipZW);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMVector4AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept {
-    XMVECTOR Result = XMVector4Dot(N1, N2);
-    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
-    Result = XMVectorACosEst(Result);
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMVector4AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept {
-    XMVECTOR Result = XMVector4Dot(N1, N2);
-    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
-    Result = XMVectorACos(Result);
-    return Result;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV
-XMVector4AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-    XMVECTOR L1 = XMVector4ReciprocalLength(V1);
-    XMVECTOR L2 = XMVector4ReciprocalLength(V2);
-
-    XMVECTOR Dot = XMVector4Dot(V1, V2);
-
-    L1 = XMVectorMultiply(L1, L2);
-
-    XMVECTOR CosAngle = XMVectorMultiply(Dot, L1);
-    CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v);
-
-    return XMVectorACos(CosAngle);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV XMVector4Transform(FXMVECTOR V,
-                                               FXMMATRIX M) noexcept {
-#if defined(_XM_NO_INTRINSICS_)
-
-    float fX = (M.m[0][0] * V.vector4_f32[0]) + (M.m[1][0] * V.vector4_f32[1]) +
-               (M.m[2][0] * V.vector4_f32[2]) + (M.m[3][0] * V.vector4_f32[3]);
-    float fY = (M.m[0][1] * V.vector4_f32[0]) + (M.m[1][1] * V.vector4_f32[1]) +
-               (M.m[2][1] * V.vector4_f32[2]) + (M.m[3][1] * V.vector4_f32[3]);
-    float fZ = (M.m[0][2] * V.vector4_f32[0]) + (M.m[1][2] * V.vector4_f32[1]) +
-               (M.m[2][2] * V.vector4_f32[2]) + (M.m[3][2] * V.vector4_f32[3]);
-    float fW = (M.m[0][3] * V.vector4_f32[0]) + (M.m[1][3] * V.vector4_f32[1]) +
-               (M.m[2][3] * V.vector4_f32[2]) + (M.m[3][3] * V.vector4_f32[3]);
-    XMVECTORF32 vResult = {{{fX, fY, fZ, fW}}};
-    return vResult.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x2_t VL = vget_low_f32(V);
-    XMVECTOR vResult = vmulq_lane_f32(M.r[0], VL, 0);  // X
-    vResult = vmlaq_lane_f32(vResult, M.r[1], VL, 1);  // Y
-    float32x2_t VH = vget_high_f32(V);
-    vResult = vmlaq_lane_f32(vResult, M.r[2], VH, 0);  // Z
-    return vmlaq_lane_f32(vResult, M.r[3], VH, 1);     // W
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));  // W
-    vResult = _mm_mul_ps(vResult, M.r[3]);
-    XMVECTOR vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));  // Z
-    vResult = XM_FMADD_PS(vTemp, M.r[2], vResult);
-    vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));  // Y
-    vResult = XM_FMADD_PS(vTemp, M.r[1], vResult);
-    vTemp = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));  // X
-    vResult = XM_FMADD_PS(vTemp, M.r[0], vResult);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMFLOAT4* XM_CALLCONV XMVector4TransformStream(
-    XMFLOAT4* pOutputStream, size_t OutputStride, const XMFLOAT4* pInputStream,
-    size_t InputStride, size_t VectorCount, FXMMATRIX M) noexcept {
-    assert(pOutputStream != nullptr);
-    assert(pInputStream != nullptr);
-
-    assert(InputStride >= sizeof(XMFLOAT4));
-    _Analysis_assume_(InputStride >= sizeof(XMFLOAT4));
-
-    assert(OutputStride >= sizeof(XMFLOAT4));
-    _Analysis_assume_(OutputStride >= sizeof(XMFLOAT4));
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    for (size_t i = 0; i < VectorCount; i++) {
-        XMVECTOR V =
-            XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pInputVector));
-        XMVECTOR W = XMVectorSplatW(V);
-        XMVECTOR Z = XMVectorSplatZ(V);
-        XMVECTOR Y = XMVectorSplatY(V);
-        XMVECTOR X = XMVectorSplatX(V);
-
-        XMVECTOR Result = XMVectorMultiply(W, row3);
-        Result = XMVectorMultiplyAdd(Z, row2, Result);
-        Result = XMVectorMultiplyAdd(Y, row1, Result);
-        Result = XMVectorMultiplyAdd(X, row0, Result);
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 26015, "PREfast noise: Esp:1307")
-#endif
-
-        XMStoreFloat4(reinterpret_cast<XMFLOAT4*>(pOutputVector), Result);
-
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-
-        pInputVector += InputStride;
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    size_t i = 0;
-    size_t four = VectorCount >> 2;
-    if (four > 0) {
-        if ((InputStride == sizeof(XMFLOAT4)) &&
-            (OutputStride == sizeof(XMFLOAT4))) {
-            for (size_t j = 0; j < four; ++j) {
-                float32x4x4_t V =
-                    vld4q_f32(reinterpret_cast<const float*>(pInputVector));
-                pInputVector += sizeof(XMFLOAT4) * 4;
-
-                float32x2_t r = vget_low_f32(row0);
-                XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0);  // Ax
-                XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1);  // Bx
-
-                XM_PREFETCH(pInputVector);
-
-                r = vget_high_f32(row0);
-                XMVECTOR vResult2 = vmulq_lane_f32(V.val[0], r, 0);  // Cx
-                XMVECTOR vResult3 = vmulq_lane_f32(V.val[0], r, 1);  // Dx
-
-                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);
-
-                r = vget_low_f32(row1);
-                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0);  // Ax+Ey
-                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1);  // Bx+Fy
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
-
-                r = vget_high_f32(row1);
-                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0);  // Cx+Gy
-                vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1);  // Dx+Hy
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));
-
-                r = vget_low_f32(row2);
-                vResult0 =
-                    vmlaq_lane_f32(vResult0, V.val[2], r, 0);  // Ax+Ey+Iz
-                vResult1 =
-                    vmlaq_lane_f32(vResult1, V.val[2], r, 1);  // Bx+Fy+Jz
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4));
-
-                r = vget_high_f32(row2);
-                vResult2 =
-                    vmlaq_lane_f32(vResult2, V.val[2], r, 0);  // Cx+Gy+Kz
-                vResult3 =
-                    vmlaq_lane_f32(vResult3, V.val[2], r, 1);  // Dx+Hy+Lz
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5));
-
-                r = vget_low_f32(row3);
-                vResult0 =
-                    vmlaq_lane_f32(vResult0, V.val[3], r, 0);  // Ax+Ey+Iz+Mw
-                vResult1 =
-                    vmlaq_lane_f32(vResult1, V.val[3], r, 1);  // Bx+Fy+Jz+Nw
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 6));
-
-                r = vget_high_f32(row3);
-                vResult2 =
-                    vmlaq_lane_f32(vResult2, V.val[3], r, 0);  // Cx+Gy+Kz+Ow
-                vResult3 =
-                    vmlaq_lane_f32(vResult3, V.val[3], r, 1);  // Dx+Hy+Lz+Pw
-
-                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 7));
-
-                V.val[0] = vResult0;
-                V.val[1] = vResult1;
-                V.val[2] = vResult2;
-                V.val[3] = vResult3;
-
-                vst4q_f32(reinterpret_cast<float*>(pOutputVector), V);
-                pOutputVector += sizeof(XMFLOAT4) * 4;
-
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < VectorCount; i++) {
-        XMVECTOR V = vld1q_f32(reinterpret_cast<const float*>(pInputVector));
-        pInputVector += InputStride;
-
-        float32x2_t VL = vget_low_f32(V);
-        XMVECTOR vResult = vmulq_lane_f32(row0, VL, 0);  // X
-        vResult = vmlaq_lane_f32(vResult, row1, VL, 1);  // Y
-        float32x2_t VH = vget_high_f32(V);
-        vResult = vmlaq_lane_f32(vResult, row2, VH, 0);  // Z
-        vResult = vmlaq_lane_f32(vResult, row3, VH, 1);  // W
-
-        vst1q_f32(reinterpret_cast<float*>(pOutputVector), vResult);
-        pOutputVector += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_AVX2_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t two = VectorCount >> 1;
-    if (two > 0) {
-        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
-        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);
-        __m256 row2 = _mm256_broadcast_ps(&M.r[2]);
-        __m256 row3 = _mm256_broadcast_ps(&M.r[3]);
-
-        if (InputStride == sizeof(XMFLOAT4)) {
-            if (OutputStride == sizeof(XMFLOAT4)) {
-                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F)) {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < two; ++j) {
-                        __m256 VV = _mm256_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        pInputVector += sizeof(XMFLOAT4) * 2;
-
-                        __m256 vTempX =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
-                        __m256 vTempY =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
-                        __m256 vTempZ =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
-                        __m256 vTempW =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
-
-                        vTempX = _mm256_mul_ps(vTempX, row0);
-                        vTempY = _mm256_mul_ps(vTempY, row1);
-                        vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
-                        vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
-                        vTempX = _mm256_add_ps(vTempZ, vTempW);
-
-                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector),
-                                        vTempX);
-                        pOutputVector += sizeof(XMFLOAT4) * 2;
-
-                        i += 2;
-                    }
-                } else {
-                    // Packed input, packed output
-                    for (size_t j = 0; j < two; ++j) {
-                        __m256 VV = _mm256_loadu_ps(
-                            reinterpret_cast<const float*>(pInputVector));
-                        pInputVector += sizeof(XMFLOAT4) * 2;
-
-                        __m256 vTempX =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
-                        __m256 vTempY =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
-                        __m256 vTempZ =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
-                        __m256 vTempW =
-                            _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
-
-                        vTempX = _mm256_mul_ps(vTempX, row0);
-                        vTempY = _mm256_mul_ps(vTempY, row1);
-                        vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
-                        vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
-                        vTempX = _mm256_add_ps(vTempZ, vTempW);
-
-                        _mm256_storeu_ps(
-                            reinterpret_cast<float*>(pOutputVector), vTempX);
-                        pOutputVector += sizeof(XMFLOAT4) * 2;
-
-                        i += 2;
-                    }
-                }
-            } else {
-                // Packed input, unpacked output
-                for (size_t j = 0; j < two; ++j) {
-                    __m256 VV = _mm256_loadu_ps(
-                        reinterpret_cast<const float*>(pInputVector));
-                    pInputVector += sizeof(XMFLOAT4) * 2;
-
-                    __m256 vTempX =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
-                    __m256 vTempY =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
-                    __m256 vTempZ =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
-                    __m256 vTempW =
-                        _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
-
-                    vTempX = _mm256_mul_ps(vTempX, row0);
-                    vTempY = _mm256_mul_ps(vTempY, row1);
-                    vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
-                    vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
-                    vTempX = _mm256_add_ps(vTempZ, vTempW);
-
-                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                  _mm256_castps256_ps128(vTempX));
-                    pOutputVector += OutputStride;
-
-                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector),
-                                  _mm256_extractf128_ps(vTempX, 1));
-                    pOutputVector += OutputStride;
-                    i += 2;
-                }
-            }
-        }
-    }
-
-    if (i < VectorCount) {
-        const XMVECTOR row0 = M.r[0];
-        const XMVECTOR row1 = M.r[1];
-        const XMVECTOR row2 = M.r[2];
-        const XMVECTOR row3 = M.r[3];
-
-        for (; i < VectorCount; i++) {
-            __m128 V =
-                _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
-            pInputVector += InputStride;
-
-            XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-            XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-            XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-            XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-
-            vTempX = _mm_mul_ps(vTempX, row0);
-            vTempY = _mm_mul_ps(vTempY, row1);
-            vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
-            vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
-            vTempX = _mm_add_ps(vTempZ, vTempW);
-
-            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
-            pOutputVector += OutputStride;
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#elif defined(_XM_SSE_INTRINSICS_)
-    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    const XMVECTOR row0 = M.r[0];
-    const XMVECTOR row1 = M.r[1];
-    const XMVECTOR row2 = M.r[2];
-    const XMVECTOR row3 = M.r[3];
-
-    if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) &&
-        !(OutputStride & 0xF)) {
-        if (!(reinterpret_cast<uintptr_t>(pInputStream) & 0xF) &&
-            !(InputStride & 0xF)) {
-            // Aligned input, aligned output
-            for (size_t i = 0; i < VectorCount; i++) {
-                __m128 V =
-                    _mm_load_ps(reinterpret_cast<const float*>(pInputVector));
-                pInputVector += InputStride;
-
-                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-
-                vTempX = _mm_mul_ps(vTempX, row0);
-                vTempY = _mm_mul_ps(vTempY, row1);
-                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
-                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
-                vTempX = _mm_add_ps(vTempZ, vTempW);
-
-                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
-                pOutputVector += OutputStride;
-            }
-        } else {
-            // Unaligned input, aligned output
-            for (size_t i = 0; i < VectorCount; i++) {
-                __m128 V =
-                    _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
-                pInputVector += InputStride;
-
-                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-
-                vTempX = _mm_mul_ps(vTempX, row0);
-                vTempY = _mm_mul_ps(vTempY, row1);
-                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
-                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
-                vTempX = _mm_add_ps(vTempZ, vTempW);
-
-                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
-                pOutputVector += OutputStride;
-            }
-        }
-    } else {
-        if (!(reinterpret_cast<uintptr_t>(pInputStream) & 0xF) &&
-            !(InputStride & 0xF)) {
-            // Aligned input, unaligned output
-            for (size_t i = 0; i < VectorCount; i++) {
-                __m128 V =
-                    _mm_load_ps(reinterpret_cast<const float*>(pInputVector));
-                pInputVector += InputStride;
-
-                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-
-                vTempX = _mm_mul_ps(vTempX, row0);
-                vTempY = _mm_mul_ps(vTempY, row1);
-                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
-                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
-                vTempX = _mm_add_ps(vTempZ, vTempW);
-
-                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
-                pOutputVector += OutputStride;
-            }
-        } else {
-            // Unaligned input, unaligned output
-            for (size_t i = 0; i < VectorCount; i++) {
-                __m128 V =
-                    _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
-                pInputVector += InputStride;
-
-                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
-                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
-                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
-                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
-
-                vTempX = _mm_mul_ps(vTempX, row0);
-                vTempY = _mm_mul_ps(vTempY, row1);
-                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
-                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
-                vTempX = _mm_add_ps(vTempZ, vTempW);
-
-                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
-                pOutputVector += OutputStride;
-            }
-        }
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#endif
-}
-
-/****************************************************************************
- *
- * XMVECTOR operators
- *
- ****************************************************************************/
-
-#ifndef _XM_NO_XMVECTOR_OVERLOADS_
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV operator+(FXMVECTOR V) noexcept { return V; }
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV operator-(FXMVECTOR V) noexcept {
-    return XMVectorNegate(V);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR& XM_CALLCONV operator+=(XMVECTOR& V1, FXMVECTOR V2) noexcept {
-    V1 = XMVectorAdd(V1, V2);
-    return V1;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR& XM_CALLCONV operator-=(XMVECTOR& V1, FXMVECTOR V2) noexcept {
-    V1 = XMVectorSubtract(V1, V2);
-    return V1;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR& XM_CALLCONV operator*=(XMVECTOR& V1, FXMVECTOR V2) noexcept {
-    V1 = XMVectorMultiply(V1, V2);
-    return V1;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR& XM_CALLCONV operator/=(XMVECTOR& V1, FXMVECTOR V2) noexcept {
-    V1 = XMVectorDivide(V1, V2);
-    return V1;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR& operator*=(XMVECTOR& V, const float S) noexcept {
-    V = XMVectorScale(V, S);
-    return V;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR& operator/=(XMVECTOR& V, const float S) noexcept {
-    XMVECTOR vS = XMVectorReplicate(S);
-    V = XMVectorDivide(V, vS);
-    return V;
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV operator+(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-    return XMVectorAdd(V1, V2);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV operator-(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-    return XMVectorSubtract(V1, V2);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV operator*(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-    return XMVectorMultiply(V1, V2);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV operator/(FXMVECTOR V1, FXMVECTOR V2) noexcept {
-    return XMVectorDivide(V1, V2);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV operator*(FXMVECTOR V, const float S) noexcept {
-    return XMVectorScale(V, S);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV operator/(FXMVECTOR V, const float S) noexcept {
-    XMVECTOR vS = XMVectorReplicate(S);
-    return XMVectorDivide(V, vS);
-}
-
-//------------------------------------------------------------------------------
-
-inline XMVECTOR XM_CALLCONV operator*(float S, FXMVECTOR V) noexcept {
-    return XMVectorScale(V, S);
-}
-
-#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */
-
-#if defined(_XM_NO_INTRINSICS_)
-#undef XMISNAN
-#undef XMISINF
-#endif
-
-#if defined(_XM_SSE_INTRINSICS_)
-#undef XM3UNPACK3INTO4
-#undef XM3PACK4INTO3
-#endif
diff --git a/targets/app/linux/Stubs/DirectXMath/DirectXPackedVector.h b/targets/app/linux/Stubs/DirectXMath/DirectXPackedVector.h
deleted file mode 100644
index 4442fde21..000000000
--- a/targets/app/linux/Stubs/DirectXMath/DirectXPackedVector.h
+++ /dev/null
@@ -1,1329 +0,0 @@
-//-------------------------------------------------------------------------------------
-// DirectXPackedVector.h -- SIMD C++ Math library
-//
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT License.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#pragma once
-
-#include "DirectXMath.h"
-
-namespace DirectX {
-
-namespace PackedVector {
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4201 4365 4324 4996)
-// C4201: nonstandard extension used
-// C4365: Off by default noise
-// C4324: alignment padding warnings
-// C4996: deprecation warnings
-#endif
-
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
-#pragma clang diagnostic ignored "-Wnested-anon-types"
-#endif
-
-//------------------------------------------------------------------------------
-// ARGB Color; 8-8-8-8 bit unsigned normalized integer components packed into
-// a 32 bit integer.  The normalized color is packed into 32 bits using 8 bit
-// unsigned, normalized integers for the alpha, red, green, and blue components.
-// The alpha component is stored in the most significant bits and the blue
-// component in the least significant bits (A8R8G8B8):
-// [32] aaaaaaaa rrrrrrrr gggggggg bbbbbbbb [0]
-struct XMCOLOR {
-    union {
-        struct {
-            uint8_t b;  // Blue:    0/255 to 255/255
-            uint8_t g;  // Green:   0/255 to 255/255
-            uint8_t r;  // Red:     0/255 to 255/255
-            uint8_t a;  // Alpha:   0/255 to 255/255
-        };
-        uint32_t c;
-    };
-
-    XMCOLOR() = default;
-
-    XMCOLOR(const XMCOLOR&) = default;
-    XMCOLOR& operator=(const XMCOLOR&) = default;
-
-    XMCOLOR(XMCOLOR&&) = default;
-    XMCOLOR& operator=(XMCOLOR&&) = default;
-
-    constexpr XMCOLOR(uint32_t Color) noexcept : c(Color) {}
-    XMCOLOR(float _r, float _g, float _b, float _a) noexcept;
-    explicit XMCOLOR(_In_reads_(4) const float* pArray) noexcept;
-
-    operator uint32_t() const noexcept { return c; }
-
-    XMCOLOR& operator=(const uint32_t Color) noexcept {
-        c = Color;
-        return *this;
-    }
-};
-
-//------------------------------------------------------------------------------
-// 16 bit floating point number consisting of a sign bit, a 5 bit biased
-// exponent, and a 10 bit mantissa
-using HALF = uint16_t;
-
-//------------------------------------------------------------------------------
-// 2D Vector; 16 bit floating point components
-struct XMHALF2 {
-    union {
-        struct {
-            HALF x;
-            HALF y;
-        };
-        uint32_t v;
-    };
-
-    XMHALF2() = default;
-
-    XMHALF2(const XMHALF2&) = default;
-    XMHALF2& operator=(const XMHALF2&) = default;
-
-    XMHALF2(XMHALF2&&) = default;
-    XMHALF2& operator=(XMHALF2&&) = default;
-
-    explicit constexpr XMHALF2(uint32_t Packed) noexcept : v(Packed) {}
-    constexpr XMHALF2(HALF _x, HALF _y) noexcept : x(_x), y(_y) {}
-    explicit XMHALF2(_In_reads_(2) const HALF* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]) {}
-    XMHALF2(float _x, float _y) noexcept;
-    explicit XMHALF2(_In_reads_(2) const float* pArray) noexcept;
-
-    XMHALF2& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-//------------------------------------------------------------------------------
-// 2D Vector; 16 bit signed normalized integer components
-struct XMSHORTN2 {
-    union {
-        struct {
-            int16_t x;
-            int16_t y;
-        };
-        uint32_t v;
-    };
-
-    XMSHORTN2() = default;
-
-    XMSHORTN2(const XMSHORTN2&) = default;
-    XMSHORTN2& operator=(const XMSHORTN2&) = default;
-
-    XMSHORTN2(XMSHORTN2&&) = default;
-    XMSHORTN2& operator=(XMSHORTN2&&) = default;
-
-    explicit constexpr XMSHORTN2(uint32_t Packed) noexcept : v(Packed) {}
-    constexpr XMSHORTN2(int16_t _x, int16_t _y) noexcept : x(_x), y(_y) {}
-    explicit XMSHORTN2(_In_reads_(2) const int16_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]) {}
-    XMSHORTN2(float _x, float _y) noexcept;
-    explicit XMSHORTN2(_In_reads_(2) const float* pArray) noexcept;
-
-    XMSHORTN2& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 2D Vector; 16 bit signed integer components
-struct XMSHORT2 {
-    union {
-        struct {
-            int16_t x;
-            int16_t y;
-        };
-        uint32_t v;
-    };
-
-    XMSHORT2() = default;
-
-    XMSHORT2(const XMSHORT2&) = default;
-    XMSHORT2& operator=(const XMSHORT2&) = default;
-
-    XMSHORT2(XMSHORT2&&) = default;
-    XMSHORT2& operator=(XMSHORT2&&) = default;
-
-    explicit constexpr XMSHORT2(uint32_t Packed) noexcept : v(Packed) {}
-    constexpr XMSHORT2(int16_t _x, int16_t _y) noexcept : x(_x), y(_y) {}
-    explicit XMSHORT2(_In_reads_(2) const int16_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]) {}
-    XMSHORT2(float _x, float _y) noexcept;
-    explicit XMSHORT2(_In_reads_(2) const float* pArray) noexcept;
-
-    XMSHORT2& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 2D Vector; 16 bit unsigned normalized integer components
-struct XMUSHORTN2 {
-    union {
-        struct {
-            uint16_t x;
-            uint16_t y;
-        };
-        uint32_t v;
-    };
-
-    XMUSHORTN2() = default;
-
-    XMUSHORTN2(const XMUSHORTN2&) = default;
-    XMUSHORTN2& operator=(const XMUSHORTN2&) = default;
-
-    XMUSHORTN2(XMUSHORTN2&&) = default;
-    XMUSHORTN2& operator=(XMUSHORTN2&&) = default;
-
-    explicit constexpr XMUSHORTN2(uint32_t Packed) noexcept : v(Packed) {}
-    constexpr XMUSHORTN2(uint16_t _x, uint16_t _y) noexcept : x(_x), y(_y) {}
-    explicit XMUSHORTN2(_In_reads_(2) const uint16_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]) {}
-    XMUSHORTN2(float _x, float _y) noexcept;
-    explicit XMUSHORTN2(_In_reads_(2) const float* pArray) noexcept;
-
-    XMUSHORTN2& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 2D Vector; 16 bit unsigned integer components
-struct XMUSHORT2 {
-    union {
-        struct {
-            uint16_t x;
-            uint16_t y;
-        };
-        uint32_t v;
-    };
-
-    XMUSHORT2() = default;
-
-    XMUSHORT2(const XMUSHORT2&) = default;
-    XMUSHORT2& operator=(const XMUSHORT2&) = default;
-
-    XMUSHORT2(XMUSHORT2&&) = default;
-    XMUSHORT2& operator=(XMUSHORT2&&) = default;
-
-    explicit constexpr XMUSHORT2(uint32_t Packed) noexcept : v(Packed) {}
-    constexpr XMUSHORT2(uint16_t _x, uint16_t _y) noexcept : x(_x), y(_y) {}
-    explicit XMUSHORT2(_In_reads_(2) const uint16_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]) {}
-    XMUSHORT2(float _x, float _y) noexcept;
-    explicit XMUSHORT2(_In_reads_(2) const float* pArray) noexcept;
-
-    XMUSHORT2& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-//------------------------------------------------------------------------------
-// 2D Vector; 8 bit signed normalized integer components
-struct XMBYTEN2 {
-    union {
-        struct {
-            int8_t x;
-            int8_t y;
-        };
-        uint16_t v;
-    };
-
-    XMBYTEN2() = default;
-
-    XMBYTEN2(const XMBYTEN2&) = default;
-    XMBYTEN2& operator=(const XMBYTEN2&) = default;
-
-    XMBYTEN2(XMBYTEN2&&) = default;
-    XMBYTEN2& operator=(XMBYTEN2&&) = default;
-
-    explicit constexpr XMBYTEN2(uint16_t Packed) noexcept : v(Packed) {}
-    constexpr XMBYTEN2(int8_t _x, int8_t _y) noexcept : x(_x), y(_y) {}
-    explicit XMBYTEN2(_In_reads_(2) const int8_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]) {}
-    XMBYTEN2(float _x, float _y) noexcept;
-    explicit XMBYTEN2(_In_reads_(2) const float* pArray) noexcept;
-
-    XMBYTEN2& operator=(uint16_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 2D Vector; 8 bit signed integer components
-struct XMBYTE2 {
-    union {
-        struct {
-            int8_t x;
-            int8_t y;
-        };
-        uint16_t v;
-    };
-
-    XMBYTE2() = default;
-
-    XMBYTE2(const XMBYTE2&) = default;
-    XMBYTE2& operator=(const XMBYTE2&) = default;
-
-    XMBYTE2(XMBYTE2&&) = default;
-    XMBYTE2& operator=(XMBYTE2&&) = default;
-
-    explicit constexpr XMBYTE2(uint16_t Packed) noexcept : v(Packed) {}
-    constexpr XMBYTE2(int8_t _x, int8_t _y) noexcept : x(_x), y(_y) {}
-    explicit XMBYTE2(_In_reads_(2) const int8_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]) {}
-    XMBYTE2(float _x, float _y) noexcept;
-    explicit XMBYTE2(_In_reads_(2) const float* pArray) noexcept;
-
-    XMBYTE2& operator=(uint16_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 2D Vector; 8 bit unsigned normalized integer components
-struct XMUBYTEN2 {
-    union {
-        struct {
-            uint8_t x;
-            uint8_t y;
-        };
-        uint16_t v;
-    };
-
-    XMUBYTEN2() = default;
-
-    XMUBYTEN2(const XMUBYTEN2&) = default;
-    XMUBYTEN2& operator=(const XMUBYTEN2&) = default;
-
-    XMUBYTEN2(XMUBYTEN2&&) = default;
-    XMUBYTEN2& operator=(XMUBYTEN2&&) = default;
-
-    explicit constexpr XMUBYTEN2(uint16_t Packed) noexcept : v(Packed) {}
-    constexpr XMUBYTEN2(uint8_t _x, uint8_t _y) noexcept : x(_x), y(_y) {}
-    explicit XMUBYTEN2(_In_reads_(2) const uint8_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]) {}
-    XMUBYTEN2(float _x, float _y) noexcept;
-    explicit XMUBYTEN2(_In_reads_(2) const float* pArray) noexcept;
-
-    XMUBYTEN2& operator=(uint16_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 2D Vector; 8 bit unsigned integer components
-struct XMUBYTE2 {
-    union {
-        struct {
-            uint8_t x;
-            uint8_t y;
-        };
-        uint16_t v;
-    };
-
-    XMUBYTE2() = default;
-
-    XMUBYTE2(const XMUBYTE2&) = default;
-    XMUBYTE2& operator=(const XMUBYTE2&) = default;
-
-    XMUBYTE2(XMUBYTE2&&) = default;
-    XMUBYTE2& operator=(XMUBYTE2&&) = default;
-
-    explicit constexpr XMUBYTE2(uint16_t Packed) noexcept : v(Packed) {}
-    constexpr XMUBYTE2(uint8_t _x, uint8_t _y) noexcept : x(_x), y(_y) {}
-    explicit XMUBYTE2(_In_reads_(2) const uint8_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]) {}
-    XMUBYTE2(float _x, float _y) noexcept;
-    explicit XMUBYTE2(_In_reads_(2) const float* pArray) noexcept;
-
-    XMUBYTE2& operator=(uint16_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-//------------------------------------------------------------------------------
-// 3D vector: 5/6/5 unsigned integer components
-struct XMU565 {
-    union {
-        struct {
-            uint16_t x : 5;  // 0 to 31
-            uint16_t y : 6;  // 0 to 63
-            uint16_t z : 5;  // 0 to 31
-        };
-        uint16_t v;
-    };
-
-    XMU565() = default;
-
-    XMU565(const XMU565&) = default;
-    XMU565& operator=(const XMU565&) = default;
-
-    XMU565(XMU565&&) = default;
-    XMU565& operator=(XMU565&&) = default;
-
-    explicit constexpr XMU565(uint16_t Packed) noexcept : v(Packed) {}
-    constexpr XMU565(uint8_t _x, uint8_t _y, uint8_t _z) noexcept
-        : x(_x), y(_y), z(_z) {}
-    explicit XMU565(_In_reads_(3) const uint8_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]) {}
-    XMU565(float _x, float _y, float _z) noexcept;
-    explicit XMU565(_In_reads_(3) const float* pArray) noexcept;
-
-    operator uint16_t() const noexcept { return v; }
-
-    XMU565& operator=(uint16_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-//------------------------------------------------------------------------------
-// 3D vector: 11/11/10 floating-point components
-// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent
-// and 6-bit mantissa for x component, a 5-bit biased exponent and
-// 6-bit mantissa for y component, a 5-bit biased exponent and a 5-bit
-// mantissa for z. The z component is stored in the most significant bits
-// and the x component in the least significant bits. No sign bits so
-// all partial-precision numbers are positive.
-// (Z10Y11X11): [32] ZZZZZzzz zzzYYYYY yyyyyyXX XXXxxxxx [0]
-struct XMFLOAT3PK {
-    union {
-        struct {
-            uint32_t xm : 6;  // x-mantissa
-            uint32_t xe : 5;  // x-exponent
-            uint32_t ym : 6;  // y-mantissa
-            uint32_t ye : 5;  // y-exponent
-            uint32_t zm : 5;  // z-mantissa
-            uint32_t ze : 5;  // z-exponent
-        };
-        uint32_t v;
-    };
-
-    XMFLOAT3PK() = default;
-
-    XMFLOAT3PK(const XMFLOAT3PK&) = default;
-    XMFLOAT3PK& operator=(const XMFLOAT3PK&) = default;
-
-    XMFLOAT3PK(XMFLOAT3PK&&) = default;
-    XMFLOAT3PK& operator=(XMFLOAT3PK&&) = default;
-
-    explicit constexpr XMFLOAT3PK(uint32_t Packed) noexcept : v(Packed) {}
-    XMFLOAT3PK(float _x, float _y, float _z) noexcept;
-    explicit XMFLOAT3PK(_In_reads_(3) const float* pArray) noexcept;
-
-    operator uint32_t() const noexcept { return v; }
-
-    XMFLOAT3PK& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-//------------------------------------------------------------------------------
-// 3D vector: 9/9/9 floating-point components with shared 5-bit exponent
-// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent
-// with 9-bit mantissa for the x, y, and z component. The shared exponent
-// is stored in the most significant bits and the x component mantissa is in
-// the least significant bits. No sign bits so all partial-precision numbers
-// are positive.
-// (E5Z9Y9X9): [32] EEEEEzzz zzzzzzyy yyyyyyyx xxxxxxxx [0]
-struct XMFLOAT3SE {
-    union {
-        struct {
-            uint32_t xm : 9;  // x-mantissa
-            uint32_t ym : 9;  // y-mantissa
-            uint32_t zm : 9;  // z-mantissa
-            uint32_t e : 5;   // shared exponent
-        };
-        uint32_t v;
-    };
-
-    XMFLOAT3SE() = default;
-
-    XMFLOAT3SE(const XMFLOAT3SE&) = default;
-    XMFLOAT3SE& operator=(const XMFLOAT3SE&) = default;
-
-    XMFLOAT3SE(XMFLOAT3SE&&) = default;
-    XMFLOAT3SE& operator=(XMFLOAT3SE&&) = default;
-
-    explicit constexpr XMFLOAT3SE(uint32_t Packed) noexcept : v(Packed) {}
-    XMFLOAT3SE(float _x, float _y, float _z) noexcept;
-    explicit XMFLOAT3SE(_In_reads_(3) const float* pArray) noexcept;
-
-    operator uint32_t() const noexcept { return v; }
-
-    XMFLOAT3SE& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-//------------------------------------------------------------------------------
-// 4D Vector; 16 bit floating point components
-struct XMHALF4 {
-    union {
-        struct {
-            HALF x;
-            HALF y;
-            HALF z;
-            HALF w;
-        };
-        uint64_t v;
-    };
-
-    XMHALF4() = default;
-
-    XMHALF4(const XMHALF4&) = default;
-    XMHALF4& operator=(const XMHALF4&) = default;
-
-    XMHALF4(XMHALF4&&) = default;
-    XMHALF4& operator=(XMHALF4&&) = default;
-
-    explicit constexpr XMHALF4(uint64_t Packed) noexcept : v(Packed) {}
-    constexpr XMHALF4(HALF _x, HALF _y, HALF _z, HALF _w) noexcept
-        : x(_x), y(_y), z(_z), w(_w) {}
-    explicit XMHALF4(_In_reads_(4) const HALF* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
-    XMHALF4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMHALF4(_In_reads_(4) const float* pArray) noexcept;
-
-    XMHALF4& operator=(uint64_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-//------------------------------------------------------------------------------
-// 4D Vector; 16 bit signed normalized integer components
-struct XMSHORTN4 {
-    union {
-        struct {
-            int16_t x;
-            int16_t y;
-            int16_t z;
-            int16_t w;
-        };
-        uint64_t v;
-    };
-
-    XMSHORTN4() = default;
-
-    XMSHORTN4(const XMSHORTN4&) = default;
-    XMSHORTN4& operator=(const XMSHORTN4&) = default;
-
-    XMSHORTN4(XMSHORTN4&&) = default;
-    XMSHORTN4& operator=(XMSHORTN4&&) = default;
-
-    explicit constexpr XMSHORTN4(uint64_t Packed) noexcept : v(Packed) {}
-    constexpr XMSHORTN4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) noexcept
-        : x(_x), y(_y), z(_z), w(_w) {}
-    explicit XMSHORTN4(_In_reads_(4) const int16_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
-    XMSHORTN4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMSHORTN4(_In_reads_(4) const float* pArray) noexcept;
-
-    XMSHORTN4& operator=(uint64_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 4D Vector; 16 bit signed integer components
-struct XMSHORT4 {
-    union {
-        struct {
-            int16_t x;
-            int16_t y;
-            int16_t z;
-            int16_t w;
-        };
-        uint64_t v;
-    };
-
-    XMSHORT4() = default;
-
-    XMSHORT4(const XMSHORT4&) = default;
-    XMSHORT4& operator=(const XMSHORT4&) = default;
-
-    XMSHORT4(XMSHORT4&&) = default;
-    XMSHORT4& operator=(XMSHORT4&&) = default;
-
-    explicit constexpr XMSHORT4(uint64_t Packed) noexcept : v(Packed) {}
-    constexpr XMSHORT4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) noexcept
-        : x(_x), y(_y), z(_z), w(_w) {}
-    explicit XMSHORT4(_In_reads_(4) const int16_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
-    XMSHORT4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMSHORT4(_In_reads_(4) const float* pArray) noexcept;
-
-    XMSHORT4& operator=(uint64_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 4D Vector; 16 bit unsigned normalized integer components
-struct XMUSHORTN4 {
-    union {
-        struct {
-            uint16_t x;
-            uint16_t y;
-            uint16_t z;
-            uint16_t w;
-        };
-        uint64_t v;
-    };
-
-    XMUSHORTN4() = default;
-
-    XMUSHORTN4(const XMUSHORTN4&) = default;
-    XMUSHORTN4& operator=(const XMUSHORTN4&) = default;
-
-    XMUSHORTN4(XMUSHORTN4&&) = default;
-    XMUSHORTN4& operator=(XMUSHORTN4&&) = default;
-
-    explicit constexpr XMUSHORTN4(uint64_t Packed) noexcept : v(Packed) {}
-    constexpr XMUSHORTN4(uint16_t _x, uint16_t _y, uint16_t _z,
-                         uint16_t _w) noexcept
-        : x(_x), y(_y), z(_z), w(_w) {}
-    explicit XMUSHORTN4(_In_reads_(4) const uint16_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
-    XMUSHORTN4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMUSHORTN4(_In_reads_(4) const float* pArray) noexcept;
-
-    XMUSHORTN4& operator=(uint64_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 4D Vector; 16 bit unsigned integer components
-struct XMUSHORT4 {
-    union {
-        struct {
-            uint16_t x;
-            uint16_t y;
-            uint16_t z;
-            uint16_t w;
-        };
-        uint64_t v;
-    };
-
-    XMUSHORT4() = default;
-
-    XMUSHORT4(const XMUSHORT4&) = default;
-    XMUSHORT4& operator=(const XMUSHORT4&) = default;
-
-    XMUSHORT4(XMUSHORT4&&) = default;
-    XMUSHORT4& operator=(XMUSHORT4&&) = default;
-
-    explicit constexpr XMUSHORT4(uint64_t Packed) noexcept : v(Packed) {}
-    constexpr XMUSHORT4(uint16_t _x, uint16_t _y, uint16_t _z,
-                        uint16_t _w) noexcept
-        : x(_x), y(_y), z(_z), w(_w) {}
-    explicit XMUSHORT4(_In_reads_(4) const uint16_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
-    XMUSHORT4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMUSHORT4(_In_reads_(4) const float* pArray) noexcept;
-
-    XMUSHORT4& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-//------------------------------------------------------------------------------
-// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer
-// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned,
-// normalized integer for the w component and 10 bit signed, normalized
-// integers for the z, y, and x components.  The w component is stored in the
-// most significant bits and the x component in the least significant bits
-// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
-struct XMXDECN4 {
-    union {
-        struct {
-            int32_t x : 10;  // -511/511 to 511/511
-            int32_t y : 10;  // -511/511 to 511/511
-            int32_t z : 10;  // -511/511 to 511/511
-            uint32_t w : 2;  //      0/3 to     3/3
-        };
-        uint32_t v;
-    };
-
-    XMXDECN4() = default;
-
-    XMXDECN4(const XMXDECN4&) = default;
-    XMXDECN4& operator=(const XMXDECN4&) = default;
-
-    XMXDECN4(XMXDECN4&&) = default;
-    XMXDECN4& operator=(XMXDECN4&&) = default;
-
-    explicit constexpr XMXDECN4(uint32_t Packed) : v(Packed) {}
-    XMXDECN4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMXDECN4(_In_reads_(4) const float* pArray) noexcept;
-
-    operator uint32_t() const noexcept { return v; }
-
-    XMXDECN4& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer
-// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned
-// integer for the w component and 10 bit signed integers for the
-// z, y, and x components.  The w component is stored in the
-// most significant bits and the x component in the least significant bits
-// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
-struct XM_DEPRECATED XMXDEC4 {
-    union {
-        struct {
-            int32_t x : 10;  // -511 to 511
-            int32_t y : 10;  // -511 to 511
-            int32_t z : 10;  // -511 to 511
-            uint32_t w : 2;  // 0 to 3
-        };
-        uint32_t v;
-    };
-
-    XMXDEC4() = default;
-
-    XMXDEC4(const XMXDEC4&) = default;
-    XMXDEC4& operator=(const XMXDEC4&) = default;
-
-    XMXDEC4(XMXDEC4&&) = default;
-    XMXDEC4& operator=(XMXDEC4&&) = default;
-
-    explicit constexpr XMXDEC4(uint32_t Packed) noexcept : v(Packed) {}
-    XMXDEC4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMXDEC4(_In_reads_(4) const float* pArray) noexcept;
-
-    operator uint32_t() const noexcept { return v; }
-
-    XMXDEC4& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer
-// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit signed,
-// normalized integer for the w component and 10 bit signed, normalized
-// integers for the z, y, and x components.  The w component is stored in the
-// most significant bits and the x component in the least significant bits
-// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
-struct XM_DEPRECATED XMDECN4 {
-    union {
-        struct {
-            int32_t x : 10;  // -511/511 to 511/511
-            int32_t y : 10;  // -511/511 to 511/511
-            int32_t z : 10;  // -511/511 to 511/511
-            int32_t w : 2;   //     -1/1 to     1/1
-        };
-        uint32_t v;
-    };
-
-    XMDECN4() = default;
-
-    XMDECN4(const XMDECN4&) = default;
-    XMDECN4& operator=(const XMDECN4&) = default;
-
-    XMDECN4(XMDECN4&&) = default;
-    XMDECN4& operator=(XMDECN4&&) = default;
-
-    explicit constexpr XMDECN4(uint32_t Packed) noexcept : v(Packed) {}
-    XMDECN4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMDECN4(_In_reads_(4) const float* pArray) noexcept;
-
-    operator uint32_t() const noexcept { return v; }
-
-    XMDECN4& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer
-// The 4D Vector is packed into 32 bits as follows: a 2 bit signed,
-// integer for the w component and 10 bit signed integers for the
-// z, y, and x components.  The w component is stored in the
-// most significant bits and the x component in the least significant bits
-// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
-struct XM_DEPRECATED XMDEC4 {
-    union {
-        struct {
-            int32_t x : 10;  // -511 to 511
-            int32_t y : 10;  // -511 to 511
-            int32_t z : 10;  // -511 to 511
-            int32_t w : 2;   //   -1 to   1
-        };
-        uint32_t v;
-    };
-
-    XMDEC4() = default;
-
-    XMDEC4(const XMDEC4&) = default;
-    XMDEC4& operator=(const XMDEC4&) = default;
-
-    XMDEC4(XMDEC4&&) = default;
-    XMDEC4& operator=(XMDEC4&&) = default;
-
-    explicit constexpr XMDEC4(uint32_t Packed) noexcept : v(Packed) {}
-    XMDEC4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMDEC4(_In_reads_(4) const float* pArray) noexcept;
-
-    operator uint32_t() const noexcept { return v; }
-
-    XMDEC4& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer
-// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned,
-// normalized integer for the w component and 10 bit unsigned, normalized
-// integers for the z, y, and x components.  The w component is stored in the
-// most significant bits and the x component in the least significant bits
-// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
-struct XMUDECN4 {
-    union {
-        struct {
-            uint32_t x : 10;  // 0/1023 to 1023/1023
-            uint32_t y : 10;  // 0/1023 to 1023/1023
-            uint32_t z : 10;  // 0/1023 to 1023/1023
-            uint32_t w : 2;   //    0/3 to       3/3
-        };
-        uint32_t v;
-    };
-
-    XMUDECN4() = default;
-
-    XMUDECN4(const XMUDECN4&) = default;
-    XMUDECN4& operator=(const XMUDECN4&) = default;
-
-    XMUDECN4(XMUDECN4&&) = default;
-    XMUDECN4& operator=(XMUDECN4&&) = default;
-
-    explicit constexpr XMUDECN4(uint32_t Packed) noexcept : v(Packed) {}
-    XMUDECN4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMUDECN4(_In_reads_(4) const float* pArray) noexcept;
-
-    operator uint32_t() const noexcept { return v; }
-
-    XMUDECN4& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer
-// The 4D Vector is packed into 32 bits as follows: a 2 bit unsigned,
-// integer for the w component and 10 bit unsigned integers
-// for the z, y, and x components.  The w component is stored in the
-// most significant bits and the x component in the least significant bits
-// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0]
-struct XMUDEC4 {
-    union {
-        struct {
-            uint32_t x : 10;  // 0 to 1023
-            uint32_t y : 10;  // 0 to 1023
-            uint32_t z : 10;  // 0 to 1023
-            uint32_t w : 2;   // 0 to    3
-        };
-        uint32_t v;
-    };
-
-    XMUDEC4() = default;
-
-    XMUDEC4(const XMUDEC4&) = default;
-    XMUDEC4& operator=(const XMUDEC4&) = default;
-
-    XMUDEC4(XMUDEC4&&) = default;
-    XMUDEC4& operator=(XMUDEC4&&) = default;
-
-    explicit constexpr XMUDEC4(uint32_t Packed) noexcept : v(Packed) {}
-    XMUDEC4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMUDEC4(_In_reads_(4) const float* pArray) noexcept;
-
-    operator uint32_t() const noexcept { return v; }
-
-    XMUDEC4& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-//------------------------------------------------------------------------------
-// 4D Vector; 8 bit signed normalized integer components
-struct XMBYTEN4 {
-    union {
-        struct {
-            int8_t x;
-            int8_t y;
-            int8_t z;
-            int8_t w;
-        };
-        uint32_t v;
-    };
-
-    XMBYTEN4() = default;
-
-    XMBYTEN4(const XMBYTEN4&) = default;
-    XMBYTEN4& operator=(const XMBYTEN4&) = default;
-
-    XMBYTEN4(XMBYTEN4&&) = default;
-    XMBYTEN4& operator=(XMBYTEN4&&) = default;
-
-    constexpr XMBYTEN4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) noexcept
-        : x(_x), y(_y), z(_z), w(_w) {}
-    explicit constexpr XMBYTEN4(uint32_t Packed) noexcept : v(Packed) {}
-    explicit XMBYTEN4(_In_reads_(4) const int8_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
-    XMBYTEN4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMBYTEN4(_In_reads_(4) const float* pArray) noexcept;
-
-    XMBYTEN4& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 4D Vector; 8 bit signed integer components
-struct XMBYTE4 {
-    union {
-        struct {
-            int8_t x;
-            int8_t y;
-            int8_t z;
-            int8_t w;
-        };
-        uint32_t v;
-    };
-
-    XMBYTE4() = default;
-
-    XMBYTE4(const XMBYTE4&) = default;
-    XMBYTE4& operator=(const XMBYTE4&) = default;
-
-    XMBYTE4(XMBYTE4&&) = default;
-    XMBYTE4& operator=(XMBYTE4&&) = default;
-
-    constexpr XMBYTE4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) noexcept
-        : x(_x), y(_y), z(_z), w(_w) {}
-    explicit constexpr XMBYTE4(uint32_t Packed) noexcept : v(Packed) {}
-    explicit XMBYTE4(_In_reads_(4) const int8_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
-    XMBYTE4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMBYTE4(_In_reads_(4) const float* pArray) noexcept;
-
-    XMBYTE4& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 4D Vector; 8 bit unsigned normalized integer components
-struct XMUBYTEN4 {
-    union {
-        struct {
-            uint8_t x;
-            uint8_t y;
-            uint8_t z;
-            uint8_t w;
-        };
-        uint32_t v;
-    };
-
-    XMUBYTEN4() = default;
-
-    XMUBYTEN4(const XMUBYTEN4&) = default;
-    XMUBYTEN4& operator=(const XMUBYTEN4&) = default;
-
-    XMUBYTEN4(XMUBYTEN4&&) = default;
-    XMUBYTEN4& operator=(XMUBYTEN4&&) = default;
-
-    constexpr XMUBYTEN4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) noexcept
-        : x(_x), y(_y), z(_z), w(_w) {}
-    explicit constexpr XMUBYTEN4(uint32_t Packed) noexcept : v(Packed) {}
-    explicit XMUBYTEN4(_In_reads_(4) const uint8_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
-    XMUBYTEN4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMUBYTEN4(_In_reads_(4) const float* pArray) noexcept;
-
-    XMUBYTEN4& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-// 4D Vector; 8 bit unsigned integer components
-struct XMUBYTE4 {
-    union {
-        struct {
-            uint8_t x;
-            uint8_t y;
-            uint8_t z;
-            uint8_t w;
-        };
-        uint32_t v;
-    };
-
-    XMUBYTE4() = default;
-
-    XMUBYTE4(const XMUBYTE4&) = default;
-    XMUBYTE4& operator=(const XMUBYTE4&) = default;
-
-    XMUBYTE4(XMUBYTE4&&) = default;
-    XMUBYTE4& operator=(XMUBYTE4&&) = default;
-
-    constexpr XMUBYTE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) noexcept
-        : x(_x), y(_y), z(_z), w(_w) {}
-    explicit constexpr XMUBYTE4(uint32_t Packed) noexcept : v(Packed) {}
-    explicit XMUBYTE4(_In_reads_(4) const uint8_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
-    XMUBYTE4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMUBYTE4(_In_reads_(4) const float* pArray) noexcept;
-
-    XMUBYTE4& operator=(uint32_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-//------------------------------------------------------------------------------
-// 4D vector; 4 bit unsigned integer components
-struct XMUNIBBLE4 {
-    union {
-        struct {
-            uint16_t x : 4;  // 0 to 15
-            uint16_t y : 4;  // 0 to 15
-            uint16_t z : 4;  // 0 to 15
-            uint16_t w : 4;  // 0 to 15
-        };
-        uint16_t v;
-    };
-
-    XMUNIBBLE4() = default;
-
-    XMUNIBBLE4(const XMUNIBBLE4&) = default;
-    XMUNIBBLE4& operator=(const XMUNIBBLE4&) = default;
-
-    XMUNIBBLE4(XMUNIBBLE4&&) = default;
-    XMUNIBBLE4& operator=(XMUNIBBLE4&&) = default;
-
-    explicit constexpr XMUNIBBLE4(uint16_t Packed) noexcept : v(Packed) {}
-    constexpr XMUNIBBLE4(uint8_t _x, uint8_t _y, uint8_t _z,
-                         uint8_t _w) noexcept
-        : x(_x), y(_y), z(_z), w(_w) {}
-    explicit XMUNIBBLE4(_In_reads_(4) const uint8_t* pArray) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {}
-    XMUNIBBLE4(float _x, float _y, float _z, float _w) noexcept;
-    explicit XMUNIBBLE4(_In_reads_(4) const float* pArray) noexcept;
-
-    operator uint16_t() const noexcept { return v; }
-
-    XMUNIBBLE4& operator=(uint16_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-//------------------------------------------------------------------------------
-// 4D vector: 5/5/5/1 unsigned integer components
-struct XMU555 {
-    union {
-        struct {
-            uint16_t x : 5;  // 0 to 31
-            uint16_t y : 5;  // 0 to 31
-            uint16_t z : 5;  // 0 to 31
-            uint16_t w : 1;  // 0 or 1
-        };
-        uint16_t v;
-    };
-
-    XMU555() = default;
-
-    XMU555(const XMU555&) = default;
-    XMU555& operator=(const XMU555&) = default;
-
-    XMU555(XMU555&&) = default;
-    XMU555& operator=(XMU555&&) = default;
-
-    explicit constexpr XMU555(uint16_t Packed) noexcept : v(Packed) {}
-    constexpr XMU555(uint8_t _x, uint8_t _y, uint8_t _z, bool _w) noexcept
-        : x(_x), y(_y), z(_z), w(_w ? 0x1 : 0) {}
-    XMU555(_In_reads_(3) const uint8_t* pArray, _In_ bool _w) noexcept
-        : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(_w ? 0x1 : 0) {}
-    XMU555(float _x, float _y, float _z, bool _w) noexcept;
-    XMU555(_In_reads_(3) const float* pArray, _In_ bool _w) noexcept;
-
-    operator uint16_t() const noexcept { return v; }
-
-    XMU555& operator=(uint16_t Packed) noexcept {
-        v = Packed;
-        return *this;
-    }
-};
-
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-/****************************************************************************
- *
- * Data conversion operations
- *
- ****************************************************************************/
-
-float XMConvertHalfToFloat(HALF Value) noexcept;
-float* XMConvertHalfToFloatStream(
-    _Out_writes_bytes_(sizeof(float) +
-                       OutputStride * (HalfCount - 1)) float* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(HALF) + InputStride * (HalfCount - 1))
-        const HALF* pInputStream,
-    _In_ size_t InputStride, _In_ size_t HalfCount) noexcept;
-HALF XMConvertFloatToHalf(float Value) noexcept;
-HALF* XMConvertFloatToHalfStream(
-    _Out_writes_bytes_(sizeof(HALF) + OutputStride * (FloatCount - 1))
-        HALF* pOutputStream,
-    _In_ size_t OutputStride,
-    _In_reads_bytes_(sizeof(float) + InputStride * (FloatCount - 1))
-        const float* pInputStream,
-    _In_ size_t InputStride, _In_ size_t FloatCount) noexcept;
-
-/****************************************************************************
- *
- * Load operations
- *
- ****************************************************************************/
-
-XMVECTOR XM_CALLCONV XMLoadColor(_In_ const XMCOLOR* pSource) noexcept;
-
-XMVECTOR XM_CALLCONV XMLoadHalf2(_In_ const XMHALF2* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadShortN2(_In_ const XMSHORTN2* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadShort2(_In_ const XMSHORT2* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUShortN2(_In_ const XMUSHORTN2* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUShort2(_In_ const XMUSHORT2* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadByteN2(_In_ const XMBYTEN2* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadByte2(_In_ const XMBYTE2* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUByteN2(_In_ const XMUBYTEN2* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUByte2(_In_ const XMUBYTE2* pSource) noexcept;
-
-XMVECTOR XM_CALLCONV XMLoadU565(_In_ const XMU565* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadFloat3PK(_In_ const XMFLOAT3PK* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadFloat3SE(_In_ const XMFLOAT3SE* pSource) noexcept;
-
-XMVECTOR XM_CALLCONV XMLoadHalf4(_In_ const XMHALF4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadShortN4(_In_ const XMSHORTN4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadShort4(_In_ const XMSHORT4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUShortN4(_In_ const XMUSHORTN4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUShort4(_In_ const XMUSHORT4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadXDecN4(_In_ const XMXDECN4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUDecN4(_In_ const XMUDECN4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUDecN4_XR(_In_ const XMUDECN4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUDec4(_In_ const XMUDEC4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadByteN4(_In_ const XMBYTEN4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadByte4(_In_ const XMBYTE4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUByteN4(_In_ const XMUBYTEN4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUByte4(_In_ const XMUBYTE4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadUNibble4(_In_ const XMUNIBBLE4* pSource) noexcept;
-XMVECTOR XM_CALLCONV XMLoadU555(_In_ const XMU555* pSource) noexcept;
-
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4996)
-// C4996: ignore deprecation warning
-#endif
-
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-XM_DEPRECATED
-XMVECTOR XM_CALLCONV XMLoadDecN4(_In_ const XMDECN4* pSource) noexcept;
-
-XM_DEPRECATED
-XMVECTOR XM_CALLCONV XMLoadDec4(_In_ const XMDEC4* pSource) noexcept;
-
-XM_DEPRECATED
-XMVECTOR XM_CALLCONV XMLoadXDec4(_In_ const XMXDEC4* pSource) noexcept;
-
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-/****************************************************************************
- *
- * Store operations
- *
- ****************************************************************************/
-
-void XM_CALLCONV XMStoreColor(_Out_ XMCOLOR* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-
-void XM_CALLCONV XMStoreHalf2(_Out_ XMHALF2* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreShortN2(_Out_ XMSHORTN2* pDestination,
-                                _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreShort2(_Out_ XMSHORT2* pDestination,
-                               _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUShortN2(_Out_ XMUSHORTN2* pDestination,
-                                 _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUShort2(_Out_ XMUSHORT2* pDestination,
-                                _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreByteN2(_Out_ XMBYTEN2* pDestination,
-                               _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreByte2(_Out_ XMBYTE2* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUByteN2(_Out_ XMUBYTEN2* pDestination,
-                                _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUByte2(_Out_ XMUBYTE2* pDestination,
-                               _In_ FXMVECTOR V) noexcept;
-
-void XM_CALLCONV XMStoreU565(_Out_ XMU565* pDestination,
-                             _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreFloat3PK(_Out_ XMFLOAT3PK* pDestination,
-                                 _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreFloat3SE(_Out_ XMFLOAT3SE* pDestination,
-                                 _In_ FXMVECTOR V) noexcept;
-
-void XM_CALLCONV XMStoreHalf4(_Out_ XMHALF4* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreShortN4(_Out_ XMSHORTN4* pDestination,
-                                _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreShort4(_Out_ XMSHORT4* pDestination,
-                               _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUShortN4(_Out_ XMUSHORTN4* pDestination,
-                                 _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUShort4(_Out_ XMUSHORT4* pDestination,
-                                _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreXDecN4(_Out_ XMXDECN4* pDestination,
-                               _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUDecN4(_Out_ XMUDECN4* pDestination,
-                               _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUDecN4_XR(_Out_ XMUDECN4* pDestination,
-                                  _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUDec4(_Out_ XMUDEC4* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreByteN4(_Out_ XMBYTEN4* pDestination,
-                               _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreByte4(_Out_ XMBYTE4* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUByteN4(_Out_ XMUBYTEN4* pDestination,
-                                _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUByte4(_Out_ XMUBYTE4* pDestination,
-                               _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreUNibble4(_Out_ XMUNIBBLE4* pDestination,
-                                 _In_ FXMVECTOR V) noexcept;
-void XM_CALLCONV XMStoreU555(_Out_ XMU555* pDestination,
-                             _In_ FXMVECTOR V) noexcept;
-
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4996)
-// C4996: ignore deprecation warning
-#endif
-
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-XM_DEPRECATED
-void XM_CALLCONV XMStoreDecN4(_Out_ XMDECN4* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-
-XM_DEPRECATED
-void XM_CALLCONV XMStoreDec4(_Out_ XMDEC4* pDestination,
-                             _In_ FXMVECTOR V) noexcept;
-
-XM_DEPRECATED
-void XM_CALLCONV XMStoreXDec4(_Out_ XMXDEC4* pDestination,
-                              _In_ FXMVECTOR V) noexcept;
-
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-/****************************************************************************
- *
- * Implementation
- *
- ****************************************************************************/
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4068 4214 4204 4365 4616 6001 6101)
-// C4068/4616: ignore unknown pragmas
-// C4214/4204: nonstandard extension used
-// C4365: Off by default noise
-// C6001/6101: False positives
-#endif
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
-#pragma prefast(disable : 26495, "Union initialization confuses /analyze")
-#endif
-
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wunknown-warning-option"
-#pragma clang diagnostic ignored "-Wunsafe-buffer-usage"
-#endif
-
-#include "DirectXPackedVector.inl"
-
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-}  // namespace PackedVector
-
-}  // namespace DirectX
diff --git a/targets/app/linux/Stubs/DirectXMath/DirectXPackedVector.inl b/targets/app/linux/Stubs/DirectXMath/DirectXPackedVector.inl
deleted file mode 100644
index 2ed7774a0..000000000
--- a/targets/app/linux/Stubs/DirectXMath/DirectXPackedVector.inl
+++ /dev/null
@@ -1,4142 +0,0 @@
-//-------------------------------------------------------------------------------------
-// DirectXPackedVector.inl -- SIMD C++ Math library
-//
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT License.
-//
-// http://go.microsoft.com/fwlink/?LinkID=615560
-//-------------------------------------------------------------------------------------
-
-#pragma once
-
-/****************************************************************************
- *
- * Data conversion
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline float XMConvertHalfToFloat(HALF Value) noexcept {
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    __m128i V1 = _mm_cvtsi32_si128(static_cast<int>(Value));
-    __m128 V2 = _mm_cvtph_ps(V1);
-    return _mm_cvtss_f32(V2);
-#elif defined(_XM_ARM_NEON_INTRINSICS_) &&                \
-    (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-     defined(_M_ARM64EC) || __aarch64__) &&               \
-    !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
-    uint16x4_t vHalf = vdup_n_u16(Value);
-    float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
-    return vgetq_lane_f32(vFloat, 0);
-#else
-    auto Mantissa = static_cast<uint32_t>(Value & 0x03FF);
-
-    uint32_t Exponent = (Value & 0x7C00);
-    if (Exponent == 0x7C00)  // INF/NAN
-    {
-        Exponent = 0x8f;
-    } else if (Exponent != 0)  // The value is normalized
-    {
-        Exponent =
-            static_cast<uint32_t>((static_cast<int>(Value) >> 10) & 0x1F);
-    } else if (Mantissa != 0)  // The value is denormalized
-    {
-        // Normalize the value in the resulting float
-        Exponent = 1;
-
-        do {
-            Exponent--;
-            Mantissa <<= 1;
-        } while ((Mantissa & 0x0400) == 0);
-
-        Mantissa &= 0x03FF;
-    } else  // The value is zero
-    {
-        Exponent = static_cast<uint32_t>(-112);
-    }
-
-    uint32_t Result = ((static_cast<uint32_t>(Value) & 0x8000) << 16)  // Sign
-                      | ((Exponent + 112) << 23)  // Exponent
-                      | (Mantissa << 13);         // Mantissa
-
-    return reinterpret_cast<float*>(&Result)[0];
-#endif  // !_XM_F16C_INTRINSICS_
-}
-
-//------------------------------------------------------------------------------
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307")
-#endif
-
-_Use_decl_annotations_ inline float* XMConvertHalfToFloatStream(
-    float* pOutputStream, size_t OutputStride, const HALF* pInputStream,
-    size_t InputStride, size_t HalfCount) noexcept {
-    assert(pOutputStream);
-    assert(pInputStream);
-
-    assert(InputStride >= sizeof(HALF));
-    _Analysis_assume_(InputStride >= sizeof(HALF));
-
-    assert(OutputStride >= sizeof(float));
-    _Analysis_assume_(OutputStride >= sizeof(float));
-
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t four = HalfCount >> 2;
-    if (four > 0) {
-        if (InputStride == sizeof(HALF)) {
-            if (OutputStride == sizeof(float)) {
-                if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0) {
-                    // Packed input, aligned & packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m128i HV = _mm_loadl_epi64(
-                            reinterpret_cast<const __m128i*>(pHalf));
-                        pHalf += InputStride * 4;
-
-                        __m128 FV = _mm_cvtph_ps(HV);
-
-                        XM_STREAM_PS(reinterpret_cast<float*>(pFloat), FV);
-                        pFloat += OutputStride * 4;
-                        i += 4;
-                    }
-                } else {
-                    // Packed input, packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m128i HV = _mm_loadl_epi64(
-                            reinterpret_cast<const __m128i*>(pHalf));
-                        pHalf += InputStride * 4;
-
-                        __m128 FV = _mm_cvtph_ps(HV);
-
-                        _mm_storeu_ps(reinterpret_cast<float*>(pFloat), FV);
-                        pFloat += OutputStride * 4;
-                        i += 4;
-                    }
-                }
-            } else {
-                // Packed input, scattered output
-                for (size_t j = 0; j < four; ++j) {
-                    __m128i HV = _mm_loadl_epi64(
-                        reinterpret_cast<const __m128i*>(pHalf));
-                    pHalf += InputStride * 4;
-
-                    __m128 FV = _mm_cvtph_ps(HV);
-
-                    _mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
-                    pFloat += OutputStride;
-                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
-                    pFloat += OutputStride;
-                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
-                    pFloat += OutputStride;
-                    *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
-                    pFloat += OutputStride;
-                    i += 4;
-                }
-            }
-        } else if (OutputStride == sizeof(float)) {
-            if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0) {
-                // Scattered input, aligned & packed output
-                for (size_t j = 0; j < four; ++j) {
-                    uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
-                    pHalf += InputStride;
-
-                    __m128i HV = _mm_setzero_si128();
-                    HV = _mm_insert_epi16(HV, H1, 0);
-                    HV = _mm_insert_epi16(HV, H2, 1);
-                    HV = _mm_insert_epi16(HV, H3, 2);
-                    HV = _mm_insert_epi16(HV, H4, 3);
-                    __m128 FV = _mm_cvtph_ps(HV);
-
-                    XM_STREAM_PS(reinterpret_cast<float*>(pFloat), FV);
-                    pFloat += OutputStride * 4;
-                    i += 4;
-                }
-            } else {
-                // Scattered input, packed output
-                for (size_t j = 0; j < four; ++j) {
-                    uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
-                    pHalf += InputStride;
-                    uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
-                    pHalf += InputStride;
-
-                    __m128i HV = _mm_setzero_si128();
-                    HV = _mm_insert_epi16(HV, H1, 0);
-                    HV = _mm_insert_epi16(HV, H2, 1);
-                    HV = _mm_insert_epi16(HV, H3, 2);
-                    HV = _mm_insert_epi16(HV, H4, 3);
-                    __m128 FV = _mm_cvtph_ps(HV);
-
-                    _mm_storeu_ps(reinterpret_cast<float*>(pFloat), FV);
-                    pFloat += OutputStride * 4;
-                    i += 4;
-                }
-            }
-        } else {
-            // Scattered input, scattered output
-            for (size_t j = 0; j < four; ++j) {
-                uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
-                pHalf += InputStride;
-                uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
-                pHalf += InputStride;
-                uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
-                pHalf += InputStride;
-                uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
-                pHalf += InputStride;
-
-                __m128i HV = _mm_setzero_si128();
-                HV = _mm_insert_epi16(HV, H1, 0);
-                HV = _mm_insert_epi16(HV, H2, 1);
-                HV = _mm_insert_epi16(HV, H3, 2);
-                HV = _mm_insert_epi16(HV, H4, 3);
-                __m128 FV = _mm_cvtph_ps(HV);
-
-                _mm_store_ss(reinterpret_cast<float*>(pFloat), FV);
-                pFloat += OutputStride;
-                *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 1);
-                pFloat += OutputStride;
-                *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 2);
-                pFloat += OutputStride;
-                *reinterpret_cast<int*>(pFloat) = _mm_extract_ps(FV, 3);
-                pFloat += OutputStride;
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < HalfCount; ++i) {
-        *reinterpret_cast<float*>(pFloat) =
-            XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
-        pHalf += InputStride;
-        pFloat += OutputStride;
-    }
-
-    XM_SFENCE();
-
-    return pOutputStream;
-#elif defined(_XM_ARM_NEON_INTRINSICS_) &&                \
-    (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-     defined(_M_ARM64EC) || __aarch64__) &&               \
-    !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
-    auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t four = HalfCount >> 2;
-    if (four > 0) {
-        if (InputStride == sizeof(HALF)) {
-            if (OutputStride == sizeof(float)) {
-                // Packed input, packed output
-                for (size_t j = 0; j < four; ++j) {
-                    uint16x4_t vHalf =
-                        vld1_u16(reinterpret_cast<const uint16_t*>(pHalf));
-                    pHalf += InputStride * 4;
-
-                    float32x4_t vFloat =
-                        vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
-
-                    vst1q_f32(reinterpret_cast<float*>(pFloat), vFloat);
-                    pFloat += OutputStride * 4;
-                    i += 4;
-                }
-            } else {
-                // Packed input, scattered output
-                for (size_t j = 0; j < four; ++j) {
-                    uint16x4_t vHalf =
-                        vld1_u16(reinterpret_cast<const uint16_t*>(pHalf));
-                    pHalf += InputStride * 4;
-
-                    float32x4_t vFloat =
-                        vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
-
-                    vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 0);
-                    pFloat += OutputStride;
-                    vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 1);
-                    pFloat += OutputStride;
-                    vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 2);
-                    pFloat += OutputStride;
-                    vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 3);
-                    pFloat += OutputStride;
-                    i += 4;
-                }
-            }
-        } else if (OutputStride == sizeof(float)) {
-            // Scattered input, packed output
-            for (size_t j = 0; j < four; ++j) {
-                uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
-                pHalf += InputStride;
-                uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
-                pHalf += InputStride;
-                uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
-                pHalf += InputStride;
-                uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
-                pHalf += InputStride;
-
-                uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) |
-                                 (uint64_t(H3) << 32) | (uint64_t(H4) << 48);
-                uint16x4_t vHalf = vcreate_u16(iHalf);
-
-                float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
-
-                vst1q_f32(reinterpret_cast<float*>(pFloat), vFloat);
-                pFloat += OutputStride * 4;
-                i += 4;
-            }
-        } else {
-            // Scattered input, scattered output
-            for (size_t j = 0; j < four; ++j) {
-                uint16_t H1 = *reinterpret_cast<const HALF*>(pHalf);
-                pHalf += InputStride;
-                uint16_t H2 = *reinterpret_cast<const HALF*>(pHalf);
-                pHalf += InputStride;
-                uint16_t H3 = *reinterpret_cast<const HALF*>(pHalf);
-                pHalf += InputStride;
-                uint16_t H4 = *reinterpret_cast<const HALF*>(pHalf);
-                pHalf += InputStride;
-
-                uint64_t iHalf = uint64_t(H1) | (uint64_t(H2) << 16) |
-                                 (uint64_t(H3) << 32) | (uint64_t(H4) << 48);
-                uint16x4_t vHalf = vcreate_u16(iHalf);
-
-                float32x4_t vFloat = vcvt_f32_f16(vreinterpret_f16_u16(vHalf));
-
-                vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 0);
-                pFloat += OutputStride;
-                vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 1);
-                pFloat += OutputStride;
-                vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 2);
-                pFloat += OutputStride;
-                vst1q_lane_f32(reinterpret_cast<float*>(pFloat), vFloat, 3);
-                pFloat += OutputStride;
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < HalfCount; ++i) {
-        *reinterpret_cast<float*>(pFloat) =
-            XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
-        pHalf += InputStride;
-        pFloat += OutputStride;
-    }
-
-    return pOutputStream;
-#else
-    auto pHalf = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pFloat = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    for (size_t i = 0; i < HalfCount; i++) {
-        *reinterpret_cast<float*>(pFloat) =
-            XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]);
-        pHalf += InputStride;
-        pFloat += OutputStride;
-    }
-
-    return pOutputStream;
-#endif  // !_XM_F16C_INTRINSICS_
-}
-
-//------------------------------------------------------------------------------
-
-inline HALF XMConvertFloatToHalf(float Value) noexcept {
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    __m128 V1 = _mm_set_ss(Value);
-    __m128i V2 = _mm_cvtps_ph(V1, _MM_FROUND_TO_NEAREST_INT);
-    return static_cast<HALF>(_mm_extract_epi16(V2, 0));
-#elif defined(_XM_ARM_NEON_INTRINSICS_) &&                \
-    (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-     defined(_M_ARM64EC) || __aarch64__) &&               \
-    !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
-    float32x4_t vFloat = vdupq_n_f32(Value);
-    float16x4_t vHalf = vcvt_f16_f32(vFloat);
-    return vget_lane_u16(vreinterpret_u16_f16(vHalf), 0);
-#else
-    uint32_t Result;
-
-    auto IValue = reinterpret_cast<uint32_t*>(&Value)[0];
-    uint32_t Sign = (IValue & 0x80000000U) >> 16U;
-    IValue = IValue & 0x7FFFFFFFU;  // Hack off the sign
-    if (IValue >= 0x47800000 /*e+16*/) {
-        // The number is too large to be represented as a half. Return infinity
-        // or NaN
-        Result =
-            0x7C00U |
-            ((IValue > 0x7F800000) ? (0x200 | ((IValue >> 13U) & 0x3FFU)) : 0U);
-    } else if (IValue <= 0x33000000U /*e-25*/) {
-        Result = 0;
-    } else if (IValue < 0x38800000U /*e-14*/) {
-        // The number is too small to be represented as a normalized half.
-        // Convert it to a denormalized value.
-        uint32_t Shift = 125U - (IValue >> 23U);
-        IValue = 0x800000U | (IValue & 0x7FFFFFU);
-        Result = IValue >> (Shift + 1);
-        uint32_t s = (IValue & ((1U << Shift) - 1)) != 0;
-        Result += (Result | s) & ((IValue >> Shift) & 1U);
-    } else {
-        // Rebias the exponent to represent the value as a normalized half.
-        IValue += 0xC8000000U;
-        Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U) & 0x7FFFU;
-    }
-    return static_cast<HALF>(Result | Sign);
-#endif  // !_XM_F16C_INTRINSICS_
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline HALF* XMConvertFloatToHalfStream(
-    HALF* pOutputStream, size_t OutputStride, const float* pInputStream,
-    size_t InputStride, size_t FloatCount) noexcept {
-    assert(pOutputStream);
-    assert(pInputStream);
-
-    assert(InputStride >= sizeof(float));
-    _Analysis_assume_(InputStride >= sizeof(float));
-
-    assert(OutputStride >= sizeof(HALF));
-    _Analysis_assume_(OutputStride >= sizeof(HALF));
-
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    auto pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t four = FloatCount >> 2;
-    if (four > 0) {
-        if (InputStride == sizeof(float)) {
-            if (OutputStride == sizeof(HALF)) {
-                if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0) {
-                    // Aligned and packed input, packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m128 FV =
-                            _mm_load_ps(reinterpret_cast<const float*>(pFloat));
-                        pFloat += InputStride * 4;
-
-                        __m128i HV =
-                            _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
-
-                        _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
-                        pHalf += OutputStride * 4;
-                        i += 4;
-                    }
-                } else {
-                    // Packed input, packed output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m128 FV = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pFloat));
-                        pFloat += InputStride * 4;
-
-                        __m128i HV =
-                            _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
-
-                        _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
-                        pHalf += OutputStride * 4;
-                        i += 4;
-                    }
-                }
-            } else {
-                if ((reinterpret_cast<uintptr_t>(pFloat) & 0xF) == 0) {
-                    // Aligned & packed input, scattered output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m128 FV =
-                            _mm_load_ps(reinterpret_cast<const float*>(pFloat));
-                        pFloat += InputStride * 4;
-
-                        __m128i HV =
-                            _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
-
-                        *reinterpret_cast<HALF*>(pHalf) =
-                            static_cast<HALF>(_mm_extract_epi16(HV, 0));
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) =
-                            static_cast<HALF>(_mm_extract_epi16(HV, 1));
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) =
-                            static_cast<HALF>(_mm_extract_epi16(HV, 2));
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) =
-                            static_cast<HALF>(_mm_extract_epi16(HV, 3));
-                        pHalf += OutputStride;
-                        i += 4;
-                    }
-                } else {
-                    // Packed input, scattered output
-                    for (size_t j = 0; j < four; ++j) {
-                        __m128 FV = _mm_loadu_ps(
-                            reinterpret_cast<const float*>(pFloat));
-                        pFloat += InputStride * 4;
-
-                        __m128i HV =
-                            _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
-
-                        *reinterpret_cast<HALF*>(pHalf) =
-                            static_cast<HALF>(_mm_extract_epi16(HV, 0));
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) =
-                            static_cast<HALF>(_mm_extract_epi16(HV, 1));
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) =
-                            static_cast<HALF>(_mm_extract_epi16(HV, 2));
-                        pHalf += OutputStride;
-                        *reinterpret_cast<HALF*>(pHalf) =
-                            static_cast<HALF>(_mm_extract_epi16(HV, 3));
-                        pHalf += OutputStride;
-                        i += 4;
-                    }
-                }
-            }
-        } else if (OutputStride == sizeof(HALF)) {
-            // Scattered input, packed output
-            for (size_t j = 0; j < four; ++j) {
-                __m128 FV1 =
-                    _mm_load_ss(reinterpret_cast<const float*>(pFloat));
-                pFloat += InputStride;
-
-                __m128 FV2 =
-                    _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
-                pFloat += InputStride;
-
-                __m128 FV3 =
-                    _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
-                pFloat += InputStride;
-
-                __m128 FV4 =
-                    _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
-                pFloat += InputStride;
-
-                __m128 FV = _mm_blend_ps(FV1, FV2, 0x2);
-                __m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
-                FV = _mm_blend_ps(FV, FT, 0xC);
-
-                __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
-
-                _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
-                pHalf += OutputStride * 4;
-                i += 4;
-            }
-        } else {
-            // Scattered input, scattered output
-            for (size_t j = 0; j < four; ++j) {
-                __m128 FV1 =
-                    _mm_load_ss(reinterpret_cast<const float*>(pFloat));
-                pFloat += InputStride;
-
-                __m128 FV2 =
-                    _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
-                pFloat += InputStride;
-
-                __m128 FV3 =
-                    _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
-                pFloat += InputStride;
-
-                __m128 FV4 =
-                    _mm_broadcast_ss(reinterpret_cast<const float*>(pFloat));
-                pFloat += InputStride;
-
-                __m128 FV = _mm_blend_ps(FV1, FV2, 0x2);
-                __m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
-                FV = _mm_blend_ps(FV, FT, 0xC);
-
-                __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
-
-                *reinterpret_cast<HALF*>(pHalf) =
-                    static_cast<HALF>(_mm_extract_epi16(HV, 0));
-                pHalf += OutputStride;
-                *reinterpret_cast<HALF*>(pHalf) =
-                    static_cast<HALF>(_mm_extract_epi16(HV, 1));
-                pHalf += OutputStride;
-                *reinterpret_cast<HALF*>(pHalf) =
-                    static_cast<HALF>(_mm_extract_epi16(HV, 2));
-                pHalf += OutputStride;
-                *reinterpret_cast<HALF*>(pHalf) =
-                    static_cast<HALF>(_mm_extract_epi16(HV, 3));
-                pHalf += OutputStride;
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < FloatCount; ++i) {
-        *reinterpret_cast<HALF*>(pHalf) =
-            XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
-        pFloat += InputStride;
-        pHalf += OutputStride;
-    }
-
-    return pOutputStream;
-#elif defined(_XM_ARM_NEON_INTRINSICS_) &&                \
-    (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || \
-     defined(_M_ARM64EC) || __aarch64__) &&               \
-    !defined(_XM_NO_INTRINSICS_) && (!defined(__GNUC__) || (__ARM_FP & 2))
-    auto pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    size_t i = 0;
-    size_t four = FloatCount >> 2;
-    if (four > 0) {
-        if (InputStride == sizeof(float)) {
-            if (OutputStride == sizeof(HALF)) {
-                // Packed input, packed output
-                for (size_t j = 0; j < four; ++j) {
-                    float32x4_t vFloat =
-                        vld1q_f32(reinterpret_cast<const float*>(pFloat));
-                    pFloat += InputStride * 4;
-
-                    uint16x4_t vHalf =
-                        vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
-
-                    vst1_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf);
-                    pHalf += OutputStride * 4;
-                    i += 4;
-                }
-            } else {
-                // Packed input, scattered output
-                for (size_t j = 0; j < four; ++j) {
-                    float32x4_t vFloat =
-                        vld1q_f32(reinterpret_cast<const float*>(pFloat));
-                    pFloat += InputStride * 4;
-
-                    uint16x4_t vHalf =
-                        vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
-
-                    vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 0);
-                    pHalf += OutputStride;
-                    vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 1);
-                    pHalf += OutputStride;
-                    vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 2);
-                    pHalf += OutputStride;
-                    vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 3);
-                    pHalf += OutputStride;
-                    i += 4;
-                }
-            }
-        } else if (OutputStride == sizeof(HALF)) {
-            // Scattered input, packed output
-            for (size_t j = 0; j < four; ++j) {
-                float32x4_t vFloat = vdupq_n_f32(0);
-                vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat),
-                                        vFloat, 0);
-                pFloat += InputStride;
-
-                vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat),
-                                        vFloat, 1);
-                pFloat += InputStride;
-
-                vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat),
-                                        vFloat, 2);
-                pFloat += InputStride;
-
-                vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat),
-                                        vFloat, 3);
-                pFloat += InputStride;
-
-                uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
-
-                vst1_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf);
-                pHalf += OutputStride * 4;
-                i += 4;
-            }
-        } else {
-            // Scattered input, scattered output
-            for (size_t j = 0; j < four; ++j) {
-                float32x4_t vFloat = vdupq_n_f32(0);
-                vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat),
-                                        vFloat, 0);
-                pFloat += InputStride;
-
-                vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat),
-                                        vFloat, 1);
-                pFloat += InputStride;
-
-                vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat),
-                                        vFloat, 2);
-                pFloat += InputStride;
-
-                vFloat = vld1q_lane_f32(reinterpret_cast<const float*>(pFloat),
-                                        vFloat, 3);
-                pFloat += InputStride;
-
-                uint16x4_t vHalf = vreinterpret_u16_f16(vcvt_f16_f32(vFloat));
-
-                vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 0);
-                pHalf += OutputStride;
-                vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 1);
-                pHalf += OutputStride;
-                vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 2);
-                pHalf += OutputStride;
-                vst1_lane_u16(reinterpret_cast<uint16_t*>(pHalf), vHalf, 3);
-                pHalf += OutputStride;
-                i += 4;
-            }
-        }
-    }
-
-    for (; i < FloatCount; ++i) {
-        *reinterpret_cast<HALF*>(pHalf) =
-            XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
-        pFloat += InputStride;
-        pHalf += OutputStride;
-    }
-
-    return pOutputStream;
-#else
-    auto pFloat = reinterpret_cast<const uint8_t*>(pInputStream);
-    auto pHalf = reinterpret_cast<uint8_t*>(pOutputStream);
-
-    for (size_t i = 0; i < FloatCount; i++) {
-        *reinterpret_cast<HALF*>(pHalf) =
-            XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]);
-        pFloat += InputStride;
-        pHalf += OutputStride;
-    }
-    return pOutputStream;
-#endif  // !_XM_F16C_INTRINSICS_
-}
-
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-
-/****************************************************************************
- *
- * Vector and matrix load operations
- *
- ****************************************************************************/
-
-#ifdef _PREFAST_
-#pragma prefast(push)
-#pragma prefast(disable : 28931, "PREfast noise: Esp:1266")
-#endif
-
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadColor(const XMCOLOR* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    // int32_t -> Float conversions are done in one instruction.
-    // uint32_t -> Float calls a runtime function. Keep in int32_t
-    auto iColor = static_cast<int32_t>(pSource->c);
-    XMVECTORF32 vColor = {
-        {{static_cast<float>((iColor >> 16) & 0xFF) * (1.0f / 255.0f),
-          static_cast<float>((iColor >> 8) & 0xFF) * (1.0f / 255.0f),
-          static_cast<float>(iColor & 0xFF) * (1.0f / 255.0f),
-          static_cast<float>((iColor >> 24) & 0xFF) * (1.0f / 255.0f)}}};
-    return vColor.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32_t bgra = pSource->c;
-    uint32_t rgba =
-        (bgra & 0xFF00FF00) | ((bgra >> 16) & 0xFF) | ((bgra << 16) & 0xFF0000);
-    uint32x2_t vInt8 = vdup_n_u32(rgba);
-    uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8));
-    uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16));
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    return vmulq_n_f32(R, 1.0f / 255.0f);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Splat the color in all four entries
-    __m128i vInt = _mm_set1_epi32(static_cast<int>(pSource->c));
-    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
-    vInt = _mm_and_si128(vInt, g_XMMaskA8R8G8B8);
-    // a is unsigned! Flip the bit to convert the order to signed
-    vInt = _mm_xor_si128(vInt, g_XMFlipA8R8G8B8);
-    // Convert to floating point numbers
-    XMVECTOR vTemp = _mm_cvtepi32_ps(vInt);
-    // RGB + 0, A + 0x80000000.f to undo the signed order.
-    vTemp = _mm_add_ps(vTemp, g_XMFixAA8R8G8B8);
-    // Convert 0-255 to 0.0f-1.0f
-    return _mm_mul_ps(vTemp, g_XMNormalizeA8R8G8B8);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadHalf2(const XMHALF2* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    __m128 V = _mm_load_ss(reinterpret_cast<const float*>(pSource));
-    return _mm_cvtph_ps(_mm_castps_si128(V));
-#else
-    XMVECTORF32 vResult = {{{XMConvertHalfToFloat(pSource->x),
-                             XMConvertHalfToFloat(pSource->y), 0.0f, 0.0f}}};
-    return vResult.v;
-#endif  // !_XM_F16C_INTRINSICS_
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadShortN2(const XMSHORTN2* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{(pSource->x == -32768)
-              ? -1.f
-              : (static_cast<float>(pSource->x) * (1.0f / 32767.0f)),
-          (pSource->y == -32768)
-              ? -1.f
-              : (static_cast<float>(pSource->y) * (1.0f / 32767.0f)),
-          0.0f, 0.0f}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt16 =
-        vld1_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    int32x4_t vInt = vmovl_s16(vreinterpret_s16_u32(vInt16));
-    vInt = vandq_s32(vInt, g_XMMaskXY);
-    float32x4_t R = vcvtq_f32_s32(vInt);
-    R = vmulq_n_f32(R, 1.0f / 32767.0f);
-    return vmaxq_f32(R, vdupq_n_f32(-1.f));
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Splat the two shorts in all four entries (WORD alignment okay,
-    // DWORD alignment preferred)
-    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
-    vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16);
-    // x needs to be sign extended
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // x - 0x8000 to undo the signed order.
-    vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16);
-    // Convert -1.0f - 1.0f
-    vTemp = _mm_mul_ps(vTemp, g_XMNormalizeX16Y16);
-    // Clamp result (for case of -32768)
-    return _mm_max_ps(vTemp, g_XMNegativeOne);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadShort2(const XMSHORT2* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {{{static_cast<float>(pSource->x),
-                             static_cast<float>(pSource->y), 0.f, 0.f}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt16 =
-        vld1_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    int32x4_t vInt = vmovl_s16(vreinterpret_s16_u32(vInt16));
-    vInt = vandq_s32(vInt, g_XMMaskXY);
-    return vcvtq_f32_s32(vInt);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Splat the two shorts in all four entries (WORD alignment okay,
-    // DWORD alignment preferred)
-    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
-    vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16);
-    // x needs to be sign extended
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // x - 0x8000 to undo the signed order.
-    vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16);
-    // Y is 65536 too large
-    return _mm_mul_ps(vTemp, g_XMFixupY16);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUShortN2(const XMUSHORTN2* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{static_cast<float>(pSource->x) / 65535.0f,
-          static_cast<float>(pSource->y) / 65535.0f, 0.f, 0.f}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt16 =
-        vld1_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    uint32x4_t vInt = vmovl_u16(vreinterpret_u16_u32(vInt16));
-    vInt = vandq_u32(vInt, g_XMMaskXY);
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    R = vmulq_n_f32(R, 1.0f / 65535.0f);
-    return vmaxq_f32(R, vdupq_n_f32(-1.f));
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 FixupY16 = {
-        {{1.0f / 65535.0f, 1.0f / (65535.0f * 65536.0f), 0.0f, 0.0f}}};
-    static const XMVECTORF32 FixaddY16 = {{{0, 32768.0f * 65536.0f, 0, 0}}};
-    // Splat the two shorts in all four entries (WORD alignment okay,
-    // DWORD alignment preferred)
-    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
-    vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16);
-    // y needs to be sign flipped
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipY);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // y + 0x8000 to undo the signed order.
-    vTemp = _mm_add_ps(vTemp, FixaddY16);
-    // Y is 65536 times too large
-    vTemp = _mm_mul_ps(vTemp, FixupY16);
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUShort2(const XMUSHORT2* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {{{static_cast<float>(pSource->x),
-                             static_cast<float>(pSource->y), 0.f, 0.f}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt16 =
-        vld1_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    uint32x4_t vInt = vmovl_u16(vreinterpret_u16_u32(vInt16));
-    vInt = vandq_u32(vInt, g_XMMaskXY);
-    return vcvtq_f32_u32(vInt);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 FixaddY16 = {{{0, 32768.0f, 0, 0}}};
-    // Splat the two shorts in all four entries (WORD alignment okay,
-    // DWORD alignment preferred)
-    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
-    vTemp = _mm_and_ps(vTemp, g_XMMaskX16Y16);
-    // y needs to be sign flipped
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipY);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // Y is 65536 times too large
-    vTemp = _mm_mul_ps(vTemp, g_XMFixupY16);
-    // y + 0x8000 to undo the signed order.
-    vTemp = _mm_add_ps(vTemp, FixaddY16);
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadByteN2(const XMBYTEN2* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{(pSource->x == -128)
-              ? -1.f
-              : (static_cast<float>(pSource->x) * (1.0f / 127.0f)),
-          (pSource->y == -128)
-              ? -1.f
-              : (static_cast<float>(pSource->y) * (1.0f / 127.0f)),
-          0.0f, 0.0f}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast<const uint16_t*>(pSource));
-    int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u16(vInt8));
-    int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16));
-    vInt = vandq_s32(vInt, g_XMMaskXY);
-    float32x4_t R = vcvtq_f32_s32(vInt);
-    R = vmulq_n_f32(R, 1.0f / 127.0f);
-    return vmaxq_f32(R, vdupq_n_f32(-1.f));
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 Scale = {
-        {{1.0f / 127.0f, 1.0f / (127.0f * 256.0f), 0, 0}}};
-    static const XMVECTORU32 Mask = {{{0xFF, 0xFF00, 0, 0}}};
-    // Splat the color in all four entries (x,z,y,w)
-    __m128i vInt = XM_LOADU_SI16(&pSource->v);
-    XMVECTOR vTemp =
-        XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0));
-    // Mask
-    vTemp = _mm_and_ps(vTemp, Mask);
-    // x,y and z are unsigned! Flip the bits to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp, g_XMXorByte4);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // x, y and z - 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp, g_XMAddByte4);
-    // Fix y, z and w because they are too large
-    vTemp = _mm_mul_ps(vTemp, Scale);
-    // Clamp result (for case of -128)
-    return _mm_max_ps(vTemp, g_XMNegativeOne);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadByte2(const XMBYTE2* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {{{static_cast<float>(pSource->x),
-                             static_cast<float>(pSource->y), 0.0f, 0.0f}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast<const uint16_t*>(pSource));
-    int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u16(vInt8));
-    int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16));
-    vInt = vandq_s32(vInt, g_XMMaskXY);
-    return vcvtq_f32_s32(vInt);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 Scale = {
-        {{1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f * 256.0f)}}};
-    static const XMVECTORU32 Mask = {{{0xFF, 0xFF00, 0, 0}}};
-    // Splat the color in all four entries (x,z,y,w)
-    __m128i vInt = XM_LOADU_SI16(&pSource->v);
-    XMVECTOR vTemp =
-        XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0));
-    // Mask
-    vTemp = _mm_and_ps(vTemp, Mask);
-    // x,y and z are unsigned! Flip the bits to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp, g_XMXorByte4);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // x, y and z - 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp, g_XMAddByte4);
-    // Fix y, z and w because they are too large
-    return _mm_mul_ps(vTemp, Scale);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUByteN2(const XMUBYTEN2* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{static_cast<float>(pSource->x) * (1.0f / 255.0f),
-          static_cast<float>(pSource->y) * (1.0f / 255.0f), 0.0f, 0.0f}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast<const uint16_t*>(pSource));
-    uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u16(vInt8));
-    uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16));
-    vInt = vandq_u32(vInt, g_XMMaskXY);
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    return vmulq_n_f32(R, 1.0f / 255.0f);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 Scale = {
-        {{1.0f / 255.0f, 1.0f / (255.0f * 256.0f), 0, 0}}};
-    static const XMVECTORU32 Mask = {{{0xFF, 0xFF00, 0, 0}}};
-    // Splat the color in all four entries (x,z,y,w)
-    __m128i vInt = XM_LOADU_SI16(&pSource->v);
-    XMVECTOR vTemp =
-        XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0));
-    // Mask
-    vTemp = _mm_and_ps(vTemp, Mask);
-    // w is signed! Flip the bits to convert the order to unsigned
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipW);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // w + 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp, g_XMAddUDec4);
-    // Fix y, z and w because they are too large
-    return _mm_mul_ps(vTemp, Scale);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUByte2(const XMUBYTE2* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {{{static_cast<float>(pSource->x),
-                             static_cast<float>(pSource->y), 0.0f, 0.0f}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint16x4_t vInt8 = vld1_dup_u16(reinterpret_cast<const uint16_t*>(pSource));
-    uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u16(vInt8));
-    uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16));
-    vInt = vandq_u32(vInt, g_XMMaskXY);
-    return vcvtq_f32_u32(vInt);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 Scale = {{{1.0f, 1.0f / 256.0f, 0, 0}}};
-    static const XMVECTORU32 Mask = {{{0xFF, 0xFF00, 0, 0}}};
-    // Splat the color in all four entries (x,z,y,w)
-    __m128i vInt = XM_LOADU_SI16(&pSource->v);
-    XMVECTOR vTemp =
-        XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0));
-    // Mask
-    vTemp = _mm_and_ps(vTemp, Mask);
-    // w is signed! Flip the bits to convert the order to unsigned
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipW);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // w + 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp, g_XMAddUDec4);
-    // Fix y, z and w because they are too large
-    return _mm_mul_ps(vTemp, Scale);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadU565(const XMU565* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {{{
-        float(pSource->v & 0x1F),
-        float((pSource->v >> 5) & 0x3F),
-        float((pSource->v >> 11) & 0x1F),
-        0.f,
-    }}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORI32 U565And = {{{0x1F, 0x3F << 5, 0x1F << 11, 0}}};
-    static const XMVECTORF32 U565Mul = {
-        {{1.0f, 1.0f / 32.0f, 1.0f / 2048.f, 0}}};
-    uint16x4_t vInt16 =
-        vld1_dup_u16(reinterpret_cast<const uint16_t*>(pSource));
-    uint32x4_t vInt = vmovl_u16(vInt16);
-    vInt = vandq_u32(vInt, U565And);
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    return vmulq_f32(R, U565Mul);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORI32 U565And = {{{0x1F, 0x3F << 5, 0x1F << 11, 0}}};
-    static const XMVECTORF32 U565Mul = {
-        {{1.0f, 1.0f / 32.0f, 1.0f / 2048.f, 0}}};
-    // Get the 16 bit value and splat it
-    __m128i vInt = XM_LOADU_SI16(&pSource->v);
-    XMVECTOR vResult =
-        XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0));
-    // Mask off x, y and z
-    vResult = _mm_and_ps(vResult, U565And);
-    // Convert to float
-    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
-    // Normalize x, y, and z
-    vResult = _mm_mul_ps(vResult, U565Mul);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadFloat3PK(const XMFLOAT3PK* pSource) noexcept {
-    assert(pSource);
-
-    XM_ALIGNED_DATA(16) uint32_t Result[4];
-    uint32_t Mantissa;
-    uint32_t Exponent;
-
-    // X Channel (6-bit mantissa)
-    Mantissa = pSource->xm;
-
-    if (pSource->xe == 0x1f)  // INF or NAN
-    {
-        Result[0] = static_cast<uint32_t>(
-            0x7f800000 | (static_cast<int>(pSource->xm) << 17));
-    } else {
-        if (pSource->xe != 0)  // The value is normalized
-        {
-            Exponent = pSource->xe;
-        } else if (Mantissa != 0)  // The value is denormalized
-        {
-            // Normalize the value in the resulting float
-            Exponent = 1;
-
-            do {
-                Exponent--;
-                Mantissa <<= 1;
-            } while ((Mantissa & 0x40) == 0);
-
-            Mantissa &= 0x3F;
-        } else  // The value is zero
-        {
-            Exponent = static_cast<uint32_t>(-112);
-        }
-
-        Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17);
-    }
-
-    // Y Channel (6-bit mantissa)
-    Mantissa = pSource->ym;
-
-    if (pSource->ye == 0x1f)  // INF or NAN
-    {
-        Result[1] = static_cast<uint32_t>(
-            0x7f800000 | (static_cast<int>(pSource->ym) << 17));
-    } else {
-        if (pSource->ye != 0)  // The value is normalized
-        {
-            Exponent = pSource->ye;
-        } else if (Mantissa != 0)  // The value is denormalized
-        {
-            // Normalize the value in the resulting float
-            Exponent = 1;
-
-            do {
-                Exponent--;
-                Mantissa <<= 1;
-            } while ((Mantissa & 0x40) == 0);
-
-            Mantissa &= 0x3F;
-        } else  // The value is zero
-        {
-            Exponent = static_cast<uint32_t>(-112);
-        }
-
-        Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17);
-    }
-
-    // Z Channel (5-bit mantissa)
-    Mantissa = pSource->zm;
-
-    if (pSource->ze == 0x1f)  // INF or NAN
-    {
-        Result[2] = static_cast<uint32_t>(
-            0x7f800000 | (static_cast<int>(pSource->zm) << 17));
-    } else {
-        if (pSource->ze != 0)  // The value is normalized
-        {
-            Exponent = pSource->ze;
-        } else if (Mantissa != 0)  // The value is denormalized
-        {
-            // Normalize the value in the resulting float
-            Exponent = 1;
-
-            do {
-                Exponent--;
-                Mantissa <<= 1;
-            } while ((Mantissa & 0x20) == 0);
-
-            Mantissa &= 0x1F;
-        } else  // The value is zero
-        {
-            Exponent = static_cast<uint32_t>(-112);
-        }
-
-        Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18);
-    }
-
-    return XMLoadFloat3A(reinterpret_cast<const XMFLOAT3A*>(&Result));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept {
-    assert(pSource);
-
-    union {
-        float f;
-        int32_t i;
-    } fi;
-    fi.i = 0x33800000 + (pSource->e << 23);
-    float Scale = fi.f;
-
-    XMVECTORF32 v = {{{Scale * float(pSource->xm), Scale * float(pSource->ym),
-                       Scale * float(pSource->zm), 1.0f}}};
-    return v;
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadHalf4(const XMHALF4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    __m128i V = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pSource));
-    return _mm_cvtph_ps(V);
-#else
-    XMVECTORF32 vResult = {
-        {{XMConvertHalfToFloat(pSource->x), XMConvertHalfToFloat(pSource->y),
-          XMConvertHalfToFloat(pSource->z), XMConvertHalfToFloat(pSource->w)}}};
-    return vResult.v;
-#endif  // !_XM_F16C_INTRINSICS_
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadShortN4(const XMSHORTN4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{(pSource->x == -32768)
-              ? -1.f
-              : (static_cast<float>(pSource->x) * (1.0f / 32767.0f)),
-          (pSource->y == -32768)
-              ? -1.f
-              : (static_cast<float>(pSource->y) * (1.0f / 32767.0f)),
-          (pSource->z == -32768)
-              ? -1.f
-              : (static_cast<float>(pSource->z) * (1.0f / 32767.0f)),
-          (pSource->w == -32768)
-              ? -1.f
-              : (static_cast<float>(pSource->w) * (1.0f / 32767.0f))}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int16x4_t vInt = vld1_s16(reinterpret_cast<const int16_t*>(pSource));
-    int32x4_t V = vmovl_s16(vInt);
-    float32x4_t vResult = vcvtq_f32_s32(V);
-    vResult = vmulq_n_f32(vResult, 1.0f / 32767.0f);
-    return vmaxq_f32(vResult, vdupq_n_f32(-1.f));
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Splat the color in all four entries (x,z,y,w)
-    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
-    // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000
-    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16);
-    // x and z are unsigned! Flip the bits to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16Z16W16);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // x and z - 0x8000 to complete the conversion
-    vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16Z16W16);
-    // Convert to -1.0f - 1.0f
-    vTemp = _mm_mul_ps(vTemp, g_XMNormalizeX16Y16Z16W16);
-    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
-    vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0));
-    // Clamp result (for case of -32768)
-    return _mm_max_ps(vTemp, g_XMNegativeOne);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadShort4(const XMSHORT4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{static_cast<float>(pSource->x), static_cast<float>(pSource->y),
-          static_cast<float>(pSource->z), static_cast<float>(pSource->w)}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    int16x4_t vInt = vld1_s16(reinterpret_cast<const int16_t*>(pSource));
-    int32x4_t V = vmovl_s16(vInt);
-    return vcvtq_f32_s32(V);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Splat the color in all four entries (x,z,y,w)
-    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
-    // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000
-    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16);
-    // x and z are unsigned! Flip the bits to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipX16Y16Z16W16);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // x and z - 0x8000 to complete the conversion
-    vTemp = _mm_add_ps(vTemp, g_XMFixX16Y16Z16W16);
-    // Fix y and w because they are 65536 too large
-    vTemp = _mm_mul_ps(vTemp, g_XMFixupY16W16);
-    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
-    return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUShortN4(const XMUSHORTN4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {{{static_cast<float>(pSource->x) / 65535.0f,
-                             static_cast<float>(pSource->y) / 65535.0f,
-                             static_cast<float>(pSource->z) / 65535.0f,
-                             static_cast<float>(pSource->w) / 65535.0f}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint16x4_t vInt = vld1_u16(reinterpret_cast<const uint16_t*>(pSource));
-    uint32x4_t V = vmovl_u16(vInt);
-    float32x4_t vResult = vcvtq_f32_u32(V);
-    return vmulq_n_f32(vResult, 1.0f / 65535.0f);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 FixupY16W16 = {
-        {{1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / (65535.0f * 65536.0f),
-          1.0f / (65535.0f * 65536.0f)}}};
-    static const XMVECTORF32 FixaddY16W16 = {
-        {{0, 0, 32768.0f * 65536.0f, 32768.0f * 65536.0f}}};
-    // Splat the color in all four entries (x,z,y,w)
-    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
-    // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000
-    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16);
-    // y and w are signed! Flip the bits to convert the order to unsigned
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipZW);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // y and w + 0x8000 to complete the conversion
-    vTemp = _mm_add_ps(vTemp, FixaddY16W16);
-    // Fix y and w because they are 65536 too large
-    vTemp = _mm_mul_ps(vTemp, FixupY16W16);
-    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
-    return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUShort4(const XMUSHORT4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{static_cast<float>(pSource->x), static_cast<float>(pSource->y),
-          static_cast<float>(pSource->z), static_cast<float>(pSource->w)}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint16x4_t vInt = vld1_u16(reinterpret_cast<const uint16_t*>(pSource));
-    uint32x4_t V = vmovl_u16(vInt);
-    return vcvtq_f32_u32(V);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 FixaddY16W16 = {{{0, 0, 32768.0f, 32768.0f}}};
-    // Splat the color in all four entries (x,z,y,w)
-    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double*>(&pSource->x));
-    // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000
-    __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd), g_XMMaskX16Y16Z16W16);
-    // y and w are signed! Flip the bits to convert the order to unsigned
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipZW);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // Fix y and w because they are 65536 too large
-    vTemp = _mm_mul_ps(vTemp, g_XMFixupY16W16);
-    // y and w + 0x8000 to complete the conversion
-    vTemp = _mm_add_ps(vTemp, FixaddY16W16);
-    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
-    return XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 1, 2, 0));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadXDecN4(const XMXDECN4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
-
-    uint32_t ElementX = pSource->v & 0x3FF;
-    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
-    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
-
-    XMVECTORF32 vResult = {
-        {{(ElementX == 0x200) ? -1.f
-                              : (static_cast<float>(static_cast<int16_t>(
-                                     ElementX | SignExtend[ElementX >> 9])) /
-                                 511.0f),
-          (ElementY == 0x200) ? -1.f
-                              : (static_cast<float>(static_cast<int16_t>(
-                                     ElementY | SignExtend[ElementY >> 9])) /
-                                 511.0f),
-          (ElementZ == 0x200) ? -1.f
-                              : (static_cast<float>(static_cast<int16_t>(
-                                     ElementZ | SignExtend[ElementZ >> 9])) /
-                                 511.0f),
-          static_cast<float>(pSource->v >> 30) / 3.0f}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    vInt = vandq_u32(vInt, g_XMMaskA2B10G10R10);
-    vInt = veorq_u32(vInt, g_XMFlipA2B10G10R10);
-    float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt));
-    R = vaddq_f32(R, g_XMFixAA2B10G10R10);
-    R = vmulq_f32(R, g_XMNormalizeA2B10G10R10);
-    return vmaxq_f32(R, vdupq_n_f32(-1.0f));
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Splat the color in all four entries
-    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
-    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
-    vTemp = _mm_and_ps(vTemp, g_XMMaskA2B10G10R10);
-    // a is unsigned! Flip the bit to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipA2B10G10R10);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // RGB + 0, A + 0x80000000.f to undo the signed order.
-    vTemp = _mm_add_ps(vTemp, g_XMFixAA2B10G10R10);
-    // Convert 0-255 to 0.0f-1.0f
-    vTemp = _mm_mul_ps(vTemp, g_XMNormalizeA2B10G10R10);
-    // Clamp result (for case of -512)
-    return _mm_max_ps(vTemp, g_XMNegativeOne);
-#endif
-}
-
-//------------------------------------------------------------------------------
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4996)
-// C4996: ignore deprecation warning
-#endif
-
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadXDec4(const XMXDEC4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
-
-    uint32_t ElementX = pSource->v & 0x3FF;
-    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
-    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
-
-    XMVECTORF32 vResult = {{{static_cast<float>(static_cast<int16_t>(
-                                 ElementX | SignExtend[ElementX >> 9])),
-                             static_cast<float>(static_cast<int16_t>(
-                                 ElementY | SignExtend[ElementY >> 9])),
-                             static_cast<float>(static_cast<int16_t>(
-                                 ElementZ | SignExtend[ElementZ >> 9])),
-                             static_cast<float>(pSource->v >> 30)}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORU32 XDec4Xor = {
-        {{0x200, 0x200 << 10, 0x200 << 20, 0x80000000}}};
-    static const XMVECTORF32 XDec4Add = {
-        {{-512.0f, -512.0f * 1024.0f, -512.0f * 1024.0f * 1024.0f,
-          32768 * 65536.0f}}};
-    uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    vInt = vandq_u32(vInt, g_XMMaskDec4);
-    vInt = veorq_u32(vInt, XDec4Xor);
-    float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt));
-    R = vaddq_f32(R, XDec4Add);
-    return vmulq_f32(R, g_XMMulDec4);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORU32 XDec4Xor = {
-        {{0x200, 0x200 << 10, 0x200 << 20, 0x80000000}}};
-    static const XMVECTORF32 XDec4Add = {
-        {{-512.0f, -512.0f * 1024.0f, -512.0f * 1024.0f * 1024.0f,
-          32768 * 65536.0f}}};
-    // Splat the color in all four entries
-    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
-    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
-    vTemp = _mm_and_ps(vTemp, g_XMMaskDec4);
-    // a is unsigned! Flip the bit to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp, XDec4Xor);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // RGB + 0, A + 0x80000000.f to undo the signed order.
-    vTemp = _mm_add_ps(vTemp, XDec4Add);
-    // Convert 0-255 to 0.0f-1.0f
-    vTemp = _mm_mul_ps(vTemp, g_XMMulDec4);
-    return vTemp;
-#endif
-}
-
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUDecN4(const XMUDECN4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-
-    uint32_t ElementX = pSource->v & 0x3FF;
-    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
-    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
-
-    XMVECTORF32 vResult = {{{static_cast<float>(ElementX) / 1023.0f,
-                             static_cast<float>(ElementY) / 1023.0f,
-                             static_cast<float>(ElementZ) / 1023.0f,
-                             static_cast<float>(pSource->v >> 30) / 3.0f}}};
-    return vResult.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 UDecN4Mul = {
-        {{1.0f / 1023.0f, 1.0f / (1023.0f * 1024.0f),
-          1.0f / (1023.0f * 1024.0f * 1024.0f),
-          1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f)}}};
-    uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    vInt = vandq_u32(vInt, g_XMMaskDec4);
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    return vmulq_f32(R, UDecN4Mul);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 UDecN4Mul = {
-        {{1.0f / 1023.0f, 1.0f / (1023.0f * 1024.0f),
-          1.0f / (1023.0f * 1024.0f * 1024.0f),
-          1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f)}}};
-    // Splat the color in all four entries
-    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
-    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
-    vTemp = _mm_and_ps(vTemp, g_XMMaskDec4);
-    // a is unsigned! Flip the bit to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipW);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // RGB + 0, A + 0x80000000.f to undo the signed order.
-    vTemp = _mm_add_ps(vTemp, g_XMAddUDec4);
-    // Convert 0-255 to 0.0f-1.0f
-    vTemp = _mm_mul_ps(vTemp, UDecN4Mul);
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUDecN4_XR(const XMUDECN4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-
-    int32_t ElementX = pSource->v & 0x3FF;
-    int32_t ElementY = (pSource->v >> 10) & 0x3FF;
-    int32_t ElementZ = (pSource->v >> 20) & 0x3FF;
-
-    XMVECTORF32 vResult = {{{static_cast<float>(ElementX - 0x180) / 510.0f,
-                             static_cast<float>(ElementY - 0x180) / 510.0f,
-                             static_cast<float>(ElementZ - 0x180) / 510.0f,
-                             static_cast<float>(pSource->v >> 30) / 3.0f}}};
-
-    return vResult.v;
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 XRMul = {
-        {{1.0f / 510.0f, 1.0f / (510.0f * 1024.0f),
-          1.0f / (510.0f * 1024.0f * 1024.0f),
-          1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f)}}};
-    static const XMVECTORI32 XRBias = {
-        {{0x180, 0x180 * 1024, 0x180 * 1024 * 1024, 0}}};
-    uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    vInt = vandq_u32(vInt, g_XMMaskDec4);
-    int32x4_t vTemp = vsubq_s32(vreinterpretq_s32_u32(vInt), XRBias);
-    vTemp = veorq_s32(vTemp, g_XMFlipW);
-    float32x4_t R = vcvtq_f32_s32(vTemp);
-    R = vaddq_f32(R, g_XMAddUDec4);
-    return vmulq_f32(R, XRMul);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 XRMul = {
-        {{1.0f / 510.0f, 1.0f / (510.0f * 1024.0f),
-          1.0f / (510.0f * 1024.0f * 1024.0f),
-          1.0f / (3.0f * 1024.0f * 1024.0f * 1024.0f)}}};
-    static const XMVECTORI32 XRBias = {
-        {{0x180, 0x180 * 1024, 0x180 * 1024 * 1024, 0}}};
-    // Splat the color in all four entries
-    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
-    // Mask channels
-    vTemp = _mm_and_ps(vTemp, g_XMMaskDec4);
-    // Subtract bias
-    vTemp = _mm_castsi128_ps(_mm_sub_epi32(_mm_castps_si128(vTemp), XRBias));
-    // a is unsigned! Flip the bit to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipW);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // RGB + 0, A + 0x80000000.f to undo the signed order.
-    vTemp = _mm_add_ps(vTemp, g_XMAddUDec4);
-    // Convert to 0.0f-1.0f
-    return _mm_mul_ps(vTemp, XRMul);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUDec4(const XMUDEC4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    uint32_t ElementX = pSource->v & 0x3FF;
-    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
-    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
-
-    XMVECTORF32 vResult = {
-        {{static_cast<float>(ElementX), static_cast<float>(ElementY),
-          static_cast<float>(ElementZ), static_cast<float>(pSource->v >> 30)}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    vInt = vandq_u32(vInt, g_XMMaskDec4);
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    return vmulq_f32(R, g_XMMulDec4);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Splat the color in all four entries
-    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
-    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
-    vTemp = _mm_and_ps(vTemp, g_XMMaskDec4);
-    // a is unsigned! Flip the bit to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipW);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // RGB + 0, A + 0x80000000.f to undo the signed order.
-    vTemp = _mm_add_ps(vTemp, g_XMAddUDec4);
-    // Convert 0-255 to 0.0f-1.0f
-    vTemp = _mm_mul_ps(vTemp, g_XMMulDec4);
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4996)
-// C4996: ignore deprecation warning
-#endif
-
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadDecN4(const XMDECN4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
-    static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC};
-
-    uint32_t ElementX = pSource->v & 0x3FF;
-    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
-    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
-    uint32_t ElementW = pSource->v >> 30;
-
-    XMVECTORF32 vResult = {
-        {{(ElementX == 0x200) ? -1.f
-                              : (static_cast<float>(static_cast<int16_t>(
-                                     ElementX | SignExtend[ElementX >> 9])) /
-                                 511.0f),
-          (ElementY == 0x200) ? -1.f
-                              : (static_cast<float>(static_cast<int16_t>(
-                                     ElementY | SignExtend[ElementY >> 9])) /
-                                 511.0f),
-          (ElementZ == 0x200) ? -1.f
-                              : (static_cast<float>(static_cast<int16_t>(
-                                     ElementZ | SignExtend[ElementZ >> 9])) /
-                                 511.0f),
-          (ElementW == 0x2)
-              ? -1.f
-              : static_cast<float>(static_cast<int16_t>(
-                    ElementW | SignExtendW[(ElementW >> 1) & 1]))}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 DecN4Mul = {
-        {{1.0f / 511.0f, 1.0f / (511.0f * 1024.0f),
-          1.0f / (511.0f * 1024.0f * 1024.0f),
-          1.0f / (1024.0f * 1024.0f * 1024.0f)}}};
-    uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    vInt = vandq_u32(vInt, g_XMMaskDec4);
-    vInt = veorq_u32(vInt, g_XMXorDec4);
-    float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt));
-    R = vaddq_f32(R, g_XMAddDec4);
-    R = vmulq_f32(R, DecN4Mul);
-    return vmaxq_f32(R, vdupq_n_f32(-1.0f));
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 DecN4Mul = {
-        {{1.0f / 511.0f, 1.0f / (511.0f * 1024.0f),
-          1.0f / (511.0f * 1024.0f * 1024.0f),
-          1.0f / (1024.0f * 1024.0f * 1024.0f)}}};
-    // Splat the color in all four entries
-    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
-    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
-    vTemp = _mm_and_ps(vTemp, g_XMMaskDec4);
-    // a is unsigned! Flip the bit to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp, g_XMXorDec4);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // RGB + 0, A + 0x80000000.f to undo the signed order.
-    vTemp = _mm_add_ps(vTemp, g_XMAddDec4);
-    // Convert 0-255 to 0.0f-1.0f
-    vTemp = _mm_mul_ps(vTemp, DecN4Mul);
-    // Clamp result (for case of -512/-1)
-    return _mm_max_ps(vTemp, g_XMNegativeOne);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadDec4(const XMDEC4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
-    static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC};
-
-    uint32_t ElementX = pSource->v & 0x3FF;
-    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
-    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
-    uint32_t ElementW = pSource->v >> 30;
-
-    XMVECTORF32 vResult = {
-        {{static_cast<float>(
-              static_cast<int16_t>(ElementX | SignExtend[ElementX >> 9])),
-          static_cast<float>(
-              static_cast<int16_t>(ElementY | SignExtend[ElementY >> 9])),
-          static_cast<float>(
-              static_cast<int16_t>(ElementZ | SignExtend[ElementZ >> 9])),
-          static_cast<float>(
-              static_cast<int16_t>(ElementW | SignExtendW[ElementW >> 1]))}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x4_t vInt = vld1q_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    vInt = vandq_u32(vInt, g_XMMaskDec4);
-    vInt = veorq_u32(vInt, g_XMXorDec4);
-    float32x4_t R = vcvtq_f32_s32(vreinterpretq_s32_u32(vInt));
-    R = vaddq_f32(R, g_XMAddDec4);
-    return vmulq_f32(R, g_XMMulDec4);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Splat the color in all four entries
-    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float*>(&pSource->v));
-    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
-    vTemp = _mm_and_ps(vTemp, g_XMMaskDec4);
-    // a is unsigned! Flip the bit to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp, g_XMXorDec4);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // RGB + 0, A + 0x80000000.f to undo the signed order.
-    vTemp = _mm_add_ps(vTemp, g_XMAddDec4);
-    // Convert 0-255 to 0.0f-1.0f
-    vTemp = _mm_mul_ps(vTemp, g_XMMulDec4);
-    return vTemp;
-#endif
-}
-
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUByteN4(const XMUBYTEN4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {{{static_cast<float>(pSource->x) / 255.0f,
-                             static_cast<float>(pSource->y) / 255.0f,
-                             static_cast<float>(pSource->z) / 255.0f,
-                             static_cast<float>(pSource->w) / 255.0f}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8));
-    uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16));
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    return vmulq_n_f32(R, 1.0f / 255.0f);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 LoadUByteN4Mul = {
-        {{1.0f / 255.0f, 1.0f / (255.0f * 256.0f), 1.0f / (255.0f * 65536.0f),
-          1.0f / (255.0f * 65536.0f * 256.0f)}}};
-    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000
-    vTemp = _mm_and_ps(vTemp, g_XMMaskByte4);
-    // w is signed! Flip the bits to convert the order to unsigned
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipW);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // w + 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp, g_XMAddUDec4);
-    // Fix y, z and w because they are too large
-    vTemp = _mm_mul_ps(vTemp, LoadUByteN4Mul);
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUByte4(const XMUBYTE4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{static_cast<float>(pSource->x), static_cast<float>(pSource->y),
-          static_cast<float>(pSource->z), static_cast<float>(pSource->w)}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    uint16x8_t vInt16 = vmovl_u8(vreinterpret_u8_u32(vInt8));
-    uint32x4_t vInt = vmovl_u16(vget_low_u16(vInt16));
-    return vcvtq_f32_u32(vInt);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 LoadUByte4Mul = {
-        {{1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f * 256.0f)}}};
-    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000
-    vTemp = _mm_and_ps(vTemp, g_XMMaskByte4);
-    // w is signed! Flip the bits to convert the order to unsigned
-    vTemp = _mm_xor_ps(vTemp, g_XMFlipW);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // w + 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp, g_XMAddUDec4);
-    // Fix y, z and w because they are too large
-    vTemp = _mm_mul_ps(vTemp, LoadUByte4Mul);
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadByteN4(const XMBYTEN4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{(pSource->x == -128) ? -1.f
-                               : (static_cast<float>(pSource->x) / 127.0f),
-          (pSource->y == -128) ? -1.f
-                               : (static_cast<float>(pSource->y) / 127.0f),
-          (pSource->z == -128) ? -1.f
-                               : (static_cast<float>(pSource->z) / 127.0f),
-          (pSource->w == -128) ? -1.f
-                               : (static_cast<float>(pSource->w) / 127.0f)}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u32(vInt8));
-    int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16));
-    float32x4_t R = vcvtq_f32_s32(vInt);
-    R = vmulq_n_f32(R, 1.0f / 127.0f);
-    return vmaxq_f32(R, vdupq_n_f32(-1.f));
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 LoadByteN4Mul = {
-        {{1.0f / 127.0f, 1.0f / (127.0f * 256.0f), 1.0f / (127.0f * 65536.0f),
-          1.0f / (127.0f * 65536.0f * 256.0f)}}};
-    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000
-    vTemp = _mm_and_ps(vTemp, g_XMMaskByte4);
-    // x,y and z are unsigned! Flip the bits to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp, g_XMXorByte4);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // x, y and z - 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp, g_XMAddByte4);
-    // Fix y, z and w because they are too large
-    vTemp = _mm_mul_ps(vTemp, LoadByteN4Mul);
-    // Clamp result (for case of -128)
-    return _mm_max_ps(vTemp, g_XMNegativeOne);
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadByte4(const XMBYTE4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{static_cast<float>(pSource->x), static_cast<float>(pSource->y),
-          static_cast<float>(pSource->z), static_cast<float>(pSource->w)}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    uint32x2_t vInt8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(pSource));
-    int16x8_t vInt16 = vmovl_s8(vreinterpret_s8_u32(vInt8));
-    int32x4_t vInt = vmovl_s16(vget_low_s16(vInt16));
-    return vcvtq_f32_s32(vInt);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 LoadByte4Mul = {
-        {{1.0f, 1.0f / 256.0f, 1.0f / 65536.0f, 1.0f / (65536.0f * 256.0f)}}};
-    // Splat the color in all four entries (x,z,y,w)
-    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float*>(&pSource->x));
-    // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000
-    vTemp = _mm_and_ps(vTemp, g_XMMaskByte4);
-    // x,y and z are unsigned! Flip the bits to convert the order to signed
-    vTemp = _mm_xor_ps(vTemp, g_XMXorByte4);
-    // Convert to floating point numbers
-    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
-    // x, y and z - 0x80 to complete the conversion
-    vTemp = _mm_add_ps(vTemp, g_XMAddByte4);
-    // Fix y, z and w because they are too large
-    vTemp = _mm_mul_ps(vTemp, LoadByte4Mul);
-    return vTemp;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadUNibble4(const XMUNIBBLE4* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{float(pSource->v & 0xF), float((pSource->v >> 4) & 0xF),
-          float((pSource->v >> 8) & 0xF), float((pSource->v >> 12) & 0xF)}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORI32 UNibble4And = {{{0xF, 0xF0, 0xF00, 0xF000}}};
-    static const XMVECTORF32 UNibble4Mul = {
-        {{1.0f, 1.0f / 16.f, 1.0f / 256.f, 1.0f / 4096.f}}};
-    uint16x4_t vInt16 =
-        vld1_dup_u16(reinterpret_cast<const uint16_t*>(pSource));
-    uint32x4_t vInt = vmovl_u16(vInt16);
-    vInt = vandq_u32(vInt, UNibble4And);
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    return vmulq_f32(R, UNibble4Mul);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORI32 UNibble4And = {{{0xF, 0xF0, 0xF00, 0xF000}}};
-    static const XMVECTORF32 UNibble4Mul = {
-        {{1.0f, 1.0f / 16.f, 1.0f / 256.f, 1.0f / 4096.f}}};
-    // Get the 16 bit value and splat it
-    __m128i vInt = XM_LOADU_SI16(&pSource->v);
-    XMVECTOR vResult =
-        XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0));
-    // Mask off x, y and z
-    vResult = _mm_and_ps(vResult, UNibble4And);
-    // Convert to float
-    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
-    // Normalize x, y, and z
-    vResult = _mm_mul_ps(vResult, UNibble4Mul);
-    return vResult;
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMVECTOR XM_CALLCONV
-XMLoadU555(const XMU555* pSource) noexcept {
-    assert(pSource);
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTORF32 vResult = {
-        {{float(pSource->v & 0x1F), float((pSource->v >> 5) & 0x1F),
-          float((pSource->v >> 10) & 0x1F), float((pSource->v >> 15) & 0x1)}}};
-    return vResult.v;
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORI32 U555And = {
-        {{0x1F, 0x1F << 5, 0x1F << 10, 0x8000}}};
-    static const XMVECTORF32 U555Mul = {
-        {{1.0f, 1.0f / 32.f, 1.0f / 1024.f, 1.0f / 32768.f}}};
-    uint16x4_t vInt16 =
-        vld1_dup_u16(reinterpret_cast<const uint16_t*>(pSource));
-    uint32x4_t vInt = vmovl_u16(vInt16);
-    vInt = vandq_u32(vInt, U555And);
-    float32x4_t R = vcvtq_f32_u32(vInt);
-    return vmulq_f32(R, U555Mul);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORI32 U555And = {
-        {{0x1F, 0x1F << 5, 0x1F << 10, 0x8000}}};
-    static const XMVECTORF32 U555Mul = {
-        {{1.0f, 1.0f / 32.f, 1.0f / 1024.f, 1.0f / 32768.f}}};
-    // Get the 16bit value and splat it
-    __m128i vInt = XM_LOADU_SI16(&pSource->v);
-    XMVECTOR vResult =
-        XM_PERMUTE_PS(_mm_castsi128_ps(vInt), _MM_SHUFFLE(0, 0, 0, 0));
-    // Mask off x, y and z
-    vResult = _mm_and_ps(vResult, U555And);
-    // Convert to float
-    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
-    // Normalize x, y, and z
-    vResult = _mm_mul_ps(vResult, U555Mul);
-    return vResult;
-#endif
-}
-
-#ifdef _PREFAST_
-#pragma prefast(pop)
-#endif
-
-/****************************************************************************
- *
- * Vector and matrix store operations
- *
- ****************************************************************************/
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreColor(XMCOLOR* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorSaturate(V);
-    N = XMVectorMultiply(N, g_UByteMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->c = (static_cast<uint32_t>(tmp.w) << 24) |
-                      (static_cast<uint32_t>(tmp.x) << 16) |
-                      (static_cast<uint32_t>(tmp.y) << 8) |
-                      static_cast<uint32_t>(tmp.z);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0));
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32(R, 255.0f);
-    R = XMVectorRound(R);
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32(vInt32);
-    uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16));
-    uint32_t rgba = vget_lane_u32(vreinterpret_u32_u8(vInt8), 0);
-    pDestination->c =
-        (rgba & 0xFF00FF00) | ((rgba >> 16) & 0xFF) | ((rgba << 16) & 0xFF0000);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Set <0 to 0
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    // Set>1 to 1
-    vResult = _mm_min_ps(vResult, g_XMOne);
-    // Convert to 0-255
-    vResult = _mm_mul_ps(vResult, g_UByteMax);
-    // Shuffle RGBA to ARGB
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 0, 1, 2));
-    // Convert to int
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Mash to shorts
-    vInt = _mm_packs_epi32(vInt, vInt);
-    // Mash to bytes
-    vInt = _mm_packus_epi16(vInt, vInt);
-    // Store the color
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->c),
-                 _mm_castsi128_ps(vInt));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreHalf2(XMHALF2* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    __m128i V1 = _mm_cvtps_ph(V, _MM_FROUND_TO_NEAREST_INT);
-    _mm_store_ss(reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1));
-#else
-    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
-    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
-#endif  // !_XM_F16C_INTRINSICS_
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreShortN2(XMSHORTN2* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
-    N = XMVectorMultiply(N, g_ShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<int16_t>(tmp.x);
-    pDestination->y = static_cast<int16_t>(tmp.y);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f));
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32(R, 32767.0f);
-    int32x4_t vInt32 = vcvtq_s32_f32(R);
-    int16x4_t vInt16 = vqmovn_s32(vInt32);
-    vst1_lane_u32(&pDestination->v, vreinterpret_u32_s16(vInt16), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne);
-    vResult = _mm_min_ps(vResult, g_XMOne);
-    vResult = _mm_mul_ps(vResult, g_ShortMax);
-    __m128i vResulti = _mm_cvtps_epi32(vResult);
-    vResulti = _mm_packs_epi32(vResulti, vResulti);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->x),
-                 _mm_castsi128_ps(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreShort2(XMSHORT2* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<int16_t>(tmp.x);
-    pDestination->y = static_cast<int16_t>(tmp.y);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-32767.f));
-    R = vminq_f32(R, vdupq_n_f32(32767.0f));
-    int32x4_t vInt32 = vcvtq_s32_f32(R);
-    int16x4_t vInt16 = vqmovn_s32(vInt32);
-    vst1_lane_u32(&pDestination->v, vreinterpret_u32_s16(vInt16), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V, g_ShortMin);
-    vResult = _mm_min_ps(vResult, g_ShortMax);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Pack the ints into shorts
-    vInt = _mm_packs_epi32(vInt, vInt);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->x),
-                 _mm_castsi128_ps(vInt));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUShortN2(XMUSHORTN2* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorSaturate(V);
-    N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v);
-    N = XMVectorTruncate(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<uint16_t>(tmp.x);
-    pDestination->y = static_cast<uint16_t>(tmp.y);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f));
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32(R, 65535.0f);
-    R = vaddq_f32(R, g_XMOneHalf);
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32(vInt32);
-    vst1_lane_u32(&pDestination->v, vreinterpret_u32_u16(vInt16), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    vResult = _mm_min_ps(vResult, g_XMOne);
-    vResult = _mm_mul_ps(vResult, g_UShortMax);
-    vResult = _mm_add_ps(vResult, g_XMOneHalf);
-    // Convert to int
-    __m128i vInt = _mm_cvttps_epi32(vResult);
-    // Since the SSE pack instruction clamps using signed rules,
-    // manually extract the values to store them to memory
-    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
-    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUShort2(XMUSHORT2* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<uint16_t>(tmp.x);
-    pDestination->y = static_cast<uint16_t>(tmp.y);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f));
-    R = vminq_f32(R, vdupq_n_f32(65535.0f));
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32(vInt32);
-    vst1_lane_u32(&pDestination->v, vreinterpret_u32_u16(vInt16), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    vResult = _mm_min_ps(vResult, g_UShortMax);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Since the SSE pack instruction clamps using signed rules,
-    // manually extract the values to store them to memory
-    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
-    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreByteN2(XMBYTEN2* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
-    N = XMVectorMultiply(N, g_ByteMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<int8_t>(tmp.x);
-    pDestination->y = static_cast<int8_t>(tmp.y);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f));
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32(R, 127.0f);
-    int32x4_t vInt32 = vcvtq_s32_f32(R);
-    int16x4_t vInt16 = vqmovn_s32(vInt32);
-    int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16));
-    vst1_lane_u16(reinterpret_cast<uint16_t*>(pDestination),
-                  vreinterpret_u16_s8(vInt8), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne);
-    vResult = _mm_min_ps(vResult, g_XMOne);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult, g_ByteMax);
-    // Convert to int by rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // No SSE operations will write to 16-bit values, so we have to extract them
-    // manually
-    auto x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
-    auto y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
-    pDestination->v = static_cast<uint16_t>(
-        ((static_cast<int>(y) & 0xFF) << 8) | (static_cast<int>(x) & 0xFF));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreByte2(XMBYTE2* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<int8_t>(tmp.x);
-    pDestination->y = static_cast<int8_t>(tmp.y);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f));
-    R = vminq_f32(R, vdupq_n_f32(127.0f));
-    int32x4_t vInt32 = vcvtq_s32_f32(R);
-    int16x4_t vInt16 = vqmovn_s32(vInt32);
-    int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16));
-    vst1_lane_u16(reinterpret_cast<uint16_t*>(pDestination),
-                  vreinterpret_u16_s8(vInt8), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V, g_ByteMin);
-    vResult = _mm_min_ps(vResult, g_ByteMax);
-    // Convert to int by rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // No SSE operations will write to 16-bit values, so we have to extract them
-    // manually
-    auto x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
-    auto y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
-    pDestination->v = static_cast<uint16_t>(
-        ((static_cast<int>(y) & 0xFF) << 8) | (static_cast<int>(x) & 0xFF));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUByteN2(XMUBYTEN2* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorSaturate(V);
-    N = XMVectorMultiplyAdd(N, g_UByteMax, g_XMOneHalf.v);
-    N = XMVectorTruncate(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<uint8_t>(tmp.x);
-    pDestination->y = static_cast<uint8_t>(tmp.y);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f));
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32(R, 255.0f);
-    R = vaddq_f32(R, g_XMOneHalf);
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32(vInt32);
-    uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16));
-    vst1_lane_u16(reinterpret_cast<uint16_t*>(pDestination),
-                  vreinterpret_u16_u8(vInt8), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    vResult = _mm_min_ps(vResult, g_XMOne);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult, g_UByteMax);
-    vResult = _mm_add_ps(vResult, g_XMOneHalf);
-    // Convert to int
-    __m128i vInt = _mm_cvttps_epi32(vResult);
-    // No SSE operations will write to 16-bit values, so we have to extract them
-    // manually
-    auto x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
-    auto y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
-    pDestination->v = static_cast<uint16_t>(
-        ((static_cast<int>(y) & 0xFF) << 8) | (static_cast<int>(x) & 0xFF));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUByte2(XMUBYTE2* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<uint8_t>(tmp.x);
-    pDestination->y = static_cast<uint8_t>(tmp.y);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0.f));
-    R = vminq_f32(R, vdupq_n_f32(255.0f));
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32(vInt32);
-    uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16));
-    vst1_lane_u16(reinterpret_cast<uint16_t*>(pDestination),
-                  vreinterpret_u16_u8(vInt8), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    vResult = _mm_min_ps(vResult, g_UByteMax);
-    // Convert to int by rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // No SSE operations will write to 16-bit values, so we have to extract them
-    // manually
-    auto x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
-    auto y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
-    pDestination->v = static_cast<uint16_t>(
-        ((static_cast<int>(y) & 0xFF) << 8) | (static_cast<int>(x) & 0xFF));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreU565(XMU565* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-    static const XMVECTORF32 Max = {{{31.0f, 63.0f, 31.0f, 0.0f}}};
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->v =
-        static_cast<uint16_t>(((static_cast<int>(tmp.z) & 0x1F) << 11) |
-                              ((static_cast<int>(tmp.y) & 0x3F) << 5) |
-                              ((static_cast<int>(tmp.x) & 0x1F)));
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 Scale = {{{1.0f, 32.f, 32.f * 64.f, 0.f}}};
-    static const XMVECTORU32 Mask = {{{0x1F, 0x3F << 5, 0x1F << 11, 0}}};
-    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
-    vResult = vminq_f32(vResult, Max);
-    vResult = vmulq_f32(vResult, Scale);
-    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
-    vResulti = vandq_u32(vResulti, Mask);
-    // Do a horizontal or of 4 entries
-    uint32x2_t vTemp = vget_low_u32(vResulti);
-    uint32x2_t vhi = vget_high_u32(vResulti);
-    vTemp = vorr_u32(vTemp, vhi);
-    vTemp = vpadd_u32(vTemp, vTemp);
-    vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    vResult = _mm_min_ps(vResult, Max);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // No SSE operations will write to 16-bit values, so we have to extract them
-    // manually
-    auto x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
-    auto y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
-    auto z = static_cast<uint16_t>(_mm_extract_epi16(vInt, 4));
-    pDestination->v = static_cast<uint16_t>(
-        ((static_cast<int>(z) & 0x1F) << 11) |
-        ((static_cast<int>(y) & 0x3F) << 5) | ((static_cast<int>(x) & 0x1F)));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat3PK(XMFLOAT3PK* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-
-    XM_ALIGNED_DATA(16) uint32_t IValue[4];
-    XMStoreFloat3A(reinterpret_cast<XMFLOAT3A*>(&IValue), V);
-
-    uint32_t Result[3];
-
-    // X & Y Channels (5-bit exponent, 6-bit mantissa)
-    for (uint32_t j = 0; j < 2; ++j) {
-        uint32_t Sign = IValue[j] & 0x80000000;
-        uint32_t I = IValue[j] & 0x7FFFFFFF;
-
-        if ((I & 0x7F800000) == 0x7F800000) {
-            // INF or NAN
-            Result[j] = 0x7C0U;
-            if ((I & 0x7FFFFF) != 0) {
-                Result[j] = 0x7FFU;
-            } else if (Sign) {
-                // -INF is clamped to 0 since 3PK is positive only
-                Result[j] = 0;
-            }
-        } else if (Sign || I < 0x35800000) {
-            // 3PK is positive only, so clamp to zero
-            Result[j] = 0;
-        } else if (I > 0x477E0000U) {
-            // The number is too large to be represented as a float11, set to
-            // max
-            Result[j] = 0x7BFU;
-        } else {
-            if (I < 0x38800000U) {
-                // The number is too small to be represented as a normalized
-                // float11 Convert it to a denormalized value.
-                uint32_t Shift = 113U - (I >> 23U);
-                I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
-            } else {
-                // Rebias the exponent to represent the value as a normalized
-                // float11
-                I += 0xC8000000U;
-            }
-
-            Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U) & 0x7ffU;
-        }
-    }
-
-    // Z Channel (5-bit exponent, 5-bit mantissa)
-    uint32_t Sign = IValue[2] & 0x80000000;
-    uint32_t I = IValue[2] & 0x7FFFFFFF;
-
-    if ((I & 0x7F800000) == 0x7F800000) {
-        // INF or NAN
-        Result[2] = 0x3E0U;
-        if (I & 0x7FFFFF) {
-            Result[2] = 0x3FFU;
-        } else if (Sign || I < 0x36000000) {
-            // -INF is clamped to 0 since 3PK is positive only
-            Result[2] = 0;
-        }
-    } else if (Sign) {
-        // 3PK is positive only, so clamp to zero
-        Result[2] = 0;
-    } else if (I > 0x477C0000U) {
-        // The number is too large to be represented as a float10, set to max
-        Result[2] = 0x3DFU;
-    } else {
-        if (I < 0x38800000U) {
-            // The number is too small to be represented as a normalized float10
-            // Convert it to a denormalized value.
-            uint32_t Shift = 113U - (I >> 23U);
-            I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
-        } else {
-            // Rebias the exponent to represent the value as a normalized
-            // float10
-            I += 0xC8000000U;
-        }
-
-        Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U) & 0x3ffU;
-    }
-
-    // Pack Result into memory
-    pDestination->v = (Result[0] & 0x7ff) | ((Result[1] & 0x7ff) << 11) |
-                      ((Result[2] & 0x3ff) << 22);
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreFloat3SE(XMFLOAT3SE* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-
-    XMFLOAT3A tmp;
-    XMStoreFloat3A(&tmp, V);
-
-    static constexpr float maxf9 = float(0x1FF << 7);
-    static constexpr float minf9 = float(1.f / (1 << 16));
-
-    float x = (tmp.x >= 0.f) ? ((tmp.x > maxf9) ? maxf9 : tmp.x) : 0.f;
-    float y = (tmp.y >= 0.f) ? ((tmp.y > maxf9) ? maxf9 : tmp.y) : 0.f;
-    float z = (tmp.z >= 0.f) ? ((tmp.z > maxf9) ? maxf9 : tmp.z) : 0.f;
-
-    const float max_xy = (x > y) ? x : y;
-    const float max_xyz = (max_xy > z) ? max_xy : z;
-
-    const float maxColor = (max_xyz > minf9) ? max_xyz : minf9;
-
-    union {
-        float f;
-        int32_t i;
-    } fi;
-    fi.f = maxColor;
-    fi.i += 0x00004000;  // round up leaving 9 bits in fraction (including
-                         // assumed 1)
-
-    auto exp = static_cast<uint32_t>(fi.i) >> 23;
-    pDestination->e = exp - 0x6f;
-
-    fi.i = static_cast<int32_t>(0x83000000 - (exp << 23));
-    float ScaleR = fi.f;
-
-    pDestination->xm =
-        static_cast<uint32_t>(MathInternal::round_to_nearest(x * ScaleR));
-    pDestination->ym =
-        static_cast<uint32_t>(MathInternal::round_to_nearest(y * ScaleR));
-    pDestination->zm =
-        static_cast<uint32_t>(MathInternal::round_to_nearest(z * ScaleR));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreHalf4(XMHALF4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    __m128i V1 = _mm_cvtps_ph(V, _MM_FROUND_TO_NEAREST_INT);
-    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDestination), V1);
-#else
-    XMFLOAT4A t;
-    XMStoreFloat4A(&t, V);
-
-    pDestination->x = XMConvertFloatToHalf(t.x);
-    pDestination->y = XMConvertFloatToHalf(t.y);
-    pDestination->z = XMConvertFloatToHalf(t.z);
-    pDestination->w = XMConvertFloatToHalf(t.w);
-#endif  // !_XM_F16C_INTRINSICS_
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreShortN4(XMSHORTN4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
-    N = XMVectorMultiply(N, g_ShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<int16_t>(tmp.x);
-    pDestination->y = static_cast<int16_t>(tmp.y);
-    pDestination->z = static_cast<int16_t>(tmp.z);
-    pDestination->w = static_cast<int16_t>(tmp.w);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(-1.f));
-    vResult = vminq_f32(vResult, vdupq_n_f32(1.0f));
-    vResult = vmulq_n_f32(vResult, 32767.0f);
-    int16x4_t vInt = vmovn_s32(vcvtq_s32_f32(vResult));
-    vst1_s16(reinterpret_cast<int16_t*>(pDestination), vInt);
-#elif defined(_XM_SSE_INTRINSICS_)
-    XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne);
-    vResult = _mm_min_ps(vResult, g_XMOne);
-    vResult = _mm_mul_ps(vResult, g_ShortMax);
-    __m128i vResulti = _mm_cvtps_epi32(vResult);
-    vResulti = _mm_packs_epi32(vResulti, vResulti);
-    _mm_store_sd(reinterpret_cast<double*>(&pDestination->x),
-                 _mm_castsi128_pd(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreShort4(XMSHORT4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_ShortMin, g_ShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<int16_t>(tmp.x);
-    pDestination->y = static_cast<int16_t>(tmp.y);
-    pDestination->z = static_cast<int16_t>(tmp.z);
-    pDestination->w = static_cast<int16_t>(tmp.w);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult = vmaxq_f32(V, g_ShortMin);
-    vResult = vminq_f32(vResult, g_ShortMax);
-    int16x4_t vInt = vmovn_s32(vcvtq_s32_f32(vResult));
-    vst1_s16(reinterpret_cast<int16_t*>(pDestination), vInt);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V, g_ShortMin);
-    vResult = _mm_min_ps(vResult, g_ShortMax);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Pack the ints into shorts
-    vInt = _mm_packs_epi32(vInt, vInt);
-    _mm_store_sd(reinterpret_cast<double*>(&pDestination->x),
-                 _mm_castsi128_pd(vInt));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUShortN4(XMUSHORTN4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorSaturate(V);
-    N = XMVectorMultiplyAdd(N, g_UShortMax, g_XMOneHalf.v);
-    N = XMVectorTruncate(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<uint16_t>(tmp.x);
-    pDestination->y = static_cast<uint16_t>(tmp.y);
-    pDestination->z = static_cast<uint16_t>(tmp.z);
-    pDestination->w = static_cast<uint16_t>(tmp.w);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
-    vResult = vminq_f32(vResult, vdupq_n_f32(1.0f));
-    vResult = vmulq_n_f32(vResult, 65535.0f);
-    vResult = vaddq_f32(vResult, g_XMOneHalf);
-    uint16x4_t vInt = vmovn_u32(vcvtq_u32_f32(vResult));
-    vst1_u16(reinterpret_cast<uint16_t*>(pDestination), vInt);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    vResult = _mm_min_ps(vResult, g_XMOne);
-    vResult = _mm_mul_ps(vResult, g_UShortMax);
-    vResult = _mm_add_ps(vResult, g_XMOneHalf);
-    // Convert to int
-    __m128i vInt = _mm_cvttps_epi32(vResult);
-    // Since the SSE pack instruction clamps using signed rules,
-    // manually extract the values to store them to memory
-    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
-    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
-    pDestination->z = static_cast<uint16_t>(_mm_extract_epi16(vInt, 4));
-    pDestination->w = static_cast<uint16_t>(_mm_extract_epi16(vInt, 6));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUShort4(XMUSHORT4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UShortMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<uint16_t>(tmp.x);
-    pDestination->y = static_cast<uint16_t>(tmp.y);
-    pDestination->z = static_cast<uint16_t>(tmp.z);
-    pDestination->w = static_cast<uint16_t>(tmp.w);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
-    vResult = vminq_f32(vResult, g_UShortMax);
-    uint16x4_t vInt = vmovn_u32(vcvtq_u32_f32(vResult));
-    vst1_u16(reinterpret_cast<uint16_t*>(pDestination), vInt);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    vResult = _mm_min_ps(vResult, g_UShortMax);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // Since the SSE pack instruction clamps using signed rules,
-    // manually extract the values to store them to memory
-    pDestination->x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
-    pDestination->y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
-    pDestination->z = static_cast<uint16_t>(_mm_extract_epi16(vInt, 4));
-    pDestination->w = static_cast<uint16_t>(_mm_extract_epi16(vInt, 6));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreXDecN4(XMXDECN4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-    static const XMVECTORF32 Min = {{{-1.0f, -1.0f, -1.0f, 0.0f}}};
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    static const XMVECTORF32 Scale = {{{511.0f, 511.0f, 511.0f, 3.0f}}};
-
-    XMVECTOR N = XMVectorClamp(V, Min.v, g_XMOne.v);
-    N = XMVectorMultiply(N, Scale.v);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->v =
-        static_cast<uint32_t>((static_cast<int>(tmp.w) << 30) |
-                              ((static_cast<int>(tmp.z) & 0x3FF) << 20) |
-                              ((static_cast<int>(tmp.y) & 0x3FF) << 10) |
-                              (static_cast<int>(tmp.x) & 0x3FF));
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 Scale = {
-        {{511.0f, 511.0f * 1024.0f, 511.0f * 1048576.0f, 3.0f * 536870912.0f}}};
-    static const XMVECTORI32 ScaleMask = {
-        {{0x3FF, 0x3FF << 10, 0x3FF << 20, 0x3 << 29}}};
-    float32x4_t vResult = vmaxq_f32(V, Min);
-    vResult = vminq_f32(vResult, vdupq_n_f32(1.0f));
-    vResult = vmulq_f32(vResult, Scale);
-    int32x4_t vResulti = vcvtq_s32_f32(vResult);
-    vResulti = vandq_s32(vResulti, ScaleMask);
-    int32x4_t vResultw = vandq_s32(vResulti, g_XMMaskW);
-    vResulti = vaddq_s32(vResulti, vResultw);
-    // Do a horizontal or of all 4 entries
-    uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti));
-    uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti));
-    vTemp = vorr_u32(vTemp, vhi);
-    vTemp = vpadd_u32(vTemp, vTemp);
-    vst1_lane_u32(&pDestination->v, vTemp, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 Scale = {
-        {{511.0f, 511.0f * 1024.0f, 511.0f * 1048576.0f, 3.0f * 536870912.0f}}};
-    static const XMVECTORI32 ScaleMask = {
-        {{0x3FF, 0x3FF << 10, 0x3FF << 20, 0x3 << 29}}};
-    XMVECTOR vResult = _mm_max_ps(V, Min);
-    vResult = _mm_min_ps(vResult, g_XMOne);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult, Scale);
-    // Convert to int (W is unsigned)
-    __m128i vResulti = _mm_cvtps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti, ScaleMask);
-    // To fix W, add itself to shift it up to <<30 instead of <<29
-    __m128i vResultw = _mm_and_si128(vResulti, g_XMMaskW);
-    vResulti = _mm_add_epi32(vResulti, vResultw);
-    // Do a horizontal or of all 4 entries
-    vResult =
-        XM_PERMUTE_PS(_mm_castsi128_ps(vResulti), _MM_SHUFFLE(0, 3, 2, 1));
-    vResulti = _mm_or_si128(vResulti, _mm_castps_si128(vResult));
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 3, 2, 1));
-    vResulti = _mm_or_si128(vResulti, _mm_castps_si128(vResult));
-    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(0, 3, 2, 1));
-    vResulti = _mm_or_si128(vResulti, _mm_castps_si128(vResult));
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),
-                 _mm_castsi128_ps(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4996)
-// C4996: ignore deprecation warning
-#endif
-
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreXDec4(XMXDEC4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-    static const XMVECTORF32 MinXDec4 = {{{-511.0f, -511.0f, -511.0f, 0.0f}}};
-    static const XMVECTORF32 MaxXDec4 = {{{511.0f, 511.0f, 511.0f, 3.0f}}};
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, MinXDec4, MaxXDec4);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->v =
-        static_cast<uint32_t>((static_cast<int>(tmp.w) << 30) |
-                              ((static_cast<int>(tmp.z) & 0x3FF) << 20) |
-                              ((static_cast<int>(tmp.y) & 0x3FF) << 10) |
-                              ((static_cast<int>(tmp.x) & 0x3FF)));
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 ScaleXDec4 = {
-        {{1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f,
-          1024.0f * 1024.0f * 1024.0f / 2.0f}}};
-    static const XMVECTORI32 MaskXDec4 = {
-        {{0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1)}}};
-    float32x4_t vResult = vmaxq_f32(V, MinXDec4);
-    vResult = vminq_f32(vResult, MaxXDec4);
-    vResult = vmulq_f32(vResult, ScaleXDec4);
-    int32x4_t vResulti = vcvtq_s32_f32(vResult);
-    vResulti = vandq_s32(vResulti, MaskXDec4);
-    // Do a horizontal or of 4 entries
-    uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti));
-    uint32x2_t vTemp2 = vget_high_u32(vreinterpretq_u32_s32(vResulti));
-    vTemp = vorr_u32(vTemp, vTemp2);
-    // Perform a single bit left shift on y|w
-    vTemp2 = vdup_lane_u32(vTemp, 1);
-    vTemp2 = vadd_u32(vTemp2, vTemp2);
-    vTemp = vorr_u32(vTemp, vTemp2);
-    vst1_lane_u32(&pDestination->v, vTemp, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 ScaleXDec4 = {
-        {{1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f,
-          1024.0f * 1024.0f * 1024.0f / 2.0f}}};
-    static const XMVECTORI32 MaskXDec4 = {
-        {{0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1)}}};
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V, MinXDec4);
-    vResult = _mm_min_ps(vResult, MaxXDec4);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult, ScaleXDec4);
-    // Convert to int
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti, MaskXDec4);
-    // Do a horizontal or of 4 entries
-    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
-    // x = x|z, y = y|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    // Move Z to the x position
-    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
-    // Perform a single bit left shift on y|w
-    vResulti2 = _mm_add_epi32(vResulti2, vResulti2);
-    // i = x|y|z|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),
-                 _mm_castsi128_ps(vResulti));
-#endif
-}
-
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUDecN4(XMUDECN4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    static const XMVECTORF32 Scale = {{{1023.0f, 1023.0f, 1023.0f, 3.0f}}};
-
-    XMVECTOR N = XMVectorSaturate(V);
-    N = XMVectorMultiply(N, Scale.v);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->v =
-        static_cast<uint32_t>((static_cast<int>(tmp.w) << 30) |
-                              ((static_cast<int>(tmp.z) & 0x3FF) << 20) |
-                              ((static_cast<int>(tmp.y) & 0x3FF) << 10) |
-                              ((static_cast<int>(tmp.x) & 0x3FF)));
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 ScaleUDecN4 = {
-        {{1023.0f, 1023.0f * 1024.0f * 0.5f, 1023.0f * 1024.0f * 1024.0f,
-          3.0f * 1024.0f * 1024.0f * 1024.0f * 0.5f}}};
-    static const XMVECTORI32 MaskUDecN4 = {
-        {{0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1)}}};
-    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0.f));
-    vResult = vminq_f32(vResult, vdupq_n_f32(1.f));
-    vResult = vmulq_f32(vResult, ScaleUDecN4);
-    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
-    vResulti = vandq_u32(vResulti, MaskUDecN4);
-    // Do a horizontal or of 4 entries
-    uint32x2_t vTemp = vget_low_u32(vResulti);
-    uint32x2_t vTemp2 = vget_high_u32(vResulti);
-    vTemp = vorr_u32(vTemp, vTemp2);
-    // Perform a single bit left shift on y|w
-    vTemp2 = vdup_lane_u32(vTemp, 1);
-    vTemp2 = vadd_u32(vTemp2, vTemp2);
-    vTemp = vorr_u32(vTemp, vTemp2);
-    vst1_lane_u32(&pDestination->v, vTemp, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 ScaleUDecN4 = {
-        {{1023.0f, 1023.0f * 1024.0f * 0.5f, 1023.0f * 1024.0f * 1024.0f,
-          3.0f * 1024.0f * 1024.0f * 1024.0f * 0.5f}}};
-    static const XMVECTORI32 MaskUDecN4 = {
-        {{0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1)}}};
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    vResult = _mm_min_ps(vResult, g_XMOne);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult, ScaleUDecN4);
-    // Convert to int
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti, MaskUDecN4);
-    // Do a horizontal or of 4 entries
-    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
-    // x = x|z, y = y|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    // Move Z to the x position
-    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
-    // Perform a left shift by one bit on y|w
-    vResulti2 = _mm_add_epi32(vResulti2, vResulti2);
-    // i = x|y|z|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),
-                 _mm_castsi128_ps(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUDecN4_XR(XMUDECN4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-    static const XMVECTORF32 Scale = {{{510.0f, 510.0f, 510.0f, 3.0f}}};
-    static const XMVECTORF32 Bias = {{{384.0f, 384.0f, 384.0f, 0.0f}}};
-    static const XMVECTORF32 C = {{{1023.f, 1023.f, 1023.f, 3.f}}};
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorMultiplyAdd(V, Scale, Bias);
-    N = XMVectorClamp(N, g_XMZero, C);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->v =
-        static_cast<uint32_t>((static_cast<int>(tmp.w) << 30) |
-                              ((static_cast<int>(tmp.z) & 0x3FF) << 20) |
-                              ((static_cast<int>(tmp.y) & 0x3FF) << 10) |
-                              ((static_cast<int>(tmp.x) & 0x3FF)));
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 Shift = {{{1.0f, 1024.0f * 0.5f, 1024.0f * 1024.0f,
-                                        1024.0f * 1024.0f * 1024.0f * 0.5f}}};
-    static const XMVECTORU32 MaskUDecN4 = {
-        {{0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1)}}};
-    float32x4_t vResult = vmlaq_f32(Bias, V, Scale);
-    vResult = vmaxq_f32(vResult, vdupq_n_f32(0.f));
-    vResult = vminq_f32(vResult, C);
-    vResult = vmulq_f32(vResult, Shift);
-    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
-    vResulti = vandq_u32(vResulti, MaskUDecN4);
-    // Do a horizontal or of 4 entries
-    uint32x2_t vTemp = vget_low_u32(vResulti);
-    uint32x2_t vTemp2 = vget_high_u32(vResulti);
-    vTemp = vorr_u32(vTemp, vTemp2);
-    // Perform a single bit left shift on y|w
-    vTemp2 = vdup_lane_u32(vTemp, 1);
-    vTemp2 = vadd_u32(vTemp2, vTemp2);
-    vTemp = vorr_u32(vTemp, vTemp2);
-    vst1_lane_u32(&pDestination->v, vTemp, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 Shift = {{{1.0f, 1024.0f * 0.5f, 1024.0f * 1024.0f,
-                                        1024.0f * 1024.0f * 1024.0f * 0.5f}}};
-    static const XMVECTORU32 MaskUDecN4 = {
-        {{0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1)}}};
-    // Scale & bias
-    XMVECTOR vResult = XM_FMADD_PS(V, Scale, Bias);
-    // Clamp to bounds
-    vResult = _mm_max_ps(vResult, g_XMZero);
-    vResult = _mm_min_ps(vResult, C);
-    // Scale by shift values
-    vResult = _mm_mul_ps(vResult, Shift);
-    // Convert to int
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti, MaskUDecN4);
-    // Do a horizontal or of 4 entries
-    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
-    // x = x|z, y = y|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    // Move Z to the x position
-    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
-    // Perform a left shift by one bit on y|w
-    vResulti2 = _mm_add_epi32(vResulti2, vResulti2);
-    // i = x|y|z|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),
-                 _mm_castsi128_ps(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUDec4(XMUDEC4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-    static const XMVECTORF32 MaxUDec4 = {{{1023.0f, 1023.0f, 1023.0f, 3.0f}}};
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), MaxUDec4);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->v =
-        static_cast<uint32_t>((static_cast<int>(tmp.w) << 30) |
-                              ((static_cast<int>(tmp.z) & 0x3FF) << 20) |
-                              ((static_cast<int>(tmp.y) & 0x3FF) << 10) |
-                              ((static_cast<int>(tmp.x) & 0x3FF)));
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 ScaleUDec4 = {
-        {{1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f,
-          1024.0f * 1024.0f * 1024.0f / 2.0f}}};
-    static const XMVECTORI32 MaskUDec4 = {
-        {{0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1)}}};
-    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0.f));
-    vResult = vminq_f32(vResult, MaxUDec4);
-    vResult = vmulq_f32(vResult, ScaleUDec4);
-    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
-    vResulti = vandq_u32(vResulti, MaskUDec4);
-    // Do a horizontal or of 4 entries
-    uint32x2_t vTemp = vget_low_u32(vResulti);
-    uint32x2_t vTemp2 = vget_high_u32(vResulti);
-    vTemp = vorr_u32(vTemp, vTemp2);
-    // Perform a single bit left shift on y|w
-    vTemp2 = vdup_lane_u32(vTemp, 1);
-    vTemp2 = vadd_u32(vTemp2, vTemp2);
-    vTemp = vorr_u32(vTemp, vTemp2);
-    vst1_lane_u32(&pDestination->v, vTemp, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 ScaleUDec4 = {
-        {{1.0f, 1024.0f / 2.0f, 1024.0f * 1024.0f,
-          1024.0f * 1024.0f * 1024.0f / 2.0f}}};
-    static const XMVECTORI32 MaskUDec4 = {
-        {{0x3FF, 0x3FF << (10 - 1), 0x3FF << 20, 0x3 << (30 - 1)}}};
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    vResult = _mm_min_ps(vResult, MaxUDec4);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult, ScaleUDec4);
-    // Convert to int
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti, MaskUDec4);
-    // Do a horizontal or of 4 entries
-    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
-    // x = x|z, y = y|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    // Move Z to the x position
-    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
-    // Perform a left shift by one bit on y|w
-    vResulti2 = _mm_add_epi32(vResulti2, vResulti2);
-    // i = x|y|z|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),
-                 _mm_castsi128_ps(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4996)
-// C4996: ignore deprecation warning
-#endif
-
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreDecN4(XMDECN4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    static const XMVECTORF32 Scale = {{{511.0f, 511.0f, 511.0f, 1.0f}}};
-
-    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
-    N = XMVectorMultiply(N, Scale.v);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->v =
-        static_cast<uint32_t>((static_cast<int>(tmp.w) << 30) |
-                              ((static_cast<int>(tmp.z) & 0x3FF) << 20) |
-                              ((static_cast<int>(tmp.y) & 0x3FF) << 10) |
-                              ((static_cast<int>(tmp.x) & 0x3FF)));
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 ScaleDecN4 = {
-        {{511.0f, 511.0f * 1024.0f, 511.0f * 1024.0f * 1024.0f,
-          1.0f * 1024.0f * 1024.0f * 1024.0f}}};
-    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(-1.f));
-    vResult = vminq_f32(vResult, vdupq_n_f32(1.f));
-    vResult = vmulq_f32(vResult, ScaleDecN4);
-    int32x4_t vResulti = vcvtq_s32_f32(vResult);
-    vResulti = vandq_s32(vResulti, g_XMMaskDec4);
-    // Do a horizontal or of 4 entries
-    uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti));
-    uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti));
-    vTemp = vorr_u32(vTemp, vhi);
-    vTemp = vpadd_u32(vTemp, vTemp);
-    vst1_lane_u32(&pDestination->v, vTemp, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 ScaleDecN4 = {
-        {{511.0f, 511.0f * 1024.0f, 511.0f * 1024.0f * 1024.0f,
-          1.0f * 1024.0f * 1024.0f * 1024.0f}}};
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne);
-    vResult = _mm_min_ps(vResult, g_XMOne);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult, ScaleDecN4);
-    // Convert to int
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti, g_XMMaskDec4);
-    // Do a horizontal or of 4 entries
-    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
-    // x = x|z, y = y|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    // Move Z to the x position
-    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
-    // i = x|y|z|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),
-                 _mm_castsi128_ps(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreDec4(XMDEC4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-    static const XMVECTORF32 MinDec4 = {{{-511.0f, -511.0f, -511.0f, -1.0f}}};
-    static const XMVECTORF32 MaxDec4 = {{{511.0f, 511.0f, 511.0f, 1.0f}}};
-
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, MinDec4, MaxDec4);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->v =
-        static_cast<uint32_t>((static_cast<int>(tmp.w) << 30) |
-                              ((static_cast<int>(tmp.z) & 0x3FF) << 20) |
-                              ((static_cast<int>(tmp.y) & 0x3FF) << 10) |
-                              ((static_cast<int>(tmp.x) & 0x3FF)));
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 ScaleDec4 = {
-        {{1.0f, 1024.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f}}};
-    float32x4_t vResult = vmaxq_f32(V, MinDec4);
-    vResult = vminq_f32(vResult, MaxDec4);
-    vResult = vmulq_f32(vResult, ScaleDec4);
-    int32x4_t vResulti = vcvtq_s32_f32(vResult);
-    vResulti = vandq_s32(vResulti, g_XMMaskDec4);
-    // Do a horizontal or of all 4 entries
-    uint32x2_t vTemp = vget_low_u32(vreinterpretq_u32_s32(vResulti));
-    uint32x2_t vhi = vget_high_u32(vreinterpretq_u32_s32(vResulti));
-    vTemp = vorr_u32(vTemp, vhi);
-    vTemp = vpadd_u32(vTemp, vTemp);
-    vst1_lane_u32(&pDestination->v, vTemp, 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 ScaleDec4 = {
-        {{1.0f, 1024.0f, 1024.0f * 1024.0f, 1024.0f * 1024.0f * 1024.0f}}};
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V, MinDec4);
-    vResult = _mm_min_ps(vResult, MaxDec4);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult, ScaleDec4);
-    // Convert to int
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti, g_XMMaskDec4);
-    // Do a horizontal or of 4 entries
-    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
-    // x = x|z, y = y|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    // Move Z to the x position
-    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
-    // i = x|y|z|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),
-                 _mm_castsi128_ps(vResulti));
-#endif
-}
-
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUByteN4(XMUBYTEN4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorSaturate(V);
-    N = XMVectorMultiply(N, g_UByteMax);
-    N = XMVectorTruncate(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<uint8_t>(tmp.x);
-    pDestination->y = static_cast<uint8_t>(tmp.y);
-    pDestination->z = static_cast<uint8_t>(tmp.z);
-    pDestination->w = static_cast<uint8_t>(tmp.w);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0));
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32(R, 255.0f);
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32(vInt32);
-    uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16));
-    vst1_lane_u32(&pDestination->v, vreinterpret_u32_u8(vInt8), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 ScaleUByteN4 = {
-        {{255.0f, 255.0f * 256.0f * 0.5f, 255.0f * 256.0f * 256.0f,
-          255.0f * 256.0f * 256.0f * 256.0f * 0.5f}}};
-    static const XMVECTORI32 MaskUByteN4 = {
-        {{0xFF, 0xFF << (8 - 1), 0xFF << 16, 0xFF << (24 - 1)}}};
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    vResult = _mm_min_ps(vResult, g_XMOne);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult, ScaleUByteN4);
-    // Convert to int
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti, MaskUByteN4);
-    // Do a horizontal or of 4 entries
-    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
-    // x = x|z, y = y|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    // Move Z to the x position
-    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
-    // Perform a single bit left shift to fix y|w
-    vResulti2 = _mm_add_epi32(vResulti2, vResulti2);
-    // i = x|y|z|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),
-                 _mm_castsi128_ps(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUByte4(XMUBYTE4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), g_UByteMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<uint8_t>(tmp.x);
-    pDestination->y = static_cast<uint8_t>(tmp.y);
-    pDestination->z = static_cast<uint8_t>(tmp.z);
-    pDestination->w = static_cast<uint8_t>(tmp.w);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(0));
-    R = vminq_f32(R, vdupq_n_f32(255.0f));
-    uint32x4_t vInt32 = vcvtq_u32_f32(R);
-    uint16x4_t vInt16 = vqmovn_u32(vInt32);
-    uint8x8_t vInt8 = vqmovn_u16(vcombine_u16(vInt16, vInt16));
-    vst1_lane_u32(&pDestination->v, vreinterpret_u32_u8(vInt8), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 ScaleUByte4 = {
-        {{1.0f, 256.0f * 0.5f, 256.0f * 256.0f,
-          256.0f * 256.0f * 256.0f * 0.5f}}};
-    static const XMVECTORI32 MaskUByte4 = {
-        {{0xFF, 0xFF << (8 - 1), 0xFF << 16, 0xFF << (24 - 1)}}};
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    vResult = _mm_min_ps(vResult, g_UByteMax);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult, ScaleUByte4);
-    // Convert to int by rounding
-    __m128i vResulti = _mm_cvtps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti, MaskUByte4);
-    // Do a horizontal or of 4 entries
-    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
-    // x = x|z, y = y|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    // Move Z to the x position
-    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
-    // Perform a single bit left shift to fix y|w
-    vResulti2 = _mm_add_epi32(vResulti2, vResulti2);
-    // i = x|y|z|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),
-                 _mm_castsi128_ps(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreByteN4(XMBYTEN4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
-    N = XMVectorMultiply(N, g_ByteMax);
-    N = XMVectorTruncate(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<int8_t>(tmp.x);
-    pDestination->y = static_cast<int8_t>(tmp.y);
-    pDestination->z = static_cast<int8_t>(tmp.z);
-    pDestination->w = static_cast<int8_t>(tmp.w);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-1.f));
-    R = vminq_f32(R, vdupq_n_f32(1.0f));
-    R = vmulq_n_f32(R, 127.0f);
-    int32x4_t vInt32 = vcvtq_s32_f32(R);
-    int16x4_t vInt16 = vqmovn_s32(vInt32);
-    int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16));
-    vst1_lane_u32(&pDestination->v, vreinterpret_u32_s8(vInt8), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 ScaleByteN4 = {
-        {{127.0f, 127.0f * 256.0f, 127.0f * 256.0f * 256.0f,
-          127.0f * 256.0f * 256.0f * 256.0f}}};
-    static const XMVECTORI32 MaskByteN4 = {
-        {{0xFF, 0xFF << 8, 0xFF << 16, static_cast<int>(0xFF000000)}}};
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V, g_XMNegativeOne);
-    vResult = _mm_min_ps(vResult, g_XMOne);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult, ScaleByteN4);
-    // Convert to int
-    __m128i vResulti = _mm_cvttps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti, MaskByteN4);
-    // Do a horizontal or of 4 entries
-    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
-    // x = x|z, y = y|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    // Move Z to the x position
-    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
-    // i = x|y|z|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),
-                 _mm_castsi128_ps(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreByte4(XMBYTE4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, g_ByteMin, g_ByteMax);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->x = static_cast<int8_t>(tmp.x);
-    pDestination->y = static_cast<int8_t>(tmp.y);
-    pDestination->z = static_cast<int8_t>(tmp.z);
-    pDestination->w = static_cast<int8_t>(tmp.w);
-
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    float32x4_t R = vmaxq_f32(V, vdupq_n_f32(-127.f));
-    R = vminq_f32(R, vdupq_n_f32(127.f));
-    int32x4_t vInt32 = vcvtq_s32_f32(R);
-    int16x4_t vInt16 = vqmovn_s32(vInt32);
-    int8x8_t vInt8 = vqmovn_s16(vcombine_s16(vInt16, vInt16));
-    vst1_lane_u32(&pDestination->v, vreinterpret_u32_s8(vInt8), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    static const XMVECTORF32 ScaleByte4 = {
-        {{1.0f, 256.0f, 256.0f * 256.0f, 256.0f * 256.0f * 256.0f}}};
-    static const XMVECTORI32 MaskByte4 = {
-        {{0xFF, 0xFF << 8, 0xFF << 16, static_cast<int>(0xFF000000)}}};
-    // Clamp to bounds
-    XMVECTOR vResult = _mm_max_ps(V, g_ByteMin);
-    vResult = _mm_min_ps(vResult, g_ByteMax);
-    // Scale by multiplication
-    vResult = _mm_mul_ps(vResult, ScaleByte4);
-    // Convert to int by rounding
-    __m128i vResulti = _mm_cvtps_epi32(vResult);
-    // Mask off any fraction
-    vResulti = _mm_and_si128(vResulti, MaskByte4);
-    // Do a horizontal or of 4 entries
-    __m128i vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(3, 2, 3, 2));
-    // x = x|z, y = y|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    // Move Z to the x position
-    vResulti2 = _mm_shuffle_epi32(vResulti, _MM_SHUFFLE(1, 1, 1, 1));
-    // i = x|y|z|w
-    vResulti = _mm_or_si128(vResulti, vResulti2);
-    _mm_store_ss(reinterpret_cast<float*>(&pDestination->v),
-                 _mm_castsi128_ps(vResulti));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreUNibble4(XMUNIBBLE4* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-    static const XMVECTORF32 Max = {{{15.0f, 15.0f, 15.0f, 15.0f}}};
-#if defined(_XM_NO_INTRINSICS_)
-
-    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->v =
-        static_cast<uint16_t>(((static_cast<int>(tmp.w) & 0xF) << 12) |
-                              ((static_cast<int>(tmp.z) & 0xF) << 8) |
-                              ((static_cast<int>(tmp.y) & 0xF) << 4) |
-                              (static_cast<int>(tmp.x) & 0xF));
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 Scale = {
-        {{1.0f, 16.f, 16.f * 16.f, 16.f * 16.f * 16.f}}};
-    static const XMVECTORU32 Mask = {{{0xF, 0xF << 4, 0xF << 8, 0xF << 12}}};
-    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
-    vResult = vminq_f32(vResult, Max);
-    vResult = vmulq_f32(vResult, Scale);
-    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
-    vResulti = vandq_u32(vResulti, Mask);
-    // Do a horizontal or of 4 entries
-    uint32x2_t vTemp = vget_low_u32(vResulti);
-    uint32x2_t vhi = vget_high_u32(vResulti);
-    vTemp = vorr_u32(vTemp, vhi);
-    vTemp = vpadd_u32(vTemp, vTemp);
-    vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    vResult = _mm_min_ps(vResult, Max);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // No SSE operations will write to 16-bit values, so we have to extract them
-    // manually
-    auto x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
-    auto y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
-    auto z = static_cast<uint16_t>(_mm_extract_epi16(vInt, 4));
-    auto w = static_cast<uint16_t>(_mm_extract_epi16(vInt, 6));
-    pDestination->v = static_cast<uint16_t>(
-        ((static_cast<int>(w) & 0xF) << 12) |
-        ((static_cast<int>(z) & 0xF) << 8) |
-        ((static_cast<int>(y) & 0xF) << 4) | ((static_cast<int>(x) & 0xF)));
-#endif
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline void XM_CALLCONV
-XMStoreU555(XMU555* pDestination, FXMVECTOR V) noexcept {
-    assert(pDestination);
-    static const XMVECTORF32 Max = {{{31.0f, 31.0f, 31.0f, 1.0f}}};
-
-#if defined(_XM_NO_INTRINSICS_)
-    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v);
-    N = XMVectorRound(N);
-
-    XMFLOAT4A tmp;
-    XMStoreFloat4A(&tmp, N);
-
-    pDestination->v =
-        static_cast<uint16_t>(((tmp.w > 0.f) ? 0x8000 : 0) |
-                              ((static_cast<int>(tmp.z) & 0x1F) << 10) |
-                              ((static_cast<int>(tmp.y) & 0x1F) << 5) |
-                              (static_cast<int>(tmp.x) & 0x1F));
-#elif defined(_XM_ARM_NEON_INTRINSICS_)
-    static const XMVECTORF32 Scale = {
-        {{1.0f, 32.f / 2.f, 32.f * 32.f, 32.f * 32.f * 32.f / 2.f}}};
-    static const XMVECTORU32 Mask = {
-        {{0x1F, 0x1F << (5 - 1), 0x1F << 10, 0x1 << (15 - 1)}}};
-    float32x4_t vResult = vmaxq_f32(V, vdupq_n_f32(0));
-    vResult = vminq_f32(vResult, Max);
-    vResult = vmulq_f32(vResult, Scale);
-    uint32x4_t vResulti = vcvtq_u32_f32(vResult);
-    vResulti = vandq_u32(vResulti, Mask);
-    // Do a horizontal or of 4 entries
-    uint32x2_t vTemp = vget_low_u32(vResulti);
-    uint32x2_t vTemp2 = vget_high_u32(vResulti);
-    vTemp = vorr_u32(vTemp, vTemp2);
-    // Perform a single bit left shift on y|w
-    vTemp2 = vdup_lane_u32(vTemp, 1);
-    vTemp2 = vadd_u32(vTemp2, vTemp2);
-    vTemp = vorr_u32(vTemp, vTemp2);
-    vst1_lane_u16(&pDestination->v, vreinterpret_u16_u32(vTemp), 0);
-#elif defined(_XM_SSE_INTRINSICS_)
-    // Bounds check
-    XMVECTOR vResult = _mm_max_ps(V, g_XMZero);
-    vResult = _mm_min_ps(vResult, Max);
-    // Convert to int with rounding
-    __m128i vInt = _mm_cvtps_epi32(vResult);
-    // No SSE operations will write to 16-bit values, so we have to extract them
-    // manually
-    auto x = static_cast<uint16_t>(_mm_extract_epi16(vInt, 0));
-    auto y = static_cast<uint16_t>(_mm_extract_epi16(vInt, 2));
-    auto z = static_cast<uint16_t>(_mm_extract_epi16(vInt, 4));
-    auto w = static_cast<uint16_t>(_mm_extract_epi16(vInt, 6));
-    pDestination->v = static_cast<uint16_t>(
-        (static_cast<int>(w) ? 0x8000 : 0) |
-        ((static_cast<int>(z) & 0x1F) << 10) |
-        ((static_cast<int>(y) & 0x1F) << 5) | ((static_cast<int>(x) & 0x1F)));
-#endif
-}
-
-/****************************************************************************
- *
- * XMCOLOR operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMCOLOR::XMCOLOR(float _r, float _g, float _b, float _a) noexcept {
-    XMStoreColor(this, XMVectorSet(_r, _g, _b, _a));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMCOLOR::XMCOLOR(const float* pArray) noexcept {
-    XMStoreColor(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMHALF2 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMHALF2::XMHALF2(float _x, float _y) noexcept {
-    x = XMConvertFloatToHalf(_x);
-    y = XMConvertFloatToHalf(_y);
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMHALF2::XMHALF2(const float* pArray) noexcept {
-    assert(pArray != nullptr);
-    x = XMConvertFloatToHalf(pArray[0]);
-    y = XMConvertFloatToHalf(pArray[1]);
-}
-
-/****************************************************************************
- *
- * XMSHORTN2 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMSHORTN2::XMSHORTN2(float _x, float _y) noexcept {
-    XMStoreShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMSHORTN2::XMSHORTN2(
-    const float* pArray) noexcept {
-    XMStoreShortN2(this,
-                   XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMSHORT2 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMSHORT2::XMSHORT2(float _x, float _y) noexcept {
-    XMStoreShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMSHORT2::XMSHORT2(const float* pArray) noexcept {
-    XMStoreShort2(this,
-                  XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMUSHORTN2 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMUSHORTN2::XMUSHORTN2(float _x, float _y) noexcept {
-    XMStoreUShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMUSHORTN2::XMUSHORTN2(
-    const float* pArray) noexcept {
-    XMStoreUShortN2(this,
-                    XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMUSHORT2 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMUSHORT2::XMUSHORT2(float _x, float _y) noexcept {
-    XMStoreUShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMUSHORT2::XMUSHORT2(
-    const float* pArray) noexcept {
-    XMStoreUShort2(this,
-                   XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMBYTEN2 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMBYTEN2::XMBYTEN2(float _x, float _y) noexcept {
-    XMStoreByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMBYTEN2::XMBYTEN2(const float* pArray) noexcept {
-    XMStoreByteN2(this,
-                  XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMBYTE2 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMBYTE2::XMBYTE2(float _x, float _y) noexcept {
-    XMStoreByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMBYTE2::XMBYTE2(const float* pArray) noexcept {
-    XMStoreByte2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMUBYTEN2 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMUBYTEN2::XMUBYTEN2(float _x, float _y) noexcept {
-    XMStoreUByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMUBYTEN2::XMUBYTEN2(
-    const float* pArray) noexcept {
-    XMStoreUByteN2(this,
-                   XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMUBYTE2 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMUBYTE2::XMUBYTE2(float _x, float _y) noexcept {
-    XMStoreUByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMUBYTE2::XMUBYTE2(const float* pArray) noexcept {
-    XMStoreUByte2(this,
-                  XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMU565 operators
- *
- ****************************************************************************/
-
-inline XMU565::XMU565(float _x, float _y, float _z) noexcept {
-    XMStoreU565(this, XMVectorSet(_x, _y, _z, 0.0f));
-}
-
-_Use_decl_annotations_ inline XMU565::XMU565(const float* pArray) noexcept {
-    XMStoreU565(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMFLOAT3PK operators
- *
- ****************************************************************************/
-
-inline XMFLOAT3PK::XMFLOAT3PK(float _x, float _y, float _z) noexcept {
-    XMStoreFloat3PK(this, XMVectorSet(_x, _y, _z, 0.0f));
-}
-
-_Use_decl_annotations_ inline XMFLOAT3PK::XMFLOAT3PK(
-    const float* pArray) noexcept {
-    XMStoreFloat3PK(this,
-                    XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMFLOAT3SE operators
- *
- ****************************************************************************/
-
-inline XMFLOAT3SE::XMFLOAT3SE(float _x, float _y, float _z) noexcept {
-    XMStoreFloat3SE(this, XMVectorSet(_x, _y, _z, 0.0f));
-}
-
-_Use_decl_annotations_ inline XMFLOAT3SE::XMFLOAT3SE(
-    const float* pArray) noexcept {
-    XMStoreFloat3SE(this,
-                    XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMHALF4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMHALF4::XMHALF4(float _x, float _y, float _z, float _w) noexcept {
-    x = XMConvertFloatToHalf(_x);
-    y = XMConvertFloatToHalf(_y);
-    z = XMConvertFloatToHalf(_z);
-    w = XMConvertFloatToHalf(_w);
-}
-
-//------------------------------------------------------------------------------
-
-_Use_decl_annotations_ inline XMHALF4::XMHALF4(const float* pArray) noexcept {
-    XMConvertFloatToHalfStream(&x, sizeof(HALF), pArray, sizeof(float), 4);
-}
-
-/****************************************************************************
- *
- * XMSHORTN4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMSHORTN4::XMSHORTN4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreShortN4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMSHORTN4::XMSHORTN4(
-    const float* pArray) noexcept {
-    XMStoreShortN4(this,
-                   XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMSHORT4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMSHORT4::XMSHORT4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreShort4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMSHORT4::XMSHORT4(const float* pArray) noexcept {
-    XMStoreShort4(this,
-                  XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMUSHORTN4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMUSHORTN4::XMUSHORTN4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreUShortN4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMUSHORTN4::XMUSHORTN4(
-    const float* pArray) noexcept {
-    XMStoreUShortN4(this,
-                    XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMUSHORT4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMUSHORT4::XMUSHORT4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreUShort4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMUSHORT4::XMUSHORT4(
-    const float* pArray) noexcept {
-    XMStoreUShort4(this,
-                   XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMXDECN4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMXDECN4::XMXDECN4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreXDecN4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMXDECN4::XMXDECN4(const float* pArray) noexcept {
-    XMStoreXDecN4(this,
-                  XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMXDEC4 operators
- *
- ****************************************************************************/
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4996)
-// C4996: ignore deprecation warning
-#endif
-
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-//------------------------------------------------------------------------------
-
-inline XMXDEC4::XMXDEC4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreXDec4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMXDEC4::XMXDEC4(const float* pArray) noexcept {
-    XMStoreXDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMDECN4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMDECN4::XMDECN4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreDecN4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMDECN4::XMDECN4(const float* pArray) noexcept {
-    XMStoreDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMDEC4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMDEC4::XMDEC4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreDec4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMDEC4::XMDEC4(const float* pArray) noexcept {
-    XMStoreDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-/****************************************************************************
- *
- * XMUDECN4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMUDECN4::XMUDECN4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreUDecN4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMUDECN4::XMUDECN4(const float* pArray) noexcept {
-    XMStoreUDecN4(this,
-                  XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMUDEC4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMUDEC4::XMUDEC4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreUDec4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMUDEC4::XMUDEC4(const float* pArray) noexcept {
-    XMStoreUDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMBYTEN4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMBYTEN4::XMBYTEN4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreByteN4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMBYTEN4::XMBYTEN4(const float* pArray) noexcept {
-    XMStoreByteN4(this,
-                  XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMBYTE4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMBYTE4::XMBYTE4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreByte4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMBYTE4::XMBYTE4(const float* pArray) noexcept {
-    XMStoreByte4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMUBYTEN4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMUBYTEN4::XMUBYTEN4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreUByteN4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMUBYTEN4::XMUBYTEN4(
-    const float* pArray) noexcept {
-    XMStoreUByteN4(this,
-                   XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMUBYTE4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMUBYTE4::XMUBYTE4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreUByte4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMUBYTE4::XMUBYTE4(const float* pArray) noexcept {
-    XMStoreUByte4(this,
-                  XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMUNIBBLE4 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMUNIBBLE4::XMUNIBBLE4(float _x, float _y, float _z, float _w) noexcept {
-    XMStoreUNibble4(this, XMVectorSet(_x, _y, _z, _w));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMUNIBBLE4::XMUNIBBLE4(
-    const float* pArray) noexcept {
-    XMStoreUNibble4(this,
-                    XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray)));
-}
-
-/****************************************************************************
- *
- * XMU555 operators
- *
- ****************************************************************************/
-
-//------------------------------------------------------------------------------
-
-inline XMU555::XMU555(float _x, float _y, float _z, bool _w) noexcept {
-    XMStoreU555(this, XMVectorSet(_x, _y, _z, ((_w) ? 1.0f : 0.0f)));
-}
-
-//------------------------------------------------------------------------------
-_Use_decl_annotations_ inline XMU555::XMU555(const float* pArray,
-                                             bool _w) noexcept {
-    XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray));
-    XMStoreU555(this, XMVectorSetW(V, ((_w) ? 1.0f : 0.0f)));
-}
diff --git a/targets/app/linux/Stubs/DirectXMath/sal.h b/targets/app/linux/Stubs/DirectXMath/sal.h
deleted file mode 100644
index 2f40e716e..000000000
--- a/targets/app/linux/Stubs/DirectXMath/sal.h
+++ /dev/null
@@ -1,4244 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-
-/***
-*sal.h - markers for documenting the semantics of APIs
-*
-
-*
-*Purpose:
-*       sal.h provides a set of annotations to describe how a function uses its
-*       parameters - the assumptions it makes about them, and the guarantees it makes
-*       upon finishing.
-****/
-#pragma once
-
-/*==========================================================================
-
-   The comments in this file are intended to give basic understanding of
-   the usage of SAL, the Microsoft Source Code Annotation Language.
-   For more details, please see http://go.microsoft.com/fwlink/?LinkID=242134
-
-   The macros are defined in 3 layers, plus the structural set:
-
-   _In_/_Out_/_Ret_ Layer:
-   ----------------------
-   This layer provides the highest abstraction and its macros should be used
-   in most cases. These macros typically start with:
-      _In_     : input parameter to a function, unmodified by called function
-      _Out_    : output parameter, written to by called function, pointed-to
-                 location not expected to be initialized prior to call
-      _Outptr_ : like _Out_ when returned variable is a pointer type
-                 (so param is pointer-to-pointer type). Called function
-                 provides/allocated space.
-      _Outref_ : like _Outptr_, except param is reference-to-pointer type.
-      _Inout_  : inout parameter, read from and potentially modified by
-                 called function.
-      _Ret_    : for return values
-      _Field_  : class/struct field invariants
-   For common usage, this class of SAL provides the most concise annotations.
-   Note that _In_/_Out_/_Inout_/_Outptr_ annotations are designed to be used
-   with a parameter target. Using them with _At_ to specify non-parameter
-   targets may yield unexpected results.
-
-   This layer also includes a number of other properties that can be specified
-   to extend the ability of code analysis, most notably:
-      -- Designating parameters as format strings for printf/scanf/scanf_s
-      -- Requesting stricter type checking for C enum parameters
-
-   _Pre_/_Post_ Layer:
-   ------------------
-   The macros of this layer only should be used when there is no suitable macro
-   in the _In_/_Out_ layer. Its macros start with _Pre_ or _Post_.
-   This layer provides the most flexibility for annotations.
-
-   Implementation Abstraction Layer:
-   --------------------------------
-   Macros from this layer should never be used directly. The layer only exists
-   to hide the implementation of the annotation macros.
-
-   Structural Layer:
-   ----------------
-   These annotations, like _At_ and _When_, are used with annotations from
-   any of the other layers as modifiers, indicating exactly when and where
-   the annotations apply.
-
-
-   Common syntactic conventions:
-   ----------------------------
-
-   Usage:
-   -----
-   _In_, _Out_, _Inout_, _Pre_, _Post_, are for formal parameters.
-   _Ret_, _Deref_ret_ must be used for return values.
-
-   Nullness:
-   --------
-   If the parameter can be NULL as a precondition to the function, the
-   annotation contains _opt. If the macro does not contain '_opt' the
-   parameter cannot be NULL.
-
-   If an out/inout parameter returns a null pointer as a postcondition, this is
-   indicated by _Ret_maybenull_ or _result_maybenull_. If the macro is not
-   of this form, then the result will not be NULL as a postcondition.
-     _Outptr_ - output value is not NULL
-     _Outptr_result_maybenull_ - output value might be NULL
-
-   String Type:
-   -----------
-   _z: NullTerminated string
-   for _In_ parameters the buffer must have the specified stringtype before the call
-   for _Out_ parameters the buffer must have the specified stringtype after the call
-   for _Inout_ parameters both conditions apply
-
-   Extent Syntax:
-   -------------
-   Buffer sizes are expressed as element counts, unless the macro explicitly
-   contains _byte_ or _bytes_. Some annotations specify two buffer sizes, in
-   which case the second is used to indicate how much of the buffer is valid
-   as a postcondition. This table outlines the precondition buffer allocation
-   size, precondition number of valid elements, postcondition allocation size,
-   and postcondition number of valid elements for representative buffer size
-   annotations:
-                                     Pre    |  Pre    |  Post   |  Post
-                                     alloc  |  valid  |  alloc  |  valid
-      Annotation                     elems  |  elems  |  elems  |  elems
-      ----------                     ------------------------------------
-      _In_reads_(s)                    s    |   s     |   s     |   s
-      _Inout_updates_(s)               s    |   s     |   s     |   s
-      _Inout_updates_to_(s,c)          s    |   s     |   s     |   c
-      _Out_writes_(s)                  s    |   0     |   s     |   s
-      _Out_writes_to_(s,c)             s    |   0     |   s     |   c
-      _Outptr_result_buffer_(s)        ?    |   ?     |   s     |   s
-      _Outptr_result_buffer_to_(s,c)   ?    |   ?     |   s     |   c
-
-   For the _Outptr_ annotations, the buffer in question is at one level of
-   dereference. The called function is responsible for supplying the buffer.
-
-   Success and failure:
-   -------------------
-   The SAL concept of success allows functions to define expressions that can
-   be tested by the caller, which if it evaluates to non-zero, indicates the
-   function succeeded, which means that its postconditions are guaranteed to
-   hold.  Otherwise, if the expression evaluates to zero, the function is
-   considered to have failed, and the postconditions are not guaranteed.
-
-   The success criteria can be specified with the _Success_(expr) annotation:
-     _Success_(return != FALSE) BOOL
-     PathCanonicalizeA(_Out_writes_(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) :
-        pszBuf is only guaranteed to be NULL-terminated when TRUE is returned,
-        and FALSE indicates failure. In common practice, callers check for zero
-        vs. non-zero returns, so it is preferable to express the success
-        criteria in terms of zero/non-zero, not checked for exactly TRUE.
-
-   Functions can specify that some postconditions will still hold, even when
-   the function fails, using _On_failure_(anno-list), or postconditions that
-   hold regardless of success or failure using _Always_(anno-list).
-
-   The annotation _Return_type_success_(expr) may be used with a typedef to
-   give a default _Success_ criteria to all functions returning that type.
-   This is the case for common Windows API status types, including
-   HRESULT and NTSTATUS.  This may be overridden on a per-function basis by
-   specifying a _Success_ annotation locally.
-
-============================================================================*/
-
-#define __ATTR_SAL
-
-#ifndef _SAL_VERSION /*IFSTRIP=IGN*/
-#define _SAL_VERSION 20
-#endif
-
-#ifdef _PREFAST_  // [
-
-// choose attribute or __declspec implementation
-#ifndef _USE_DECLSPECS_FOR_SAL  // [
-#define _USE_DECLSPECS_FOR_SAL 1
-#endif  // ]
-
-#if _USE_DECLSPECS_FOR_SAL  // [
-#undef _USE_ATTRIBUTES_FOR_SAL
-#define _USE_ATTRIBUTES_FOR_SAL 0
-#elif !defined(_USE_ATTRIBUTES_FOR_SAL)  // ][
-#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/     // [
-#define _USE_ATTRIBUTES_FOR_SAL 1
-#else  // ][
-#define _USE_ATTRIBUTES_FOR_SAL 0
-#endif  // ]
-#endif  // ]
-
-#if !_USE_DECLSPECS_FOR_SAL           // [
-#if !_USE_ATTRIBUTES_FOR_SAL          // [
-#if _MSC_VER >= 1400 /*IFSTRIP=IGN*/  // [
-#undef _USE_ATTRIBUTES_FOR_SAL
-#define _USE_ATTRIBUTES_FOR_SAL 1
-#else  // ][
-#undef _USE_DECLSPECS_FOR_SAL
-#define _USE_DECLSPECS_FOR_SAL 1
-#endif  // ]
-#endif  // ]
-#endif  // ]
-
-#else
-
-// Disable expansion of SAL macros in non-Prefast mode to
-// improve compiler throughput.
-#ifndef _USE_DECLSPECS_FOR_SAL  // [
-#define _USE_DECLSPECS_FOR_SAL 0
-#endif                           // ]
-#ifndef _USE_ATTRIBUTES_FOR_SAL  // [
-#define _USE_ATTRIBUTES_FOR_SAL 0
-#endif  // ]
-
-#endif  // ]
-
-// safeguard for MIDL and RC builds
-#if _USE_DECLSPECS_FOR_SAL && (defined(MIDL_PASS) || defined(__midl) || defined(RC_INVOKED) || \
-                               !defined(_PREFAST_)) /*IFSTRIP=IGN*/  // [
-#undef _USE_DECLSPECS_FOR_SAL
-#define _USE_DECLSPECS_FOR_SAL 0
-#endif  // ]
-#if _USE_ATTRIBUTES_FOR_SAL && (!defined(_MSC_EXTENSIONS) || defined(MIDL_PASS) || \
-                                defined(__midl) || defined(RC_INVOKED)) /*IFSTRIP=IGN*/  // [
-#undef _USE_ATTRIBUTES_FOR_SAL
-#define _USE_ATTRIBUTES_FOR_SAL 0
-#endif  // ]
-
-#if _USE_DECLSPECS_FOR_SAL || _USE_ATTRIBUTES_FOR_SAL
-
-// Special enum type for Y/N/M
-enum __SAL_YesNo { _SAL_notpresent, _SAL_no, _SAL_maybe, _SAL_yes, _SAL_default };
-
-#endif
-
-#if defined(BUILD_WINDOWS) && !_USE_ATTRIBUTES_FOR_SAL /*IFSTRIP=IGN*/
-#define _SAL1_Source_(Name, args, annotes) \
-    _SA_annotes3(SAL_name, #Name, "", "1") _GrouP_(annotes _SAL_nop_impl_)
-#define _SAL1_1_Source_(Name, args, annotes) \
-    _SA_annotes3(SAL_name, #Name, "", "1.1") _GrouP_(annotes _SAL_nop_impl_)
-#define _SAL1_2_Source_(Name, args, annotes) \
-    _SA_annotes3(SAL_name, #Name, "", "1.2") _GrouP_(annotes _SAL_nop_impl_)
-#define _SAL2_Source_(Name, args, annotes) \
-    _SA_annotes3(SAL_name, #Name, "", "2") _GrouP_(annotes _SAL_nop_impl_)
-#else
-#define _SAL1_Source_(Name, args, annotes) \
-    _SA_annotes3(SAL_name, #Name, "", "1") _Group_(annotes _SAL_nop_impl_)
-#define _SAL1_1_Source_(Name, args, annotes) \
-    _SA_annotes3(SAL_name, #Name, "", "1.1") _Group_(annotes _SAL_nop_impl_)
-#define _SAL1_2_Source_(Name, args, annotes) \
-    _SA_annotes3(SAL_name, #Name, "", "1.2") _Group_(annotes _SAL_nop_impl_)
-#define _SAL2_Source_(Name, args, annotes) \
-    _SA_annotes3(SAL_name, #Name, "", "2") _Group_(annotes _SAL_nop_impl_)
-#endif
-
-//============================================================================
-//   Structural SAL:
-//     These annotations modify the use of other annotations.  They may
-//     express the annotation target (i.e. what parameter/field the annotation
-//     applies to) or the condition under which the annotation is applicable.
-//============================================================================
-
-// _At_(target, annos) specifies that the annotations listed in 'annos' is to
-// be applied to 'target' rather than to the identifier which is the current
-// lexical target.
-#define _At_(target, annos) _At_impl_(target, annos _SAL_nop_impl_)
-
-// _At_buffer_(target, iter, bound, annos) is similar to _At_, except that
-// target names a buffer, and each annotation in annos is applied to each
-// element of target up to bound, with the variable named in iter usable
-// by the annotations to refer to relevant offsets within target.
-#define _At_buffer_(target, iter, bound, annos) \
-    _At_buffer_impl_(target, iter, bound, annos _SAL_nop_impl_)
-
-// _When_(expr, annos) specifies that the annotations listed in 'annos' only
-// apply when 'expr' evaluates to non-zero.
-#define _When_(expr, annos) _When_impl_(expr, annos _SAL_nop_impl_)
-#define _Group_(annos) _Group_impl_(annos _SAL_nop_impl_)
-#define _GrouP_(annos) _GrouP_impl_(annos _SAL_nop_impl_)
-
-// <expr> indicates whether normal post conditions apply to a function
-#define _Success_(expr) _SAL2_Source_(_Success_, (expr), _Success_impl_(expr))
-
-// <expr> indicates whether post conditions apply to a function returning
-// the type that this annotation is applied to
-#define _Return_type_success_(expr) \
-    _SAL2_Source_(_Return_type_success_, (expr), _Success_impl_(expr))
-
-// Establish postconditions that apply only if the function does not succeed
-#define _On_failure_(annos) _On_failure_impl_(annos _SAL_nop_impl_)
-
-// Establish postconditions that apply in both success and failure cases.
-// Only applicable with functions that have  _Success_ or _Return_type_succss_.
-#define _Always_(annos) _Always_impl_(annos _SAL_nop_impl_)
-
-// Usable on a function definition. Asserts that a function declaration is
-// in scope, and its annotations are to be used. There are no other annotations
-// allowed on the function definition.
-#define _Use_decl_annotations_ _Use_decl_anno_impl_
-
-// _Notref_ may precede a _Deref_ or "real" annotation, and removes one
-// level of dereference if the parameter is a C++ reference (&).  If the
-// net deref on a "real" annotation is negative, it is simply discarded.
-#define _Notref_ _Notref_impl_
-
-// Annotations for defensive programming styles.
-#define _Pre_defensive_ _SA_annotes0(SAL_pre_defensive)
-#define _Post_defensive_ _SA_annotes0(SAL_post_defensive)
-
-#define _In_defensive_(annotes) _Pre_defensive_ _Group_(annotes)
-#define _Out_defensive_(annotes) _Post_defensive_ _Group_(annotes)
-#define _Inout_defensive_(annotes) _Pre_defensive_ _Post_defensive_ _Group_(annotes)
-
-//============================================================================
-//   _In_\_Out_ Layer:
-//============================================================================
-
-// Reserved pointer parameters, must always be NULL.
-#define _Reserved_ _SAL2_Source_(_Reserved_, (), _Pre1_impl_(__null_impl))
-
-// _Const_ allows specification that any namable memory location is considered
-// readonly for a given call.
-#define _Const_ _SAL2_Source_(_Const_, (), _Pre1_impl_(__readaccess_impl_notref))
-
-// Input parameters --------------------------
-
-//   _In_ - Annotations for parameters where data is passed into the function, but not modified.
-//          _In_ by itself can be used with non-pointer types (although it is redundant).
-
-// e.g. void SetPoint( _In_ const POINT* pPT );
-#define _In_                                         \
-    _SAL2_Source_(_In_, (),                          \
-                  _Pre1_impl_(__notnull_impl_notref) \
-                      _Pre_valid_impl_ _Deref_pre1_impl_(__readaccess_impl_notref))
-#define _In_opt_                \
-    _SAL2_Source_(_In_opt_, (), \
-                  _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_ _Deref_pre_readonly_)
-
-// nullterminated 'in' parameters.
-// e.g. void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo );
-#define _In_z_ _SAL2_Source_(_In_z_, (), _In_ _Pre1_impl_(__zterm_impl))
-#define _In_opt_z_ _SAL2_Source_(_In_opt_z_, (), _In_opt_ _Pre1_impl_(__zterm_impl))
-
-// 'input' buffers with given size
-
-#define _In_reads_(size) _SAL2_Source_(_In_reads_, (size), _Pre_count_(size) _Deref_pre_readonly_)
-#define _In_reads_opt_(size) \
-    _SAL2_Source_(_In_reads_opt_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_)
-#define _In_reads_bytes_(size) \
-    _SAL2_Source_(_In_reads_bytes_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_)
-#define _In_reads_bytes_opt_(size) \
-    _SAL2_Source_(_In_reads_bytes_opt_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_)
-#define _In_reads_z_(size) _SAL2_Source_(_In_reads_z_, (size), _In_reads_(size) _Pre_z_)
-#define _In_reads_opt_z_(size) \
-    _SAL2_Source_(_In_reads_opt_z_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_ _Pre_opt_z_)
-#define _In_reads_or_z_(size)                                            \
-    _SAL2_Source_(_In_reads_or_z_, (size),                               \
-                  _In_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) \
-                      _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size))))
-#define _In_reads_or_z_opt_(size)                                            \
-    _SAL2_Source_(_In_reads_or_z_opt_, (size),                               \
-                  _In_opt_ _When_(_String_length_(_Curr_) < (size), _Pre_z_) \
-                      _When_(_String_length_(_Curr_) >= (size), _Pre1_impl_(__count_impl(size))))
-
-// 'input' buffers valid to the given end pointer
-
-#define _In_reads_to_ptr_(ptr) \
-    _SAL2_Source_(_In_reads_to_ptr_, (ptr), _Pre_ptrdiff_count_(ptr) _Deref_pre_readonly_)
-#define _In_reads_to_ptr_opt_(ptr) \
-    _SAL2_Source_(_In_reads_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_)
-#define _In_reads_to_ptr_z_(ptr) \
-    _SAL2_Source_(_In_reads_to_ptr_z_, (ptr), _In_reads_to_ptr_(ptr) _Pre_z_)
-#define _In_reads_to_ptr_opt_z_(ptr)              \
-    _SAL2_Source_(_In_reads_to_ptr_opt_z_, (ptr), \
-                  _Pre_opt_ptrdiff_count_(ptr) _Deref_pre_readonly_ _Pre_opt_z_)
-
-// Output parameters --------------------------
-
-//   _Out_ - Annotations for pointer or reference parameters where data passed back to the caller.
-//           These are mostly used where the pointer/reference is to a non-pointer type.
-//           _Outptr_/_Outref) (see below) are typically used to return pointers via parameters.
-
-// e.g. void GetPoint( _Out_ POINT* pPT );
-#define _Out_ _SAL2_Source_(_Out_, (), _Out_impl_)
-#define _Out_opt_ _SAL2_Source_(_Out_opt_, (), _Out_opt_impl_)
-
-#define _Out_writes_(size) _SAL2_Source_(_Out_writes_, (size), _Pre_cap_(size) _Post_valid_impl_)
-#define _Out_writes_opt_(size) \
-    _SAL2_Source_(_Out_writes_opt_, (size), _Pre_opt_cap_(size) _Post_valid_impl_)
-#define _Out_writes_bytes_(size) \
-    _SAL2_Source_(_Out_writes_bytes_, (size), _Pre_bytecap_(size) _Post_valid_impl_)
-#define _Out_writes_bytes_opt_(size) \
-    _SAL2_Source_(_Out_writes_bytes_opt_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_)
-#define _Out_writes_z_(size) \
-    _SAL2_Source_(_Out_writes_z_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_)
-#define _Out_writes_opt_z_(size) \
-    _SAL2_Source_(_Out_writes_opt_z_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_)
-
-#define _Out_writes_to_(size, count)              \
-    _SAL2_Source_(_Out_writes_to_, (size, count), \
-                  _Pre_cap_(size) _Post_valid_impl_ _Post_count_(count))
-#define _Out_writes_to_opt_(size, count)              \
-    _SAL2_Source_(_Out_writes_to_opt_, (size, count), \
-                  _Pre_opt_cap_(size) _Post_valid_impl_ _Post_count_(count))
-#define _Out_writes_all_(size) \
-    _SAL2_Source_(_Out_writes_all_, (size), _Out_writes_to_(_Old_(size), _Old_(size)))
-#define _Out_writes_all_opt_(size) \
-    _SAL2_Source_(_Out_writes_all_opt_, (size), _Out_writes_to_opt_(_Old_(size), _Old_(size)))
-
-#define _Out_writes_bytes_to_(size, count)              \
-    _SAL2_Source_(_Out_writes_bytes_to_, (size, count), \
-                  _Pre_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count))
-#define _Out_writes_bytes_to_opt_(size, count)              \
-    _SAL2_Source_(_Out_writes_bytes_to_opt_, (size, count), \
-                  _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_bytecount_(count))
-#define _Out_writes_bytes_all_(size) \
-    _SAL2_Source_(_Out_writes_bytes_all_, (size), _Out_writes_bytes_to_(_Old_(size), _Old_(size)))
-#define _Out_writes_bytes_all_opt_(size)              \
-    _SAL2_Source_(_Out_writes_bytes_all_opt_, (size), \
-                  _Out_writes_bytes_to_opt_(_Old_(size), _Old_(size)))
-
-#define _Out_writes_to_ptr_(ptr) \
-    _SAL2_Source_(_Out_writes_to_ptr_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_)
-#define _Out_writes_to_ptr_opt_(ptr) \
-    _SAL2_Source_(_Out_writes_to_ptr_opt_, (ptr), _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_)
-#define _Out_writes_to_ptr_z_(ptr) \
-    _SAL2_Source_(_Out_writes_to_ptr_z_, (ptr), _Pre_ptrdiff_cap_(ptr) _Post_valid_impl_ Post_z_)
-#define _Out_writes_to_ptr_opt_z_(ptr)              \
-    _SAL2_Source_(_Out_writes_to_ptr_opt_z_, (ptr), \
-                  _Pre_opt_ptrdiff_cap_(ptr) _Post_valid_impl_ Post_z_)
-
-// Inout parameters ----------------------------
-
-//   _Inout_ - Annotations for pointer or reference parameters where data is passed in and
-//        potentially modified.
-//          void ModifyPoint( _Inout_ POINT* pPT );
-//          void ModifyPointByRef( _Inout_ POINT& pPT );
-
-#define _Inout_ _SAL2_Source_(_Inout_, (), _Prepost_valid_)
-#define _Inout_opt_ _SAL2_Source_(_Inout_opt_, (), _Prepost_opt_valid_)
-
-// For modifying string buffers
-//   void toupper( _Inout_z_ char* sz );
-#define _Inout_z_ _SAL2_Source_(_Inout_z_, (), _Prepost_z_)
-#define _Inout_opt_z_ _SAL2_Source_(_Inout_opt_z_, (), _Prepost_opt_z_)
-
-// For modifying buffers with explicit element size
-#define _Inout_updates_(size) \
-    _SAL2_Source_(_Inout_updates_, (size), _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_)
-#define _Inout_updates_opt_(size)              \
-    _SAL2_Source_(_Inout_updates_opt_, (size), \
-                  _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_)
-#define _Inout_updates_z_(size)                                                                \
-    _SAL2_Source_(_Inout_updates_z_, (size),                                                   \
-                  _Pre_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) \
-                      _Post1_impl_(__zterm_impl))
-#define _Inout_updates_opt_z_(size)                                                                \
-    _SAL2_Source_(_Inout_updates_opt_z_, (size),                                                   \
-                  _Pre_opt_cap_(size) _Pre_valid_impl_ _Post_valid_impl_ _Pre1_impl_(__zterm_impl) \
-                      _Post1_impl_(__zterm_impl))
-
-#define _Inout_updates_to_(size, count)              \
-    _SAL2_Source_(_Inout_updates_to_, (size, count), \
-                  _Out_writes_to_(size, count) _Pre_valid_impl_ _Pre1_impl_(__count_impl(count)))
-#define _Inout_updates_to_opt_(size, count)              \
-    _SAL2_Source_(_Inout_updates_to_opt_, (size, count), \
-                  _Out_writes_to_opt_(size, count)       \
-                      _Pre_valid_impl_ _Pre1_impl_(__count_impl(count)))
-
-#define _Inout_updates_all_(size) \
-    _SAL2_Source_(_Inout_updates_all_, (size), _Inout_updates_to_(_Old_(size), _Old_(size)))
-#define _Inout_updates_all_opt_(size) \
-    _SAL2_Source_(_Inout_updates_all_opt_, (size), _Inout_updates_to_opt_(_Old_(size), _Old_(size)))
-
-// For modifying buffers with explicit byte size
-#define _Inout_updates_bytes_(size)              \
-    _SAL2_Source_(_Inout_updates_bytes_, (size), \
-                  _Pre_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_)
-#define _Inout_updates_bytes_opt_(size)              \
-    _SAL2_Source_(_Inout_updates_bytes_opt_, (size), \
-                  _Pre_opt_bytecap_(size) _Pre_valid_impl_ _Post_valid_impl_)
-
-#define _Inout_updates_bytes_to_(size, count)              \
-    _SAL2_Source_(_Inout_updates_bytes_to_, (size, count), \
-                  _Out_writes_bytes_to_(size, count)       \
-                      _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count)))
-#define _Inout_updates_bytes_to_opt_(size, count)              \
-    _SAL2_Source_(_Inout_updates_bytes_to_opt_, (size, count), \
-                  _Out_writes_bytes_to_opt_(size, count)       \
-                      _Pre_valid_impl_ _Pre1_impl_(__bytecount_impl(count)))
-
-#define _Inout_updates_bytes_all_(size)              \
-    _SAL2_Source_(_Inout_updates_bytes_all_, (size), \
-                  _Inout_updates_bytes_to_(_Old_(size), _Old_(size)))
-#define _Inout_updates_bytes_all_opt_(size)              \
-    _SAL2_Source_(_Inout_updates_bytes_all_opt_, (size), \
-                  _Inout_updates_bytes_to_opt_(_Old_(size), _Old_(size)))
-
-// Pointer to pointer parameters -------------------------
-
-//   _Outptr_ - Annotations for output params returning pointers
-//      These describe parameters where the called function provides the buffer:
-//        HRESULT SHStrDupW(_In_ LPCWSTR psz, _Outptr_ LPWSTR *ppwsz);
-//      The caller passes the address of an LPWSTR variable as ppwsz, and SHStrDupW allocates
-//      and initializes memory and returns the pointer to the new LPWSTR in *ppwsz.
-//
-//    _Outptr_opt_ - describes parameters that are allowed to be NULL.
-//    _Outptr_*_result_maybenull_ - describes parameters where the called function might return NULL
-//    to the caller.
-//
-//    Example:
-//       void MyFunc(_Outptr_opt_ int **ppData1, _Outptr_result_maybenull_ int **ppData2);
-//    Callers:
-//       MyFunc(NULL, NULL);           // error: parameter 2, ppData2, should not be NULL
-//       MyFunc(&pData1, &pData2);     // ok: both non-NULL
-//       if (*pData1 == *pData2) ...   // error: pData2 might be NULL after call
-
-#define _Outptr_                \
-    _SAL2_Source_(_Outptr_, (), \
-                  _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1)))
-#define _Outptr_result_maybenull_                \
-    _SAL2_Source_(_Outptr_result_maybenull_, (), \
-                  _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1)))
-#define _Outptr_opt_                \
-    _SAL2_Source_(_Outptr_opt_, (), \
-                  _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(1)))
-#define _Outptr_opt_result_maybenull_                \
-    _SAL2_Source_(_Outptr_opt_result_maybenull_, (), \
-                  _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(1)))
-
-// Annotations for _Outptr_ parameters returning pointers to null terminated strings.
-
-#define _Outptr_result_z_ _SAL2_Source_(_Outptr_result_z_, (), _Out_impl_ _Deref_post_z_)
-#define _Outptr_opt_result_z_ \
-    _SAL2_Source_(_Outptr_opt_result_z_, (), _Out_opt_impl_ _Deref_post_z_)
-#define _Outptr_result_maybenull_z_ \
-    _SAL2_Source_(_Outptr_result_maybenull_z_, (), _Out_impl_ _Deref_post_opt_z_)
-#define _Outptr_opt_result_maybenull_z_ \
-    _SAL2_Source_(_Outptr_opt_result_maybenull_z_, (), _Out_opt_impl_ _Deref_post_opt_z_)
-
-// Annotations for _Outptr_ parameters where the output pointer is set to NULL if the function
-// fails.
-
-#define _Outptr_result_nullonfailure_ \
-    _SAL2_Source_(_Outptr_result_nullonfailure_, (), _Outptr_ _On_failure_(_Deref_post_null_))
-#define _Outptr_opt_result_nullonfailure_                \
-    _SAL2_Source_(_Outptr_opt_result_nullonfailure_, (), \
-                  _Outptr_opt_ _On_failure_(_Deref_post_null_))
-
-// Annotations for _Outptr_ parameters which return a pointer to a ref-counted COM object,
-// following the COM convention of setting the output to NULL on failure.
-// The current implementation is identical to _Outptr_result_nullonfailure_.
-// For pointers to types that are not COM objects, _Outptr_result_nullonfailure_ is preferred.
-
-#define _COM_Outptr_ _SAL2_Source_(_COM_Outptr_, (), _Outptr_ _On_failure_(_Deref_post_null_))
-#define _COM_Outptr_result_maybenull_                \
-    _SAL2_Source_(_COM_Outptr_result_maybenull_, (), \
-                  _Outptr_result_maybenull_ _On_failure_(_Deref_post_null_))
-#define _COM_Outptr_opt_ \
-    _SAL2_Source_(_COM_Outptr_opt_, (), _Outptr_opt_ _On_failure_(_Deref_post_null_))
-#define _COM_Outptr_opt_result_maybenull_                \
-    _SAL2_Source_(_COM_Outptr_opt_result_maybenull_, (), \
-                  _Outptr_opt_result_maybenull_ _On_failure_(_Deref_post_null_))
-
-// Annotations for _Outptr_ parameters returning a pointer to buffer with a specified number of
-// elements/bytes
-
-#define _Outptr_result_buffer_(size)              \
-    _SAL2_Source_(_Outptr_result_buffer_, (size), \
-                  _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size)))
-#define _Outptr_opt_result_buffer_(size)              \
-    _SAL2_Source_(_Outptr_opt_result_buffer_, (size), \
-                  _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __cap_impl(size)))
-#define _Outptr_result_buffer_to_(size, count)                                           \
-    _SAL2_Source_(_Outptr_result_buffer_to_, (size, count),                              \
-                  _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), \
-                                                __count_impl(count)))
-#define _Outptr_opt_result_buffer_to_(size, count)                                           \
-    _SAL2_Source_(_Outptr_opt_result_buffer_to_, (size, count),                              \
-                  _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __cap_impl(size), \
-                                                    __count_impl(count)))
-
-#define _Outptr_result_buffer_all_(size)              \
-    _SAL2_Source_(_Outptr_result_buffer_all_, (size), \
-                  _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size)))
-#define _Outptr_opt_result_buffer_all_(size)              \
-    _SAL2_Source_(_Outptr_opt_result_buffer_all_, (size), \
-                  _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __count_impl(size)))
-
-#define _Outptr_result_buffer_maybenull_(size)              \
-    _SAL2_Source_(_Outptr_result_buffer_maybenull_, (size), \
-                  _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size)))
-#define _Outptr_opt_result_buffer_maybenull_(size)              \
-    _SAL2_Source_(_Outptr_opt_result_buffer_maybenull_, (size), \
-                  _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __cap_impl(size)))
-#define _Outptr_result_buffer_to_maybenull_(size, count)                                   \
-    _SAL2_Source_(_Outptr_result_buffer_to_maybenull_, (size, count),                      \
-                  _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), \
-                                                __count_impl(count)))
-#define _Outptr_opt_result_buffer_to_maybenull_(size, count)                                   \
-    _SAL2_Source_(_Outptr_opt_result_buffer_to_maybenull_, (size, count),                      \
-                  _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __cap_impl(size), \
-                                                    __count_impl(count)))
-
-#define _Outptr_result_buffer_all_maybenull_(size)              \
-    _SAL2_Source_(_Outptr_result_buffer_all_maybenull_, (size), \
-                  _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size)))
-#define _Outptr_opt_result_buffer_all_maybenull_(size)              \
-    _SAL2_Source_(_Outptr_opt_result_buffer_all_maybenull_, (size), \
-                  _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __count_impl(size)))
-
-#define _Outptr_result_bytebuffer_(size)              \
-    _SAL2_Source_(_Outptr_result_bytebuffer_, (size), \
-                  _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size)))
-#define _Outptr_opt_result_bytebuffer_(size)              \
-    _SAL2_Source_(_Outptr_opt_result_bytebuffer_, (size), \
-                  _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecap_impl(size)))
-#define _Outptr_result_bytebuffer_to_(size, count)                                           \
-    _SAL2_Source_(_Outptr_result_bytebuffer_to_, (size, count),                              \
-                  _Out_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), \
-                                                __bytecount_impl(count)))
-#define _Outptr_opt_result_bytebuffer_to_(size, count)                                           \
-    _SAL2_Source_(_Outptr_opt_result_bytebuffer_to_, (size, count),                              \
-                  _Out_opt_impl_ _Deref_post3_impl_(__notnull_impl_notref, __bytecap_impl(size), \
-                                                    __bytecount_impl(count)))
-
-#define _Outptr_result_bytebuffer_all_(size)              \
-    _SAL2_Source_(_Outptr_result_bytebuffer_all_, (size), \
-                  _Out_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size)))
-#define _Outptr_opt_result_bytebuffer_all_(size)    \
-    _SAL2_Source_(                                  \
-        _Outptr_opt_result_bytebuffer_all_, (size), \
-        _Out_opt_impl_ _Deref_post2_impl_(__notnull_impl_notref, __bytecount_impl(size)))
-
-#define _Outptr_result_bytebuffer_maybenull_(size)              \
-    _SAL2_Source_(_Outptr_result_bytebuffer_maybenull_, (size), \
-                  _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size)))
-#define _Outptr_opt_result_bytebuffer_maybenull_(size)    \
-    _SAL2_Source_(                                        \
-        _Outptr_opt_result_bytebuffer_maybenull_, (size), \
-        _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecap_impl(size)))
-#define _Outptr_result_bytebuffer_to_maybenull_(size, count)                                   \
-    _SAL2_Source_(_Outptr_result_bytebuffer_to_maybenull_, (size, count),                      \
-                  _Out_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), \
-                                                __bytecount_impl(count)))
-#define _Outptr_opt_result_bytebuffer_to_maybenull_(size, count)                                   \
-    _SAL2_Source_(_Outptr_opt_result_bytebuffer_to_maybenull_, (size, count),                      \
-                  _Out_opt_impl_ _Deref_post3_impl_(__maybenull_impl_notref, __bytecap_impl(size), \
-                                                    __bytecount_impl(count)))
-
-#define _Outptr_result_bytebuffer_all_maybenull_(size)              \
-    _SAL2_Source_(_Outptr_result_bytebuffer_all_maybenull_, (size), \
-                  _Out_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size)))
-#define _Outptr_opt_result_bytebuffer_all_maybenull_(size)    \
-    _SAL2_Source_(                                            \
-        _Outptr_opt_result_bytebuffer_all_maybenull_, (size), \
-        _Out_opt_impl_ _Deref_post2_impl_(__maybenull_impl_notref, __bytecount_impl(size)))
-
-// Annotations for output reference to pointer parameters.
-
-#define _Outref_ _SAL2_Source_(_Outref_, (), _Out_impl_ _Post_notnull_)
-#define _Outref_result_maybenull_                                             \
-    _SAL2_Source_(_Outref_result_maybenull_, (),                              \
-                  _Pre2_impl_(__notnull_impl_notref, __cap_c_one_notref_impl) \
-                      _Post_maybenull_ _Post_valid_impl_)
-
-#define _Outref_result_buffer_(size) \
-    _SAL2_Source_(_Outref_result_buffer_, (size), _Outref_ _Post1_impl_(__cap_impl(size)))
-#define _Outref_result_bytebuffer_(size) \
-    _SAL2_Source_(_Outref_result_bytebuffer_, (size), _Outref_ _Post1_impl_(__bytecap_impl(size)))
-#define _Outref_result_buffer_to_(size, count)              \
-    _SAL2_Source_(_Outref_result_buffer_to_, (size, count), \
-                  _Outref_result_buffer_(size) _Post1_impl_(__count_impl(count)))
-#define _Outref_result_bytebuffer_to_(size, count)              \
-    _SAL2_Source_(_Outref_result_bytebuffer_to_, (size, count), \
-                  _Outref_result_bytebuffer_(size) _Post1_impl_(__bytecount_impl(count)))
-#define _Outref_result_buffer_all_(size) \
-    _SAL2_Source_(_Outref_result_buffer_all_, (size), _Outref_result_buffer_to_(size, _Old_(size)))
-#define _Outref_result_bytebuffer_all_(size)              \
-    _SAL2_Source_(_Outref_result_bytebuffer_all_, (size), \
-                  _Outref_result_bytebuffer_to_(size, _Old_(size)))
-
-#define _Outref_result_buffer_maybenull_(size)              \
-    _SAL2_Source_(_Outref_result_buffer_maybenull_, (size), \
-                  _Outref_result_maybenull_ _Post1_impl_(__cap_impl(size)))
-#define _Outref_result_bytebuffer_maybenull_(size)              \
-    _SAL2_Source_(_Outref_result_bytebuffer_maybenull_, (size), \
-                  _Outref_result_maybenull_ _Post1_impl_(__bytecap_impl(size)))
-#define _Outref_result_buffer_to_maybenull_(size, count)              \
-    _SAL2_Source_(_Outref_result_buffer_to_maybenull_, (size, count), \
-                  _Outref_result_buffer_maybenull_(size) _Post1_impl_(__count_impl(count)))
-#define _Outref_result_bytebuffer_to_maybenull_(size, count)              \
-    _SAL2_Source_(_Outref_result_bytebuffer_to_maybenull_, (size, count), \
-                  _Outref_result_bytebuffer_maybenull_(size)              \
-                      _Post1_impl_(__bytecount_impl(count)))
-#define _Outref_result_buffer_all_maybenull_(size)              \
-    _SAL2_Source_(_Outref_result_buffer_all_maybenull_, (size), \
-                  _Outref_result_buffer_to_maybenull_(size, _Old_(size)))
-#define _Outref_result_bytebuffer_all_maybenull_(size)              \
-    _SAL2_Source_(_Outref_result_bytebuffer_all_maybenull_, (size), \
-                  _Outref_result_bytebuffer_to_maybenull_(size, _Old_(size)))
-
-// Annotations for output reference to pointer parameters that guarantee
-// that the pointer is set to NULL on failure.
-#define _Outref_result_nullonfailure_ \
-    _SAL2_Source_(_Outref_result_nullonfailure_, (), _Outref_ _On_failure_(_Post_null_))
-
-// Generic annotations to set output value of a by-pointer or by-reference parameter to null/zero on
-// failure.
-#define _Result_nullonfailure_ \
-    _SAL2_Source_(_Result_nullonfailure_, (), _On_failure_(_Notref_impl_ _Deref_impl_ _Post_null_))
-#define _Result_zeroonfailure_                \
-    _SAL2_Source_(_Result_zeroonfailure_, (), \
-                  _On_failure_(_Notref_impl_ _Deref_impl_ _Out_range_(==, 0)))
-
-// return values -------------------------------
-
-//
-// _Ret_ annotations
-//
-// describing conditions that hold for return values after the call
-
-// e.g. _Ret_z_ CString::operator const WCHAR*() const throw();
-#define _Ret_z_ \
-    _SAL2_Source_(_Ret_z_, (), _Ret2_impl_(__notnull_impl, __zterm_impl) _Ret_valid_impl_)
-#define _Ret_maybenull_z_                \
-    _SAL2_Source_(_Ret_maybenull_z_, (), \
-                  _Ret2_impl_(__maybenull_impl, __zterm_impl) _Ret_valid_impl_)
-
-// used with allocated but not yet initialized objects
-#define _Ret_notnull_ _SAL2_Source_(_Ret_notnull_, (), _Ret1_impl_(__notnull_impl))
-#define _Ret_maybenull_ _SAL2_Source_(_Ret_maybenull_, (), _Ret1_impl_(__maybenull_impl))
-#define _Ret_null_ _SAL2_Source_(_Ret_null_, (), _Ret1_impl_(__null_impl))
-
-// used with allocated and initialized objects
-//    returns single valid object
-#define _Ret_valid_ \
-    _SAL2_Source_(_Ret_valid_, (), _Ret1_impl_(__notnull_impl_notref) _Ret_valid_impl_)
-
-//    returns pointer to initialized buffer of specified size
-#define _Ret_writes_(size)              \
-    _SAL2_Source_(_Ret_writes_, (size), \
-                  _Ret2_impl_(__notnull_impl, __count_impl(size)) _Ret_valid_impl_)
-#define _Ret_writes_z_(size)              \
-    _SAL2_Source_(_Ret_writes_z_, (size), \
-                  _Ret3_impl_(__notnull_impl, __count_impl(size), __zterm_impl) _Ret_valid_impl_)
-#define _Ret_writes_bytes_(size)              \
-    _SAL2_Source_(_Ret_writes_bytes_, (size), \
-                  _Ret2_impl_(__notnull_impl, __bytecount_impl(size)) _Ret_valid_impl_)
-#define _Ret_writes_maybenull_(size)              \
-    _SAL2_Source_(_Ret_writes_maybenull_, (size), \
-                  _Ret2_impl_(__maybenull_impl, __count_impl(size)) _Ret_valid_impl_)
-#define _Ret_writes_maybenull_z_(size)                                            \
-    _SAL2_Source_(_Ret_writes_maybenull_z_, (size),                               \
-                  _Ret3_impl_(__maybenull_impl, __count_impl(size), __zterm_impl) \
-                      _Ret_valid_impl_)
-#define _Ret_writes_bytes_maybenull_(size)              \
-    _SAL2_Source_(_Ret_writes_bytes_maybenull_, (size), \
-                  _Ret2_impl_(__maybenull_impl, __bytecount_impl(size)) _Ret_valid_impl_)
-
-//    returns pointer to partially initialized buffer, with total size 'size' and initialized size
-//    'count'
-#define _Ret_writes_to_(size, count)                                                 \
-    _SAL2_Source_(_Ret_writes_to_, (size, count),                                    \
-                  _Ret3_impl_(__notnull_impl, __cap_impl(size), __count_impl(count)) \
-                      _Ret_valid_impl_)
-#define _Ret_writes_bytes_to_(size, count)                                                   \
-    _SAL2_Source_(_Ret_writes_bytes_to_, (size, count),                                      \
-                  _Ret3_impl_(__notnull_impl, __bytecap_impl(size), __bytecount_impl(count)) \
-                      _Ret_valid_impl_)
-#define _Ret_writes_to_maybenull_(size, count)                                         \
-    _SAL2_Source_(_Ret_writes_to_maybenull_, (size, count),                            \
-                  _Ret3_impl_(__maybenull_impl, __cap_impl(size), __count_impl(count)) \
-                      _Ret_valid_impl_)
-#define _Ret_writes_bytes_to_maybenull_(size, count)                                           \
-    _SAL2_Source_(_Ret_writes_bytes_to_maybenull_, (size, count),                              \
-                  _Ret3_impl_(__maybenull_impl, __bytecap_impl(size), __bytecount_impl(count)) \
-                      _Ret_valid_impl_)
-
-// Annotations for strict type checking
-#define _Points_to_data_ _SAL2_Source_(_Points_to_data_, (), _Pre_ _Points_to_data_impl_)
-#define _Literal_ _SAL2_Source_(_Literal_, (), _Pre_ _Literal_impl_)
-#define _Notliteral_ _SAL2_Source_(_Notliteral_, (), _Pre_ _Notliteral_impl_)
-
-// Check the return value of a function e.g. _Check_return_ ErrorCode Foo();
-#define _Check_return_ _SAL2_Source_(_Check_return_, (), _Check_return_impl_)
-#define _Must_inspect_result_ \
-    _SAL2_Source_(_Must_inspect_result_, (), _Must_inspect_impl_ _Check_return_impl_)
-
-// e.g. MyPrintF( _Printf_format_string_ const WCHAR* wzFormat, ... );
-#define _Printf_format_string_ \
-    _SAL2_Source_(_Printf_format_string_, (), _Printf_format_string_impl_)
-#define _Scanf_format_string_ _SAL2_Source_(_Scanf_format_string_, (), _Scanf_format_string_impl_)
-#define _Scanf_s_format_string_ \
-    _SAL2_Source_(_Scanf_s_format_string_, (), _Scanf_s_format_string_impl_)
-
-#define _Format_string_impl_(kind, where) _SA_annotes2(SAL_IsFormatString2, kind, where)
-#define _Printf_format_string_params_(x) \
-    _SAL2_Source_(_Printf_format_string_params_, (x), _Format_string_impl_("printf", x))
-#define _Scanf_format_string_params_(x) \
-    _SAL2_Source_(_Scanf_format_string_params_, (x), _Format_string_impl_("scanf", x))
-#define _Scanf_s_format_string_params_(x) \
-    _SAL2_Source_(_Scanf_s_format_string_params_, (x), _Format_string_impl_("scanf_s", x))
-
-// annotations to express value of integral or pointer parameter
-#define _In_range_(lb, ub) _SAL2_Source_(_In_range_, (lb, ub), _In_range_impl_(lb, ub))
-#define _Out_range_(lb, ub) _SAL2_Source_(_Out_range_, (lb, ub), _Out_range_impl_(lb, ub))
-#define _Ret_range_(lb, ub) _SAL2_Source_(_Ret_range_, (lb, ub), _Ret_range_impl_(lb, ub))
-#define _Deref_in_range_(lb, ub) \
-    _SAL2_Source_(_Deref_in_range_, (lb, ub), _Deref_in_range_impl_(lb, ub))
-#define _Deref_out_range_(lb, ub) \
-    _SAL2_Source_(_Deref_out_range_, (lb, ub), _Deref_out_range_impl_(lb, ub))
-#define _Deref_ret_range_(lb, ub) \
-    _SAL2_Source_(_Deref_ret_range_, (lb, ub), _Deref_ret_range_impl_(lb, ub))
-#define _Pre_equal_to_(expr) _SAL2_Source_(_Pre_equal_to_, (expr), _In_range_(==, expr))
-#define _Post_equal_to_(expr) _SAL2_Source_(_Post_equal_to_, (expr), _Out_range_(==, expr))
-
-// annotation to express that a value (usually a field of a mutable class)
-// is not changed by a function call
-#define _Unchanged_(e) _SAL2_Source_(_Unchanged_, (e), _At_(e, _Post_equal_to_(_Old_(e)) _Const_))
-
-// Annotations to allow expressing generalized pre and post conditions.
-// 'cond' may be any valid SAL expression that is considered to be true as a precondition
-// or postcondition (respsectively).
-#define _Pre_satisfies_(cond) _SAL2_Source_(_Pre_satisfies_, (cond), _Pre_satisfies_impl_(cond))
-#define _Post_satisfies_(cond) _SAL2_Source_(_Post_satisfies_, (cond), _Post_satisfies_impl_(cond))
-
-// Annotations to express struct, class and field invariants
-#define _Struct_size_bytes_(size) _SAL2_Source_(_Struct_size_bytes_, (size), _Writable_bytes_(size))
-
-#define _Field_size_(size) _SAL2_Source_(_Field_size_, (size), _Notnull_ _Writable_elements_(size))
-#define _Field_size_opt_(size) \
-    _SAL2_Source_(_Field_size_opt_, (size), _Maybenull_ _Writable_elements_(size))
-#define _Field_size_part_(size, count)              \
-    _SAL2_Source_(_Field_size_part_, (size, count), \
-                  _Notnull_ _Writable_elements_(size) _Readable_elements_(count))
-#define _Field_size_part_opt_(size, count)              \
-    _SAL2_Source_(_Field_size_part_opt_, (size, count), \
-                  _Maybenull_ _Writable_elements_(size) _Readable_elements_(count))
-#define _Field_size_full_(size) \
-    _SAL2_Source_(_Field_size_full_, (size), _Field_size_part_(size, size))
-#define _Field_size_full_opt_(size) \
-    _SAL2_Source_(_Field_size_full_opt_, (size), _Field_size_part_opt_(size, size))
-
-#define _Field_size_bytes_(size) \
-    _SAL2_Source_(_Field_size_bytes_, (size), _Notnull_ _Writable_bytes_(size))
-#define _Field_size_bytes_opt_(size) \
-    _SAL2_Source_(_Field_size_bytes_opt_, (size), _Maybenull_ _Writable_bytes_(size))
-#define _Field_size_bytes_part_(size, count)              \
-    _SAL2_Source_(_Field_size_bytes_part_, (size, count), \
-                  _Notnull_ _Writable_bytes_(size) _Readable_bytes_(count))
-#define _Field_size_bytes_part_opt_(size, count)              \
-    _SAL2_Source_(_Field_size_bytes_part_opt_, (size, count), \
-                  _Maybenull_ _Writable_bytes_(size) _Readable_bytes_(count))
-#define _Field_size_bytes_full_(size) \
-    _SAL2_Source_(_Field_size_bytes_full_, (size), _Field_size_bytes_part_(size, size))
-#define _Field_size_bytes_full_opt_(size) \
-    _SAL2_Source_(_Field_size_bytes_full_opt_, (size), _Field_size_bytes_part_opt_(size, size))
-
-#define _Field_z_ _SAL2_Source_(_Field_z_, (), _Null_terminated_)
-
-#define _Field_range_(min, max) \
-    _SAL2_Source_(_Field_range_, (min, max), _Field_range_impl_(min, max))
-
-//============================================================================
-//   _Pre_\_Post_ Layer:
-//============================================================================
-
-//
-// Raw Pre/Post for declaring custom pre/post conditions
-//
-
-#define _Pre_ _Pre_impl_
-#define _Post_ _Post_impl_
-
-//
-// Validity property
-//
-
-#define _Valid_ _Valid_impl_
-#define _Notvalid_ _Notvalid_impl_
-#define _Maybevalid_ _Maybevalid_impl_
-
-//
-// Buffer size properties
-//
-
-// Expressing buffer sizes without specifying pre or post condition
-#define _Readable_bytes_(size) _SAL2_Source_(_Readable_bytes_, (size), _Readable_bytes_impl_(size))
-#define _Readable_elements_(size) \
-    _SAL2_Source_(_Readable_elements_, (size), _Readable_elements_impl_(size))
-#define _Writable_bytes_(size) _SAL2_Source_(_Writable_bytes_, (size), _Writable_bytes_impl_(size))
-#define _Writable_elements_(size) \
-    _SAL2_Source_(_Writable_elements_, (size), _Writable_elements_impl_(size))
-
-#define _Null_terminated_ _SAL2_Source_(_Null_terminated_, (), _Null_terminated_impl_)
-#define _NullNull_terminated_ _SAL2_Source_(_NullNull_terminated_, (), _NullNull_terminated_impl_)
-
-// Expressing buffer size as pre or post condition
-#define _Pre_readable_size_(size) \
-    _SAL2_Source_(_Pre_readable_size_, (size), _Pre1_impl_(__count_impl(size)) _Pre_valid_impl_)
-#define _Pre_writable_size_(size) \
-    _SAL2_Source_(_Pre_writable_size_, (size), _Pre1_impl_(__cap_impl(size)))
-#define _Pre_readable_byte_size_(size)              \
-    _SAL2_Source_(_Pre_readable_byte_size_, (size), \
-                  _Pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_)
-#define _Pre_writable_byte_size_(size) \
-    _SAL2_Source_(_Pre_writable_byte_size_, (size), _Pre1_impl_(__bytecap_impl(size)))
-
-#define _Post_readable_size_(size) \
-    _SAL2_Source_(_Post_readable_size_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_)
-#define _Post_writable_size_(size) \
-    _SAL2_Source_(_Post_writable_size_, (size), _Post1_impl_(__cap_impl(size)))
-#define _Post_readable_byte_size_(size)              \
-    _SAL2_Source_(_Post_readable_byte_size_, (size), \
-                  _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_)
-#define _Post_writable_byte_size_(size) \
-    _SAL2_Source_(_Post_writable_byte_size_, (size), _Post1_impl_(__bytecap_impl(size)))
-
-//
-// Pointer null-ness properties
-//
-#define _Null_ _Null_impl_
-#define _Notnull_ _Notnull_impl_
-#define _Maybenull_ _Maybenull_impl_
-
-//
-// _Pre_ annotations ---
-//
-// describing conditions that must be met before the call of the function
-
-// e.g. int strlen( _Pre_z_ const char* sz );
-// buffer is a zero terminated string
-#define _Pre_z_ _SAL2_Source_(_Pre_z_, (), _Pre1_impl_(__zterm_impl) _Pre_valid_impl_)
-
-// valid size unknown or indicated by type (e.g.:LPSTR)
-#define _Pre_valid_ \
-    _SAL2_Source_(_Pre_valid_, (), _Pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_)
-#define _Pre_opt_valid_ \
-    _SAL2_Source_(_Pre_opt_valid_, (), _Pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_)
-
-#define _Pre_invalid_ _SAL2_Source_(_Pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl))
-
-// Overrides recursive valid when some field is not yet initialized when using _Inout_
-#define _Pre_unknown_ _SAL2_Source_(_Pre_unknown_, (), _Pre1_impl_(__maybevalid_impl))
-
-// used with allocated but not yet initialized objects
-#define _Pre_notnull_ _SAL2_Source_(_Pre_notnull_, (), _Pre1_impl_(__notnull_impl_notref))
-#define _Pre_maybenull_ _SAL2_Source_(_Pre_maybenull_, (), _Pre1_impl_(__maybenull_impl_notref))
-#define _Pre_null_ _SAL2_Source_(_Pre_null_, (), _Pre1_impl_(__null_impl_notref))
-
-//
-// _Post_ annotations ---
-//
-// describing conditions that hold after the function call
-
-// void CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cch) _Post_z_ char* szFrom, size_t cchFrom );
-// buffer will be a zero-terminated string after the call
-#define _Post_z_ _SAL2_Source_(_Post_z_, (), _Post1_impl_(__zterm_impl) _Post_valid_impl_)
-
-// e.g. HRESULT InitStruct( _Post_valid_ Struct* pobj );
-#define _Post_valid_ _SAL2_Source_(_Post_valid_, (), _Post_valid_impl_)
-#define _Post_invalid_ _SAL2_Source_(_Post_invalid_, (), _Deref_post1_impl_(__notvalid_impl))
-
-// e.g. void free( _Post_ptr_invalid_ void* pv );
-#define _Post_ptr_invalid_ _SAL2_Source_(_Post_ptr_invalid_, (), _Post1_impl_(__notvalid_impl))
-
-// e.g. void ThrowExceptionIfNull( _Post_notnull_ const void* pv );
-#define _Post_notnull_ _SAL2_Source_(_Post_notnull_, (), _Post1_impl_(__notnull_impl))
-
-// e.g. HRESULT GetObject(_Outptr_ _On_failure_(_At_(*p, _Post_null_)) T **p);
-#define _Post_null_ _SAL2_Source_(_Post_null_, (), _Post1_impl_(__null_impl))
-
-#define _Post_maybenull_ _SAL2_Source_(_Post_maybenull_, (), _Post1_impl_(__maybenull_impl))
-
-#define _Prepost_z_ _SAL2_Source_(_Prepost_z_, (), _Pre_z_ _Post_z_)
-
-// #pragma region Input Buffer SAL 1 compatibility macros
-
-/*==========================================================================
-
-   This section contains definitions for macros defined for VS2010 and earlier.
-   Usage of these macros is still supported, but the SAL 2 macros defined above
-   are recommended instead.  This comment block is retained to assist in
-   understanding SAL that still uses the older syntax.
-
-   The macros are defined in 3 layers:
-
-   _In_\_Out_ Layer:
-   ----------------
-   This layer provides the highest abstraction and its macros should be used
-   in most cases. Its macros start with _In_, _Out_ or _Inout_. For the
-   typical case they provide the most concise annotations.
-
-   _Pre_\_Post_ Layer:
-   ------------------
-   The macros of this layer only should be used when there is no suitable macro
-   in the _In_\_Out_ layer. Its macros start with _Pre_, _Post_, _Ret_,
-   _Deref_pre_ _Deref_post_ and _Deref_ret_. This layer provides the most
-   flexibility for annotations.
-
-   Implementation Abstraction Layer:
-   --------------------------------
-   Macros from this layer should never be used directly. The layer only exists
-   to hide the implementation of the annotation macros.
-
-
-   Annotation Syntax:
-   |--------------|----------|----------------|-----------------------------|
-   |   Usage      | Nullness | ZeroTerminated |  Extent                     |
-   |--------------|----------|----------------|-----------------------------|
-   | _In_         | <>       | <>             | <>                          |
-   | _Out_        | opt_     | z_             | [byte]cap_[c_|x_]( size )   |
-   | _Inout_      |          |                | [byte]count_[c_|x_]( size ) |
-   | _Deref_out_  |          |                | ptrdiff_cap_( ptr )         |
-   |--------------|          |                | ptrdiff_count_( ptr )       |
-   | _Ret_        |          |                |                             |
-   | _Deref_ret_  |          |                |                             |
-   |--------------|          |                |                             |
-   | _Pre_        |          |                |                             |
-   | _Post_       |          |                |                             |
-   | _Deref_pre_  |          |                |                             |
-   | _Deref_post_ |          |                |                             |
-   |--------------|----------|----------------|-----------------------------|
-
-   Usage:
-   -----
-   _In_, _Out_, _Inout_, _Pre_, _Post_, _Deref_pre_, _Deref_post_ are for
-   formal parameters.
-   _Ret_, _Deref_ret_ must be used for return values.
-
-   Nullness:
-   --------
-   If the pointer can be NULL the annotation contains _opt. If the macro
-   does not contain '_opt' the pointer may not be NULL.
-
-   String Type:
-   -----------
-   _z: NullTerminated string
-   for _In_ parameters the buffer must have the specified stringtype before the call
-   for _Out_ parameters the buffer must have the specified stringtype after the call
-   for _Inout_ parameters both conditions apply
-
-   Extent Syntax:
-   |------|---------------|---------------|
-   | Unit | Writ\Readable | Argument Type |
-   |------|---------------|---------------|
-   |  <>  | cap_          | <>            |
-   | byte | count_        | c_            |
-   |      |               | x_            |
-   |------|---------------|---------------|
-
-   'cap' (capacity) describes the writable size of the buffer and is typically used
-   with _Out_. The default unit is elements. Use 'bytecap' if the size is given in bytes
-   'count' describes the readable size of the buffer and is typically used with _In_.
-   The default unit is elements. Use 'bytecount' if the size is given in bytes.
-
-   Argument syntax for cap_, bytecap_, count_, bytecount_:
-   (<parameter>|return)[+n]  e.g. cch, return, cb+2
-
-   If the buffer size is a constant expression use the c_ postfix.
-   E.g. cap_c_(20), count_c_(MAX_PATH), bytecount_c_(16)
-
-   If the buffer size is given by a limiting pointer use the ptrdiff_ versions
-   of the macros.
-
-   If the buffer size is neither a parameter nor a constant expression use the x_
-   postfix. e.g. bytecount_x_(num*size) x_ annotations accept any arbitrary string.
-   No analysis can be done for x_ annotations but they at least tell the tool that
-   the buffer has some sort of extent description. x_ annotations might be supported
-   by future compiler versions.
-
-============================================================================*/
-
-// e.g. void SetCharRange( _In_count_(cch) const char* rgch, size_t cch )
-// valid buffer extent described by another parameter
-#define _In_count_(size) _SAL1_1_Source_(_In_count_, (size), _Pre_count_(size) _Deref_pre_readonly_)
-#define _In_opt_count_(size) \
-    _SAL1_1_Source_(_In_opt_count_, (size), _Pre_opt_count_(size) _Deref_pre_readonly_)
-#define _In_bytecount_(size) \
-    _SAL1_1_Source_(_In_bytecount_, (size), _Pre_bytecount_(size) _Deref_pre_readonly_)
-#define _In_opt_bytecount_(size) \
-    _SAL1_1_Source_(_In_opt_bytecount_, (size), _Pre_opt_bytecount_(size) _Deref_pre_readonly_)
-
-// valid buffer extent described by a constant extression
-#define _In_count_c_(size) \
-    _SAL1_1_Source_(_In_count_c_, (size), _Pre_count_c_(size) _Deref_pre_readonly_)
-#define _In_opt_count_c_(size) \
-    _SAL1_1_Source_(_In_opt_count_c_, (size), _Pre_opt_count_c_(size) _Deref_pre_readonly_)
-#define _In_bytecount_c_(size) \
-    _SAL1_1_Source_(_In_bytecount_c_, (size), _Pre_bytecount_c_(size) _Deref_pre_readonly_)
-#define _In_opt_bytecount_c_(size) \
-    _SAL1_1_Source_(_In_opt_bytecount_c_, (size), _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_)
-
-// nullterminated  'input' buffers with given size
-
-// e.g. void SetCharRange( _In_count_(cch) const char* rgch, size_t cch )
-// nullterminated valid buffer extent described by another parameter
-#define _In_z_count_(size) \
-    _SAL1_1_Source_(_In_z_count_, (size), _Pre_z_ _Pre_count_(size) _Deref_pre_readonly_)
-#define _In_opt_z_count_(size)                \
-    _SAL1_1_Source_(_In_opt_z_count_, (size), \
-                    _Pre_opt_z_ _Pre_opt_count_(size) _Deref_pre_readonly_)
-#define _In_z_bytecount_(size) \
-    _SAL1_1_Source_(_In_z_bytecount_, (size), _Pre_z_ _Pre_bytecount_(size) _Deref_pre_readonly_)
-#define _In_opt_z_bytecount_(size)                \
-    _SAL1_1_Source_(_In_opt_z_bytecount_, (size), \
-                    _Pre_opt_z_ _Pre_opt_bytecount_(size) _Deref_pre_readonly_)
-
-// nullterminated valid buffer extent described by a constant extression
-#define _In_z_count_c_(size) \
-    _SAL1_1_Source_(_In_z_count_c_, (size), _Pre_z_ _Pre_count_c_(size) _Deref_pre_readonly_)
-#define _In_opt_z_count_c_(size)                \
-    _SAL1_1_Source_(_In_opt_z_count_c_, (size), \
-                    _Pre_opt_z_ _Pre_opt_count_c_(size) _Deref_pre_readonly_)
-#define _In_z_bytecount_c_(size)                \
-    _SAL1_1_Source_(_In_z_bytecount_c_, (size), \
-                    _Pre_z_ _Pre_bytecount_c_(size) _Deref_pre_readonly_)
-#define _In_opt_z_bytecount_c_(size)                \
-    _SAL1_1_Source_(_In_opt_z_bytecount_c_, (size), \
-                    _Pre_opt_z_ _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_)
-
-// buffer capacity is described by another pointer
-// e.g. void Foo( _In_ptrdiff_count_(pchMax) const char* pch, const char* pchMax ) { while pch <
-// pchMax ) pch++; }
-#define _In_ptrdiff_count_(size) \
-    _SAL1_1_Source_(_In_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size) _Deref_pre_readonly_)
-#define _In_opt_ptrdiff_count_(size)                \
-    _SAL1_1_Source_(_In_opt_ptrdiff_count_, (size), \
-                    _Pre_opt_ptrdiff_count_(size) _Deref_pre_readonly_)
-
-// 'x' version for complex expressions that are not supported by the current compiler version
-// e.g. void Set3ColMatrix( _In_count_x_(3*cRows) const Elem* matrix, int cRows );
-#define _In_count_x_(size) \
-    _SAL1_1_Source_(_In_count_x_, (size), _Pre_count_x_(size) _Deref_pre_readonly_)
-#define _In_opt_count_x_(size) \
-    _SAL1_1_Source_(_In_opt_count_x_, (size), _Pre_opt_count_x_(size) _Deref_pre_readonly_)
-#define _In_bytecount_x_(size) \
-    _SAL1_1_Source_(_In_bytecount_x_, (size), _Pre_bytecount_x_(size) _Deref_pre_readonly_)
-#define _In_opt_bytecount_x_(size) \
-    _SAL1_1_Source_(_In_opt_bytecount_x_, (size), _Pre_opt_bytecount_x_(size) _Deref_pre_readonly_)
-
-// 'out' with buffer size
-// e.g. void GetIndices( _Out_cap_(cIndices) int* rgIndices, size_t cIndices );
-// buffer capacity is described by another parameter
-#define _Out_cap_(size) _SAL1_1_Source_(_Out_cap_, (size), _Pre_cap_(size) _Post_valid_impl_)
-#define _Out_opt_cap_(size) \
-    _SAL1_1_Source_(_Out_opt_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_)
-#define _Out_bytecap_(size) \
-    _SAL1_1_Source_(_Out_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_)
-#define _Out_opt_bytecap_(size) \
-    _SAL1_1_Source_(_Out_opt_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_)
-
-// buffer capacity is described by a constant expression
-#define _Out_cap_c_(size) _SAL1_1_Source_(_Out_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_)
-#define _Out_opt_cap_c_(size) \
-    _SAL1_1_Source_(_Out_opt_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_)
-#define _Out_bytecap_c_(size) \
-    _SAL1_1_Source_(_Out_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_)
-#define _Out_opt_bytecap_c_(size) \
-    _SAL1_1_Source_(_Out_opt_bytecap_c_, (size), _Pre_opt_bytecap_c_(size) _Post_valid_impl_)
-
-// buffer capacity is described by another parameter multiplied by a constant expression
-#define _Out_cap_m_(mult, size) \
-    _SAL1_1_Source_(_Out_cap_m_, (mult, size), _Pre_cap_m_(mult, size) _Post_valid_impl_)
-#define _Out_opt_cap_m_(mult, size) \
-    _SAL1_1_Source_(_Out_opt_cap_m_, (mult, size), _Pre_opt_cap_m_(mult, size) _Post_valid_impl_)
-#define _Out_z_cap_m_(mult, size) \
-    _SAL1_1_Source_(_Out_z_cap_m_, (mult, size), _Pre_cap_m_(mult, size) _Post_valid_impl_ _Post_z_)
-#define _Out_opt_z_cap_m_(mult, size)                \
-    _SAL1_1_Source_(_Out_opt_z_cap_m_, (mult, size), \
-                    _Pre_opt_cap_m_(mult, size) _Post_valid_impl_ _Post_z_)
-
-// buffer capacity is described by another pointer
-// e.g. void Foo( _Out_ptrdiff_cap_(pchMax) char* pch, const char* pchMax ) { while pch < pchMax )
-// pch++; }
-#define _Out_ptrdiff_cap_(size) \
-    _SAL1_1_Source_(_Out_ptrdiff_cap_, (size), _Pre_ptrdiff_cap_(size) _Post_valid_impl_)
-#define _Out_opt_ptrdiff_cap_(size) \
-    _SAL1_1_Source_(_Out_opt_ptrdiff_cap_, (size), _Pre_opt_ptrdiff_cap_(size) _Post_valid_impl_)
-
-// buffer capacity is described by a complex expression
-#define _Out_cap_x_(size) _SAL1_1_Source_(_Out_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_)
-#define _Out_opt_cap_x_(size) \
-    _SAL1_1_Source_(_Out_opt_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_)
-#define _Out_bytecap_x_(size) \
-    _SAL1_1_Source_(_Out_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_)
-#define _Out_opt_bytecap_x_(size) \
-    _SAL1_1_Source_(_Out_opt_bytecap_x_, (size), _Pre_opt_bytecap_x_(size) _Post_valid_impl_)
-
-// a zero terminated string is filled into a buffer of given capacity
-// e.g. void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo );
-// buffer capacity is described by another parameter
-#define _Out_z_cap_(size) \
-    _SAL1_1_Source_(_Out_z_cap_, (size), _Pre_cap_(size) _Post_valid_impl_ _Post_z_)
-#define _Out_opt_z_cap_(size) \
-    _SAL1_1_Source_(_Out_opt_z_cap_, (size), _Pre_opt_cap_(size) _Post_valid_impl_ _Post_z_)
-#define _Out_z_bytecap_(size) \
-    _SAL1_1_Source_(_Out_z_bytecap_, (size), _Pre_bytecap_(size) _Post_valid_impl_ _Post_z_)
-#define _Out_opt_z_bytecap_(size) \
-    _SAL1_1_Source_(_Out_opt_z_bytecap_, (size), _Pre_opt_bytecap_(size) _Post_valid_impl_ _Post_z_)
-
-// buffer capacity is described by a constant expression
-#define _Out_z_cap_c_(size) \
-    _SAL1_1_Source_(_Out_z_cap_c_, (size), _Pre_cap_c_(size) _Post_valid_impl_ _Post_z_)
-#define _Out_opt_z_cap_c_(size) \
-    _SAL1_1_Source_(_Out_opt_z_cap_c_, (size), _Pre_opt_cap_c_(size) _Post_valid_impl_ _Post_z_)
-#define _Out_z_bytecap_c_(size) \
-    _SAL1_1_Source_(_Out_z_bytecap_c_, (size), _Pre_bytecap_c_(size) _Post_valid_impl_ _Post_z_)
-#define _Out_opt_z_bytecap_c_(size)                \
-    _SAL1_1_Source_(_Out_opt_z_bytecap_c_, (size), \
-                    _Pre_opt_bytecap_c_(size) _Post_valid_impl_ _Post_z_)
-
-// buffer capacity is described by a complex expression
-#define _Out_z_cap_x_(size) \
-    _SAL1_1_Source_(_Out_z_cap_x_, (size), _Pre_cap_x_(size) _Post_valid_impl_ _Post_z_)
-#define _Out_opt_z_cap_x_(size) \
-    _SAL1_1_Source_(_Out_opt_z_cap_x_, (size), _Pre_opt_cap_x_(size) _Post_valid_impl_ _Post_z_)
-#define _Out_z_bytecap_x_(size) \
-    _SAL1_1_Source_(_Out_z_bytecap_x_, (size), _Pre_bytecap_x_(size) _Post_valid_impl_ _Post_z_)
-#define _Out_opt_z_bytecap_x_(size)                \
-    _SAL1_1_Source_(_Out_opt_z_bytecap_x_, (size), \
-                    _Pre_opt_bytecap_x_(size) _Post_valid_impl_ _Post_z_)
-
-// a zero terminated string is filled into a buffer of given capacity
-// e.g. size_t CopyCharRange( _In_count_(cchFrom) const char* rgchFrom, size_t cchFrom,
-// _Out_cap_post_count_(cchTo,return)) char* rgchTo, size_t cchTo );
-#define _Out_cap_post_count_(cap, count)                \
-    _SAL1_1_Source_(_Out_cap_post_count_, (cap, count), \
-                    _Pre_cap_(cap) _Post_valid_impl_ _Post_count_(count))
-#define _Out_opt_cap_post_count_(cap, count)                \
-    _SAL1_1_Source_(_Out_opt_cap_post_count_, (cap, count), \
-                    _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_count_(count))
-#define _Out_bytecap_post_bytecount_(cap, count)                \
-    _SAL1_1_Source_(_Out_bytecap_post_bytecount_, (cap, count), \
-                    _Pre_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count))
-#define _Out_opt_bytecap_post_bytecount_(cap, count)                \
-    _SAL1_1_Source_(_Out_opt_bytecap_post_bytecount_, (cap, count), \
-                    _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_bytecount_(count))
-
-// a zero terminated string is filled into a buffer of given capacity
-// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Out_z_cap_post_count_(cchTo,return+1) char*
-// szTo, size_t cchTo );
-#define _Out_z_cap_post_count_(cap, count)                \
-    _SAL1_1_Source_(_Out_z_cap_post_count_, (cap, count), \
-                    _Pre_cap_(cap) _Post_valid_impl_ _Post_z_count_(count))
-#define _Out_opt_z_cap_post_count_(cap, count)                \
-    _SAL1_1_Source_(_Out_opt_z_cap_post_count_, (cap, count), \
-                    _Pre_opt_cap_(cap) _Post_valid_impl_ _Post_z_count_(count))
-#define _Out_z_bytecap_post_bytecount_(cap, count)                \
-    _SAL1_1_Source_(_Out_z_bytecap_post_bytecount_, (cap, count), \
-                    _Pre_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count))
-#define _Out_opt_z_bytecap_post_bytecount_(cap, count)                \
-    _SAL1_1_Source_(_Out_opt_z_bytecap_post_bytecount_, (cap, count), \
-                    _Pre_opt_bytecap_(cap) _Post_valid_impl_ _Post_z_bytecount_(count))
-
-// only use with dereferenced arguments e.g. '*pcch'
-#define _Out_capcount_(capcount)                \
-    _SAL1_1_Source_(_Out_capcount_, (capcount), \
-                    _Pre_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount))
-#define _Out_opt_capcount_(capcount)                \
-    _SAL1_1_Source_(_Out_opt_capcount_, (capcount), \
-                    _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_count_(capcount))
-#define _Out_bytecapcount_(capcount)                \
-    _SAL1_1_Source_(_Out_bytecapcount_, (capcount), \
-                    _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount))
-#define _Out_opt_bytecapcount_(capcount)                \
-    _SAL1_1_Source_(_Out_opt_bytecapcount_, (capcount), \
-                    _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_bytecount_(capcount))
-
-#define _Out_capcount_x_(capcount)                \
-    _SAL1_1_Source_(_Out_capcount_x_, (capcount), \
-                    _Pre_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount))
-#define _Out_opt_capcount_x_(capcount)                \
-    _SAL1_1_Source_(_Out_opt_capcount_x_, (capcount), \
-                    _Pre_opt_cap_x_(capcount) _Post_valid_impl_ _Post_count_x_(capcount))
-#define _Out_bytecapcount_x_(capcount)                \
-    _SAL1_1_Source_(_Out_bytecapcount_x_, (capcount), \
-                    _Pre_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount))
-#define _Out_opt_bytecapcount_x_(capcount)                \
-    _SAL1_1_Source_(_Out_opt_bytecapcount_x_, (capcount), \
-                    _Pre_opt_bytecap_x_(capcount) _Post_valid_impl_ _Post_bytecount_x_(capcount))
-
-// e.g. GetString( _Out_z_capcount_(*pLen+1) char* sz, size_t* pLen );
-#define _Out_z_capcount_(capcount)                \
-    _SAL1_1_Source_(_Out_z_capcount_, (capcount), \
-                    _Pre_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount))
-#define _Out_opt_z_capcount_(capcount)                \
-    _SAL1_1_Source_(_Out_opt_z_capcount_, (capcount), \
-                    _Pre_opt_cap_(capcount) _Post_valid_impl_ _Post_z_count_(capcount))
-#define _Out_z_bytecapcount_(capcount)                \
-    _SAL1_1_Source_(_Out_z_bytecapcount_, (capcount), \
-                    _Pre_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount))
-#define _Out_opt_z_bytecapcount_(capcount)                \
-    _SAL1_1_Source_(_Out_opt_z_bytecapcount_, (capcount), \
-                    _Pre_opt_bytecap_(capcount) _Post_valid_impl_ _Post_z_bytecount_(capcount))
-
-// 'inout' buffers with initialized elements before and after the call
-// e.g. void ModifyIndices( _Inout_count_(cIndices) int* rgIndices, size_t cIndices );
-#define _Inout_count_(size) _SAL1_1_Source_(_Inout_count_, (size), _Prepost_count_(size))
-#define _Inout_opt_count_(size) \
-    _SAL1_1_Source_(_Inout_opt_count_, (size), _Prepost_opt_count_(size))
-#define _Inout_bytecount_(size) \
-    _SAL1_1_Source_(_Inout_bytecount_, (size), _Prepost_bytecount_(size))
-#define _Inout_opt_bytecount_(size) \
-    _SAL1_1_Source_(_Inout_opt_bytecount_, (size), _Prepost_opt_bytecount_(size))
-
-#define _Inout_count_c_(size) _SAL1_1_Source_(_Inout_count_c_, (size), _Prepost_count_c_(size))
-#define _Inout_opt_count_c_(size) \
-    _SAL1_1_Source_(_Inout_opt_count_c_, (size), _Prepost_opt_count_c_(size))
-#define _Inout_bytecount_c_(size) \
-    _SAL1_1_Source_(_Inout_bytecount_c_, (size), _Prepost_bytecount_c_(size))
-#define _Inout_opt_bytecount_c_(size) \
-    _SAL1_1_Source_(_Inout_opt_bytecount_c_, (size), _Prepost_opt_bytecount_c_(size))
-
-// nullterminated 'inout' buffers with initialized elements before and after the call
-// e.g. void ModifyIndices( _Inout_count_(cIndices) int* rgIndices, size_t cIndices );
-#define _Inout_z_count_(size) \
-    _SAL1_1_Source_(_Inout_z_count_, (size), _Prepost_z_ _Prepost_count_(size))
-#define _Inout_opt_z_count_(size) \
-    _SAL1_1_Source_(_Inout_opt_z_count_, (size), _Prepost_z_ _Prepost_opt_count_(size))
-#define _Inout_z_bytecount_(size) \
-    _SAL1_1_Source_(_Inout_z_bytecount_, (size), _Prepost_z_ _Prepost_bytecount_(size))
-#define _Inout_opt_z_bytecount_(size) \
-    _SAL1_1_Source_(_Inout_opt_z_bytecount_, (size), _Prepost_z_ _Prepost_opt_bytecount_(size))
-
-#define _Inout_z_count_c_(size) \
-    _SAL1_1_Source_(_Inout_z_count_c_, (size), _Prepost_z_ _Prepost_count_c_(size))
-#define _Inout_opt_z_count_c_(size) \
-    _SAL1_1_Source_(_Inout_opt_z_count_c_, (size), _Prepost_z_ _Prepost_opt_count_c_(size))
-#define _Inout_z_bytecount_c_(size) \
-    _SAL1_1_Source_(_Inout_z_bytecount_c_, (size), _Prepost_z_ _Prepost_bytecount_c_(size))
-#define _Inout_opt_z_bytecount_c_(size) \
-    _SAL1_1_Source_(_Inout_opt_z_bytecount_c_, (size), _Prepost_z_ _Prepost_opt_bytecount_c_(size))
-
-#define _Inout_ptrdiff_count_(size) \
-    _SAL1_1_Source_(_Inout_ptrdiff_count_, (size), _Pre_ptrdiff_count_(size))
-#define _Inout_opt_ptrdiff_count_(size) \
-    _SAL1_1_Source_(_Inout_opt_ptrdiff_count_, (size), _Pre_opt_ptrdiff_count_(size))
-
-#define _Inout_count_x_(size) _SAL1_1_Source_(_Inout_count_x_, (size), _Prepost_count_x_(size))
-#define _Inout_opt_count_x_(size) \
-    _SAL1_1_Source_(_Inout_opt_count_x_, (size), _Prepost_opt_count_x_(size))
-#define _Inout_bytecount_x_(size) \
-    _SAL1_1_Source_(_Inout_bytecount_x_, (size), _Prepost_bytecount_x_(size))
-#define _Inout_opt_bytecount_x_(size) \
-    _SAL1_1_Source_(_Inout_opt_bytecount_x_, (size), _Prepost_opt_bytecount_x_(size))
-
-// e.g. void AppendToLPSTR( _In_ LPCSTR szFrom, _Inout_cap_(cchTo) LPSTR* szTo, size_t cchTo );
-#define _Inout_cap_(size) _SAL1_1_Source_(_Inout_cap_, (size), _Pre_valid_cap_(size) _Post_valid_)
-#define _Inout_opt_cap_(size) \
-    _SAL1_1_Source_(_Inout_opt_cap_, (size), _Pre_opt_valid_cap_(size) _Post_valid_)
-#define _Inout_bytecap_(size) \
-    _SAL1_1_Source_(_Inout_bytecap_, (size), _Pre_valid_bytecap_(size) _Post_valid_)
-#define _Inout_opt_bytecap_(size) \
-    _SAL1_1_Source_(_Inout_opt_bytecap_, (size), _Pre_opt_valid_bytecap_(size) _Post_valid_)
-
-#define _Inout_cap_c_(size) \
-    _SAL1_1_Source_(_Inout_cap_c_, (size), _Pre_valid_cap_c_(size) _Post_valid_)
-#define _Inout_opt_cap_c_(size) \
-    _SAL1_1_Source_(_Inout_opt_cap_c_, (size), _Pre_opt_valid_cap_c_(size) _Post_valid_)
-#define _Inout_bytecap_c_(size) \
-    _SAL1_1_Source_(_Inout_bytecap_c_, (size), _Pre_valid_bytecap_c_(size) _Post_valid_)
-#define _Inout_opt_bytecap_c_(size) \
-    _SAL1_1_Source_(_Inout_opt_bytecap_c_, (size), _Pre_opt_valid_bytecap_c_(size) _Post_valid_)
-
-#define _Inout_cap_x_(size) \
-    _SAL1_1_Source_(_Inout_cap_x_, (size), _Pre_valid_cap_x_(size) _Post_valid_)
-#define _Inout_opt_cap_x_(size) \
-    _SAL1_1_Source_(_Inout_opt_cap_x_, (size), _Pre_opt_valid_cap_x_(size) _Post_valid_)
-#define _Inout_bytecap_x_(size) \
-    _SAL1_1_Source_(_Inout_bytecap_x_, (size), _Pre_valid_bytecap_x_(size) _Post_valid_)
-#define _Inout_opt_bytecap_x_(size) \
-    _SAL1_1_Source_(_Inout_opt_bytecap_x_, (size), _Pre_opt_valid_bytecap_x_(size) _Post_valid_)
-
-// inout string buffers with writable size
-// e.g. void AppendStr( _In_z_ const char* szFrom, _Inout_z_cap_(cchTo) char* szTo, size_t cchTo );
-#define _Inout_z_cap_(size) _SAL1_1_Source_(_Inout_z_cap_, (size), _Pre_z_cap_(size) _Post_z_)
-#define _Inout_opt_z_cap_(size) \
-    _SAL1_1_Source_(_Inout_opt_z_cap_, (size), _Pre_opt_z_cap_(size) _Post_z_)
-#define _Inout_z_bytecap_(size) \
-    _SAL1_1_Source_(_Inout_z_bytecap_, (size), _Pre_z_bytecap_(size) _Post_z_)
-#define _Inout_opt_z_bytecap_(size) \
-    _SAL1_1_Source_(_Inout_opt_z_bytecap_, (size), _Pre_opt_z_bytecap_(size) _Post_z_)
-
-#define _Inout_z_cap_c_(size) _SAL1_1_Source_(_Inout_z_cap_c_, (size), _Pre_z_cap_c_(size) _Post_z_)
-#define _Inout_opt_z_cap_c_(size) \
-    _SAL1_1_Source_(_Inout_opt_z_cap_c_, (size), _Pre_opt_z_cap_c_(size) _Post_z_)
-#define _Inout_z_bytecap_c_(size) \
-    _SAL1_1_Source_(_Inout_z_bytecap_c_, (size), _Pre_z_bytecap_c_(size) _Post_z_)
-#define _Inout_opt_z_bytecap_c_(size) \
-    _SAL1_1_Source_(_Inout_opt_z_bytecap_c_, (size), _Pre_opt_z_bytecap_c_(size) _Post_z_)
-
-#define _Inout_z_cap_x_(size) _SAL1_1_Source_(_Inout_z_cap_x_, (size), _Pre_z_cap_x_(size) _Post_z_)
-#define _Inout_opt_z_cap_x_(size) \
-    _SAL1_1_Source_(_Inout_opt_z_cap_x_, (size), _Pre_opt_z_cap_x_(size) _Post_z_)
-#define _Inout_z_bytecap_x_(size) \
-    _SAL1_1_Source_(_Inout_z_bytecap_x_, (size), _Pre_z_bytecap_x_(size) _Post_z_)
-#define _Inout_opt_z_bytecap_x_(size) \
-    _SAL1_1_Source_(_Inout_opt_z_bytecap_x_, (size), _Pre_opt_z_bytecap_x_(size) _Post_z_)
-
-// returning pointers to valid objects
-#define _Ret_ _SAL1_1_Source_(_Ret_, (), _Ret_valid_)
-#define _Ret_opt_ _SAL1_1_Source_(_Ret_opt_, (), _Ret_opt_valid_)
-
-// annotations to express 'boundedness' of integral value parameter
-#define _In_bound_ _SAL1_1_Source_(_In_bound_, (), _In_bound_impl_)
-#define _Out_bound_ _SAL1_1_Source_(_Out_bound_, (), _Out_bound_impl_)
-#define _Ret_bound_ _SAL1_1_Source_(_Ret_bound_, (), _Ret_bound_impl_)
-#define _Deref_in_bound_ _SAL1_1_Source_(_Deref_in_bound_, (), _Deref_in_bound_impl_)
-#define _Deref_out_bound_ _SAL1_1_Source_(_Deref_out_bound_, (), _Deref_out_bound_impl_)
-#define _Deref_inout_bound_ \
-    _SAL1_1_Source_(_Deref_inout_bound_, (), _Deref_in_bound_ _Deref_out_bound_)
-#define _Deref_ret_bound_ _SAL1_1_Source_(_Deref_ret_bound_, (), _Deref_ret_bound_impl_)
-
-// e.g.  HRESULT HrCreatePoint( _Deref_out_opt_ POINT** ppPT );
-#define _Deref_out_ _SAL1_1_Source_(_Deref_out_, (), _Out_ _Deref_post_valid_)
-#define _Deref_out_opt_ _SAL1_1_Source_(_Deref_out_opt_, (), _Out_ _Deref_post_opt_valid_)
-#define _Deref_opt_out_ _SAL1_1_Source_(_Deref_opt_out_, (), _Out_opt_ _Deref_post_valid_)
-#define _Deref_opt_out_opt_ \
-    _SAL1_1_Source_(_Deref_opt_out_opt_, (), _Out_opt_ _Deref_post_opt_valid_)
-
-// e.g.  void CloneString( _In_z_ const WCHAR* wzFrom, _Deref_out_z_ WCHAR** pWzTo );
-#define _Deref_out_z_ _SAL1_1_Source_(_Deref_out_z_, (), _Out_ _Deref_post_z_)
-#define _Deref_out_opt_z_ _SAL1_1_Source_(_Deref_out_opt_z_, (), _Out_ _Deref_post_opt_z_)
-#define _Deref_opt_out_z_ _SAL1_1_Source_(_Deref_opt_out_z_, (), _Out_opt_ _Deref_post_z_)
-#define _Deref_opt_out_opt_z_ \
-    _SAL1_1_Source_(_Deref_opt_out_opt_z_, (), _Out_opt_ _Deref_post_opt_z_)
-
-//
-// _Deref_pre_ ---
-//
-// describing conditions for array elements of dereferenced pointer parameters that must be met
-// before the call
-
-// e.g. void SaveStringArray( _In_count_(cStrings) _Deref_pre_z_ const WCHAR* const rgpwch[] );
-#define _Deref_pre_z_                                                                        \
-    _SAL1_1_Source_(_Deref_pre_z_, (),                                                       \
-                    _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__zterm_impl) \
-                        _Pre_valid_impl_)
-#define _Deref_pre_opt_z_                                                                      \
-    _SAL1_1_Source_(_Deref_pre_opt_z_, (),                                                     \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__zterm_impl) \
-                        _Pre_valid_impl_)
-
-// e.g. void FillInArrayOfStr32( _In_count_(cStrings) _Deref_pre_cap_c_(32) _Deref_post_z_ WCHAR*
-// const rgpwch[] ); buffer capacity is described by another parameter
-#define _Deref_pre_cap_(size)                \
-    _SAL1_1_Source_(_Deref_pre_cap_, (size), \
-                    _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)))
-#define _Deref_pre_opt_cap_(size)                              \
-    _SAL1_1_Source_(_Deref_pre_opt_cap_, (size),               \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__cap_impl(size)))
-#define _Deref_pre_bytecap_(size)                            \
-    _SAL1_1_Source_(_Deref_pre_bytecap_, (size),             \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecap_impl(size)))
-#define _Deref_pre_opt_bytecap_(size)                          \
-    _SAL1_1_Source_(_Deref_pre_opt_bytecap_, (size),           \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecap_impl(size)))
-
-// buffer capacity is described by a constant expression
-#define _Deref_pre_cap_c_(size)                              \
-    _SAL1_1_Source_(_Deref_pre_cap_c_, (size),               \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre1_impl_(__cap_c_impl(size)))
-#define _Deref_pre_opt_cap_c_(size)                            \
-    _SAL1_1_Source_(_Deref_pre_opt_cap_c_, (size),             \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__cap_c_impl(size)))
-#define _Deref_pre_bytecap_c_(size)                          \
-    _SAL1_1_Source_(_Deref_pre_bytecap_c_, (size),           \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecap_c_impl(size)))
-#define _Deref_pre_opt_bytecap_c_(size)                        \
-    _SAL1_1_Source_(_Deref_pre_opt_bytecap_c_, (size),         \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecap_c_impl(size)))
-
-// buffer capacity is described by a complex condition
-#define _Deref_pre_cap_x_(size)                              \
-    _SAL1_1_Source_(_Deref_pre_cap_x_, (size),               \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre1_impl_(__cap_x_impl(size)))
-#define _Deref_pre_opt_cap_x_(size)                            \
-    _SAL1_1_Source_(_Deref_pre_opt_cap_x_, (size),             \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__cap_x_impl(size)))
-#define _Deref_pre_bytecap_x_(size)                          \
-    _SAL1_1_Source_(_Deref_pre_bytecap_x_, (size),           \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecap_x_impl(size)))
-#define _Deref_pre_opt_bytecap_x_(size)                        \
-    _SAL1_1_Source_(_Deref_pre_opt_bytecap_x_, (size),         \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecap_x_impl(size)))
-
-// convenience macros for nullterminated buffers with given capacity
-#define _Deref_pre_z_cap_(size)                              \
-    _SAL1_1_Source_(_Deref_pre_z_cap_, (size),               \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre2_impl_(__zterm_impl, __cap_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_opt_z_cap_(size)                            \
-    _SAL1_1_Source_(_Deref_pre_opt_z_cap_, (size),             \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre2_impl_(__zterm_impl, __cap_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_z_bytecap_(size)                          \
-    _SAL1_1_Source_(_Deref_pre_z_bytecap_, (size),           \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre2_impl_(__zterm_impl, __bytecap_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_opt_z_bytecap_(size)                        \
-    _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_, (size),         \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre2_impl_(__zterm_impl, __bytecap_impl(size)) _Pre_valid_impl_)
-
-#define _Deref_pre_z_cap_c_(size)                            \
-    _SAL1_1_Source_(_Deref_pre_z_cap_c_, (size),             \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre2_impl_(__zterm_impl, __cap_c_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_opt_z_cap_c_(size)                          \
-    _SAL1_1_Source_(_Deref_pre_opt_z_cap_c_, (size),           \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre2_impl_(__zterm_impl, __cap_c_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_z_bytecap_c_(size)                        \
-    _SAL1_1_Source_(_Deref_pre_z_bytecap_c_, (size),         \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre2_impl_(__zterm_impl, __bytecap_c_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_opt_z_bytecap_c_(size)                      \
-    _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_c_, (size),       \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre2_impl_(__zterm_impl, __bytecap_c_impl(size)) _Pre_valid_impl_)
-
-#define _Deref_pre_z_cap_x_(size)                            \
-    _SAL1_1_Source_(_Deref_pre_z_cap_x_, (size),             \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre2_impl_(__zterm_impl, __cap_x_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_opt_z_cap_x_(size)                          \
-    _SAL1_1_Source_(_Deref_pre_opt_z_cap_x_, (size),           \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre2_impl_(__zterm_impl, __cap_x_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_z_bytecap_x_(size)                        \
-    _SAL1_1_Source_(_Deref_pre_z_bytecap_x_, (size),         \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre2_impl_(__zterm_impl, __bytecap_x_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_opt_z_bytecap_x_(size)                      \
-    _SAL1_1_Source_(_Deref_pre_opt_z_bytecap_x_, (size),       \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre2_impl_(__zterm_impl, __bytecap_x_impl(size)) _Pre_valid_impl_)
-
-// known capacity and valid but unknown readable extent
-#define _Deref_pre_valid_cap_(size)                                                              \
-    _SAL1_1_Source_(_Deref_pre_valid_cap_, (size),                                               \
-                    _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Deref_pre_opt_valid_cap_(size)                                                            \
-    _SAL1_1_Source_(_Deref_pre_opt_valid_cap_, (size),                                             \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) _Deref_pre1_impl_(__cap_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Deref_pre_valid_bytecap_(size)                      \
-    _SAL1_1_Source_(_Deref_pre_valid_bytecap_, (size),       \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_opt_valid_bytecap_(size)                    \
-    _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_, (size),     \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecap_impl(size)) _Pre_valid_impl_)
-
-#define _Deref_pre_valid_cap_c_(size)                                                              \
-    _SAL1_1_Source_(_Deref_pre_valid_cap_c_, (size),                                               \
-                    _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_c_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Deref_pre_opt_valid_cap_c_(size)                      \
-    _SAL1_1_Source_(_Deref_pre_opt_valid_cap_c_, (size),       \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__cap_c_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_valid_bytecap_c_(size)                    \
-    _SAL1_1_Source_(_Deref_pre_valid_bytecap_c_, (size),     \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_opt_valid_bytecap_c_(size)                  \
-    _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_c_, (size),   \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecap_c_impl(size)) _Pre_valid_impl_)
-
-#define _Deref_pre_valid_cap_x_(size)                                                              \
-    _SAL1_1_Source_(_Deref_pre_valid_cap_x_, (size),                                               \
-                    _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__cap_x_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Deref_pre_opt_valid_cap_x_(size)                      \
-    _SAL1_1_Source_(_Deref_pre_opt_valid_cap_x_, (size),       \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__cap_x_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_valid_bytecap_x_(size)                    \
-    _SAL1_1_Source_(_Deref_pre_valid_bytecap_x_, (size),     \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_opt_valid_bytecap_x_(size)                  \
-    _SAL1_1_Source_(_Deref_pre_opt_valid_bytecap_x_, (size),   \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecap_x_impl(size)) _Pre_valid_impl_)
-
-// e.g. void SaveMatrix( _In_count_(n) _Deref_pre_count_(n) const Elem** matrix, size_t n );
-// valid buffer extent is described by another parameter
-#define _Deref_pre_count_(size)                                                                    \
-    _SAL1_1_Source_(_Deref_pre_count_, (size),                                                     \
-                    _Deref_pre1_impl_(__notnull_impl_notref) _Deref_pre1_impl_(__count_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Deref_pre_opt_count_(size)                            \
-    _SAL1_1_Source_(_Deref_pre_opt_count_, (size),             \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__count_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_bytecount_(size)                          \
-    _SAL1_1_Source_(_Deref_pre_bytecount_, (size),           \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_opt_bytecount_(size)                        \
-    _SAL1_1_Source_(_Deref_pre_opt_bytecount_, (size),         \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecount_impl(size)) _Pre_valid_impl_)
-
-// valid buffer extent is described by a constant expression
-#define _Deref_pre_count_c_(size)                            \
-    _SAL1_1_Source_(_Deref_pre_count_c_, (size),             \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_opt_count_c_(size)                          \
-    _SAL1_1_Source_(_Deref_pre_opt_count_c_, (size),           \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__count_c_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_bytecount_c_(size)                        \
-    _SAL1_1_Source_(_Deref_pre_bytecount_c_, (size),         \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_opt_bytecount_c_(size)                      \
-    _SAL1_1_Source_(_Deref_pre_opt_bytecount_c_, (size),       \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecount_c_impl(size)) _Pre_valid_impl_)
-
-// valid buffer extent is described by a complex expression
-#define _Deref_pre_count_x_(size)                            \
-    _SAL1_1_Source_(_Deref_pre_count_x_, (size),             \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_opt_count_x_(size)                          \
-    _SAL1_1_Source_(_Deref_pre_opt_count_x_, (size),           \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__count_x_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_bytecount_x_(size)                        \
-    _SAL1_1_Source_(_Deref_pre_bytecount_x_, (size),         \
-                    _Deref_pre1_impl_(__notnull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_)
-#define _Deref_pre_opt_bytecount_x_(size)                      \
-    _SAL1_1_Source_(_Deref_pre_opt_bytecount_x_, (size),       \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) \
-                        _Deref_pre1_impl_(__bytecount_x_impl(size)) _Pre_valid_impl_)
-
-// e.g. void PrintStringArray( _In_count_(cElems) _Deref_pre_valid_ LPCSTR rgStr[], size_t cElems );
-#define _Deref_pre_valid_                  \
-    _SAL1_1_Source_(_Deref_pre_valid_, (), \
-                    _Deref_pre1_impl_(__notnull_impl_notref) _Pre_valid_impl_)
-#define _Deref_pre_opt_valid_                  \
-    _SAL1_1_Source_(_Deref_pre_opt_valid_, (), \
-                    _Deref_pre1_impl_(__maybenull_impl_notref) _Pre_valid_impl_)
-#define _Deref_pre_invalid_ \
-    _SAL1_1_Source_(_Deref_pre_invalid_, (), _Deref_pre1_impl_(__notvalid_impl))
-
-#define _Deref_pre_notnull_ \
-    _SAL1_1_Source_(_Deref_pre_notnull_, (), _Deref_pre1_impl_(__notnull_impl_notref))
-#define _Deref_pre_maybenull_ \
-    _SAL1_1_Source_(_Deref_pre_maybenull_, (), _Deref_pre1_impl_(__maybenull_impl_notref))
-#define _Deref_pre_null_ \
-    _SAL1_1_Source_(_Deref_pre_null_, (), _Deref_pre1_impl_(__null_impl_notref))
-
-// restrict access rights
-#define _Deref_pre_readonly_ \
-    _SAL1_1_Source_(_Deref_pre_readonly_, (), _Deref_pre1_impl_(__readaccess_impl_notref))
-#define _Deref_pre_writeonly_ \
-    _SAL1_1_Source_(_Deref_pre_writeonly_, (), _Deref_pre1_impl_(__writeaccess_impl_notref))
-
-//
-// _Deref_post_ ---
-//
-// describing conditions for array elements or dereferenced pointer parameters that hold after the
-// call
-
-// e.g. void CloneString( _In_z_ const Wchar_t* wzIn _Out_ _Deref_post_z_ WCHAR** pWzOut );
-#define _Deref_post_z_                                                                         \
-    _SAL1_1_Source_(_Deref_post_z_, (),                                                        \
-                    _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__zterm_impl) \
-                        _Post_valid_impl_)
-#define _Deref_post_opt_z_                                                                       \
-    _SAL1_1_Source_(_Deref_post_opt_z_, (),                                                      \
-                    _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post1_impl_(__zterm_impl) \
-                        _Post_valid_impl_)
-
-// e.g. HRESULT HrAllocateMemory( size_t cb, _Out_ _Deref_post_bytecap_(cb) void** ppv );
-// buffer capacity is described by another parameter
-#define _Deref_post_cap_(size)                                \
-    _SAL1_1_Source_(_Deref_post_cap_, (size),                 \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__cap_impl(size)))
-#define _Deref_post_opt_cap_(size)                              \
-    _SAL1_1_Source_(_Deref_post_opt_cap_, (size),               \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__cap_impl(size)))
-#define _Deref_post_bytecap_(size)                            \
-    _SAL1_1_Source_(_Deref_post_bytecap_, (size),             \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__bytecap_impl(size)))
-#define _Deref_post_opt_bytecap_(size)                          \
-    _SAL1_1_Source_(_Deref_post_opt_bytecap_, (size),           \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__bytecap_impl(size)))
-
-// buffer capacity is described by a constant expression
-#define _Deref_post_cap_c_(size)                              \
-    _SAL1_1_Source_(_Deref_post_cap_c_, (size),               \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__cap_c_impl(size)))
-#define _Deref_post_opt_cap_c_(size)                            \
-    _SAL1_1_Source_(_Deref_post_opt_cap_c_, (size),             \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__cap_c_impl(size)))
-#define _Deref_post_bytecap_c_(size)                          \
-    _SAL1_1_Source_(_Deref_post_bytecap_c_, (size),           \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__bytecap_c_impl(size)))
-#define _Deref_post_opt_bytecap_c_(size)                        \
-    _SAL1_1_Source_(_Deref_post_opt_bytecap_c_, (size),         \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__bytecap_c_impl(size)))
-
-// buffer capacity is described by a complex expression
-#define _Deref_post_cap_x_(size)                              \
-    _SAL1_1_Source_(_Deref_post_cap_x_, (size),               \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__cap_x_impl(size)))
-#define _Deref_post_opt_cap_x_(size)                            \
-    _SAL1_1_Source_(_Deref_post_opt_cap_x_, (size),             \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__cap_x_impl(size)))
-#define _Deref_post_bytecap_x_(size)                          \
-    _SAL1_1_Source_(_Deref_post_bytecap_x_, (size),           \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__bytecap_x_impl(size)))
-#define _Deref_post_opt_bytecap_x_(size)                        \
-    _SAL1_1_Source_(_Deref_post_opt_bytecap_x_, (size),         \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__bytecap_x_impl(size)))
-
-// convenience macros for nullterminated buffers with given capacity
-#define _Deref_post_z_cap_(size)                              \
-    _SAL1_1_Source_(_Deref_post_z_cap_, (size),               \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post2_impl_(__zterm_impl, __cap_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_z_cap_(size)                            \
-    _SAL1_1_Source_(_Deref_post_opt_z_cap_, (size),             \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post2_impl_(__zterm_impl, __cap_impl(size)) _Post_valid_impl_)
-#define _Deref_post_z_bytecap_(size)                          \
-    _SAL1_1_Source_(_Deref_post_z_bytecap_, (size),           \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post2_impl_(__zterm_impl, __bytecap_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_z_bytecap_(size)                        \
-    _SAL1_1_Source_(_Deref_post_opt_z_bytecap_, (size),         \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post2_impl_(__zterm_impl, __bytecap_impl(size)) _Post_valid_impl_)
-
-#define _Deref_post_z_cap_c_(size)                            \
-    _SAL1_1_Source_(_Deref_post_z_cap_c_, (size),             \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post2_impl_(__zterm_impl, __cap_c_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_z_cap_c_(size)                          \
-    _SAL1_1_Source_(_Deref_post_opt_z_cap_c_, (size),           \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post2_impl_(__zterm_impl, __cap_c_impl(size)) _Post_valid_impl_)
-#define _Deref_post_z_bytecap_c_(size)                                            \
-    _SAL1_1_Source_(_Deref_post_z_bytecap_c_, (size),                             \
-                    _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_( \
-                        __zterm_impl, __bytecap_c_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_z_bytecap_c_(size)                                          \
-    _SAL1_1_Source_(_Deref_post_opt_z_bytecap_c_, (size),                           \
-                    _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_( \
-                        __zterm_impl, __bytecap_c_impl(size)) _Post_valid_impl_)
-
-#define _Deref_post_z_cap_x_(size)                            \
-    _SAL1_1_Source_(_Deref_post_z_cap_x_, (size),             \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post2_impl_(__zterm_impl, __cap_x_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_z_cap_x_(size)                          \
-    _SAL1_1_Source_(_Deref_post_opt_z_cap_x_, (size),           \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post2_impl_(__zterm_impl, __cap_x_impl(size)) _Post_valid_impl_)
-#define _Deref_post_z_bytecap_x_(size)                                            \
-    _SAL1_1_Source_(_Deref_post_z_bytecap_x_, (size),                             \
-                    _Deref_post1_impl_(__notnull_impl_notref) _Deref_post2_impl_( \
-                        __zterm_impl, __bytecap_x_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_z_bytecap_x_(size)                                          \
-    _SAL1_1_Source_(_Deref_post_opt_z_bytecap_x_, (size),                           \
-                    _Deref_post1_impl_(__maybenull_impl_notref) _Deref_post2_impl_( \
-                        __zterm_impl, __bytecap_x_impl(size)) _Post_valid_impl_)
-
-// known capacity and valid but unknown readable extent
-#define _Deref_post_valid_cap_(size)                                                               \
-    _SAL1_1_Source_(_Deref_post_valid_cap_, (size),                                                \
-                    _Deref_post1_impl_(__notnull_impl_notref) _Deref_post1_impl_(__cap_impl(size)) \
-                        _Post_valid_impl_)
-#define _Deref_post_opt_valid_cap_(size)                        \
-    _SAL1_1_Source_(_Deref_post_opt_valid_cap_, (size),         \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__cap_impl(size)) _Post_valid_impl_)
-#define _Deref_post_valid_bytecap_(size)                      \
-    _SAL1_1_Source_(_Deref_post_valid_bytecap_, (size),       \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_valid_bytecap_(size)                    \
-    _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_, (size),     \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__bytecap_impl(size)) _Post_valid_impl_)
-
-#define _Deref_post_valid_cap_c_(size)                        \
-    _SAL1_1_Source_(_Deref_post_valid_cap_c_, (size),         \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_valid_cap_c_(size)                      \
-    _SAL1_1_Source_(_Deref_post_opt_valid_cap_c_, (size),       \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__cap_c_impl(size)) _Post_valid_impl_)
-#define _Deref_post_valid_bytecap_c_(size)                    \
-    _SAL1_1_Source_(_Deref_post_valid_bytecap_c_, (size),     \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_valid_bytecap_c_(size)                  \
-    _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_c_, (size),   \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__bytecap_c_impl(size)) _Post_valid_impl_)
-
-#define _Deref_post_valid_cap_x_(size)                        \
-    _SAL1_1_Source_(_Deref_post_valid_cap_x_, (size),         \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_valid_cap_x_(size)                      \
-    _SAL1_1_Source_(_Deref_post_opt_valid_cap_x_, (size),       \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__cap_x_impl(size)) _Post_valid_impl_)
-#define _Deref_post_valid_bytecap_x_(size)                    \
-    _SAL1_1_Source_(_Deref_post_valid_bytecap_x_, (size),     \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_valid_bytecap_x_(size)                  \
-    _SAL1_1_Source_(_Deref_post_opt_valid_bytecap_x_, (size),   \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__bytecap_x_impl(size)) _Post_valid_impl_)
-
-// e.g. HRESULT HrAllocateZeroInitializedMemory( size_t cb, _Out_ _Deref_post_bytecount_(cb) void**
-// ppv ); valid buffer extent is described by another parameter
-#define _Deref_post_count_(size)                              \
-    _SAL1_1_Source_(_Deref_post_count_, (size),               \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_count_(size)                            \
-    _SAL1_1_Source_(_Deref_post_opt_count_, (size),             \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__count_impl(size)) _Post_valid_impl_)
-#define _Deref_post_bytecount_(size)                          \
-    _SAL1_1_Source_(_Deref_post_bytecount_, (size),           \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_bytecount_(size)                        \
-    _SAL1_1_Source_(_Deref_post_opt_bytecount_, (size),         \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__bytecount_impl(size)) _Post_valid_impl_)
-
-// buffer capacity is described by a constant expression
-#define _Deref_post_count_c_(size)                            \
-    _SAL1_1_Source_(_Deref_post_count_c_, (size),             \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_count_c_(size)                          \
-    _SAL1_1_Source_(_Deref_post_opt_count_c_, (size),           \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__count_c_impl(size)) _Post_valid_impl_)
-#define _Deref_post_bytecount_c_(size)                        \
-    _SAL1_1_Source_(_Deref_post_bytecount_c_, (size),         \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_bytecount_c_(size)                      \
-    _SAL1_1_Source_(_Deref_post_opt_bytecount_c_, (size),       \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_)
-
-// buffer capacity is described by a complex expression
-#define _Deref_post_count_x_(size)                            \
-    _SAL1_1_Source_(_Deref_post_count_x_, (size),             \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_count_x_(size)                          \
-    _SAL1_1_Source_(_Deref_post_opt_count_x_, (size),           \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__count_x_impl(size)) _Post_valid_impl_)
-#define _Deref_post_bytecount_x_(size)                        \
-    _SAL1_1_Source_(_Deref_post_bytecount_x_, (size),         \
-                    _Deref_post1_impl_(__notnull_impl_notref) \
-                        _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_)
-#define _Deref_post_opt_bytecount_x_(size)                      \
-    _SAL1_1_Source_(_Deref_post_opt_bytecount_x_, (size),       \
-                    _Deref_post1_impl_(__maybenull_impl_notref) \
-                        _Deref_post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_)
-
-// e.g. void GetStrings( _Out_count_(cElems) _Deref_post_valid_ LPSTR const rgStr[], size_t cElems
-// );
-#define _Deref_post_valid_                  \
-    _SAL1_1_Source_(_Deref_post_valid_, (), \
-                    _Deref_post1_impl_(__notnull_impl_notref) _Post_valid_impl_)
-#define _Deref_post_opt_valid_                  \
-    _SAL1_1_Source_(_Deref_post_opt_valid_, (), \
-                    _Deref_post1_impl_(__maybenull_impl_notref) _Post_valid_impl_)
-
-#define _Deref_post_notnull_ \
-    _SAL1_1_Source_(_Deref_post_notnull_, (), _Deref_post1_impl_(__notnull_impl_notref))
-#define _Deref_post_maybenull_ \
-    _SAL1_1_Source_(_Deref_post_maybenull_, (), _Deref_post1_impl_(__maybenull_impl_notref))
-#define _Deref_post_null_ \
-    _SAL1_1_Source_(_Deref_post_null_, (), _Deref_post1_impl_(__null_impl_notref))
-
-//
-// _Deref_ret_ ---
-//
-
-#define _Deref_ret_z_                  \
-    _SAL1_1_Source_(_Deref_ret_z_, (), \
-                    _Deref_ret1_impl_(__notnull_impl_notref) _Deref_ret1_impl_(__zterm_impl))
-#define _Deref_ret_opt_z_                  \
-    _SAL1_1_Source_(_Deref_ret_opt_z_, (), \
-                    _Deref_ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__zterm_impl))
-
-//
-// special _Deref_ ---
-//
-#define _Deref2_pre_readonly_ \
-    _SAL1_1_Source_(_Deref2_pre_readonly_, (), _Deref2_pre1_impl_(__readaccess_impl_notref))
-
-//
-// _Ret_ ---
-//
-
-// e.g. _Ret_opt_valid_ LPSTR void* CloneSTR( _Pre_valid_ LPSTR src );
-#define _Ret_opt_valid_ \
-    _SAL1_1_Source_(_Ret_opt_valid_, (), _Ret1_impl_(__maybenull_impl_notref) _Ret_valid_impl_)
-#define _Ret_opt_z_ \
-    _SAL1_1_Source_(_Ret_opt_z_, (), _Ret2_impl_(__maybenull_impl, __zterm_impl) _Ret_valid_impl_)
-
-// e.g. _Ret_opt_bytecap_(cb) void* AllocateMemory( size_t cb );
-// Buffer capacity is described by another parameter
-#define _Ret_cap_(size)                \
-    _SAL1_1_Source_(_Ret_cap_, (size), \
-                    _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_impl(size)))
-#define _Ret_opt_cap_(size)                \
-    _SAL1_1_Source_(_Ret_opt_cap_, (size), \
-                    _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_impl(size)))
-#define _Ret_bytecap_(size)                \
-    _SAL1_1_Source_(_Ret_bytecap_, (size), \
-                    _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_impl(size)))
-#define _Ret_opt_bytecap_(size)                \
-    _SAL1_1_Source_(_Ret_opt_bytecap_, (size), \
-                    _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_impl(size)))
-
-// Buffer capacity is described by a constant expression
-#define _Ret_cap_c_(size)                \
-    _SAL1_1_Source_(_Ret_cap_c_, (size), \
-                    _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_c_impl(size)))
-#define _Ret_opt_cap_c_(size)                \
-    _SAL1_1_Source_(_Ret_opt_cap_c_, (size), \
-                    _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_c_impl(size)))
-#define _Ret_bytecap_c_(size)                \
-    _SAL1_1_Source_(_Ret_bytecap_c_, (size), \
-                    _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size)))
-#define _Ret_opt_bytecap_c_(size)                \
-    _SAL1_1_Source_(_Ret_opt_bytecap_c_, (size), \
-                    _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_c_impl(size)))
-
-// Buffer capacity is described by a complex condition
-#define _Ret_cap_x_(size)                \
-    _SAL1_1_Source_(_Ret_cap_x_, (size), \
-                    _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__cap_x_impl(size)))
-#define _Ret_opt_cap_x_(size)                \
-    _SAL1_1_Source_(_Ret_opt_cap_x_, (size), \
-                    _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__cap_x_impl(size)))
-#define _Ret_bytecap_x_(size)                \
-    _SAL1_1_Source_(_Ret_bytecap_x_, (size), \
-                    _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size)))
-#define _Ret_opt_bytecap_x_(size)                \
-    _SAL1_1_Source_(_Ret_opt_bytecap_x_, (size), \
-                    _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecap_x_impl(size)))
-
-// return value is nullterminated and capacity is given by another parameter
-#define _Ret_z_cap_(size)                                                                          \
-    _SAL1_1_Source_(_Ret_z_cap_, (size),                                                           \
-                    _Ret1_impl_(__notnull_impl_notref) _Ret2_impl_(__zterm_impl, __cap_impl(size)) \
-                        _Ret_valid_impl_)
-#define _Ret_opt_z_cap_(size)                            \
-    _SAL1_1_Source_(_Ret_opt_z_cap_, (size),             \
-                    _Ret1_impl_(__maybenull_impl_notref) \
-                        _Ret2_impl_(__zterm_impl, __cap_impl(size)) _Ret_valid_impl_)
-#define _Ret_z_bytecap_(size)                          \
-    _SAL1_1_Source_(_Ret_z_bytecap_, (size),           \
-                    _Ret1_impl_(__notnull_impl_notref) \
-                        _Ret2_impl_(__zterm_impl, __bytecap_impl(size)) _Ret_valid_impl_)
-#define _Ret_opt_z_bytecap_(size)                        \
-    _SAL1_1_Source_(_Ret_opt_z_bytecap_, (size),         \
-                    _Ret1_impl_(__maybenull_impl_notref) \
-                        _Ret2_impl_(__zterm_impl, __bytecap_impl(size)) _Ret_valid_impl_)
-
-// e.g. _Ret_opt_bytecount_(cb) void* AllocateZeroInitializedMemory( size_t cb );
-// Valid Buffer extent is described by another parameter
-#define _Ret_count_(size)                                                              \
-    _SAL1_1_Source_(_Ret_count_, (size),                                               \
-                    _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_impl(size)) \
-                        _Ret_valid_impl_)
-#define _Ret_opt_count_(size)                                                            \
-    _SAL1_1_Source_(_Ret_opt_count_, (size),                                             \
-                    _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_impl(size)) \
-                        _Ret_valid_impl_)
-#define _Ret_bytecount_(size)                                                              \
-    _SAL1_1_Source_(_Ret_bytecount_, (size),                                               \
-                    _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) \
-                        _Ret_valid_impl_)
-#define _Ret_opt_bytecount_(size)                                                            \
-    _SAL1_1_Source_(_Ret_opt_bytecount_, (size),                                             \
-                    _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_impl(size)) \
-                        _Ret_valid_impl_)
-
-// Valid Buffer extent is described by a constant expression
-#define _Ret_count_c_(size)                                                              \
-    _SAL1_1_Source_(_Ret_count_c_, (size),                                               \
-                    _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_c_impl(size)) \
-                        _Ret_valid_impl_)
-#define _Ret_opt_count_c_(size)                                                            \
-    _SAL1_1_Source_(_Ret_opt_count_c_, (size),                                             \
-                    _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_c_impl(size)) \
-                        _Ret_valid_impl_)
-#define _Ret_bytecount_c_(size)                                                              \
-    _SAL1_1_Source_(_Ret_bytecount_c_, (size),                                               \
-                    _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) \
-                        _Ret_valid_impl_)
-#define _Ret_opt_bytecount_c_(size)                                                            \
-    _SAL1_1_Source_(_Ret_opt_bytecount_c_, (size),                                             \
-                    _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_c_impl(size)) \
-                        _Ret_valid_impl_)
-
-// Valid Buffer extent is described by a complex expression
-#define _Ret_count_x_(size)                                                              \
-    _SAL1_1_Source_(_Ret_count_x_, (size),                                               \
-                    _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__count_x_impl(size)) \
-                        _Ret_valid_impl_)
-#define _Ret_opt_count_x_(size)                                                            \
-    _SAL1_1_Source_(_Ret_opt_count_x_, (size),                                             \
-                    _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__count_x_impl(size)) \
-                        _Ret_valid_impl_)
-#define _Ret_bytecount_x_(size)                                                              \
-    _SAL1_1_Source_(_Ret_bytecount_x_, (size),                                               \
-                    _Ret1_impl_(__notnull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) \
-                        _Ret_valid_impl_)
-#define _Ret_opt_bytecount_x_(size)                                                            \
-    _SAL1_1_Source_(_Ret_opt_bytecount_x_, (size),                                             \
-                    _Ret1_impl_(__maybenull_impl_notref) _Ret1_impl_(__bytecount_x_impl(size)) \
-                        _Ret_valid_impl_)
-
-// return value is nullterminated and length is given by another parameter
-#define _Ret_z_count_(size)                            \
-    _SAL1_1_Source_(_Ret_z_count_, (size),             \
-                    _Ret1_impl_(__notnull_impl_notref) \
-                        _Ret2_impl_(__zterm_impl, __count_impl(size)) _Ret_valid_impl_)
-#define _Ret_opt_z_count_(size)                          \
-    _SAL1_1_Source_(_Ret_opt_z_count_, (size),           \
-                    _Ret1_impl_(__maybenull_impl_notref) \
-                        _Ret2_impl_(__zterm_impl, __count_impl(size)) _Ret_valid_impl_)
-#define _Ret_z_bytecount_(size)                        \
-    _SAL1_1_Source_(_Ret_z_bytecount_, (size),         \
-                    _Ret1_impl_(__notnull_impl_notref) \
-                        _Ret2_impl_(__zterm_impl, __bytecount_impl(size)) _Ret_valid_impl_)
-#define _Ret_opt_z_bytecount_(size)                      \
-    _SAL1_1_Source_(_Ret_opt_z_bytecount_, (size),       \
-                    _Ret1_impl_(__maybenull_impl_notref) \
-                        _Ret2_impl_(__zterm_impl, __bytecount_impl(size)) _Ret_valid_impl_)
-
-// _Pre_ annotations ---
-#define _Pre_opt_z_                                                                \
-    _SAL1_1_Source_(_Pre_opt_z_, (),                                               \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__zterm_impl) \
-                        _Pre_valid_impl_)
-
-// restrict access rights
-#define _Pre_readonly_ _SAL1_1_Source_(_Pre_readonly_, (), _Pre1_impl_(__readaccess_impl_notref))
-#define _Pre_writeonly_ _SAL1_1_Source_(_Pre_writeonly_, (), _Pre1_impl_(__writeaccess_impl_notref))
-
-// e.g. void FreeMemory( _Pre_bytecap_(cb) _Post_ptr_invalid_ void* pv, size_t cb );
-// buffer capacity described by another parameter
-#define _Pre_cap_(size)                \
-    _SAL1_1_Source_(_Pre_cap_, (size), \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size)))
-#define _Pre_opt_cap_(size)                \
-    _SAL1_1_Source_(_Pre_opt_cap_, (size), \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size)))
-#define _Pre_bytecap_(size)                \
-    _SAL1_1_Source_(_Pre_bytecap_, (size), \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size)))
-#define _Pre_opt_bytecap_(size)                \
-    _SAL1_1_Source_(_Pre_opt_bytecap_, (size), \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size)))
-
-// buffer capacity described by a constant expression
-#define _Pre_cap_c_(size)                \
-    _SAL1_1_Source_(_Pre_cap_c_, (size), \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size)))
-#define _Pre_opt_cap_c_(size)                \
-    _SAL1_1_Source_(_Pre_opt_cap_c_, (size), \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size)))
-#define _Pre_bytecap_c_(size)                \
-    _SAL1_1_Source_(_Pre_bytecap_c_, (size), \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)))
-#define _Pre_opt_bytecap_c_(size)                \
-    _SAL1_1_Source_(_Pre_opt_bytecap_c_, (size), \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)))
-#define _Pre_cap_c_one_                  \
-    _SAL1_1_Source_(_Pre_cap_c_one_, (), \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl))
-#define _Pre_opt_cap_c_one_                  \
-    _SAL1_1_Source_(_Pre_opt_cap_c_one_, (), \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl))
-
-// buffer capacity is described by another parameter multiplied by a constant expression
-#define _Pre_cap_m_(mult, size)                \
-    _SAL1_1_Source_(_Pre_cap_m_, (mult, size), \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__mult_impl(mult, size)))
-#define _Pre_opt_cap_m_(mult, size)                \
-    _SAL1_1_Source_(_Pre_opt_cap_m_, (mult, size), \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__mult_impl(mult, size)))
-
-// buffer capacity described by size of other buffer, only used by dangerous legacy APIs
-// e.g. int strcpy(_Pre_cap_for_(src) char* dst, const char* src);
-#define _Pre_cap_for_(param)                \
-    _SAL1_1_Source_(_Pre_cap_for_, (param), \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_for_impl(param)))
-#define _Pre_opt_cap_for_(param)                \
-    _SAL1_1_Source_(_Pre_opt_cap_for_, (param), \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_for_impl(param)))
-
-// buffer capacity described by a complex condition
-#define _Pre_cap_x_(size)                \
-    _SAL1_1_Source_(_Pre_cap_x_, (size), \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size)))
-#define _Pre_opt_cap_x_(size)                \
-    _SAL1_1_Source_(_Pre_opt_cap_x_, (size), \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size)))
-#define _Pre_bytecap_x_(size)                \
-    _SAL1_1_Source_(_Pre_bytecap_x_, (size), \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)))
-#define _Pre_opt_bytecap_x_(size)                \
-    _SAL1_1_Source_(_Pre_opt_bytecap_x_, (size), \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)))
-
-// buffer capacity described by the difference to another pointer parameter
-#define _Pre_ptrdiff_cap_(ptr)                \
-    _SAL1_1_Source_(_Pre_ptrdiff_cap_, (ptr), \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr))))
-#define _Pre_opt_ptrdiff_cap_(ptr)                       \
-    _SAL1_1_Source_(_Pre_opt_ptrdiff_cap_, (ptr),        \
-                    _Pre1_impl_(__maybenull_impl_notref) \
-                        _Pre1_impl_(__cap_x_impl(__ptrdiff(ptr))))
-
-// e.g. void AppendStr( _Pre_z_ const char* szFrom, _Pre_z_cap_(cchTo) _Post_z_ char* szTo, size_t
-// cchTo );
-#define _Pre_z_cap_(size)                                                                          \
-    _SAL1_1_Source_(_Pre_z_cap_, (size),                                                           \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre2_impl_(__zterm_impl, __cap_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_opt_z_cap_(size)                            \
-    _SAL1_1_Source_(_Pre_opt_z_cap_, (size),             \
-                    _Pre1_impl_(__maybenull_impl_notref) \
-                        _Pre2_impl_(__zterm_impl, __cap_impl(size)) _Pre_valid_impl_)
-#define _Pre_z_bytecap_(size)                          \
-    _SAL1_1_Source_(_Pre_z_bytecap_, (size),           \
-                    _Pre1_impl_(__notnull_impl_notref) \
-                        _Pre2_impl_(__zterm_impl, __bytecap_impl(size)) _Pre_valid_impl_)
-#define _Pre_opt_z_bytecap_(size)                        \
-    _SAL1_1_Source_(_Pre_opt_z_bytecap_, (size),         \
-                    _Pre1_impl_(__maybenull_impl_notref) \
-                        _Pre2_impl_(__zterm_impl, __bytecap_impl(size)) _Pre_valid_impl_)
-
-#define _Pre_z_cap_c_(size)                            \
-    _SAL1_1_Source_(_Pre_z_cap_c_, (size),             \
-                    _Pre1_impl_(__notnull_impl_notref) \
-                        _Pre2_impl_(__zterm_impl, __cap_c_impl(size)) _Pre_valid_impl_)
-#define _Pre_opt_z_cap_c_(size)                          \
-    _SAL1_1_Source_(_Pre_opt_z_cap_c_, (size),           \
-                    _Pre1_impl_(__maybenull_impl_notref) \
-                        _Pre2_impl_(__zterm_impl, __cap_c_impl(size)) _Pre_valid_impl_)
-#define _Pre_z_bytecap_c_(size)                        \
-    _SAL1_1_Source_(_Pre_z_bytecap_c_, (size),         \
-                    _Pre1_impl_(__notnull_impl_notref) \
-                        _Pre2_impl_(__zterm_impl, __bytecap_c_impl(size)) _Pre_valid_impl_)
-#define _Pre_opt_z_bytecap_c_(size)                      \
-    _SAL1_1_Source_(_Pre_opt_z_bytecap_c_, (size),       \
-                    _Pre1_impl_(__maybenull_impl_notref) \
-                        _Pre2_impl_(__zterm_impl, __bytecap_c_impl(size)) _Pre_valid_impl_)
-
-#define _Pre_z_cap_x_(size)                            \
-    _SAL1_1_Source_(_Pre_z_cap_x_, (size),             \
-                    _Pre1_impl_(__notnull_impl_notref) \
-                        _Pre2_impl_(__zterm_impl, __cap_x_impl(size)) _Pre_valid_impl_)
-#define _Pre_opt_z_cap_x_(size)                          \
-    _SAL1_1_Source_(_Pre_opt_z_cap_x_, (size),           \
-                    _Pre1_impl_(__maybenull_impl_notref) \
-                        _Pre2_impl_(__zterm_impl, __cap_x_impl(size)) _Pre_valid_impl_)
-#define _Pre_z_bytecap_x_(size)                        \
-    _SAL1_1_Source_(_Pre_z_bytecap_x_, (size),         \
-                    _Pre1_impl_(__notnull_impl_notref) \
-                        _Pre2_impl_(__zterm_impl, __bytecap_x_impl(size)) _Pre_valid_impl_)
-#define _Pre_opt_z_bytecap_x_(size)                      \
-    _SAL1_1_Source_(_Pre_opt_z_bytecap_x_, (size),       \
-                    _Pre1_impl_(__maybenull_impl_notref) \
-                        _Pre2_impl_(__zterm_impl, __bytecap_x_impl(size)) _Pre_valid_impl_)
-
-// known capacity and valid but unknown readable extent
-#define _Pre_valid_cap_(size)                                                        \
-    _SAL1_1_Source_(_Pre_valid_cap_, (size),                                         \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_opt_valid_cap_(size)                                                      \
-    _SAL1_1_Source_(_Pre_opt_valid_cap_, (size),                                       \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_valid_bytecap_(size)                                                        \
-    _SAL1_1_Source_(_Pre_valid_bytecap_, (size),                                         \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_opt_valid_bytecap_(size)                                                      \
-    _SAL1_1_Source_(_Pre_opt_valid_bytecap_, (size),                                       \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_impl(size)) \
-                        _Pre_valid_impl_)
-
-#define _Pre_valid_cap_c_(size)                                                        \
-    _SAL1_1_Source_(_Pre_valid_cap_c_, (size),                                         \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_opt_valid_cap_c_(size)                                                      \
-    _SAL1_1_Source_(_Pre_opt_valid_cap_c_, (size),                                       \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_valid_bytecap_c_(size)                                                        \
-    _SAL1_1_Source_(_Pre_valid_bytecap_c_, (size),                                         \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_opt_valid_bytecap_c_(size)                                                      \
-    _SAL1_1_Source_(_Pre_opt_valid_bytecap_c_, (size),                                       \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_c_impl(size)) \
-                        _Pre_valid_impl_)
-
-#define _Pre_valid_cap_x_(size)                                                        \
-    _SAL1_1_Source_(_Pre_valid_cap_x_, (size),                                         \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_opt_valid_cap_x_(size)                                                      \
-    _SAL1_1_Source_(_Pre_opt_valid_cap_x_, (size),                                       \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_x_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_valid_bytecap_x_(size)                                                        \
-    _SAL1_1_Source_(_Pre_valid_bytecap_x_, (size),                                         \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_opt_valid_bytecap_x_(size)                                                      \
-    _SAL1_1_Source_(_Pre_opt_valid_bytecap_x_, (size),                                       \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecap_x_impl(size)) \
-                        _Pre_valid_impl_)
-
-// e.g. void AppendCharRange( _Pre_count_(cchFrom) const char* rgFrom, size_t cchFrom,
-// _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); Valid buffer extent described by another parameter
-#define _Pre_count_(size)                                                              \
-    _SAL1_1_Source_(_Pre_count_, (size),                                               \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_opt_count_(size)                                                            \
-    _SAL1_1_Source_(_Pre_opt_count_, (size),                                             \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_bytecount_(size)                                                              \
-    _SAL1_1_Source_(_Pre_bytecount_, (size),                                               \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_opt_bytecount_(size)                                                            \
-    _SAL1_1_Source_(_Pre_opt_bytecount_, (size),                                             \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_impl(size)) \
-                        _Pre_valid_impl_)
-
-// Valid buffer extent described by a constant expression
-#define _Pre_count_c_(size)                                                              \
-    _SAL1_1_Source_(_Pre_count_c_, (size),                                               \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_c_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_opt_count_c_(size)                                                            \
-    _SAL1_1_Source_(_Pre_opt_count_c_, (size),                                             \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_c_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_bytecount_c_(size)                                                              \
-    _SAL1_1_Source_(_Pre_bytecount_c_, (size),                                               \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_opt_bytecount_c_(size)                                                            \
-    _SAL1_1_Source_(_Pre_opt_bytecount_c_, (size),                                             \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_c_impl(size)) \
-                        _Pre_valid_impl_)
-
-// Valid buffer extent described by a complex expression
-#define _Pre_count_x_(size)                                                              \
-    _SAL1_1_Source_(_Pre_count_x_, (size),                                               \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_opt_count_x_(size)                                                            \
-    _SAL1_1_Source_(_Pre_opt_count_x_, (size),                                             \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__count_x_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_bytecount_x_(size)                                                              \
-    _SAL1_1_Source_(_Pre_bytecount_x_, (size),                                               \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) \
-                        _Pre_valid_impl_)
-#define _Pre_opt_bytecount_x_(size)                                                            \
-    _SAL1_1_Source_(_Pre_opt_bytecount_x_, (size),                                             \
-                    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__bytecount_x_impl(size)) \
-                        _Pre_valid_impl_)
-
-// Valid buffer extent described by the difference to another pointer parameter
-#define _Pre_ptrdiff_count_(ptr)                                                                   \
-    _SAL1_1_Source_(_Pre_ptrdiff_count_, (ptr),                                                    \
-                    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) \
-                        _Pre_valid_impl_)
-#define _Pre_opt_ptrdiff_count_(ptr)                     \
-    _SAL1_1_Source_(_Pre_opt_ptrdiff_count_, (ptr),      \
-                    _Pre1_impl_(__maybenull_impl_notref) \
-                        _Pre1_impl_(__count_x_impl(__ptrdiff(ptr))) _Pre_valid_impl_)
-
-// char * strncpy(_Out_cap_(_Count) _Post_maybez_ char * _Dest, _In_z_ const char * _Source, _In_
-// size_t _Count) buffer maybe zero-terminated after the call
-#define _Post_maybez_ _SAL1_1_Source_(_Post_maybez_, (), _Post1_impl_(__maybezterm_impl))
-
-// e.g. size_t HeapSize( _In_ HANDLE hHeap, DWORD dwFlags, _Pre_notnull_ _Post_bytecap_(return)
-// LPCVOID lpMem );
-#define _Post_cap_(size) _SAL1_1_Source_(_Post_cap_, (size), _Post1_impl_(__cap_impl(size)))
-#define _Post_bytecap_(size) \
-    _SAL1_1_Source_(_Post_bytecap_, (size), _Post1_impl_(__bytecap_impl(size)))
-
-// e.g. int strlen( _In_z_ _Post_count_(return+1) const char* sz );
-#define _Post_count_(size) \
-    _SAL1_1_Source_(_Post_count_, (size), _Post1_impl_(__count_impl(size)) _Post_valid_impl_)
-#define _Post_bytecount_(size)                \
-    _SAL1_1_Source_(_Post_bytecount_, (size), \
-                    _Post1_impl_(__bytecount_impl(size)) _Post_valid_impl_)
-#define _Post_count_c_(size) \
-    _SAL1_1_Source_(_Post_count_c_, (size), _Post1_impl_(__count_c_impl(size)) _Post_valid_impl_)
-#define _Post_bytecount_c_(size)                \
-    _SAL1_1_Source_(_Post_bytecount_c_, (size), \
-                    _Post1_impl_(__bytecount_c_impl(size)) _Post_valid_impl_)
-#define _Post_count_x_(size) \
-    _SAL1_1_Source_(_Post_count_x_, (size), _Post1_impl_(__count_x_impl(size)) _Post_valid_impl_)
-#define _Post_bytecount_x_(size)                \
-    _SAL1_1_Source_(_Post_bytecount_x_, (size), \
-                    _Post1_impl_(__bytecount_x_impl(size)) _Post_valid_impl_)
-
-// e.g. size_t CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cch) _Post_z_count_(return+1) char*
-// szFrom, size_t cchFrom );
-#define _Post_z_count_(size)                \
-    _SAL1_1_Source_(_Post_z_count_, (size), \
-                    _Post2_impl_(__zterm_impl, __count_impl(size)) _Post_valid_impl_)
-#define _Post_z_bytecount_(size)                \
-    _SAL1_1_Source_(_Post_z_bytecount_, (size), \
-                    _Post2_impl_(__zterm_impl, __bytecount_impl(size)) _Post_valid_impl_)
-#define _Post_z_count_c_(size)                \
-    _SAL1_1_Source_(_Post_z_count_c_, (size), \
-                    _Post2_impl_(__zterm_impl, __count_c_impl(size)) _Post_valid_impl_)
-#define _Post_z_bytecount_c_(size)                \
-    _SAL1_1_Source_(_Post_z_bytecount_c_, (size), \
-                    _Post2_impl_(__zterm_impl, __bytecount_c_impl(size)) _Post_valid_impl_)
-#define _Post_z_count_x_(size)                \
-    _SAL1_1_Source_(_Post_z_count_x_, (size), \
-                    _Post2_impl_(__zterm_impl, __count_x_impl(size)) _Post_valid_impl_)
-#define _Post_z_bytecount_x_(size)                \
-    _SAL1_1_Source_(_Post_z_bytecount_x_, (size), \
-                    _Post2_impl_(__zterm_impl, __bytecount_x_impl(size)) _Post_valid_impl_)
-
-//
-// _Prepost_ ---
-//
-// describing conditions that hold before and after the function call
-
-#define _Prepost_opt_z_ _SAL1_1_Source_(_Prepost_opt_z_, (), _Pre_opt_z_ _Post_z_)
-
-#define _Prepost_count_(size) \
-    _SAL1_1_Source_(_Prepost_count_, (size), _Pre_count_(size) _Post_count_(size))
-#define _Prepost_opt_count_(size) \
-    _SAL1_1_Source_(_Prepost_opt_count_, (size), _Pre_opt_count_(size) _Post_count_(size))
-#define _Prepost_bytecount_(size) \
-    _SAL1_1_Source_(_Prepost_bytecount_, (size), _Pre_bytecount_(size) _Post_bytecount_(size))
-#define _Prepost_opt_bytecount_(size)                \
-    _SAL1_1_Source_(_Prepost_opt_bytecount_, (size), \
-                    _Pre_opt_bytecount_(size) _Post_bytecount_(size))
-#define _Prepost_count_c_(size) \
-    _SAL1_1_Source_(_Prepost_count_c_, (size), _Pre_count_c_(size) _Post_count_c_(size))
-#define _Prepost_opt_count_c_(size) \
-    _SAL1_1_Source_(_Prepost_opt_count_c_, (size), _Pre_opt_count_c_(size) _Post_count_c_(size))
-#define _Prepost_bytecount_c_(size) \
-    _SAL1_1_Source_(_Prepost_bytecount_c_, (size), _Pre_bytecount_c_(size) _Post_bytecount_c_(size))
-#define _Prepost_opt_bytecount_c_(size)                \
-    _SAL1_1_Source_(_Prepost_opt_bytecount_c_, (size), \
-                    _Pre_opt_bytecount_c_(size) _Post_bytecount_c_(size))
-#define _Prepost_count_x_(size) \
-    _SAL1_1_Source_(_Prepost_count_x_, (size), _Pre_count_x_(size) _Post_count_x_(size))
-#define _Prepost_opt_count_x_(size) \
-    _SAL1_1_Source_(_Prepost_opt_count_x_, (size), _Pre_opt_count_x_(size) _Post_count_x_(size))
-#define _Prepost_bytecount_x_(size) \
-    _SAL1_1_Source_(_Prepost_bytecount_x_, (size), _Pre_bytecount_x_(size) _Post_bytecount_x_(size))
-#define _Prepost_opt_bytecount_x_(size)                \
-    _SAL1_1_Source_(_Prepost_opt_bytecount_x_, (size), \
-                    _Pre_opt_bytecount_x_(size) _Post_bytecount_x_(size))
-
-#define _Prepost_valid_ _SAL1_1_Source_(_Prepost_valid_, (), _Pre_valid_ _Post_valid_)
-#define _Prepost_opt_valid_ _SAL1_1_Source_(_Prepost_opt_valid_, (), _Pre_opt_valid_ _Post_valid_)
-
-//
-// _Deref_<both> ---
-//
-// short version for _Deref_pre_<ann> _Deref_post_<ann>
-// describing conditions for array elements or dereferenced pointer parameters that hold before and
-// after the call
-
-#define _Deref_prepost_z_ _SAL1_1_Source_(_Deref_prepost_z_, (), _Deref_pre_z_ _Deref_post_z_)
-#define _Deref_prepost_opt_z_ \
-    _SAL1_1_Source_(_Deref_prepost_opt_z_, (), _Deref_pre_opt_z_ _Deref_post_opt_z_)
-
-#define _Deref_prepost_cap_(size) \
-    _SAL1_1_Source_(_Deref_prepost_cap_, (size), _Deref_pre_cap_(size) _Deref_post_cap_(size))
-#define _Deref_prepost_opt_cap_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_opt_cap_, (size), \
-                    _Deref_pre_opt_cap_(size) _Deref_post_opt_cap_(size))
-#define _Deref_prepost_bytecap_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_bytecap_, (size), \
-                    _Deref_pre_bytecap_(size) _Deref_post_bytecap_(size))
-#define _Deref_prepost_opt_bytecap_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_opt_bytecap_, (size), \
-                    _Deref_pre_opt_bytecap_(size) _Deref_post_opt_bytecap_(size))
-
-#define _Deref_prepost_cap_x_(size) \
-    _SAL1_1_Source_(_Deref_prepost_cap_x_, (size), _Deref_pre_cap_x_(size) _Deref_post_cap_x_(size))
-#define _Deref_prepost_opt_cap_x_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_opt_cap_x_, (size), \
-                    _Deref_pre_opt_cap_x_(size) _Deref_post_opt_cap_x_(size))
-#define _Deref_prepost_bytecap_x_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_bytecap_x_, (size), \
-                    _Deref_pre_bytecap_x_(size) _Deref_post_bytecap_x_(size))
-#define _Deref_prepost_opt_bytecap_x_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_opt_bytecap_x_, (size), \
-                    _Deref_pre_opt_bytecap_x_(size) _Deref_post_opt_bytecap_x_(size))
-
-#define _Deref_prepost_z_cap_(size) \
-    _SAL1_1_Source_(_Deref_prepost_z_cap_, (size), _Deref_pre_z_cap_(size) _Deref_post_z_cap_(size))
-#define _Deref_prepost_opt_z_cap_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_opt_z_cap_, (size), \
-                    _Deref_pre_opt_z_cap_(size) _Deref_post_opt_z_cap_(size))
-#define _Deref_prepost_z_bytecap_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_z_bytecap_, (size), \
-                    _Deref_pre_z_bytecap_(size) _Deref_post_z_bytecap_(size))
-#define _Deref_prepost_opt_z_bytecap_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_opt_z_bytecap_, (size), \
-                    _Deref_pre_opt_z_bytecap_(size) _Deref_post_opt_z_bytecap_(size))
-
-#define _Deref_prepost_valid_cap_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_valid_cap_, (size), \
-                    _Deref_pre_valid_cap_(size) _Deref_post_valid_cap_(size))
-#define _Deref_prepost_opt_valid_cap_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_, (size), \
-                    _Deref_pre_opt_valid_cap_(size) _Deref_post_opt_valid_cap_(size))
-#define _Deref_prepost_valid_bytecap_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_valid_bytecap_, (size), \
-                    _Deref_pre_valid_bytecap_(size) _Deref_post_valid_bytecap_(size))
-#define _Deref_prepost_opt_valid_bytecap_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_, (size), \
-                    _Deref_pre_opt_valid_bytecap_(size) _Deref_post_opt_valid_bytecap_(size))
-
-#define _Deref_prepost_valid_cap_x_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_valid_cap_x_, (size), \
-                    _Deref_pre_valid_cap_x_(size) _Deref_post_valid_cap_x_(size))
-#define _Deref_prepost_opt_valid_cap_x_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_opt_valid_cap_x_, (size), \
-                    _Deref_pre_opt_valid_cap_x_(size) _Deref_post_opt_valid_cap_x_(size))
-#define _Deref_prepost_valid_bytecap_x_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_valid_bytecap_x_, (size), \
-                    _Deref_pre_valid_bytecap_x_(size) _Deref_post_valid_bytecap_x_(size))
-#define _Deref_prepost_opt_valid_bytecap_x_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_opt_valid_bytecap_x_, (size), \
-                    _Deref_pre_opt_valid_bytecap_x_(size) _Deref_post_opt_valid_bytecap_x_(size))
-
-#define _Deref_prepost_count_(size) \
-    _SAL1_1_Source_(_Deref_prepost_count_, (size), _Deref_pre_count_(size) _Deref_post_count_(size))
-#define _Deref_prepost_opt_count_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_opt_count_, (size), \
-                    _Deref_pre_opt_count_(size) _Deref_post_opt_count_(size))
-#define _Deref_prepost_bytecount_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_bytecount_, (size), \
-                    _Deref_pre_bytecount_(size) _Deref_post_bytecount_(size))
-#define _Deref_prepost_opt_bytecount_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_opt_bytecount_, (size), \
-                    _Deref_pre_opt_bytecount_(size) _Deref_post_opt_bytecount_(size))
-
-#define _Deref_prepost_count_x_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_count_x_, (size), \
-                    _Deref_pre_count_x_(size) _Deref_post_count_x_(size))
-#define _Deref_prepost_opt_count_x_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_opt_count_x_, (size), \
-                    _Deref_pre_opt_count_x_(size) _Deref_post_opt_count_x_(size))
-#define _Deref_prepost_bytecount_x_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_bytecount_x_, (size), \
-                    _Deref_pre_bytecount_x_(size) _Deref_post_bytecount_x_(size))
-#define _Deref_prepost_opt_bytecount_x_(size)                \
-    _SAL1_1_Source_(_Deref_prepost_opt_bytecount_x_, (size), \
-                    _Deref_pre_opt_bytecount_x_(size) _Deref_post_opt_bytecount_x_(size))
-
-#define _Deref_prepost_valid_ \
-    _SAL1_1_Source_(_Deref_prepost_valid_, (), _Deref_pre_valid_ _Deref_post_valid_)
-#define _Deref_prepost_opt_valid_ \
-    _SAL1_1_Source_(_Deref_prepost_opt_valid_, (), _Deref_pre_opt_valid_ _Deref_post_opt_valid_)
-
-//
-// _Deref_<miscellaneous>
-//
-// used with references to arrays
-
-#define _Deref_out_z_cap_c_(size) \
-    _SAL1_1_Source_(_Deref_out_z_cap_c_, (size), _Deref_pre_cap_c_(size) _Deref_post_z_)
-#define _Deref_inout_z_cap_c_(size) \
-    _SAL1_1_Source_(_Deref_inout_z_cap_c_, (size), _Deref_pre_z_cap_c_(size) _Deref_post_z_)
-#define _Deref_out_z_bytecap_c_(size) \
-    _SAL1_1_Source_(_Deref_out_z_bytecap_c_, (size), _Deref_pre_bytecap_c_(size) _Deref_post_z_)
-#define _Deref_inout_z_bytecap_c_(size) \
-    _SAL1_1_Source_(_Deref_inout_z_bytecap_c_, (size), _Deref_pre_z_bytecap_c_(size) _Deref_post_z_)
-#define _Deref_inout_z_ _SAL1_1_Source_(_Deref_inout_z_, (), _Deref_prepost_z_)
-
-// #pragma endregion Input Buffer SAL 1 compatibility macros
-
-//============================================================================
-//   Implementation Layer:
-//============================================================================
-
-// Naming conventions:
-// A symbol the begins with _SA_ is for the machinery of creating any
-// annotations; many of those come from sourceannotations.h in the case
-// of attributes.
-
-// A symbol that ends with _impl is the very lowest level macro.  It is
-// not required to be a legal standalone annotation, and in the case
-// of attribute annotations, usually is not.  (In the case of some declspec
-// annotations, it might be, but it should not be assumed so.)  Those
-// symols will be used in the _PreN..., _PostN... and _RetN... annotations
-// to build up more complete annotations.
-
-// A symbol ending in _impl_ is reserved to the implementation as well,
-// but it does form a complete annotation; usually they are used to build
-// up even higher level annotations.
-
-#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL  // [
-// Sharable "_impl" macros: these can be shared between the various annotation
-// forms but are part of the implementation of the macros.  These are collected
-// here to assure that only necessary differences in the annotations
-// exist.
-
-#define _Always_impl_(annos) _Group_(annos _SAL_nop_impl_) _On_failure_impl_(annos _SAL_nop_impl_)
-#define _Bound_impl_ _SA_annotes0(SAL_bound)
-#define _Field_range_impl_(min, max) _Range_impl_(min, max)
-#define _Literal_impl_ _SA_annotes1(SAL_constant, __yes)
-#define _Maybenull_impl_ _SA_annotes1(SAL_null, __maybe)
-#define _Maybevalid_impl_ _SA_annotes1(SAL_valid, __maybe)
-#define _Must_inspect_impl_ _Post_impl_ _SA_annotes0(SAL_mustInspect)
-#define _Notliteral_impl_ _SA_annotes1(SAL_constant, __no)
-#define _Notnull_impl_ _SA_annotes1(SAL_null, __no)
-#define _Notvalid_impl_ _SA_annotes1(SAL_valid, __no)
-#define _NullNull_terminated_impl_                  \
-    _Group_(_SA_annotes1(SAL_nullTerminated, __yes) \
-                _SA_annotes1(SAL_readableTo, inexpressibleCount("NullNull terminated string")))
-#define _Null_impl_ _SA_annotes1(SAL_null, __yes)
-#define _Null_terminated_impl_ _SA_annotes1(SAL_nullTerminated, __yes)
-#define _Out_impl_ \
-    _Pre1_impl_(__notnull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_
-#define _Out_opt_impl_ \
-    _Pre1_impl_(__maybenull_impl_notref) _Pre1_impl_(__cap_c_one_notref_impl) _Post_valid_impl_
-#define _Points_to_data_impl_ _At_(*_Curr_, _SA_annotes1(SAL_mayBePointer, __no))
-#define _Post_satisfies_impl_(cond) _Post_impl_ _Satisfies_impl_(cond)
-#define _Post_valid_impl_ _Post1_impl_(__valid_impl)
-#define _Pre_satisfies_impl_(cond) _Pre_impl_ _Satisfies_impl_(cond)
-#define _Pre_valid_impl_ _Pre1_impl_(__valid_impl)
-#define _Range_impl_(min, max) _SA_annotes2(SAL_range, min, max)
-#define _Readable_bytes_impl_(size) _SA_annotes1(SAL_readableTo, byteCount(size))
-#define _Readable_elements_impl_(size) _SA_annotes1(SAL_readableTo, elementCount(size))
-#define _Ret_valid_impl_ _Ret1_impl_(__valid_impl)
-#define _Satisfies_impl_(cond) _SA_annotes1(SAL_satisfies, cond)
-#define _Valid_impl_ _SA_annotes1(SAL_valid, __yes)
-#define _Writable_bytes_impl_(size) _SA_annotes1(SAL_writableTo, byteCount(size))
-#define _Writable_elements_impl_(size) _SA_annotes1(SAL_writableTo, elementCount(size))
-
-#define _In_range_impl_(min, max) _Pre_impl_ _Range_impl_(min, max)
-#define _Out_range_impl_(min, max) _Post_impl_ _Range_impl_(min, max)
-#define _Ret_range_impl_(min, max) _Post_impl_ _Range_impl_(min, max)
-#define _Deref_in_range_impl_(min, max) _Deref_pre_impl_ _Range_impl_(min, max)
-#define _Deref_out_range_impl_(min, max) _Deref_post_impl_ _Range_impl_(min, max)
-#define _Deref_ret_range_impl_(min, max) _Deref_post_impl_ _Range_impl_(min, max)
-
-#define _Deref_pre_impl_ _Pre_impl_ _Notref_impl_ _Deref_impl_
-#define _Deref_post_impl_ _Post_impl_ _Notref_impl_ _Deref_impl_
-
-// The following are for the implementation machinery, and are not
-// suitable for annotating general code.
-// We're tying to phase this out, someday.  The parser quotes the param.
-#define __AuToQuOtE _SA_annotes0(SAL_AuToQuOtE)
-
-// Normally the parser does some simple type checking of annotation params,
-// defer that check to the plugin.
-#define __deferTypecheck _SA_annotes0(SAL_deferTypecheck)
-
-#define _SA_SPECSTRIZE(x) #x
-#define _SAL_nop_impl_ /* nothing */
-#define __nop_impl(x) x
-#endif
-
-#if _USE_ATTRIBUTES_FOR_SAL  // [
-
-// Using attributes for sal
-
-#include "codeanalysis\sourceannotations.h"
-
-#define _SA_annotes0(n) [SAL_annotes(Name = #n)]
-#define _SA_annotes1(n, pp1) [SAL_annotes(Name = #n, p1 = _SA_SPECSTRIZE(pp1))]
-#define _SA_annotes2(n, pp1, pp2) \
-    [SAL_annotes(Name = #n, p1 = _SA_SPECSTRIZE(pp1), p2 = _SA_SPECSTRIZE(pp2))]
-#define _SA_annotes3(n, pp1, pp2, pp3)                                          \
-    [SAL_annotes(Name = #n, p1 = _SA_SPECSTRIZE(pp1), p2 = _SA_SPECSTRIZE(pp2), \
-                 p3 = _SA_SPECSTRIZE(pp3))]
-
-#define _Pre_impl_ [SAL_pre]
-#define _Post_impl_ [SAL_post]
-#define _Deref_impl_ [SAL_deref]
-#define _Notref_impl_ [SAL_notref]
-
-// Declare a function to be an annotation or primop (respectively).
-// Done this way so that they don't appear in the regular compiler's
-// namespace.
-#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun;
-#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun;
-#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun;
-
-// Benign declspec needed here for WindowsPREfast
-#define __In_impl_                                                            \
-    [SA_Pre(Valid = SA_Yes)][SA_Pre(Deref = 1, Notref = 1, Access = SA_Read)] \
-        __declspec("SAL_pre SAL_valid")
-
-#elif _USE_DECLSPECS_FOR_SAL  // ][
-
-// Using declspecs for sal
-
-#define _SA_annotes0(n) __declspec(#n)
-#define _SA_annotes1(n, pp1) __declspec(#n "(" _SA_SPECSTRIZE(pp1) ")")
-#define _SA_annotes2(n, pp1, pp2) __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) ")")
-#define _SA_annotes3(n, pp1, pp2, pp3) \
-    __declspec(#n "(" _SA_SPECSTRIZE(pp1) "," _SA_SPECSTRIZE(pp2) "," _SA_SPECSTRIZE(pp3) ")")
-
-#define _Pre_impl_ _SA_annotes0(SAL_pre)
-#define _Post_impl_ _SA_annotes0(SAL_post)
-#define _Deref_impl_ _SA_annotes0(SAL_deref)
-#define _Notref_impl_ _SA_annotes0(SAL_notref)
-
-// Declare a function to be an annotation or primop (respectively).
-// Done this way so that they don't appear in the regular compiler's
-// namespace.
-#define __ANNOTATION(fun) _SA_annotes0(SAL_annotation) void __SA_##fun
-
-#define __PRIMOP(type, fun) _SA_annotes0(SAL_primop) type __SA_##fun
-
-#define __QUALIFIER(fun) _SA_annotes0(SAL_qualifier) void __SA_##fun;
-
-#define __In_impl_                     \
-    _Pre_impl_ _SA_annotes0(SAL_valid) \
-    _Pre_impl_ _Deref_impl_ _Notref_impl_ _SA_annotes0(SAL_readonly)
-
-#else  // ][
-
-// Using "nothing" for sal
-
-#define _SA_annotes0(n)
-#define _SA_annotes1(n, pp1)
-#define _SA_annotes2(n, pp1, pp2)
-#define _SA_annotes3(n, pp1, pp2, pp3)
-
-#define __ANNOTATION(fun)
-#define __PRIMOP(type, fun)
-#define __QUALIFIER(type, fun)
-
-#endif  // ]
-
-#if _USE_ATTRIBUTES_FOR_SAL || _USE_DECLSPECS_FOR_SAL  // [
-
-// Declare annotations that need to be declared.
-__ANNOTATION(SAL_useHeader(void));
-__ANNOTATION(SAL_bound(void));
-__ANNOTATION(SAL_allocator(void));  //??? resolve with PFD
-__ANNOTATION(SAL_file_parser(__AuToQuOtE __In_impl_ char *, __In_impl_ char *));
-__ANNOTATION(SAL_source_code_content(__In_impl_ char *));
-__ANNOTATION(SAL_analysisHint(__AuToQuOtE __In_impl_ char *));
-__ANNOTATION(SAL_untrusted_data_source(__AuToQuOtE __In_impl_ char *));
-__ANNOTATION(SAL_untrusted_data_source_this(__AuToQuOtE __In_impl_ char *));
-__ANNOTATION(SAL_validated(__AuToQuOtE __In_impl_ char *));
-__ANNOTATION(SAL_validated_this(__AuToQuOtE __In_impl_ char *));
-__ANNOTATION(SAL_encoded(void));
-__ANNOTATION(SAL_adt(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *));
-__ANNOTATION(SAL_add_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *));
-__ANNOTATION(SAL_remove_adt_property(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *));
-__ANNOTATION(SAL_transfer_adt_property_from(__AuToQuOtE __In_impl_ char *));
-__ANNOTATION(SAL_post_type(__AuToQuOtE __In_impl_ char *));
-__ANNOTATION(SAL_volatile(void));
-__ANNOTATION(SAL_nonvolatile(void));
-__ANNOTATION(SAL_entrypoint(__AuToQuOtE __In_impl_ char *, __AuToQuOtE __In_impl_ char *));
-__ANNOTATION(SAL_blocksOn(__In_impl_ void *));
-__ANNOTATION(SAL_mustInspect(void));
-
-// Only appears in model files, but needs to be declared.
-__ANNOTATION(SAL_TypeName(__AuToQuOtE __In_impl_ char *));
-
-// To be declared well-known soon.
-__ANNOTATION(SAL_interlocked(void);)
-
-#pragma warning(suppress : 28227 28241)
-__ANNOTATION(SAL_name(__In_impl_ char *, __In_impl_ char *, __In_impl_ char *);)
-
-__PRIMOP(char *, _Macro_value_(__In_impl_ char *));
-__PRIMOP(int, _Macro_defined_(__In_impl_ char *));
-__PRIMOP(char *, _Strstr_(__In_impl_ char *, __In_impl_ char *));
-
-#endif  // ]
-
-#if _USE_ATTRIBUTES_FOR_SAL  // [
-
-#define _Check_return_impl_ [SA_Post(MustCheck = SA_Yes)]
-
-#define _Success_impl_(expr) [SA_Success(Condition = #expr)]
-#define _On_failure_impl_(annos) \
-    [SAL_context(p1 = "SAL_failed")] _Group_(_Post_impl_ _Group_(annos _SAL_nop_impl_))
-
-#define _Printf_format_string_impl_ [SA_FormatString(Style = "printf")]
-#define _Scanf_format_string_impl_ [SA_FormatString(Style = "scanf")]
-#define _Scanf_s_format_string_impl_ [SA_FormatString(Style = "scanf_s")]
-
-#define _In_bound_impl_ [SA_PreBound(Deref = 0)]
-#define _Out_bound_impl_ [SA_PostBound(Deref = 0)]
-#define _Ret_bound_impl_ [SA_PostBound(Deref = 0)]
-#define _Deref_in_bound_impl_ [SA_PreBound(Deref = 1)]
-#define _Deref_out_bound_impl_ [SA_PostBound(Deref = 1)]
-#define _Deref_ret_bound_impl_ [SA_PostBound(Deref = 1)]
-
-#define __valid_impl Valid = SA_Yes
-#define __maybevalid_impl Valid = SA_Maybe
-#define __notvalid_impl Valid = SA_No
-
-#define __null_impl Null = SA_Yes
-#define __maybenull_impl Null = SA_Maybe
-#define __notnull_impl Null = SA_No
-
-#define __null_impl_notref Null = SA_Yes, Notref = 1
-#define __maybenull_impl_notref Null = SA_Maybe, Notref = 1
-#define __notnull_impl_notref Null = SA_No, Notref = 1
-
-#define __zterm_impl NullTerminated = SA_Yes
-#define __maybezterm_impl NullTerminated = SA_Maybe
-#define __maybzterm_impl NullTerminated = SA_Maybe
-#define __notzterm_impl NullTerminated = SA_No
-
-#define __readaccess_impl Access = SA_Read
-#define __writeaccess_impl Access = SA_Write
-#define __allaccess_impl Access = SA_ReadWrite
-
-#define __readaccess_impl_notref Access = SA_Read, Notref = 1
-#define __writeaccess_impl_notref Access = SA_Write, Notref = 1
-#define __allaccess_impl_notref Access = SA_ReadWrite, Notref = 1
-
-#if _MSC_VER >= 1610 /*IFSTRIP=IGN*/  // [
-
-// For SAL2, we need to expect general expressions.
-
-#define __cap_impl(size) WritableElements = "\n" #size
-#define __bytecap_impl(size) WritableBytes = "\n" #size
-#define __bytecount_impl(size) ValidBytes = "\n" #size
-#define __count_impl(size) ValidElements = "\n" #size
-
-#else  // ][
-
-#define __cap_impl(size) WritableElements = #size
-#define __bytecap_impl(size) WritableBytes = #size
-#define __bytecount_impl(size) ValidBytes = #size
-#define __count_impl(size) ValidElements = #size
-
-#endif  // ]
-
-#define __cap_c_impl(size) WritableElementsConst = size
-#define __cap_c_one_notref_impl WritableElementsConst = 1, Notref = 1
-#define __cap_for_impl(param) WritableElementsLength = #param
-#define __cap_x_impl(size) WritableElements = "\n@" #size
-
-#define __bytecap_c_impl(size) WritableBytesConst = size
-#define __bytecap_x_impl(size) WritableBytes = "\n@" #size
-
-#define __mult_impl(mult, size) __cap_impl((mult) * (size))
-
-#define __count_c_impl(size) ValidElementsConst = size
-#define __count_x_impl(size) ValidElements = "\n@" #size
-
-#define __bytecount_c_impl(size) ValidBytesConst = size
-#define __bytecount_x_impl(size) ValidBytes = "\n@" #size
-
-#define _At_impl_(target, annos) [SAL_at(p1 = #target)] _Group_(annos)
-#define _At_buffer_impl_(target, iter, bound, annos) \
-    [SAL_at_buffer(p1 = #target, p2 = #iter, p3 = #bound)] _Group_(annos)
-#define _When_impl_(expr, annos) [SAL_when(p1 = #expr)] _Group_(annos)
-
-#define _Group_impl_(annos) [SAL_begin] annos[SAL_end]
-#define _GrouP_impl_(annos) [SAL_BEGIN] annos[SAL_END]
-
-#define _Use_decl_anno_impl_ _SA_annotes0(SAL_useHeader)  // this is a special case!
-
-#define _Pre1_impl_(p1) [SA_Pre(p1)]
-#define _Pre2_impl_(p1, p2) [SA_Pre(p1, p2)]
-#define _Pre3_impl_(p1, p2, p3) [SA_Pre(p1, p2, p3)]
-
-#define _Post1_impl_(p1) [SA_Post(p1)]
-#define _Post2_impl_(p1, p2) [SA_Post(p1, p2)]
-#define _Post3_impl_(p1, p2, p3) [SA_Post(p1, p2, p3)]
-
-#define _Ret1_impl_(p1) [SA_Post(p1)]
-#define _Ret2_impl_(p1, p2) [SA_Post(p1, p2)]
-#define _Ret3_impl_(p1, p2, p3) [SA_Post(p1, p2, p3)]
-
-#define _Deref_pre1_impl_(p1) [SA_Pre(Deref = 1, p1)]
-#define _Deref_pre2_impl_(p1, p2) [SA_Pre(Deref = 1, p1, p2)]
-#define _Deref_pre3_impl_(p1, p2, p3) [SA_Pre(Deref = 1, p1, p2, p3)]
-
-#define _Deref_post1_impl_(p1) [SA_Post(Deref = 1, p1)]
-#define _Deref_post2_impl_(p1, p2) [SA_Post(Deref = 1, p1, p2)]
-#define _Deref_post3_impl_(p1, p2, p3) [SA_Post(Deref = 1, p1, p2, p3)]
-
-#define _Deref_ret1_impl_(p1) [SA_Post(Deref = 1, p1)]
-#define _Deref_ret2_impl_(p1, p2) [SA_Post(Deref = 1, p1, p2)]
-#define _Deref_ret3_impl_(p1, p2, p3) [SA_Post(Deref = 1, p1, p2, p3)]
-
-#define _Deref2_pre1_impl_(p1) [SA_Pre(Deref = 2, Notref = 1, p1)]
-#define _Deref2_post1_impl_(p1) [SA_Post(Deref = 2, Notref = 1, p1)]
-#define _Deref2_ret1_impl_(p1) [SA_Post(Deref = 2, Notref = 1, p1)]
-
-// Obsolete -- may be needed for transition to attributes.
-#define __inner_typefix(ctype) [SAL_typefix(p1 = _SA_SPECSTRIZE(ctype))]
-#define __inner_exceptthat [SAL_except]
-
-#elif _USE_DECLSPECS_FOR_SAL  // ][
-
-#define _Check_return_impl_ __post _SA_annotes0(SAL_checkReturn)
-
-#define _Success_impl_(expr) _SA_annotes1(SAL_success, expr)
-#define _On_failure_impl_(annos) \
-    _SA_annotes1(SAL_context, SAL_failed) _Group_(_Post_impl_ _Group_(_SAL_nop_impl_ annos))
-
-#define _Printf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "printf")
-#define _Scanf_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf")
-#define _Scanf_s_format_string_impl_ _SA_annotes1(SAL_IsFormatString, "scanf_s")
-
-#define _In_bound_impl_ _Pre_impl_ _Bound_impl_
-#define _Out_bound_impl_ _Post_impl_ _Bound_impl_
-#define _Ret_bound_impl_ _Post_impl_ _Bound_impl_
-#define _Deref_in_bound_impl_ _Deref_pre_impl_ _Bound_impl_
-#define _Deref_out_bound_impl_ _Deref_post_impl_ _Bound_impl_
-#define _Deref_ret_bound_impl_ _Deref_post_impl_ _Bound_impl_
-
-#define __null_impl _SA_annotes0(SAL_null)            // _SA_annotes1(SAL_null, __yes)
-#define __notnull_impl _SA_annotes0(SAL_notnull)      // _SA_annotes1(SAL_null, __no)
-#define __maybenull_impl _SA_annotes0(SAL_maybenull)  // _SA_annotes1(SAL_null, __maybe)
-
-#define __valid_impl _SA_annotes0(SAL_valid)            // _SA_annotes1(SAL_valid, __yes)
-#define __notvalid_impl _SA_annotes0(SAL_notvalid)      // _SA_annotes1(SAL_valid, __no)
-#define __maybevalid_impl _SA_annotes0(SAL_maybevalid)  // _SA_annotes1(SAL_valid, __maybe)
-
-#define __null_impl_notref _Notref_ _Null_impl_
-#define __maybenull_impl_notref _Notref_ _Maybenull_impl_
-#define __notnull_impl_notref _Notref_ _Notnull_impl_
-
-#define __zterm_impl _SA_annotes1(SAL_nullTerminated, __yes)
-#define __maybezterm_impl _SA_annotes1(SAL_nullTerminated, __maybe)
-#define __maybzterm_impl _SA_annotes1(SAL_nullTerminated, __maybe)
-#define __notzterm_impl _SA_annotes1(SAL_nullTerminated, __no)
-
-#define __readaccess_impl _SA_annotes1(SAL_access, 0x1)
-#define __writeaccess_impl _SA_annotes1(SAL_access, 0x2)
-#define __allaccess_impl _SA_annotes1(SAL_access, 0x3)
-
-#define __readaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x1)
-#define __writeaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x2)
-#define __allaccess_impl_notref _Notref_ _SA_annotes1(SAL_access, 0x3)
-
-#define __cap_impl(size) _SA_annotes1(SAL_writableTo, elementCount(size))
-#define __cap_c_impl(size) _SA_annotes1(SAL_writableTo, elementCount(size))
-#define __cap_c_one_notref_impl _Notref_ _SA_annotes1(SAL_writableTo, elementCount(1))
-#define __cap_for_impl(param) _SA_annotes1(SAL_writableTo, inexpressibleCount(sizeof(param)))
-#define __cap_x_impl(size) _SA_annotes1(SAL_writableTo, inexpressibleCount(#size))
-
-#define __bytecap_impl(size) _SA_annotes1(SAL_writableTo, byteCount(size))
-#define __bytecap_c_impl(size) _SA_annotes1(SAL_writableTo, byteCount(size))
-#define __bytecap_x_impl(size) _SA_annotes1(SAL_writableTo, inexpressibleCount(#size))
-
-#define __mult_impl(mult, size) _SA_annotes1(SAL_writableTo, (mult) * (size))
-
-#define __count_impl(size) _SA_annotes1(SAL_readableTo, elementCount(size))
-#define __count_c_impl(size) _SA_annotes1(SAL_readableTo, elementCount(size))
-#define __count_x_impl(size) _SA_annotes1(SAL_readableTo, inexpressibleCount(#size))
-
-#define __bytecount_impl(size) _SA_annotes1(SAL_readableTo, byteCount(size))
-#define __bytecount_c_impl(size) _SA_annotes1(SAL_readableTo, byteCount(size))
-#define __bytecount_x_impl(size) _SA_annotes1(SAL_readableTo, inexpressibleCount(#size))
-
-#define _At_impl_(target, annos) _SA_annotes0(SAL_at(target)) _Group_(annos)
-#define _At_buffer_impl_(target, iter, bound, annos) \
-    _SA_annotes3(SAL_at_buffer, target, iter, bound) _Group_(annos)
-#define _Group_impl_(annos) _SA_annotes0(SAL_begin) annos _SA_annotes0(SAL_end)
-#define _GrouP_impl_(annos) _SA_annotes0(SAL_BEGIN) annos _SA_annotes0(SAL_END)
-#define _When_impl_(expr, annos) _SA_annotes0(SAL_when(expr)) _Group_(annos)
-
-#define _Use_decl_anno_impl_ __declspec("SAL_useHeader()")  // this is a special case!
-
-#define _Pre1_impl_(p1) _Pre_impl_ p1
-#define _Pre2_impl_(p1, p2) _Pre_impl_ p1 _Pre_impl_ p2
-#define _Pre3_impl_(p1, p2, p3) _Pre_impl_ p1 _Pre_impl_ p2 _Pre_impl_ p3
-
-#define _Post1_impl_(p1) _Post_impl_ p1
-#define _Post2_impl_(p1, p2) _Post_impl_ p1 _Post_impl_ p2
-#define _Post3_impl_(p1, p2, p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3
-
-#define _Ret1_impl_(p1) _Post_impl_ p1
-#define _Ret2_impl_(p1, p2) _Post_impl_ p1 _Post_impl_ p2
-#define _Ret3_impl_(p1, p2, p3) _Post_impl_ p1 _Post_impl_ p2 _Post_impl_ p3
-
-#define _Deref_pre1_impl_(p1) _Deref_pre_impl_ p1
-#define _Deref_pre2_impl_(p1, p2) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2
-#define _Deref_pre3_impl_(p1, p2, p3) _Deref_pre_impl_ p1 _Deref_pre_impl_ p2 _Deref_pre_impl_ p3
-
-#define _Deref_post1_impl_(p1) _Deref_post_impl_ p1
-#define _Deref_post2_impl_(p1, p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2
-#define _Deref_post3_impl_(p1, p2, p3) \
-    _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3
-
-#define _Deref_ret1_impl_(p1) _Deref_post_impl_ p1
-#define _Deref_ret2_impl_(p1, p2) _Deref_post_impl_ p1 _Deref_post_impl_ p2
-#define _Deref_ret3_impl_(p1, p2, p3) _Deref_post_impl_ p1 _Deref_post_impl_ p2 _Deref_post_impl_ p3
-
-#define _Deref2_pre1_impl_(p1) _Deref_pre_impl_ _Notref_impl_ _Deref_impl_ p1
-#define _Deref2_post1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1
-#define _Deref2_ret1_impl_(p1) _Deref_post_impl_ _Notref_impl_ _Deref_impl_ p1
-
-#define __inner_typefix(ctype) _SA_annotes1(SAL_typefix, ctype)
-#define __inner_exceptthat _SA_annotes0(SAL_except)
-
-#elif defined(_MSC_EXTENSIONS) && !defined(MIDL_PASS) && !defined(__midl) && \
-    !defined(RC_INVOKED) && defined(_PFT_VER) && _MSC_VER >= 1400 /*IFSTRIP=IGN*/  // ][
-
-// minimum attribute expansion for foreground build
-
-#pragma push_macro("SA")
-#pragma push_macro("REPEATABLE")
-
-#ifdef __cplusplus  // [
-#define SA(id) id
-#define REPEATABLE [repeatable]
-#else  // !__cplusplus // ][
-#define SA(id) SA_##id
-#define REPEATABLE
-#endif              // !__cplusplus // ]
-
-REPEATABLE
-[source_annotation_attribute(SA(Parameter))] struct __P_impl {
-#ifdef __cplusplus  // [
-    __P_impl();
-#endif              // ]
-    int __d_;
-};
-typedef struct __P_impl __P_impl;
-
-REPEATABLE
-[source_annotation_attribute(SA(ReturnValue))] struct __R_impl {
-#ifdef __cplusplus  // [
-    __R_impl();
-#endif              // ]
-    int __d_;
-};
-typedef struct __R_impl __R_impl;
-
-[source_annotation_attribute(SA(Method))] struct __M_ {
-#ifdef __cplusplus  // [
-    __M_();
-#endif              // ]
-    int __d_;
-};
-typedef struct __M_ __M_;
-
-[source_annotation_attribute(SA(All))] struct __A_ {
-#ifdef __cplusplus  // [
-    __A_();
-#endif              // ]
-    int __d_;
-};
-typedef struct __A_ __A_;
-
-[source_annotation_attribute(SA(Field))] struct __F_ {
-#ifdef __cplusplus  // [
-    __F_();
-#endif              // ]
-    int __d_;
-};
-typedef struct __F_ __F_;
-
-#pragma pop_macro("REPEATABLE")
-#pragma pop_macro("SA")
-
-#define _SAL_nop_impl_
-
-#define _At_impl_(target, annos) [__A_(__d_ = 0)]
-#define _At_buffer_impl_(target, iter, bound, annos) [__A_(__d_ = 0)]
-#define _When_impl_(expr, annos) annos
-#define _Group_impl_(annos) annos
-#define _GrouP_impl_(annos) annos
-#define _Use_decl_anno_impl_ [__M_(__d_ = 0)]
-
-#define _Points_to_data_impl_ [__P_impl(__d_ = 0)]
-#define _Literal_impl_ [__P_impl(__d_ = 0)]
-#define _Notliteral_impl_ [__P_impl(__d_ = 0)]
-
-#define _Pre_valid_impl_ [__P_impl(__d_ = 0)]
-#define _Post_valid_impl_ [__P_impl(__d_ = 0)]
-#define _Ret_valid_impl_ [__R_impl(__d_ = 0)]
-
-#define _Check_return_impl_ [__R_impl(__d_ = 0)]
-#define _Must_inspect_impl_ [__R_impl(__d_ = 0)]
-
-#define _Success_impl_(expr) [__M_(__d_ = 0)]
-#define _On_failure_impl_(expr) [__M_(__d_ = 0)]
-#define _Always_impl_(expr) [__M_(__d_ = 0)]
-
-#define _Printf_format_string_impl_ [__P_impl(__d_ = 0)]
-#define _Scanf_format_string_impl_ [__P_impl(__d_ = 0)]
-#define _Scanf_s_format_string_impl_ [__P_impl(__d_ = 0)]
-
-#define _Raises_SEH_exception_impl_ [__M_(__d_ = 0)]
-#define _Maybe_raises_SEH_exception_impl_ [__M_(__d_ = 0)]
-
-#define _In_bound_impl_ [__P_impl(__d_ = 0)]
-#define _Out_bound_impl_ [__P_impl(__d_ = 0)]
-#define _Ret_bound_impl_ [__R_impl(__d_ = 0)]
-#define _Deref_in_bound_impl_ [__P_impl(__d_ = 0)]
-#define _Deref_out_bound_impl_ [__P_impl(__d_ = 0)]
-#define _Deref_ret_bound_impl_ [__R_impl(__d_ = 0)]
-
-#define _Range_impl_(min, max) [__P_impl(__d_ = 0)]
-#define _In_range_impl_(min, max) [__P_impl(__d_ = 0)]
-#define _Out_range_impl_(min, max) [__P_impl(__d_ = 0)]
-#define _Ret_range_impl_(min, max) [__R_impl(__d_ = 0)]
-#define _Deref_in_range_impl_(min, max) [__P_impl(__d_ = 0)]
-#define _Deref_out_range_impl_(min, max) [__P_impl(__d_ = 0)]
-#define _Deref_ret_range_impl_(min, max) [__R_impl(__d_ = 0)]
-
-#define _Field_range_impl_(min, max) [__F_(__d_ = 0)]
-
-#define _Pre_satisfies_impl_(cond) [__A_(__d_ = 0)]
-#define _Post_satisfies_impl_(cond) [__A_(__d_ = 0)]
-#define _Satisfies_impl_(cond) [__A_(__d_ = 0)]
-
-#define _Null_impl_ [__A_(__d_ = 0)]
-#define _Notnull_impl_ [__A_(__d_ = 0)]
-#define _Maybenull_impl_ [__A_(__d_ = 0)]
-
-#define _Valid_impl_ [__A_(__d_ = 0)]
-#define _Notvalid_impl_ [__A_(__d_ = 0)]
-#define _Maybevalid_impl_ [__A_(__d_ = 0)]
-
-#define _Readable_bytes_impl_(size) [__A_(__d_ = 0)]
-#define _Readable_elements_impl_(size) [__A_(__d_ = 0)]
-#define _Writable_bytes_impl_(size) [__A_(__d_ = 0)]
-#define _Writable_elements_impl_(size) [__A_(__d_ = 0)]
-
-#define _Null_terminated_impl_ [__A_(__d_ = 0)]
-#define _NullNull_terminated_impl_ [__A_(__d_ = 0)]
-
-#define _Pre_impl_ [__P_impl(__d_ = 0)]
-#define _Pre1_impl_(p1) [__P_impl(__d_ = 0)]
-#define _Pre2_impl_(p1, p2) [__P_impl(__d_ = 0)]
-#define _Pre3_impl_(p1, p2, p3) [__P_impl(__d_ = 0)]
-
-#define _Post_impl_ [__P_impl(__d_ = 0)]
-#define _Post1_impl_(p1) [__P_impl(__d_ = 0)]
-#define _Post2_impl_(p1, p2) [__P_impl(__d_ = 0)]
-#define _Post3_impl_(p1, p2, p3) [__P_impl(__d_ = 0)]
-
-#define _Ret1_impl_(p1) [__R_impl(__d_ = 0)]
-#define _Ret2_impl_(p1, p2) [__R_impl(__d_ = 0)]
-#define _Ret3_impl_(p1, p2, p3) [__R_impl(__d_ = 0)]
-
-#define _Deref_pre1_impl_(p1) [__P_impl(__d_ = 0)]
-#define _Deref_pre2_impl_(p1, p2) [__P_impl(__d_ = 0)]
-#define _Deref_pre3_impl_(p1, p2, p3) [__P_impl(__d_ = 0)]
-
-#define _Deref_post1_impl_(p1) [__P_impl(__d_ = 0)]
-#define _Deref_post2_impl_(p1, p2) [__P_impl(__d_ = 0)]
-#define _Deref_post3_impl_(p1, p2, p3) [__P_impl(__d_ = 0)]
-
-#define _Deref_ret1_impl_(p1) [__R_impl(__d_ = 0)]
-#define _Deref_ret2_impl_(p1, p2) [__R_impl(__d_ = 0)]
-#define _Deref_ret3_impl_(p1, p2, p3) [__R_impl(__d_ = 0)]
-
-#define _Deref2_pre1_impl_(p1)   //[__P_impl(__d_=0)]
-#define _Deref2_post1_impl_(p1)  //[__P_impl(__d_=0)]
-#define _Deref2_ret1_impl_(p1)   //[__P_impl(__d_=0)]
-
-#else  // ][
-
-#define _SAL_nop_impl_ X
-
-#define _At_impl_(target, annos)
-#define _When_impl_(expr, annos)
-#define _Group_impl_(annos)
-#define _GrouP_impl_(annos)
-#define _At_buffer_impl_(target, iter, bound, annos)
-#define _Use_decl_anno_impl_
-#define _Points_to_data_impl_
-#define _Literal_impl_
-#define _Notliteral_impl_
-#define _Notref_impl_
-
-#define _Pre_valid_impl_
-#define _Post_valid_impl_
-#define _Ret_valid_impl_
-
-#define _Check_return_impl_
-#define _Must_inspect_impl_
-
-#define _Success_impl_(expr)
-#define _On_failure_impl_(annos)
-#define _Always_impl_(annos)
-
-#define _Printf_format_string_impl_
-#define _Scanf_format_string_impl_
-#define _Scanf_s_format_string_impl_
-
-#define _In_bound_impl_
-#define _Out_bound_impl_
-#define _Ret_bound_impl_
-#define _Deref_in_bound_impl_
-#define _Deref_out_bound_impl_
-#define _Deref_ret_bound_impl_
-
-#define _Range_impl_(min, max)
-#define _In_range_impl_(min, max)
-#define _Out_range_impl_(min, max)
-#define _Ret_range_impl_(min, max)
-#define _Deref_in_range_impl_(min, max)
-#define _Deref_out_range_impl_(min, max)
-#define _Deref_ret_range_impl_(min, max)
-
-#define _Satisfies_impl_(expr)
-#define _Pre_satisfies_impl_(expr)
-#define _Post_satisfies_impl_(expr)
-
-#define _Null_impl_
-#define _Notnull_impl_
-#define _Maybenull_impl_
-
-#define _Valid_impl_
-#define _Notvalid_impl_
-#define _Maybevalid_impl_
-
-#define _Field_range_impl_(min, max)
-
-#define _Pre_impl_
-#define _Pre1_impl_(p1)
-#define _Pre2_impl_(p1, p2)
-#define _Pre3_impl_(p1, p2, p3)
-
-#define _Post_impl_
-#define _Post1_impl_(p1)
-#define _Post2_impl_(p1, p2)
-#define _Post3_impl_(p1, p2, p3)
-
-#define _Ret1_impl_(p1)
-#define _Ret2_impl_(p1, p2)
-#define _Ret3_impl_(p1, p2, p3)
-
-#define _Deref_pre1_impl_(p1)
-#define _Deref_pre2_impl_(p1, p2)
-#define _Deref_pre3_impl_(p1, p2, p3)
-
-#define _Deref_post1_impl_(p1)
-#define _Deref_post2_impl_(p1, p2)
-#define _Deref_post3_impl_(p1, p2, p3)
-
-#define _Deref_ret1_impl_(p1)
-#define _Deref_ret2_impl_(p1, p2)
-#define _Deref_ret3_impl_(p1, p2, p3)
-
-#define _Deref2_pre1_impl_(p1)
-#define _Deref2_post1_impl_(p1)
-#define _Deref2_ret1_impl_(p1)
-
-#define _Readable_bytes_impl_(size)
-#define _Readable_elements_impl_(size)
-#define _Writable_bytes_impl_(size)
-#define _Writable_elements_impl_(size)
-
-#define _Null_terminated_impl_
-#define _NullNull_terminated_impl_
-
-// Obsolete -- may be needed for transition to attributes.
-#define __inner_typefix(ctype)
-#define __inner_exceptthat
-
-#endif  // ]
-
-// This section contains the deprecated annotations
-
-/*
- -------------------------------------------------------------------------------
- Introduction
-
- sal.h provides a set of annotations to describe how a function uses its
- parameters - the assumptions it makes about them, and the guarantees it makes
- upon finishing.
-
- Annotations may be placed before either a function parameter's type or its return
- type, and describe the function's behavior regarding the parameter or return value.
- There are two classes of annotations: buffer annotations and advanced annotations.
- Buffer annotations describe how functions use their pointer parameters, and
- advanced annotations either describe complex/unusual buffer behavior, or provide
- additional information about a parameter that is not otherwise expressible.
-
- -------------------------------------------------------------------------------
- Buffer Annotations
-
- The most important annotations in sal.h provide a consistent way to annotate
- buffer parameters or return values for a function. Each of these annotations describes
- a single buffer (which could be a string, a fixed-length or variable-length array,
- or just a pointer) that the function interacts with: where it is, how large it is,
- how much is initialized, and what the function does with it.
-
- The appropriate macro for a given buffer can be constructed using the table below.
- Just pick the appropriate values from each category, and combine them together
- with a leading underscore. Some combinations of values do not make sense as buffer
- annotations. Only meaningful annotations can be added to your code; for a list of
- these, see the buffer annotation definitions section.
-
- Only a single buffer annotation should be used for each parameter.
-
- |------------|------------|---------|--------|----------|----------|---------------|
- |   Level    |   Usage    |  Size   | Output | NullTerm | Optional |  Parameters   |
- |------------|------------|---------|--------|----------|----------|---------------|
- | <>         | <>         | <>      | <>     | _z       | <>       | <>            |
- | _deref     | _in        | _ecount | _full  | _nz      | _opt     | (size)        |
- | _deref_opt | _out       | _bcount | _part  |          |          | (size,length) |
- |            | _inout     |         |        |          |          |               |
- |            |            |         |        |          |          |               |
- |------------|------------|---------|--------|----------|----------|---------------|
-
- Level: Describes the buffer pointer's level of indirection from the parameter or
-          return value 'p'.
-
- <>         : p is the buffer pointer.
- _deref     : *p is the buffer pointer. p must not be NULL.
- _deref_opt : *p may be the buffer pointer. p may be NULL, in which case the rest of
-                the annotation is ignored.
-
- Usage: Describes how the function uses the buffer.
-
- <>     : The buffer is not accessed. If used on the return value or with _deref, the
-            function will provide the buffer, and it will be uninitialized at exit.
-            Otherwise, the caller must provide the buffer. This should only be used
-            for alloc and free functions.
- _in    : The function will only read from the buffer. The caller must provide the
-            buffer and initialize it. Cannot be used with _deref.
- _out   : The function will only write to the buffer. If used on the return value or
-            with _deref, the function will provide the buffer and initialize it.
-            Otherwise, the caller must provide the buffer, and the function will
-            initialize it.
- _inout : The function may freely read from and write to the buffer. The caller must
-            provide the buffer and initialize it. If used with _deref, the buffer may
-            be reallocated by the function.
-
- Size: Describes the total size of the buffer. This may be less than the space actually
-         allocated for the buffer, in which case it describes the accessible amount.
-
- <>      : No buffer size is given. If the type specifies the buffer size (such as
-             with LPSTR and LPWSTR), that amount is used. Otherwise, the buffer is one
-             element long. Must be used with _in, _out, or _inout.
- _ecount : The buffer size is an explicit element count.
- _bcount : The buffer size is an explicit byte count.
-
- Output: Describes how much of the buffer will be initialized by the function. For
-           _inout buffers, this also describes how much is initialized at entry. Omit this
-           category for _in buffers; they must be fully initialized by the caller.
-
- <>    : The type specifies how much is initialized. For instance, a function initializing
-           an LPWSTR must NULL-terminate the string.
- _full : The function initializes the entire buffer.
- _part : The function initializes part of the buffer, and explicitly indicates how much.
-
- NullTerm: States if the present of a '\0' marks the end of valid elements in the buffer.
- _z    : A '\0' indicated the end of the buffer
- _nz     : The buffer may not be null terminated and a '\0' does not indicate the end of the
-          buffer.
- Optional: Describes if the buffer itself is optional.
-
- <>   : The pointer to the buffer must not be NULL.
- _opt : The pointer to the buffer might be NULL. It will be checked before being dereferenced.
-
- Parameters: Gives explicit counts for the size and length of the buffer.
-
- <>            : There is no explicit count. Use when neither _ecount nor _bcount is used.
- (size)        : Only the buffer's total size is given. Use with _ecount or _bcount but not _part.
- (size,length) : The buffer's total size and initialized length are given. Use with _ecount_part
-                   and _bcount_part.
-
- -------------------------------------------------------------------------------
- Buffer Annotation Examples
-
- LWSTDAPI_(BOOL) StrToIntExA(
-     __in LPCSTR pszString,
-     DWORD dwFlags,
-     __out int *piRet                     -- A pointer whose dereference will be filled in.
- );
-
- void MyPaintingFunction(
-     __in HWND hwndControl,               -- An initialized read-only parameter.
-     __in_opt HDC hdcOptional,            -- An initialized read-only parameter that might be NULL.
-     __inout IPropertyStore *ppsStore     -- An initialized parameter that may be freely used
-                                          --   and modified.
- );
-
- LWSTDAPI_(BOOL) PathCompactPathExA(
-     __out_ecount(cchMax) LPSTR pszOut,   -- A string buffer with cch elements that will
-                                          --   be NULL terminated on exit.
-     __in LPCSTR pszSrc,
-     UINT cchMax,
-     DWORD dwFlags
- );
-
- HRESULT SHLocalAllocBytes(
-     size_t cb,
-     __deref_bcount(cb) T **ppv           -- A pointer whose dereference will be set to an
-                                          --   uninitialized buffer with cb bytes.
- );
-
- __inout_bcount_full(cb) : A buffer with cb elements that is fully initialized at
-     entry and exit, and may be written to by this function.
-
- __out_ecount_part(count, *countOut) : A buffer with count elements that will be
-     partially initialized by this function. The function indicates how much it
-     initialized by setting *countOut.
-
- -------------------------------------------------------------------------------
- Advanced Annotations
-
- Advanced annotations describe behavior that is not expressible with the regular
- buffer macros. These may be used either to annotate buffer parameters that involve
- complex or conditional behavior, or to enrich existing annotations with additional
- information.
-
- __success(expr) f :
-     <expr> indicates whether function f succeeded or not. If <expr> is true at exit,
-     all the function's guarantees (as given by other annotations) must hold. If <expr>
-     is false at exit, the caller should not expect any of the function's guarantees
-     to hold. If not used, the function must always satisfy its guarantees. Added
-     automatically to functions that indicate success in standard ways, such as by
-     returning an HRESULT.
-
- __nullterminated p :
-     Pointer p is a buffer that may be read or written up to and including the first
-     NULL character or pointer. May be used on typedefs, which marks valid (properly
-     initialized) instances of that type as being NULL-terminated.
-
- __nullnullterminated p :
-     Pointer p is a buffer that may be read or written up to and including the first
-     sequence of two NULL characters or pointers. May be used on typedefs, which marks
-     valid instances of that type as being double-NULL terminated.
-
- __reserved v :
-     Value v must be 0/NULL, reserved for future use.
-
- __checkReturn v :
-     Return value v must not be ignored by callers of this function.
-
- __typefix(ctype) v :
-     Value v should be treated as an instance of ctype, rather than its declared type.
-
- __override f :
-     Specify C#-style 'override' behaviour for overriding virtual methods.
-
- __callback f :
-     Function f can be used as a function pointer.
-
- __format_string p :
-     Pointer p is a string that contains % markers in the style of printf.
-
- __blocksOn(resource) f :
-     Function f blocks on the resource 'resource'.
-
- FALLTHROUGH :
-     Annotates switch statement labels where fall-through is desired, to distinguish
-     from forgotten break statements.
-
- -------------------------------------------------------------------------------
- Advanced Annotation Examples
-
- __success(return != FALSE) LWSTDAPI_(BOOL)
- PathCanonicalizeA(__out_ecount(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) :
-    pszBuf is only guaranteed to be NULL-terminated when TRUE is returned.
-
- typedef __nullterminated WCHAR* LPWSTR : Initialized LPWSTRs are NULL-terminated strings.
-
- __out_ecount(cch) __typefix(LPWSTR) void *psz : psz is a buffer parameter which will be
-     a NULL-terminated WCHAR string at exit, and which initially contains cch WCHARs.
-
- -------------------------------------------------------------------------------
-*/
-
-#define __specstrings
-
-#ifdef __cplusplus  // [
-#ifndef __nothrow   // [
-#define __nothrow NOTHROW_DECL
-#endif  // ]
-extern "C" {
-#else              // ][
-#ifndef __nothrow  // [
-#define __nothrow
-#endif                           // ]
-#endif /* #ifdef __cplusplus */  // ]
-
-/*
- -------------------------------------------------------------------------------
- Helper Macro Definitions
-
- These express behavior common to many of the high-level annotations.
- DO NOT USE THESE IN YOUR CODE.
- -------------------------------------------------------------------------------
-*/
-
-/*
-    The helper annotations are only understood by the compiler version used by
-    various defect detection tools. When the regular compiler is running, they
-    are defined into nothing, and do not affect the compiled code.
-*/
-
-#if !defined(__midl) && defined(_PREFAST_)  // [
-
-/*
- In the primitive "SAL_*" annotations "SAL" stands for Standard
- Annotation Language.  These "SAL_*" annotations are the
- primitives the compiler understands and high-level MACROs
- will decompose into these primivates.
-*/
-
-#define _SA_SPECSTRIZE(x) #x
-
-/*
- __notnull p
- __maybenull p
-
- Annotates a pointer p. States that pointer p is never null or maybe null.
-*/
-
-#define __notnull _Notnull_impl_
-#define __maybenull _Maybenull_impl_
-
-/*
- __readonly l
- __notreadonly l
- __maybereadonly l
-
- Annotates a location l. States that location l is not modified after
- this point.  If the annotation is placed on the precondition state of
- a function, the restriction only applies until the postcondition state
- of the function.  __maybereadonly states that the annotated location
- may be modified, whereas __notreadonly states that a location must be
- modified.
-*/
-
-#define __readonly _Pre1_impl_(__readaccess_impl)
-#define __notreadonly _Pre1_impl_(__allaccess_impl)
-#define __maybereadonly _Pre1_impl_(__readaccess_impl)
-
-/*
- __valid v
- __notvalid v
- __maybevalid v
-
- Annotates any value v. States that the value satisfies all properties of
- valid values of its type. For example, for a string buffer, valid means
- that the buffer pointer is either NULL or points to a NULL-terminated string.
-*/
-
-#define __valid _Valid_impl_
-#define __notvalid _Notvalid_impl_
-#define __maybevalid _Maybevalid_impl_
-
-/*
- __readableTo(extent) p
-
- Annotates a buffer pointer p.  If the buffer can be read, extent describes
- how much of the buffer is readable. For a reader of the buffer, this is
- an explicit permission to read up to that amount, rather than a restriction to
- read only up to it.
-*/
-
-#define __readableTo(extent) _SA_annotes1(SAL_readableTo, extent)
-
-/*
-
- __elem_readableTo(size)
-
- Annotates a buffer pointer p as being readable to size elements.
-*/
-
-#define __elem_readableTo(size) _SA_annotes1(SAL_readableTo, elementCount(size))
-
-/*
- __byte_readableTo(size)
-
- Annotates a buffer pointer p as being readable to size bytes.
-*/
-#define __byte_readableTo(size) _SA_annotes1(SAL_readableTo, byteCount(size))
-
-/*
- __writableTo(extent) p
-
- Annotates a buffer pointer p. If the buffer can be modified, extent
- describes how much of the buffer is writable (usually the allocation
- size). For a writer of the buffer, this is an explicit permission to
- write up to that amount, rather than a restriction to write only up to it.
-*/
-#define __writableTo(size) _SA_annotes1(SAL_writableTo, size)
-
-/*
- __elem_writableTo(size)
-
- Annotates a buffer pointer p as being writable to size elements.
-*/
-#define __elem_writableTo(size) _SA_annotes1(SAL_writableTo, elementCount(size))
-
-/*
- __byte_writableTo(size)
-
- Annotates a buffer pointer p as being writable to size bytes.
-*/
-#define __byte_writableTo(size) _SA_annotes1(SAL_writableTo, byteCount(size))
-
-/*
- __deref p
-
- Annotates a pointer p. The next annotation applies one dereference down
- in the type. If readableTo(p, size) then the next annotation applies to
- all elements *(p+i) for which i satisfies the size. If p is a pointer
- to a struct, the next annotation applies to all fields of the struct.
-*/
-#define __deref _Deref_impl_
-
-/*
- __pre __next_annotation
-
- The next annotation applies in the precondition state
-*/
-#define __pre _Pre_impl_
-
-/*
- __post __next_annotation
-
- The next annotation applies in the postcondition state
-*/
-#define __post _Post_impl_
-
-/*
- __precond(<expr>)
-
- When <expr> is true, the next annotation applies in the precondition state
- (currently not enabled)
-*/
-#define __precond(expr) __pre
-
-/*
- __postcond(<expr>)
-
- When <expr> is true, the next annotation applies in the postcondition state
- (currently not enabled)
-*/
-#define __postcond(expr) __post
-
-/*
- __exceptthat
-
- Given a set of annotations Q containing __exceptthat maybeP, the effect of
- the except clause is to erase any P or notP annotations (explicit or
- implied) within Q at the same level of dereferencing that the except
- clause appears, and to replace it with maybeP.
-
-  Example 1: __valid __pre_except_maybenull on a pointer p means that the
-             pointer may be null, and is otherwise valid, thus overriding
-             the implicit notnull annotation implied by __valid on
-             pointers.
-
-  Example 2: __valid __deref __pre_except_maybenull on an int **p means
-             that p is not null (implied by valid), but the elements
-             pointed to by p could be null, and are otherwise valid.
-*/
-#define __exceptthat __inner_exceptthat
-
-/*
- _refparam
-
- Added to all out parameter macros to indicate that they are all reference
- parameters.
-*/
-#define __refparam _Notref_ __deref __notreadonly
-
-/*
- __inner_*
-
- Helper macros that directly correspond to certain high-level annotations.
-
-*/
-
-/*
- Macros to classify the entrypoints and indicate their category.
-
- Pre-defined control point categories include: RPC, LPC, DeviceDriver, UserToKernel, ISAPI, COM.
-
-*/
-#define __inner_control_entrypoint(category) _SA_annotes2(SAL_entrypoint, controlEntry, category)
-
-/*
- Pre-defined data entry point categories include: Registry, File, Network.
-*/
-#define __inner_data_entrypoint(category) _SA_annotes2(SAL_entrypoint, dataEntry, category)
-
-#define __inner_override _SA_annotes0(__override)
-#define __inner_callback _SA_annotes0(__callback)
-#define __inner_blocksOn(resource) _SA_annotes1(SAL_blocksOn, resource)
-
-#define __post_except_maybenull __post __inner_exceptthat _Maybenull_impl_
-#define __pre_except_maybenull __pre __inner_exceptthat _Maybenull_impl_
-
-#define __post_deref_except_maybenull __post __deref __inner_exceptthat _Maybenull_impl_
-#define __pre_deref_except_maybenull __pre __deref __inner_exceptthat _Maybenull_impl_
-
-#define __inexpressible_readableTo(size) _Readable_elements_impl_(_Inexpressible_(size))
-#define __inexpressible_writableTo(size) _Writable_elements_impl_(_Inexpressible_(size))
-
-#else  // ][
-#define __notnull
-#define __deref
-#define __maybenull
-#define __readonly
-#define __notreadonly
-#define __maybereadonly
-#define __valid
-#define __notvalid
-#define __maybevalid
-#define __readableTo(extent)
-#define __elem_readableTo(size)
-#define __byte_readableTo(size)
-#define __writableTo(size)
-#define __elem_writableTo(size)
-#define __byte_writableTo(size)
-#define __pre
-#define __post
-#define __precond(expr)
-#define __postcond(expr)
-#define __exceptthat
-#define __inner_override
-#define __inner_callback
-#define __inner_blocksOn(resource)
-#define __refparam
-#define __inner_control_entrypoint(category)
-#define __inner_data_entrypoint(category)
-
-#define __post_except_maybenull
-#define __pre_except_maybenull
-#define __post_deref_except_maybenull
-#define __pre_deref_except_maybenull
-
-#define __inexpressible_readableTo(size)
-#define __inexpressible_writableTo(size)
-
-#endif /* #if !defined(__midl) && defined(_PREFAST_) */  // ]
-
-/*
--------------------------------------------------------------------------------
-Buffer Annotation Definitions
-
-Any of these may be used to directly annotate functions, but only one should
-be used for each parameter. To determine which annotation to use for a given
-buffer, use the table in the buffer annotations section.
--------------------------------------------------------------------------------
-*/
-
-#define __ecount(size) _SAL1_Source_(__ecount, (size), __notnull __elem_writableTo(size))
-#define __bcount(size) _SAL1_Source_(__bcount, (size), __notnull __byte_writableTo(size))
-#define __in_ecount(size) _SAL1_Source_(__in_ecount, (size), _In_reads_(size))
-#define __in_bcount(size) _SAL1_Source_(__in_bcount, (size), _In_reads_bytes_(size))
-#define __in_z _SAL1_Source_(__in_z, (), _In_z_)
-#define __in_ecount_z(size) _SAL1_Source_(__in_ecount_z, (size), _In_reads_z_(size))
-#define __in_bcount_z(size) \
-    _SAL1_Source_(__in_bcount_z, (size), __in_bcount(size) __pre __nullterminated)
-#define __in_nz _SAL1_Source_(__in_nz, (), __in)
-#define __in_ecount_nz(size) _SAL1_Source_(__in_ecount_nz, (size), __in_ecount(size))
-#define __in_bcount_nz(size) _SAL1_Source_(__in_bcount_nz, (size), __in_bcount(size))
-#define __out_ecount(size) _SAL1_Source_(__out_ecount, (size), _Out_writes_(size))
-#define __out_bcount(size) _SAL1_Source_(__out_bcount, (size), _Out_writes_bytes_(size))
-#define __out_ecount_part(size, length) \
-    _SAL1_Source_(__out_ecount_part, (size, length), _Out_writes_to_(size, length))
-#define __out_bcount_part(size, length) \
-    _SAL1_Source_(__out_bcount_part, (size, length), _Out_writes_bytes_to_(size, length))
-#define __out_ecount_full(size) _SAL1_Source_(__out_ecount_full, (size), _Out_writes_all_(size))
-#define __out_bcount_full(size) \
-    _SAL1_Source_(__out_bcount_full, (size), _Out_writes_bytes_all_(size))
-#define __out_z _SAL1_Source_(__out_z, (), __post __valid __refparam __post __nullterminated)
-#define __out_z_opt                \
-    _SAL1_Source_(__out_z_opt, (), \
-                  __post __valid __refparam __post __nullterminated __pre_except_maybenull)
-#define __out_ecount_z(size)              \
-    _SAL1_Source_(__out_ecount_z, (size), \
-                  __ecount(size) __post __valid __refparam __post __nullterminated)
-#define __out_bcount_z(size)              \
-    _SAL1_Source_(__out_bcount_z, (size), \
-                  __bcount(size) __post __valid __refparam __post __nullterminated)
-#define __out_ecount_part_z(size, length)              \
-    _SAL1_Source_(__out_ecount_part_z, (size, length), \
-                  __out_ecount_part(size, length) __post __nullterminated)
-#define __out_bcount_part_z(size, length)              \
-    _SAL1_Source_(__out_bcount_part_z, (size, length), \
-                  __out_bcount_part(size, length) __post __nullterminated)
-#define __out_ecount_full_z(size) \
-    _SAL1_Source_(__out_ecount_full_z, (size), __out_ecount_full(size) __post __nullterminated)
-#define __out_bcount_full_z(size) \
-    _SAL1_Source_(__out_bcount_full_z, (size), __out_bcount_full(size) __post __nullterminated)
-#define __out_nz _SAL1_Source_(__out_nz, (), __post __valid __refparam)
-#define __out_nz_opt \
-    _SAL1_Source_(__out_nz_opt, (), __post __valid __refparam __post_except_maybenull_)
-#define __out_ecount_nz(size) \
-    _SAL1_Source_(__out_ecount_nz, (size), __ecount(size) __post __valid __refparam)
-#define __out_bcount_nz(size) \
-    _SAL1_Source_(__out_bcount_nz, (size), __bcount(size) __post __valid __refparam)
-#define __inout _SAL1_Source_(__inout, (), _Inout_)
-#define __inout_ecount(size) _SAL1_Source_(__inout_ecount, (size), _Inout_updates_(size))
-#define __inout_bcount(size) _SAL1_Source_(__inout_bcount, (size), _Inout_updates_bytes_(size))
-#define __inout_ecount_part(size, length) \
-    _SAL1_Source_(__inout_ecount_part, (size, length), _Inout_updates_to_(size, length))
-#define __inout_bcount_part(size, length) \
-    _SAL1_Source_(__inout_bcount_part, (size, length), _Inout_updates_bytes_to_(size, length))
-#define __inout_ecount_full(size) \
-    _SAL1_Source_(__inout_ecount_full, (size), _Inout_updates_all_(size))
-#define __inout_bcount_full(size) \
-    _SAL1_Source_(__inout_bcount_full, (size), _Inout_updates_bytes_all_(size))
-#define __inout_z _SAL1_Source_(__inout_z, (), _Inout_z_)
-#define __inout_ecount_z(size) _SAL1_Source_(__inout_ecount_z, (size), _Inout_updates_z_(size))
-#define __inout_bcount_z(size)              \
-    _SAL1_Source_(__inout_bcount_z, (size), \
-                  __inout_bcount(size) __pre __nullterminated __post __nullterminated)
-#define __inout_nz _SAL1_Source_(__inout_nz, (), __inout)
-#define __inout_ecount_nz(size) _SAL1_Source_(__inout_ecount_nz, (size), __inout_ecount(size))
-#define __inout_bcount_nz(size) _SAL1_Source_(__inout_bcount_nz, (size), __inout_bcount(size))
-#define __ecount_opt(size) \
-    _SAL1_Source_(__ecount_opt, (size), __ecount(size) __pre_except_maybenull)
-#define __bcount_opt(size) \
-    _SAL1_Source_(__bcount_opt, (size), __bcount(size) __pre_except_maybenull)
-#define __in_opt _SAL1_Source_(__in_opt, (), _In_opt_)
-#define __in_ecount_opt(size) _SAL1_Source_(__in_ecount_opt, (size), _In_reads_opt_(size))
-#define __in_bcount_opt(size) _SAL1_Source_(__in_bcount_opt, (size), _In_reads_bytes_opt_(size))
-#define __in_z_opt _SAL1_Source_(__in_z_opt, (), _In_opt_z_)
-#define __in_ecount_z_opt(size) \
-    _SAL1_Source_(__in_ecount_z_opt, (size), __in_ecount_opt(size) __pre __nullterminated)
-#define __in_bcount_z_opt(size) \
-    _SAL1_Source_(__in_bcount_z_opt, (size), __in_bcount_opt(size) __pre __nullterminated)
-#define __in_nz_opt _SAL1_Source_(__in_nz_opt, (), __in_opt)
-#define __in_ecount_nz_opt(size) _SAL1_Source_(__in_ecount_nz_opt, (size), __in_ecount_opt(size))
-#define __in_bcount_nz_opt(size) _SAL1_Source_(__in_bcount_nz_opt, (size), __in_bcount_opt(size))
-#define __out_opt _SAL1_Source_(__out_opt, (), _Out_opt_)
-#define __out_ecount_opt(size) _SAL1_Source_(__out_ecount_opt, (size), _Out_writes_opt_(size))
-#define __out_bcount_opt(size) _SAL1_Source_(__out_bcount_opt, (size), _Out_writes_bytes_opt_(size))
-#define __out_ecount_part_opt(size, length)              \
-    _SAL1_Source_(__out_ecount_part_opt, (size, length), \
-                  __out_ecount_part(size, length) __pre_except_maybenull)
-#define __out_bcount_part_opt(size, length)              \
-    _SAL1_Source_(__out_bcount_part_opt, (size, length), \
-                  __out_bcount_part(size, length) __pre_except_maybenull)
-#define __out_ecount_full_opt(size) \
-    _SAL1_Source_(__out_ecount_full_opt, (size), __out_ecount_full(size) __pre_except_maybenull)
-#define __out_bcount_full_opt(size) \
-    _SAL1_Source_(__out_bcount_full_opt, (size), __out_bcount_full(size) __pre_except_maybenull)
-#define __out_ecount_z_opt(size) \
-    _SAL1_Source_(__out_ecount_z_opt, (size), __out_ecount_opt(size) __post __nullterminated)
-#define __out_bcount_z_opt(size) \
-    _SAL1_Source_(__out_bcount_z_opt, (size), __out_bcount_opt(size) __post __nullterminated)
-#define __out_ecount_part_z_opt(size, length)              \
-    _SAL1_Source_(__out_ecount_part_z_opt, (size, length), \
-                  __out_ecount_part_opt(size, length) __post __nullterminated)
-#define __out_bcount_part_z_opt(size, length)              \
-    _SAL1_Source_(__out_bcount_part_z_opt, (size, length), \
-                  __out_bcount_part_opt(size, length) __post __nullterminated)
-#define __out_ecount_full_z_opt(size)              \
-    _SAL1_Source_(__out_ecount_full_z_opt, (size), \
-                  __out_ecount_full_opt(size) __post __nullterminated)
-#define __out_bcount_full_z_opt(size)              \
-    _SAL1_Source_(__out_bcount_full_z_opt, (size), \
-                  __out_bcount_full_opt(size) __post __nullterminated)
-#define __out_ecount_nz_opt(size) \
-    _SAL1_Source_(__out_ecount_nz_opt, (size), __out_ecount_opt(size) __post __nullterminated)
-#define __out_bcount_nz_opt(size) \
-    _SAL1_Source_(__out_bcount_nz_opt, (size), __out_bcount_opt(size) __post __nullterminated)
-#define __inout_opt _SAL1_Source_(__inout_opt, (), _Inout_opt_)
-#define __inout_ecount_opt(size) \
-    _SAL1_Source_(__inout_ecount_opt, (size), __inout_ecount(size) __pre_except_maybenull)
-#define __inout_bcount_opt(size) \
-    _SAL1_Source_(__inout_bcount_opt, (size), __inout_bcount(size) __pre_except_maybenull)
-#define __inout_ecount_part_opt(size, length)              \
-    _SAL1_Source_(__inout_ecount_part_opt, (size, length), \
-                  __inout_ecount_part(size, length) __pre_except_maybenull)
-#define __inout_bcount_part_opt(size, length)              \
-    _SAL1_Source_(__inout_bcount_part_opt, (size, length), \
-                  __inout_bcount_part(size, length) __pre_except_maybenull)
-#define __inout_ecount_full_opt(size) \
-    _SAL1_Source_(__inout_ecount_full_opt, (size), __inout_ecount_full(size) __pre_except_maybenull)
-#define __inout_bcount_full_opt(size) \
-    _SAL1_Source_(__inout_bcount_full_opt, (size), __inout_bcount_full(size) __pre_except_maybenull)
-#define __inout_z_opt \
-    _SAL1_Source_(__inout_z_opt, (), __inout_opt __pre __nullterminated __post __nullterminated)
-#define __inout_ecount_z_opt(size)              \
-    _SAL1_Source_(__inout_ecount_z_opt, (size), \
-                  __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated)
-#define __inout_ecount_z_opt(size)              \
-    _SAL1_Source_(__inout_ecount_z_opt, (size), \
-                  __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated)
-#define __inout_bcount_z_opt(size) \
-    _SAL1_Source_(__inout_bcount_z_opt, (size), __inout_bcount_opt(size))
-#define __inout_nz_opt _SAL1_Source_(__inout_nz_opt, (), __inout_opt)
-#define __inout_ecount_nz_opt(size) \
-    _SAL1_Source_(__inout_ecount_nz_opt, (size), __inout_ecount_opt(size))
-#define __inout_bcount_nz_opt(size) \
-    _SAL1_Source_(__inout_bcount_nz_opt, (size), __inout_bcount_opt(size))
-#define __deref_ecount(size)                                      \
-    _SAL1_Source_(                                                \
-        __deref_ecount, (size),                                   \
-        _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) \
-            __post _Notref_ __deref _Notref_ __notnull __post __deref __elem_writableTo(size))
-#define __deref_bcount(size)                                      \
-    _SAL1_Source_(                                                \
-        __deref_bcount, (size),                                   \
-        _Notref_ __ecount(1) __post _Notref_ __elem_readableTo(1) \
-            __post _Notref_ __deref _Notref_ __notnull __post __deref __byte_writableTo(size))
-#define __deref_out _SAL1_Source_(__deref_out, (), _Outptr_)
-#define __deref_out_ecount(size) \
-    _SAL1_Source_(__deref_out_ecount, (size), _Outptr_result_buffer_(size))
-#define __deref_out_bcount(size) \
-    _SAL1_Source_(__deref_out_bcount, (size), _Outptr_result_bytebuffer_(size))
-#define __deref_out_ecount_part(size, length) \
-    _SAL1_Source_(__deref_out_ecount_part, (size, length), _Outptr_result_buffer_to_(size, length))
-#define __deref_out_bcount_part(size, length)              \
-    _SAL1_Source_(__deref_out_bcount_part, (size, length), \
-                  _Outptr_result_bytebuffer_to_(size, length))
-#define __deref_out_ecount_full(size) \
-    _SAL1_Source_(__deref_out_ecount_full, (size), __deref_out_ecount_part(size, size))
-#define __deref_out_bcount_full(size) \
-    _SAL1_Source_(__deref_out_bcount_full, (size), __deref_out_bcount_part(size, size))
-#define __deref_out_z _SAL1_Source_(__deref_out_z, (), _Outptr_result_z_)
-#define __deref_out_ecount_z(size)              \
-    _SAL1_Source_(__deref_out_ecount_z, (size), \
-                  __deref_out_ecount(size) __post __deref __nullterminated)
-#define __deref_out_bcount_z(size)              \
-    _SAL1_Source_(__deref_out_bcount_z, (size), \
-                  __deref_out_bcount(size) __post __deref __nullterminated)
-#define __deref_out_nz _SAL1_Source_(__deref_out_nz, (), __deref_out)
-#define __deref_out_ecount_nz(size) \
-    _SAL1_Source_(__deref_out_ecount_nz, (size), __deref_out_ecount(size))
-#define __deref_out_bcount_nz(size) \
-    _SAL1_Source_(__deref_out_bcount_nz, (size), __deref_out_ecount(size))
-#define __deref_inout                                              \
-    _SAL1_Source_(__deref_inout, (),                               \
-                  _Notref_ __notnull _Notref_ __elem_readableTo(1) \
-                      __pre __deref __valid __post _Notref_ __deref __valid __refparam)
-#define __deref_inout_z      \
-    _SAL1_Source_(           \
-        __deref_inout_z, (), \
-        __deref_inout __pre __deref __nullterminated __post _Notref_ __deref __nullterminated)
-#define __deref_inout_ecount(size)                                    \
-    _SAL1_Source_(__deref_inout_ecount, (size),                       \
-                  __deref_inout __pre __deref __elem_writableTo(size) \
-                      __post _Notref_ __deref __elem_writableTo(size))
-#define __deref_inout_bcount(size)                                    \
-    _SAL1_Source_(__deref_inout_bcount, (size),                       \
-                  __deref_inout __pre __deref __byte_writableTo(size) \
-                      __post _Notref_ __deref __byte_writableTo(size))
-#define __deref_inout_ecount_part(size, length)                                      \
-    _SAL1_Source_(__deref_inout_ecount_part, (size, length),                         \
-                  __deref_inout_ecount(size) __pre __deref __elem_readableTo(length) \
-                      __post __deref __elem_readableTo(length))
-#define __deref_inout_bcount_part(size, length)                                      \
-    _SAL1_Source_(__deref_inout_bcount_part, (size, length),                         \
-                  __deref_inout_bcount(size) __pre __deref __byte_readableTo(length) \
-                      __post __deref __byte_readableTo(length))
-#define __deref_inout_ecount_full(size) \
-    _SAL1_Source_(__deref_inout_ecount_full, (size), __deref_inout_ecount_part(size, size))
-#define __deref_inout_bcount_full(size) \
-    _SAL1_Source_(__deref_inout_bcount_full, (size), __deref_inout_bcount_part(size, size))
-#define __deref_inout_ecount_z(size)              \
-    _SAL1_Source_(__deref_inout_ecount_z, (size), \
-                  __deref_inout_ecount(size)      \
-                      __pre __deref __nullterminated __post __deref __nullterminated)
-#define __deref_inout_bcount_z(size)              \
-    _SAL1_Source_(__deref_inout_bcount_z, (size), \
-                  __deref_inout_bcount(size)      \
-                      __pre __deref __nullterminated __post __deref __nullterminated)
-#define __deref_inout_nz _SAL1_Source_(__deref_inout_nz, (), __deref_inout)
-#define __deref_inout_ecount_nz(size) \
-    _SAL1_Source_(__deref_inout_ecount_nz, (size), __deref_inout_ecount(size))
-#define __deref_inout_bcount_nz(size) \
-    _SAL1_Source_(__deref_inout_bcount_nz, (size), __deref_inout_ecount(size))
-#define __deref_ecount_opt(size) \
-    _SAL1_Source_(__deref_ecount_opt, (size), __deref_ecount(size) __post_deref_except_maybenull)
-#define __deref_bcount_opt(size) \
-    _SAL1_Source_(__deref_bcount_opt, (size), __deref_bcount(size) __post_deref_except_maybenull)
-#define __deref_out_opt \
-    _SAL1_Source_(__deref_out_opt, (), __deref_out __post_deref_except_maybenull)
-#define __deref_out_ecount_opt(size)              \
-    _SAL1_Source_(__deref_out_ecount_opt, (size), \
-                  __deref_out_ecount(size) __post_deref_except_maybenull)
-#define __deref_out_bcount_opt(size)              \
-    _SAL1_Source_(__deref_out_bcount_opt, (size), \
-                  __deref_out_bcount(size) __post_deref_except_maybenull)
-#define __deref_out_ecount_part_opt(size, length)              \
-    _SAL1_Source_(__deref_out_ecount_part_opt, (size, length), \
-                  __deref_out_ecount_part(size, length) __post_deref_except_maybenull)
-#define __deref_out_bcount_part_opt(size, length)              \
-    _SAL1_Source_(__deref_out_bcount_part_opt, (size, length), \
-                  __deref_out_bcount_part(size, length) __post_deref_except_maybenull)
-#define __deref_out_ecount_full_opt(size)              \
-    _SAL1_Source_(__deref_out_ecount_full_opt, (size), \
-                  __deref_out_ecount_full(size) __post_deref_except_maybenull)
-#define __deref_out_bcount_full_opt(size)              \
-    _SAL1_Source_(__deref_out_bcount_full_opt, (size), \
-                  __deref_out_bcount_full(size) __post_deref_except_maybenull)
-#define __deref_out_z_opt _SAL1_Source_(__deref_out_z_opt, (), _Outptr_result_maybenull_z_)
-#define __deref_out_ecount_z_opt(size)              \
-    _SAL1_Source_(__deref_out_ecount_z_opt, (size), \
-                  __deref_out_ecount_opt(size) __post __deref __nullterminated)
-#define __deref_out_bcount_z_opt(size)              \
-    _SAL1_Source_(__deref_out_bcount_z_opt, (size), \
-                  __deref_out_bcount_opt(size) __post __deref __nullterminated)
-#define __deref_out_nz_opt _SAL1_Source_(__deref_out_nz_opt, (), __deref_out_opt)
-#define __deref_out_ecount_nz_opt(size) \
-    _SAL1_Source_(__deref_out_ecount_nz_opt, (size), __deref_out_ecount_opt(size))
-#define __deref_out_bcount_nz_opt(size) \
-    _SAL1_Source_(__deref_out_bcount_nz_opt, (size), __deref_out_bcount_opt(size))
-#define __deref_inout_opt                \
-    _SAL1_Source_(__deref_inout_opt, (), \
-                  __deref_inout __pre_deref_except_maybenull __post_deref_except_maybenull)
-#define __deref_inout_ecount_opt(size)              \
-    _SAL1_Source_(__deref_inout_ecount_opt, (size), \
-                  __deref_inout_ecount(size)        \
-                      __pre_deref_except_maybenull __post_deref_except_maybenull)
-#define __deref_inout_bcount_opt(size)              \
-    _SAL1_Source_(__deref_inout_bcount_opt, (size), \
-                  __deref_inout_bcount(size)        \
-                      __pre_deref_except_maybenull __post_deref_except_maybenull)
-#define __deref_inout_ecount_part_opt(size, length)              \
-    _SAL1_Source_(__deref_inout_ecount_part_opt, (size, length), \
-                  __deref_inout_ecount_part(size, length)        \
-                      __pre_deref_except_maybenull __post_deref_except_maybenull)
-#define __deref_inout_bcount_part_opt(size, length)              \
-    _SAL1_Source_(__deref_inout_bcount_part_opt, (size, length), \
-                  __deref_inout_bcount_part(size, length)        \
-                      __pre_deref_except_maybenull __post_deref_except_maybenull)
-#define __deref_inout_ecount_full_opt(size)              \
-    _SAL1_Source_(__deref_inout_ecount_full_opt, (size), \
-                  __deref_inout_ecount_full(size)        \
-                      __pre_deref_except_maybenull __post_deref_except_maybenull)
-#define __deref_inout_bcount_full_opt(size)              \
-    _SAL1_Source_(__deref_inout_bcount_full_opt, (size), \
-                  __deref_inout_bcount_full(size)        \
-                      __pre_deref_except_maybenull __post_deref_except_maybenull)
-#define __deref_inout_z_opt      \
-    _SAL1_Source_(               \
-        __deref_inout_z_opt, (), \
-        __deref_inout_opt __pre __deref __nullterminated __post __deref __nullterminated)
-#define __deref_inout_ecount_z_opt(size)              \
-    _SAL1_Source_(__deref_inout_ecount_z_opt, (size), \
-                  __deref_inout_ecount_opt(size)      \
-                      __pre __deref __nullterminated __post __deref __nullterminated)
-#define __deref_inout_bcount_z_opt(size)              \
-    _SAL1_Source_(__deref_inout_bcount_z_opt, (size), \
-                  __deref_inout_bcount_opt(size)      \
-                      __pre __deref __nullterminated __post __deref __nullterminated)
-#define __deref_inout_nz_opt _SAL1_Source_(__deref_inout_nz_opt, (), __deref_inout_opt)
-#define __deref_inout_ecount_nz_opt(size) \
-    _SAL1_Source_(__deref_inout_ecount_nz_opt, (size), __deref_inout_ecount_opt(size))
-#define __deref_inout_bcount_nz_opt(size) \
-    _SAL1_Source_(__deref_inout_bcount_nz_opt, (size), __deref_inout_bcount_opt(size))
-#define __deref_opt_ecount(size) \
-    _SAL1_Source_(__deref_opt_ecount, (size), __deref_ecount(size) __pre_except_maybenull)
-#define __deref_opt_bcount(size) \
-    _SAL1_Source_(__deref_opt_bcount, (size), __deref_bcount(size) __pre_except_maybenull)
-#define __deref_opt_out _SAL1_Source_(__deref_opt_out, (), _Outptr_opt_)
-#define __deref_opt_out_z _SAL1_Source_(__deref_opt_out_z, (), _Outptr_opt_result_z_)
-#define __deref_opt_out_ecount(size) \
-    _SAL1_Source_(__deref_opt_out_ecount, (size), __deref_out_ecount(size) __pre_except_maybenull)
-#define __deref_opt_out_bcount(size) \
-    _SAL1_Source_(__deref_opt_out_bcount, (size), __deref_out_bcount(size) __pre_except_maybenull)
-#define __deref_opt_out_ecount_part(size, length)              \
-    _SAL1_Source_(__deref_opt_out_ecount_part, (size, length), \
-                  __deref_out_ecount_part(size, length) __pre_except_maybenull)
-#define __deref_opt_out_bcount_part(size, length)              \
-    _SAL1_Source_(__deref_opt_out_bcount_part, (size, length), \
-                  __deref_out_bcount_part(size, length) __pre_except_maybenull)
-#define __deref_opt_out_ecount_full(size)              \
-    _SAL1_Source_(__deref_opt_out_ecount_full, (size), \
-                  __deref_out_ecount_full(size) __pre_except_maybenull)
-#define __deref_opt_out_bcount_full(size)              \
-    _SAL1_Source_(__deref_opt_out_bcount_full, (size), \
-                  __deref_out_bcount_full(size) __pre_except_maybenull)
-#define __deref_opt_inout _SAL1_Source_(__deref_opt_inout, (), _Inout_opt_)
-#define __deref_opt_inout_ecount(size)              \
-    _SAL1_Source_(__deref_opt_inout_ecount, (size), \
-                  __deref_inout_ecount(size) __pre_except_maybenull)
-#define __deref_opt_inout_bcount(size)              \
-    _SAL1_Source_(__deref_opt_inout_bcount, (size), \
-                  __deref_inout_bcount(size) __pre_except_maybenull)
-#define __deref_opt_inout_ecount_part(size, length)              \
-    _SAL1_Source_(__deref_opt_inout_ecount_part, (size, length), \
-                  __deref_inout_ecount_part(size, length) __pre_except_maybenull)
-#define __deref_opt_inout_bcount_part(size, length)              \
-    _SAL1_Source_(__deref_opt_inout_bcount_part, (size, length), \
-                  __deref_inout_bcount_part(size, length) __pre_except_maybenull)
-#define __deref_opt_inout_ecount_full(size)              \
-    _SAL1_Source_(__deref_opt_inout_ecount_full, (size), \
-                  __deref_inout_ecount_full(size) __pre_except_maybenull)
-#define __deref_opt_inout_bcount_full(size)              \
-    _SAL1_Source_(__deref_opt_inout_bcount_full, (size), \
-                  __deref_inout_bcount_full(size) __pre_except_maybenull)
-#define __deref_opt_inout_z      \
-    _SAL1_Source_(               \
-        __deref_opt_inout_z, (), \
-        __deref_opt_inout __pre __deref __nullterminated __post __deref __nullterminated)
-#define __deref_opt_inout_ecount_z(size)              \
-    _SAL1_Source_(__deref_opt_inout_ecount_z, (size), \
-                  __deref_opt_inout_ecount(size)      \
-                      __pre __deref __nullterminated __post __deref __nullterminated)
-#define __deref_opt_inout_bcount_z(size)              \
-    _SAL1_Source_(__deref_opt_inout_bcount_z, (size), \
-                  __deref_opt_inout_bcount(size)      \
-                      __pre __deref __nullterminated __post __deref __nullterminated)
-#define __deref_opt_inout_nz _SAL1_Source_(__deref_opt_inout_nz, (), __deref_opt_inout)
-#define __deref_opt_inout_ecount_nz(size) \
-    _SAL1_Source_(__deref_opt_inout_ecount_nz, (size), __deref_opt_inout_ecount(size))
-#define __deref_opt_inout_bcount_nz(size) \
-    _SAL1_Source_(__deref_opt_inout_bcount_nz, (size), __deref_opt_inout_bcount(size))
-#define __deref_opt_ecount_opt(size) \
-    _SAL1_Source_(__deref_opt_ecount_opt, (size), __deref_ecount_opt(size) __pre_except_maybenull)
-#define __deref_opt_bcount_opt(size) \
-    _SAL1_Source_(__deref_opt_bcount_opt, (size), __deref_bcount_opt(size) __pre_except_maybenull)
-#define __deref_opt_out_opt _SAL1_Source_(__deref_opt_out_opt, (), _Outptr_opt_result_maybenull_)
-#define __deref_opt_out_ecount_opt(size)              \
-    _SAL1_Source_(__deref_opt_out_ecount_opt, (size), \
-                  __deref_out_ecount_opt(size) __pre_except_maybenull)
-#define __deref_opt_out_bcount_opt(size)              \
-    _SAL1_Source_(__deref_opt_out_bcount_opt, (size), \
-                  __deref_out_bcount_opt(size) __pre_except_maybenull)
-#define __deref_opt_out_ecount_part_opt(size, length)              \
-    _SAL1_Source_(__deref_opt_out_ecount_part_opt, (size, length), \
-                  __deref_out_ecount_part_opt(size, length) __pre_except_maybenull)
-#define __deref_opt_out_bcount_part_opt(size, length)              \
-    _SAL1_Source_(__deref_opt_out_bcount_part_opt, (size, length), \
-                  __deref_out_bcount_part_opt(size, length) __pre_except_maybenull)
-#define __deref_opt_out_ecount_full_opt(size)              \
-    _SAL1_Source_(__deref_opt_out_ecount_full_opt, (size), \
-                  __deref_out_ecount_full_opt(size) __pre_except_maybenull)
-#define __deref_opt_out_bcount_full_opt(size)              \
-    _SAL1_Source_(__deref_opt_out_bcount_full_opt, (size), \
-                  __deref_out_bcount_full_opt(size) __pre_except_maybenull)
-#define __deref_opt_out_z_opt                                                                 \
-    _SAL1_Source_(                                                                            \
-        __deref_opt_out_z_opt, (),                                                            \
-        __post __deref __valid __refparam __pre_except_maybenull __pre_deref_except_maybenull \
-            __post_deref_except_maybenull __post __deref __nullterminated)
-#define __deref_opt_out_ecount_z_opt(size)              \
-    _SAL1_Source_(__deref_opt_out_ecount_z_opt, (size), \
-                  __deref_opt_out_ecount_opt(size) __post __deref __nullterminated)
-#define __deref_opt_out_bcount_z_opt(size)              \
-    _SAL1_Source_(__deref_opt_out_bcount_z_opt, (size), \
-                  __deref_opt_out_bcount_opt(size) __post __deref __nullterminated)
-#define __deref_opt_out_nz_opt _SAL1_Source_(__deref_opt_out_nz_opt, (), __deref_opt_out_opt)
-#define __deref_opt_out_ecount_nz_opt(size) \
-    _SAL1_Source_(__deref_opt_out_ecount_nz_opt, (size), __deref_opt_out_ecount_opt(size))
-#define __deref_opt_out_bcount_nz_opt(size) \
-    _SAL1_Source_(__deref_opt_out_bcount_nz_opt, (size), __deref_opt_out_bcount_opt(size))
-#define __deref_opt_inout_opt \
-    _SAL1_Source_(__deref_opt_inout_opt, (), __deref_inout_opt __pre_except_maybenull)
-#define __deref_opt_inout_ecount_opt(size)              \
-    _SAL1_Source_(__deref_opt_inout_ecount_opt, (size), \
-                  __deref_inout_ecount_opt(size) __pre_except_maybenull)
-#define __deref_opt_inout_bcount_opt(size)              \
-    _SAL1_Source_(__deref_opt_inout_bcount_opt, (size), \
-                  __deref_inout_bcount_opt(size) __pre_except_maybenull)
-#define __deref_opt_inout_ecount_part_opt(size, length)              \
-    _SAL1_Source_(__deref_opt_inout_ecount_part_opt, (size, length), \
-                  __deref_inout_ecount_part_opt(size, length) __pre_except_maybenull)
-#define __deref_opt_inout_bcount_part_opt(size, length)              \
-    _SAL1_Source_(__deref_opt_inout_bcount_part_opt, (size, length), \
-                  __deref_inout_bcount_part_opt(size, length) __pre_except_maybenull)
-#define __deref_opt_inout_ecount_full_opt(size)              \
-    _SAL1_Source_(__deref_opt_inout_ecount_full_opt, (size), \
-                  __deref_inout_ecount_full_opt(size) __pre_except_maybenull)
-#define __deref_opt_inout_bcount_full_opt(size)              \
-    _SAL1_Source_(__deref_opt_inout_bcount_full_opt, (size), \
-                  __deref_inout_bcount_full_opt(size) __pre_except_maybenull)
-#define __deref_opt_inout_z_opt      \
-    _SAL1_Source_(                   \
-        __deref_opt_inout_z_opt, (), \
-        __deref_opt_inout_opt __pre __deref __nullterminated __post __deref __nullterminated)
-#define __deref_opt_inout_ecount_z_opt(size)              \
-    _SAL1_Source_(__deref_opt_inout_ecount_z_opt, (size), \
-                  __deref_opt_inout_ecount_opt(size)      \
-                      __pre __deref __nullterminated __post __deref __nullterminated)
-#define __deref_opt_inout_bcount_z_opt(size)              \
-    _SAL1_Source_(__deref_opt_inout_bcount_z_opt, (size), \
-                  __deref_opt_inout_bcount_opt(size)      \
-                      __pre __deref __nullterminated __post __deref __nullterminated)
-#define __deref_opt_inout_nz_opt _SAL1_Source_(__deref_opt_inout_nz_opt, (), __deref_opt_inout_opt)
-#define __deref_opt_inout_ecount_nz_opt(size) \
-    _SAL1_Source_(__deref_opt_inout_ecount_nz_opt, (size), __deref_opt_inout_ecount_opt(size))
-#define __deref_opt_inout_bcount_nz_opt(size) \
-    _SAL1_Source_(__deref_opt_inout_bcount_nz_opt, (size), __deref_opt_inout_bcount_opt(size))
-
-/*
--------------------------------------------------------------------------------
-Advanced Annotation Definitions
-
-Any of these may be used to directly annotate functions, and may be used in
-combination with each other or with regular buffer macros. For an explanation
-of each annotation, see the advanced annotations section.
--------------------------------------------------------------------------------
-*/
-
-#define __success(expr) _Success_(expr)
-#define __nullterminated _Null_terminated_
-#define __nullnullterminated
-#define __clr_reserved _SAL1_Source_(__reserved, (), _Reserved_)
-#define __checkReturn _SAL1_Source_(__checkReturn, (), _Check_return_)
-#define __typefix(ctype) _SAL1_Source_(__typefix, (ctype), __inner_typefix(ctype))
-#define __override __inner_override
-#define __callback __inner_callback
-#define __format_string _Printf_format_string_
-#define __blocksOn(resource) __inner_blocksOn(resource)
-#define __control_entrypoint(category) __inner_control_entrypoint(category)
-#define __data_entrypoint(category) __inner_data_entrypoint(category)
-#define __useHeader _Use_decl_anno_impl_
-#define __on_failure(annotes) _On_failure_impl_(annotes _SAL_nop_impl_)
-
-#ifndef __has_cpp_attribute
-#define __has_cpp_attribute(x) (0)
-#endif
-
-#ifndef __fallthrough  // [
-#if __has_cpp_attribute(fallthrough)
-#define __fallthrough [[fallthrough]]
-#else
-#define __fallthrough
-#endif
-#endif  // ]
-
-#ifndef __analysis_assume  // [
-#ifdef _PREFAST_           // [
-#define __analysis_assume(expr) __assume(expr)
-#else  // ][
-#define __analysis_assume(expr)
-#endif  // ]
-#endif  // ]
-
-#ifndef _Analysis_assume_  // [
-#ifdef _PREFAST_           // [
-#define _Analysis_assume_(expr) __assume(expr)
-#else  // ][
-#define _Analysis_assume_(expr)
-#endif  // ]
-#endif  // ]
-
-#define _Analysis_noreturn_ _SAL2_Source_(_Analysis_noreturn_, (), _SA_annotes0(SAL_terminates))
-
-#ifdef _PREFAST_  // [
-__inline __nothrow void __AnalysisAssumeNullterminated(_Post_ __nullterminated void *p);
-
-#define _Analysis_assume_nullterminated_(x) __AnalysisAssumeNullterminated(x)
-#else  // ][
-#define _Analysis_assume_nullterminated_(x)
-#endif  // ]
-
-//
-// Set the analysis mode (global flags to analysis).
-// They take effect at the point of declaration; use at global scope
-// as a declaration.
-//
-
-// Synthesize a unique symbol.
-#define ___MKID(x, y) x##y
-#define __MKID(x, y) ___MKID(x, y)
-#define __GENSYM(x) __MKID(x, __COUNTER__)
-
-__ANNOTATION(SAL_analysisMode(__AuToQuOtE __In_impl_ char *mode);)
-
-#define _Analysis_mode_impl_(mode) _SA_annotes1(SAL_analysisMode, #mode)
-
-#define _Analysis_mode_(mode) \
-    typedef _Analysis_mode_impl_(mode) int __GENSYM(__prefast_analysis_mode_flag);
-
-// The following are predefined:
-//  _Analysis_operator_new_throw_   (operator new throws)
-//  _Analysis_operator_new_null_        (operator new returns null)
-//  _Analysis_operator_new_never_fails_ (operator new never fails)
-//
-
-// Function class annotations.
-__ANNOTATION(SAL_functionClassNew(__In_impl_ char *);)
-__PRIMOP(int, _In_function_class_(__In_impl_ char *);)
-#define _In_function_class_(x) _In_function_class_(#x)
-
-#define _Function_class_(x) _SA_annotes1(SAL_functionClassNew, #x)
-
-/*
- * interlocked operand used in interlocked instructions
- */
-// #define _Interlocked_operand_ _Pre_ _SA_annotes0(SAL_interlocked)
-
-#define _Enum_is_bitflag_ _SA_annotes0(SAL_enumIsBitflag)
-#define _Strict_type_match_ _SA_annotes0(SAL_strictType2)
-
-#define _Maybe_raises_SEH_exception_ _Pre_ _SA_annotes1(SAL_inTry, __yes)
-#define _Raises_SEH_exception_ _Group_(_Maybe_raises_SEH_exception_ _Analysis_noreturn_)
-
-// Clean up macros that collide with libstdc++ internals
-#undef __valid
-#undef __notvalid
-#undef __maybevalid
-
-#ifdef __cplusplus  // [
-}
-#endif  // ]
\ No newline at end of file
diff --git a/targets/app/linux/Stubs/d3d11_stubs.h b/targets/app/linux/Stubs/d3d11_stubs.h
index 225255a4e..a2f38881d 100644
--- a/targets/app/linux/Stubs/d3d11_stubs.h
+++ b/targets/app/linux/Stubs/d3d11_stubs.h
@@ -3,11 +3,8 @@
 
 #pragma once
 
-#include "app/linux/Stubs/DirectXMath/DirectXMath.h"
 #include "winapi_stubs.h"
 
-using namespace DirectX;
-
 typedef struct _RECT {
     LONG left;
     LONG top;
diff --git a/targets/minecraft/client/Camera.cpp b/targets/minecraft/client/Camera.cpp
index ca9f7edb6..ded658065 100644
--- a/targets/minecraft/client/Camera.cpp
+++ b/targets/minecraft/client/Camera.cpp
@@ -1,13 +1,13 @@
 #include "Camera.h"
 
 #include <GL/gl.h>
+#include <glm/glm.hpp>
 #include <math.h>
 #include <string.h>
 
 #include <numbers>
 
 #include "MemoryTracker.h"
-#include "app/linux/Stubs/DirectXMath/DirectXMath.h"
 #include "app/include/stubs.h"
 #include "java/FloatBuffer.h"
 #include "minecraft/world/entity/LivingEntity.h"
@@ -54,18 +54,16 @@ zPlayerOffs = position->get(2);
     // this is just working out how to get a (0,0,0) point in clip space to pass
     // into the inverted combined model/view/projection matrix, so we just need
     // to get this matrix and get its translation as an equivalent.
-    DirectX::XMMATRIX _modelview, _proj, _final, _invert;
-    DirectX::XMVECTOR _det;
-    DirectX::XMFLOAT4 trans;
+    glm::mat4 _modelview, _proj, _final, _invert;
+    glm::vec4 trans;
 
     memcpy(&_modelview, modelview->_getDataPointer(), 64);
     memcpy(&_proj, projection->_getDataPointer(), 64);
 
-    _final = XMMatrixMultiply(_modelview, _proj);
-    _det = XMMatrixDeterminant(_final);
-    _invert = XMMatrixInverse(&_det, _final);
+    _final = _proj * _modelview;  // GLM is column-major; reverse multiply order
+    _invert = glm::inverse(_final);
 
-    XMStoreFloat4(&trans, _invert.r[3]);
+    trans = _invert[3];  // column 3 = translation column in column-major
 
     xPlayerOffs = trans.x / trans.w;
     yPlayerOffs = trans.y / trans.w;
diff --git a/targets/minecraft/meson.build b/targets/minecraft/meson.build
index 2f24ef871..4623b1e05 100644
--- a/targets/minecraft/meson.build
+++ b/targets/minecraft/meson.build
@@ -52,6 +52,7 @@ lib_minecraft = static_library('minecraft',
     input_dep,
     profile_dep,
     storage_dep,
+    glm_dep,
     nbt_dep,
     java_dep,
     assets_localisation_dep,
diff --git a/targets/platform/meson.build b/targets/platform/meson.build
index 4e216ab0a..49aed3276 100644
--- a/targets/platform/meson.build
+++ b/targets/platform/meson.build
@@ -7,7 +7,6 @@ platform_dep = declare_dependency(
 # SDL2-based platform implementations (formerly 4J.* modules)
 _sdl2 = dependency('sdl2')
 _threads = dependency('threads')
-_glm = dependency('glm')
 _defs = []
 
 if get_option('renderer') == 'gles'
@@ -28,7 +27,7 @@ sdl2_sources = files(
 lib_platform_sdl2 = static_library('platform_sdl2',
   sdl2_sources,
   include_directories: [platform_inc, include_directories('sdl2')],
-  dependencies: [_sdl2, _gl, _threads, _glm, stb_dep],
+  dependencies: [_sdl2, _gl, _threads, glm_dep, stb_dep],
   cpp_args: _defs + global_cpp_args + global_cpp_defs,
 )
 
@@ -37,7 +36,7 @@ lib_platform_sdl2 = static_library('platform_sdl2',
 render_dep = declare_dependency(
   link_with: lib_platform_sdl2,
   include_directories: [platform_inc, include_directories('sdl2')],
-  dependencies: [_sdl2, _gl, _threads, _glm],
+  dependencies: [_sdl2, _gl, _threads, glm_dep],
 )
 input_dep = render_dep
 profile_dep = render_dep