ARM 32-bit support for Windows (will probably work on Linux/Android too) (#332)
jrouwe · Nov 12, 2022 · 69fcdfe · 69fcdfe
1 parent 84b751b
commit 69fcdfe
Show file tree

Hide file tree

Showing 20 changed files with 69 additions and 35 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -141,6 +141,24 @@ jobs:
     - name: Build
       run: msbuild Build\VS2022_CL_ARM\JoltPhysics.sln /property:Configuration=${{matrix.build_type}}
 
+  msvc_cl_arm_32_bit: # Configures and compiles the project for 32-bit ARM with Visual Studio; this job has no test step
+    runs-on: windows-latest
+    name: Visual Studio CL ARM 32-bit
+    strategy:
+        fail-fast: false # Let the remaining build_type keep running when one of them fails
+        matrix:
+            build_type: [Debug, Release]
+
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v3
+    - name: Add msbuild to PATH
+      uses: microsoft/setup-msbuild@v1.1
+    - name: Configure CMake # Generates a "Visual Studio 17 2022" solution for platform ARM from the Build folder into Build/VS2022_CL_ARM_32_BIT
+      run: cmake -B ${{github.workspace}}/Build/VS2022_CL_ARM_32_BIT -G "Visual Studio 17 2022" -A ARM Build
+    - name: Build # Compiles the generated solution in the configuration selected by the matrix
+      run: msbuild Build\VS2022_CL_ARM_32_BIT\JoltPhysics.sln /property:Configuration=${{matrix.build_type}}
+
   macos:
     runs-on: macos-latest
     name: macOS

diff --git a/Build/CMakeLists.txt b/Build/CMakeLists.txt
@@ -61,6 +61,11 @@ if (("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows" OR "${CMAKE_SYSTEM_NAME}" STREQUA
 	# Set compiler flag for disabling RTTI
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /GR-")
 
+	if ("${CMAKE_VS_PLATFORM_NAME}" STREQUAL "ARM")
+		# The 32-bit ARM platform does not get an exception handling flag by default, which causes compiler warnings, so add /EHsc explicitly
+		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
+	endif()
+
 	# Set compiler flags for various configurations
 	set(CMAKE_CXX_FLAGS_DEBUG "/GS /Od /Ob0 /RTC1")
 	set(CMAKE_CXX_FLAGS_RELEASE "/GS- /Gy /O2 /Oi /Ot")
@@ -69,8 +74,8 @@ if (("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows" OR "${CMAKE_SYSTEM_NAME}" STREQUA
 	set(CMAKE_CXX_FLAGS_RELEASEUBSAN "-fsanitize=undefined,implicit-conversion,float-divide-by-zero,local-bounds -fno-sanitize-recover=all")
 	set(CMAKE_CXX_FLAGS_RELEASECOVERAGE "-fprofile-instr-generate -fcoverage-mapping")
 
-	if (NOT ("${CMAKE_VS_PLATFORM_NAME}" STREQUAL "ARM64"))
-		# On ARM64, whole program optimization triggers an internal compiler error during code gen, so we don't turn it on
+	if (NOT ("${CMAKE_VS_PLATFORM_NAME}" STREQUAL "ARM64") AND NOT ("${CMAKE_VS_PLATFORM_NAME}" STREQUAL "ARM"))
+		# On ARM, whole program optimization triggers an internal compiler error during code gen, so we don't turn it on
 		set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GL")
 		set(CMAKE_CXX_FLAGS_DISTRIBUTION "${CMAKE_CXX_FLAGS_DISTRIBUTION} /GL")
 	endif()
@@ -276,7 +281,7 @@ if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
 	   endif()
 endif()
 
-if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows" AND NOT ("${CMAKE_VS_PLATFORM_NAME}" STREQUAL "ARM")) # ARM 32-bit is missing dinput8.lib
 	# Windows only targets
 	if (TARGET_SAMPLES OR TARGET_VIEWER)
 		include(${PHYSICS_REPO_ROOT}/TestFramework/TestFramework.cmake)

diff --git a/Build/cmake_vs2022_cl_arm_32bit.bat b/Build/cmake_vs2022_cl_arm_32bit.bat
@@ -0,0 +1,3 @@
+@echo off
+cmake -S . -B VS2022_CL_ARM_32BIT -G "Visual Studio 17 2022" -A ARM %*
+echo Open VS2022_CL_ARM_32BIT\JoltPhysics.sln to build the project.
diff --git a/Jolt/Core/Core.h b/Jolt/Core/Core.h
@@ -56,6 +56,7 @@
 		#define JPH_CPU_ADDRESS_BITS 32
 	#endif
 	#define JPH_USE_SSE
+	#define JPH_VECTOR_ALIGNMENT 16
 
 	// Detect enabled instruction sets
 	#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && !defined(JPH_USE_AVX512)
@@ -95,15 +96,22 @@
 			#error Undefined compiler
 		#endif
 	#endif
-#elif defined(__aarch64__) || defined(_M_ARM64)
-	// ARM64 CPU architecture
-	#define JPH_CPU_ARM64
-	#define JPH_USE_NEON
-	#define JPH_CPU_ADDRESS_BITS 64
+#elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+	// ARM CPU architecture
+	#define JPH_CPU_ARM
+	#if defined(__aarch64__) || defined(_M_ARM64)
+		#define JPH_CPU_ADDRESS_BITS 64
+		#define JPH_USE_NEON
+		#define JPH_VECTOR_ALIGNMENT 16
+	#else
+		#define JPH_CPU_ADDRESS_BITS 32
+		#define JPH_VECTOR_ALIGNMENT 8 // 32-bit ARM does not support aligning on the stack on 16 byte boundaries
+	#endif
 #elif defined(JPH_PLATFORM_WASM)
 	// WebAssembly CPU architecture
 	#define JPH_CPU_WASM
 	#define JPH_CPU_ADDRESS_BITS 32
+	#define JPH_VECTOR_ALIGNMENT 16
 	#define JPH_DISABLE_CUSTOM_ALLOCATOR
 #else
 	#error Unsupported CPU architecture
@@ -197,7 +205,7 @@
 #elif defined(JPH_PLATFORM_LINUX) || defined(JPH_PLATFORM_ANDROID) || defined(JPH_PLATFORM_MACOS) || defined(JPH_PLATFORM_IOS)
 	#if defined(JPH_CPU_X86)
 		#define JPH_BREAKPOINT		__asm volatile ("int $0x3")
-	#elif defined(JPH_CPU_ARM64)
+	#elif defined(JPH_CPU_ARM)
 		#define JPH_BREAKPOINT		__builtin_trap()
 	#endif
 #elif defined(JPH_PLATFORM_WASM)

diff --git a/Jolt/Core/FPControlWord.h b/Jolt/Core/FPControlWord.h
@@ -30,7 +30,7 @@ class FPControlWord : public NonCopyable
 	uint		mPrevState;	
 };
 
-#elif defined(JPH_USE_NEON) && defined(JPH_COMPILER_MSVC)
+#elif defined(JPH_CPU_ARM) && defined(JPH_COMPILER_MSVC)
 
 /// Helper class that needs to be put on the stack to update the state of the floating point control word.
 /// This state is kept per thread.
@@ -59,7 +59,7 @@ class FPControlWord : public NonCopyable
 	unsigned int mPrevState;
 };
 
-#elif defined(JPH_USE_NEON)
+#elif defined(JPH_CPU_ARM)
 
 /// Helper class that needs to be put on the stack to update the state of the floating point control word.
 /// This state is kept per thread.

diff --git a/Jolt/Core/FPException.h b/Jolt/Core/FPException.h
@@ -20,7 +20,7 @@ class FPExceptionDisableInvalid : public FPControlWord<_MM_MASK_INVALID, _MM_MAS
 /// Disable division by zero floating point exceptions
 class FPExceptionDisableDivByZero : public FPControlWord<_MM_MASK_DIV_ZERO, _MM_MASK_DIV_ZERO> { };
 
-#elif defined(JPH_USE_NEON) && defined(JPH_COMPILER_MSVC)
+#elif defined(JPH_CPU_ARM) && defined(JPH_COMPILER_MSVC)
 
 /// Enable floating point divide by zero exception and exceptions on invalid numbers
 class FPExceptionsEnable : public FPControlWord<0, _EM_INVALID | _EM_ZERODIVIDE> { };
@@ -31,7 +31,7 @@ class FPExceptionDisableInvalid : public FPControlWord<_EM_INVALID, _EM_INVALID>
 /// Disable division by zero floating point exceptions
 class FPExceptionDisableDivByZero : public FPControlWord<_EM_ZERODIVIDE, _EM_ZERODIVIDE> { };
 
-#elif defined(JPH_USE_NEON)
+#elif defined(JPH_CPU_ARM)
 
 /// Invalid operation exception bit
 static constexpr uint64 FP_IOE = 1 << 8;

diff --git a/Jolt/Core/FPFlushDenormals.h b/Jolt/Core/FPFlushDenormals.h
@@ -13,11 +13,11 @@ JPH_NAMESPACE_BEGIN
 /// This can make floating point operations much faster when working with very small numbers
 class FPFlushDenormals : public FPControlWord<_MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_MASK> { };
 
-#elif defined(JPH_USE_NEON) && defined(JPH_COMPILER_MSVC)
+#elif defined(JPH_CPU_ARM) && defined(JPH_COMPILER_MSVC)
 
 class FPFlushDenormals : public FPControlWord<_DN_FLUSH, _MCW_DN> { };
 
-#elif defined(JPH_USE_NEON)
+#elif defined(JPH_CPU_ARM)
 
 /// Flush denormals to zero bit
 static constexpr uint64 FP_FZ = 1 << 24;

diff --git a/Jolt/Core/TickCounter.cpp b/Jolt/Core/TickCounter.cpp
@@ -24,7 +24,7 @@
 
 JPH_NAMESPACE_BEGIN
 
-#if defined(JPH_PLATFORM_WINDOWS_UWP) || (defined(JPH_PLATFORM_WINDOWS) && defined(JPH_CPU_ARM64))
+#if defined(JPH_PLATFORM_WINDOWS_UWP) || (defined(JPH_PLATFORM_WINDOWS) && defined(JPH_CPU_ARM))
 
 uint64 GetProcessorTickCount()
 {
@@ -33,10 +33,10 @@ uint64 GetProcessorTickCount()
 	return uint64(count.QuadPart);
 }
 
-#endif // JPH_PLATFORM_WINDOWS_UWP || (JPH_PLATFORM_WINDOWS && JPH_CPU_ARM64)
+#endif // JPH_PLATFORM_WINDOWS_UWP || (JPH_PLATFORM_WINDOWS && JPH_CPU_ARM)
 
 static const uint64 sProcessorTicksPerSecond = []() {
-#if defined(JPH_PLATFORM_WINDOWS_UWP) || (defined(JPH_PLATFORM_WINDOWS) && defined(JPH_CPU_ARM64))
+#if defined(JPH_PLATFORM_WINDOWS_UWP) || (defined(JPH_PLATFORM_WINDOWS) && defined(JPH_CPU_ARM))
 	LARGE_INTEGER frequency { };
 	QueryPerformanceFrequency(&frequency);
 	return uint64(frequency.QuadPart);
@@ -71,7 +71,7 @@ static const uint64 sProcessorTicksPerSecond = []() {
 
 		#if defined(JPH_CPU_X86)
 			const char *cpu_str = "cpu MHz";
-		#elif defined(JPH_CPU_ARM64)
+		#elif defined(JPH_CPU_ARM)
 			const char *cpu_str = "BogoMIPS";
 		#else
 			#error Unsupported CPU architecture

diff --git a/Jolt/Core/TickCounter.h b/Jolt/Core/TickCounter.h
@@ -12,7 +12,7 @@
 
 JPH_NAMESPACE_BEGIN
 
-#if defined(JPH_PLATFORM_WINDOWS_UWP) || (defined(JPH_PLATFORM_WINDOWS) && defined(JPH_CPU_ARM64))
+#if defined(JPH_PLATFORM_WINDOWS_UWP) || (defined(JPH_PLATFORM_WINDOWS) && defined(JPH_CPU_ARM))
 
 /// Functionality to get the processor's cycle counter
 uint64 GetProcessorTickCount(); // Not inline to avoid having to include Windows.h
@@ -26,7 +26,7 @@ JPH_INLINE uint64 GetProcessorTickCount()
 	return JPH_PLATFORM_BLUE_GET_TICKS();
 #elif defined(JPH_CPU_X86)
 	return __rdtsc();
-#elif defined(JPH_CPU_ARM64)
+#elif defined(JPH_CPU_ARM)
 	uint64 val;
 	asm volatile("mrs %0, cntvct_el0" : "=r" (val));
 	return val;
@@ -37,7 +37,7 @@ JPH_INLINE uint64 GetProcessorTickCount()
 #endif
 }
 
-#endif // JPH_PLATFORM_WINDOWS_UWP || (JPH_PLATFORM_WINDOWS && JPH_CPU_ARM64)
+#endif // JPH_PLATFORM_WINDOWS_UWP || (JPH_PLATFORM_WINDOWS && JPH_CPU_ARM)
 
 /// Get the amount of ticks per second, note that this number will never be fully accurate as the amount of ticks per second may vary with CPU load, so this number is only to be used to give an indication of time for profiling purposes
 uint64 GetProcessorTicksPerSecond();

diff --git a/Jolt/Math/Mat44.h b/Jolt/Math/Mat44.h
@@ -8,7 +8,7 @@
 JPH_NAMESPACE_BEGIN
 
 /// Holds a 4x4 matrix of floats, but supports also operations on the 3x3 upper left part of the matrix.
-class [[nodiscard]] alignas(16) Mat44
+class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) Mat44
 {
 public:
 	JPH_OVERRIDE_NEW_DELETE

diff --git a/Jolt/Math/Math.h b/Jolt/Math/Math.h
@@ -107,7 +107,7 @@ inline uint CountTrailingZeros(uint32 inValue)
 			return 32;
 		return __builtin_ctz(inValue);
 	#endif
-#elif defined(JPH_CPU_ARM64)
+#elif defined(JPH_CPU_ARM)
 	#if defined(JPH_COMPILER_MSVC)
 		if (inValue == 0)
 			return 32;
@@ -139,7 +139,7 @@ inline uint CountLeadingZeros(uint32 inValue)
 			return 32;
 		return __builtin_clz(inValue);
 	#endif
-#elif defined(JPH_CPU_ARM64)
+#elif defined(JPH_CPU_ARM)
 	#if defined(JPH_COMPILER_MSVC)
 		return _CountLeadingZeros(inValue);
 	#else

diff --git a/Jolt/Math/Quat.h b/Jolt/Math/Quat.h
@@ -28,7 +28,7 @@ JPH_NAMESPACE_BEGIN
 /// it easy to extract the rotation axis of the quaternion:
 ///
 /// q = [cos(angle / 2), sin(angle / 2) * rotation_axis]
-class [[nodiscard]] alignas(16) Quat
+class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) Quat
 {
 public:
 	JPH_OVERRIDE_NEW_DELETE

diff --git a/Jolt/Math/UVec4.h b/Jolt/Math/UVec4.h
@@ -7,7 +7,7 @@
 
 JPH_NAMESPACE_BEGIN
 
-class [[nodiscard]] alignas(16) UVec4
+class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) UVec4
 {
 public:
 	JPH_OVERRIDE_NEW_DELETE

diff --git a/Jolt/Math/Vec3.h b/Jolt/Math/Vec3.h
@@ -11,7 +11,7 @@ JPH_NAMESPACE_BEGIN
 
 /// 3 component vector (stored as 4 vectors). 
 /// Note that we keep the 4th component the same as the 3rd component to avoid divisions by zero when JPH_FLOATING_POINT_EXCEPTIONS_ENABLED defined
-class [[nodiscard]] alignas(16) Vec3
+class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) Vec3
 {
 public:
 	JPH_OVERRIDE_NEW_DELETE

diff --git a/Jolt/Math/Vec4.h b/Jolt/Math/Vec4.h
@@ -9,7 +9,7 @@
 
 JPH_NAMESPACE_BEGIN
 
-class [[nodiscard]] alignas(16) Vec4
+class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) Vec4
 {
 public:
 	JPH_OVERRIDE_NEW_DELETE

diff --git a/Jolt/Physics/Body/Body.h b/Jolt/Physics/Body/Body.h
@@ -317,7 +317,7 @@ class Body : public NonCopyable
 };
 
 static_assert(sizeof(Body) == 128, "Body should be 128 bytes");
-static_assert(alignof(Body) == 16, "Body should align to 16 bytes");
+static_assert(alignof(Body) == JPH_VECTOR_ALIGNMENT, "Body should properly align");
 
 JPH_NAMESPACE_END
 

diff --git a/Jolt/Physics/Collision/BroadPhase/QuadTree.cpp b/Jolt/Physics/Collision/BroadPhase/QuadTree.cpp
@@ -503,7 +503,7 @@ QuadTree::NodeID QuadTree::BuildTree(const BodyVector &inBodies, TrackingVector
 
 	// Calculate centers of all bodies that are to be inserted
 	Vec3 *centers = new Vec3 [inNumber];
-	JPH_ASSERT(IsAligned(centers, 16));
+	JPH_ASSERT(IsAligned(centers, JPH_VECTOR_ALIGNMENT));
 	Vec3 *c = centers;
 	for (const NodeID *n = ioNodeIDs, *n_end = ioNodeIDs + inNumber; n < n_end; ++n, ++c)
 		*c = GetNodeOrBodyBounds(inBodies, *n).GetCenter();

diff --git a/Jolt/Physics/Collision/Shape/ConvexHullShape.h b/Jolt/Physics/Collision/Shape/ConvexHullShape.h
@@ -156,7 +156,7 @@ class ConvexHullShape final : public ConvexShape
 	};
 
 	static_assert(sizeof(Point) == 32, "Unexpected size");
-	static_assert(alignof(Point) == 16, "Unexpected alignment");
+	static_assert(alignof(Point) == JPH_VECTOR_ALIGNMENT, "Unexpected alignment");
 
 	Vec3					mCenterOfMass;				///< Center of mass of this convex hull
 	Mat44					mInertia;					///< Inertia matrix assuming density is 1 (needs to be multiplied by density)

diff --git a/Jolt/Physics/Collision/TransformedShape.h b/Jolt/Physics/Collision/TransformedShape.h
@@ -171,6 +171,6 @@ class TransformedShape
 };
 
 static_assert(sizeof(TransformedShape) == 64, "Not properly packed");
-static_assert(alignof(TransformedShape) == 16, "Not properly aligned");
+static_assert(alignof(TransformedShape) == JPH_VECTOR_ALIGNMENT, "Not properly aligned");
 
 JPH_NAMESPACE_END
diff --git a/README.md b/README.md
@@ -81,8 +81,8 @@ For more information see the [Architecture and API documentation](https://jrouwe
 
 ## Supported Platforms
 
-* Windows (VS2019, VS2022) x64/x86/ARM64 (Desktop/UWP)
-* Linux (tested on Ubuntu 20.04) x64/ARM64
+* Windows (VS2019, VS2022) x86/x64/ARM/ARM64 (Desktop/UWP)
+* Linux (tested on Ubuntu 22.04) x64/ARM64
 * Android (tested on Android 10) x64/ARM64
 * Platform Blue (a popular game console) x64
 * macOS (tested on Monterey) x64/ARM64