From 58176a503d2bda8e5d68f7adb731049d9919fb8d Mon Sep 17 00:00:00 2001 From: Dave Cramer Date: Sun, 13 Jul 2025 06:33:17 -0400 Subject: [PATCH] Enable the Microsoft Windows ARM64/MSVC platform Add support for the ARM64 architecture on Windows 11 using the MSVC compiler, addressing build issues and implementing proper memory synchronization semantics for this platform. * Implement spin_delay() with __isb(_ARM64_BARRIER_SY) intrinsic to emit the "ISB SY" instruction which matches the GCC/Clang approach to spinloop delay and empirical evidence that it out-scales the YIELD instruction in practice. * Unconditionally choose to use the MSVC supplied intrinsic for CRC32 on ARM64. * Implement the S_UNLOCK() macro using the InterlockedExchange() intrinsic. Author: Greg Burd Author: Dave Cramer Discussion: https://postgr.es/m/3c576ad7-d2da-4137-b791-5821da7cc370%40app.fastmail.com --- doc/src/sgml/installation.sgml | 2 +- meson.build | 9 +++-- src/include/storage/s_lock.h | 60 +++++++++++++++++++++++++++------- src/port/pg_crc32c_armv8.c | 6 ++++ src/tools/msvc_gendef.pl | 8 ++--- 5 files changed, 67 insertions(+), 18 deletions(-) diff --git a/doc/src/sgml/installation.sgml b/doc/src/sgml/installation.sgml index fe8d73e1f8c0..3f8d512a906f 100644 --- a/doc/src/sgml/installation.sgml +++ b/doc/src/sgml/installation.sgml @@ -3967,7 +3967,7 @@ configure ... LDFLAGS="-R /usr/sfw/lib:/opt/sfw/lib:/usr/local/lib" Special Considerations for 64-Bit Windows - PostgreSQL will only build for the x64 architecture on 64-bit Windows. + PostgreSQL will only build for the x64 and ARM64 architectures on 64-bit Windows. Mixing 32- and 64-bit versions in the same build tree is not supported. 
diff --git a/meson.build b/meson.build index 1256094fa571..6c463d14749d 100644 --- a/meson.build +++ b/meson.build @@ -2527,7 +2527,12 @@ int main(void) } ''' - if cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd without -march=armv8-a+crc', + # Check first for an MSVC/ARM64 combo because the test prog above won't + # compile (as it doesn't '#ifdef _MSC_VER #include <intrin.h>'), which + # is okay as we know for a fact that this platform combo supports the + # intrinsic for ARM64 CRC the test performs, so use that unconditionally. + if (host_cpu == 'aarch64' and cc.get_id() == 'msvc') or \ + cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd without -march=armv8-a+crc', args: test_c_args) # Use ARM CRC Extension unconditionally cdata.set('USE_ARMV8_CRC32C', 1) @@ -2546,7 +2551,7 @@ int main(void) cdata.set('USE_ARMV8_CRC32C', false) cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1) have_optimized_crc = true - endif +endif elif host_cpu == 'loongarch64' diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h index 7f8f566bd407..50262cca887a 100644 --- a/src/include/storage/s_lock.h +++ b/src/include/storage/s_lock.h @@ -594,7 +594,8 @@ tas(volatile slock_t *lock) #if !defined(HAS_TEST_AND_SET) /* We didn't trigger above, let's try here */ -#ifdef _MSC_VER +/* When compiling for Microsoft Windows using MSVC */ +#if defined(_MSC_VER) typedef LONG slock_t; #define HAS_TEST_AND_SET @@ -602,34 +603,71 @@ typedef LONG slock_t; #define SPIN_DELAY() spin_delay() -/* If using Visual C++ on Win64, inline assembly is unavailable. - * Use a _mm_pause intrinsic instead of rep nop. +/* + * _InterlockedExchange() generates a full memory barrier (with release + * semantics) that ensures all prior memory operations are visible to + * other cores before the lock is released. 
 */ +#define S_UNLOCK(lock) (InterlockedExchange(lock, 0)) + +#if defined(_WIN64) /* 64-bit Microsoft Windows */ + +#if defined(_M_ARM64) /* aarch64 */ +/* + * While there is support for a __yield() intrinsic for MSVC/ARM64[1], a + * wealth of real-world testing across databases and languages as well + * as a blog post by ARM[2] suggests that ISB is the most scalable and power + * friendly instruction to use for spinlock delay loops. So we use the only + * supported intrinsic/flag combination available for this platform combo[3]. + * This matches what we do above when compiling with either GCC or Clang. + * + * [1] https://learn.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics + * [2] https://developer.arm.com/community/arm-community-blogs/b/architectures-and-processors-blog/posts/multi-threaded-applications-arm + * [3] https://github.com/MicrosoftDocs/cpp-docs/blob/main/docs/intrinsics/arm64-intrinsics.md + */ +static __forceinline void +spin_delay(void) +{ + __isb(_ARM64_BARRIER_SY); +} + +#elif defined(_M_X64) /* x86-64 */ + +/* + * Use _mm_pause() intrinsic for x86-64. This emits the PAUSE instruction, + * which improves performance in spin-wait loops by preventing pipeline flush + * on Hyper-Threading systems. */ -#if defined(_WIN64) static __forceinline void spin_delay(void) { _mm_pause(); } -#else + +#endif /* defined(_M_ARM64|_M_X64) */ + +#else /* !defined(_WIN64) */ + +#ifdef _M_IX86 /* x86-specific */ + +/* Use no-op for MSVC 32bit x86 */ static __forceinline void spin_delay(void) { /* See comment for gcc code. 
Same code, MASM syntax */ __asm rep nop; } -#endif #include <intrin.h> #pragma intrinsic(_ReadWriteBarrier) -#define S_UNLOCK(lock) \ +#define S_UNLOCK(lock) \ do { _ReadWriteBarrier(); (*(lock)) = 0; } while (0) -#endif - - -#endif /* !defined(HAS_TEST_AND_SET) */ +#endif /* defined(_M_IX86) */ +#endif /* defined(_WIN64) */ +#endif /* defined(_MSC_VER) */ +#endif /* !defined(HAS_TEST_AND_SET) */ /* Blow up if we didn't have any way to do spinlocks */ diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c index 5ba070bb99d8..29a91dca62f8 100644 --- a/src/port/pg_crc32c_armv8.c +++ b/src/port/pg_crc32c_armv8.c @@ -14,7 +14,13 @@ */ #include "c.h" +#ifdef _MSC_VER + /* MSVC ARM64 intrinsics */ +#include <intrin.h> +#else + /* GCC/Clang: Use ACLE intrinsics from arm_acle.h */ #include <arm_acle.h> +#endif #include "port/pg_crc32c.h" diff --git a/src/tools/msvc_gendef.pl b/src/tools/msvc_gendef.pl index 868aad51b09c..c92c94c4775f 100644 --- a/src/tools/msvc_gendef.pl +++ b/src/tools/msvc_gendef.pl @@ -118,9 +118,9 @@ sub writedef { my $isdata = $def->{$f} eq 'data'; - # Strip the leading underscore for win32, but not x64 + # Strip the leading underscore for win32, but not x64 or aarch64 $f =~ s/^_// - unless ($arch eq "x86_64"); + unless ($arch eq "x86_64" || $arch eq "aarch64"); # Emit just the name if it's a function symbol, or emit the name # decorated with the DATA option for variables. @@ -141,7 +141,7 @@ sub writedef sub usage { die("Usage: msvc_gendef.pl --arch <arch> --deffile <deffile> --tempdir <tempdir> files-or-directories\n" - . " arch: x86 | x86_64\n" + . " arch: x86 | x86_64 | aarch64\n" . " deffile: path of the generated file\n" . " tempdir: directory for temporary files\n" . " files or directories: object files or directory containing object files\n" @@ -158,7 +158,7 @@ sub usage 'tempdir:s' => \$tempdir,) or usage(); usage("arch: $arch") - unless ($arch eq 'x86' || $arch eq 'x86_64'); + unless ($arch eq 'x86' || $arch eq 'x86_64' || $arch eq 'aarch64'); my @files;