deb (0ecefea8) · Commits · jan.koester / libnetplus

debian/changelog

+0 −3

Original line number	Diff line number	Diff line
		libnetplus (20260515+19) unstable; urgency=medium

		* RSA Montgomery multiply: add BMI2 hardware-accelerated path using
		MULX instruction via inline asm with runtime CPUID detection,
		falls back to generic __uint128_t CIOS on older CPUs
		* RSA modPow: skip multiply-accumulate when limb is zero
		* RSA modPow: fix bigInt one allocation to avoid reallocation

src/crypto/rsa.cpp

+0 −97

Original line number	Diff line number	Diff line
		@@ -39,99 +39,7 @@
		#define __restrict__ __restrict
		#endif // !__restrict__

		// --- BMI2 + ADX hardware-accelerated CIOS Montgomery multiplication ---
		// MULX: flag-free 64×64→128 multiply (better pipelining than MUL)
		// ADCX/ADOX: dual carry-chain additions (CF and OF independent)
		#if defined(__x86_64__) && (defined(__GNUC__) \|\| defined(__clang__))
		#include <cpuid.h>
		#include <immintrin.h>

		static bool detect_bmi2_adx() {
		unsigned int eax, ebx, ecx, edx;
		if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
		return ((ebx >> 8) & 1) && ((ebx >> 19) & 1); // BMI2 bit 8, ADX bit 19
		}
		return false;
		}

		static const bool s_has_bmi2_adx = detect_bmi2_adx();

		__attribute__((target("bmi2,adx")))
		static void cios_inner_hw(limb_t* __restrict__ T,
		const limb_t* __restrict__ ap, size_t au,
		const limb_t* __restrict__ bp, size_t bu,
		const limb_t* __restrict__ mp,
		size_t n, limb_t n_prime) {
		for (size_t i = 0; i < n; ++i) {
		// Step 1: T += a[i] * b using MULX + ADCX/ADOX dual carry chains
		const uint64_t ai = (i < au) ? ap[i] : 0ULL;
		uint64_t carry = 0;
		if (ai != 0) {
		for (size_t j = 0; j < n; ++j) {
		const uint64_t bj = (j < bu) ? bp[j] : 0ULL;
		uint64_t hi, lo;
		// MULX: rdx(implicit src) * bj -> hi:lo, no flags touched
		asm ("mulx %[bj], %[lo], %[hi]"
		: [lo] "=r" (lo), [hi] "=r" (hi)
		: [bj] "rm" (bj), "d" (ai)
		: );
		// Accumulate: T[j] += lo + carry, propagate into hi
		lo += carry;
		carry = (lo < carry) ? 1ULL : 0ULL;
		uint64_t tj = T[j];
		lo += tj;
		carry += (lo < tj) ? 1ULL : 0ULL;
		T[j] = lo;
		carry += hi;
		}
		}
		{
		uint64_t sum = T[n] + carry;
		uint64_t overflow = (sum < carry) ? 1ULL : 0ULL;
		T[n] = sum;
		T[n + 1] = overflow;
		}

		// Step 2: Montgomery reduction — T += m_i * mod, shift right
		const uint64_t mi = T[0] * n_prime;
		carry = 0;
		{
		uint64_t hi, lo;
		asm ("mulx %[mj], %[lo], %[hi]"
		: [lo] "=r" (lo), [hi] "=r" (hi)
		: [mj] "rm" ((uint64_t)mp[0]), "d" (mi)
		: );
		// T[0] + lo is zero mod 2^64, just carry out
		lo += T[0];
		carry = hi + ((lo < T[0]) ? 1ULL : 0ULL);
		}
		for (size_t j = 1; j < n; ++j) {
		uint64_t hi, lo;
		asm ("mulx %[mj], %[lo], %[hi]"
		: [lo] "=r" (lo), [hi] "=r" (hi)
		: [mj] "rm" ((uint64_t)mp[j]), "d" (mi)
		: );
		lo += carry;
		uint64_t c1 = (lo < carry) ? 1ULL : 0ULL;
		uint64_t tj = T[j];
		lo += tj;
		c1 += (lo < tj) ? 1ULL : 0ULL;
		T[j - 1] = lo;
		carry = hi + c1;
		}
		{
		uint64_t sum = T[n] + carry;
		uint64_t overflow = (sum < carry) ? 1ULL : 0ULL;
		T[n - 1] = sum;
		carry = overflow;
		}
		T[n] = T[n + 1] + (limb_t)carry;
		T[n + 1] = 0;
		}
		}

		#define HAS_CIOS_HW 1
		#endif // x86_64 GCC/Clang


		namespace netplus {
		@@ -1354,11 +1262,6 @@ namespace netplus {
		const size_t au = a.used;
		const size_t bu = b.used;

		#ifdef HAS_CIOS_HW
		if (s_has_bmi2_adx) {
		cios_inner_hw(T, ap, au, bp, bu, mp, n, n_prime);
		} else
		#endif
		{
		for (size_t i = 0; i < n; ++i) {
		// Step 1: T += a[i] * b