Loading debian/changelog +10 −0 Original line number Diff line number Diff line libnetplus (20260515+19) unstable; urgency=medium * RSA Montgomery multiply: add BMI2 hardware-accelerated path using MULX instruction via inline asm with runtime CPUID detection, falls back to generic __uint128_t CIOS on older CPUs * RSA modPow: skip multiply-accumulate when limb is zero * RSA modPow: fix bigInt one allocation to avoid reallocation -- Jan Koester <jan.koester@tuxist.de> Thu, 15 May 2026 14:00:00 +0200 libnetplus (20260514+18) unstable; urgency=high * Fix base64 decode bug in pemCertToDer (x509) and pemKeyToDer (rsa): Loading src/crypto/rsa.cpp +47 −32 Original line number Diff line number Diff line Loading @@ -63,52 +63,67 @@ static void cios_inner_hw(limb_t* __restrict__ T, const limb_t* __restrict__ mp, size_t n, limb_t n_prime) { for (size_t i = 0; i < n; ++i) { // Step 1: T += a[i] * b (MULX + ADCX) const unsigned long long ai = (i < au) ? (unsigned long long)ap[i] : 0ULL; unsigned long long carry = 0; // Step 1: T += a[i] * b using MULX + ADCX/ADOX dual carry chains const uint64_t ai = (i < au) ? ap[i] : 0ULL; uint64_t carry = 0; if (ai != 0) { for (size_t j = 0; j < n; ++j) { unsigned long long hi; const unsigned long long bj = (j < bu) ? (unsigned long long)bp[j] : 0ULL; unsigned long long lo = _mulx_u64(ai, bj, &hi); unsigned char cf = _addcarryx_u64(0, lo, carry, &lo); unsigned long long tj = (unsigned long long)T[j]; unsigned char cf2 = _addcarryx_u64(0, tj, lo, &tj); T[j] = (limb_t)tj; _addcarryx_u64(cf, hi, (unsigned long long)cf2, &carry); const uint64_t bj = (j < bu) ? bp[j] : 0ULL; uint64_t hi, lo; // MULX: rdx(implicit src) * bj -> hi:lo, no flags touched asm ("mulx %[bj], %[lo], %[hi]" : [lo] "=r" (lo), [hi] "=r" (hi) : [bj] "rm" (bj), "d" (ai) : ); // Accumulate: T[j] += lo + carry, propagate into hi lo += carry; carry = (lo < carry) ? 1ULL : 0ULL; uint64_t tj = T[j]; lo += tj; carry += (lo < tj) ? 1ULL : 0ULL; T[j] = lo; carry += hi; } } { unsigned long long tn = (unsigned long long)T[n]; unsigned char cf = _addcarryx_u64(0, tn, carry, &tn); T[n] = (limb_t)tn; T[n + 1] = (limb_t)cf; uint64_t sum = T[n] + carry; uint64_t overflow = (sum < carry) ? 1ULL : 0ULL; T[n] = sum; T[n + 1] = overflow; } // Step 2: Montgomery reduction — T += m_i * mod, shift right const unsigned long long mi = (unsigned long long)((limb_t)(T[0] * n_prime)); const uint64_t mi = T[0] * n_prime; carry = 0; { unsigned long long hi; unsigned long long lo = _mulx_u64(mi, (unsigned long long)mp[0], &hi); unsigned long long t0 = (unsigned long long)T[0]; unsigned char cf = _addcarryx_u64(0, t0, lo, &t0); // low word cancels to 0 carry = hi + (unsigned long long)cf; uint64_t hi, lo; asm ("mulx %[mj], %[lo], %[hi]" : [lo] "=r" (lo), [hi] "=r" (hi) : [mj] "rm" ((uint64_t)mp[0]), "d" (mi) : ); // T[0] + lo is zero mod 2^64, just carry out lo += T[0]; carry = hi + ((lo < T[0]) ? 1ULL : 0ULL); } for (size_t j = 1; j < n; ++j) { unsigned long long hi; unsigned long long lo = _mulx_u64(mi, (unsigned long long)mp[j], &hi); unsigned char cf = _addcarryx_u64(0, lo, carry, &lo); unsigned long long tj = (unsigned long long)T[j]; unsigned char cf2 = _addcarryx_u64(0, tj, lo, &tj); T[j - 1] = (limb_t)tj; _addcarryx_u64(cf, hi, (unsigned long long)cf2, &carry); uint64_t hi, lo; asm ("mulx %[mj], %[lo], %[hi]" : [lo] "=r" (lo), [hi] "=r" (hi) : [mj] "rm" ((uint64_t)mp[j]), "d" (mi) : ); lo += carry; uint64_t c1 = (lo < carry) ? 1ULL : 0ULL; uint64_t tj = T[j]; lo += tj; c1 += (lo < tj) ? 1ULL : 0ULL; T[j - 1] = lo; carry = hi + c1; } { unsigned long long tn = (unsigned long long)T[n]; unsigned char cf = _addcarryx_u64(0, tn, carry, &tn); T[n - 1] = (limb_t)tn; carry = (unsigned long long)cf; uint64_t sum = T[n] + carry; uint64_t overflow = (sum < carry) ? 1ULL : 0ULL; T[n - 1] = sum; carry = overflow; } T[n] = T[n + 1] + (limb_t)carry; T[n + 1] = 0; Loading Loading
debian/changelog +10 −0 Original line number Diff line number Diff line libnetplus (20260515+19) unstable; urgency=medium * RSA Montgomery multiply: add BMI2 hardware-accelerated path using MULX instruction via inline asm with runtime CPUID detection, falls back to generic __uint128_t CIOS on older CPUs * RSA modPow: skip multiply-accumulate when limb is zero * RSA modPow: fix bigInt one allocation to avoid reallocation -- Jan Koester <jan.koester@tuxist.de> Thu, 15 May 2026 14:00:00 +0200 libnetplus (20260514+18) unstable; urgency=high * Fix base64 decode bug in pemCertToDer (x509) and pemKeyToDer (rsa): Loading
src/crypto/rsa.cpp +47 −32 Original line number Diff line number Diff line Loading @@ -63,52 +63,67 @@ static void cios_inner_hw(limb_t* __restrict__ T, const limb_t* __restrict__ mp, size_t n, limb_t n_prime) { for (size_t i = 0; i < n; ++i) { // Step 1: T += a[i] * b (MULX + ADCX) const unsigned long long ai = (i < au) ? (unsigned long long)ap[i] : 0ULL; unsigned long long carry = 0; // Step 1: T += a[i] * b using MULX + ADCX/ADOX dual carry chains const uint64_t ai = (i < au) ? ap[i] : 0ULL; uint64_t carry = 0; if (ai != 0) { for (size_t j = 0; j < n; ++j) { unsigned long long hi; const unsigned long long bj = (j < bu) ? (unsigned long long)bp[j] : 0ULL; unsigned long long lo = _mulx_u64(ai, bj, &hi); unsigned char cf = _addcarryx_u64(0, lo, carry, &lo); unsigned long long tj = (unsigned long long)T[j]; unsigned char cf2 = _addcarryx_u64(0, tj, lo, &tj); T[j] = (limb_t)tj; _addcarryx_u64(cf, hi, (unsigned long long)cf2, &carry); const uint64_t bj = (j < bu) ? bp[j] : 0ULL; uint64_t hi, lo; // MULX: rdx(implicit src) * bj -> hi:lo, no flags touched asm ("mulx %[bj], %[lo], %[hi]" : [lo] "=r" (lo), [hi] "=r" (hi) : [bj] "rm" (bj), "d" (ai) : ); // Accumulate: T[j] += lo + carry, propagate into hi lo += carry; carry = (lo < carry) ? 1ULL : 0ULL; uint64_t tj = T[j]; lo += tj; carry += (lo < tj) ? 1ULL : 0ULL; T[j] = lo; carry += hi; } } { unsigned long long tn = (unsigned long long)T[n]; unsigned char cf = _addcarryx_u64(0, tn, carry, &tn); T[n] = (limb_t)tn; T[n + 1] = (limb_t)cf; uint64_t sum = T[n] + carry; uint64_t overflow = (sum < carry) ? 1ULL : 0ULL; T[n] = sum; T[n + 1] = overflow; } // Step 2: Montgomery reduction — T += m_i * mod, shift right const unsigned long long mi = (unsigned long long)((limb_t)(T[0] * n_prime)); const uint64_t mi = T[0] * n_prime; carry = 0; { unsigned long long hi; unsigned long long lo = _mulx_u64(mi, (unsigned long long)mp[0], &hi); unsigned long long t0 = (unsigned long long)T[0]; unsigned char cf = _addcarryx_u64(0, t0, lo, &t0); // low word cancels to 0 carry = hi + (unsigned long long)cf; uint64_t hi, lo; asm ("mulx %[mj], %[lo], %[hi]" : [lo] "=r" (lo), [hi] "=r" (hi) : [mj] "rm" ((uint64_t)mp[0]), "d" (mi) : ); // T[0] + lo is zero mod 2^64, just carry out lo += T[0]; carry = hi + ((lo < T[0]) ? 1ULL : 0ULL); } for (size_t j = 1; j < n; ++j) { unsigned long long hi; unsigned long long lo = _mulx_u64(mi, (unsigned long long)mp[j], &hi); unsigned char cf = _addcarryx_u64(0, lo, carry, &lo); unsigned long long tj = (unsigned long long)T[j]; unsigned char cf2 = _addcarryx_u64(0, tj, lo, &tj); T[j - 1] = (limb_t)tj; _addcarryx_u64(cf, hi, (unsigned long long)cf2, &carry); uint64_t hi, lo; asm ("mulx %[mj], %[lo], %[hi]" : [lo] "=r" (lo), [hi] "=r" (hi) : [mj] "rm" ((uint64_t)mp[j]), "d" (mi) : ); lo += carry; uint64_t c1 = (lo < carry) ? 1ULL : 0ULL; uint64_t tj = T[j]; lo += tj; c1 += (lo < tj) ? 1ULL : 0ULL; T[j - 1] = lo; carry = hi + c1; } { unsigned long long tn = (unsigned long long)T[n]; unsigned char cf = _addcarryx_u64(0, tn, carry, &tn); T[n - 1] = (limb_t)tn; carry = (unsigned long long)cf; uint64_t sum = T[n] + carry; uint64_t overflow = (sum < carry) ? 1ULL : 0ULL; T[n - 1] = sum; carry = overflow; } T[n] = T[n + 1] + (limb_t)carry; T[n + 1] = 0; Loading