Commit d281030e authored by jan.koester's avatar jan.koester
Browse files

deb

parent 45331735
Loading
Loading
Loading
Loading
+10 −0
Original line number Diff line number Diff line
libnetplus (20260515+19) unstable; urgency=medium

  * RSA Montgomery multiply: add a BMI2 hardware-accelerated path using
    the MULX instruction via inline asm with runtime CPUID detection;
    falls back to the generic __uint128_t CIOS on older CPUs
  * RSA modPow: skip multiply-accumulate when limb is zero
  * RSA modPow: fix allocation of the bigInt `one` to avoid reallocation

 -- Jan Koester <jan.koester@tuxist.de>  Fri, 15 May 2026 14:00:00 +0200

libnetplus (20260514+18) unstable; urgency=high

  * Fix base64 decode bug in pemCertToDer (x509) and pemKeyToDer (rsa):
+47 −32
Original line number Diff line number Diff line
@@ -63,52 +63,67 @@ static void cios_inner_hw(limb_t* __restrict__ T,
                           const limb_t* __restrict__ mp,
                           size_t n, limb_t n_prime) {
    for (size_t i = 0; i < n; ++i) {
        // Step 1: T += a[i] * b  (MULX + ADCX)
        const unsigned long long ai = (i < au) ? (unsigned long long)ap[i] : 0ULL;
        unsigned long long carry = 0;
        // Step 1: T += a[i] * b  using MULX + ADCX/ADOX dual carry chains
        const uint64_t ai = (i < au) ? ap[i] : 0ULL;
        uint64_t carry = 0;
        if (ai != 0) {
            for (size_t j = 0; j < n; ++j) {
                unsigned long long hi;
                const unsigned long long bj = (j < bu) ? (unsigned long long)bp[j] : 0ULL;
                unsigned long long lo = _mulx_u64(ai, bj, &hi);
                unsigned char cf = _addcarryx_u64(0, lo, carry, &lo);
                unsigned long long tj = (unsigned long long)T[j];
                unsigned char cf2 = _addcarryx_u64(0, tj, lo, &tj);
                T[j] = (limb_t)tj;
                _addcarryx_u64(cf, hi, (unsigned long long)cf2, &carry);
                const uint64_t bj = (j < bu) ? bp[j] : 0ULL;
                uint64_t hi, lo;
                // MULX: rdx(implicit src) * bj -> hi:lo, no flags touched
                asm ("mulx %[bj], %[lo], %[hi]"
                     : [lo] "=r" (lo), [hi] "=r" (hi)
                     : [bj] "rm" (bj), "d" (ai)
                     : );
                // Accumulate: T[j] += lo + carry, propagate into hi
                lo += carry;
                carry = (lo < carry) ? 1ULL : 0ULL;
                uint64_t tj = T[j];
                lo += tj;
                carry += (lo < tj) ? 1ULL : 0ULL;
                T[j] = lo;
                carry += hi;
            }
        }
        {
            unsigned long long tn = (unsigned long long)T[n];
            unsigned char cf = _addcarryx_u64(0, tn, carry, &tn);
            T[n] = (limb_t)tn;
            T[n + 1] = (limb_t)cf;
            uint64_t sum = T[n] + carry;
            uint64_t overflow = (sum < carry) ? 1ULL : 0ULL;
            T[n] = sum;
            T[n + 1] = overflow;
        }

        // Step 2: Montgomery reduction — T += m_i * mod, shift right
        const unsigned long long mi = (unsigned long long)((limb_t)(T[0] * n_prime));
        const uint64_t mi = T[0] * n_prime;
        carry = 0;
        {
            unsigned long long hi;
            unsigned long long lo = _mulx_u64(mi, (unsigned long long)mp[0], &hi);
            unsigned long long t0 = (unsigned long long)T[0];
            unsigned char cf = _addcarryx_u64(0, t0, lo, &t0); // low word cancels to 0
            carry = hi + (unsigned long long)cf;
            uint64_t hi, lo;
            asm ("mulx %[mj], %[lo], %[hi]"
                 : [lo] "=r" (lo), [hi] "=r" (hi)
                 : [mj] "rm" ((uint64_t)mp[0]), "d" (mi)
                 : );
            // T[0] + lo is zero mod 2^64, just carry out
            lo += T[0];
            carry = hi + ((lo < T[0]) ? 1ULL : 0ULL);
        }
        for (size_t j = 1; j < n; ++j) {
            unsigned long long hi;
            unsigned long long lo = _mulx_u64(mi, (unsigned long long)mp[j], &hi);
            unsigned char cf = _addcarryx_u64(0, lo, carry, &lo);
            unsigned long long tj = (unsigned long long)T[j];
            unsigned char cf2 = _addcarryx_u64(0, tj, lo, &tj);
            T[j - 1] = (limb_t)tj;
            _addcarryx_u64(cf, hi, (unsigned long long)cf2, &carry);
            uint64_t hi, lo;
            asm ("mulx %[mj], %[lo], %[hi]"
                 : [lo] "=r" (lo), [hi] "=r" (hi)
                 : [mj] "rm" ((uint64_t)mp[j]), "d" (mi)
                 : );
            lo += carry;
            uint64_t c1 = (lo < carry) ? 1ULL : 0ULL;
            uint64_t tj = T[j];
            lo += tj;
            c1 += (lo < tj) ? 1ULL : 0ULL;
            T[j - 1] = lo;
            carry = hi + c1;
        }
        {
            unsigned long long tn = (unsigned long long)T[n];
            unsigned char cf = _addcarryx_u64(0, tn, carry, &tn);
            T[n - 1] = (limb_t)tn;
            carry = (unsigned long long)cf;
            uint64_t sum = T[n] + carry;
            uint64_t overflow = (sum < carry) ? 1ULL : 0ULL;
            T[n - 1] = sum;
            carry = overflow;
        }
        T[n] = T[n + 1] + (limb_t)carry;
        T[n + 1] = 0;