Commit ed7405b9 authored by jan.koester's avatar jan.koester
Browse files

urgent sha fix

parent 66862952
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
libnetplus (20260425+4) unstable; urgency=medium

  * SHA-256 SHA-NI: fix message schedule bugs — double sha256msg1
    expansion, wrong W+K round pairing, and incorrect final register.
    All 12 NIST FIPS 180-4 test vectors now pass.

 -- Jan Koester <jan.koester@tuxist.de>  Sat, 25 Apr 2026 19:00:00 +0200

libnetplus (20260425+3) unstable; urgency=medium

  * SHA-256: add SHA-NI hardware-accelerated path with runtime CPU
+41 −45
Original line number Diff line number Diff line
@@ -50,64 +50,60 @@ static std::vector<uint8_t> sha256_hash_shani(const std::vector<uint8_t>& input)
        __m128i MSG3 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(msg.data() + offset + 48)), MASK);

        __m128i MSGTMP;
        // Rounds 0-3
        MSGTMP = _mm_add_epi32(MSG0, _mm_loadu_si128((const __m128i*)&K256[0]));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP);
        MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP);

        #define SHA256_SHANI_ROUND(i, m0, m1, m2, m3) \
            m0 = _mm_sha256msg1_epu32(m0, m1); \
            m0 = _mm_add_epi32(m0, _mm_alignr_epi8(m3, m2, 4)); \
            m0 = _mm_sha256msg2_epu32(m0, m3); \
            MSGTMP = _mm_add_epi32(m0, _mm_loadu_si128((const __m128i*)&K256[i])); \
        // State update macro — 4 SHA-256 rounds using msg + K[ki].
        //   msg : __m128i with the message-schedule words consumed by these rounds
        //   ki  : index into K256 where the 128-bit round-constant load starts
        //         (four constants, assuming K256 holds 32-bit words — confirm)
        // Reads and writes MSGTMP, STATE0 and STATE1 from the enclosing scope.
        // Each _mm_sha256rnds2_epu32 performs two rounds and only consumes the
        // low 64 bits of its third operand, so the 0x0E shuffle moves the upper
        // two W+K sums into the low half before the second pair of rounds.
        // `msg` and `ki` are pasted textually; pass plain variables/constants.
        // The expansion already ends in ';', so invoking it with a trailing ';'
        // leaves a harmless empty statement.
        #define SHANI_ROUNDS(msg, ki) \
            MSGTMP = _mm_add_epi32(msg, _mm_loadu_si128((const __m128i*)&K256[ki])); \
            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP); \
            MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E); \
            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP);

        // Full round: 4 rounds of state update on `cur`, then finish the W
        // expansion held in `next` and begin the expansion held in `pa`.
        //   ki   = K256 index of the round constants used with `cur`
        //   cur  = current fully-computed W block, fed to SHANI_ROUNDS
        //   next = block to complete (must already have sha256msg1 applied)
        //   pa   = source of the low operand for alignr AND the register that
        //          receives the new sha256msg1 result (same register)
        // Steps after the state update:
        //   1. next += alignr(cur, pa, 4)  — upper three words of `pa` followed
        //      by the lowest word of `cur`
        //   2. next  = sha256msg2(next, cur)  — `next` is now a complete W block
        //   3. pa    = sha256msg1(pa, cur)    — partial result for a later round
        // Statement order matters: step 1 must read `pa` before step 3
        // overwrites it. All arguments are pasted textually and used more than
        // once, so pass plain register variables only.
        #define SHANI_FULL_ROUND(ki, cur, next, pa) \
            SHANI_ROUNDS(cur, ki); \
            next = _mm_add_epi32(next, _mm_alignr_epi8(cur, pa, 4)); \
            next = _mm_sha256msg2_epu32(next, cur); \
            pa = _mm_sha256msg1_epu32(pa, cur);

        // Rounds 0-3
        SHANI_ROUNDS(MSG0, 0);
        // Rounds 4-7
        MSGTMP = _mm_add_epi32(MSG1, _mm_loadu_si128((const __m128i*)&K256[4]));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP);
        MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP);
        SHANI_ROUNDS(MSG1, 4);
        MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

        // Rounds 8-11
        MSGTMP = _mm_add_epi32(MSG2, _mm_loadu_si128((const __m128i*)&K256[8]));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP);
        MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP);
        SHANI_ROUNDS(MSG2, 8);
        MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

        // Rounds 12-15
        MSGTMP = _mm_add_epi32(MSG3, _mm_loadu_si128((const __m128i*)&K256[12]));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP);
        MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP);
        SHANI_ROUNDS(MSG3, 12);
        MSG0 = _mm_add_epi32(MSG0, _mm_alignr_epi8(MSG3, MSG2, 4));
        MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
        MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

        // Rounds 16-59 (unrolled in groups of 4)
        SHA256_SHANI_ROUND(16, MSG1, MSG2, MSG3, MSG0);
        SHA256_SHANI_ROUND(20, MSG2, MSG3, MSG0, MSG1);
        SHA256_SHANI_ROUND(24, MSG3, MSG0, MSG1, MSG2);
        SHA256_SHANI_ROUND(28, MSG0, MSG1, MSG2, MSG3);
        SHA256_SHANI_ROUND(32, MSG1, MSG2, MSG3, MSG0);
        SHA256_SHANI_ROUND(36, MSG2, MSG3, MSG0, MSG1);
        SHA256_SHANI_ROUND(40, MSG3, MSG0, MSG1, MSG2);
        SHA256_SHANI_ROUND(44, MSG0, MSG1, MSG2, MSG3);
        SHA256_SHANI_ROUND(48, MSG1, MSG2, MSG3, MSG0);
        SHA256_SHANI_ROUND(52, MSG2, MSG3, MSG0, MSG1);
        SHA256_SHANI_ROUND(56, MSG3, MSG0, MSG1, MSG2);

        // Rounds 60-63
        MSGTMP = _mm_add_epi32(MSG0, _mm_loadu_si128((const __m128i*)&K256[60]));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP);
        MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP);

        #undef SHA256_SHANI_ROUND
        // Rounds 16-55: repeating pattern — use cur, complete next, start pa
        SHANI_FULL_ROUND(16, MSG0, MSG1, MSG3);
        SHANI_FULL_ROUND(20, MSG1, MSG2, MSG0);
        SHANI_FULL_ROUND(24, MSG2, MSG3, MSG1);
        SHANI_FULL_ROUND(28, MSG3, MSG0, MSG2);
        SHANI_FULL_ROUND(32, MSG0, MSG1, MSG3);
        SHANI_FULL_ROUND(36, MSG1, MSG2, MSG0);
        SHANI_FULL_ROUND(40, MSG2, MSG3, MSG1);
        SHANI_FULL_ROUND(44, MSG3, MSG0, MSG2);
        SHANI_FULL_ROUND(48, MSG0, MSG1, MSG3);
        SHANI_FULL_ROUND(52, MSG1, MSG2, MSG0);

        // Rounds 56-59: last expansion (complete MSG3, no new partial needed)
        SHANI_ROUNDS(MSG2, 56);
        MSG3 = _mm_add_epi32(MSG3, _mm_alignr_epi8(MSG2, MSG1, 4));
        MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);

        // Rounds 60-63: final rounds, no expansion
        SHANI_ROUNDS(MSG3, 60);

        #undef SHANI_ROUNDS
        #undef SHANI_FULL_ROUND

        STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
        STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);