Loading debian/changelog +8 −0 Original line number Diff line number Diff line libnetplus (20260425+4) unstable; urgency=medium * SHA-256 SHA-NI: fix message schedule bugs — double sha256msg1 expansion, wrong W+K round pairing, and incorrect final register. All 12 NIST FIPS 180-4 test vectors now pass. -- Jan Koester <jan.koester@tuxist.de> Sat, 25 Apr 2026 19:00:00 +0200 libnetplus (20260425+3) unstable; urgency=medium * SHA-256: add SHA-NI hardware-accelerated path with runtime CPU Loading src/crypto/sha.cpp +41 −45 Original line number Diff line number Diff line Loading @@ -50,64 +50,60 @@ static std::vector<uint8_t> sha256_hash_shani(const std::vector<uint8_t>& input) __m128i MSG3 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(msg.data() + offset + 48)), MASK); __m128i MSGTMP; // Rounds 0-3 MSGTMP = _mm_add_epi32(MSG0, _mm_loadu_si128((const __m128i*)&K256[0])); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP); MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E); STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP); #define SHA256_SHANI_ROUND(i, m0, m1, m2, m3) \ m0 = _mm_sha256msg1_epu32(m0, m1); \ m0 = _mm_add_epi32(m0, _mm_alignr_epi8(m3, m2, 4)); \ m0 = _mm_sha256msg2_epu32(m0, m3); \ MSGTMP = _mm_add_epi32(m0, _mm_loadu_si128((const __m128i*)&K256[i])); \ // State update macro — 4 SHA-256 rounds using msg + K[ki] #define SHANI_ROUNDS(msg, ki) \ MSGTMP = _mm_add_epi32(msg, _mm_loadu_si128((const __m128i*)&K256[ki])); \ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP); \ MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E); \ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP); // Full round: state update + complete `next` W expansion + start partial `pa` // `cur` = current fully-computed W, used for round keys // `next` = message to complete (already has sha256msg1 applied) // `pa` = prev for alignr AND target for sha256msg1 (same register) #define SHANI_FULL_ROUND(ki, cur, next, pa) \ SHANI_ROUNDS(cur, ki); \ next = _mm_add_epi32(next, _mm_alignr_epi8(cur, 
pa, 4)); \ next = _mm_sha256msg2_epu32(next, cur); \ pa = _mm_sha256msg1_epu32(pa, cur); // Rounds 0-3 SHANI_ROUNDS(MSG0, 0); // Rounds 4-7 MSGTMP = _mm_add_epi32(MSG1, _mm_loadu_si128((const __m128i*)&K256[4])); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP); MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E); STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP); SHANI_ROUNDS(MSG1, 4); MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); // Rounds 8-11 MSGTMP = _mm_add_epi32(MSG2, _mm_loadu_si128((const __m128i*)&K256[8])); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP); MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E); STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP); SHANI_ROUNDS(MSG2, 8); MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); // Rounds 12-15 MSGTMP = _mm_add_epi32(MSG3, _mm_loadu_si128((const __m128i*)&K256[12])); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP); MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E); STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP); SHANI_ROUNDS(MSG3, 12); MSG0 = _mm_add_epi32(MSG0, _mm_alignr_epi8(MSG3, MSG2, 4)); MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); // Rounds 16-59 (unrolled in groups of 4) SHA256_SHANI_ROUND(16, MSG1, MSG2, MSG3, MSG0); SHA256_SHANI_ROUND(20, MSG2, MSG3, MSG0, MSG1); SHA256_SHANI_ROUND(24, MSG3, MSG0, MSG1, MSG2); SHA256_SHANI_ROUND(28, MSG0, MSG1, MSG2, MSG3); SHA256_SHANI_ROUND(32, MSG1, MSG2, MSG3, MSG0); SHA256_SHANI_ROUND(36, MSG2, MSG3, MSG0, MSG1); SHA256_SHANI_ROUND(40, MSG3, MSG0, MSG1, MSG2); SHA256_SHANI_ROUND(44, MSG0, MSG1, MSG2, MSG3); SHA256_SHANI_ROUND(48, MSG1, MSG2, MSG3, MSG0); SHA256_SHANI_ROUND(52, MSG2, MSG3, MSG0, MSG1); SHA256_SHANI_ROUND(56, MSG3, MSG0, MSG1, MSG2); // Rounds 60-63 MSGTMP = _mm_add_epi32(MSG0, _mm_loadu_si128((const __m128i*)&K256[60])); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP); MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E); STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP); #undef SHA256_SHANI_ROUND // Rounds 
16-55: repeating pattern — use cur, complete next, start pa SHANI_FULL_ROUND(16, MSG0, MSG1, MSG3); SHANI_FULL_ROUND(20, MSG1, MSG2, MSG0); SHANI_FULL_ROUND(24, MSG2, MSG3, MSG1); SHANI_FULL_ROUND(28, MSG3, MSG0, MSG2); SHANI_FULL_ROUND(32, MSG0, MSG1, MSG3); SHANI_FULL_ROUND(36, MSG1, MSG2, MSG0); SHANI_FULL_ROUND(40, MSG2, MSG3, MSG1); SHANI_FULL_ROUND(44, MSG3, MSG0, MSG2); SHANI_FULL_ROUND(48, MSG0, MSG1, MSG3); SHANI_FULL_ROUND(52, MSG1, MSG2, MSG0); // Rounds 56-59: last expansion (complete MSG3, no new partial needed) SHANI_ROUNDS(MSG2, 56); MSG3 = _mm_add_epi32(MSG3, _mm_alignr_epi8(MSG2, MSG1, 4)); MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); // Rounds 60-63: final rounds, no expansion SHANI_ROUNDS(MSG3, 60); #undef SHANI_ROUNDS #undef SHANI_FULL_ROUND STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); Loading Loading
debian/changelog +8 −0
Original line number Diff line number Diff line
libnetplus (20260425+4) unstable; urgency=medium

  * SHA-256 SHA-NI: fix message schedule bugs — double sha256msg1 expansion,
    wrong W+K round pairing, and incorrect final register. All 12 NIST
    FIPS 180-4 test vectors now pass.

 -- Jan Koester <jan.koester@tuxist.de>  Sat, 25 Apr 2026 19:00:00 +0200

libnetplus (20260425+3) unstable; urgency=medium

  * SHA-256: add SHA-NI hardware-accelerated path with runtime CPU
Loading
src/crypto/sha.cpp +41 −45 Original line number Diff line number Diff line Loading @@ -50,64 +50,60 @@ static std::vector<uint8_t> sha256_hash_shani(const std::vector<uint8_t>& input) __m128i MSG3 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(msg.data() + offset + 48)), MASK); __m128i MSGTMP; // Rounds 0-3 MSGTMP = _mm_add_epi32(MSG0, _mm_loadu_si128((const __m128i*)&K256[0])); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP); MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E); STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP); #define SHA256_SHANI_ROUND(i, m0, m1, m2, m3) \ m0 = _mm_sha256msg1_epu32(m0, m1); \ m0 = _mm_add_epi32(m0, _mm_alignr_epi8(m3, m2, 4)); \ m0 = _mm_sha256msg2_epu32(m0, m3); \ MSGTMP = _mm_add_epi32(m0, _mm_loadu_si128((const __m128i*)&K256[i])); \ // State update macro — 4 SHA-256 rounds using msg + K[ki] #define SHANI_ROUNDS(msg, ki) \ MSGTMP = _mm_add_epi32(msg, _mm_loadu_si128((const __m128i*)&K256[ki])); \ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP); \ MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E); \ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP); // Full round: state update + complete `next` W expansion + start partial `pa` // `cur` = current fully-computed W, used for round keys // `next` = message to complete (already has sha256msg1 applied) // `pa` = prev for alignr AND target for sha256msg1 (same register) #define SHANI_FULL_ROUND(ki, cur, next, pa) \ SHANI_ROUNDS(cur, ki); \ next = _mm_add_epi32(next, _mm_alignr_epi8(cur, pa, 4)); \ next = _mm_sha256msg2_epu32(next, cur); \ pa = _mm_sha256msg1_epu32(pa, cur); // Rounds 0-3 SHANI_ROUNDS(MSG0, 0); // Rounds 4-7 MSGTMP = _mm_add_epi32(MSG1, _mm_loadu_si128((const __m128i*)&K256[4])); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP); MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E); STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP); SHANI_ROUNDS(MSG1, 4); MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); // Rounds 8-11 MSGTMP = _mm_add_epi32(MSG2, _mm_loadu_si128((const 
__m128i*)&K256[8])); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP); MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E); STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP); SHANI_ROUNDS(MSG2, 8); MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); // Rounds 12-15 MSGTMP = _mm_add_epi32(MSG3, _mm_loadu_si128((const __m128i*)&K256[12])); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP); MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E); STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP); SHANI_ROUNDS(MSG3, 12); MSG0 = _mm_add_epi32(MSG0, _mm_alignr_epi8(MSG3, MSG2, 4)); MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); // Rounds 16-59 (unrolled in groups of 4) SHA256_SHANI_ROUND(16, MSG1, MSG2, MSG3, MSG0); SHA256_SHANI_ROUND(20, MSG2, MSG3, MSG0, MSG1); SHA256_SHANI_ROUND(24, MSG3, MSG0, MSG1, MSG2); SHA256_SHANI_ROUND(28, MSG0, MSG1, MSG2, MSG3); SHA256_SHANI_ROUND(32, MSG1, MSG2, MSG3, MSG0); SHA256_SHANI_ROUND(36, MSG2, MSG3, MSG0, MSG1); SHA256_SHANI_ROUND(40, MSG3, MSG0, MSG1, MSG2); SHA256_SHANI_ROUND(44, MSG0, MSG1, MSG2, MSG3); SHA256_SHANI_ROUND(48, MSG1, MSG2, MSG3, MSG0); SHA256_SHANI_ROUND(52, MSG2, MSG3, MSG0, MSG1); SHA256_SHANI_ROUND(56, MSG3, MSG0, MSG1, MSG2); // Rounds 60-63 MSGTMP = _mm_add_epi32(MSG0, _mm_loadu_si128((const __m128i*)&K256[60])); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSGTMP); MSGTMP = _mm_shuffle_epi32(MSGTMP, 0x0E); STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSGTMP); #undef SHA256_SHANI_ROUND // Rounds 16-55: repeating pattern — use cur, complete next, start pa SHANI_FULL_ROUND(16, MSG0, MSG1, MSG3); SHANI_FULL_ROUND(20, MSG1, MSG2, MSG0); SHANI_FULL_ROUND(24, MSG2, MSG3, MSG1); SHANI_FULL_ROUND(28, MSG3, MSG0, MSG2); SHANI_FULL_ROUND(32, MSG0, MSG1, MSG3); SHANI_FULL_ROUND(36, MSG1, MSG2, MSG0); SHANI_FULL_ROUND(40, MSG2, MSG3, MSG1); SHANI_FULL_ROUND(44, MSG3, MSG0, MSG2); SHANI_FULL_ROUND(48, MSG0, MSG1, MSG3); SHANI_FULL_ROUND(52, MSG1, MSG2, MSG0); // Rounds 56-59: last expansion 
(complete MSG3, no new partial needed) SHANI_ROUNDS(MSG2, 56); MSG3 = _mm_add_epi32(MSG3, _mm_alignr_epi8(MSG2, MSG1, 4)); MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); // Rounds 60-63: final rounds, no expansion SHANI_ROUNDS(MSG3, 60); #undef SHANI_ROUNDS #undef SHANI_FULL_ROUND STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); Loading