blake3.cpp
/*
 * libxcks
 * Copyright (C) 2022 Julien Couot
 *
 * This program is free software: you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or (at your
 * option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

//---------------------------------------------------------------------------
#include <cstring>
#if defined(__GNUC__)
  #include <cassert>
#endif

#include "blake3.hpp"
//---------------------------------------------------------------------------

using namespace std;
//---------------------------------------------------------------------------


namespace libxcks
{
/* #if defined(__x86_64__) || defined(_M_X64)
#define IS_X86
#define IS_X86_64
#endif

#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif

#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif defined(BLAKE3_USE_NEON)
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif */
#define MAX_SIMD_DEGREE 1


// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)


inline BLAKE3::Output::Output(const uint32_t input_cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter, uint8_t flags)
{
  memcpy(this->input_cv, input_cv, 32);
  memcpy(this->block, block, BLAKE3_BLOCK_LEN);
  this->block_len = block_len;
  this->counter = counter;
  this->flags = flags;
}
//---------------------------------------------------------------------------


// Chaining values within a given chunk (specifically the compress_in_place
// interface) are represented as words. This avoids unnecessary bytes<->words
// conversion overhead in the portable implementation. However, the hash_many
// interface handles both user input and parent node blocks, so it accepts
// bytes. For that reason, chaining values in the CV stack are represented as
// bytes.
inline void BLAKE3::Output::chainingValue(uint8_t cv[32]) const
{
  uint32_t cv_words[8];
  memcpy(cv_words, input_cv, 32);
  compressInPlace(cv_words, block, block_len, counter, flags);
  storeCvWords(cv, cv_words);
}
//---------------------------------------------------------------------------


inline void BLAKE3::Output::rootBytes(uint64_t seek, uint8_t *out,
                                      size_t out_len)
{
  uint64_t output_block_counter = seek / 64;
  size_t offset_within_block = seek % 64;
  uint8_t wide_buf[64];
  while (out_len > 0)
  {
    compressXof(input_cv, block, block_len, output_block_counter,
                flags | static_cast<uint8_t>(Flags::ROOT), wide_buf);
    size_t available_bytes = 64 - offset_within_block;
    size_t memcpy_len;
    if (out_len > available_bytes)
      memcpy_len = available_bytes;
    else
      memcpy_len = out_len;
    memcpy(out, wide_buf + offset_within_block, memcpy_len);
    out += memcpy_len;
    out_len -= memcpy_len;
    output_block_counter += 1;
    offset_within_block = 0;
  }
}
//---------------------------------------------------------------------------
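// Example: with seek = 100, output_block_counter starts at 100 / 64 = 1 and
// offset_within_block at 100 % 64 = 36, so the first compressXof() call
// produces output block 1 and copying starts 36 bytes into it -- exactly
// byte 100 of the root output stream.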


inline BLAKE3::Output BLAKE3::Output::parentOutput(const uint8_t block[BLAKE3_BLOCK_LEN],
                                                   const uint32_t key[8],
                                                   uint8_t flags)
{
  return Output(key, block, BLAKE3_BLOCK_LEN, 0, flags |
                static_cast<uint8_t>(Flags::PARENT));
}
//---------------------------------------------------------------------------


inline size_t BLAKE3::ChunkState::fillBuf(const uint8_t* input,
                                          size_t input_len)
{
  size_t take = BLAKE3_BLOCK_LEN - (static_cast<size_t>(buf_len));
  if (take > input_len)
    take = input_len;
  uint8_t *dest = buf + (static_cast<size_t>(buf_len));
  memcpy(dest, input, take);
  buf_len += static_cast<uint8_t>(take);
  return take;
}
//---------------------------------------------------------------------------


inline uint8_t BLAKE3::ChunkState::maybeStartFlag() const
{
  if (blocks_compressed == 0)
    return static_cast<uint8_t>(Flags::CHUNK_START);
  else
    return 0;
}
//---------------------------------------------------------------------------


inline BLAKE3::Output BLAKE3::ChunkState::stateOutput() const
{
  uint8_t block_flags = flags | maybeStartFlag() |
                        static_cast<uint8_t>(Flags::CHUNK_END);
  return Output(cv, buf, buf_len, chunk_counter, block_flags);
}
//---------------------------------------------------------------------------


/*
 * Resets the chunk to initial value.
 */
inline void BLAKE3::ChunkState::init(const uint32_t key[8], const uint8_t flags)
{
  memcpy(cv, key, BLAKE3_KEY_LEN);
  chunk_counter = 0;
  memset(buf, 0, BLAKE3_BLOCK_LEN);
  buf_len = 0;
  blocks_compressed = 0;
  this->flags = flags;
}
//---------------------------------------------------------------------------


inline void BLAKE3::ChunkState::reset(const uint32_t key[8],
                                      const uint64_t chunk_counter)
{
  memcpy(cv, key, BLAKE3_KEY_LEN);
  this->chunk_counter = chunk_counter;
  blocks_compressed = 0;
  memset(buf, 0, BLAKE3_BLOCK_LEN);
  buf_len = 0;
}
//---------------------------------------------------------------------------


inline size_t BLAKE3::ChunkState::len() const
{
  return (BLAKE3_BLOCK_LEN * static_cast<size_t>(blocks_compressed)) +
         (static_cast<size_t>(buf_len));
}
//---------------------------------------------------------------------------


inline void BLAKE3::ChunkState::update(const uint8_t* input, size_t input_len)
{
  if (buf_len > 0)
  {
    size_t take = fillBuf(input, input_len);
    input += take;
    input_len -= take;
    if (input_len > 0)
    {
      compressInPlace(cv, buf, BLAKE3_BLOCK_LEN, chunk_counter,
                      flags | maybeStartFlag());
      blocks_compressed += 1;
      buf_len = 0;
      memset(buf, 0, BLAKE3_BLOCK_LEN);
    }
  }

  while (input_len > BLAKE3_BLOCK_LEN)
  {
    compressInPlace(cv, input, BLAKE3_BLOCK_LEN, chunk_counter,
                    flags | maybeStartFlag());
    blocks_compressed += 1;
    input += BLAKE3_BLOCK_LEN;
    input_len -= BLAKE3_BLOCK_LEN;
  }

  size_t take = fillBuf(input, input_len);
  input += take;
  input_len -= take;
}
//---------------------------------------------------------------------------


inline unsigned int BLAKE3::popcnt(uint64_t x)
{
  #if defined(__GNUC__) || defined(__clang__)
  return __builtin_popcountll(x);
  #else
  unsigned int count = 0;
  while (x != 0)
  {
    count += 1;
    x &= x - 1;
  }
  return count;
  #endif
}
//---------------------------------------------------------------------------
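// Example: popcnt(0b1101) == 3. The portable fallback clears the lowest set
// bit on each pass (x &= x - 1): 0b1101 -> 0b1100 -> 0b1000 -> 0, three
// iterations.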


inline unsigned int BLAKE3::highestOne(uint64_t x)
{
  #if defined(__GNUC__) || defined(__clang__)
  return 63 ^ __builtin_clzll(x);
  #elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long index;
  _BitScanReverse64(&index, x);
  return index;
  #elif defined(_MSC_VER) && defined(IS_X86_32)
  if (x >> 32)
  {
    unsigned long index;
    _BitScanReverse(&index, static_cast<unsigned long>(x >> 32));
    return 32 + index;
  }
  else
  {
    unsigned long index;
    _BitScanReverse(&index, static_cast<unsigned long>(x));
    return index;
  }
  #else
  unsigned int c = 0;
  if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
  if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
  if (x & 0x000000000000ff00ULL) { x >>= 8;  c += 8; }
  if (x & 0x00000000000000f0ULL) { x >>= 4;  c += 4; }
  if (x & 0x000000000000000cULL) { x >>= 2;  c += 2; }
  if (x & 0x0000000000000002ULL) { c += 1; }
  return c;
  #endif
}
//---------------------------------------------------------------------------
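// Example: highestOne(1) == 0 and highestOne(0x8000000000000000ULL) == 63.
// The GCC/Clang branch uses the identity 63 ^ clz(x) == 63 - clz(x) for
// nonzero x; callers must not pass 0 there, since __builtin_clzll(0) is
// undefined.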


inline uint64_t BLAKE3::roundDownToPowerOf2(uint64_t x)
{
  return UINT64_C(1) << highestOne(x | 1);
}
//---------------------------------------------------------------------------
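// Examples: roundDownToPowerOf2(1) == 1, roundDownToPowerOf2(5) == 4,
// roundDownToPowerOf2(64) == 64. The x | 1 guard maps the otherwise
// undefined x == 0 case to 1.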


inline uint32_t BLAKE3::counterLow(uint64_t counter)
{
  return static_cast<uint32_t>(counter);
}
//---------------------------------------------------------------------------

inline uint32_t BLAKE3::counterHigh(uint64_t counter)
{
  return static_cast<uint32_t>(counter >> 32);
}
//---------------------------------------------------------------------------


inline uint32_t BLAKE3::load32(const void *src)
{
  const uint8_t *p = static_cast<const uint8_t *>(src);
  return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
         ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
}
//---------------------------------------------------------------------------


inline uint32_t BLAKE3::rotr32(const uint32_t w, const uint32_t c)
{
  return (w >> c) | (w << (32 - c));
}
//---------------------------------------------------------------------------


// Given some input larger than one chunk, return the number of bytes that
// should go in the left subtree. This is the largest power-of-2 number of
// chunks that leaves at least 1 byte for the right subtree.
inline size_t BLAKE3::left_len(size_t content_len)
{
  // Subtract 1 to reserve at least one byte for the right side. content_len
  // should always be greater than BLAKE3_CHUNK_LEN.
  size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
  return roundDownToPowerOf2(full_chunks) * BLAKE3_CHUNK_LEN;
}
//---------------------------------------------------------------------------
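// Example (with the standard BLAKE3_CHUNK_LEN of 1024): left_len(2049) ==
// 2048, i.e. two full chunks on the left and one byte on the right, while
// left_len(2048) == 1024, since taking both chunks would leave nothing for
// the right subtree.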


inline void BLAKE3::store32(void *dst, uint32_t w)
{
  uint8_t *p = static_cast<uint8_t *>(dst);
  p[0] = static_cast<uint8_t>(w >> 0);
  p[1] = static_cast<uint8_t>(w >> 8);
  p[2] = static_cast<uint8_t>(w >> 16);
  p[3] = static_cast<uint8_t>(w >> 24);
}
//---------------------------------------------------------------------------


inline void BLAKE3::storeCvWords(uint8_t bytes_out[32], uint32_t cv_words[8])
{
  store32(&bytes_out[0 * 4], cv_words[0]);
  store32(&bytes_out[1 * 4], cv_words[1]);
  store32(&bytes_out[2 * 4], cv_words[2]);
  store32(&bytes_out[3 * 4], cv_words[3]);
  store32(&bytes_out[4 * 4], cv_words[4]);
  store32(&bytes_out[5 * 4], cv_words[5]);
  store32(&bytes_out[6 * 4], cv_words[6]);
  store32(&bytes_out[7 * 4], cv_words[7]);
}
//---------------------------------------------------------------------------


inline void BLAKE3::g(uint32_t *state, const size_t a,
                      const size_t b, const size_t c,
                      const size_t d, const uint32_t x,
                      const uint32_t y)
{
  state[a] = state[a] + state[b] + x;
  state[d] = rotr32(state[d] ^ state[a], 16);
  state[c] = state[c] + state[d];
  state[b] = rotr32(state[b] ^ state[c], 12);
  state[a] = state[a] + state[b] + y;
  state[d] = rotr32(state[d] ^ state[a], 8);
  state[c] = state[c] + state[d];
  state[b] = rotr32(state[b] ^ state[c], 7);
}
//---------------------------------------------------------------------------
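// g() is the BLAKE3 quarter-round: two add-xor-rotate passes that mix the
// message words x and y into state words a, b, c and d, with the rotation
// distances 16, 12, 8 and 7 fixed by the BLAKE3 specification.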


inline void BLAKE3::roundFn(uint32_t state[16], const uint32_t *msg,
                            size_t round)
{
  // Select the message schedule based on the round.
  const uint8_t *schedule = MSG_SCHEDULE[round];

  // Mix the columns.
  g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
  g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
  g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
  g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);

  // Mix the rows.
  g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
  g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
  g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
  g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}
//---------------------------------------------------------------------------


inline void BLAKE3::compressPre(uint32_t state[16],
                                const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len,
                                uint64_t counter, uint8_t flags)
{
  uint32_t block_words[16];
  block_words[0] = load32(block + 4 * 0);
  block_words[1] = load32(block + 4 * 1);
  block_words[2] = load32(block + 4 * 2);
  block_words[3] = load32(block + 4 * 3);
  block_words[4] = load32(block + 4 * 4);
  block_words[5] = load32(block + 4 * 5);
  block_words[6] = load32(block + 4 * 6);
  block_words[7] = load32(block + 4 * 7);
  block_words[8] = load32(block + 4 * 8);
  block_words[9] = load32(block + 4 * 9);
  block_words[10] = load32(block + 4 * 10);
  block_words[11] = load32(block + 4 * 11);
  block_words[12] = load32(block + 4 * 12);
  block_words[13] = load32(block + 4 * 13);
  block_words[14] = load32(block + 4 * 14);
  block_words[15] = load32(block + 4 * 15);

  state[0] = cv[0];
  state[1] = cv[1];
  state[2] = cv[2];
  state[3] = cv[3];
  state[4] = cv[4];
  state[5] = cv[5];
  state[6] = cv[6];
  state[7] = cv[7];
  state[8] = IV[0];
  state[9] = IV[1];
  state[10] = IV[2];
  state[11] = IV[3];
  state[12] = counterLow(counter);
  state[13] = counterHigh(counter);
  state[14] = static_cast<uint32_t>(block_len);
  state[15] = static_cast<uint32_t>(flags);

  roundFn(state, &block_words[0], 0);
  roundFn(state, &block_words[0], 1);
  roundFn(state, &block_words[0], 2);
  roundFn(state, &block_words[0], 3);
  roundFn(state, &block_words[0], 4);
  roundFn(state, &block_words[0], 5);
  roundFn(state, &block_words[0], 6);
}
//---------------------------------------------------------------------------


inline void BLAKE3::compressInPlace(uint32_t cv[8],
                                    const uint8_t block[BLAKE3_BLOCK_LEN],
                                    uint8_t block_len,
                                    uint64_t counter,
                                    uint8_t flags)
{
  uint32_t state[16];
  compressPre(state, cv, block, block_len, counter, flags);
  cv[0] = state[0] ^ state[8];
  cv[1] = state[1] ^ state[9];
  cv[2] = state[2] ^ state[10];
  cv[3] = state[3] ^ state[11];
  cv[4] = state[4] ^ state[12];
  cv[5] = state[5] ^ state[13];
  cv[6] = state[6] ^ state[14];
  cv[7] = state[7] ^ state[15];
}
//---------------------------------------------------------------------------


inline void BLAKE3::compressXof(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[64])
{
  uint32_t state[16];
  compressPre(state, cv, block, block_len, counter, flags);

  store32(&out[0 * 4], state[0] ^ state[8]);
  store32(&out[1 * 4], state[1] ^ state[9]);
  store32(&out[2 * 4], state[2] ^ state[10]);
  store32(&out[3 * 4], state[3] ^ state[11]);
  store32(&out[4 * 4], state[4] ^ state[12]);
  store32(&out[5 * 4], state[5] ^ state[13]);
  store32(&out[6 * 4], state[6] ^ state[14]);
  store32(&out[7 * 4], state[7] ^ state[15]);
  store32(&out[8 * 4], state[8] ^ cv[0]);
  store32(&out[9 * 4], state[9] ^ cv[1]);
  store32(&out[10 * 4], state[10] ^ cv[2]);
  store32(&out[11 * 4], state[11] ^ cv[3]);
  store32(&out[12 * 4], state[12] ^ cv[4]);
  store32(&out[13 * 4], state[13] ^ cv[5]);
  store32(&out[14 * 4], state[14] ^ cv[6]);
  store32(&out[15 * 4], state[15] ^ cv[7]);
}
//---------------------------------------------------------------------------
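// Note the relationship between the two compressions above: compressInPlace()
// keeps only the first half of the feed-forward (the new chaining value),
// while compressXof() also emits the second half, state[8..15] ^ cv[0..7],
// yielding the full 64-byte block used for extended output.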


inline void BLAKE3::hashOne(const uint8_t *input, size_t blocks,
                            const uint32_t key[8], uint64_t counter,
                            uint8_t flags, uint8_t flags_start,
                            uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN])
{
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN);
  uint8_t block_flags = flags | flags_start;
  while (blocks > 0)
  {
    if (blocks == 1)
      block_flags |= flags_end;
    compressInPlace(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
    input = &input[BLAKE3_BLOCK_LEN];
    blocks -= 1;
    block_flags = flags;
  }
  storeCvWords(out, cv);
}
//---------------------------------------------------------------------------


inline void BLAKE3::hashMany(const uint8_t *const *inputs, size_t num_inputs,
                             size_t blocks, const uint32_t key[8],
                             uint64_t counter, bool increment_counter,
                             uint8_t flags, uint8_t flags_start,
                             uint8_t flags_end, uint8_t *out)
{
  while (num_inputs > 0)
  {
    hashOne(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out);
    if (increment_counter)
      counter += 1;
    inputs += 1;
    num_inputs -= 1;
    out = &out[BLAKE3_OUT_LEN];
  }
}
//---------------------------------------------------------------------------


// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
// on a single thread. Write out the parent chaining values and return the
// number of parents hashed. (If there's an odd input chaining value left over,
// return it as an additional output.) These parents are never the root and
// never empty; those cases use a different codepath.
inline size_t BLAKE3::compressParentsParallel(const uint8_t *child_chaining_values,
                                              size_t num_chaining_values,
                                              const uint32_t key[8],
                                              uint8_t flags,
                                              uint8_t *out)
{
  const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2];
  size_t parents_array_len = 0;
  while (num_chaining_values - (2 * parents_array_len) >= 2)
  {
    parents_array[parents_array_len] =
        &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
    parents_array_len += 1;
  }

  hashMany(parents_array, parents_array_len, 1, key,
           0, // Parents always use counter 0.
           false, flags | static_cast<uint8_t>(Flags::PARENT),
           0, // Parents have no start flags.
           0, // Parents have no end flags.
           out);

  // If there's an odd child left over, it becomes an output.
  if (num_chaining_values > 2 * parents_array_len)
  {
    memcpy(&out[parents_array_len * BLAKE3_OUT_LEN],
           &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN],
           BLAKE3_OUT_LEN);
    return parents_array_len + 1;
  }
  else
    return parents_array_len;
}
//---------------------------------------------------------------------------
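// Example: with num_chaining_values == 5, two parents are hashed from child
// pairs (0,1) and (2,3), the odd child 4 is copied through unmerged, and the
// function returns 3.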


inline size_t BLAKE3::compressChunksParallel(const uint8_t *input,
                                             size_t input_len,
                                             const uint32_t key[8],
                                             uint64_t chunk_counter,
                                             uint8_t flags,
                                             uint8_t *out)
{
  const uint8_t *chunks_array[MAX_SIMD_DEGREE];
  size_t input_position = 0;
  size_t chunks_array_len = 0;
  while (input_len - input_position >= BLAKE3_CHUNK_LEN)
  {
    chunks_array[chunks_array_len] = &input[input_position];
    input_position += BLAKE3_CHUNK_LEN;
    chunks_array_len += 1;
  }

  hashMany(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN,
           key, chunk_counter, true, flags, static_cast<uint8_t>(Flags::CHUNK_START),
           static_cast<uint8_t>(Flags::CHUNK_END), out);

  // Hash the remaining partial chunk, if there is one. Note that the empty
  // chunk (meaning the empty message) is a different codepath.
  if (input_len > input_position)
  {
    uint64_t counter = chunk_counter + static_cast<uint64_t>(chunks_array_len);
    ChunkState chunk_state;
    chunk_state.init(key, flags);
    chunk_state.setChunkCounter(counter);
    chunk_state.update(&input[input_position], input_len - input_position);
    Output output = chunk_state.stateOutput();
    output.chainingValue(&out[chunks_array_len * BLAKE3_OUT_LEN]);
    return chunks_array_len + 1;
  }
  else
    return chunks_array_len;
}
//---------------------------------------------------------------------------
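// Example: with input_len == 2 * BLAKE3_CHUNK_LEN + 512, the two complete
// chunks go through hashMany() and the trailing 512 bytes go through a
// temporary ChunkState, so three chaining values are written and the
// function returns 3.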


// The wide helper function writes out an array of chaining values and
// returns the length of that array. The number of chaining values returned
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// if the input is shorter than that many chunks. The reason for maintaining a
// wide array of chaining values going back up the tree is to allow the
// implementation to hash as many parents in parallel as possible.
//
// As a special case when the SIMD degree is 1, this function will still return
// at least 2 outputs. This guarantees that this function doesn't perform the
// root compression. (If it did, it would use the wrong flags, and also we
// wouldn't be able to implement extendable output.) Note that this function is
// not used when the whole input is only 1 chunk long; that's a different
// codepath.
//
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multi-threading parallelism for that update().
size_t BLAKE3::compressSubtreeWide(const uint8_t *input, size_t input_len,
                                   const uint32_t key[8],
                                   uint64_t chunk_counter,
                                   uint8_t flags, uint8_t *out)
{
  // Note that the single chunk case does *not* bump the SIMD degree up to 2
  // when it is 1. If this implementation adds multi-threading in the future,
  // this gives us the option of multi-threading even the 2-chunk case, which
  // can help performance on smaller platforms.
  if (input_len <= simdDegree() * BLAKE3_CHUNK_LEN)
    return compressChunksParallel(input, input_len, key, chunk_counter, flags,
                                  out);

  // With more than simd_degree chunks, we need to recurse. Start by dividing
  // the input into left and right subtrees. (Note that this is only optimal
  // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
  // of 3 or something, we'll need a more complicated strategy.)
  size_t left_input_len = left_len(input_len);
  size_t right_input_len = input_len - left_input_len;
  const uint8_t *right_input = &input[left_input_len];
  uint64_t right_chunk_counter =
      chunk_counter + static_cast<uint64_t>(left_input_len / BLAKE3_CHUNK_LEN);

  // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
  // account for the special case of returning 2 outputs when the SIMD degree
  // is 1.
  uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
  size_t degree = simdDegree();
  if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1)
  {
    // The special case: We always use a degree of at least two, to make
    // sure there are two outputs. Except, as noted above, at the chunk
    // level, where we allow degree=1. (Note that the 1-chunk-input case is
    // a different codepath.)
    degree = 2;
  }
  uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];

  // Recurse! If this implementation adds multi-threading support in the
  // future, this is where it will go.
  size_t left_n = compressSubtreeWide(input, left_input_len, key, chunk_counter,
                                      flags, cv_array);
  size_t right_n = compressSubtreeWide(right_input, right_input_len, key,
                                       right_chunk_counter, flags, right_cvs);

  // The special case again. If simd_degree=1, then we'll have left_n=1 and
  // right_n=1. Rather than compressing them into a single output, return
  // them directly, to make sure we always have at least two outputs.
  if (left_n == 1)
  {
    memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
    return 2;
  }

  // Otherwise, do one layer of parent node compression.
  size_t num_chaining_values = left_n + right_n;
  return compressParentsParallel(cv_array, num_chaining_values, key, flags, out);
}
//---------------------------------------------------------------------------


// Hash a subtree with compressSubtreeWide(), and then condense the resulting
// list of chaining values down to a single parent node. Don't compress that
// last parent node, however. Instead, return its message bytes (the
// concatenated chaining values of its children). This is necessary when the
// first call to update() supplies a complete subtree, because the topmost
// parent node of that subtree could end up being the root. It's also necessary
// for extended output in the general case.
//
// As with compressSubtreeWide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
inline void BLAKE3::compressSubtreeToParentNode(const uint8_t *input,
                                                size_t input_len,
                                                const uint32_t key[8],
                                                uint64_t chunk_counter,
                                                uint8_t flags,
                                                uint8_t out[2 * BLAKE3_OUT_LEN])
{
  uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
  size_t num_cvs = compressSubtreeWide(input, input_len, key,
                                       chunk_counter, flags, cv_array);
  #if defined(__GNUC__)
  assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
  #endif

  // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
  // compressSubtreeWide() returns more than 2 chaining values. Condense
  // them into 2 by forming parent nodes repeatedly.
  uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
  #if defined(__GNUC__)
  // The second half of this loop condition is always true, and we just
  // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
  // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
  // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
  // this code, test it against that version.
  while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
  #else
  while (num_cvs > 2) {
  #endif
    num_cvs =
        compressParentsParallel(cv_array, num_cvs, key, flags, out_array);
    memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
  }
  memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
}
//---------------------------------------------------------------------------


// As described in pushCv() below, we do "lazy merging", delaying
// merges until right before the next CV is about to be added. This is
// different from the reference implementation. Another difference is that we
// aren't always merging 1 chunk at a time. Instead, each CV might represent
// any power-of-two number of chunks, as long as the smaller-above-larger stack
// order is maintained. Instead of the "count the trailing 0-bits" algorithm
// described in the spec, we use a "count the total number of 1-bits" variant
// that doesn't require us to retain the subtree size of the CV on top of the
// stack. The principle is the same: each CV that should remain in the stack is
// represented by a 1-bit in the total number of chunks (or bytes) so far.
inline void BLAKE3::mergeCvStack(uint64_t total_len)
{
  size_t post_merge_stack_len = static_cast<size_t>(popcnt(total_len));
  while (cv_stack_len > post_merge_stack_len)
  {
    uint8_t *parent_node = &cv_stack[(cv_stack_len - 2) * BLAKE3_OUT_LEN];
    Output output = Output::parentOutput(parent_node, key, chunk.getFlags());
    output.chainingValue(parent_node);
    cv_stack_len -= 1;
  }
}
//---------------------------------------------------------------------------
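// Example: after 5 chunks the chunk count is 0b101, whose popcount is 2, so
// mergeCvStack() leaves exactly two CVs on the stack: one for the completed
// 4-chunk subtree and one for the fifth chunk.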


// In reference_impl.rs, we merge the new CV with existing CVs from the stack
// before pushing it. We can do that because we know more input is coming, so
// we know none of the merges are root.
//
// This setting is different. We want to feed as much input as possible to
// compressSubtreeWide(), without setting aside anything for the chunk state.
// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once
// as a single subtree, if at all possible.
//
// This leads to two problems:
// 1) This 64 KiB input might be the only call that ever gets made to update().
//    In this case, the root node of the 64 KiB subtree would be the root node
//    of the whole tree, and it would need to be ROOT finalized. We can't
//    compress it until we know.
// 2) This 64 KiB input might complete a larger tree, whose root node is
//    similarly going to be the root of the whole tree. For example, maybe
//    we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
//    node at the root of the 256 KiB subtree until we know how to finalize it.
//
// The second problem is solved with "lazy merging". That is, when we're about
// to add a CV to the stack, we don't merge it with anything first, as the
// reference impl does. Instead we do merges using the *previous* CV that was
// added, which is sitting on top of the stack, and we put the new CV
// (unmerged) on top of the stack afterwards. This guarantees that we never
// merge the root node until finalization.
//
// Solving the first problem requires an additional tool,
// compressSubtreeToParentNode(). That function always returns the top
// *two* chaining values of the subtree it's compressing. We then do lazy
// merging with each of them separately, so that the second CV will always
// remain unmerged. (That also helps us support extendable output when we're
// hashing an input all-at-once.)
inline void BLAKE3::pushCv(uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter)
{
  mergeCvStack(chunk_counter);
  memcpy(&cv_stack[cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN);
  cv_stack_len += 1;
}
//---------------------------------------------------------------------------


void BLAKE3::finalizeSeek(uint64_t seek, uint8_t *out, size_t out_len)
{
  // Explicitly checking for zero avoids causing UB by passing a null pointer
  // to memcpy. This comes up in practice with things like:
  //   std::vector<uint8_t> v;
  //   hasher.finalizeSeek(0, v.data(), v.size());
  if (out_len == 0)
    return;

  // If the subtree stack is empty, then the current chunk is the root.
  if (cv_stack_len == 0)
  {
    Output output = chunk.stateOutput();
    output.rootBytes(seek, out, out_len);
    return;
  }
  // If there are any bytes in the chunk state, finalize that chunk and do a
  // roll-up merge between that chunk hash and every subtree in the stack. In
  // this case, the extra merge loop at the end of update() guarantees that
  // none of the subtrees in the stack need to be merged with each other
  // first. Otherwise, if there are no bytes in the chunk state, then the top
  // of the stack is a chunk hash, and we start the merge from that.
  Output output;
  size_t cvs_remaining;
  if (chunk.len() > 0)
  {
    cvs_remaining = cv_stack_len;
    output = chunk.stateOutput();
  }
  else
  {
    // There are always at least 2 CVs in the stack in this case.
    cvs_remaining = cv_stack_len - 2;
    output = Output::parentOutput(&cv_stack[cvs_remaining * 32], key,
                                  chunk.getFlags());
  }
  while (cvs_remaining > 0)
  {
    cvs_remaining -= 1;
    uint8_t parent_block[BLAKE3_BLOCK_LEN];
    memcpy(parent_block, &cv_stack[cvs_remaining * 32], 32);
    output.chainingValue(&parent_block[32]);
    output = Output::parentOutput(parent_block, key, chunk.getFlags());
  }
  output.rootBytes(seek, out, out_len);
}
//---------------------------------------------------------------------------
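// Extended-output sketch (illustrative, assuming finalizeSeek() is reachable
// from caller code): because the seek parameter selects a byte offset in the
// root output stream, arbitrarily long output can be read in independent
// pieces:
//
//   uint8_t part1[32], part2[32];
//   hasher.finalizeSeek(0,  part1, 32);  // bytes 0..31 of the output stream
//   hasher.finalizeSeek(32, part2, 32);  // bytes 32..63 of the same stream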


/*
 * Default constructor.
 */
BLAKE3::BLAKE3()
{
  reset();
}
//---------------------------------------------------------------------------


/*
 * Resets the BLAKE3 hash to initial value.
 */
void BLAKE3::reset()
{
  memcpy(key, IV, BLAKE3_KEY_LEN);
  chunk.init(key, 0);
  cv_stack_len = 0;
}
//---------------------------------------------------------------------------


/*
 * Updates the BLAKE3 hash with specified array of bytes.
 */
void BLAKE3::update(const uint8_t* buf, size_t len)
{
  // Explicitly checking for zero avoids causing UB by passing a null pointer
  // to memcpy. This comes up in practice with things like:
  //   std::vector<uint8_t> v;
  //   hasher.update(v.data(), v.size());
  if (len == 0)
    return;

  // If we have some partial chunk bytes in the internal chunk state, we need
  // to finish that chunk first.
  if (chunk.len() > 0)
  {
    size_t take = BLAKE3_CHUNK_LEN - chunk.len();
    if (take > len)
      take = len;
    chunk.update(buf, take);
    buf += take;
    len -= take;
    // If we've filled the current chunk and there's more coming, finalize this
    // chunk and proceed. In this case we know it's not the root.
    if (len > 0)
    {
      Output output = chunk.stateOutput();
      uint8_t chunk_cv[32];
      output.chainingValue(chunk_cv);
      pushCv(chunk_cv, chunk.getChunkCounter());
      chunk.reset(key, chunk.getChunkCounter() + 1);
    }
    else
      return;
  }

  // Now the chunk state is clear, and we have more input. If there's more than
  // a single chunk (so, definitely not the root chunk), hash the largest whole
  // subtree we can, with the full benefits of SIMD (and maybe in the future,
  // multi-threading) parallelism. Two restrictions:
  // - The subtree has to be a power-of-2 number of chunks. Only subtrees along
  //   the right edge can be incomplete, and we don't know where the right edge
  //   is going to be until we get to finalization.
  // - The subtree must evenly divide the total number of chunks up until this
  //   point (if the total is not 0). If the current incomplete subtree is only
  //   waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
  //   to complete the current subtree first.
  // Because we might need to break up the input to form powers of 2, or to
  // evenly divide what we already have, this part runs in a loop.
  while (len > BLAKE3_CHUNK_LEN)
  {
    size_t subtree_len = roundDownToPowerOf2(len);
    uint64_t count_so_far = chunk.getChunkCounter() * BLAKE3_CHUNK_LEN;
    // Shrink the subtree_len until it evenly divides the count so far. We know
    // that subtree_len itself is a power of 2, so we can use a bitmasking
    // trick instead of an actual remainder operation. (Note that if the caller
    // consistently passes power-of-2 inputs of the same size, as is hopefully
    // typical, this loop condition will always fail, and subtree_len will
    // always be the full length of the input.)
    //
    // An aside: We don't have to shrink subtree_len quite this much. For
    // example, if count_so_far is 1, we could pass 2 chunks to
    // compressSubtreeToParentNode(). Since we'll get 2 CVs back, we'll still
    // get the right answer in the end, and we might get to use 2-way SIMD
    // parallelism. The problem with this optimization is that it gets us
    // stuck always hashing 2 chunks. The total number of chunks will remain
    // odd, and we'll never graduate to higher degrees of parallelism. See
    // https://github.com/BLAKE3-team/BLAKE3/issues/69.
    while (((static_cast<uint64_t>(subtree_len - 1)) & count_so_far) != 0)
      subtree_len /= 2;

    // The shrunken subtree_len might now be 1 chunk long. If so, hash that one
    // chunk by itself. Otherwise, compress the subtree into a pair of CVs.
    uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
    if (subtree_len <= BLAKE3_CHUNK_LEN)
    {
      ChunkState chunk_state;
      chunk_state.init(key, chunk.getFlags());
      chunk_state.setChunkCounter(chunk.getChunkCounter());
      chunk_state.update(buf, subtree_len);
      Output output = chunk_state.stateOutput();
      uint8_t cv[BLAKE3_OUT_LEN];
      output.chainingValue(cv);
      pushCv(cv, chunk_state.getChunkCounter());
    }
    else
    {
      // This is the high-performance happy path, though getting here depends
      // on the caller giving us a long enough input.
      uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
      compressSubtreeToParentNode(buf, subtree_len, key,
                                  chunk.getChunkCounter(),
                                  chunk.getFlags(), cv_pair);
      pushCv(cv_pair, chunk.getChunkCounter());
      pushCv(&cv_pair[BLAKE3_OUT_LEN],
             chunk.getChunkCounter() + (subtree_chunks / 2));
    }
    chunk.setChunkCounter(chunk.getChunkCounter() + subtree_chunks);
    buf += subtree_len;
    len -= subtree_len;
  }

  // If there's any remaining input less than a full chunk, add it to the chunk
  // state. In that case, also do a final merge loop to make sure the subtree
  // stack doesn't contain any unmerged pairs. The remaining input means we
  // know these merges are non-root. This merge loop isn't strictly necessary
  // here, because pushCv() already does its own merge loop, but it
  // simplifies finalizeSeek() below.
  if (len > 0)
  {
    chunk.update(buf, len);
    mergeCvStack(chunk.getChunkCounter());
  }
}
//---------------------------------------------------------------------------


/*
 * Returns the BLAKE3 hash value in the first 32 bytes of the given address.
 */
uint8_t* BLAKE3::getValue(uint8_t* buffer) const
{
  BLAKE3 blake3(*this);
  blake3.finalizeSeek(0, static_cast<uint8_t*>(buffer), getSize());

  return buffer;
}
//---------------------------------------------------------------------------
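// A minimal usage sketch (illustrative; assumes the reset()/update()/
// getValue() interface declared in blake3.hpp):
//
//   libxcks::BLAKE3 hasher;
//   hasher.update(reinterpret_cast<const uint8_t*>("abc"), 3);
//   uint8_t digest[32];
//   hasher.getValue(digest);  // 32-byte (256-bit) BLAKE3 hash of "abc"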
} // namespace libxcks
//---------------------------------------------------------------------------