58 #define MAX_SIMD_DEGREE 1
63 #define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
66 inline BLAKE3::Output::Output(
const uint32_t input_cv[8],
67 const uint8_t block[BLAKE3_BLOCK_LEN],
68 uint8_t block_len, uint64_t counter, uint8_t flags)
70 memcpy(this->input_cv, input_cv, 32);
71 memcpy(this->block, block, BLAKE3_BLOCK_LEN);
72 this->block_len = block_len;
73 this->counter = counter;
85 inline void BLAKE3::Output::chainingValue(uint8_t cv[32])
const
88 memcpy(cv_words, input_cv, 32);
89 compressInPlace(cv_words, block, block_len, counter, flags);
90 storeCvWords(cv, cv_words);
95 inline void BLAKE3::Output::rootBytes(uint64_t seek, uint8_t *out,
98 uint64_t output_block_counter = seek / 64;
99 size_t offset_within_block = seek % 64;
100 uint8_t wide_buf[64];
103 compressXof(input_cv, block, block_len, output_block_counter,
104 flags |
static_cast<uint8_t
>(Flags::ROOT), wide_buf);
105 size_t available_bytes = 64 - offset_within_block;
107 if (out_len > available_bytes)
108 memcpy_len = available_bytes;
110 memcpy_len = out_len;
111 memcpy(out, wide_buf + offset_within_block, memcpy_len);
113 out_len -= memcpy_len;
114 output_block_counter += 1;
115 offset_within_block = 0;
121 inline BLAKE3::Output BLAKE3::Output::parentOutput(
const uint8_t block[BLAKE3_BLOCK_LEN],
122 const uint32_t key[8],
125 return Output(key, block, BLAKE3_BLOCK_LEN, 0, flags |
126 static_cast<uint8_t
>(Flags::PARENT));
131 inline size_t BLAKE3::ChunkState::fillBuf(
const uint8_t* input,
134 size_t take = BLAKE3_BLOCK_LEN - (
static_cast<size_t>(buf_len));
135 if (take > input_len)
137 uint8_t *dest = buf + (
static_cast<size_t>(buf_len));
138 memcpy(dest, input, take);
139 buf_len +=
static_cast<uint8_t
>(take);
145 inline uint8_t BLAKE3::ChunkState::maybeStartFlag()
const
147 if (blocks_compressed == 0)
148 return static_cast<uint8_t
>(Flags::CHUNK_START);
155 inline BLAKE3::Output BLAKE3::ChunkState::stateOutput()
const
157 uint8_t block_flags = flags | maybeStartFlag() |
158 static_cast<uint8_t
>(Flags::CHUNK_END);
159 return Output(cv, buf, buf_len, chunk_counter, block_flags);
167 inline void BLAKE3::ChunkState::init(
const uint32_t key[8],
const uint8_t flags)
169 memcpy(cv, key, BLAKE3_KEY_LEN);
171 memset(buf, 0, BLAKE3_BLOCK_LEN);
173 blocks_compressed = 0;
179 inline void BLAKE3::ChunkState::reset(
const uint32_t key[8],
180 const uint64_t chunk_counter)
182 memcpy(cv, key, BLAKE3_KEY_LEN);
183 this->chunk_counter = chunk_counter;
184 blocks_compressed = 0;
185 memset(buf, 0, BLAKE3_BLOCK_LEN);
191 inline size_t BLAKE3::ChunkState::len()
const
193 return (BLAKE3_BLOCK_LEN *
static_cast<size_t>(blocks_compressed)) +
194 (
static_cast<size_t>(buf_len));
199 inline void BLAKE3::ChunkState::update(
const uint8_t* input,
size_t input_len)
203 size_t take = fillBuf(input, input_len);
208 compressInPlace(cv, buf, BLAKE3_BLOCK_LEN, chunk_counter,
209 flags | maybeStartFlag());
210 blocks_compressed += 1;
212 memset(buf, 0, BLAKE3_BLOCK_LEN);
216 while (input_len > BLAKE3_BLOCK_LEN)
218 compressInPlace(cv, input, BLAKE3_BLOCK_LEN, chunk_counter,
219 flags | maybeStartFlag());
220 blocks_compressed += 1;
221 input += BLAKE3_BLOCK_LEN;
222 input_len -= BLAKE3_BLOCK_LEN;
225 size_t take = fillBuf(input, input_len);
232 inline unsigned int BLAKE3::popcnt(uint64_t x)
234 #if defined(__GNUC__) || defined(__clang__)
235 return __builtin_popcountll(x);
237 unsigned int count = 0;
249 inline unsigned int BLAKE3::highestOne(uint64_t x)
251 #if defined(__GNUC__) || defined(__clang__)
252 return 63 ^ __builtin_clzll(x);
253 #elif defined(_MSC_VER) && defined(IS_X86_64)
255 _BitScanReverse64(&index, x);
257 #elif defined(_MSC_VER) && defined(IS_X86_32)
261 _BitScanReverse(&index, x >> 32);
267 _BitScanReverse(&index, x);
272 if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
273 if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
274 if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; }
275 if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; }
276 if(x & 0x000000000000000cULL) { x >>= 2; c += 2; }
277 if(x & 0x0000000000000002ULL) { c += 1; }
284 inline uint64_t BLAKE3::roundDownToPowerOf2(uint64_t x)
286 return UINT64_C(1) << highestOne(x | 1);
291 inline uint32_t BLAKE3::counterLow(uint64_t counter)
293 return static_cast<uint32_t
>(counter);
297 inline uint32_t BLAKE3::counterHigh(uint64_t counter)
299 return static_cast<uint32_t
>(counter >> 32);
304 inline uint32_t BLAKE3::load32(
const void *src)
306 const uint8_t *p =
static_cast<const uint8_t *
>(src);
307 return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
308 ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
313 inline uint32_t BLAKE3::rotr32(
const uint32_t w,
const uint32_t c)
315 return (w >> c) | (w << (32 - c));
323 inline size_t BLAKE3::left_len(
size_t content_len)
327 size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
328 return roundDownToPowerOf2(full_chunks) * BLAKE3_CHUNK_LEN;
333 inline void BLAKE3::store32(
void *dst, uint32_t w)
335 uint8_t *p =
static_cast<uint8_t *
>(dst);
336 p[0] =
static_cast<uint8_t
>(w >> 0);
337 p[1] =
static_cast<uint8_t
>(w >> 8);
338 p[2] =
static_cast<uint8_t
>(w >> 16);
339 p[3] =
static_cast<uint8_t
>(w >> 24);
344 inline void BLAKE3::storeCvWords(uint8_t bytes_out[32], uint32_t cv_words[8])
346 store32(&bytes_out[0 * 4], cv_words[0]);
347 store32(&bytes_out[1 * 4], cv_words[1]);
348 store32(&bytes_out[2 * 4], cv_words[2]);
349 store32(&bytes_out[3 * 4], cv_words[3]);
350 store32(&bytes_out[4 * 4], cv_words[4]);
351 store32(&bytes_out[5 * 4], cv_words[5]);
352 store32(&bytes_out[6 * 4], cv_words[6]);
353 store32(&bytes_out[7 * 4], cv_words[7]);
358 inline void BLAKE3::g(uint32_t *state,
const size_t a,
359 const size_t b,
const size_t c,
360 const size_t d,
const uint32_t x,
363 state[a] = state[a] + state[b] + x;
364 state[d] = rotr32(state[d] ^ state[a], 16);
365 state[c] = state[c] + state[d];
366 state[b] = rotr32(state[b] ^ state[c], 12);
367 state[a] = state[a] + state[b] + y;
368 state[d] = rotr32(state[d] ^ state[a], 8);
369 state[c] = state[c] + state[d];
370 state[b] = rotr32(state[b] ^ state[c], 7);
375 inline void BLAKE3::roundFn(uint32_t state[16],
const uint32_t *msg,
379 const uint8_t *schedule = MSG_SCHEDULE[round];
382 g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
383 g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
384 g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
385 g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
388 g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
389 g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
390 g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
391 g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
396 inline void BLAKE3::compressPre(uint32_t state[16],
397 const uint32_t cv[8],
398 const uint8_t block[BLAKE3_BLOCK_LEN],
400 uint64_t counter, uint8_t flags)
402 uint32_t block_words[16];
403 block_words[0] = load32(block + 4 * 0);
404 block_words[1] = load32(block + 4 * 1);
405 block_words[2] = load32(block + 4 * 2);
406 block_words[3] = load32(block + 4 * 3);
407 block_words[4] = load32(block + 4 * 4);
408 block_words[5] = load32(block + 4 * 5);
409 block_words[6] = load32(block + 4 * 6);
410 block_words[7] = load32(block + 4 * 7);
411 block_words[8] = load32(block + 4 * 8);
412 block_words[9] = load32(block + 4 * 9);
413 block_words[10] = load32(block + 4 * 10);
414 block_words[11] = load32(block + 4 * 11);
415 block_words[12] = load32(block + 4 * 12);
416 block_words[13] = load32(block + 4 * 13);
417 block_words[14] = load32(block + 4 * 14);
418 block_words[15] = load32(block + 4 * 15);
432 state[12] = counterLow(counter);
433 state[13] = counterHigh(counter);
434 state[14] =
static_cast<uint32_t
>(block_len);
435 state[15] =
static_cast<uint32_t
>(flags);
437 roundFn(state, &block_words[0], 0);
438 roundFn(state, &block_words[0], 1);
439 roundFn(state, &block_words[0], 2);
440 roundFn(state, &block_words[0], 3);
441 roundFn(state, &block_words[0], 4);
442 roundFn(state, &block_words[0], 5);
443 roundFn(state, &block_words[0], 6);
448 inline void BLAKE3::compressInPlace(uint32_t cv[8],
449 const uint8_t block[BLAKE3_BLOCK_LEN],
455 compressPre(state, cv, block, block_len, counter, flags);
456 cv[0] = state[0] ^ state[8];
457 cv[1] = state[1] ^ state[9];
458 cv[2] = state[2] ^ state[10];
459 cv[3] = state[3] ^ state[11];
460 cv[4] = state[4] ^ state[12];
461 cv[5] = state[5] ^ state[13];
462 cv[6] = state[6] ^ state[14];
463 cv[7] = state[7] ^ state[15];
468 inline void BLAKE3::compressXof(
const uint32_t cv[8],
469 const uint8_t block[BLAKE3_BLOCK_LEN],
470 uint8_t block_len, uint64_t counter,
471 uint8_t flags, uint8_t out[64])
474 compressPre(state, cv, block, block_len, counter, flags);
476 store32(&out[0 * 4], state[0] ^ state[8]);
477 store32(&out[1 * 4], state[1] ^ state[9]);
478 store32(&out[2 * 4], state[2] ^ state[10]);
479 store32(&out[3 * 4], state[3] ^ state[11]);
480 store32(&out[4 * 4], state[4] ^ state[12]);
481 store32(&out[5 * 4], state[5] ^ state[13]);
482 store32(&out[6 * 4], state[6] ^ state[14]);
483 store32(&out[7 * 4], state[7] ^ state[15]);
484 store32(&out[8 * 4], state[8] ^ cv[0]);
485 store32(&out[9 * 4], state[9] ^ cv[1]);
486 store32(&out[10 * 4], state[10] ^ cv[2]);
487 store32(&out[11 * 4], state[11] ^ cv[3]);
488 store32(&out[12 * 4], state[12] ^ cv[4]);
489 store32(&out[13 * 4], state[13] ^ cv[5]);
490 store32(&out[14 * 4], state[14] ^ cv[6]);
491 store32(&out[15 * 4], state[15] ^ cv[7]);
496 inline void BLAKE3::hashOne(
const uint8_t *input,
size_t blocks,
497 const uint32_t key[8], uint64_t counter,
498 uint8_t flags, uint8_t flags_start,
499 uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN])
502 memcpy(cv, key, BLAKE3_KEY_LEN);
503 uint8_t block_flags = flags | flags_start;
507 block_flags |= flags_end;
508 compressInPlace(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
509 input = &input[BLAKE3_BLOCK_LEN];
513 storeCvWords(out, cv);
518 inline void BLAKE3::hashMany(
const uint8_t *
const *inputs,
size_t num_inputs,
519 size_t blocks,
const uint32_t key[8],
520 uint64_t counter,
bool increment_counter,
521 uint8_t flags, uint8_t flags_start,
522 uint8_t flags_end, uint8_t *out)
524 while (num_inputs > 0)
526 hashOne(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out);
527 if (increment_counter)
531 out = &out[BLAKE3_OUT_LEN];
542 inline size_t BLAKE3::compressParentsParallel(
const uint8_t *child_chaining_values,
543 size_t num_chaining_values,
544 const uint32_t key[8],
548 const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2];
549 size_t parents_array_len = 0;
550 while (num_chaining_values - (2 * parents_array_len) >= 2)
552 parents_array[parents_array_len] =
553 &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
554 parents_array_len += 1;
557 hashMany(parents_array, parents_array_len, 1, key,
559 false, flags |
static_cast<uint8_t
>(Flags::PARENT),
565 if (num_chaining_values > 2 * parents_array_len)
567 memcpy(&out[parents_array_len * BLAKE3_OUT_LEN],
568 &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN],
570 return parents_array_len + 1;
573 return parents_array_len;
578 inline size_t BLAKE3::compressChunksParallel(
const uint8_t *input,
580 const uint32_t key[8],
581 uint64_t chunk_counter,
585 const uint8_t *chunks_array[MAX_SIMD_DEGREE];
586 size_t input_position = 0;
587 size_t chunks_array_len = 0;
588 while (input_len - input_position >= BLAKE3_CHUNK_LEN)
590 chunks_array[chunks_array_len] = &input[input_position];
591 input_position += BLAKE3_CHUNK_LEN;
592 chunks_array_len += 1;
595 hashMany(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN,
596 key, chunk_counter,
true, flags,
static_cast<uint8_t
>(Flags::CHUNK_START),
597 static_cast<uint8_t
>(Flags::CHUNK_END), out);
601 if (input_len > input_position)
603 uint64_t counter = chunk_counter +
static_cast<uint64_t
>(chunks_array_len);
604 ChunkState chunk_state;
605 chunk_state.init(key, flags);
606 chunk_state.setChunkCounter(counter);
607 chunk_state.update(&input[input_position], input_len - input_position);
608 Output output = chunk_state.stateOutput();
609 output.chainingValue(&out[chunks_array_len * BLAKE3_OUT_LEN]);
610 return chunks_array_len + 1;
613 return chunks_array_len;
635 size_t BLAKE3::compressSubtreeWide(
const uint8_t *input,
size_t input_len,
636 const uint32_t key[8],
637 uint64_t chunk_counter,
638 uint8_t flags, uint8_t *out)
644 if (input_len <= simdDegree() * BLAKE3_CHUNK_LEN)
645 return compressChunksParallel(input, input_len, key, chunk_counter, flags,
653 size_t left_input_len = left_len(input_len);
654 size_t right_input_len = input_len - left_input_len;
655 const uint8_t *right_input = &input[left_input_len];
656 uint64_t right_chunk_counter =
657 chunk_counter +
static_cast<uint64_t
>(left_input_len / BLAKE3_CHUNK_LEN);
662 uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
663 size_t degree = simdDegree();
664 if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1)
672 uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
676 size_t left_n = compressSubtreeWide(input, left_input_len, key, chunk_counter,
678 size_t right_n = compressSubtreeWide(right_input, right_input_len, key,
679 right_chunk_counter, flags, right_cvs);
686 memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
691 size_t num_chaining_values = left_n + right_n;
692 return compressParentsParallel(cv_array, num_chaining_values, key, flags, out);
707 inline void BLAKE3::compressSubtreeToParentNode(
const uint8_t *input,
709 const uint32_t key[8],
710 uint64_t chunk_counter,
712 uint8_t out[2 * BLAKE3_OUT_LEN])
714 uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
715 size_t num_cvs = compressSubtreeWide(input, input_len, key,
716 chunk_counter, flags, cv_array);
717 #if defined(__GNUC__)
718 assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
724 uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
725 #if defined(__GNUC__)
731 while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
733 while (num_cvs > 2) {
736 compressParentsParallel(cv_array, num_cvs, key, flags, out_array);
737 memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
739 memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
754 inline void BLAKE3::mergeCvStack(uint64_t total_len)
756 size_t post_merge_stack_len =
static_cast<size_t>(popcnt(total_len));
757 while (cv_stack_len > post_merge_stack_len)
759 uint8_t *parent_node = &cv_stack[(cv_stack_len - 2) * BLAKE3_OUT_LEN];
760 Output output = Output::parentOutput(parent_node, key, chunk.getFlags());
761 output.chainingValue(parent_node);
800 inline void BLAKE3::pushCv(uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter)
802 mergeCvStack(chunk_counter);
803 memcpy(&cv_stack[cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN);
809 void BLAKE3::finalizeSeek(uint64_t seek, uint8_t *out,
size_t out_len)
819 if (cv_stack_len == 0)
821 Output output = chunk.stateOutput();
822 output.rootBytes(seek, out, out_len);
833 size_t cvs_remaining;
836 cvs_remaining = cv_stack_len;
837 output = chunk.stateOutput();
842 cvs_remaining = cv_stack_len - 2;
843 output = Output::parentOutput(&cv_stack[cvs_remaining * 32], key,
846 while (cvs_remaining > 0)
849 uint8_t parent_block[BLAKE3_BLOCK_LEN];
850 memcpy(parent_block, &cv_stack[cvs_remaining * 32], 32);
851 output.chainingValue(&parent_block[32]);
852 output = Output::parentOutput(parent_block, key, chunk.getFlags());
854 output.rootBytes(seek, out, out_len);
874 memcpy(key, IV, BLAKE3_KEY_LEN);
884 void BLAKE3::update(
const uint8_t* buf,
size_t len)
895 if (chunk.len() > 0) {
896 size_t take = BLAKE3_CHUNK_LEN - chunk.len();
899 chunk.update(buf, take);
906 Output output = chunk.stateOutput();
907 uint8_t chunk_cv[32];
908 output.chainingValue(chunk_cv);
909 pushCv(chunk_cv, chunk.getChunkCounter());
910 chunk.reset(key, chunk.getChunkCounter() + 1);
929 while (len > BLAKE3_CHUNK_LEN)
931 size_t subtree_len = roundDownToPowerOf2(len);
932 uint64_t count_so_far = chunk.getChunkCounter() * BLAKE3_CHUNK_LEN;
948 while (((
static_cast<uint64_t
>(subtree_len - 1)) & count_so_far) != 0)
953 uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
954 if (subtree_len <= BLAKE3_CHUNK_LEN)
956 ChunkState chunk_state;
957 chunk_state.init(key, chunk.getFlags());
958 chunk_state.setChunkCounter(chunk.getChunkCounter());
959 chunk_state.update(buf, subtree_len);
960 Output output = chunk_state.stateOutput();
961 uint8_t cv[BLAKE3_OUT_LEN];
962 output.chainingValue(cv);
963 pushCv(cv, chunk_state.getChunkCounter());
969 uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
970 compressSubtreeToParentNode(buf, subtree_len, key,
971 chunk.getChunkCounter(),
972 chunk.getFlags(), cv_pair);
973 pushCv(cv_pair, chunk.getChunkCounter());
974 pushCv(&cv_pair[BLAKE3_OUT_LEN],
975 chunk.getChunkCounter() + (subtree_chunks / 2));
977 chunk.setChunkCounter(chunk.getChunkCounter() + subtree_chunks);
990 chunk.update(buf, len);
991 mergeCvStack(chunk.getChunkCounter());
1000 uint8_t* BLAKE3::getValue(uint8_t* buffer)
const
1003 blake3.finalizeSeek(0,
static_cast<uint8_t*
>(buffer), getSize());
@ BLAKE3
BLAKE3 cryptographic hash function (in its default output size of 256 bits).