blake3.cpp
/*
 * libxcks
 * Copyright (C) 2022 Julien Couot
 *
 * This program is free software: you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or (at your
 * option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

//---------------------------------------------------------------------------
#include <cstring>
#if defined(__GNUC__)
  #include <cassert>
#endif

#include "blake3.hpp"
//---------------------------------------------------------------------------

using namespace std;
//---------------------------------------------------------------------------


namespace libxcks
{
/* #if defined(__x86_64__) || defined(_M_X64)
#define IS_X86
#define IS_X86_64
#endif

#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif

#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif defined(BLAKE3_USE_NEON)
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif */
#define MAX_SIMD_DEGREE 1


// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)


inline BLAKE3::Output::Output(const uint32_t input_cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter, uint8_t flags)
{
  memcpy(this->input_cv, input_cv, 32);
  memcpy(this->block, block, BLAKE3_BLOCK_LEN);
  this->block_len = block_len;
  this->counter = counter;
  this->flags = flags;
}
//---------------------------------------------------------------------------


// Chaining values within a given chunk (specifically the compress_in_place
// interface) are represented as words. This avoids unnecessary bytes<->words
// conversion overhead in the portable implementation. However, the hash_many
// interface handles both user input and parent node blocks, so it accepts
// bytes. For that reason, chaining values in the CV stack are represented as
// bytes.
inline void BLAKE3::Output::chainingValue(uint8_t cv[32]) const
{
  uint32_t cv_words[8];
  memcpy(cv_words, input_cv, 32);
  compressInPlace(cv_words, block, block_len, counter, flags);
  storeCvWords(cv, cv_words);
}
//---------------------------------------------------------------------------


inline void BLAKE3::Output::rootBytes(uint64_t seek, uint8_t *out,
                                      size_t out_len)
{
  uint64_t output_block_counter = seek / 64;
  size_t offset_within_block = seek % 64;
  uint8_t wide_buf[64];
  while (out_len > 0)
  {
    compressXof(input_cv, block, block_len, output_block_counter,
                flags | static_cast<uint8_t>(Flags::ROOT), wide_buf);
    size_t available_bytes = 64 - offset_within_block;
    size_t memcpy_len;
    if (out_len > available_bytes)
      memcpy_len = available_bytes;
    else
      memcpy_len = out_len;
    memcpy(out, wide_buf + offset_within_block, memcpy_len);
    out += memcpy_len;
    out_len -= memcpy_len;
    output_block_counter += 1;
    offset_within_block = 0;
  }
}
//---------------------------------------------------------------------------
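// Example: with seek = 100, output_block_counter starts at 100 / 64 = 1 and
// offset_within_block at 100 % 64 = 36, so the first compressXof() call
// produces output block 1 and copying starts 36 bytes into it -- exactly
// byte 100 of the root output stream.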


inline BLAKE3::Output BLAKE3::Output::parentOutput(const uint8_t block[BLAKE3_BLOCK_LEN],
                                                   const uint32_t key[8],
                                                   uint8_t flags)
{
  return Output(key, block, BLAKE3_BLOCK_LEN, 0, flags |
                static_cast<uint8_t>(Flags::PARENT));
}
//---------------------------------------------------------------------------


inline size_t BLAKE3::ChunkState::fillBuf(const uint8_t* input,
                                          size_t input_len)
{
  size_t take = BLAKE3_BLOCK_LEN - (static_cast<size_t>(buf_len));
  if (take > input_len)
    take = input_len;
  uint8_t *dest = buf + (static_cast<size_t>(buf_len));
  memcpy(dest, input, take);
  buf_len += static_cast<uint8_t>(take);
  return take;
}
//---------------------------------------------------------------------------


inline uint8_t BLAKE3::ChunkState::maybeStartFlag() const
{
  if (blocks_compressed == 0)
    return static_cast<uint8_t>(Flags::CHUNK_START);
  else
    return 0;
}
//---------------------------------------------------------------------------


inline BLAKE3::Output BLAKE3::ChunkState::stateOutput() const
{
  uint8_t block_flags = flags | maybeStartFlag() |
                        static_cast<uint8_t>(Flags::CHUNK_END);
  return Output(cv, buf, buf_len, chunk_counter, block_flags);
}
//---------------------------------------------------------------------------


/*
 * Resets the chunk to initial value.
 */
inline void BLAKE3::ChunkState::init(const uint32_t key[8], const uint8_t flags)
{
  memcpy(cv, key, BLAKE3_KEY_LEN);
  chunk_counter = 0;
  memset(buf, 0, BLAKE3_BLOCK_LEN);
  buf_len = 0;
  blocks_compressed = 0;
  this->flags = flags;
}
//---------------------------------------------------------------------------


inline void BLAKE3::ChunkState::reset(const uint32_t key[8],
                                      const uint64_t chunk_counter)
{
  memcpy(cv, key, BLAKE3_KEY_LEN);
  this->chunk_counter = chunk_counter;
  blocks_compressed = 0;
  memset(buf, 0, BLAKE3_BLOCK_LEN);
  buf_len = 0;
}
//---------------------------------------------------------------------------


inline size_t BLAKE3::ChunkState::len() const
{
  return (BLAKE3_BLOCK_LEN * static_cast<size_t>(blocks_compressed)) +
         (static_cast<size_t>(buf_len));
}
//---------------------------------------------------------------------------


inline void BLAKE3::ChunkState::update(const uint8_t* input, size_t input_len)
{
  if (buf_len > 0)
  {
    size_t take = fillBuf(input, input_len);
    input += take;
    input_len -= take;
    if (input_len > 0)
    {
      compressInPlace(cv, buf, BLAKE3_BLOCK_LEN, chunk_counter,
                      flags | maybeStartFlag());
      blocks_compressed += 1;
      buf_len = 0;
      memset(buf, 0, BLAKE3_BLOCK_LEN);
    }
  }

  while (input_len > BLAKE3_BLOCK_LEN)
  {
    compressInPlace(cv, input, BLAKE3_BLOCK_LEN, chunk_counter,
                    flags | maybeStartFlag());
    blocks_compressed += 1;
    input += BLAKE3_BLOCK_LEN;
    input_len -= BLAKE3_BLOCK_LEN;
  }

  size_t take = fillBuf(input, input_len);
  input += take;
  input_len -= take;
}
//---------------------------------------------------------------------------


inline unsigned int BLAKE3::popcnt(uint64_t x)
{
  #if defined(__GNUC__) || defined(__clang__)
  return __builtin_popcountll(x);
  #else
  unsigned int count = 0;
  while (x != 0)
  {
    count += 1;
    x &= x - 1;
  }
  return count;
  #endif
}
//---------------------------------------------------------------------------
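// Example: popcnt(0b1101) == 3. The portable fallback clears the lowest set
// bit on each pass (x &= x - 1): 0b1101 -> 0b1100 -> 0b1000 -> 0, three
// iterations.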


inline unsigned int BLAKE3::highestOne(uint64_t x)
{
  #if defined(__GNUC__) || defined(__clang__)
  return 63 ^ __builtin_clzll(x);
  #elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long index;
  _BitScanReverse64(&index, x);
  return index;
  #elif defined(_MSC_VER) && defined(IS_X86_32)
  if (x >> 32)
  {
    unsigned long index;
    _BitScanReverse(&index, static_cast<unsigned long>(x >> 32));
    return 32 + index;
  }
  else
  {
    unsigned long index;
    _BitScanReverse(&index, static_cast<unsigned long>(x));
    return index;
  }
  #else
  unsigned int c = 0;
  if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
  if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
  if (x & 0x000000000000ff00ULL) { x >>= 8;  c += 8; }
  if (x & 0x00000000000000f0ULL) { x >>= 4;  c += 4; }
  if (x & 0x000000000000000cULL) { x >>= 2;  c += 2; }
  if (x & 0x0000000000000002ULL) { c += 1; }
  return c;
  #endif
}
//---------------------------------------------------------------------------
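// Example: highestOne(1) == 0 and highestOne(0x8000000000000000ULL) == 63.
// The GCC/Clang branch uses the identity 63 ^ clz(x) == 63 - clz(x) for
// nonzero x; callers must not pass 0 there, since __builtin_clzll(0) is
// undefined.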


inline uint64_t BLAKE3::roundDownToPowerOf2(uint64_t x)
{
  return UINT64_C(1) << highestOne(x | 1);
}
//---------------------------------------------------------------------------
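// Examples: roundDownToPowerOf2(1) == 1, roundDownToPowerOf2(5) == 4,
// roundDownToPowerOf2(64) == 64. The x | 1 guard maps the otherwise
// undefined x == 0 case to 1.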


inline uint32_t BLAKE3::counterLow(uint64_t counter)
{
  return static_cast<uint32_t>(counter);
}
//---------------------------------------------------------------------------

inline uint32_t BLAKE3::counterHigh(uint64_t counter)
{
  return static_cast<uint32_t>(counter >> 32);
}
//---------------------------------------------------------------------------


inline uint32_t BLAKE3::load32(const void *src)
{
  const uint8_t *p = static_cast<const uint8_t *>(src);
  return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
         ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
}
//---------------------------------------------------------------------------


inline uint32_t BLAKE3::rotr32(const uint32_t w, const uint32_t c)
{
  return (w >> c) | (w << (32 - c));
}
//---------------------------------------------------------------------------


// Given some input larger than one chunk, return the number of bytes that
// should go in the left subtree. This is the largest power-of-2 number of
// chunks that leaves at least 1 byte for the right subtree.
inline size_t BLAKE3::left_len(size_t content_len)
{
  // Subtract 1 to reserve at least one byte for the right side. content_len
  // should always be greater than BLAKE3_CHUNK_LEN.
  size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
  return roundDownToPowerOf2(full_chunks) * BLAKE3_CHUNK_LEN;
}
//---------------------------------------------------------------------------
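// Example (with the standard BLAKE3_CHUNK_LEN of 1024): left_len(2049) ==
// 2048, i.e. two full chunks on the left and one byte on the right, while
// left_len(2048) == 1024, since taking both chunks would leave nothing for
// the right subtree.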


inline void BLAKE3::store32(void *dst, uint32_t w)
{
  uint8_t *p = static_cast<uint8_t *>(dst);
  p[0] = static_cast<uint8_t>(w >> 0);
  p[1] = static_cast<uint8_t>(w >> 8);
  p[2] = static_cast<uint8_t>(w >> 16);
  p[3] = static_cast<uint8_t>(w >> 24);
}
//---------------------------------------------------------------------------


inline void BLAKE3::storeCvWords(uint8_t bytes_out[32], uint32_t cv_words[8])
{
  store32(&bytes_out[0 * 4], cv_words[0]);
  store32(&bytes_out[1 * 4], cv_words[1]);
  store32(&bytes_out[2 * 4], cv_words[2]);
  store32(&bytes_out[3 * 4], cv_words[3]);
  store32(&bytes_out[4 * 4], cv_words[4]);
  store32(&bytes_out[5 * 4], cv_words[5]);
  store32(&bytes_out[6 * 4], cv_words[6]);
  store32(&bytes_out[7 * 4], cv_words[7]);
}
//---------------------------------------------------------------------------


inline void BLAKE3::g(uint32_t *state, const size_t a,
                      const size_t b, const size_t c,
                      const size_t d, const uint32_t x,
                      const uint32_t y)
{
  state[a] = state[a] + state[b] + x;
  state[d] = rotr32(state[d] ^ state[a], 16);
  state[c] = state[c] + state[d];
  state[b] = rotr32(state[b] ^ state[c], 12);
  state[a] = state[a] + state[b] + y;
  state[d] = rotr32(state[d] ^ state[a], 8);
  state[c] = state[c] + state[d];
  state[b] = rotr32(state[b] ^ state[c], 7);
}
//---------------------------------------------------------------------------
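// g() is the BLAKE3 quarter-round: two add-xor-rotate passes that mix the
// message words x and y into state words a, b, c and d, with the rotation
// distances 16, 12, 8 and 7 fixed by the BLAKE3 specification.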


inline void BLAKE3::roundFn(uint32_t state[16], const uint32_t *msg,
                            size_t round)
{
  // Select the message schedule based on the round.
  const uint8_t *schedule = MSG_SCHEDULE[round];

  // Mix the columns.
  g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
  g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
  g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
  g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);

  // Mix the rows.
  g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
  g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
  g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
  g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}
//---------------------------------------------------------------------------


inline void BLAKE3::compressPre(uint32_t state[16],
                                const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len,
                                uint64_t counter, uint8_t flags)
{
  uint32_t block_words[16];
  block_words[0] = load32(block + 4 * 0);
  block_words[1] = load32(block + 4 * 1);
  block_words[2] = load32(block + 4 * 2);
  block_words[3] = load32(block + 4 * 3);
  block_words[4] = load32(block + 4 * 4);
  block_words[5] = load32(block + 4 * 5);
  block_words[6] = load32(block + 4 * 6);
  block_words[7] = load32(block + 4 * 7);
  block_words[8] = load32(block + 4 * 8);
  block_words[9] = load32(block + 4 * 9);
  block_words[10] = load32(block + 4 * 10);
  block_words[11] = load32(block + 4 * 11);
  block_words[12] = load32(block + 4 * 12);
  block_words[13] = load32(block + 4 * 13);
  block_words[14] = load32(block + 4 * 14);
  block_words[15] = load32(block + 4 * 15);

  state[0] = cv[0];
  state[1] = cv[1];
  state[2] = cv[2];
  state[3] = cv[3];
  state[4] = cv[4];
  state[5] = cv[5];
  state[6] = cv[6];
  state[7] = cv[7];
  state[8] = IV[0];
  state[9] = IV[1];
  state[10] = IV[2];
  state[11] = IV[3];
  state[12] = counterLow(counter);
  state[13] = counterHigh(counter);
  state[14] = static_cast<uint32_t>(block_len);
  state[15] = static_cast<uint32_t>(flags);

  roundFn(state, &block_words[0], 0);
  roundFn(state, &block_words[0], 1);
  roundFn(state, &block_words[0], 2);
  roundFn(state, &block_words[0], 3);
  roundFn(state, &block_words[0], 4);
  roundFn(state, &block_words[0], 5);
  roundFn(state, &block_words[0], 6);
}
//---------------------------------------------------------------------------


inline void BLAKE3::compressInPlace(uint32_t cv[8],
                                    const uint8_t block[BLAKE3_BLOCK_LEN],
                                    uint8_t block_len,
                                    uint64_t counter,
                                    uint8_t flags)
{
  uint32_t state[16];
  compressPre(state, cv, block, block_len, counter, flags);
  cv[0] = state[0] ^ state[8];
  cv[1] = state[1] ^ state[9];
  cv[2] = state[2] ^ state[10];
  cv[3] = state[3] ^ state[11];
  cv[4] = state[4] ^ state[12];
  cv[5] = state[5] ^ state[13];
  cv[6] = state[6] ^ state[14];
  cv[7] = state[7] ^ state[15];
}
//---------------------------------------------------------------------------


inline void BLAKE3::compressXof(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[64])
{
  uint32_t state[16];
  compressPre(state, cv, block, block_len, counter, flags);

  store32(&out[0 * 4], state[0] ^ state[8]);
  store32(&out[1 * 4], state[1] ^ state[9]);
  store32(&out[2 * 4], state[2] ^ state[10]);
  store32(&out[3 * 4], state[3] ^ state[11]);
  store32(&out[4 * 4], state[4] ^ state[12]);
  store32(&out[5 * 4], state[5] ^ state[13]);
  store32(&out[6 * 4], state[6] ^ state[14]);
  store32(&out[7 * 4], state[7] ^ state[15]);
  store32(&out[8 * 4], state[8] ^ cv[0]);
  store32(&out[9 * 4], state[9] ^ cv[1]);
  store32(&out[10 * 4], state[10] ^ cv[2]);
  store32(&out[11 * 4], state[11] ^ cv[3]);
  store32(&out[12 * 4], state[12] ^ cv[4]);
  store32(&out[13 * 4], state[13] ^ cv[5]);
  store32(&out[14 * 4], state[14] ^ cv[6]);
  store32(&out[15 * 4], state[15] ^ cv[7]);
}
//---------------------------------------------------------------------------
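// Note the relationship between the two compressions above: compressInPlace()
// keeps only the first half of the feed-forward (the new chaining value),
// while compressXof() also emits the second half, state[8..15] ^ cv[0..7],
// yielding the full 64-byte block used for extended output.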


inline void BLAKE3::hashOne(const uint8_t *input, size_t blocks,
                            const uint32_t key[8], uint64_t counter,
                            uint8_t flags, uint8_t flags_start,
                            uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN])
{
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN);
  uint8_t block_flags = flags | flags_start;
  while (blocks > 0)
  {
    if (blocks == 1)
      block_flags |= flags_end;
    compressInPlace(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
    input = &input[BLAKE3_BLOCK_LEN];
    blocks -= 1;
    block_flags = flags;
  }
  storeCvWords(out, cv);
}
//---------------------------------------------------------------------------


inline void BLAKE3::hashMany(const uint8_t *const *inputs, size_t num_inputs,
                             size_t blocks, const uint32_t key[8],
                             uint64_t counter, bool increment_counter,
                             uint8_t flags, uint8_t flags_start,
                             uint8_t flags_end, uint8_t *out)
{
  while (num_inputs > 0)
  {
    hashOne(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out);
    if (increment_counter)
      counter += 1;
    inputs += 1;
    num_inputs -= 1;
    out = &out[BLAKE3_OUT_LEN];
  }
}
//---------------------------------------------------------------------------


// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
// on a single thread. Write out the parent chaining values and return the
// number of parents hashed. (If there's an odd input chaining value left over,
// return it as an additional output.) These parents are never the root and
// never empty; those cases use a different codepath.
inline size_t BLAKE3::compressParentsParallel(const uint8_t *child_chaining_values,
                                              size_t num_chaining_values,
                                              const uint32_t key[8],
                                              uint8_t flags,
                                              uint8_t *out)
{
  const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2];
  size_t parents_array_len = 0;
  while (num_chaining_values - (2 * parents_array_len) >= 2)
  {
    parents_array[parents_array_len] =
        &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
    parents_array_len += 1;
  }

  hashMany(parents_array, parents_array_len, 1, key,
           0, // Parents always use counter 0.
           false, flags | static_cast<uint8_t>(Flags::PARENT),
           0, // Parents have no start flags.
           0, // Parents have no end flags.
           out);

  // If there's an odd child left over, it becomes an output.
  if (num_chaining_values > 2 * parents_array_len)
  {
    memcpy(&out[parents_array_len * BLAKE3_OUT_LEN],
           &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN],
           BLAKE3_OUT_LEN);
    return parents_array_len + 1;
  }
  else
    return parents_array_len;
}
//---------------------------------------------------------------------------
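// Example: with num_chaining_values == 5, two parents are hashed from child
// pairs (0,1) and (2,3), the odd child 4 is copied through unmerged, and the
// function returns 3.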


inline size_t BLAKE3::compressChunksParallel(const uint8_t *input,
                                             size_t input_len,
                                             const uint32_t key[8],
                                             uint64_t chunk_counter,
                                             uint8_t flags,
                                             uint8_t *out)
{
  const uint8_t *chunks_array[MAX_SIMD_DEGREE];
  size_t input_position = 0;
  size_t chunks_array_len = 0;
  while (input_len - input_position >= BLAKE3_CHUNK_LEN)
  {
    chunks_array[chunks_array_len] = &input[input_position];
    input_position += BLAKE3_CHUNK_LEN;
    chunks_array_len += 1;
  }

  hashMany(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN,
           key, chunk_counter, true, flags, static_cast<uint8_t>(Flags::CHUNK_START),
           static_cast<uint8_t>(Flags::CHUNK_END), out);

  // Hash the remaining partial chunk, if there is one. Note that the empty
  // chunk (meaning the empty message) is a different codepath.
  if (input_len > input_position)
  {
    uint64_t counter = chunk_counter + static_cast<uint64_t>(chunks_array_len);
    ChunkState chunk_state;
    chunk_state.init(key, flags);
    chunk_state.setChunkCounter(counter);
    chunk_state.update(&input[input_position], input_len - input_position);
    Output output = chunk_state.stateOutput();
    output.chainingValue(&out[chunks_array_len * BLAKE3_OUT_LEN]);
    return chunks_array_len + 1;
  }
  else
    return chunks_array_len;
}
//---------------------------------------------------------------------------
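// Example: with input_len == 2 * BLAKE3_CHUNK_LEN + 512, the two complete
// chunks go through hashMany() and the trailing 512 bytes go through a
// temporary ChunkState, so three chaining values are written and the
// function returns 3.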


// The wide helper function writes out an array of chaining values and
// returns the length of that array. The number of chaining values returned
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// if the input is shorter than that many chunks. The reason for maintaining a
// wide array of chaining values going back up the tree is to allow the
// implementation to hash as many parents in parallel as possible.
//
// As a special case when the SIMD degree is 1, this function will still return
// at least 2 outputs. This guarantees that this function doesn't perform the
// root compression. (If it did, it would use the wrong flags, and also we
// wouldn't be able to implement extendable output.) Note that this function is
// not used when the whole input is only 1 chunk long; that's a different
// codepath.
//
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multi-threading parallelism for that update().
size_t BLAKE3::compressSubtreeWide(const uint8_t *input, size_t input_len,
                                   const uint32_t key[8],
                                   uint64_t chunk_counter,
                                   uint8_t flags, uint8_t *out)
{
  // Note that the single chunk case does *not* bump the SIMD degree up to 2
  // when it is 1. If this implementation adds multi-threading in the future,
  // this gives us the option of multi-threading even the 2-chunk case, which
  // can help performance on smaller platforms.
  if (input_len <= simdDegree() * BLAKE3_CHUNK_LEN)
    return compressChunksParallel(input, input_len, key, chunk_counter, flags,
                                  out);

  // With more than simd_degree chunks, we need to recurse. Start by dividing
  // the input into left and right subtrees. (Note that this is only optimal
  // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
  // of 3 or something, we'll need a more complicated strategy.)
  size_t left_input_len = left_len(input_len);
  size_t right_input_len = input_len - left_input_len;
  const uint8_t *right_input = &input[left_input_len];
  uint64_t right_chunk_counter =
      chunk_counter + static_cast<uint64_t>(left_input_len / BLAKE3_CHUNK_LEN);

  // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
  // account for the special case of returning 2 outputs when the SIMD degree
  // is 1.
  uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
  size_t degree = simdDegree();
  if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1)
  {
    // The special case: We always use a degree of at least two, to make
    // sure there are two outputs. Except, as noted above, at the chunk
    // level, where we allow degree=1. (Note that the 1-chunk-input case is
    // a different codepath.)
    degree = 2;
  }
  uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];

  // Recurse! If this implementation adds multi-threading support in the
  // future, this is where it will go.
  size_t left_n = compressSubtreeWide(input, left_input_len, key, chunk_counter,
                                      flags, cv_array);
  size_t right_n = compressSubtreeWide(right_input, right_input_len, key,
                                       right_chunk_counter, flags, right_cvs);

  // The special case again. If simd_degree=1, then we'll have left_n=1 and
  // right_n=1. Rather than compressing them into a single output, return
  // them directly, to make sure we always have at least two outputs.
  if (left_n == 1)
  {
    memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
    return 2;
  }

  // Otherwise, do one layer of parent node compression.
  size_t num_chaining_values = left_n + right_n;
  return compressParentsParallel(cv_array, num_chaining_values, key, flags, out);
}
//---------------------------------------------------------------------------


// Hash a subtree with compressSubtreeWide(), and then condense the resulting
// list of chaining values down to a single parent node. Don't compress that
// last parent node, however. Instead, return its message bytes (the
// concatenated chaining values of its children). This is necessary when the
// first call to update() supplies a complete subtree, because the topmost
// parent node of that subtree could end up being the root. It's also necessary
// for extended output in the general case.
//
// As with compressSubtreeWide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
inline void BLAKE3::compressSubtreeToParentNode(const uint8_t *input,
                                                size_t input_len,
                                                const uint32_t key[8],
                                                uint64_t chunk_counter,
                                                uint8_t flags,
                                                uint8_t out[2 * BLAKE3_OUT_LEN])
{
  uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
  size_t num_cvs = compressSubtreeWide(input, input_len, key,
                                       chunk_counter, flags, cv_array);
  #if defined(__GNUC__)
  assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
  #endif

  // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
  // compressSubtreeWide() returns more than 2 chaining values. Condense
  // them into 2 by forming parent nodes repeatedly.
  uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
  #if defined(__GNUC__)
  // The second half of this loop condition is always true, and we just
  // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
  // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
  // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
  // this code, test it against that version.
  while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
  #else
  while (num_cvs > 2) {
  #endif
    num_cvs =
        compressParentsParallel(cv_array, num_cvs, key, flags, out_array);
    memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
  }
  memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
}
//---------------------------------------------------------------------------


// As described in pushCv() below, we do "lazy merging", delaying
// merges until right before the next CV is about to be added. This is
// different from the reference implementation. Another difference is that we
// aren't always merging 1 chunk at a time. Instead, each CV might represent
// any power-of-two number of chunks, as long as the smaller-above-larger stack
// order is maintained. Instead of the "count the trailing 0-bits" algorithm
// described in the spec, we use a "count the total number of 1-bits" variant
// that doesn't require us to retain the subtree size of the CV on top of the
// stack. The principle is the same: each CV that should remain in the stack is
// represented by a 1-bit in the total number of chunks (or bytes) so far.
inline void BLAKE3::mergeCvStack(uint64_t total_len)
{
  size_t post_merge_stack_len = static_cast<size_t>(popcnt(total_len));
  while (cv_stack_len > post_merge_stack_len)
  {
    uint8_t *parent_node = &cv_stack[(cv_stack_len - 2) * BLAKE3_OUT_LEN];
    Output output = Output::parentOutput(parent_node, key, chunk.getFlags());
    output.chainingValue(parent_node);
    cv_stack_len -= 1;
  }
}
//---------------------------------------------------------------------------
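// Example: after 5 chunks the chunk count is 0b101, whose popcount is 2, so
// mergeCvStack() leaves exactly two CVs on the stack: one for the completed
// 4-chunk subtree and one for the fifth chunk.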


// In reference_impl.rs, we merge the new CV with existing CVs from the stack
// before pushing it. We can do that because we know more input is coming, so
// we know none of the merges are root.
//
// This setting is different. We want to feed as much input as possible to
// compressSubtreeWide(), without setting aside anything for the chunk state.
// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once
// as a single subtree, if at all possible.
//
// This leads to two problems:
// 1) This 64 KiB input might be the only call that ever gets made to update().
//    In this case, the root node of the 64 KiB subtree would be the root node
//    of the whole tree, and it would need to be ROOT finalized. We can't
//    compress it until we know.
// 2) This 64 KiB input might complete a larger tree, whose root node is
//    similarly going to be the root of the whole tree. For example, maybe
//    we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
//    node at the root of the 256 KiB subtree until we know how to finalize it.
//
// The second problem is solved with "lazy merging". That is, when we're about
// to add a CV to the stack, we don't merge it with anything first, as the
// reference impl does. Instead we do merges using the *previous* CV that was
// added, which is sitting on top of the stack, and we put the new CV
// (unmerged) on top of the stack afterwards. This guarantees that we never
// merge the root node until finalization.
//
// Solving the first problem requires an additional tool,
// compressSubtreeToParentNode(). That function always returns the top
// *two* chaining values of the subtree it's compressing. We then do lazy
// merging with each of them separately, so that the second CV will always
// remain unmerged. (That also helps us support extendable output when we're
// hashing an input all-at-once.)
inline void BLAKE3::pushCv(uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter)
{
  mergeCvStack(chunk_counter);
  memcpy(&cv_stack[cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN);
  cv_stack_len += 1;
}
//---------------------------------------------------------------------------


void BLAKE3::finalizeSeek(uint64_t seek, uint8_t *out, size_t out_len)
{
  // Explicitly checking for zero avoids causing UB by passing a null pointer
  // to memcpy. This comes up in practice with things like:
  //   std::vector<uint8_t> v;
  //   hasher.finalizeSeek(0, v.data(), v.size());
  if (out_len == 0)
    return;

  // If the subtree stack is empty, then the current chunk is the root.
  if (cv_stack_len == 0)
  {
    Output output = chunk.stateOutput();
    output.rootBytes(seek, out, out_len);
    return;
  }
  // If there are any bytes in the chunk state, finalize that chunk and do a
  // roll-up merge between that chunk hash and every subtree in the stack. In
  // this case, the extra merge loop at the end of update() guarantees that
  // none of the subtrees in the stack need to be merged with each other
  // first. Otherwise, if there are no bytes in the chunk state, then the top
  // of the stack is a chunk hash, and we start the merge from that.
  Output output;
  size_t cvs_remaining;
  if (chunk.len() > 0)
  {
    cvs_remaining = cv_stack_len;
    output = chunk.stateOutput();
  }
  else
  {
    // There are always at least 2 CVs in the stack in this case.
    cvs_remaining = cv_stack_len - 2;
    output = Output::parentOutput(&cv_stack[cvs_remaining * 32], key,
                                  chunk.getFlags());
  }
  while (cvs_remaining > 0)
  {
    cvs_remaining -= 1;
    uint8_t parent_block[BLAKE3_BLOCK_LEN];
    memcpy(parent_block, &cv_stack[cvs_remaining * 32], 32);
    output.chainingValue(&parent_block[32]);
    output = Output::parentOutput(parent_block, key, chunk.getFlags());
  }
  output.rootBytes(seek, out, out_len);
}
//---------------------------------------------------------------------------
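// Extended-output sketch (illustrative, assuming finalizeSeek() is reachable
// from caller code): because the seek parameter selects a byte offset in the
// root output stream, arbitrarily long output can be read in independent
// pieces:
//
//   uint8_t part1[32], part2[32];
//   hasher.finalizeSeek(0,  part1, 32);  // bytes 0..31 of the output stream
//   hasher.finalizeSeek(32, part2, 32);  // bytes 32..63 of the same stream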


/*
 * Default constructor.
 */
BLAKE3::BLAKE3()
{
  reset();
}
//---------------------------------------------------------------------------


/*
 * Resets the BLAKE3 hash to initial value.
 */
void BLAKE3::reset()
{
  memcpy(key, IV, BLAKE3_KEY_LEN);
  chunk.init(key, 0);
  cv_stack_len = 0;
}
//---------------------------------------------------------------------------


/*
 * Updates the BLAKE3 hash with specified array of bytes.
 */
void BLAKE3::update(const uint8_t* buf, size_t len)
{
  // Explicitly checking for zero avoids causing UB by passing a null pointer
  // to memcpy. This comes up in practice with things like:
  //   std::vector<uint8_t> v;
  //   hasher.update(v.data(), v.size());
  if (len == 0)
    return;

  // If we have some partial chunk bytes in the internal chunk state, we need
  // to finish that chunk first.
  if (chunk.len() > 0)
  {
    size_t take = BLAKE3_CHUNK_LEN - chunk.len();
    if (take > len)
      take = len;
    chunk.update(buf, take);
    buf += take;
    len -= take;
    // If we've filled the current chunk and there's more coming, finalize this
    // chunk and proceed. In this case we know it's not the root.
    if (len > 0)
    {
      Output output = chunk.stateOutput();
      uint8_t chunk_cv[32];
      output.chainingValue(chunk_cv);
      pushCv(chunk_cv, chunk.getChunkCounter());
      chunk.reset(key, chunk.getChunkCounter() + 1);
    }
    else
      return;
  }

  // Now the chunk state is clear, and we have more input. If there's more than
  // a single chunk (so, definitely not the root chunk), hash the largest whole
  // subtree we can, with the full benefits of SIMD (and maybe in the future,
  // multi-threading) parallelism. Two restrictions:
  // - The subtree has to be a power-of-2 number of chunks. Only subtrees along
  //   the right edge can be incomplete, and we don't know where the right edge
  //   is going to be until we get to finalization.
  // - The subtree must evenly divide the total number of chunks up until this
  //   point (if the total is not 0). If the current incomplete subtree is only
  //   waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
  //   to complete the current subtree first.
  // Because we might need to break up the input to form powers of 2, or to
  // evenly divide what we already have, this part runs in a loop.
  while (len > BLAKE3_CHUNK_LEN)
  {
    size_t subtree_len = roundDownToPowerOf2(len);
    uint64_t count_so_far = chunk.getChunkCounter() * BLAKE3_CHUNK_LEN;
    // Shrink the subtree_len until it evenly divides the count so far. We know
    // that subtree_len itself is a power of 2, so we can use a bitmasking
    // trick instead of an actual remainder operation. (Note that if the caller
    // consistently passes power-of-2 inputs of the same size, as is hopefully
    // typical, this loop condition will always fail, and subtree_len will
    // always be the full length of the input.)
    //
    // An aside: We don't have to shrink subtree_len quite this much. For
    // example, if count_so_far is 1, we could pass 2 chunks to
    // compressSubtreeToParentNode(). Since we'll get 2 CVs back, we'll still
    // get the right answer in the end, and we might get to use 2-way SIMD
    // parallelism. The problem with this optimization is that it gets us
    // stuck always hashing 2 chunks. The total number of chunks will remain
    // odd, and we'll never graduate to higher degrees of parallelism. See
    // https://github.com/BLAKE3-team/BLAKE3/issues/69.
    while (((static_cast<uint64_t>(subtree_len - 1)) & count_so_far) != 0)
      subtree_len /= 2;

    // The shrunken subtree_len might now be 1 chunk long. If so, hash that one
    // chunk by itself. Otherwise, compress the subtree into a pair of CVs.
    uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
    if (subtree_len <= BLAKE3_CHUNK_LEN)
    {
      ChunkState chunk_state;
      chunk_state.init(key, chunk.getFlags());
      chunk_state.setChunkCounter(chunk.getChunkCounter());
      chunk_state.update(buf, subtree_len);
      Output output = chunk_state.stateOutput();
      uint8_t cv[BLAKE3_OUT_LEN];
      output.chainingValue(cv);
      pushCv(cv, chunk_state.getChunkCounter());
    }
    else
    {
      // This is the high-performance happy path, though getting here depends
      // on the caller giving us a long enough input.
      uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
      compressSubtreeToParentNode(buf, subtree_len, key,
                                  chunk.getChunkCounter(),
                                  chunk.getFlags(), cv_pair);
      pushCv(cv_pair, chunk.getChunkCounter());
      pushCv(&cv_pair[BLAKE3_OUT_LEN],
             chunk.getChunkCounter() + (subtree_chunks / 2));
    }
    chunk.setChunkCounter(chunk.getChunkCounter() + subtree_chunks);
    buf += subtree_len;
    len -= subtree_len;
  }

  // If there's any remaining input less than a full chunk, add it to the chunk
  // state. In that case, also do a final merge loop to make sure the subtree
  // stack doesn't contain any unmerged pairs. The remaining input means we
  // know these merges are non-root. This merge loop isn't strictly necessary
  // here, because pushCv() already does its own merge loop, but it
  // simplifies finalizeSeek() below.
  if (len > 0)
  {
    chunk.update(buf, len);
    mergeCvStack(chunk.getChunkCounter());
  }
}
//---------------------------------------------------------------------------


/*
 * Returns the BLAKE3 hash value in the first 32 bytes of the given address.
 */
uint8_t* BLAKE3::getValue(uint8_t* buffer) const
{
  BLAKE3 blake3(*this);
  blake3.finalizeSeek(0, static_cast<uint8_t*>(buffer), getSize());

  return buffer;
}
//---------------------------------------------------------------------------
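// A minimal usage sketch (illustrative; assumes the reset()/update()/
// getValue() interface declared in blake3.hpp):
//
//   libxcks::BLAKE3 hasher;
//   hasher.update(reinterpret_cast<const uint8_t*>("abc"), 3);
//   uint8_t digest[32];
//   hasher.getValue(digest);  // 32-byte (256-bit) BLAKE3 hash of "abc"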
} // namespace libxcks
//---------------------------------------------------------------------------