diff --git a/ConflictSet.cpp b/ConflictSet.cpp index 217da0e..8e9ee12 100644 --- a/ConflictSet.cpp +++ b/ConflictSet.cpp @@ -176,7 +176,7 @@ int BitSet::firstSetGeq(int i) const { enum class Type : int8_t { Node0, - Node4, + Node3, Node16, Node48, Node256, @@ -215,19 +215,19 @@ struct Node0 : Node { uint8_t *partialKey() { return (uint8_t *)(this + 1); } }; -struct Node4 : Node { - // Sorted - uint8_t index[16]; // 16 so that we can use the same simd index search - // implementation as Node16 - Child children[4]; - Node4() { this->type = Type::Node4; } +struct Node3 : Node { + constexpr static auto kMaxNodes = 3; + uint8_t index[kMaxNodes]; + Child children[kMaxNodes]; + Node3() { this->type = Type::Node3; } uint8_t *partialKey() { return (uint8_t *)(this + 1); } }; struct Node16 : Node { // Sorted - uint8_t index[16]; - Child children[16]; + constexpr static auto kMaxNodes = 16; + uint8_t index[kMaxNodes]; + Child children[kMaxNodes]; Node16() { this->type = Type::Node16; } uint8_t *partialKey() { return (uint8_t *)(this + 1); } }; @@ -262,22 +262,33 @@ std::string getSearchPathPrintable(Node *n); // Bound memory usage following the analysis in the ART paper -constexpr int kBytesPerKey = 176; -constexpr int kMinSavingsPerNode = 120; -constexpr int kMinChildrenNode4 = 2; -constexpr int kMinChildrenNode16 = 5; +// Each node with an entry present gets a budget of kBytesPerKey. Node0 always +// has an entry present. 
+constexpr int kBytesPerKey = 144; +// Induction hypothesis is that each node's surplus is >= kMinNodeSurplus +constexpr int kMinNodeSurplus = 104; +constexpr int kMinChildrenNode3 = 2; +constexpr int kMinChildrenNode16 = 4; constexpr int kMinChildrenNode48 = 17; constexpr int kMinChildrenNode256 = 49; -static_assert(kMinChildrenNode256 * kMinSavingsPerNode - sizeof(Node256) >= - kMinSavingsPerNode); -static_assert(kMinChildrenNode48 * kMinSavingsPerNode - sizeof(Node48) >= - kMinSavingsPerNode); -static_assert(kMinChildrenNode16 * kMinSavingsPerNode - sizeof(Node16) >= - kMinSavingsPerNode); -static_assert(kMinChildrenNode4 * kMinSavingsPerNode - sizeof(Node4) >= - kMinSavingsPerNode); -static_assert(kBytesPerKey - sizeof(Node0) >= kMinSavingsPerNode); +constexpr int kNode256Surplus = + kMinChildrenNode256 * kMinNodeSurplus - sizeof(Node256); +static_assert(kNode256Surplus >= kMinNodeSurplus); + +constexpr int kNode48Surplus = + kMinChildrenNode48 * kMinNodeSurplus - sizeof(Node48); +static_assert(kNode48Surplus >= kMinNodeSurplus); + +constexpr int kNode16Surplus = + kMinChildrenNode16 * kMinNodeSurplus - sizeof(Node16); +static_assert(kNode16Surplus >= kMinNodeSurplus); + +constexpr int kNode3Surplus = + kMinChildrenNode3 * kMinNodeSurplus - sizeof(Node3); +static_assert(kNode3Surplus >= kMinNodeSurplus); + +static_assert(kBytesPerKey - sizeof(Node0) >= kMinNodeSurplus); // setOldestVersion will additionally try to maintain this property: // `max(children, 1) * length >= capacity` @@ -356,8 +367,8 @@ uint8_t *Node::partialKey() { switch (type) { case Type::Node0: return ((Node0 *)this)->partialKey(); - case Type::Node4: - return ((Node4 *)this)->partialKey(); + case Type::Node3: + return ((Node3 *)this)->partialKey(); case Type::Node16: return ((Node16 *)this)->partialKey(); case Type::Node48: @@ -370,13 +381,14 @@ struct NodeAllocators { BoundedFreeListAllocator<Node0> node0; - BoundedFreeListAllocator<Node4> node4; + BoundedFreeListAllocator<Node3>
node3; BoundedFreeListAllocator<Node16> node16; BoundedFreeListAllocator<Node48> node48; BoundedFreeListAllocator<Node256> node256; }; -int getNodeIndex(Node16 *self, uint8_t index) { +template <class NodeT> int getNodeIndex(NodeT *self, uint8_t index) { + static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>); #ifdef HAS_AVX // Based on https://www.the-paper-trail.org/post/art-paper-notes/ @@ -388,7 +400,7 @@ int getNodeIndex(Node16 *self, uint8_t index) { // keys aren't valid, we'll mask the results to only consider the valid ones // below. __m128i indices; - memcpy(&indices, self->index, sizeof(self->index)); + memcpy(&indices, self->index, NodeT::kMaxNodes); __m128i results = _mm_cmpeq_epi8(key_vec, indices); // Build a mask to select only the first node->num_children values from the @@ -411,10 +423,12 @@ int getNodeIndex(Node16 *self, uint8_t index) { // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon uint8x16_t indices; - memcpy(&indices, self->index, sizeof(self->index)); + memcpy(&indices, self->index, NodeT::kMaxNodes); // 0xff for each match uint16x8_t results = vreinterpretq_u16_u8(vceqq_u8(vdupq_n_u8(index), indices)); + static_assert(NodeT::kMaxNodes <= 16); + assume(self->numChildren <= NodeT::kMaxNodes); uint64_t mask = self->numChildren == 16 ?
uint64_t(-1) : (uint64_t(1) << (self->numChildren * 4)) - 1; @@ -439,8 +453,10 @@ Node *&getChildExists(Node *self, uint8_t index) { switch (self->type) { case Type::Node0: __builtin_unreachable(); // GCOVR_EXCL_LINE - case Type::Node4: - [[fallthrough]]; + case Type::Node3: { + auto *self3 = static_cast<Node3 *>(self); + return self3->children[getNodeIndex(self3, index)].child; + } case Type::Node16: { auto *self16 = static_cast<Node16 *>(self); return self16->children[getNodeIndex(self16, index)].child; @@ -467,8 +483,11 @@ Node *getChild(Node *self, uint8_t index) { switch (self->type) { case Type::Node0: return nullptr; - case Type::Node4: - [[fallthrough]]; + case Type::Node3: { + auto *self3 = static_cast<Node3 *>(self); + int i = getNodeIndex(self3, index); + return i < 0 ? nullptr : self3->children[i].child; + } case Type::Node16: { auto *self16 = static_cast<Node16 *>(self); int i = getNodeIndex(self16, index); @@ -486,6 +505,64 @@ Node *getChild(Node *self, uint8_t index) { } } +template <class NodeT> int getChildGeqSimd(NodeT *self, int child) { + static_assert(std::is_same_v<NodeT, Node3> || std::is_same_v<NodeT, Node16>); +#ifdef HAS_AVX + __m128i key_vec = _mm_set1_epi8(child); + __m128i indices; + memcpy(&indices, self->index, NodeT::kMaxNodes); + __m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices)); + int mask = (1 << self->numChildren) - 1; + uint32_t bitfield = _mm_movemask_epi8(results) & mask; + int result = bitfield == 0 ? -1 : self->index[std::countr_zero(bitfield)]; + assert(result == [&]() -> int { + for (int i = 0; i < self->numChildren; ++i) { + if (self->index[i] >= child) { + return self->index[i]; + } + } + return -1; + }()); + return result; +#elif defined(HAS_ARM_NEON) + uint8x16_t indices; + memcpy(&indices, self->index, sizeof(self->index)); + // 0xff for each leq + auto results = vcleq_u8(vdupq_n_u8(child), indices); + static_assert(NodeT::kMaxNodes <= 16); + assume(self->numChildren <= NodeT::kMaxNodes); + uint64_t mask = self->numChildren == 16 + ?
uint64_t(-1) + : (uint64_t(1) << (self->numChildren * 4)) - 1; + // 0xf for each 0xff (within mask) + uint64_t bitfield = + vget_lane_u64( + vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(results), 4)), + 0) & + mask; + int simd = bitfield == 0 ? -1 : self->index[std::countr_zero(bitfield) / 4]; + assert(simd == [&]() -> int { + for (int i = 0; i < self->numChildren; ++i) { + if (self->index[i] >= child) { + return self->index[i]; + } + } + return -1; + }()); + return simd; +#else + for (int i = 0; i < self->numChildren; ++i) { + if (i > 0) { + assert(self->index[i - 1] < self->index[i]); + } + if (self->index[i] >= child) { + return self->index[i]; + } + } + return -1; +#endif +} + int getChildGeq(Node *self, int child) { if (child > 255) { return -1; } switch (self->type) { case Type::Node0: return -1; - case Type::Node4: - [[fallthrough]]; - case Type::Node16: { - auto *self16 = static_cast<Node16 *>(self); -#ifdef HAS_AVX - __m128i key_vec = _mm_set1_epi8(child); - __m128i indices; - memcpy(&indices, self16->index, sizeof(self16->index)); - __m128i results = _mm_cmpeq_epi8(key_vec, _mm_min_epu8(key_vec, indices)); - int mask = (1 << self16->numChildren) - 1; - uint32_t bitfield = _mm_movemask_epi8(results) & mask; - int result = bitfield == 0 ? -1 : self16->index[std::countr_zero(bitfield)]; - assert(result == [&]() -> int { - for (int i = 0; i < self16->numChildren; ++i) { - if (self16->index[i] >= child) { - return self16->index[i]; - } - } - return -1; - }()); - return result; -#elif defined(HAS_ARM_NEON) - uint8x16_t indices; - memcpy(&indices, self16->index, sizeof(self16->index)); - // 0xff for each leq - auto results = vcleq_u8(vdupq_n_u8(child), indices); - uint64_t mask = self->numChildren == 16 - ?
uint64_t(-1) - : (uint64_t(1) << (self->numChildren * 4)) - 1; - // 0xf for each 0xff (within mask) - uint64_t bitfield = - vget_lane_u64( - vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(results), 4)), - 0) & - mask; - int simd = - bitfield == 0 ? -1 : self16->index[std::countr_zero(bitfield) / 4]; - assert(simd == [&]() -> int { - for (int i = 0; i < self->numChildren; ++i) { - if (self16->index[i] >= child) { - return self16->index[i]; - } - } - return -1; - }()); - return simd; -#else - for (int i = 0; i < self->numChildren; ++i) { - if (i > 0) { - assert(self16->index[i - 1] < self16->index[i]); - } - if (self16->index[i] >= child) { - return self16->index[i]; - } - } - return -1; -#endif - } + case Type::Node3: + return getChildGeqSimd(static_cast<Node3 *>(self), child); + case Type::Node16: + return getChildGeqSimd(static_cast<Node16 *>(self), child); case Type::Node48: [[fallthrough]]; case Type::Node256: { @@ -561,7 +584,7 @@ } } -void setChildrenParents(Node4 *n) { +void setChildrenParents(Node3 *n) { for (int i = 0; i < n->numChildren; ++i) { n->children[i].child->parent = n; } @@ -592,8 +615,13 @@ Node *&getOrCreateChild(Node *&self, uint8_t index, switch (self->type) { case Type::Node0: break; - case Type::Node4: - [[fallthrough]]; + case Type::Node3: { + auto *self3 = static_cast<Node3 *>(self); + int i = getNodeIndex(self3, index); + if (i >= 0) { + return self3->children[i].child; + } + } break; case Type::Node16: { auto *self16 = static_cast<Node16 *>(self); int i = getNodeIndex(self16, index); @@ -620,43 +648,59 @@ case Type::Node0: { auto *self0 = static_cast<Node0 *>(self); - auto *newSelf = allocators->node4.allocate(self->partialKeyLen); + auto *newSelf = allocators->node3.allocate(self->partialKeyLen); memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin, kNodeCopySize); memcpy(newSelf->partialKey(), self0->partialKey(), self->partialKeyLen);
allocators->node0.release(self0); self = newSelf; - goto insert16; + goto insert3; } - case Type::Node4: { - auto *self4 = static_cast<Node4 *>(self); - - if (self->numChildren == 4) { + case Type::Node3: { + if (self->numChildren == Node3::kMaxNodes) { + auto *self3 = static_cast<Node3 *>(self); auto *newSelf = allocators->node16.allocate(self->partialKeyLen); memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin, kNodeCopySize); - memcpy(newSelf->partialKey(), self4->partialKey(), self->partialKeyLen); + memcpy(newSelf->partialKey(), self3->partialKey(), self->partialKeyLen); // TODO replace with memcpy? - for (int i = 0; i < 4; ++i) { - newSelf->index[i] = self4->index[i]; - newSelf->children[i] = self4->children[i]; + for (int i = 0; i < Node3::kMaxNodes; ++i) { + newSelf->index[i] = self3->index[i]; + newSelf->children[i] = self3->children[i]; } - allocators->node4.release(self4); + allocators->node3.release(self3); setChildrenParents(newSelf); self = newSelf; + goto insert16; } - goto insert16; + insert3: + auto *self3 = static_cast<Node3 *>(self); + ++self->numChildren; + int i = 0; + for (; i < self->numChildren - 1; ++i) { + if (int(self3->index[i]) > int(index)) { + memmove(self3->index + i + 1, self3->index + i, + self->numChildren - (i + 1)); + memmove(self3->children + i + 1, self3->children + i, + (self->numChildren - (i + 1)) * sizeof(Child)); + break; + } + } + self3->index[i] = index; + auto &result = self3->children[i].child; + result = nullptr; + return result; } case Type::Node16: { - if (self->numChildren == 16) { + if (self->numChildren == Node16::kMaxNodes) { auto *self16 = static_cast<Node16 *>(self); auto *newSelf = allocators->node48.allocate(self->partialKeyLen); memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin, kNodeCopySize); memcpy(newSelf->partialKey(), self16->partialKey(), self->partialKeyLen); - newSelf->nextFree = 16; + newSelf->nextFree = Node16::kMaxNodes; int i = 0; for (auto x : self16->index) { newSelf->bitSet.set(x); @@
-664,7 +708,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index, newSelf->index[x] = i; ++i; } - assert(i == 16); + assert(i == Node16::kMaxNodes); allocators->node16.release(self16); setChildrenParents(newSelf); self = newSelf; @@ -673,6 +717,7 @@ Node *&getOrCreateChild(Node *&self, uint8_t index, insert16: auto *self16 = static_cast<Node16 *>(self); + assert(self->type == Type::Node16); ++self->numChildren; int i = 0; @@ -773,23 +818,23 @@ void makeCapacityAtLeast(Node *&self, int capacity, NodeAllocators *allocators, } self = newSelf; } break; - case Type::Node4: { - auto *self4 = (Node4 *)self; - auto *newSelf = allocators->node4.allocate(capacity); + case Type::Node3: { + auto *self3 = (Node3 *)self; + auto *newSelf = allocators->node3.allocate(capacity); memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin, kNodeCopySize); - memcpy(newSelf->partialKey(), self4->partialKey(), self->partialKeyLen); + memcpy(newSelf->partialKey(), self3->partialKey(), self->partialKeyLen); // TODO replace with memcpy? - for (int i = 0; i < 4; ++i) { - newSelf->index[i] = self4->index[i]; - newSelf->children[i] = self4->children[i]; + for (int i = 0; i < Node3::kMaxNodes; ++i) { + newSelf->index[i] = self3->index[i]; + newSelf->children[i] = self3->children[i]; } getInTree(self, impl) = newSelf; setChildrenParents(newSelf); if constexpr (kUseFreeList) { - allocators->node4.release(self4); + allocators->node3.release(self3); } else { - free(self4); + free(self3); } self = newSelf; } break; @@ -800,7 +845,7 @@ void makeCapacityAtLeast(Node *&self, int capacity, NodeAllocators *allocators, kNodeCopySize); memcpy(newSelf->partialKey(), self16->partialKey(), self->partialKeyLen); // TODO replace with memcpy?
- for (int i = 0; i < 16; ++i) { + for (int i = 0; i < Node16::kMaxNodes; ++i) { newSelf->index[i] = self16->index[i]; newSelf->children[i] = self16->children[i]; } @@ -883,20 +928,20 @@ void maybeDownsize(Node *self, NodeAllocators *allocators, switch (self->type) { case Type::Node0: __builtin_unreachable(); // GCOVR_EXCL_LINE - case Type::Node4: { - auto *self4 = (Node4 *)self; + case Type::Node3: { + auto *self3 = (Node3 *)self; if (self->numChildren == 0) { auto *newSelf = allocators->node0.allocate(self->partialKeyLen); memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin, kNodeCopySize); - memcpy(newSelf->partialKey(), self4->partialKey(), self->partialKeyLen); + memcpy(newSelf->partialKey(), self3->partialKey(), self->partialKeyLen); getInTree(self, impl) = newSelf; - allocators->node4.release(self4); + allocators->node3.release(self3); } else if (self->numChildren == 1) { if (!self->entryPresent) { - auto *child = self4->children[0].child; - int minCapacity = self4->partialKeyLen + 1 + child->partialKeyLen; + auto *child = self3->children[0].child; + int minCapacity = self3->partialKeyLen + 1 + child->partialKeyLen; if (minCapacity > child->partialKeyCapacity) { const bool update = child == dontInvalidate; @@ -917,11 +962,11 @@ void maybeDownsize(Node *self, NodeAllocators *allocators, int64_t childMaxVersion = maxVersion(child, impl); // Construct new partial key for child - memmove(child->partialKey() + self4->partialKeyLen + 1, + memmove(child->partialKey() + self3->partialKeyLen + 1, child->partialKey(), child->partialKeyLen); - memcpy(child->partialKey(), self4->partialKey(), self->partialKeyLen); - child->partialKey()[self4->partialKeyLen] = self4->index[0]; - child->partialKeyLen += 1 + self4->partialKeyLen; + memcpy(child->partialKey(), self3->partialKey(), self->partialKeyLen); + child->partialKey()[self3->partialKeyLen] = self3->index[0]; + child->partialKeyLen += 1 + self3->partialKeyLen; child->parent = self->parent; 
child->parentsIndex = self->parentsIndex; @@ -931,19 +976,19 @@ void maybeDownsize(Node *self, NodeAllocators *allocators, maxVersion(child, impl) = childMaxVersion; getInTree(self, impl) = child; - allocators->node4.release(self4); + allocators->node3.release(self3); } } } break; case Type::Node16: if (self->numChildren + int(self->entryPresent) < kMinChildrenNode16) { auto *self16 = (Node16 *)self; - auto *newSelf = allocators->node4.allocate(self->partialKeyLen); + auto *newSelf = allocators->node3.allocate(self->partialKeyLen); memcpy((char *)newSelf + kNodeCopyBegin, (char *)self + kNodeCopyBegin, kNodeCopySize); memcpy(newSelf->partialKey(), self16->partialKey(), self->partialKeyLen); // TODO replace with memcpy? - for (int i = 0; i < 4; ++i) { + for (int i = 0; i < Node3::kMaxNodes; ++i) { newSelf->index[i] = self16->index[i]; newSelf->children[i] = self16->children[i]; } @@ -965,7 +1010,7 @@ void maybeDownsize(Node *self, NodeAllocators *allocators, [&](int c) { // Suppress a false positive -Waggressive-loop-optimizations warning // in gcc. `assume` doesn't work for some reason. 
- if (!(i < 16)) { + if (!(i < Node16::kMaxNodes)) { __builtin_unreachable(); // GCOVR_EXCL_LINE } newSelf->index[i] = c; @@ -1034,8 +1079,8 @@ Node *erase(Node *self, NodeAllocators *allocators, ConflictSet::Impl *impl, case Type::Node0: allocators->node0.release((Node0 *)self); break; - case Type::Node4: - allocators->node4.release((Node4 *)self); + case Type::Node3: + allocators->node3.release((Node3 *)self); break; case Type::Node16: allocators->node16.release((Node16 *)self); @@ -1051,8 +1096,17 @@ Node *erase(Node *self, NodeAllocators *allocators, ConflictSet::Impl *impl, switch (parent->type) { case Type::Node0: __builtin_unreachable(); // GCOVR_EXCL_LINE - case Type::Node4: - [[fallthrough]]; + case Type::Node3: { + auto *parent3 = static_cast<Node3 *>(parent); + int nodeIndex = getNodeIndex(parent3, parentsIndex); + assert(nodeIndex >= 0); + memmove(parent3->index + nodeIndex, parent3->index + nodeIndex + 1, + sizeof(parent3->index[0]) * + (parent->numChildren - (nodeIndex + 1))); + memmove(parent3->children + nodeIndex, parent3->children + nodeIndex + 1, + sizeof(parent3->children[0]) * + (parent->numChildren - (nodeIndex + 1))); + } break; case Type::Node16: { auto *parent16 = static_cast<Node16 *>(parent); int nodeIndex = getNodeIndex(parent16, parentsIndex); @@ -1399,8 +1453,14 @@ int64_t maxBetweenExclusive(Node *n, int begin, int end) { case Type::Node0: // We would have returned above, after not finding a child __builtin_unreachable(); // GCOVR_EXCL_LINE - case Type::Node4: - [[fallthrough]]; + case Type::Node3: { + auto *self = static_cast<Node3 *>(n); + for (int i = 0; i < self->numChildren && self->index[i] < end; ++i) { + if (begin <= self->index[i]) { + result = std::max(result, self->children[i].childMaxVersion); + } + } + } break; case Type::Node16: { auto *self = static_cast<Node16 *>(n); for (int i = 0; i < self->numChildren && self->index[i] < end; ++i) { @@ -1408,8 +1468,7 @@ int64_t maxBetweenExclusive(Node *n, int begin, int end) { result = std::max(result,
self->children[i].childMaxVersion); } } - break; - } + } break; case Type::Node48: { auto *self = static_cast<Node48 *>(n); self->bitSet.forEachInRange( @@ -1898,7 +1957,7 @@ template int64_t oldMaxVersion = maxVersion(old, impl); // *self will have one child - *self = allocators->node4.allocate(partialKeyIndex); + *self = allocators->node3.allocate(partialKeyIndex); memcpy((char *)*self + kNodeCopyBegin, (char *)old + kNodeCopyBegin, kNodeCopySize); @@ -2261,8 +2320,11 @@ int64_t &maxVersion(Node *n, ConflictSet::Impl *impl) { switch (n->type) { case Type::Node0: __builtin_unreachable(); // GCOVR_EXCL_LINE - case Type::Node4: - [[fallthrough]]; + case Type::Node3: { + auto *n3 = static_cast<Node3 *>(n); + int i = getNodeIndex(n3, index); + return n3->children[i].childMaxVersion; + } case Type::Node16: { auto *n16 = static_cast<Node16 *>(n); int i = getNodeIndex(n16, index); @@ -2314,23 +2376,6 @@ ConflictSet::~ConflictSet() { } } -#if SHOW_MEMORY -__attribute__((visibility("default"))) void showMemory(const ConflictSet &cs) { - ConflictSet::Impl *impl; - memcpy(&impl, &cs, sizeof(impl)); // NOLINT - fprintf(stderr, "Max Node0 memory usage: %" PRId64 "\n", - impl->allocators.node0.highWaterMarkBytes()); - fprintf(stderr, "Max Node4 memory usage: %" PRId64 "\n", - impl->allocators.node4.highWaterMarkBytes()); - fprintf(stderr, "Max Node16 memory usage: %" PRId64 "\n", - impl->allocators.node16.highWaterMarkBytes()); - fprintf(stderr, "Max Node48 memory usage: %" PRId64 "\n", - impl->allocators.node48.highWaterMarkBytes()); - fprintf(stderr, "Max Node256 memory usage: %" PRId64 "\n", - impl->allocators.node256.highWaterMarkBytes()); -} -#endif - ConflictSet::ConflictSet(ConflictSet &&other) noexcept : impl(std::exchange(other.impl, nullptr)) {} @@ -2553,8 +2598,8 @@ Iterator firstGeq(Node *n, std::string_view key) { case Type::Node0: minNumChildren = 0; break; - case Type::Node4: - minNumChildren = kMinChildrenNode4; + case Type::Node3: + minNumChildren = kMinChildrenNode3; break; case
Type::Node16: minNumChildren = kMinChildrenNode16;