How To Store Random Integers From The Set Of 32-bit Integers In A Compact Data Structure To Rapidly Check Membership?
Solution 1:
You wrote:
One approach I am thinking that might work that fits these constraints is a sort of tree, like a B+tree. [...] I think it would be like a key-only B+tree (no values, and hence no leaves). But not entirely sure what this would look like (in JavaScript, if I had to pick a language).
Indeed, a key-only B(+)-tree would be a possible solution. It solves the problem of the wasted space you get from a sparse array solution. It is also a solution that works well when the data type happens to be string or float instead of number, which would be a problem for the sparse array solution.
The code I post below is similar to another B+ tree related answer I posted earlier on another question of yours, but which was not a search tree.
The idea is to use a B+ tree, taking into account the extra requirement about array sizes that are powers of 2. The actual data keys are found in the bottom level of the tree. The internal nodes repeat the smallest key that its subtree contains. This can be used for deciding which child to choose when walking from the root down the tree, in search for a bottom-level location where a key should be found or should be inserted.
The tree is kept compact by distributing children/keys to neighbors when a node gets full, only splitting a node when there is no room in the neighbors either. When a key is deleted, the affected node will merge with a neighbor when possible. Each node (except the root) is guaranteed to have at least half the use of the maximum (32) node capacity -- either for storing children or keys (at the bottom level).
Keys are located starting the search at the root. A child is selected using binary search so that the key of that child is the greatest key that is less or equal to the key were are locating. If the key is not found at the bottom level, we have at least the location where it should be, and it could be inserted there.
Implementation
class Node {
constructor(capacity) {
// Mimic fixed-size array (avoid accidentally growing it)
this.children = Object.seal(Array(capacity).fill(null));
this.childCount = 0; // Number of used slots in children array
this.key = null; // Property for supporting a search
// Maintain back-link to parent.
this.parent = null;
// Per level in the tree, maintain a doubly linked list
this.prev = this.next = null;
}
setCapacity(capacity) {
if (capacity < 1) return;
// Here we make a new array, and copy the data into it
let children = Object.seal(Array(capacity).fill(null));
for (let i = 0; i < this.childCount; i++) children[i] = this.children[i];
this.children = children;
}
isLeaf() {
return !(this.children[0] instanceof Node);
}
index() {
return this.parent.children.indexOf(this);
}
updateKey() {
this.key = this.isLeaf() ? this.children[0] : this.children[0].key;
for (let node = this.parent; node; node = node.parent) {
node.key = node.children[0].key;
}
}
wipe(start, end) {
this.children.copyWithin(start, end, this.childCount);
for (let i = this.childCount - end + start; i < this.childCount; i++) {
this.children[i] = null;
}
this.childCount -= end - start;
// Reduce allocated size if possible
if (this.childCount * 2 <= this.children.length) this.setCapacity(this.children.length / 2);
// Update key if first item changed
if (start === 0) this.updateKey();
}
moveFrom(neighbor, target, start, count=1) {
// Note: `start` can have two meanings:
// if neighbor is null, it is the key/Node to move to the target
// if neighbor is a Node, it is the index from where keys(s)/Node(s) have to be moved to the target
// Make room in target node
if (this.childCount + count > this.children.length) this.setCapacity(this.children.length * 2);
this.children.copyWithin(target + count, target, Math.max(target + count, this.childCount));
this.childCount += count;
if (neighbor !== null) {
// Copy the children
for (let i = 0; i < count; i++) {
this.children[target + i] = neighbor.children[start + i];
}
// Remove the original references
neighbor.wipe(start, start + count);
} else {
this.children[target] = start; // start is key to insert
}
// Set parent link(s)
if (!this.isLeaf()) {
for (let i = 0; i < count; i++) {
this.children[target + i].parent = this;
}
}
// Update key if first item changed
if (target === 0) this.updateKey();
}
moveToNext(count) {
this.next.moveFrom(this, 0, this.childCount - count, count);
}
moveFromNext(count) {
this.moveFrom(this.next, this.childCount, 0, count);
}
basicRemove(index) {
if (!this.isLeaf()) {
// Take node out of the level's linked list
let prev = this.children[index].prev;
let next = this.children[index].next;
if (prev) prev.next = next;
if (next) next.prev = prev;
}
this.wipe(index, index + 1);
}
basicInsert(index, value) { // value can be a key or a Node
this.moveFrom(null, index, value);
if (value instanceof Node) {
// Insert node in the level's linked list
if (index > 0) {
value.prev = this.children[index-1];
value.next = value.prev.next;
} else if (this.childCount > 1) {
value.next = this.children[1];
value.prev = value.next.prev;
}
if (value.prev) value.prev.next = value;
if (value.next) value.next.prev = value;
}
}
pairWithSmallest() {
return this.prev && (!this.next || this.next.childCount > this.prev.childCount)
? [this.prev, this] : [this, this.next];
}
toString() {
return "[" + this.children.map(v => v??"-").join() + "]";
}
}
class Tree {
constructor(nodeCapacity=32) {
this.nodeCapacity = nodeCapacity;
this.root = new Node(1);
this.first = this.root; // Head of doubly linked list at bottom level
}
locate(key) {
let node = this.root;
while (!node.isLeaf()) {
// Binary search among keys
let low = 0;
let high = node.childCount;
while (low < high) {
let index = (low + high) >> 1;
if (key >= node.children[index].key) {
low = index + 1;
} else {
high = index;
}
}
if (low) low--;
node = node.children[low];
}
// Binary search in leaf
let low = 0;
let high = node.childCount;
while (low < high) {
let index = (low + high) >> 1;
if (key > node.children[index]) {
low = index + 1;
} else {
high = index;
}
}
return [node, low];
}
has(key) {
let [node, index] = this.locate(key);
return index < node.childCount && node.children[index] === key;
}
remove(key) {
let [node, index] = this.locate(key);
if (index >= node.childCount || node.children[index] !== key) return; // not found
while (true) {
node.basicRemove(index);
// Exit when node's fill ratio is fine
if (!node.parent || node.childCount * 2 > this.nodeCapacity) return;
// Node has potentially too few children, we should either merge or redistribute
let [left, right] = node.pairWithSmallest();
if (!left || !right) { // A node with no siblings? Must become the root!
this.root = node;
node.parent = null;
return;
}
let sumCount = left.childCount + right.childCount;
let childCount = sumCount >> 1;
// Check whether to merge or to redistribute
if (sumCount > this.nodeCapacity) { // redistribute
// Move some data from the bigger to the smaller node
let shift = childCount - node.childCount;
if (!shift) { // Boundary case: when a redistribution would bring no improvement
console.assert(node.childCount * 2 === this.nodeCapacity && sumCount === this.nodeCapacity + 1);
return;
}
if (node === left) { // move some children from right to left
left.moveFromNext(shift);
} else { // move some children from left to right
left.moveToNext(shift);
}
return;
}
// Merge:
// Move all data from the right to the left
left.moveFromNext(right.childCount);
// Prepare to delete right node
node = right.parent;
index = right.index();
}
}
insert(value) { // value is a key
let [node, index] = this.locate(value);
if (index < node.childCount && node[index] === value) return; // already present
while (node.childCount === this.nodeCapacity) { // No room here
if (index === 0 && node.prev && node.prev.childCount < this.nodeCapacity) {
return node.prev.basicInsert(node.prev.childCount, value);
}
// Check whether we can redistribute (to avoid a split)
if (node !== this.root) {
let [left, right] = node.pairWithSmallest();
let joinedIndex = left === node ? index : left.childCount + index;
let sumCount = left.childCount + right.childCount + 1;
if (sumCount <= 2 * this.nodeCapacity) { // redistribute
let childCount = sumCount >> 1;
if (node === right) { // redistribute to the left
let insertInLeft = joinedIndex < childCount;
left.moveFromNext(childCount - left.childCount - +insertInLeft);
} else { // redistribute to the right
let insertInRight = index >= sumCount - childCount;
left.moveToNext(childCount - right.childCount - +insertInRight);
}
if (joinedIndex > left.childCount ||
joinedIndex === left.childCount && left.childCount > right.childCount) {
right.basicInsert(joinedIndex - left.childCount, value);
} else {
left.basicInsert(joinedIndex, value);
}
return;
}
}
// Cannot redistribute: split node
let childCount = node.childCount >> 1;
// Create a new node that will later become the right sibling of this node
let sibling = new Node(childCount);
// Move half of node node's data to it
sibling.moveFrom(node, 0, childCount, childCount);
// Insert the value in either the current node or the new one
if (index > node.childCount) {
sibling.basicInsert(index - node.childCount, value);
} else {
node.basicInsert(index, value);
}
// Is this the root?
if (!node.parent) {
// ...then first create a parent, which is the new root
this.root = new Node(2);
this.root.basicInsert(0, node);
}
// Prepare for inserting the sibling node into the tree
index = node.index() + 1;
node = node.parent;
value = sibling; // value is now a Node
}
node.basicInsert(index, value);
}
/* Below this point: these methods are optional */
* [Symbol.iterator]() { // Make tree iterable
let i = 0;
for (let node = this.first; node; node = node.next) {
for (let i = 0; i < node.childCount; i++) yield node.children[i];
}
}
print() {
console.log(this.root && this.root.toString());
}
verify() {
// Raise an error when the tree violates one of the required properties
if (!this.root) return; // An empty tree is fine.
if (this.root.parent) throw "root should not have a parent";
// Perform a breadth first traversal
let q = [this.root];
while (q.length) {
if (q[0].isLeaf() && this.first !== q[0]) throw "this.first is not pointing to first leaf";
let level = [];
let last = null;
for (let parent of q) {
if (!(parent instanceof Node)) throw "parent is not instance of Node";
if (parent.children.length > this.nodeCapacity) throw "node's children array is too large";
if (parent.childCount > 0 && parent.childCount * 2 <= parent.children.length) throw "node's fill ratio is too low";
for (let i = parent.childCount; i < parent.children.length; i++) {
if (parent.children[i] !== null) throw "child beyond childCount should be null but is not";
}
if (parent.isLeaf()) {
if (parent.children[0] !== parent.key) throw "key does not match with first child value";
for (let value of parent.children.slice(0, parent.childCount)) {
if (value === null) throw "leaf has a null as value";
if (value instanceof Node) throw "leaf has a Node as value";
}
} else {
if (parent.children[0].key !== parent.key) throw "key does not match with first child's key";
for (let node of parent.children.slice(0, parent.childCount)) {
if (node === null) throw "internal node has a null as value";
if (!(node instanceof Node)) throw "internal node has a non-Node as value";
if (node.parent !== parent) throw "wrong parent";
if (node.prev !== last) throw "prev link incorrect";
if (last && last.next !== node) throw "next link incorrect";
if (last && last.children.length + node.children.length <= this.nodeCapacity) {
throw "two consecutive siblings have a total number of children that is too small";
}
if (node.childCount * 2 < this.nodeCapacity) {
throw "internal node is too small: " + node;
}
level.push(node);
last = node;
}
}
}
if (last && last.next) throw "last node in level has a next reference";
q = level;
}
}
test(count=100, option=3) {
// option:
// 0 = always insert & delete at left side (offset 0)
// 1 = always insert & delete at right side
// 2 = always insert & delete at middle
// 3 = insert & delete at random offsets
// Create array to perform the same operations on it as on the tree
let max = count*2;
let arr = [];
// Perform a series of insertions
for (let i = 0; i < count; i++) {
// Choose random value
let key = Math.floor(Math.random() * max);
// Perform same insertion in array and tree
arr.push(key);
this.insert(key);
// Verify tree consistency and properties
this.verify();
// Verify the order of keys in the array is the same as in the tree
if (arr.sort((a,b) => a-b)+"" !== [...this]+"") throw i + ": tree not same as array";
}
// Perform a series of has-calls
for (let i = 0; i < count; i++) {
// Choose random update index
let key = Math.floor(Math.random() * max);
// Perform same insertion in array and tree
let has = arr.includes(key);
if (has !== this.has(key)) {
throw "has() returns inconsistent result";
}
if (!has) {
this.remove(key); // should not alter the tree
// Verify the order of keys in the array is the same as in the tree
if (arr+"" !== [...this]+"") throw i + ": tree not same as array";
}
}
// Perform a series of deletions
for (let i = arr.length - 1; i >= 0; i--) {
// Choose random deletion key
let key = arr[Math.floor(Math.random() * i)];
// Perform same deletion in array and tree
arr.splice(arr.indexOf(key), 1);
this.remove(key);
// Verify tree consistency and properties
this.verify();
// Verify the order of keys in the array is the same as in the tree
if (arr+"" !== [...this]+"") throw "tree not same as array";
}
}
}
// Perform 1000 insertions, 1000 updates, and 1000 deletions on a tree with node capacity of 8
new Tree(8).test(1000);
console.log("all tests completed");
Supported data types
The data type of the keys can be anything for which comparison operators work. So for strings or numbers it will work. In JavaScript it will also work for objects that have implemented either a valueOf
or toString
method. For instance, this is already the case for native Date
objects. JavaScript will first try the valueOf
method and in absence the toString
method, which is also defined on the Object prototype.
Considerations
I stored one key
in each node, as this is an in-memory solution. Historically, B(+)-trees are used for disk storage, where it is important to keep the number of block reads low. A standard B(+)-tree implementation stores the keys one level higher, in the parent, as an array. That way the search does not have to load each child record, while searching for the right one to choose.
Also, it then does not need to store the key of its first child, as we really only need the keys that separate two consecutive children.
Solution 2:
Here is my suggested approach.
Every node should be a 32 entry array. If the first entry is null
or an array, it is a 32-way split of the whole search space. Otherwise it is descending list of the entries in this block. Fill in the non-entries with -1.
This means that in no more than 5 lookups we get to the block that either has or doesn't have our value. We then do a linear scan. A binary search of a sorted list naively seems like it would be more efficient, but in fact it involves a series of hard to predict branches which CPUs hate. In a low level language (which I hope yours is), it is faster to avoid the pipeline stall and do a linear search.
Here is an implementation in JavaScript.
class IntegerSet {
constructor() {
this.make_node = function () {
let node = new Array();
for (let i = 0; i < 32; i++) {
node[i] = -1;
}
return node;
};
this.root = this.make_node();
this.lookup = function (target) {
let node = this.root;
let size = 2**32;
while (Array.isArray(node[0])) {
size /= 32;
let idx = Math.floor(target / size);
node = node[idx];
target = target % size;
}
for (let i = 0; i < 32; i++) {
if (target == node[i]) {
return true;
}
// On average this lets us break out 1/4 of the way through.
else if (node[i] < target) {
return false;
}
}
return false;
};
this.insert = function (target, node) {
node = node || this.root;
let size = 2**32;
while (Array.isArray(node[0])) {
size /= 32;
let idx = Math.floor(target / size);
node = node[idx];
target = target % size;
}
for (let i = 0; i < 32; i++) {
if (target == node[i]) {
return false;
}
else if (node[i] < target) {
// copy elements along, in order.
let value = node[i];
for (let j = i; j < 31; j++) {
node[j] = target;
target = value;
value = node[j+1];
if (target == -1) {
break;
}
}
node[31] = target; // just in case the block is full
if (-1 == target) {
return true;
}
else {
break; // we inserted and filled the node.
}
}
}
// If we get here, we need to split the node.
let node_copy = this.make_node();
for (let i = 0; i < 32; i++) {
node_copy[i] = node[i];
node[i] = this.make_node();
}
size /= 32;
let idx = Math.floor(target / size);
node[idx][0] = target % size;
for (let i = 0; i < 32; i++) {
let value = node_copy[i];
idx = Math.floor(value / size);
this.insert(value % size, node[idx]);
}
};
this.remove = function (target) {
let node = this.root;
let size = 2**32;
while (Array.isArray(node[0])) {
size /= 32;
let idx = Math.floor(target / size);
node = node[idx];
target = target % size;
}
for (let i = 0; i < 32; i++) {
if (target == node[i]) {
for (let j = i; j < 31; j++) {
node[j] = node[j+1];
if (node[j] == -1) {
break;
}
}
node[31] = -1;
return true;
}
}
return false;
};
}
}
let foo = new IntegerSet();
for (let i = 0; i < 300; i++) {
foo.insert(Math.floor(Math.random() * 2**32));
}
console.log(foo.root);
console.log(foo.lookup(5));
console.log(foo.remove(5));
console.log(foo.insert(5));
console.log(foo.root);
console.log(foo.lookup(5));
console.log(foo.remove(5));
Here are the characteristics of this data structure.
A read requires no more than 5 lookups to get to a block, then a scan with at most 64 if statements (which are well-predicted to always be false). If you insert a lot of random elements, the leaves are on average half-full, and on average we only do 1/4 of the scan.
The fact that there are 32 children of each internal node mean that the internal nodes only add a few percent overhead to the structure.
If you insert and then delete, performance remains good but we never will reclaim memory.
The following is an implementation that is similar to the previous but uses arrays of different size, and allows you to reclaim space. It aims to make 60-70% of the space actually data. But this comes, obviously, with penalties on insert/remove.
class IntegerSet {
constructor() {
this.make_node = function (size) {
let node = new Array();
for (let i = 0; i < size; i++) {
node[i] = -1;
}
node.child_size = 0;
return node;
};
this.root = this.make_node(2);
this.lookup = function (target) {
let node = this.root;
let size = 2**32;
while (Array.isArray(node[0])) {
size /= 32;
let idx = Math.floor(target / size);
node = node[idx];
target = target % size;
}
let i = 0;
while (target < node[i]) {
i++;
}
return target == node[i];
};
this.insert = function (target, node, parent_node, size) {
node = node || this.root;
if (size == null) {
size = 2**32;
}
let idx = null;
let update_parent = false;
while (Array.isArray(node[0])) {
size /= 32;
parent_node = node;
update_parent = true;
idx = Math.floor(target / size);
node = node[idx];
target = target % size;
}
let i = 0;
while (target < node[i]) {
i++;
}
if (target == node[i]) {
return true; // already had it.
}
else {
// copy along, in order.
let value = node[i];
while (-1 < target) {
value = node[i]
node[i] = target;
i++;
target = value;
}
if (update_parent) {
parent_node.child_size++;
}
if (i == node.length) {
let new_node = null;
if (node.length < 32) {
// Double node size.
new_node = this.make_node(node.length * 2);
for (let j = 0; j < i; j++) {
new_node[j] = node[j];
}
}
else {
// Need to split.
new_node = this.make_node(node.length);
for (let j = 0; j < node.length; j++) {
new_node[j] = this.make_node(2);
}
new_node.child_size = node.length;
size /= 32;
for (let j = 0; j < 32; j++) {
target = node[j];
let new_idx = Math.floor(target / size);
target = target % size;
this.insert(target, new_node[new_idx], parent_node, size);
}
}
if (parent_node != null) {
parent_node[idx] = new_node;
}
else {
this.root = new_node;
}
}
}
};
this.remove = function (target) {
let node = this.root;
let grandparent_node = null;
let parent_node = null;
let size = 2**32;
let idx = null;
let parent_idx = null;
while (Array.isArray(node[0])) {
size /= 32;
parent_idx = idx;
idx = Math.floor(target / size);
grandparent_node = parent_node;
parent_node = node;
node = node[idx];
target = target % size;
}
let i = 0;
while (target < node[i]) {
i++;
}
if (node[i] == target) {
// now delete along.
while (-1 < node[i]) {
node[i] = node[i+1];
i++;
}
if (parent_node != null) {
parent_node.child_size--;
// Reconsolidate if we save about 1/4 of memory.
if (parent_node.child_size < 24) {
let new_node = this.make_node(32);
let new_id = 0;
for (let i = 31; -1 < i; i--) {
let old_node = parent_node[i];
let j = 0;
while (-1 < old_node[j]) {
new_node[new_id] = size * i + old_node[j];
new_id++;
j++;
}
}
if (grandparent_node == null) {
this.root = new_node;
}
else {
grandparent_node[parent_idx] = new_node;
grandparent_node.child_size += parent_node.child_size - 33;
}
return true;
}
}
// Reconsolidate if we save about 2/3 of memory.
if (i < node.length / 3 && 2 < node.length) {
let new_node = this.make_node(node.length/2);
for (let j = 0; j < i; j++) {
new_node[j] = node[j];
}
if (parent_node != null) {
parent_node[idx] = new_node;
}
else {
this.root = new_node;
}
}
return true;
}
else {
return false;
}
};
}
}
let foo = new IntegerSet();
let keep = new Array;
for (let i = 0; i < 5; i++) {
keep.push(Math.floor(Math.random() * 2**32));
}
let remove = new Array;
for (let i = 0; i < 300; i++) {
remove.push(Math.floor(Math.random() * 2**32));
}
for (let i = 0; i < keep.length; i++) {
foo.insert(keep[i]);
}
for (let i = 0; i < remove.length; i++) {
foo.insert(remove[i]);
}
for (let i = 0; i < remove.length; i++) {
foo.remove(remove[i]);
}
console.log(foo.root);
for (let i = 0; i < keep.length; i++) {
console.log(keep[i]);
console.log(foo.lookup(keep[i]));
console.log(foo.remove(keep[i]));
console.log(foo.lookup(keep[i]));
console.log(foo.remove(keep[i]));
}
Post a Comment for "How To Store Random Integers From The Set Of 32-bit Integers In A Compact Data Structure To Rapidly Check Membership?"