node
Node is an in-memory serialized representation of a page; B+ tree operations, page splitting, page rebalancing, and so on are mapped through Node
type node struct {
// The Bucket to which the current node (page) belongs is equivalent to a Table in a relational database
bucket *Bucket
// Whether the current node is a leaf node
isLeaf bool
// Whether to rebalance the current node
unbalanced bool
// Whether the current node is split
spilled bool
key []byte //node.inodes[0].key Specifies the initial key of the child node, the first key of each page (the smallest one).
// The id of the current page
pgid pgid
// The secondary node of the current node
parent *node
// Children of the current node
children nodes
// K/V data stored by the current node (page) (if leaf node)
inodes inodes
}
Copy the code
inode
The inode represents an actual node within a node, which can actually point to an element (a K/V pair) or to a K/V pair that has not yet been written to the page.
type inode struct {
flags uint32 BucketLeafFlag = bucketLeafFlag = bucketLeafFlag
pgid pgid // The current corresponding page_id is 0 if the page has not been written yet
key []byte // The actual key
value []byte // The actual value
}
Copy the code
BoltDB data form B+ tree through pages, all data are stored in leaf nodes, and node is the serialized representation of page in memory, so for the operation of B+ tree, B+ tree operations are performed by locating a specific page, serializing the page to a node, and performing operations on related nodes. Therefore, Node supports the following operations:
// Returns the root node of the current B+ tree
func (n *node) root(a) *node {
if n.parent == nil {
return n
}
return n.parent.root()
}
Copy the code
// Returns the minimum number of keys a page should have. Non-leaf nodes need at least two keys
func (n *node) minKeys(a) int {
if n.isLeaf {
return 1
}
return 2
}
Copy the code
// Returns the current node(page) size
// The size of the page is equal to pageHeader + object header * Number of objects + actual key + actual value
// https://juejin.cn/post/6942016731019739166
func (n *node) size(a) int {
sz, elsz := pageHeaderSize, n.pageElementSize()
for i := 0; i < len(n.inodes); i++ {
item := &n.inodes[i]
sz += elsz + len(item.key) + len(item.value)
}
return sz
}
Copy the code
// Return the node (page) corresponding to the i-th child of the current node.
// The data in the page is sorted in ascending order by key
func (n *node) childAt(index int) *node {
if n.isLeaf {
panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
}
return n.bucket.node(n.inodes[index].pgid, n)
}
Copy the code
// Returns the right and left siblings of the current node
func (n *node) nextSibling(a) *node {
if n.parent == nil {
return nil
}
index := n.parent.childIndex(n)
if index >= n.parent.numChildren()- 1 {
return nil
}
return n.parent.childAt(index + 1)}func (n *node) prevSibling(a) *node {
if n.parent == nil {
return nil
}
index := n.parent.childIndex(n)
if index == 0 {
return nil
}
return n.parent.childAt(index - 1)}Copy the code
// Put writes a key/value to a node. If olekey=newkey, replace value.
// Otherwise insert a new inode
func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
if pgid >= n.bucket.tx.meta.pgid {
// Todo this judgment
panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
}
// Find where newkey is inserted
index := sort.Search(len(n.inodes), func(i int) bool { returnbytes.Compare(n.inodes[i].key, oldKey) ! =- 1 })
// Check if it is a replacement insert
exact := len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey)
if! exact {// If the key does not exist, insert a new node
n.inodes = append(n.inodes, inode{})
copy(n.inodes[index+1:], n.inodes[index:])
}
inode := &n.inodes[index]
inode.flags = flags // Assign related elements
inode.key = newKey
inode.value = value
inode.pgid = pgid
}
Copy the code
func (n *node) del(key []byte) {
// Find the first index not greater than key by key
index := sort.Search(len(n.inodes), func(i int) bool { returnbytes.Compare(n.inodes[i].key, key) ! =- 1 })
// Returns if the current key does not exist
if index >= len(n.inodes) || ! bytes.Equal(n.inodes[index].key, key) {return
}
// Delete related ninodes if they exist
n.inodes = append(n.inodes[:index], n.inodes[index+1:]...).// The node rebalance is not being rebalanced.
Rebalance is triggered when an element is deleted
n.unbalanced = true
}
Copy the code
//read deserializes the contents of a page into an in-memory node
func (n *node) read(p *page){ n.pgid = p.id n.isLeaf = (p.flags & leafPageFlag) ! =0
n.inodes = make(inodes, int(p.count)) // Allocate the corresponding inode space according to the number of page elements
for i := 0; i < int(p.count); i++ {
// Deserialize the data corresponding to pageElement into the inode
inode := &n.inodes[i]
if n.isLeaf {
elem := p.leafPageElement(uint16(i))
inode.flags = elem.flags
inode.key = elem.key()
inode.value = elem.value()
} else {
elem := p.branchPageElement(uint16(i))
inode.pgid = elem.pgid
inode.key = elem.key()
}
}
// If the current node has at least one inode, node.key equals the key of the first inode
if len(n.inodes) > 0 {
n.key = n.inodes[0].key
} else {
n.key = nil}}Copy the code
//write Writes the node item to one or more pages
// If the current node cannot hold a page, multiple consecutive pages will be allocated to store the current node
func (n *node) write(p *page) {
if n.isLeaf {
// Flag the page based on whether it is currently a leaf node
p.flags |= leafPageFlag
} else {
p.flags |= branchPageFlag
}
// A maximum of 65535 inodes
if len(n.inodes) >= 0xFFFF {
panic(fmt.Sprintf("inode overflow: %d (pgid=%d)".len(n.inodes), p.id))
}
p.count = uint16(len(n.inodes))
// If the current node contains no elements, return
if p.count == 0 {
return
}
// Find the write offset in page according to pageElementSize*lengthOfInode
b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
for i, item := range n.inodes {
/ / to traverse the node of the inode, writes it to the page, write pageHeader (leafElement/branchElement)
if n.isLeaf {
elem := p.leafPageElement(uint16(i))
elem.pos = uint32(uintptr(unsafe.Pointer(&b[0-))uintptr(unsafe.Pointer(elem)))
elem.flags = item.flags
elem.ksize = uint32(len(item.key))
elem.vsize = uint32(len(item.value))
} else {
elem := p.branchPageElement(uint16(i))
elem.pos = uint32(uintptr(unsafe.Pointer(&b[0-))uintptr(unsafe.Pointer(elem)))
elem.ksize = uint32(len(item.key))
elem.pgid = item.pgid
}
// If the amount of memory allocated is smaller than the actual length of key+value, it needs to be reallocated
klen, vlen := len(item.key), len(item.value
if len(b) < klen+vlen {
b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0[:]]))}// Write the actual key+value to the page
copy(b[0:], item.key)
b = b[klen:]
copy(b[0:], item.value)
b = b[vlen:]
}
}
Copy the code
// Node split in B+ tree
//split Splits the specified node into multiple nodes based on the pageSize passed in
func (n *node) split(pageSize int)[] *node {
var nodes []*node
node := n
for {
// Split the current node into nodes A and B
a, b := node.splitTwo(pageSize)
nodes = append(nodes, a)
if b == nil {
break
}
// Recursively split b until b is nil, then all nodes are no larger than pageSize
node = b
}
return nodes
}
Copy the code
//splitTwo splits the given node into two smaller nodes according to the pageSize size
func (n *node) splitTwo(pageSize int) (*node, *node) {
// If no split adjustment is reached, return directly, no split
// The split condition is that there should be at least two inodes for leaf nodes and at least one inode for non-leaf nodes, and the node size should not be smaller than pageSize
if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
return n, nil
}
var fillPercent = n.bucket.FillPercent
if fillPercent < minFillPercent {
fillPercent = minFillPercent
} else if fillPercent > maxFillPercent {
fillPercent = maxFillPercent
}
// Determine the split threshold based on the fill factor
threshold := int(float64(pageSize) * fillPercent)
// Split the inodes array into two parts according to the inodes index
splitIndex, _ := n.splitIndex(threshold)
// If the current node does not have a parent node, create one
if n.parent == nil {
n.parent = &node{bucket: n.bucket, children: []*node{n}}
}
// Create a new node, add the split node to the children of the current parent node, the new node is not allocated pages, so pageID=0
next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
n.parent.children = append(n.parent.children, next)
// Split the inodes of the current node into two parts according to splitIndex
next.inodes = n.inodes[splitIndex:]
n.inodes = n.inodes[:splitIndex]
n.bucket.tx.stats.Split++
return n, next
}
Copy the code
//splitIndex Calculates the index of the inodes array that should be split based on the size of the threshold passed in
func (n *node) splitIndex(threshold int) (index, sz int) {
sz = pageHeaderSize / / the size of the pageHeader
for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
index = i
inode := n.inodes[i]
// The actual Size of each inode contains three parts: elementheader. Size+key.Size+value
elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
if i >= minKeysPerPage && sz+elsize > threshold {
// The total size of the previous I inodes is greater than the threshold and split from the current index
break
}
sz += elsize
}
return
}
Copy the code
//spill recursively writes all nodes (leaf nodes) contained in a node to a dirty page, splitting nodes in the process.
// An error is returned if memory for dirty pages cannot be allocated
//spill describes the splitting process of b+ tree during the transaction commit. Since b+ tree pages are represented in memory by Node, page splitting is mapped by Node
func (n *node) spill(a) error {
var tx = n.bucket.tx
if n.spilled { // Returns if the current node is being split; A page can be split only once. Multiple split operations cannot be performed in parallel
return nil
}
// Sort the current leaf nodes by key
sort.Sort(n.children)
// Split the leaf nodes recursively
for i := 0; i < len(n.children); i++ {
iferr := n.children[i].spill(); err ! =nil {
return err
}
}
// The leaf node list is only used to track node splits
n.children = nil
// Split the current node into multiple nodes according to pageSize
var nodes = n.split(tx.db.pageSize)
for _, node := range nodes {
if node.pgid > 0 {
// If there is an old page, after split, the old page should be released and returned to the freelist
// The node has not been serialized to the corresponding page, so the corresponding page.id=0
tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
node.pgid = 0
}
// Create a new page to store the current node, and return the page if the allocation fails
p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
iferr ! =nil {
return err
}
// if the assigned pageId is greater than the hidden pageId, then todo?? Tx.mit pageID change)
if p.id >= tx.meta.pgid {
panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
}
// Deserialize Node to the newly shard page
node.pgid = p.id
node.write(p)
node.spilled = true
// If the current node's parent node is not empty, write the current node's key(or the first inode's key) to the parent node.
ifnode.parent ! =nil {
var key = node.key
if key == nil {
key = node.inodes[0].key
}
// Update if present, insert otherwise
node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
node.key = node.inodes[0].key
}
tx.stats.Spill++
}
// If the root node is split, create a root node and split it again
ifn.parent ! =nil && n.parent.pgid == 0 {
// Clear the current leaf node information to ensure that it will not be split again
n.children = nil
return n.parent.spill()
}
return nil
}
Copy the code
Rebalance combine siblings into one node if their size is smaller than the fill threshold or the number of inodes is smaller than the minimum
func (n *node) rebalance(a) {
if! n.unbalanced {rebalance
return
}
n.unbalanced = false rebalance
// Update statistics
n.bucket.tx.stats.Rebalance++
// Calculate the fill threshold, which is one-fourth of the page size
var threshold = n.bucket.tx.db.pageSize / 4
// A page should be at least a quarter of a page in size and have at least 2 (leaf nodes) or 1 (non-leaf nodes) keys
if n.size() > threshold && len(n.inodes) > n.minKeys() {
return
}
// The current node is the root node, special treatment
if n.parent == nil {
// The current node is a branch node and there is only one inode to split the current node
if! n.isLeaf &&len(n.inodes) == 1 {
// Move the leaf node of the root node up
// Create a new child node with the current node as the root node
child := n.bucket.node(n.inodes[0].pgid, n)
n.isLeaf = child.isLeaf
n.inodes = child.inodes[:]
n.children = child.children
// Recalculate the parent //todo of the current leaf node
for _, inode := range n.inodes {
// The current node is cached in the bucket
if child, ok := n.bucket.nodes[inode.pgid]; ok {
child.parent = n
}
}
// Delete the old leaf node
child.parent = nil
delete(n.bucket.nodes, child.pgid)
child.free()
}
return
}
// If the current node does not store the key, remove the current node
if n.numChildren() == 0 {
// If the current node has no leaf node, parallel size
// Remove the current node
n.parent.del(n.key) // Remove the current node from parent-inodes
n.parent.removeChild(n) // Remove the current array from parent. Children
delete(n.bucket.nodes, n.pgid)
n.free() // Release the page corresponding to the current node
n.parent.rebalance() Rebalance the parent node recursively
return
}
// The current node has data
var target *node // Locate the node where the rebalance is needed
var useNextSibling = n.parent.childIndex(n) == 0 // Find the position of the current node in the parent node, is the leftmost node
if useNextSibling {// The current node is the leftmost node
// The sibling node on the right
target = n.nextSibling() // Find the sibling to the right of the current node
} else {
// Left sibling node
target = n.prevSibling()
}
// If both the current and target nodes are too small, merge them
if useNextSibling {// If the target node is a sibling of the current node, merge the target node into the current node.
// Iterate over the target element and recalculate its parent
for _, inode := range target.inodes {
// If the current bucket caches the page corresponding to the inode's pageID,
if child, ok := n.bucket.nodes[inode.pgid]; ok {
// Remove the child node from its parent
child.parent.removeChild(child)
child.parent = n // Recalculate its parent to the current node
// Add child to the child of the current node
child.parent.children = append(child.parent.children, child) // ==> n.children = append(n.children, child) equality ?}}// Add the element of the target node to the element array of the current node
n.inodes = append(n.inodes, target.inodes...)
n.parent.del(target.key) // Remove the target key from the parent (target and n have the same parent)
n.parent.removeChild(target) // Remove the target node from the parent leaf node of the target node
delete(n.bucket.nodes, target.pgid) // Remove the target node from the node cache of the current bucket
target.free() // Release the page occupied by the target node
} else {
// If the target node is the left sibling of the current node, merge the current node to the left sibling
for _, inode := range n.inodes {
if child, ok := n.bucket.nodes[inode.pgid]; ok {
child.parent.removeChild(child)
child.parent = target
child.parent.children = append(child.parent.children, child)
}
}
// Remove the current node's parent and current bucket from the cache, and add the current node's element to the sibling on the left
target.inodes = append(target.inodes, n.inodes...) // The inodes are sorted by key and added to the destination node
n.parent.del(n.key)
n.parent.removeChild(n)
delete(n.bucket.nodes, n.pgid)
n.free()
}
// Recursive merge parent node
n.parent.rebalance()
}
Copy the code