# Source code for pygbm.splitting

```python
"""This module contains njitted routines and data structures to:
- Find the best possible split of a node. For a given node, a split is
characterized by a feature and a bin.
- Apply a split to a node, i.e. split the indices of the samples at the node
into the newly created left and right children.
"""
import numpy as np
from numba import njit, jitclass, prange, float32, uint8, uint32
import numba
from .histogram import _build_histogram
from .histogram import _subtract_histograms
from .histogram import _build_histogram_no_hessian
from .histogram import _build_histogram_root
from .histogram import _build_histogram_root_no_hessian
from .histogram import HISTOGRAM_DTYPE
from .utils import get_threads_chunks
@jitclass([
('gain', float32),
('feature_idx', uint32),
('bin_idx', uint8),
('gradient_left', float32),
('hessian_left', float32),
('gradient_right', float32),
('hessian_right', float32),
('n_samples_left', uint32),
('n_samples_right', uint32),
])
class SplitInfo:
"""Pure data class to store information about a potential split.
Parameters
----------
gain : float32
The gain of the split
feature_idx : int
The index of the feature to be split
bin_idx : int
The index of the bin on which the split is made
gradient_left : float32
The sum of the gradients of all the samples in the left child
hessian_left : float32
The sum of the hessians of all the samples in the left child
gradient_right : float32
The sum of the gradients of all the samples in the right child
hessian_right : float32
The sum of the hessians of all the samples in the right child
n_samples_left : int
The number of samples in the left child
n_samples_right : int
The number of samples in the right child
"""
def __init__(self, gain=-1., feature_idx=0, bin_idx=0,
gradient_left=0., hessian_left=0.,
gradient_right=0., hessian_right=0.,
n_samples_left=0, n_samples_right=0):
self.gain = gain
self.feature_idx = feature_idx
self.bin_idx = bin_idx
self.gradient_left = gradient_left
self.hessian_left = hessian_left
self.gradient_right = gradient_right
self.hessian_right = hessian_right
self.n_samples_left = n_samples_left
self.n_samples_right = n_samples_right
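# Note (illustration, not part of the original module): numba jitclasses do
# not reliably support keyword defaults when instantiated from jitted code,
# so SplitInfo is always constructed positionally, e.g.
#     split_info = SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
# which is how find_node_split() pre-allocates its results below.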
@jitclass([
('n_features', uint32),
('X_binned', uint8[::1, :]),
('max_bins', uint32),
('n_bins_per_feature', uint32[::1]),
('min_samples_leaf', uint32),
('min_gain_to_split', float32),
('gradients', float32[::1]),
('hessians', float32[::1]),
('ordered_gradients', float32[::1]),
('ordered_hessians', float32[::1]),
('sum_gradients', float32),
('sum_hessians', float32),
('constant_hessian', uint8),
('constant_hessian_value', float32),
('l2_regularization', float32),
('min_hessian_to_split', float32),
('partition', uint32[::1]),
('left_indices_buffer', uint32[::1]),
('right_indices_buffer', uint32[::1]),
])
class SplittingContext:
"""Pure data class defining a splitting context.
Ideally it would also have methods but numba does not support annotating
jitclasses (so we can't use parallel=True). This structure is
instantiated in the grower and stores all the required information to
compute the SplitInfo and histograms of each node.
Parameters
----------
X_binned : array of int
The binned input samples. Must be Fortran-aligned.
max_bins : int, optional(default=256)
The maximum number of bins. Used to define the shape of the
histograms.
n_bins_per_feature : array-like of int
The actual number of bins needed for each feature, which is less than
or equal to max_bins.
gradients : array-like, shape=(n_samples,)
The gradients of each training sample. Those are the gradients of the
loss w.r.t the predictions, evaluated at iteration i - 1.
hessians : array-like, shape=(n_samples,)
The hessians of each training sample. Those are the hessians of the
loss w.r.t the predictions, evaluated at iteration i - 1.
l2_regularization : float
The L2 regularization parameter.
min_hessian_to_split : float
The minimum sum of hessians needed in each node. Splits that result in
at least one child having a sum of hessians less than
min_hessian_to_split are discarded.
min_samples_leaf : int
The minimum number of samples per leaf.
min_gain_to_split : float, optional(default=0.)
The minimum gain needed to split a node. Splits with lower gain will
be ignored.
"""
def __init__(self, X_binned, max_bins, n_bins_per_feature,
gradients, hessians, l2_regularization,
min_hessian_to_split=1e-3, min_samples_leaf=20,
min_gain_to_split=0.):
self.X_binned = X_binned
self.n_features = X_binned.shape[1]
# Note: all histograms will have <max_bins> bins, but some of the
# last bins may be unused if n_bins_per_feature[f] < max_bins
self.max_bins = max_bins
self.n_bins_per_feature = n_bins_per_feature
self.gradients = gradients
self.hessians = hessians
# for root node, gradients and hessians are already ordered
self.ordered_gradients = gradients.copy()
self.ordered_hessians = hessians.copy()
self.sum_gradients = self.gradients.sum()
self.sum_hessians = self.hessians.sum()
self.constant_hessian = hessians.shape[0] == 1
self.l2_regularization = l2_regularization
self.min_hessian_to_split = min_hessian_to_split
self.min_samples_leaf = min_samples_leaf
self.min_gain_to_split = min_gain_to_split
if self.constant_hessian:
self.constant_hessian_value = self.hessians[0] # 1 scalar
else:
self.constant_hessian_value = float32(1.) # won't be used anyway
# The partition array maps each sample index into the leaves of the
tree (a leaf in this context is a node that isn't split yet, not
# necessarily a 'finalized' leaf). Initially, the root contains all
# the indices, e.g.:
# partition = [abcdefghijkl]
# After a call to split_indices, it may look e.g. like this:
# partition = [cef|abdghijkl]
# we have 2 leaves, the left one is at position 0 and the second one at
# position 3. The order of the samples is irrelevant.
self.partition = np.arange(0, X_binned.shape[0], 1, np.uint32)
# buffers used in split_indices to support parallel splitting.
self.left_indices_buffer = np.empty_like(self.partition)
self.right_indices_buffer = np.empty_like(self.partition)
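# Illustration (sketch, not part of the original module): the partition /
# view mechanics described above, reproduced with plain numpy. A node's
# sample_indices is a slice (view) of `partition`, so reordering the slice
# in-place also reorders `partition` itself:
#     partition = np.arange(12, dtype=np.uint32)
#     node_view = partition[3:9]      # indices held by one leaf
#     node_view[:] = node_view[::-1]  # in-place reorder within the leaf
#     # partition[3:9] now shows the new order too; no copy was made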
@njit(parallel=True,
locals={'sample_idx': uint32,
'left_count': uint32,
'right_count': uint32})
def split_indices(context, split_info, sample_indices):
"""Split samples into left and right arrays.
Parameters
----------
context : SplittingContext
The splitting context
split_info : SplitInfo
The SplitInfo of the node to split
sample_indices : array of int
The indices of the samples at the node to split. This is a view on
context.partition, and it is modified inplace by placing the indices
of the left child at the beginning, and the indices of the right child
at the end.
Returns
-------
left_indices : array of int
The indices of the samples in the left child. This is a view on
context.partition.
right_indices : array of int
The indices of the samples in the right child. This is a view on
context.partition.
"""
# This is a multi-threaded implementation inspired by lightgbm.
# Here is a quick breakdown. Let's suppose we want to split a node with
# 24 samples named from a to x. context.partition looks like this (the *
# are indices in other leaves that we don't care about):
# partition = [*************abcdefghijklmnopqrstuvwx****************]
# ^ ^
# node_position node_position + node.n_samples
# Ultimately, we want to reorder the samples inside the boundaries of the
# leaf (which becomes a node) to now represent the samples in its left and
# right child. For example:
# partition = [*************abefilmnopqrtuxcdghjksvw*****************]
# ^ ^
# left_child_pos right_child_pos
# Note that left_child_pos always takes the value of node_position, and
# right_child_pos = left_child_pos + left_child.n_samples. The order of
# the samples inside a leaf is irrelevant.
# 1. sample_indices is a view on this region a..x. We conceptually
# divide it into n_threads regions. Each thread will be responsible for
# its own region. Here is an example with 4 threads:
# sample_indices = [abcdef|ghijkl|mnopqr|stuvwx]
# 2. Each thread processes 6 = 24 // 4 entries and maps them into
# left_indices_buffer or right_indices_buffer. For example, we could
# have the following mapping ('.' denotes an undefined entry):
# - left_indices_buffer = [abef..|il....|mnopqr|tux...]
# - right_indices_buffer = [cd....|ghjk..|......|svw...]
# 3. We keep track of the start positions of the regions (the '|') in
# ``offset_in_buffers`` as well as the size of each region. We also keep
# track of the number of samples put into the left/right child by each
# thread. Concretely:
# - left_counts = [4, 2, 6, 3]
# - right_counts = [2, 4, 0, 3]
# 4. Finally, we put left/right_indices_buffer back into
# sample_indices, without any undefined entries, and the partition looks
# as expected:
# partition = [*************abefilmnopqrtuxcdghjksvw*****************]
# Note: we show left/right_indices_buffer here as being the same size as
# sample_indices for simplicity, but they are in fact the same size as
# partition.
X_binned = context.X_binned.T[split_info.feature_idx]
n_threads = numba.config.NUMBA_DEFAULT_NUM_THREADS
n_samples = sample_indices.shape[0]
# Note: we could probably allocate all the arrays of size n_threads in the
# splitting context as well, but gains are probably going to be minimal
sizes = np.full(n_threads, n_samples // n_threads, dtype=np.int32)
if n_samples % n_threads > 0:
# array[:0] will cause a bug in numba 0.41, so we need the if. Remove
# once numba issue 3554 is fixed.
sizes[:n_samples % n_threads] += 1
offset_in_buffers = np.zeros(n_threads, dtype=np.int32)
offset_in_buffers[1:] = np.cumsum(sizes[:-1])
left_counts = np.empty(n_threads, dtype=np.int32)
right_counts = np.empty(n_threads, dtype=np.int32)
# Need to declare local variables, else they're not updated :/
# (see numba issue 3459)
left_indices_buffer = context.left_indices_buffer
right_indices_buffer = context.right_indices_buffer
# map indices from sample_indices to left/right_indices_buffer
for thread_idx in prange(n_threads):
left_count = 0
right_count = 0
start = offset_in_buffers[thread_idx]
stop = start + sizes[thread_idx]
for i in range(start, stop):
sample_idx = sample_indices[i]
if X_binned[sample_idx] <= split_info.bin_idx:
left_indices_buffer[start + left_count] = sample_idx
left_count += 1
else:
right_indices_buffer[start + right_count] = sample_idx
right_count += 1
left_counts[thread_idx] = left_count
right_counts[thread_idx] = right_count
# position of right child = just after the left child
right_child_position = left_counts.sum()
# offset of each thread in sample_indices for left and right child, i.e.
# where each thread will start to write.
left_offset = np.zeros(n_threads, dtype=np.int32)
left_offset[1:] = np.cumsum(left_counts[:-1])
right_offset = np.full(n_threads, right_child_position, dtype=np.int32)
right_offset[1:] += np.cumsum(right_counts[:-1])
# map indices in left/right_indices_buffer back into sample_indices. This
# also updates context.partition since sample_indices is a view.
for thread_idx in prange(n_threads):
for i in range(left_counts[thread_idx]):
sample_indices[left_offset[thread_idx] + i] = \
left_indices_buffer[offset_in_buffers[thread_idx] + i]
for i in range(right_counts[thread_idx]):
sample_indices[right_offset[thread_idx] + i] = \
right_indices_buffer[offset_in_buffers[thread_idx] + i]
return (sample_indices[:right_child_position],
sample_indices[right_child_position:])
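# Illustration (sketch, not part of the original module): a single-threaded
# equivalent of split_indices(). The parallel version above computes the
# same stable partition; the extra bookkeeping (per-thread buffers, counts
# and offsets) only exists so that each thread can write to a disjoint
# region without synchronization.
def _split_indices_sketch(X_binned_feature, bin_idx, sample_indices):
    left = [i for i in sample_indices if X_binned_feature[i] <= bin_idx]
    right = [i for i in sample_indices if X_binned_feature[i] > bin_idx]
    sample_indices[:len(left)] = left    # left child first...
    sample_indices[len(left):] = right   # ...right child second
    return sample_indices[:len(left)], sample_indices[len(left):]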
@njit(parallel=True)
def find_node_split(context, sample_indices):
"""For each feature, find the best bin to split on at a given node.
Returns the best split info among all features, and the histograms of
all the features. The histograms are computed by scanning the whole
data.
Parameters
----------
context : SplittingContext
The splitting context
sample_indices : array of int
The indices of the samples at the node to split.
Returns
-------
best_split_info : SplitInfo
The info about the best possible split among all features.
histograms : array of HISTOGRAM_DTYPE, shape=(n_features, max_bins)
The histograms of each feature. A histogram is an array of
HISTOGRAM_DTYPE of size ``max_bins`` (only
``n_bins_per_features[feature]`` entries are relevant).
"""
ctx = context # shorter name to avoid various line breaks
n_samples = sample_indices.shape[0]
# Need to declare local variables, else they're not updated
# (see numba issue 3459)
ordered_gradients = ctx.ordered_gradients
ordered_hessians = ctx.ordered_hessians
# Populate ordered_gradients and ordered_hessians. (Already done for root)
# Ordering the gradients and hessians improves the cache hit rate.
# This is a parallelized version of the following vanilla code:
# for i in range(n_samples):
# ctx.ordered_gradients[i] = ctx.gradients[sample_indices[i]]
if sample_indices.shape[0] != ctx.gradients.shape[0]:
starts, ends, n_threads = get_threads_chunks(n_samples)
if ctx.constant_hessian:
for thread_idx in prange(n_threads):
for i in range(starts[thread_idx], ends[thread_idx]):
ordered_gradients[i] = ctx.gradients[sample_indices[i]]
else:
for thread_idx in prange(n_threads):
for i in range(starts[thread_idx], ends[thread_idx]):
ordered_gradients[i] = ctx.gradients[sample_indices[i]]
ordered_hessians[i] = ctx.hessians[sample_indices[i]]
ctx.sum_gradients = ctx.ordered_gradients[:n_samples].sum()
if ctx.constant_hessian:
ctx.sum_hessians = ctx.constant_hessian_value * float32(n_samples)
else:
ctx.sum_hessians = ctx.ordered_hessians[:n_samples].sum()
# Pre-allocate the results datastructure to be able to use prange:
# numba jitclasses do not seem to properly support default values for kwargs.
split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
for i in range(context.n_features)]
histograms = np.empty(
shape=(np.int64(context.n_features), np.int64(context.max_bins)),
dtype=HISTOGRAM_DTYPE
)
for feature_idx in prange(context.n_features):
split_info, histogram = _find_histogram_split(
context, feature_idx, sample_indices)
split_infos[feature_idx] = split_info
histograms[feature_idx, :] = histogram
split_info = _find_best_feature_to_split_helper(split_infos)
return split_info, histograms
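# Note (illustration, not part of the original module): the ordering step in
# find_node_split() is just a gather, equivalent to the vectorized
#     ctx.ordered_gradients[:n_samples] = ctx.gradients[sample_indices]
# Writing it as an explicit prange loop parallelizes the gather, and the
# histogram kernels can then read the gradients sequentially instead of
# through an indirection, which is where the cache benefit comes from.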
@njit(parallel=True)
def find_node_split_subtraction(context, sample_indices, parent_histograms,
sibling_histograms):
"""For each feature, find the best bin to split on at a given node.
Returns the best split info among all features, and the histograms of
all the features.
This does the same job as ``find_node_split()`` but uses the histograms
of the parent and sibling of the node to split. This makes it possible to
use the identity ``histogram(parent) = histogram(node) - histogram(sibling)``,
which is significantly faster than computing the histograms from data.
Returns the best SplitInfo among all features, along with all the feature
histograms that can later be used to compute the sibling or children
histograms by subtraction.
Parameters
----------
context : SplittingContext
The splitting context
sample_indices : array of int
The indices of the samples at the node to split.
parent_histograms : array of HISTOGRAM_DTYPE of shape(n_features, max_bins)
The histograms of the parent
sibling_histograms : array of HISTOGRAM_DTYPE of \
shape(n_features, max_bins)
The histograms of the sibling
Returns
-------
best_split_info : SplitInfo
The info about the best possible split among all features.
histograms : array of HISTOGRAM_DTYPE, shape=(n_features, max_bins)
The histograms of each feature. A histogram is an array of
HISTOGRAM_DTYPE of size ``max_bins`` (only
``n_bins_per_features[feature]`` entries are relevant).
"""
# We can pick any feature (here the first) in the histograms to
# compute the gradients: they must be the same across all features
# anyway, we have tests ensuring this. Maybe a more robust way would
# be to compute an average but it's probably not worth it.
context.sum_gradients = (parent_histograms[0]['sum_gradients'].sum() -
sibling_histograms[0]['sum_gradients'].sum())
n_samples = sample_indices.shape[0]
if context.constant_hessian:
context.sum_hessians = \
context.constant_hessian_value * float32(n_samples)
else:
context.sum_hessians = (parent_histograms[0]['sum_hessians'].sum() -
sibling_histograms[0]['sum_hessians'].sum())
# Pre-allocate the results datastructure to be able to use prange
split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
for i in range(context.n_features)]
histograms = np.empty(
shape=(np.int64(context.n_features), np.int64(context.max_bins)),
dtype=HISTOGRAM_DTYPE
)
for feature_idx in prange(context.n_features):
split_info, histogram = _find_histogram_split_subtraction(
context, feature_idx, parent_histograms,
sibling_histograms, n_samples)
split_infos[feature_idx] = split_info
histograms[feature_idx, :] = histogram
split_info = _find_best_feature_to_split_helper(split_infos)
return split_info, histograms
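# Illustration (sketch, not part of the original module): the subtraction
# identity for a single feature, with plain numpy structured arrays of
# HISTOGRAM_DTYPE. Since hist(parent) = hist(left) + hist(right) bin by
# bin, one child's histogram is recovered from the other one for free:
def _sibling_histogram_sketch(parent_hist, left_hist):
    right_hist = np.empty_like(parent_hist)
    for field in ('sum_gradients', 'sum_hessians', 'count'):
        right_hist[field] = parent_hist[field] - left_hist[field]
    return right_hist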
@njit
def _find_best_feature_to_split_helper(split_infos):
best_gain = None
for i, split_info in enumerate(split_infos):
gain = split_info.gain
if best_gain is None or gain > best_gain:
best_gain = gain
best_split_info = split_info
return best_split_info
@njit(fastmath=True)
def _find_histogram_split(context, feature_idx, sample_indices):
"""Compute the histogram for a given feature
Returns the best SplitInfo among all the possible bins of the feature.
"""
n_samples = sample_indices.shape[0]
X_binned = context.X_binned.T[feature_idx]
root_node = X_binned.shape[0] == n_samples
ordered_gradients = context.ordered_gradients[:n_samples]
ordered_hessians = context.ordered_hessians[:n_samples]
if root_node:
if context.constant_hessian:
histogram = _build_histogram_root_no_hessian(
context.max_bins, X_binned, ordered_gradients)
else:
histogram = _build_histogram_root(
context.max_bins, X_binned, ordered_gradients,
context.ordered_hessians)
else:
if context.constant_hessian:
histogram = _build_histogram_no_hessian(
context.max_bins, sample_indices, X_binned,
ordered_gradients)
else:
histogram = _build_histogram(
context.max_bins, sample_indices, X_binned,
ordered_gradients, ordered_hessians)
return _find_best_bin_to_split_helper(context, feature_idx, histogram,
n_samples)
@njit(fastmath=True)
def _find_histogram_split_subtraction(context, feature_idx,
parent_histograms, sibling_histograms,
n_samples):
"""Compute the histogram by substraction of parent and sibling
Uses the identity: hist(parent) = hist(left) + hist(right).
Returns the best SplitInfo among all the possible bins of the feature.
"""
histogram = _subtract_histograms(
context.max_bins,
parent_histograms[feature_idx], sibling_histograms[feature_idx])
return _find_best_bin_to_split_helper(context, feature_idx, histogram,
n_samples)
@njit(locals={'gradient_left': float32, 'hessian_left': float32,
'n_samples_left': uint32},
fastmath=True)
def _find_best_bin_to_split_helper(context, feature_idx, histogram, n_samples):
"""Find best bin to split on, and return the corresponding SplitInfo.
Splits that do not satisfy the splitting constraints (min_gain_to_split,
etc.) are discarded here. If no split can satisfy the constraints, a
SplitInfo with a gain of -1 is returned. If for a given node the best
SplitInfo has a gain of -1, it is finalized into a leaf.
"""
# Allocate the structure for the best split information. It can be
# returned as such (with a negative gain) if the min_hessian_to_split
# condition is not satisfied. Such invalid splits are later discarded by
# the TreeGrower.
best_split = SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
gradient_left, hessian_left = 0., 0.
n_samples_left = 0
for bin_idx in range(context.n_bins_per_feature[feature_idx]):
n_samples_left += histogram[bin_idx]['count']
n_samples_right = n_samples - n_samples_left
if context.constant_hessian:
hessian_left += (histogram[bin_idx]['count']
* context.constant_hessian_value)
else:
hessian_left += histogram[bin_idx]['sum_hessians']
hessian_right = context.sum_hessians - hessian_left
gradient_left += histogram[bin_idx]['sum_gradients']
gradient_right = context.sum_gradients - gradient_left
if n_samples_left < context.min_samples_leaf:
continue
if n_samples_right < context.min_samples_leaf:
# won't get any better
break
if hessian_left < context.min_hessian_to_split:
continue
if hessian_right < context.min_hessian_to_split:
# won't get any better (hessians are > 0 since loss is convex)
break
gain = _split_gain(gradient_left, hessian_left,
gradient_right, hessian_right,
context.sum_gradients, context.sum_hessians,
context.l2_regularization)
if gain > best_split.gain and gain > context.min_gain_to_split:
best_split.gain = gain
best_split.feature_idx = feature_idx
best_split.bin_idx = bin_idx
best_split.gradient_left = gradient_left
best_split.hessian_left = hessian_left
best_split.n_samples_left = n_samples_left
best_split.gradient_right = gradient_right
best_split.hessian_right = hessian_right
best_split.n_samples_right = n_samples_right
return best_split, histogram
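# Note (illustration, not part of the original module): the scan above only
# accumulates left-child statistics; the right-child ones are derived from
# the node totals. E.g. with sum_gradients = 10 and per-bin gradient sums
# [4, 3, 2, 1], splitting after bin 1 gives gradient_left = 4 + 3 = 7 and
# gradient_right = 10 - 7 = 3, so evaluating each candidate bin is O(1)
# rather than requiring a second pass over the bins.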
@njit(fastmath=False)
def _split_gain(gradient_left, hessian_left, gradient_right, hessian_right,
sum_gradients, sum_hessians, l2_regularization):
"""Loss reduction
Compute the reduction in loss after taking a split, compared to keeping
the node a leaf of the tree.
See Equation 7 of:
XGBoost: A Scalable Tree Boosting System, T. Chen, C. Guestrin, 2016
https://arxiv.org/abs/1603.02754
"""
def negative_loss(gradient, hessian):
return (gradient ** 2) / (hessian + l2_regularization)
gain = negative_loss(gradient_left, hessian_left)
gain += negative_loss(gradient_right, hessian_right)
gain -= negative_loss(sum_gradients, sum_hessians)
return gain
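# Note (illustration, not part of the original module): a quick numeric
# check of _split_gain with l2_regularization = 0. With
# gradient_left = -2, hessian_left = 4, gradient_right = -6,
# hessian_right = 6 (hence sum_gradients = -8, sum_hessians = 10):
#     gain = (-2)**2 / 4 + (-6)**2 / 6 - (-8)**2 / 10
#          = 1.0 + 6.0 - 6.4 = 0.6
# A positive gain means splitting reduces the loss compared to keeping the
# node as a leaf.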
```