"""
This module contains the TreeGrower class.
TreeGrowee builds a regression tree fitting a Newton-Raphson step, based on
the gradients and hessians of the training data.
"""
from heapq import heappush, heappop
import numpy as np
from time import time
from .splitting import (SplittingContext, split_indices, find_node_split,
find_node_split_subtraction)
from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE
[docs]class TreeNode:
"""Tree Node class used in TreeGrower.
This isn't used for prediction purposes, only for training (see
TreePredictor).
Parameters
----------
depth : int
The depth of the node, i.e. its distance from the root
samples_indices : array of int
The indices of the samples at the node
sum_gradients : float
The sum of the gradients of the samples at the node
sum_hessians : float
The sum of the hessians of the samples at the node
parent : TreeNode or None, optional(default=None)
The parent of the node. None for root.
Attributes
----------
depth : int
The depth of the node, i.e. its distance from the root
samples_indices : array of int
The indices of the samples at the node
sum_gradients : float
The sum of the gradients of the samples at the node
sum_hessians : float
The sum of the hessians of the samples at the node
parent : TreeNode or None, optional(default=None)
The parent of the node. None for root.
split_info : SplitInfo or None
The result of the split evaluation
left_child : TreeNode or None
The left child of the node. None for leaves.
right_child : TreeNode or None
The right child of the node. None for leaves.
value : float or None
The value of the leaf, as computed in finalize_leaf(). None for
non-leaf nodes
find_split_time : float
The total time spent computing the histogram and finding the best
split at the node.
construction_speed : float
The Number of samples at the node divided find_split_time.
apply_split_time : float
The total time spent actually splitting the node, e.g. splitting
samples_indices into left and right child.
hist_subtraction : bool
Wheter the subtraction method was used for computing the histograms.
"""
split_info = None
left_child = None
right_child = None
value = None
histograms = None
sibling = None
parent = None
find_split_time = 0.
construction_speed = 0.
apply_split_time = 0.
hist_subtraction = False
def __init__(self, depth, sample_indices, sum_gradients,
sum_hessians, parent=None):
self.depth = depth
self.sample_indices = sample_indices
self.n_samples = sample_indices.shape[0]
self.sum_gradients = sum_gradients
self.sum_hessians = sum_hessians
self.parent = parent
def __repr__(self):
# To help with debugging
out = f"TreeNode: depth={self.depth}, "
out += f"samples={len(self.sample_indices)}"
if self.split_info is not None:
out += f", feature_idx={self.split_info.feature_idx}"
out += f", bin_idx={self.split_info.bin_idx}"
return out
def __lt__(self, other_node):
"""Comparison for priority queue.
Nodes with high gain are higher priority than nodes with low gain.
heapq.heappush only need the '<' operator.
heapq.heappop take the smallest item first (smaller is higher
priority).
Parameters
-----------
other_node : TreeNode
The node to compare with.
"""
if self.split_info is None or other_node.split_info is None:
raise ValueError("Cannot compare nodes with split_info")
return self.split_info.gain > other_node.split_info.gain
[docs]class TreeGrower:
"""Tree grower class used to build a tree.
The tree is fitted to predict the values of a Newton-Raphson step. The
splits are considered in a best-first fashion, and the quality of a
split is defined in splitting._split_gain.
Parameters
----------
X_binned : array-like of int, shape=(n_samples, n_features)
The binned input samples. Must be Fortran-aligned.
gradients : array-like, shape=(n_samples,)
The gradients of each training sample. Those are the gradients of the
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
hessians : array-like, shape=(n_samples,)
The hessians of each training sample. Those are the hessians of the
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
max_leaf_nodes : int or None, optional(default=None)
The maximum number of leaves for each tree. If None, there is no
maximum limit.
max_depth : int or None, optional(default=None)
The maximum depth of each tree. The depth of a tree is the number of
nodes to go from the root to the deepest leaf.
min_samples_leaf : int, optional(default=20)
The minimum number of samples per leaf.
min_gain_to_split : float, optional(default=0.)
The minimum gain needed to split a node. Splits with lower gain will
be ignored.
max_bins : int, optional(default=256)
The maximum number of bins. Used to define the shape of the
histograms.
n_bins_per_feature : array-like of int or int, optional(default=None)
The actual number of bins needed for each feature, which is lower or
equal to ``max_bins``. If it's an int, all features are considered to
have the same number of bins. If None, all features are considered to
have ``max_bins`` bins.
l2_regularization : float, optional(default=0)
The L2 regularization parameter.
min_hessian_to_split : float, optional(default=1e-3)
The minimum sum of hessians needed in each node. Splits that result in
at least one child having a sum of hessians less than
min_hessian_to_split are discarded.
shrinkage : float, optional(default=1)
The shrinkage parameter to apply to the leaves values, also known as
learning rate.
"""
def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None,
max_depth=None, min_samples_leaf=20, min_gain_to_split=0.,
max_bins=256, n_bins_per_feature=None, l2_regularization=0.,
min_hessian_to_split=1e-3, shrinkage=1.):
self._validate_parameters(X_binned, max_leaf_nodes, max_depth,
min_samples_leaf, min_gain_to_split,
l2_regularization, min_hessian_to_split)
if n_bins_per_feature is None:
n_bins_per_feature = max_bins
if isinstance(n_bins_per_feature, int):
n_bins_per_feature = np.array(
[n_bins_per_feature] * X_binned.shape[1],
dtype=np.uint32)
self.splitting_context = SplittingContext(
X_binned, max_bins, n_bins_per_feature, gradients,
hessians, l2_regularization, min_hessian_to_split,
min_samples_leaf, min_gain_to_split)
self.max_leaf_nodes = max_leaf_nodes
self.max_depth = max_depth
self.min_samples_leaf = min_samples_leaf
self.X_binned = X_binned
self.min_gain_to_split = min_gain_to_split
self.shrinkage = shrinkage
self.splittable_nodes = []
self.finalized_leaves = []
self.total_find_split_time = 0. # time spent finding the best splits
self.total_apply_split_time = 0. # time spent splitting nodes
self._intilialize_root()
self.n_nodes = 1
def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth,
min_samples_leaf, min_gain_to_split,
l2_regularization, min_hessian_to_split):
"""Validate parameters passed to __init__.
Also validate parameters passed to SplittingContext because we cannot
raise exceptions in a jitclass.
"""
if X_binned.dtype != np.uint8:
raise NotImplementedError(
"Explicit feature binning required for now")
if not X_binned.flags.f_contiguous:
raise ValueError(
"X_binned should be passed as Fortran contiguous "
"array for maximum efficiency.")
if max_leaf_nodes is not None and max_leaf_nodes < 1:
raise ValueError(f'max_leaf_nodes={max_leaf_nodes} should not be'
f' smaller than 1')
if max_depth is not None and max_depth < 1:
raise ValueError(f'max_depth={max_depth} should not be'
f' smaller than 1')
if min_samples_leaf < 1:
raise ValueError(f'min_samples_leaf={min_samples_leaf} should '
f'not be smaller than 1')
if min_gain_to_split < 0:
raise ValueError(f'min_gain_to_split={min_gain_to_split} '
f'must be positive.')
if l2_regularization < 0:
raise ValueError(f'l2_regularization={l2_regularization} must be '
f'positive.')
if min_hessian_to_split < 0:
raise ValueError(f'min_hessian_to_split={min_hessian_to_split} '
f'must be positive.')
[docs] def grow(self):
"""Grow the tree, from root to leaves."""
while self.can_split_further():
self.split_next()
def _intilialize_root(self):
"""Initialize root node and finalize it if needed."""
n_samples = self.X_binned.shape[0]
depth = 0
if self.splitting_context.constant_hessian:
hessian = self.splitting_context.hessians[0] * n_samples
else:
hessian = self.splitting_context.hessians.sum()
self.root = TreeNode(
depth=depth,
sample_indices=self.splitting_context.partition.view(),
sum_gradients=self.splitting_context.gradients.sum(),
sum_hessians=hessian
)
if (self.max_leaf_nodes is not None and self.max_leaf_nodes == 1):
self._finalize_leaf(self.root)
return
if self.root.n_samples < 2 * self.min_samples_leaf:
# Do not even bother computing any splitting statistics.
self._finalize_leaf(self.root)
return
self._compute_spittability(self.root)
def _compute_spittability(self, node, only_hist=False):
"""Compute histograms and best possible split of a node.
If the best possible gain is 0 of if the constraints aren't met
(min_samples_leaf, min_hessian_to_split, min_gain_to_split) then the
node is finalized (transformed into a leaf), else it is pushed on
the splittable node heap.
Parameters
----------
node : TreeNode
The node to evaluate.
only_hist : bool, optional (default=False)
Whether to only compute the histograms and the SplitInfo. It is
set to ``True`` when ``_compute_spittability`` was called by a
sibling node: we only want to compute the histograms (which also
computes the ``SplitInfo``), not finalize or push the node. If
``_compute_spittability`` is called again by the grower on this
same node, the histograms won't be computed again.
"""
# Compute split_info and histograms if not already done
if node.split_info is None and node.histograms is None:
# If the sibling has less samples, compute its hist first (with
# the regular method) and use the subtraction method for the
# current node
if node.sibling is not None: # root has no sibling
if node.sibling.n_samples < node.n_samples:
self._compute_spittability(node.sibling, only_hist=True)
# As hist of sibling is now computed we'll use the hist
# subtraction method for the current node.
node.hist_subtraction = True
tic = time()
if node.hist_subtraction:
split_info, histograms = find_node_split_subtraction(
self.splitting_context, node.sample_indices,
node.parent.histograms, node.sibling.histograms)
else:
split_info, histograms = find_node_split(
self.splitting_context, node.sample_indices)
toc = time()
node.find_split_time = toc - tic
self.total_find_split_time += node.find_split_time
node.construction_speed = node.n_samples / node.find_split_time
node.split_info = split_info
node.histograms = histograms
if only_hist:
# _compute_spittability was called by a sibling. We only needed to
# compute the histogram.
return
if node.split_info.gain <= 0: # no valid split
# Note: this condition is reached if either all the leaves are
# pure (best gain = 0), or if no split would satisfy the
# constraints, (min_hessians_to_split, min_gain_to_split,
# min_samples_leaf)
self._finalize_leaf(node)
else:
heappush(self.splittable_nodes, node)
[docs] def split_next(self):
"""Split the node with highest potential gain.
Returns
-------
left : TreeNode
The resulting left child.
right : TreeNode
The resulting right child.
"""
if len(self.splittable_nodes) == 0:
raise StopIteration("No more splittable nodes")
# Consider the node with the highest loss reduction (a.k.a. gain)
node = heappop(self.splittable_nodes)
tic = time()
(sample_indices_left, sample_indices_right) = split_indices(
self.splitting_context, node.split_info, node.sample_indices)
toc = time()
node.apply_split_time = toc - tic
self.total_apply_split_time += node.apply_split_time
depth = node.depth + 1
n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes)
n_leaf_nodes += 2
left_child_node = TreeNode(depth,
sample_indices_left,
node.split_info.gradient_left,
node.split_info.hessian_left,
parent=node)
right_child_node = TreeNode(depth,
sample_indices_right,
node.split_info.gradient_right,
node.split_info.hessian_right,
parent=node)
left_child_node.sibling = right_child_node
right_child_node.sibling = left_child_node
node.right_child = right_child_node
node.left_child = left_child_node
self.n_nodes += 2
if self.max_depth is not None and depth == self.max_depth:
self._finalize_leaf(left_child_node)
self._finalize_leaf(right_child_node)
return left_child_node, right_child_node
if (self.max_leaf_nodes is not None
and n_leaf_nodes == self.max_leaf_nodes):
self._finalize_leaf(left_child_node)
self._finalize_leaf(right_child_node)
self._finalize_splittable_nodes()
return left_child_node, right_child_node
if left_child_node.n_samples < self.min_samples_leaf * 2:
self._finalize_leaf(left_child_node)
else:
self._compute_spittability(left_child_node)
if right_child_node.n_samples < self.min_samples_leaf * 2:
self._finalize_leaf(right_child_node)
else:
self._compute_spittability(right_child_node)
return left_child_node, right_child_node
[docs] def can_split_further(self):
"""Return True if there are still nodes to split."""
return len(self.splittable_nodes) >= 1
def _finalize_leaf(self, node):
"""Compute the prediction value that minimizes the objective function.
This sets the node.value attribute (node is a leaf iff node.value is
not None).
See Equation 5 of:
XGBoost: A Scalable Tree Boosting System, T. Chen, C. Guestrin, 2016
https://arxiv.org/abs/1603.02754
"""
node.value = -self.shrinkage * node.sum_gradients / (
node.sum_hessians + self.splitting_context.l2_regularization)
self.finalized_leaves.append(node)
def _finalize_splittable_nodes(self):
"""Transform all splittable nodes into leaves.
Used when some constraint is met e.g. maximum number of leaves or
maximum depth."""
while len(self.splittable_nodes) > 0:
node = self.splittable_nodes.pop()
self._finalize_leaf(node)
[docs] def make_predictor(self, bin_thresholds=None):
"""Make a TreePredictor object out of the current tree.
Parameters
----------
bin_thresholds : array-like of floats, optional (default=None)
The actual thresholds values of each bin.
Returns
-------
A TreePredictor object.
"""
predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE)
self._fill_predictor_node_array(predictor_nodes, self.root,
bin_thresholds=bin_thresholds)
return TreePredictor(predictor_nodes)
def _fill_predictor_node_array(self, predictor_nodes, grower_node,
bin_thresholds=None, next_free_idx=0):
"""Helper used in make_predictor to set the TreePredictor fields."""
node = predictor_nodes[next_free_idx]
node['count'] = grower_node.n_samples
node['depth'] = grower_node.depth
if grower_node.split_info is not None:
node['gain'] = grower_node.split_info.gain
else:
node['gain'] = -1
if grower_node.value is not None:
# Leaf node
node['is_leaf'] = True
node['value'] = grower_node.value
return next_free_idx + 1
else:
# Decision node
split_info = grower_node.split_info
feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx
node['feature_idx'] = feature_idx
node['bin_threshold'] = bin_idx
if bin_thresholds is not None:
threshold = bin_thresholds[feature_idx][bin_idx]
node['threshold'] = threshold
next_free_idx += 1
node['left'] = next_free_idx
next_free_idx = self._fill_predictor_node_array(
predictor_nodes, grower_node.left_child,
bin_thresholds=bin_thresholds, next_free_idx=next_free_idx)
node['right'] = next_free_idx
return self._fill_predictor_node_array(
predictor_nodes, grower_node.right_child,
bin_thresholds=bin_thresholds, next_free_idx=next_free_idx)