Source code for pygbm.grower

"""
This module contains the TreeGrower class.

TreeGrower builds a regression tree fitting a Newton-Raphson step, based on
the gradients and hessians of the training data.
"""
from heapq import heappush, heappop
from time import time

import numpy as np

from .splitting import (SplittingContext, split_indices, find_node_split,
                        find_node_split_subtraction)
from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE


class TreeNode:
    """Tree Node class used in TreeGrower.

    This isn't used for prediction purposes, only for training (see
    TreePredictor).

    Parameters
    ----------
    depth : int
        The depth of the node, i.e. its distance from the root
    sample_indices : array of int
        The indices of the samples at the node
    sum_gradients : float
        The sum of the gradients of the samples at the node
    sum_hessians : float
        The sum of the hessians of the samples at the node
    parent : TreeNode or None, optional(default=None)
        The parent of the node. None for root.

    Attributes
    ----------
    depth : int
        The depth of the node, i.e. its distance from the root
    sample_indices : array of int
        The indices of the samples at the node
    sum_gradients : float
        The sum of the gradients of the samples at the node
    sum_hessians : float
        The sum of the hessians of the samples at the node
    parent : TreeNode or None, optional(default=None)
        The parent of the node. None for root.
    split_info : SplitInfo or None
        The result of the split evaluation
    left_child : TreeNode or None
        The left child of the node. None for leaves.
    right_child : TreeNode or None
        The right child of the node. None for leaves.
    value : float or None
        The value of the leaf, as computed in finalize_leaf(). None for
        non-leaf nodes
    find_split_time : float
        The total time spent computing the histogram and finding the best
        split at the node.
    construction_speed : float
        The number of samples at the node divided by find_split_time.
    apply_split_time : float
        The total time spent actually splitting the node, e.g. splitting
        sample_indices into left and right child.
    hist_subtraction : bool
        Whether the subtraction method was used for computing the
        histograms.
    """

    split_info = None
    left_child = None
    right_child = None
    value = None
    histograms = None
    sibling = None
    parent = None
    find_split_time = 0.
    construction_speed = 0.
    apply_split_time = 0.
    hist_subtraction = False

    def __init__(self, depth, sample_indices, sum_gradients,
                 sum_hessians, parent=None):
        self.depth = depth
        self.sample_indices = sample_indices
        self.n_samples = sample_indices.shape[0]
        self.sum_gradients = sum_gradients
        self.sum_hessians = sum_hessians
        self.parent = parent

    def __repr__(self):
        # To help with debugging
        out = f"TreeNode: depth={self.depth}, "
        out += f"samples={len(self.sample_indices)}"
        if self.split_info is not None:
            out += f", feature_idx={self.split_info.feature_idx}"
            out += f", bin_idx={self.split_info.bin_idx}"
        return out

    def __lt__(self, other_node):
        """Comparison for priority queue.

        Nodes with high gain are higher priority than nodes with low gain.

        heapq.heappush only needs the '<' operator.
        heapq.heappop takes the smallest item first (smaller is higher
        priority).

        Parameters
        ----------
        other_node : TreeNode
            The node to compare with.
        """
        if self.split_info is None or other_node.split_info is None:
            raise ValueError("Cannot compare nodes whose split_info is None")
        return self.split_info.gain > other_node.split_info.gain
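
# Illustration of the inverted ``__lt__`` above: ``heapq`` maintains a
# min-heap, so defining '<' as "has a higher gain" makes ``heappop`` return
# the node with the *largest* gain first. A minimal sketch (hypothetical
# helper, not part of pygbm; SimpleNamespace stands in for SplitInfo):
def _example_node_heap():
    from types import SimpleNamespace

    heap = []
    for gain in [1., 3., 2.]:
        node = TreeNode(depth=0, sample_indices=np.arange(4),
                        sum_gradients=0., sum_hessians=0.)
        node.split_info = SimpleNamespace(gain=gain)
        heappush(heap, node)
    # Pops in decreasing gain order: [3., 2., 1.]
    return [heappop(heap).split_info.gain for _ in range(3)]
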
class TreeGrower:
    """Tree grower class used to build a tree.

    The tree is fitted to predict the values of a Newton-Raphson step. The
    splits are considered in a best-first fashion, and the quality of a
    split is defined in splitting._split_gain.

    Parameters
    ----------
    X_binned : array-like of int, shape=(n_samples, n_features)
        The binned input samples. Must be Fortran-aligned.
    gradients : array-like, shape=(n_samples,)
        The gradients of each training sample. Those are the gradients of
        the loss w.r.t the predictions, evaluated at iteration ``i - 1``.
    hessians : array-like, shape=(n_samples,)
        The hessians of each training sample. Those are the hessians of
        the loss w.r.t the predictions, evaluated at iteration ``i - 1``.
    max_leaf_nodes : int or None, optional(default=None)
        The maximum number of leaves for each tree. If None, there is no
        maximum limit.
    max_depth : int or None, optional(default=None)
        The maximum depth of each tree. The depth of a tree is the number
        of nodes to go from the root to the deepest leaf.
    min_samples_leaf : int, optional(default=20)
        The minimum number of samples per leaf.
    min_gain_to_split : float, optional(default=0.)
        The minimum gain needed to split a node. Splits with lower gain
        will be ignored.
    max_bins : int, optional(default=256)
        The maximum number of bins. Used to define the shape of the
        histograms.
    n_bins_per_feature : array-like of int or int, optional(default=None)
        The actual number of bins needed for each feature, which is lower
        or equal to ``max_bins``. If it's an int, all features are
        considered to have the same number of bins. If None, all features
        are considered to have ``max_bins`` bins.
    l2_regularization : float, optional(default=0)
        The L2 regularization parameter.
    min_hessian_to_split : float, optional(default=1e-3)
        The minimum sum of hessians needed in each node. Splits that
        result in at least one child having a sum of hessians less than
        min_hessian_to_split are discarded.
    shrinkage : float, optional(default=1)
        The shrinkage parameter to apply to the leaves values, also known
        as learning rate.
    """
    def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None,
                 max_depth=None, min_samples_leaf=20, min_gain_to_split=0.,
                 max_bins=256, n_bins_per_feature=None, l2_regularization=0.,
                 min_hessian_to_split=1e-3, shrinkage=1.):

        self._validate_parameters(X_binned, max_leaf_nodes, max_depth,
                                  min_samples_leaf, min_gain_to_split,
                                  l2_regularization, min_hessian_to_split)

        if n_bins_per_feature is None:
            n_bins_per_feature = max_bins

        if isinstance(n_bins_per_feature, int):
            n_bins_per_feature = np.array(
                [n_bins_per_feature] * X_binned.shape[1],
                dtype=np.uint32)

        self.splitting_context = SplittingContext(
            X_binned, max_bins, n_bins_per_feature, gradients,
            hessians, l2_regularization, min_hessian_to_split,
            min_samples_leaf, min_gain_to_split)
        self.max_leaf_nodes = max_leaf_nodes
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.X_binned = X_binned
        self.min_gain_to_split = min_gain_to_split
        self.shrinkage = shrinkage
        self.splittable_nodes = []
        self.finalized_leaves = []
        self.total_find_split_time = 0.  # time spent finding the best splits
        self.total_apply_split_time = 0.  # time spent splitting nodes
        self._initialize_root()
        self.n_nodes = 1

    def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth,
                             min_samples_leaf, min_gain_to_split,
                             l2_regularization, min_hessian_to_split):
        """Validate parameters passed to __init__.

        Also validate parameters passed to SplittingContext because we
        cannot raise exceptions in a jitclass.
""" if X_binned.dtype != np.uint8: raise NotImplementedError( "Explicit feature binning required for now") if not X_binned.flags.f_contiguous: raise ValueError( "X_binned should be passed as Fortran contiguous " "array for maximum efficiency.") if max_leaf_nodes is not None and max_leaf_nodes < 1: raise ValueError(f'max_leaf_nodes={max_leaf_nodes} should not be' f' smaller than 1') if max_depth is not None and max_depth < 1: raise ValueError(f'max_depth={max_depth} should not be' f' smaller than 1') if min_samples_leaf < 1: raise ValueError(f'min_samples_leaf={min_samples_leaf} should ' f'not be smaller than 1') if min_gain_to_split < 0: raise ValueError(f'min_gain_to_split={min_gain_to_split} ' f'must be positive.') if l2_regularization < 0: raise ValueError(f'l2_regularization={l2_regularization} must be ' f'positive.') if min_hessian_to_split < 0: raise ValueError(f'min_hessian_to_split={min_hessian_to_split} ' f'must be positive.')
    def grow(self):
        """Grow the tree, from root to leaves."""
        while self.can_split_further():
            self.split_next()
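
    # End-to-end sketch of growing one tree for a least-squares objective,
    # where gradients are the negative residuals and the hessian is
    # constant. Hypothetical helper, not part of pygbm's API; it assumes
    # this module's convention (see _initialize_root below) that a size-one
    # float32 hessians array denotes a constant hessian:
    @staticmethod
    def _example_grow_and_predict():
        rng = np.random.RandomState(0)
        X_binned = np.asfortranarray(
            rng.randint(0, 256, size=(1000, 3)), dtype=np.uint8)
        y = rng.normal(size=1000).astype(np.float32)
        gradients = -y  # gradient of 0.5 * (pred - y) ** 2 at pred = 0
        hessians = np.ones(1, dtype=np.float32)  # constant hessian
        grower = TreeGrower(X_binned, gradients, hessians, max_leaf_nodes=31)
        grower.grow()
        return grower.make_predictor()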
    def _initialize_root(self):
        """Initialize root node and finalize it if needed."""
        n_samples = self.X_binned.shape[0]
        depth = 0
        if self.splitting_context.constant_hessian:
            hessian = self.splitting_context.hessians[0] * n_samples
        else:
            hessian = self.splitting_context.hessians.sum()
        self.root = TreeNode(
            depth=depth,
            sample_indices=self.splitting_context.partition.view(),
            sum_gradients=self.splitting_context.gradients.sum(),
            sum_hessians=hessian
        )
        if self.max_leaf_nodes is not None and self.max_leaf_nodes == 1:
            self._finalize_leaf(self.root)
            return
        if self.root.n_samples < 2 * self.min_samples_leaf:
            # Do not even bother computing any splitting statistics.
            self._finalize_leaf(self.root)
            return
        self._compute_splittability(self.root)

    def _compute_splittability(self, node, only_hist=False):
        """Compute histograms and best possible split of a node.

        If the best possible gain is 0, or if the constraints aren't met
        (min_samples_leaf, min_hessian_to_split, min_gain_to_split), then
        the node is finalized (transformed into a leaf), else it is pushed
        on the splittable node heap.

        Parameters
        ----------
        node : TreeNode
            The node to evaluate.
        only_hist : bool, optional (default=False)
            Whether to only compute the histograms and the SplitInfo. It is
            set to ``True`` when ``_compute_splittability`` was called by a
            sibling node: we only want to compute the histograms (which also
            computes the ``SplitInfo``), not finalize or push the node. If
            ``_compute_splittability`` is called again by the grower on this
            same node, the histograms won't be computed again.
        """
        # Compute split_info and histograms if not already done
        if node.split_info is None and node.histograms is None:
            # If the sibling has fewer samples, compute its histograms first
            # (with the regular method) and use the subtraction method for
            # the current node
            if node.sibling is not None:  # root has no sibling
                if node.sibling.n_samples < node.n_samples:
                    self._compute_splittability(node.sibling, only_hist=True)
                    # As the sibling's histograms are now computed, we'll use
                    # the histogram subtraction method for the current node.
                    node.hist_subtraction = True
            tic = time()
            if node.hist_subtraction:
                split_info, histograms = find_node_split_subtraction(
                    self.splitting_context, node.sample_indices,
                    node.parent.histograms, node.sibling.histograms)
            else:
                split_info, histograms = find_node_split(
                    self.splitting_context, node.sample_indices)
            toc = time()
            node.find_split_time = toc - tic
            self.total_find_split_time += node.find_split_time
            node.construction_speed = node.n_samples / node.find_split_time
            node.split_info = split_info
            node.histograms = histograms

        if only_hist:
            # _compute_splittability was called by a sibling. We only needed
            # to compute the histograms.
            return

        if node.split_info.gain <= 0:  # no valid split
            # Note: this condition is reached if either all the leaves are
            # pure (best gain = 0), or if no split would satisfy the
            # constraints (min_hessian_to_split, min_gain_to_split,
            # min_samples_leaf).
            self._finalize_leaf(node)
        else:
            heappush(self.splittable_nodes, node)
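
    # The subtraction method above exploits the identity
    # hist(parent) = hist(left) + hist(right): the larger child's histograms
    # can be derived from the parent's and the sibling's without re-scanning
    # its samples. A numpy sketch of this identity for one feature, using
    # bin counts only (hypothetical helper; the real kernels, which also
    # accumulate gradients and hessians per bin, live in pygbm.splitting):
    @staticmethod
    def _example_histogram_subtraction():
        rng = np.random.RandomState(0)
        binned_feature = rng.randint(0, 256, size=1000)
        goes_left = rng.rand(1000) < 0.3  # some left/right partition
        hist_parent = np.bincount(binned_feature, minlength=256)
        hist_left = np.bincount(binned_feature[goes_left], minlength=256)
        hist_right = hist_parent - hist_left  # subtraction method
        expected = np.bincount(binned_feature[~goes_left], minlength=256)
        assert (hist_right == expected).all()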
    def split_next(self):
        """Split the node with highest potential gain.

        Returns
        -------
        left : TreeNode
            The resulting left child.
        right : TreeNode
            The resulting right child.
        """
        if len(self.splittable_nodes) == 0:
            raise StopIteration("No more splittable nodes")

        # Consider the node with the highest loss reduction (a.k.a. gain)
        node = heappop(self.splittable_nodes)

        tic = time()
        (sample_indices_left, sample_indices_right) = split_indices(
            self.splitting_context, node.split_info, node.sample_indices)
        toc = time()
        node.apply_split_time = toc - tic
        self.total_apply_split_time += node.apply_split_time

        depth = node.depth + 1
        n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes)
        n_leaf_nodes += 2

        left_child_node = TreeNode(depth,
                                   sample_indices_left,
                                   node.split_info.gradient_left,
                                   node.split_info.hessian_left,
                                   parent=node)
        right_child_node = TreeNode(depth,
                                    sample_indices_right,
                                    node.split_info.gradient_right,
                                    node.split_info.hessian_right,
                                    parent=node)
        left_child_node.sibling = right_child_node
        right_child_node.sibling = left_child_node
        node.right_child = right_child_node
        node.left_child = left_child_node
        self.n_nodes += 2

        if self.max_depth is not None and depth == self.max_depth:
            self._finalize_leaf(left_child_node)
            self._finalize_leaf(right_child_node)
            return left_child_node, right_child_node

        if (self.max_leaf_nodes is not None
                and n_leaf_nodes == self.max_leaf_nodes):
            self._finalize_leaf(left_child_node)
            self._finalize_leaf(right_child_node)
            self._finalize_splittable_nodes()
            return left_child_node, right_child_node

        if left_child_node.n_samples < self.min_samples_leaf * 2:
            self._finalize_leaf(left_child_node)
        else:
            self._compute_splittability(left_child_node)
        if right_child_node.n_samples < self.min_samples_leaf * 2:
            self._finalize_leaf(right_child_node)
        else:
            self._compute_splittability(right_child_node)

        return left_child_node, right_child_node
    def can_split_further(self):
        """Return True if there are still nodes to split."""
        return len(self.splittable_nodes) >= 1
    def _finalize_leaf(self, node):
        """Compute the prediction value that minimizes the objective
        function.

        This sets the node.value attribute (node is a leaf iff node.value
        is not None).

        See Equation 5 of:
        XGBoost: A Scalable Tree Boosting System, T. Chen, C. Guestrin, 2016
        https://arxiv.org/abs/1603.02754
        """
        node.value = -self.shrinkage * node.sum_gradients / (
            node.sum_hessians + self.splitting_context.l2_regularization)
        self.finalized_leaves.append(node)

    def _finalize_splittable_nodes(self):
        """Transform all splittable nodes into leaves.

        Used when some constraint is met, e.g. maximum number of leaves or
        maximum depth."""
        while len(self.splittable_nodes) > 0:
            node = self.splittable_nodes.pop()
            self._finalize_leaf(node)
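
    # Worked instance of the leaf value formula in _finalize_leaf above
    # (illustrative numbers only): with shrinkage=0.1, sum_gradients=-4.0,
    # sum_hessians=3.0 and l2_regularization=1.0, the leaf value is
    # -0.1 * (-4.0) / (3.0 + 1.0) = 0.1.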
    def make_predictor(self, bin_thresholds=None):
        """Make a TreePredictor object out of the current tree.

        Parameters
        ----------
        bin_thresholds : array-like of floats, optional (default=None)
            The actual threshold values of each bin.

        Returns
        -------
        A TreePredictor object.
        """
        predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE)
        self._fill_predictor_node_array(predictor_nodes, self.root,
                                        bin_thresholds=bin_thresholds)
        return TreePredictor(predictor_nodes)
    def _fill_predictor_node_array(self, predictor_nodes, grower_node,
                                   bin_thresholds=None, next_free_idx=0):
        """Helper used in make_predictor to set the TreePredictor fields."""
        node = predictor_nodes[next_free_idx]
        node['count'] = grower_node.n_samples
        node['depth'] = grower_node.depth
        if grower_node.split_info is not None:
            node['gain'] = grower_node.split_info.gain
        else:
            node['gain'] = -1

        if grower_node.value is not None:
            # Leaf node
            node['is_leaf'] = True
            node['value'] = grower_node.value
            return next_free_idx + 1
        else:
            # Decision node
            split_info = grower_node.split_info
            feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx
            node['feature_idx'] = feature_idx
            node['bin_threshold'] = bin_idx
            if bin_thresholds is not None:
                threshold = bin_thresholds[feature_idx][bin_idx]
                node['threshold'] = threshold
            next_free_idx += 1
            node['left'] = next_free_idx
            next_free_idx = self._fill_predictor_node_array(
                predictor_nodes, grower_node.left_child,
                bin_thresholds=bin_thresholds, next_free_idx=next_free_idx)
            node['right'] = next_free_idx
            return self._fill_predictor_node_array(
                predictor_nodes, grower_node.right_child,
                bin_thresholds=bin_thresholds, next_free_idx=next_free_idx)
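
# ``_fill_predictor_node_array`` lays the tree out in pre-order: each node is
# written at ``next_free_idx``, its left subtree fills the following slots,
# and ``node['right']`` receives the first index after that subtree. A sketch
# of checking this layout, reusing the hypothetical example grower above and
# assuming TreePredictor exposes its record array as ``.nodes``:
def _example_predictor_layout():
    predictor = TreeGrower._example_grow_and_predict()
    nodes = predictor.nodes  # assumption: TreePredictor keeps its node array
    root = nodes[0]
    if not root['is_leaf']:
        # Pre-order: the left child always sits right after its parent,
        # and the right child comes after the whole left subtree.
        assert root['left'] == 1
        assert root['right'] > root['left']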