diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 87ffc36d342fa..93300bf67d4a5 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -38,7 +38,6 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. - :mod:`sklearn.calibration` .......................... @@ -46,6 +45,14 @@ Changelog `pos_label` to specify the positive class label. :pr:`21032` by :user:`Guillaume Lemaitre `. +:mod:`sklearn.ensemble` +........................... + +- |Fix| Fixed a bug that could produce a segfault in rare cases for + :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor`. + :pr:`21130` by :user:`Christian Lorentzen `. + :mod:`sklearn.linear_model` ........................... diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 08ae7aaf0862c..232cf094876cb 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -388,11 +388,25 @@ cdef class Splitter: &left_indices_buffer[offset_in_buffers[thread_idx]], sizeof(unsigned int) * left_counts[thread_idx] ) - memcpy( - &sample_indices[right_offset[thread_idx]], - &right_indices_buffer[offset_in_buffers[thread_idx]], - sizeof(unsigned int) * right_counts[thread_idx] - ) + if right_counts[thread_idx] > 0: + # If we're splitting the rightmost node of the tree, i.e. the + # rightmost node in the partition array, and if n_threads >= 2, one + # might have right_counts[-1] = 0 and right_offset[-1] = len(sample_indices) + # leading to evaluating + # + # &sample_indices[right_offset[-1]] = &sample_indices[n_samples_at_node] + # = &partition[n_samples_in_tree] + # + # which is an out-of-bounds read access that can cause a segmentation fault.
+ # When boundscheck=True, removing this check produces this exception: + # + # IndexError: Out of bounds on buffer access + # + memcpy( + &sample_indices[right_offset[thread_idx]], + &right_indices_buffer[offset_in_buffers[thread_idx]], + sizeof(unsigned int) * right_counts[thread_idx] + ) return (sample_indices[:right_child_position], sample_indices[right_child_position:], @@ -839,7 +853,7 @@ cdef class Splitter: # other category. The low-support categories will always be mapped to # the right child. We scan the sorted categories array from left to # right and from right to left, and we stop at the middle. - + # Considering ordered categories A B C D, with E being a low-support # category: A B C D # ^