From b73e260ae107d2b0c280462993d4d571e5d6aef7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 23 Sep 2021 17:01:34 +0200 Subject: [PATCH 1/5] FIX out of bound error in split_indices --- .../_hist_gradient_boosting/splitting.pyx | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 08ae7aaf0862c..05a35b33fe241 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -388,11 +388,15 @@ cdef class Splitter: &left_indices_buffer[offset_in_buffers[thread_idx]], sizeof(unsigned int) * left_counts[thread_idx] ) - memcpy( - &sample_indices[right_offset[thread_idx]], - &right_indices_buffer[offset_in_buffers[thread_idx]], - sizeof(unsigned int) * right_counts[thread_idx] - ) + if right_counts[thread_idx] > 0: + # if n_threads >= 2 one has right_counts[-1] = 0 and + # right_offset[-1] = len(sample_indices) leading to + # IndexError: Out of bounds on buffer access (with boundscheck=True) + memcpy( + &sample_indices[right_offset[thread_idx]], + &right_indices_buffer[offset_in_buffers[thread_idx]], + sizeof(unsigned int) * right_counts[thread_idx] + ) return (sample_indices[:right_child_position], sample_indices[right_child_position:], @@ -839,7 +843,7 @@ cdef class Splitter: # other category. The low-support categories will always be mapped to # the right child. We scan the sorted categories array from left to # right and from right to left, and we stop at the middle. 
- + # Considering ordered categories A B C D, with E being a low-support # category: A B C D # ^ From d9d1d7cea93577e4a2e14075b98418ec3d7ef928 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 23 Sep 2021 23:18:51 +0200 Subject: [PATCH 2/5] better comment --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 05a35b33fe241..7556b6869e60a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -389,8 +389,9 @@ cdef class Splitter: sizeof(unsigned int) * left_counts[thread_idx] ) if right_counts[thread_idx] > 0: - # if n_threads >= 2 one has right_counts[-1] = 0 and - # right_offset[-1] = len(sample_indices) leading to + # If n_threads >= 2, one might have right_counts[-1] = 0 and + # right_offset[-1] = len(sample_indices) leading to evaluating + # &sample_indices[right_offset[-1]] which produces # IndexError: Out of bounds on buffer access (with boundscheck=True) memcpy( &sample_indices[right_offset[thread_idx]], From a5e5d0f64475198f53fea8c7cbfb1650e1cc1ac9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 24 Sep 2021 13:49:17 +0200 Subject: [PATCH 3/5] improve comment according to review --- sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 7556b6869e60a..d761e2771933b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -389,9 +389,12 @@ cdef class Splitter: sizeof(unsigned int) * left_counts[thread_idx] ) if right_counts[thread_idx] > 0: - # If n_threads >= 2, one might have right_counts[-1] = 0 and + # 
If we're splitting the rightmost node of the tree, i.e. the + # rightmost node in the partition array, and if n_threads >= 2, one + # might have right_counts[-1] = 0 and # right_offset[-1] = len(sample_indices) leading to evaluating - # &sample_indices[right_offset[-1]] which produces + # &sample_indices[right_offset[-1]] = &samples_indices[n_samples_at_node] + # = &partition[n_samples_in_tree] which produces # IndexError: Out of bounds on buffer access (with boundscheck=True) memcpy( &sample_indices[right_offset[thread_idx]], From 2ebef8eed38e4fb5b934016bed337596aa7ebd48 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 26 Sep 2021 13:23:08 +0200 Subject: [PATCH 4/5] DOC add whatsnew --- doc/whats_new/v1.1.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 3aabed6214771..c9d9f51c4da3b 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -38,6 +38,14 @@ Changelog :pr:`123456` by :user:`Joe Bloggs <joeongithub>`. where 123456 is the *pull request* number, not the issue number. +:mod:`sklearn.ensemble` +........................... + +- |Fix| Fixed a bug that could produce a segfault in rare cases for + :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor`. + :pr:`21130` by :user:`Christian Lorentzen <lorentzenchr>`. + :mod:`sklearn.linear_model` ........................... From f34cb0411a095651f989171cb4d894ff97598b18 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 4 Oct 2021 14:35:08 -0400 Subject: [PATCH 5/5] CLN Use suggestion with better spacing --- .../_hist_gradient_boosting/splitting.pyx | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index d761e2771933b..232cf094876cb 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -391,11 +391,17 @@ cdef class Splitter: if right_counts[thread_idx] > 0: # If we're splitting the rightmost node of the tree, i.e. the # rightmost node in the partition array, and if n_threads >= 2, one - # might have right_counts[-1] = 0 and - # right_offset[-1] = len(sample_indices) leading to evaluating - # &sample_indices[right_offset[-1]] = &samples_indices[n_samples_at_node] - # = &partition[n_samples_in_tree] which produces - # IndexError: Out of bounds on buffer access (with boundscheck=True) + # might have right_counts[-1] = 0 and right_offset[-1] = len(sample_indices) + # leading to evaluating + # + # &sample_indices[right_offset[-1]] = &sample_indices[n_samples_at_node] + # = &partition[n_samples_in_tree] + # + # which is an out-of-bounds read access that can cause a segmentation fault. + # When boundscheck=True, removing this check produces this exception: + # + # IndexError: Out of bounds on buffer access + # memcpy( &sample_indices[right_offset[thread_idx]], &right_indices_buffer[offset_in_buffers[thread_idx]],