From 96699d69d053e56ed7c03b76c11648496512c98a Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 31 Oct 2023 21:55:42 +0000 Subject: [PATCH 1/5] feat: add 'index', 'pad', 'nearest' interpolate methods --- bigframes/core/block_transforms.py | 205 ++++++++++++++---- tests/system/small/test_series.py | 21 +- .../bigframes_vendored/pandas/core/frame.py | 3 + .../bigframes_vendored/pandas/core/series.py | 4 +- 4 files changed, 178 insertions(+), 55 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 917edac0de..2158d13e86 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -106,18 +106,33 @@ def indicate_duplicates( def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: - if method != "linear": + supported_methods = [ + "linear", + "values", + "index", + "nearest", + "zero", + "slinear", + "pad", + ] + if method not in supported_methods: raise NotImplementedError( - f"Only 'linear' interpolate method supported. {constants.FEEDBACK_LINK}" + f"Method {method} not supported, following interpolate methods supported: {', '.join(supported_methods)}. 
{constants.FEEDBACK_LINK}" ) - backwards_window = windows.WindowSpec(following=0) - forwards_window = windows.WindowSpec(preceding=0) - output_column_ids = [] original_columns = block.value_columns original_labels = block.column_labels - block, offsets = block.promote_offsets() + + if method == "linear": # Assumes evenly spaced, ignore index + block, xvalues = block.promote_offsets() + else: + index_columns = block.index_columns + if len(index_columns) != 1: + raise ValueError("only method 'linear' support multi-index") + xvalues = block.index_columns[0] + # TODO: verify numeric + for column in original_columns: # null in same places column is null should_interpolate = block._column_type(column) in [ @@ -125,47 +140,25 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: pd.Int64Dtype(), ] if should_interpolate: - block, notnull = block.apply_unary_op(column, ops.notnull_op) - block, masked_offsets = block.apply_binary_op( - offsets, notnull, ops.partial_arg3(ops.where_op, None) - ) - - block, previous_value = block.apply_window_op( - column, agg_ops.LastNonNullOp(), backwards_window - ) - block, next_value = block.apply_window_op( - column, agg_ops.FirstNonNullOp(), forwards_window - ) - block, previous_value_offset = block.apply_window_op( - masked_offsets, - agg_ops.LastNonNullOp(), - backwards_window, - skip_reproject_unsafe=True, - ) - block, next_value_offset = block.apply_window_op( - masked_offsets, - agg_ops.FirstNonNullOp(), - forwards_window, - skip_reproject_unsafe=True, - ) - - block, prediction_id = _interpolate( + interpolate_method_map = { + "linear": "linear", + "values": "linear", + "index": "linear", + "slinear": "linear", + "pad": "ffill", + "zero": "ffill", + "nearest": "nearest", + } + extrapolating_methods = ["linear", "values", "index"] + interpolate_method = interpolate_method_map[method] + do_extrapolate = method in extrapolating_methods + block, interpolated_and_ffilled = _interpolate_column( block, - 
previous_value_offset, - previous_value, - next_value_offset, - next_value, - offsets, + column, + xvalues, + interpolate_method=interpolate_method, + do_extrapolate=do_extrapolate, ) - - block, interpolated_column = block.apply_binary_op( - column, prediction_id, ops.fillna_op - ) - # Pandas performs ffill-like behavior to extrapolate forwards - block, interpolated_and_ffilled = block.apply_binary_op( - interpolated_column, previous_value, ops.fillna_op - ) - output_column_ids.append(interpolated_and_ffilled) else: output_column_ids.append(column) @@ -175,7 +168,80 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: return block.with_column_labels(original_labels) -def _interpolate( +def _interpolate_column( + block: blocks.Block, + column: str, + x_values: str, + interpolate_method: str, + do_extrapolate: bool = True, +) -> typing.Tuple[blocks.Block, str]: + if interpolate_method not in ["linear", "nearest", "ffill"]: + raise ValueError("interpolate method not supported") + window_ordering = (ordering.OrderingColumnReference(x_values),) + backwards_window = windows.WindowSpec(following=0, ordering=window_ordering) + forwards_window = windows.WindowSpec(preceding=0, ordering=window_ordering) + + # Note, this method may + block, notnull = block.apply_unary_op(column, ops.notnull_op) + block, masked_offsets = block.apply_binary_op( + x_values, notnull, ops.partial_arg3(ops.where_op, None) + ) + + block, previous_value = block.apply_window_op( + column, agg_ops.LastNonNullOp(), backwards_window + ) + block, next_value = block.apply_window_op( + column, agg_ops.FirstNonNullOp(), forwards_window + ) + block, previous_value_offset = block.apply_window_op( + masked_offsets, + agg_ops.LastNonNullOp(), + backwards_window, + skip_reproject_unsafe=True, + ) + block, next_value_offset = block.apply_window_op( + masked_offsets, + agg_ops.FirstNonNullOp(), + forwards_window, + skip_reproject_unsafe=True, + ) + + if interpolate_method == "linear": + 
block, prediction_id = _interpolate_points_linear( + block, + previous_value_offset, + previous_value, + next_value_offset, + next_value, + x_values, + ) + elif interpolate_method == "nearest": + block, prediction_id = _interpolate_points_nearest( + block, + previous_value_offset, + previous_value, + next_value_offset, + next_value, + x_values, + ) + else: # interpolate_method == 'ffill': + block, prediction_id = _interpolate_points_ffill( + block, + previous_value_offset, + previous_value, + next_value_offset, + next_value, + x_values, + ) + if do_extrapolate: + block, prediction_id = block.apply_binary_op( + prediction_id, previous_value, ops.fillna_op + ) + + return block.apply_binary_op(column, prediction_id, ops.fillna_op) + + +def _interpolate_points_linear( block: blocks.Block, x0_id: str, y0_id: str, @@ -196,6 +262,53 @@ def _interpolate( return block, prediction_id +def _interpolate_points_nearest( + block: blocks.Block, + x0_id: str, + y0_id: str, + x1_id: str, + y1_id: str, + xpredict_id: str, +) -> typing.Tuple[blocks.Block, str]: + """Interpolate by taking the y value of the nearest x value""" + block, left_diff = block.apply_binary_op(xpredict_id, x0_id, ops.sub_op) + block, right_diff = block.apply_binary_op(x1_id, xpredict_id, ops.sub_op) + # If diffs equal, choose left + block, choose_left = block.apply_binary_op(left_diff, right_diff, ops.le_op) + block, choose_left = block.apply_unary_op( + choose_left, ops.partial_right(ops.fillna_op, False) + ) + + block, nearest = block.apply_ternary_op(y0_id, choose_left, y1_id, ops.where_op) + + block, y0_exists = block.apply_unary_op(y0_id, ops.notnull_op) + block, y1_exists = block.apply_unary_op(y1_id, ops.notnull_op) + block, is_interpolation = block.apply_binary_op(y0_exists, y1_exists, ops.and_op) + + block, prediction_id = block.apply_binary_op( + nearest, is_interpolation, ops.partial_arg3(ops.where_op, None) + ) + + return block, prediction_id + + +def _interpolate_points_ffill( + block: 
blocks.Block, + x0_id: str, + y0_id: str, + x1_id: str, + y1_id: str, + xpredict_id: str, +) -> typing.Tuple[blocks.Block, str]: + """Interpolates by using the preceding values""" + # check for existence of y1, otherwise we are extrapolating instead of interpolating + block, y1_exists = block.apply_unary_op(y1_id, ops.notnull_op) + block, prediction_id = block.apply_binary_op( + y0_id, y1_exists, ops.partial_arg3(ops.where_op, None) + ) + return block, prediction_id + + def drop_duplicates( block: blocks.Block, columns: typing.Sequence[str], keep: str = "first" ) -> blocks.Block: diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 183ba01c0e..647db6ee6f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -273,21 +273,26 @@ def test_series_replace_list_scalar(scalars_dfs): @pytest.mark.parametrize( - ("values",), + ("method",), ( - ([None, 1, 2, None, None, 16, None],), - ([None, None, 3.6, None],), - ([403.2, None, 352.1, None, None, 111.9],), + ("linear",), + ("values",), + ("slinear",), + ("nearest",), + ("zero",), + ("pad",), ), ) -def test_series_interpolate(values): - pd_series = pd.Series(values) +def test_series_interpolate(method): + values = [None, 1, 2, None, None, 16, None] + index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8] + pd_series = pd.Series(values, index) bf_series = series.Series(pd_series) # Pandas can only interpolate on "float64" columns # https://github.com/pandas-dev/pandas/issues/40252 - pd_result = pd_series.astype("float64").interpolate() - bf_result = bf_series.interpolate().to_pandas() + pd_result = pd_series.astype("float64").interpolate(method=method) + bf_result = bf_series.interpolate(method=method).to_pandas() # pd uses non-null types, while bf uses nullable types pd.testing.assert_series_equal( diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 12bd053179..45cf15796f 100644 --- 
a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2765,6 +2765,9 @@ def interpolate(self, method: str = "linear"): Interpolation technique to use. Only 'linear' supported. 'linear': Ignore the index and treat the values as equally spaced. This is the only method supported on MultiIndexes. + 'index', 'values': use the actual numerical values of the index. + 'pad': Fill in NaNs using existing values. + 'nearest', 'zero', 'slinear': Emulates `scipy.interpolate.interp1d` Returns: DataFrame: diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index b569e5699c..9990a92891 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -925,7 +925,9 @@ def interpolate(self, method: str = "linear"): Interpolation technique to use. Only 'linear' supported. 'linear': Ignore the index and treat the values as equally spaced. This is the only method supported on MultiIndexes. - + 'index', 'values': use the actual numerical values of the index. + 'pad': Fill in NaNs using existing values. 
+ 'nearest', 'zero', 'slinear': Emulates `scipy.interpolate.interp1d` Returns: Series: Returns the same object type as the caller, interpolated at From 0e30360cd00c142efa19247f6084b079b7a1229e Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 2 Nov 2023 01:23:46 +0000 Subject: [PATCH 2/5] make interpolate pad method use ffill --- bigframes/core/block_transforms.py | 11 ++++++----- bigframes/dataframe.py | 2 ++ bigframes/series.py | 2 ++ tests/system/small/test_series.py | 5 +++++ 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 2158d13e86..3828a39899 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -22,6 +22,7 @@ import bigframes.core.blocks as blocks import bigframes.core.ordering as ordering import bigframes.core.window_spec as windows +import bigframes.dtypes as dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -113,7 +114,6 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: "nearest", "zero", "slinear", - "pad", ] if method not in supported_methods: raise NotImplementedError( @@ -131,7 +131,9 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: if len(index_columns) != 1: raise ValueError("only method 'linear' support multi-index") xvalues = block.index_columns[0] - # TODO: verify numeric + dtypes.NUMERIC_BIGFRAMES_TYPES + if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES: + raise ValueError("Can only interpolate on numeric index.") for column in original_columns: # null in same places column is null @@ -145,21 +147,20 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: "values": "linear", "index": "linear", "slinear": "linear", - "pad": "ffill", "zero": "ffill", "nearest": "nearest", } extrapolating_methods = ["linear", "values", "index"] interpolate_method = 
interpolate_method_map[method] do_extrapolate = method in extrapolating_methods - block, interpolated_and_ffilled = _interpolate_column( + block, interpolated = _interpolate_column( block, column, xvalues, interpolate_method=interpolate_method, do_extrapolate=do_extrapolate, ) - output_column_ids.append(interpolated_and_ffilled) + output_column_ids.append(interpolated) else: output_column_ids.append(column) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ffcaf0d613..9dbe8dc2ee 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1435,6 +1435,8 @@ def reindex_like(self, other: DataFrame, *, validate: typing.Optional[bool] = No return self.reindex(index=other.index, columns=other.columns, validate=validate) def interpolate(self, method: str = "linear") -> DataFrame: + if method == "pad": + return self.ffill() result = block_ops.interpolate(self._block, method) return DataFrame(result) diff --git a/bigframes/series.py b/bigframes/series.py index 824757cf52..415c5f1be3 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -469,6 +469,8 @@ def replace( return Series(block.select_column(result_col)) def interpolate(self, method: str = "linear") -> Series: + if method == "pad": + return self.ffill() result = block_ops.interpolate(self._block, method) return Series(result) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 647db6ee6f..206dfea70c 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -294,6 +294,11 @@ def test_series_interpolate(method): pd_result = pd_series.astype("float64").interpolate(method=method) bf_result = bf_series.interpolate(method=method).to_pandas() + print("pandas") + print(pd_result.to_string()) + print("bigframes") + print(bf_result.to_string()) + # pd uses non-null types, while bf uses nullable types pd.testing.assert_series_equal( pd_result, From 39e1af9464f35acfd1d4685cbc720a62398f2b93 Mon Sep 17 00:00:00 2001 From: Trevor 
Bergeron Date: Tue, 7 Nov 2023 00:41:48 +0000 Subject: [PATCH 3/5] remove debug code --- bigframes/core/block_transforms.py | 1 - tests/system/small/test_series.py | 5 ----- 2 files changed, 6 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 3828a39899..ba6901a48b 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -131,7 +131,6 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: if len(index_columns) != 1: raise ValueError("only method 'linear' support multi-index") xvalues = block.index_columns[0] - dtypes.NUMERIC_BIGFRAMES_TYPES if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES: raise ValueError("Can only interpolate on numeric index.") diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 206dfea70c..647db6ee6f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -294,11 +294,6 @@ def test_series_interpolate(method): pd_result = pd_series.astype("float64").interpolate(method=method) bf_result = bf_series.interpolate(method=method).to_pandas() - print("pandas") - print(pd_result.to_string()) - print("bigframes") - print(bf_result.to_string()) - # pd uses non-null types, while bf uses nullable types pd.testing.assert_series_equal( pd_result, From c1ba8cabcf9607ba052d909a281186eca0bee836 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 7 Nov 2023 21:08:23 +0000 Subject: [PATCH 4/5] doc amendments --- bigframes/core/block_transforms.py | 2 +- .../bigframes_vendored/pandas/core/frame.py | 26 +++++++++++++------ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index ba6901a48b..e095f21f6b 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -129,7 +129,7 @@ def interpolate(block: blocks.Block, method: str = "linear") -> 
blocks.Block: else: index_columns = block.index_columns if len(index_columns) != 1: - raise ValueError("only method 'linear' support multi-index") + raise ValueError("only method 'linear' supports multi-index") xvalues = block.index_columns[0] if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES: raise ValueError("Can only interpolate on numeric index.") diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index d4f979daaa..0e3ae9d708 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2810,15 +2810,25 @@ def interpolate(self, method: str = "linear"): >>> df = bpd.DataFrame({ ... 'A': [1, 2, 3, None, None, 6], ... 'B': [None, 6, None, 2, None, 3], - ... }) + ... }, index=[0, 0.1, 0.3, 0.7, 0.9, 1.0]) >>> df.interpolate() - A B - 0 1.0 - 1 2.0 6.0 - 2 3.0 4.0 - 3 4.0 2.0 - 4 5.0 2.5 - 5 6.0 3.0 + A B + 0.0 1.0 + 0.1 2.0 6.0 + 0.3 3.0 4.0 + 0.7 4.0 2.0 + 0.9 5.0 2.5 + 1.0 6.0 3.0 + + [6 rows x 2 columns] + >>> df.interpolate(method="values") + A B + 0.0 1.0 + 0.1 2.0 6.0 + 0.3 3.0 4.666667 + 0.7 4.714286 2.0 + 0.9 5.571429 2.666667 + 1.0 6.0 3.0 [6 rows x 2 columns] """ From 5fe6dec6d612b0279545681b75d01088bc898b00 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 8 Nov 2023 18:50:40 +0000 Subject: [PATCH 5/5] fix interpolate docstrings to render examples --- .../bigframes_vendored/pandas/core/frame.py | 28 +++++------ .../bigframes_vendored/pandas/core/series.py | 46 +++++++++++++------ 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 0e3ae9d708..27717ac46f 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2788,20 +2788,6 @@ def interpolate(self, method: str = "linear"): """ Fill NaN values using an 
interpolation method. - Args: - method (str, default 'linear'): - Interpolation technique to use. Only 'linear' supported. - 'linear': Ignore the index and treat the values as equally spaced. - This is the only method supported on MultiIndexes. - 'index', 'values': use the actual numerical values of the index. - 'pad': Fill in NaNs using existing values. - 'nearest', 'zero', 'slinear': Emulates `scipy.interpolate.interp1d` - - Returns: - DataFrame: - Returns the same object type as the caller, interpolated at - some or all ``NaN`` values - **Examples:** >>> import bigframes.pandas as bpd @@ -2831,6 +2817,20 @@ def interpolate(self, method: str = "linear"): 1.0 6.0 3.0 [6 rows x 2 columns] + + Args: + method (str, default 'linear'): + Interpolation technique to use. Supported methods: + 'linear': Ignore the index and treat the values as equally spaced. + This is the only method supported on MultiIndexes. + 'index', 'values': use the actual numerical values of the index. + 'pad': Fill in NaNs using existing values. + 'nearest', 'zero', 'slinear': Emulates `scipy.interpolate.interp1d` + + Returns: + DataFrame: + Returns the same object type as the caller, interpolated at + some or all ``NaN`` values """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 9990a92891..b25b0c75cf 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -920,6 +920,37 @@ def interpolate(self, method: str = "linear"): """ Fill NaN values using an 
}, index=[0, 0.1, 0.3, 0.7, 0.9, 1.0]) + >>> df.interpolate() + A B + 0.0 1.0 + 0.1 2.0 6.0 + 0.3 3.0 4.0 + 0.7 4.0 2.0 + 0.9 5.0 2.5 + 1.0 6.0 3.0 + + [6 rows x 2 columns] + >>> df.interpolate(method="values") + A B + 0.0 1.0 + 0.1 2.0 6.0 + 0.3 3.0 4.666667 + 0.7 4.714286 2.0 + 0.9 5.571429 2.666667 + 1.0 6.0 3.0 + + [6 rows x 2 columns] + + Args: method (str, default 'linear'): Interpolation technique to use. Only 'linear' supported. @@ -932,21 +963,6 @@ def interpolate(self, method: str = "linear"): Series: Returns the same object type as the caller, interpolated at some or all ``NaN`` values - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> series = bpd.Series([1, 2, 3, None, None, 6]) - >>> series.interpolate() - 0 1.0 - 1 2.0 - 2 3.0 - 3 4.0 - 4 5.0 - 5 6.0 - dtype: Float64 """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)