From 12ad9ed82867baf086900401786f906e7e7a9175 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Mon, 30 Oct 2023 23:44:29 +0000
Subject: [PATCH] feat: add interpolate() to series and dataframe

---
 bigframes/core/block_transforms.py            | 91 +++++++++++++++++++
 bigframes/dataframe.py                        |  4 +
 bigframes/series.py                           |  4 +
 tests/system/small/test_dataframe.py          | 16 ++++
 tests/system/small/test_series.py             | 26 ++++++
 .../bigframes_vendored/pandas/core/frame.py   | 37 ++++++++
 .../bigframes_vendored/pandas/core/series.py  | 32 +++++++
 7 files changed, 210 insertions(+)

diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
index 3706bf1681..917edac0de 100644
--- a/bigframes/core/block_transforms.py
+++ b/bigframes/core/block_transforms.py
@@ -105,6 +105,97 @@ def indicate_duplicates(
     )
 
 
+def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block:
+    if method != "linear":
+        raise NotImplementedError(
+            f"Only 'linear' interpolate method supported. {constants.FEEDBACK_LINK}"
+        )
+    backwards_window = windows.WindowSpec(following=0)
+    forwards_window = windows.WindowSpec(preceding=0)
+
+    output_column_ids = []
+
+    original_columns = block.value_columns
+    original_labels = block.column_labels
+    block, offsets = block.promote_offsets()
+    for column in original_columns:
+        # masked_offsets (computed below) is null wherever the column is null
+        should_interpolate = block._column_type(column) in [
+            pd.Float64Dtype(),
+            pd.Int64Dtype(),
+        ]
+        if should_interpolate:
+            block, notnull = block.apply_unary_op(column, ops.notnull_op)
+            block, masked_offsets = block.apply_binary_op(
+                offsets, notnull, ops.partial_arg3(ops.where_op, None)
+            )
+
+            block, previous_value = block.apply_window_op(
+                column, agg_ops.LastNonNullOp(), backwards_window
+            )
+            block, next_value = block.apply_window_op(
+                column, agg_ops.FirstNonNullOp(), forwards_window
+            )
+            block, previous_value_offset = block.apply_window_op(
+                masked_offsets,
+                agg_ops.LastNonNullOp(),
+                backwards_window,
+                skip_reproject_unsafe=True,
+            )
+            block, next_value_offset = block.apply_window_op(
+                masked_offsets,
+                agg_ops.FirstNonNullOp(),
+                forwards_window,
+                skip_reproject_unsafe=True,
+            )
+
+            block, prediction_id = _interpolate(
+                block,
+                previous_value_offset,
+                previous_value,
+                next_value_offset,
+                next_value,
+                offsets,
+            )
+
+            block, interpolated_column = block.apply_binary_op(
+                column, prediction_id, ops.fillna_op
+            )
+            # Pandas performs ffill-like behavior to extrapolate forwards
+            block, interpolated_and_ffilled = block.apply_binary_op(
+                interpolated_column, previous_value, ops.fillna_op
+            )
+
+            output_column_ids.append(interpolated_and_ffilled)
+        else:
+            output_column_ids.append(column)
+
+    # Force reproject since `skip_reproject_unsafe` was used previously
+    block = block.select_columns(output_column_ids)._force_reproject()
+    return block.with_column_labels(original_labels)
+
+
+def _interpolate(
+    block: blocks.Block,
+    x0_id: str,
+    y0_id: str,
+    x1_id: str,
+    y1_id: str,
+    xpredict_id: str,
+) -> typing.Tuple[blocks.Block, str]:
+    """Applies linear interpolation equation to predict y values for xpredict."""
+    block, x1x0diff = block.apply_binary_op(x1_id, x0_id, ops.sub_op)
+    block, y1y0diff = block.apply_binary_op(y1_id, y0_id, ops.sub_op)
+    block, xpredictx0diff = block.apply_binary_op(xpredict_id, x0_id, ops.sub_op)
+
+    block, y1_weight = block.apply_binary_op(y1y0diff, x1x0diff, ops.div_op)
+    block, y1_part = block.apply_binary_op(xpredictx0diff, y1_weight, ops.mul_op)
+
+    block, prediction_id = block.apply_binary_op(y0_id, y1_part, ops.add_op)
+    block = block.drop_columns([x1x0diff, y1y0diff, xpredictx0diff, y1_weight, y1_part])
+    return block, prediction_id
+
+
 def drop_duplicates(
     block: blocks.Block, columns: typing.Sequence[str], keep: str = "first"
 ) -> blocks.Block:
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 3369fb4868..ffcaf0d613 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1434,6 +1434,10 @@ def _reindex_columns(self, columns):
     def reindex_like(self, other: DataFrame, *, validate: typing.Optional[bool] = None):
         return self.reindex(index=other.index, columns=other.columns, validate=validate)
 
+    def interpolate(self, method: str = "linear") -> DataFrame:
+        result = block_ops.interpolate(self._block, method)
+        return DataFrame(result)
+
     def fillna(self, value=None) -> DataFrame:
         return self._apply_binop(value, ops.fillna_op, how="left")
 
diff --git a/bigframes/series.py b/bigframes/series.py
index 37d00d16f3..824757cf52 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -468,6 +468,10 @@ def replace(
             )
             return Series(block.select_column(result_col))
 
+    def interpolate(self, method: str = "linear") -> Series:
+        result = block_ops.interpolate(self._block, method)
+        return Series(result)
+
     def dropna(
         self,
         *,
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index c96faa3526..2b710d692a 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -711,6 +711,22 @@ def test_df_dropna(scalars_dfs, axis, how, ignore_index):
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
+def test_df_interpolate(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    columns = ["int64_col", "int64_too", "float64_col"]
+    bf_result = scalars_df[columns].interpolate().to_pandas()
+    # Pandas can only interpolate on "float64" columns
+    # https://github.com/pandas-dev/pandas/issues/40252
+    pd_result = scalars_pandas_df[columns].astype("float64").interpolate()
+
+    pandas.testing.assert_frame_equal(
+        bf_result,
+        pd_result,
+        check_index_type=False,
+        check_dtype=False,
+    )
+
+
 def test_df_fillna(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     df = scalars_df[["int64_col", "float64_col"]].fillna(3)
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 05d8b84185..183ba01c0e 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -272,6 +272,32 @@ def test_series_replace_list_scalar(scalars_dfs):
     )
 
 
+@pytest.mark.parametrize(
+    ("values",),
+    (
+        ([None, 1, 2, None, None, 16, None],),
+        ([None, None, 3.6, None],),
+        ([403.2, None, 352.1, None, None, 111.9],),
+    ),
+)
+def test_series_interpolate(values):
+    pd_series = pd.Series(values)
+    bf_series = series.Series(pd_series)
+
+    # Pandas can only interpolate on "float64" columns
+    # https://github.com/pandas-dev/pandas/issues/40252
+    pd_result = pd_series.astype("float64").interpolate()
+    bf_result = bf_series.interpolate().to_pandas()
+
+    # pd uses non-nullable types, while bf uses nullable types
+    pd.testing.assert_series_equal(
+        pd_result,
+        bf_result,
+        check_index_type=False,
+        check_dtype=False,
+    )
+
+
 @pytest.mark.parametrize(
     ("ignore_index",),
     (
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 013d170114..12bd053179 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -2756,6 +2756,43 @@ def value_counts(
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def interpolate(self, method: str = "linear"):
+        """
+        Fill NaN values using an interpolation method.
+
+        Args:
+            method (str, default 'linear'):
+                Interpolation technique to use. Only 'linear' is supported.
+                'linear': Ignore the index and treat the values as equally spaced.
+                This is the only method supported on MultiIndexes.
+
+        Returns:
+            DataFrame:
+                Returns the same object type as the caller, interpolated at
+                some or all ``NaN`` values
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({
+            ...     'A': [1, 2, 3, None, None, 6],
+            ...     'B': [None, 6, None, 2, None, 3],
+            ...     })
+            >>> df.interpolate()
+                 A     B
+            0  1.0  <NA>
+            1  2.0   6.0
+            2  3.0   4.0
+            3  4.0   2.0
+            4  5.0   2.5
+            5  6.0   3.0
+            <BLANKLINE>
+            [6 rows x 2 columns]
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def fillna(self, value):
         """
         Fill NA/NaN values using the specified method.
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index f0e13e16f5..b569e5699c 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -916,6 +916,38 @@ def droplevel(self, level, axis):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def interpolate(self, method: str = "linear"):
+        """
+        Fill NaN values using an interpolation method.
+
+        Args:
+            method (str, default 'linear'):
+                Interpolation technique to use. Only 'linear' is supported.
+                'linear': Ignore the index and treat the values as equally spaced.
+                This is the only method supported on MultiIndexes.
+
+        Returns:
+            Series:
+                Returns the same object type as the caller, interpolated at
+                some or all ``NaN`` values
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> series = bpd.Series([1, 2, 3, None, None, 6])
+            >>> series.interpolate()
+            0    1.0
+            1    2.0
+            2    3.0
+            3    4.0
+            4    5.0
+            5    6.0
+            dtype: Float64
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def fillna(
         self,
         value=None,