Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

feat: add resampler type and series/df resample #1671

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions bigframes/core/resample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from bigframes.core.groupby import DataFrameGroupBy


class Resampler(DataFrameGroupBy):
    """Resampler type for time-series resampling.

    Currently a thin subclass of ``DataFrameGroupBy``: construction is
    delegated unchanged to the parent class and no behavior is added here.
    """

    def __init__(self, obj, by, **kwargs):
        # Forward every argument as-is; the parent constructor defines what
        # ``obj``, ``by`` and the keyword arguments mean.
        super(Resampler, self).__init__(obj, by, **kwargs)
78 changes: 9 additions & 69 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3679,87 +3679,27 @@ def _split(
blocks = self._block.split(ns=ns, fracs=fracs, random_state=random_state)
return [DataFrame(block) for block in blocks]

@validations.requires_ordering()
def _resample(
def resample(
self,
rule: str,
*,
on: blocks.Label = None,
level: Optional[LevelsType] = None,
origin: Union[
Union[
pandas.Timestamp, datetime.datetime, numpy.datetime64, int, float, str
],
Literal["epoch", "start", "start_day", "end", "end_day"],
] = "start_day",
level: Optional[LevelType] = None,
origin: Literal["epoch", "start", "start_day"] = "start_day",
) -> bigframes.core.groupby.DataFrameGroupBy:
"""Internal function to support resample. Resample time-series data.

**Examples:**

>>> import bigframes.pandas as bpd
>>> import pandas as pd
>>> bpd.options.display.progress_bar = None

>>> data = {
... "timestamp_col": pd.date_range(
... start="2021-01-01 13:00:00", periods=30, freq="1s"
... ),
... "int64_col": range(30),
... "int64_too": range(10, 40),
... }

Resample on a DataFrame with index:

>>> df = bpd.DataFrame(data).set_index("timestamp_col")
>>> df._resample(rule="7s").min()
int64_col int64_too
2021-01-01 12:59:55 0 10
2021-01-01 13:00:02 2 12
2021-01-01 13:00:09 9 19
2021-01-01 13:00:16 16 26
2021-01-01 13:00:23 23 33
<BLANKLINE>
[5 rows x 2 columns]

Resample with column and origin set to 'start':

>>> df = bpd.DataFrame(data)
>>> df._resample(rule="7s", on = "timestamp_col", origin="start").min()
int64_col int64_too
2021-01-01 13:00:00 0 10
2021-01-01 13:00:07 7 17
2021-01-01 13:00:14 14 24
2021-01-01 13:00:21 21 31
2021-01-01 13:00:28 28 38
<BLANKLINE>
[5 rows x 2 columns]

Args:
rule (str):
The offset string representing target conversion.
on (str, default None):
For a DataFrame, column to use instead of index for resampling. Column
must be datetime-like.
level (str or int, default None):
For a MultiIndex, level (name or number) to use for resampling.
level must be datetime-like.
origin(str, default 'start_day'):
The timestamp on which to adjust the grouping. Must be one of the following:
'epoch': origin is 1970-01-01
'start': origin is the first value of the timeseries
'start_day': origin is the first day at midnight of the timeseries
Returns:
DataFrameGroupBy: DataFrameGroupBy object.
"""
block = self._block._generate_resample_label(
rule=rule,
on=on,
level=level,
origin=origin,
)
df = DataFrame(block)
return df.groupby(level=0)
return groupby.DataFrameGroupBy(
df._block,
by_col_ids=df._resolve_levels(0),
as_index=True,
dropna=True,
)

@classmethod
def from_dict(
Expand Down
57 changes: 4 additions & 53 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2172,65 +2172,16 @@ def explode(self, *, ignore_index: Optional[bool] = False) -> Series:
)
)

@validations.requires_ordering()
def _resample(
def resample(
self,
rule: str,
*,
closed: Optional[Literal["right", "left"]] = None,
label: Optional[Literal["right", "left"]] = None,
level: Optional[LevelsType] = None,
origin: Union[
Union[
pandas.Timestamp, datetime.datetime, numpy.datetime64, int, float, str
],
Literal["epoch", "start", "start_day", "end", "end_day"],
] = "start_day",
on: blocks.Label = None,
level: Optional[LevelType] = None,
origin: Literal["epoch", "start", "start_day"] = "start_day",
) -> bigframes.core.groupby.SeriesGroupBy:
"""Internal function to support resample. Resample time-series data.

**Examples:**

>>> import bigframes.pandas as bpd
>>> import pandas as pd
>>> bpd.options.display.progress_bar = None

>>> data = {
... "timestamp_col": pd.date_range(
... start="2021-01-01 13:00:00", periods=30, freq="1s"
... ),
... "int64_col": range(30),
... }
>>> s = bpd.DataFrame(data).set_index("timestamp_col")
>>> s._resample(rule="7s", origin="epoch").min()
int64_col
2021-01-01 12:59:56 0
2021-01-01 13:00:03 3
2021-01-01 13:00:10 10
2021-01-01 13:00:17 17
2021-01-01 13:00:24 24
<BLANKLINE>
[5 rows x 1 columns]


Args:
rule (str):
The offset string representing target conversion.
level (str or int, default None):
For a MultiIndex, level (name or number) to use for resampling.
level must be datetime-like.
origin(str, default 'start_day'):
The timestamp on which to adjust the grouping. Must be one of the following:
'epoch': origin is 1970-01-01
'start': origin is the first value of the timeseries
'start_day': origin is the first day at midnight of the timeseries
Returns:
SeriesGroupBy: SeriesGroupBy object.
"""
block = self._block._generate_resample_label(
rule=rule,
closed=closed,
label=label,
on=None,
level=level,
origin=origin,
Expand Down
12 changes: 6 additions & 6 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5418,13 +5418,13 @@ def test_dataframe_explode_xfail(col_names):
),
],
)
def test__resample_with_column(
def test_resample_with_column(
scalars_df_index, scalars_pandas_df_index, on, rule, origin
):
# TODO: supply a reason why this isn't compatible with pandas 1.x
pytest.importorskip("pandas", minversion="2.0.0")
bf_result = (
scalars_df_index._resample(rule=rule, on=on, origin=origin)[
scalars_df_index.resample(rule=rule, on=on, origin=origin)[
["int64_col", "int64_too"]
]
.max()
Expand All @@ -5446,7 +5446,7 @@ def test__resample_with_column(
pytest.param(False, None, "datetime_col", "100d"),
],
)
def test__resample_with_index(
def test_resample_with_index(
scalars_df_index, scalars_pandas_df_index, append, level, col, rule
):
# TODO: supply a reason why this isn't compatible with pandas 1.x
Expand All @@ -5455,7 +5455,7 @@ def test__resample_with_index(
scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)
bf_result = (
scalars_df_index[["int64_col", "int64_too"]]
._resample(rule=rule, level=level)
.resample(rule=rule, level=level)
.min()
.to_pandas()
)
Expand Down Expand Up @@ -5505,15 +5505,15 @@ def test__resample_with_index(
),
],
)
def test__resample_start_time(rule, origin, data):
def test_resample_start_time(rule, origin, data):
# TODO: supply a reason why this isn't compatible with pandas 1.x
pytest.importorskip("pandas", minversion="2.0.0")
col = "timestamp_col"
scalars_df_index = bpd.DataFrame(data).set_index(col)
scalars_pandas_df_index = pd.DataFrame(data).set_index(col)
scalars_pandas_df_index.index.name = None

bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas()
bf_result = scalars_df_index.resample(rule=rule, origin=origin).min().to_pandas()

pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min()

Expand Down
4 changes: 2 additions & 2 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4398,14 +4398,14 @@ def test_series_explode_null(data):
pytest.param(True, "timestamp_col", "timestamp_col", "1YE"),
],
)
def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule):
def test_resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule):
# TODO: supply a reason why this isn't compatible with pandas 1.x
pytest.importorskip("pandas", minversion="2.0.0")
scalars_df_index = scalars_df_index.set_index(col, append=append)["int64_col"]
scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)[
"int64_col"
]
bf_result = scalars_df_index._resample(rule=rule, level=level).min().to_pandas()
bf_result = scalars_df_index.resample(rule=rule, level=level).min().to_pandas()
pd_result = scalars_pandas_df_index.resample(rule=rule, level=level).min()
pd.testing.assert_series_equal(bf_result, pd_result)

Expand Down
4 changes: 2 additions & 2 deletions tests/system/small/test_unordered.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,15 +250,15 @@ def test_unordered_mode_no_ambiguity_warning(unordered_session):
),
],
)
def test__resample_with_index(unordered_session, rule, origin, data):
def test_resample_with_index(unordered_session, rule, origin, data):
# TODO: supply a reason why this isn't compatible with pandas 1.x
pytest.importorskip("pandas", minversion="2.0.0")
col = "timestamp_col"
scalars_df_index = bpd.DataFrame(data, session=unordered_session).set_index(col)
scalars_pandas_df_index = pd.DataFrame(data).set_index(col)
scalars_pandas_df_index.index.name = None

bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas()
bf_result = scalars_df_index.resample(rule=rule, origin=origin).min().to_pandas()

pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min()

Expand Down
77 changes: 76 additions & 1 deletion third_party/bigframes_vendored/pandas/core/generic.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/generic.py
from __future__ import annotations

from typing import Callable, Iterator, Literal, Optional, TYPE_CHECKING
from typing import Callable, Hashable, Iterator, Literal, Optional, TYPE_CHECKING

import bigframes_vendored.constants as constants
from bigframes_vendored.pandas.core import indexing
Expand Down Expand Up @@ -1271,3 +1271,78 @@ def equals(self, other) -> bool:
otherwise.
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def resample(
    self,
    rule: str,
    *,
    on: Optional[Hashable] = None,
    level: Optional[Hashable] = None,
    origin: Literal["epoch", "start", "start_day"] = "start_day",
):
    """
    Resample time-series data.

    Convenience method for frequency conversion and resampling of time
    series. The object must have a datetime index, or the caller must
    pass the label of a datetime series/index to the ``on``/``level``
    keyword parameter.

    **Examples:**

    >>> import bigframes.pandas as bpd
    >>> import pandas as pd
    >>> bpd.options.display.progress_bar = None

    >>> data = {
    ...     "timestamp_col": pd.date_range(
    ...         start="2021-01-01 13:00:00", periods=30, freq="1s"
    ...     ),
    ...     "int64_col": range(30),
    ...     "int64_too": range(10, 40),
    ... }

    Resample on a DataFrame with index:

    >>> df = bpd.DataFrame(data).set_index("timestamp_col")
    >>> df.resample(rule="7s").min()
    int64_col int64_too
    2021-01-01 12:59:55 0 10
    2021-01-01 13:00:02 2 12
    2021-01-01 13:00:09 9 19
    2021-01-01 13:00:16 16 26
    2021-01-01 13:00:23 23 33
    <BLANKLINE>
    [5 rows x 2 columns]

    Resample with column and origin set to 'start':

    >>> df = bpd.DataFrame(data)
    >>> df.resample(rule="7s", on="timestamp_col", origin="start").min()
    int64_col int64_too
    2021-01-01 13:00:00 0 10
    2021-01-01 13:00:07 7 17
    2021-01-01 13:00:14 14 24
    2021-01-01 13:00:21 21 31
    2021-01-01 13:00:28 28 38
    <BLANKLINE>
    [5 rows x 2 columns]

    Args:
        rule (str):
            The offset string representing target conversion.
        on (str, default None):
            For a DataFrame, column to use instead of index for
            resampling. Column must be datetime-like.
        level (str or int, default None):
            For a MultiIndex, level (name or number) to use for
            resampling. level must be datetime-like.
        origin (str, default 'start_day'):
            The timestamp on which to adjust the grouping. Must be one of
            the following:

            - 'epoch': origin is 1970-01-01
            - 'start': origin is the first value of the timeseries
            - 'start_day': origin is the first day at midnight of the
              timeseries

    Returns:
        Resampler: Resampler object.
    """
    # Abstract placeholder: this definition only carries the signature and
    # documentation, and always raises with the shared abstract-method message.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
Loading