
feat: add dataframe melt #116


Merged: 13 commits, Oct 28, 2023
41 changes: 39 additions & 2 deletions bigframes/core/blocks.py
@@ -1356,13 +1356,50 @@ def stack(self, how="left", levels: int = 1):
         index_columns = [*added_index_columns, *self.index_columns]
         index_labels = [*new_index_level_names, *self._index_labels]

-        block = Block(
+        return Block(
             unpivot_expr,
             index_columns=index_columns,
             column_labels=result_index,
             index_labels=index_labels,
         )
-        return block

+    def melt(
+        self,
+        id_vars: typing.Sequence[str],
+        value_vars: typing.Sequence[str],
+        var_names: typing.Sequence[typing.Hashable],
+        value_name: typing.Hashable = "value",
+    ):
+        # TODO: Implement col_level and ignore_index
+        unpivot_col_id = guid.generate_guid()
+        var_col_ids = tuple([guid.generate_guid() for _ in var_names])
+        # A single unpivot column collects every value column
+        unpivot_col = (unpivot_col_id, tuple(value_vars))
+        value_labels = [self.col_id_to_label[col_id] for col_id in value_vars]
+        id_labels = [self.col_id_to_label[col_id] for col_id in id_vars]
+
+        dtype = self._expr.get_column_type(value_vars[0])
+
+        unpivot_expr = self._expr.unpivot(
+            row_labels=value_labels,
+            passthrough_columns=id_vars,
+            unpivot_columns=(unpivot_col,),
+            index_col_ids=var_col_ids,
+            dtype=dtype,
+            how="right",
+        )
+        index_id = guid.generate_guid()
+        unpivot_expr = unpivot_expr.promote_offsets(index_id)
+        # Reorder so id_vars come before the var and value columns
+        unpivot_expr = unpivot_expr.select_columns(
+            [index_id, *id_vars, *var_col_ids, unpivot_col_id]
+        )
+
+        return Block(
+            unpivot_expr,
+            column_labels=[*id_labels, *var_names, value_name],
+            index_columns=[index_id],
+        )

     def _create_stack_column(
         self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple]
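
A minimal pure-pandas sketch of the semantics Block.melt implements above: each value column becomes one slice of a single long value column, the original column labels are repeated in the new variable column, and identifier columns pass through unchanged. The helper name and the variable-major ordering (which matches what pandas.melt produces) are illustrative, not bigframes internals.

import pandas as pd

def melt_sketch(df, id_vars, value_vars, var_name="variable", value_name="value"):
    # One frame per value column; concatenating them variable-major
    # reproduces the row ordering pandas.melt produces.
    frames = [
        pd.DataFrame(
            {
                **{k: df[k] for k in id_vars},  # passthrough (id) columns
                var_name: col,  # repeated label for this slice
                value_name: df[col],  # the unpivoted values
            }
        )
        for col in value_vars
    ]
    return pd.concat(frames, ignore_index=True)
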
38 changes: 38 additions & 0 deletions bigframes/dataframe.py
@@ -1673,6 +1673,44 @@ def idxmin(self) -> bigframes.series.Series:
     def idxmax(self) -> bigframes.series.Series:
         return bigframes.series.Series(block_ops.idxmax(self._block))

+    def melt(
+        self,
+        id_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None,
+        value_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None,
+        var_name: typing.Union[
+            typing.Hashable, typing.Sequence[typing.Hashable]
+        ] = None,
+        value_name: typing.Hashable = "value",
+    ):
+        if var_name is None:
+            # Determine default var_name. Attempt to use column labels if they are unique
+            if self.columns.nlevels > 1:
+                if len(set(self.columns.names)) == len(self.columns.names):
+                    var_name = self.columns.names
+                else:
+                    var_name = [
+                        f"variable_{i}" for i in range(len(self.columns.names))
+                    ]
+            else:
+                var_name = self.columns.name or "variable"
+
+        var_name = tuple(var_name) if utils.is_list_like(var_name) else (var_name,)
+
+        if id_vars is not None:
+            id_col_ids = [self._resolve_label_exact(col) for col in id_vars]
+        else:
+            id_col_ids = []
+        if value_vars is not None:
+            val_col_ids = [self._resolve_label_exact(col) for col in value_vars]
+        else:
+            val_col_ids = [
+                col_id
+                for col_id in self._block.value_columns
+                if col_id not in id_col_ids
+            ]
+
+        return DataFrame(
+            self._block.melt(id_col_ids, val_col_ids, var_name, value_name)
+        )

     def describe(self) -> DataFrame:
         df_numeric = self._drop_non_numeric(keep_bool=False)
         if len(df_numeric.columns) == 0:
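
A quick usage sketch of the new DataFrame.melt API, mirroring the system tests below; the literal data is illustrative and assumes an active BigQuery session:

import bigframes.pandas as bpd

df = bpd.DataFrame(
    {"string_col": ["a", "b"], "int64_col": [1, 2], "int64_too": [3, 4]}
)

# Defaults: every column is unpivoted into "variable"/"value" columns.
long_default = df.melt()

# Parameterized: keep string_col as an identifier and rename the outputs.
long_named = df.melt(
    id_vars=["string_col"],
    value_vars=["int64_col", "int64_too"],
    var_name="alice",
    value_name="bob",
)
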
43 changes: 43 additions & 0 deletions tests/system/small/test_dataframe.py
@@ -1919,6 +1919,49 @@ def test_df_stack(scalars_dfs):
     pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)


+def test_df_melt_default(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    # To match bigquery dataframes
+    scalars_pandas_df = scalars_pandas_df.copy()
+    scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]")
+    # Can only melt identically-typed columns
+    columns = ["int64_col", "int64_too", "rowindex_2"]
+
+    bf_result = scalars_df[columns].melt().to_pandas()
+    pd_result = scalars_pandas_df[columns].melt()
+
+    # Pandas produces int64 index, BigFrames produces Int64 (nullable)
+    pd.testing.assert_frame_equal(
+        bf_result, pd_result, check_index_type=False, check_dtype=False
+    )
+
+
+def test_df_melt_parameterized(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    # To match bigquery dataframes
+    scalars_pandas_df = scalars_pandas_df.copy()
+    scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]")
+    # Can only melt identically-typed columns
+
+    bf_result = scalars_df.melt(
+        var_name="alice",
+        value_name="bob",
+        id_vars=["string_col"],
+        value_vars=["int64_col", "int64_too"],
+    ).to_pandas()
+    pd_result = scalars_pandas_df.melt(
+        var_name="alice",
+        value_name="bob",
+        id_vars=["string_col"],
+        value_vars=["int64_col", "int64_too"],
+    )
+
+    # Pandas produces int64 index, BigFrames produces Int64 (nullable)
+    pd.testing.assert_frame_equal(
+        bf_result, pd_result, check_index_type=False, check_dtype=False
+    )


 def test_df_unstack(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     # To match bigquery dataframes
28 changes: 28 additions & 0 deletions tests/system/small/test_multiindex.py
@@ -752,6 +752,34 @@ def test_column_multi_index_stack(level):
     )


+def test_column_multi_index_melt():
+    if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"):
+        pytest.skip("pandas <2.1 uses different melt implementation")
+
+    level1 = pandas.Index(["b", "a", "b"])
+    level2 = pandas.Index(["a", "b", "b"])
+    level3 = pandas.Index(["b", "b", "a"])
+
+    multi_columns = pandas.MultiIndex.from_arrays(
+        [level1, level2, level3], names=["l1", "l2", "l3"]
+    )
+    pd_df = pandas.DataFrame(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        index=[5, 2, None],
+        columns=multi_columns,
+        dtype="Int64",
+    )
+    bf_df = bpd.DataFrame(pd_df)
+
+    bf_result = bf_df.melt().to_pandas()
+    pd_result = pd_df.melt()
+
+    # BigFrames uses different string and int types, but values are identical
+    pandas.testing.assert_frame_equal(
+        bf_result, pd_result, check_index_type=False, check_dtype=False
+    )


 def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index):
     columns = ["int64_too", "int64_col", "rowindex_2"]
     level1 = pandas.Index(["b", "a", "b"], dtype="string[pyarrow]")
28 changes: 28 additions & 0 deletions third_party/bigframes_vendored/pandas/core/frame.py
@@ -2010,6 +2010,34 @@ def idxmax(self):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

+    def melt(self, id_vars, value_vars, var_name, value_name):
+        """
+        Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.
+
+        This function is useful to massage a DataFrame into a format where one
+        or more columns are identifier variables (`id_vars`), while all other
+        columns, considered measured variables (`value_vars`), are "unpivoted" to
+        the row axis, leaving just two non-identifier columns, 'variable' and
+        'value'.
+
+        Args:
+            id_vars (tuple, list, or ndarray, optional):
+                Column(s) to use as identifier variables.
+            value_vars (tuple, list, or ndarray, optional):
+                Column(s) to unpivot. If not specified, uses all columns that
+                are not set as `id_vars`.
+            var_name (scalar):
+                Name to use for the 'variable' column. If None it uses
+                ``frame.columns.name`` or 'variable'.
+            value_name (scalar, default 'value'):
+                Name to use for the 'value' column.
+
+        Returns:
+            DataFrame: Unpivoted DataFrame.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

     def nunique(self):
         """
         Count number of distinct elements in specified axis.
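
To make the documented contract concrete, a worked example of the melt semantics the docstring describes, using the standard example data from the pandas documentation:

import pandas as pd

df = pd.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]})
print(df.melt(id_vars=["A"], value_vars=["B"]))
#    A variable  value
# 0  a        B      1
# 1  b        B      3
# 2  c        B      5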