From 4670078d5dbd2af608a0f9770690b260e163b0ba Mon Sep 17 00:00:00 2001 From: Henry J Solberg Date: Fri, 3 Nov 2023 21:12:32 +0000 Subject: [PATCH 1/4] fix: match pandas behavior when assigning to empty dfs --- bigframes/dataframe.py | 31 ++++++++++++++++++++-------- tests/system/small/test_dataframe.py | 31 ++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 45dbcdc78d..bc8922c590 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1100,7 +1100,8 @@ def _assign_single_item( elif utils.is_list_like(v): given_rows = len(v) actual_rows = len(self) - if given_rows != actual_rows: + assigning_to_empty_df = len(self.columns) == 0 and actual_rows == 0 + if not assigning_to_empty_df and given_rows != actual_rows: raise ValueError( f"Length of values ({given_rows}) does not match length of index ({actual_rows})" ) @@ -1114,14 +1115,26 @@ def _assign_single_item( new_column_block = local_df._block original_index_column_ids = self._block.index_columns self_block = self._block.reset_index(drop=False) - result_index, (get_column_left, get_column_right) = self_block.index.join( - new_column_block.index, how="left", block_identity_join=True - ) - result_block = result_index._block - result_block = result_block.set_index( - [get_column_left[col_id] for col_id in original_index_column_ids], - index_labels=self._block.index_labels, - ) + if assigning_to_empty_df: + if len(self._block.index_columns) > 1: + # match error raised by pandas here + raise ValueError( + "Assigning listlike to a first column under multiindex is not supported." + ) + result_block = local_df._block + result_block = result_block.with_index_labels(self._block.index_labels) + else: + result_index, ( + get_column_left, + get_column_right, + ) = self_block.index.join( + new_column_block.index, how="left", block_identity_join=True + ) + result_block = result_index._block + result_block = result_block.set_index( + [get_column_left[col_id] for col_id in original_index_column_ids], + index_labels=self._block.index_labels, + ) return DataFrame(result_block) else: return self._assign_scalar(k, v) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 2b710d692a..4b968d34af 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -567,6 +567,37 @@ def test_assign_existing_column(scalars_dfs): assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) +def test_assign_to_empty_df(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + empty_df = scalars_df.drop(columns=list(scalars_df.columns)) + empty_pandas_df = scalars_pandas_df.drop(columns=list(scalars_pandas_df.columns)) + + bf_result = empty_df.assign(new_col=[1, 2, 3]) + pd_result = empty_pandas_df.assign(new_col=[1, 2, 3]) + + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + assert_pandas_df_equal_ignore_ordering(bf_result.to_pandas(), pd_result) + + +def test_assign_to_empty_df_multiindex_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + empty_df = scalars_df.drop(columns=list(scalars_df.columns)) + empty_pandas_df = scalars_pandas_df.drop(columns=list(scalars_pandas_df.columns)) + empty_df["empty_col_1"] = [] + empty_df["empty_col_2"] = [] + empty_pandas_df["empty_col_1"] = [] + empty_pandas_df["empty_col_2"] = [] + empty_df = empty_df.set_index(["empty_col_1", "empty_col_2"]) + empty_pandas_df = empty_pandas_df.set_index(["empty_col_1", "empty_col_2"]) + + with pytest.raises(ValueError): + empty_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) + with pytest.raises(ValueError): + empty_pandas_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) + + def test_assign_series(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs column_name = "int64_col" From d8b1b59401e1fb0429770de5248fc514548d6609 Mon Sep 17 00:00:00 2001 From: Henry J Solberg Date: Sat, 4 Nov 2023 17:13:15 +0000 Subject: [PATCH 2/4] use dataframe constructor in test --- tests/system/small/test_dataframe.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b1c876d414..d6b52d1323 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -567,24 +567,21 @@ def test_assign_existing_column(scalars_dfs): assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) -def test_assign_to_empty_df(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - empty_df = scalars_df.drop(columns=list(scalars_df.columns)) - empty_pandas_df = scalars_pandas_df.drop(columns=list(scalars_pandas_df.columns)) +def test_assign_to_empty_df(session): + empty_df = dataframe.DataFrame(session=session) + empty_pandas_df = pd.DataFrame() bf_result = empty_df.assign(new_col=[1, 2, 3]) pd_result = empty_pandas_df.assign(new_col=[1, 2, 3]) pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result.index = pd_result.index.astype("Int64") assert_pandas_df_equal_ignore_ordering(bf_result.to_pandas(), pd_result) -def test_assign_to_empty_df_multiindex_error(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - empty_df = scalars_df.drop(columns=list(scalars_df.columns)) - empty_pandas_df = scalars_pandas_df.drop(columns=list(scalars_pandas_df.columns)) +def test_assign_to_empty_df_multiindex_error(session): + empty_df = dataframe.DataFrame(session=session) + empty_pandas_df = pd.DataFrame() empty_df["empty_col_1"] = [] empty_df["empty_col_2"] = [] empty_pandas_df["empty_col_1"] = [] From bb915f446eee97fda82e2668f02a4c1ad348e83f Mon Sep 17 00:00:00 2001 From: Henry J Solberg Date: Sat, 4 Nov 2023 18:03:43 +0000 Subject: [PATCH 3/4] extract method for clarity --- bigframes/dataframe.py | 74 ++++++++++++++-------------- tests/system/small/test_dataframe.py | 2 +- 2 files changed, 37 insertions(+), 39 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ce211e6168..6976883a2b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1100,50 +1100,48 @@ def _assign_single_item( copy[k] = v(copy) return copy elif utils.is_list_like(v): - given_rows = len(v) - actual_rows = len(self) - assigning_to_empty_df = len(self.columns) == 0 and actual_rows == 0 - if not assigning_to_empty_df and given_rows != actual_rows: - raise ValueError( - f"Length of values ({given_rows}) does not match length of index ({actual_rows})" - ) + return self._assign_single_item_listlike(k, v) + else: + return self._assign_scalar(k, v) - local_df = bigframes.dataframe.DataFrame( - {k: v}, session=self._get_block().expr.session + def _assign_single_item_listlike(self, k: str, v: Sequence) -> DataFrame: + given_rows = len(v) + actual_rows = len(self) + assigning_to_empty_df = len(self.columns) == 0 and actual_rows == 0 + if not assigning_to_empty_df and given_rows != actual_rows: + raise ValueError( + f"Length of values ({given_rows}) does not match length of index ({actual_rows})" ) - # local_df is likely (but not guarunteed) to be cached locally - # since the original list came from memory and so is probably < MAX_INLINE_DF_SIZE - - new_column_block = local_df._block - original_index_column_ids = self._block.index_columns - self_block = self._block.reset_index(drop=False) - if assigning_to_empty_df: - if len(self._block.index_columns) > 1: - # match error raised by pandas here - raise ValueError( - "Assigning listlike to a first column under multiindex is not supported." - ) - result_block = local_df._block - result_block = result_block.with_index_labels(self._block.index_labels) - else: - result_index, ( - get_column_left, - get_column_right, - ) = self_block.index.join( - new_column_block.index, how="left", block_identity_join=True - ) - result_block = result_index._block - result_block = result_block.set_index( - [get_column_left[col_id] for col_id in original_index_column_ids], - index_labels=self._block.index_labels, + + local_df = bigframes.dataframe.DataFrame( + {k: v}, session=self._get_block().expr.session + ) + # local_df is likely (but not guarunteed) to be cached locally + # since the original list came from memory and so is probably < MAX_INLINE_DF_SIZE + + new_column_block = local_df._block + original_index_column_ids = self._block.index_columns + self_block = self._block.reset_index(drop=False) + if assigning_to_empty_df: + if len(self._block.index_columns) > 1: + # match error raised by pandas here + raise ValueError( + "Assigning listlike to a first column under multiindex is not supported." ) - return DataFrame(result_block) + result_block = new_column_block.with_index_labels(self._block.index_labels) + result_block = result_block.with_column_labels([k]) else: - return self._assign_scalar(k, v) + result_index, (get_column_left, get_column_right,) = self_block.index.join( + new_column_block.index, how="left", block_identity_join=True + ) + result_block = result_index._block + result_block = result_block.set_index( + [get_column_left[col_id] for col_id in original_index_column_ids], + index_labels=self._block.index_labels, + ) + return DataFrame(result_block) def _assign_scalar(self, label: str, value: Union[int, float]) -> DataFrame: - # TODO(swast): Make sure that k is the ID / SQL name, not a label, - # which could be invalid SQL. col_ids = self._block.cols_matching_label(label) block, constant_col_id = self._block.create_constant(value, label) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index d6b52d1323..a2583e0815 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -567,7 +567,7 @@ def test_assign_existing_column(scalars_dfs): assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) -def test_assign_to_empty_df(session): +def test_assign_listlike_to_empty_df(session): empty_df = dataframe.DataFrame(session=session) empty_pandas_df = pd.DataFrame() From 82320565ba9d7f419d888a9b9cb0b2b484f90e6a Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 7 Nov 2023 16:44:57 -0600 Subject: [PATCH 4/4] Update bigframes/dataframe.py --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 6976883a2b..27d89533b9 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1116,7 +1116,7 @@ def _assign_single_item_listlike(self, k: str, v: Sequence) -> DataFrame: local_df = bigframes.dataframe.DataFrame( {k: v}, session=self._get_block().expr.session ) - # local_df is likely (but not guarunteed) to be cached locally + # local_df is likely (but not guaranteed) to be cached locally # since the original list came from memory and so is probably < MAX_INLINE_DF_SIZE new_column_block = local_df._block