diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 92a7b6f099..623da74aa4 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1939,14 +1939,27 @@ def test_cummax_int(scalars_df_index, scalars_pandas_df_index): ) -def test_value_counts(scalars_dfs): +@pytest.mark.parametrize( + ("kwargs"), + [ + {}, + {"normalize": True}, + {"ascending": True}, + ], + ids=[ + "default", + "normalize", + "ascending", + ], +) +def test_value_counts(scalars_dfs, kwargs): if pd.__version__.startswith("1."): pytest.skip("pandas 1.x produces different column labels.") scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - bf_result = scalars_df[col_name].value_counts().to_pandas() - pd_result = scalars_pandas_df[col_name].value_counts() + bf_result = scalars_df[col_name].value_counts(**kwargs).to_pandas() + pd_result = scalars_pandas_df[col_name].value_counts(**kwargs) pd.testing.assert_series_equal( bf_result, @@ -1954,6 +1967,25 @@ def test_value_counts(scalars_dfs): ) +def test_value_counts_with_na(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + + bf_result = scalars_df[col_name].value_counts(dropna=False).to_pandas() + pd_result = scalars_pandas_df[col_name].value_counts(dropna=False) + + # Older pandas versions may not have these values; bigframes tries to emulate 2.0+ + pd_result.name = "count" + pd_result.index.name = col_name + + assert_series_equal( + bf_result, + pd_result, + # bigframes value_counts does not honor ordering in the original data + ignore_order=True, + ) + + def test_value_counts_w_cut(scalars_dfs): if pd.__version__.startswith("1."): pytest.skip("value_counts results different in pandas 1.x.") diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 4753bfc589..c082b87336 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ 
b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4216,6 +4216,62 @@ def value_counts( """ Return a Series containing counts of unique rows in the DataFrame. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'num_legs': [2, 4, 4, 6, 7], + ... 'num_wings': [2, 0, 0, 0, bpd.NA]}, + ... index=['falcon', 'dog', 'cat', 'ant', 'octopus'], + ... dtype='Int64') + >>> df + num_legs num_wings + falcon 2 2 + dog 4 0 + cat 4 0 + ant 6 0 + octopus 7 <NA> + + [5 rows x 2 columns] + + ``value_counts`` sorts the result by counts in descending order by default: + + >>> df.value_counts() + num_legs num_wings + 4 0 2 + 2 2 1 + 6 0 1 + Name: count, dtype: Int64 + + You can normalize the counts to return relative frequencies by setting ``normalize=True``: + + >>> df.value_counts(normalize=True) + num_legs num_wings + 4 0 0.5 + 2 2 0.25 + 6 0 0.25 + Name: proportion, dtype: Float64 + + You can get the rows in ascending order of their counts by setting ``ascending=True``: + + >>> df.value_counts(ascending=True) + num_legs num_wings + 2 2 1 + 6 0 1 + 4 0 2 + Name: count, dtype: Int64 + + You can include the counts of the rows with ``NA`` values by setting ``dropna=False``: + + >>> df.value_counts(dropna=False) + num_legs num_wings + 4 0 2 + 2 2 1 + 6 0 1 + 7 <NA> 1 + Name: count, dtype: Int64 + Args: subset (label or list of labels, optional): Columns to use when counting unique combinations. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 48bcca4ad8..8303df5ef4 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2035,6 +2035,59 @@ def value_counts( first element is the most frequently-occurring element. Excludes NA values by default. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([3, 1, 2, 3, 4, bpd.NA], dtype="Int64") + + >>> s + 0 3 + 1 1 + 2 2 + 3 3 + 4 4 + 5 <NA> + dtype: Int64 + + ``value_counts`` sorts the result by counts in descending order by default: + + >>> s.value_counts() + 3 2 + 1 1 + 2 1 + 4 1 + Name: count, dtype: Int64 + + You can normalize the counts to return relative frequencies by setting ``normalize=True``: + + >>> s.value_counts(normalize=True) + 3 0.4 + 1 0.2 + 2 0.2 + 4 0.2 + Name: proportion, dtype: Float64 + + You can get the values in ascending order of their counts by setting ``ascending=True``: + + >>> s.value_counts(ascending=True) + 1 1 + 2 1 + 4 1 + 3 2 + Name: count, dtype: Int64 + + You can include the counts of the ``NA`` values by setting ``dropna=False``: + + >>> s.value_counts(dropna=False) + 3 2 + 1 1 + 2 1 + 4 1 + <NA> 1 + Name: count, dtype: Int64 + Args: normalize (bool, default False): If True then the object returned will contain the relative @@ -2167,3 +2220,25 @@ def iat(self): def at(self): """Access a single value for a row/column label pair.""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @property + def values(self): + """ + Return Series as ndarray or ndarray-like depending on the dtype. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> bpd.Series([1, 2, 3]).values + array([1, 2, 3], dtype=object) + + >>> bpd.Series(list('aabc')).values + array(['a', 'a', 'b', 'c'], dtype=object) + + Returns: + numpy.ndarray or ndarray-like: Values in the Series. + + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)