From 3eec0102f3ca5863e181472f88d1814d5f5aab25 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 30 Nov 2023 08:03:59 +0000 Subject: [PATCH 1/5] docs: add code samples for `values` and `value_counts` --- tests/system/small/test_series.py | 38 ++++++++- .../bigframes_vendored/pandas/core/frame.py | 56 +++++++++++++ .../bigframes_vendored/pandas/core/series.py | 80 +++++++++++++++++++ 3 files changed, 171 insertions(+), 3 deletions(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index d9fc23fad0..fd778e97a3 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1939,12 +1939,25 @@ def test_cummax_int(scalars_df_index, scalars_pandas_df_index): ) -def test_value_counts(scalars_dfs): +@pytest.mark.parametrize( + ("kwargs"), + [ + {}, + {"normalize": True}, + {"ascending": True}, + ], + ids=[ + "default", + "normalize", + "ascending", + ], +) +def test_value_counts(scalars_dfs, kwargs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - bf_result = scalars_df[col_name].value_counts().to_pandas() - pd_result = scalars_pandas_df[col_name].value_counts() + bf_result = scalars_df[col_name].value_counts(**kwargs).to_pandas() + pd_result = scalars_pandas_df[col_name].value_counts(**kwargs) # Older pandas version may not have these values, bigframes tries to emulate 2.0+ pd_result.name = "count" @@ -1956,6 +1969,25 @@ def test_value_counts(scalars_dfs): ) +def test_value_counts_with_na(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + + bf_result = scalars_df[col_name].value_counts(dropna=False).to_pandas() + pd_result = scalars_pandas_df[col_name].value_counts(dropna=False) + + # Older pandas version may not have these values, bigframes tries to emulate 2.0+ + pd_result.name = "count" + pd_result.index.name = col_name + + assert_series_equal( + bf_result, + pd_result, + # bigframes value_counts does not honor ordering in the original data 
ignore_order=True, + ) + + def test_value_counts_w_cut(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 099d8b8e66..753deb1f41 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3613,6 +3613,62 @@ def value_counts( """ Return a Series containing counts of unique rows in the DataFrame. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'num_legs': [2, 4, 4, 6, 7], + ... 'num_wings': [2, 0, 0, 0, bpd.NA]}, + ... index=['falcon', 'dog', 'cat', 'ant', 'octopus'], + ... dtype='Int64') + >>> df + num_legs num_wings + falcon 2 2 + dog 4 0 + cat 4 0 + ant 6 0 + octopus 7 + + [5 rows x 2 columns] + + ``value_counts`` sorts the result by counts in a descending order by default: + + >>> df.value_counts() + num_legs num_wings + 4 0 2 + 2 2 1 + 6 0 1 + Name: count, dtype: Int64 + + You can normalize the counts to return relative frequencies by setting ``normalize=True``: + + >>> df.value_counts(normalize=True) + num_legs num_wings + 4 0 0.5 + 2 2 0.25 + 6 0 0.25 + Name: count, dtype: Float64 + + You can get the rows in the ascending order of the counts by setting ``ascending=True``: + + >>> df.value_counts(ascending=True) + num_legs num_wings + 2 2 1 + 6 0 1 + 4 0 2 + Name: count, dtype: Int64 + + You can include the counts of the rows with ``NA`` values by setting ``dropna=False``: + + >>> df.value_counts(dropna=False) + num_legs num_wings + 4 0 2 + 2 2 1 + 6 0 1 + 7 1 + Name: count, dtype: Int64 + Args: subset (label or list of labels, optional): Columns to use when counting unique combinations. 
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 1b751ed83b..288c8150e8 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2022,6 +2022,58 @@ def value_counts( first element is the most frequently-occurring element. Excludes NA values by default. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([3, 1, 2, 3, 4, bpd.NA], dtype="Int64") + + >>> s + 0 3 + 1 1 + 2 2 + 3 3 + 4 4 + 5 + dtype: Int64 + + ``value_counts`` sorts the result by counts in a descending order by default: + >>> s.value_counts() + 3 2 + 1 1 + 2 1 + 4 1 + Name: count, dtype: Int64 + + You can normalize the counts to return relative frequencies by setting ``normalize=True``: + + >>> s.value_counts(normalize=True) + 3 0.4 + 1 0.2 + 2 0.2 + 4 0.2 + Name: count, dtype: Float64 + + You can get the values in the ascending order of the counts by setting ``ascending=True``: + + >>> s.value_counts(ascending=True) + 1 1 + 2 1 + 4 1 + 3 2 + Name: count, dtype: Int64 + + You can include the counts of the ``NA`` values by setting ``dropna=False``: + + >>> s.value_counts(dropna=False) + 3 2 + 1 1 + 2 1 + 4 1 + 1 + Name: count, dtype: Int64 + Args: normalize (bool, default False): If True then the object returned will contain the relative @@ -2154,3 +2206,31 @@ def iat(self): def at(self): """Access a single value for a row/column label pair.""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @property + def values(self): + """ + Return Series as ndarray or ndarray-like depending on the dtype. + + .. warning:: + + We recommend using :attr:`Series.array` or + :meth:`Series.to_numpy`, depending on whether you need + a reference to the underlying data or a NumPy array. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> bpd.Series([1, 2, 3]).values + array([1, 2, 3], dtype=object) + + >>> bpd.Series(list('aabc')).values + array(['a', 'a', 'b', 'c'], dtype=object) + + Returns: + numpy.ndarray or ndarray-like + + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 5b2400cfff6c159ea2e1918b2d79dbe767996d05 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 30 Nov 2023 22:14:44 +0000 Subject: [PATCH 2/5] add newline in docstring for correct rendering --- third_party/bigframes_vendored/pandas/core/series.py | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 288c8150e8..1f50f4d993 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2039,6 +2039,7 @@ def value_counts( dtype: Int64 ``value_counts`` sorts the result by counts in a descending order by default: + >>> s.value_counts() 3 2 1 1 From 6e2d15b3bafc0c7dd4f56a3ce34db610f8437ee1 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 1 Dec 2023 02:44:35 +0000 Subject: [PATCH 3/5] fix warning and return rendering --- third_party/bigframes_vendored/pandas/core/series.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 1f50f4d993..7bf9b81698 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2214,7 +2214,6 @@ def values(self): Return Series as ndarray or ndarray-like depending on the dtype. .. warning:: - We recommend using :attr:`Series.array` or :meth:`Series.to_numpy`, depending on whether you need a reference to the underlying data or a NumPy array. 
@@ -2231,7 +2230,7 @@ def values(self): array(['a', 'a', 'b', 'c'], dtype=object) Returns: - numpy.ndarray or ndarray-like + numpy.ndarray or ndarray-like: Values in the Series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 08de7884a9f76056d7d271b9bf1c26b5744f461c Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 8 Dec 2023 22:00:02 +0000 Subject: [PATCH 4/5] remove differentiation docstring for values with other methods --- third_party/bigframes_vendored/pandas/core/series.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 7bf9b81698..bf3d77ea59 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2213,11 +2213,6 @@ def values(self): """ Return Series as ndarray or ndarray-like depending on the dtype. - .. warning:: - We recommend using :attr:`Series.array` or - :meth:`Series.to_numpy`, depending on whether you need - a reference to the underlying data or a NumPy array. 
- **Examples:** >>> import bigframes.pandas as bpd From 266db5e9d24190bfa4241d854fac08384df95268 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 12 Dec 2023 21:06:03 +0000 Subject: [PATCH 5/5] adjust series name and whitespaces in the docstrings --- third_party/bigframes_vendored/pandas/core/frame.py | 2 +- third_party/bigframes_vendored/pandas/core/series.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 096949bd8f..71a5c635bc 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3957,7 +3957,7 @@ def value_counts( 4 0 0.5 2 2 0.25 6 0 0.25 - Name: count, dtype: Float64 + Name: proportion, dtype: Float64 You can get the rows in the ascending order of the counts by setting ``ascending=True``: diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 790bb7734e..e6ba6133aa 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2063,11 +2063,11 @@ def value_counts( You can normalize the counts to return relative frequencies by setting ``normalize=True``: >>> s.value_counts(normalize=True) - 3 0.4 - 1 0.2 - 2 0.2 - 4 0.2 - Name: count, dtype: Float64 + 3 0.4 + 1 0.2 + 2 0.2 + 4 0.2 + Name: proportion, dtype: Float64 You can get the values in the ascending order of the counts by setting ``ascending=True``: