Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 35 additions & 3 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1939,21 +1939,53 @@ def test_cummax_int(scalars_df_index, scalars_pandas_df_index):
)


# NOTE(review): this span comes from a rendered diff. The unparametrized ``def``
# directly below and the two ``value_counts()`` calls without ``**kwargs`` appear
# to be the pre-change lines; confirm against the merged file.
def test_value_counts(scalars_dfs):
# Each parametrized case supplies one set of keyword arguments that is forwarded
# unchanged to ``value_counts`` on both the bigframes and the pandas Series.
@pytest.mark.parametrize(
("kwargs"),
[
{},
{"normalize": True},
{"ascending": True},
],
ids=[
"default",
"normalize",
"ascending",
],
)
def test_value_counts(scalars_dfs, kwargs):
# Skipped on pandas 1.x because it produces different labels (see skip message).
if pd.__version__.startswith("1."):
pytest.skip("pandas 1.x produces different column labels.")
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "int64_too"

bf_result = scalars_df[col_name].value_counts().to_pandas()
pd_result = scalars_pandas_df[col_name].value_counts()
# Run the same call on both sides; the bigframes result is materialized to
# pandas so the two can be compared directly.
bf_result = scalars_df[col_name].value_counts(**kwargs).to_pandas()
pd_result = scalars_pandas_df[col_name].value_counts(**kwargs)

# Compared with pandas' own ``assert_series_equal``; no order-insensitive
# option is passed here.
pd.testing.assert_series_equal(
bf_result,
pd_result,
)


def test_value_counts_with_na(scalars_dfs):
    """Check ``value_counts(dropna=False)`` against pandas, ignoring row order."""
    bf_frame, pd_frame = scalars_dfs
    column = "int64_col"

    actual = bf_frame[column].value_counts(dropna=False).to_pandas()
    expected = pd_frame[column].value_counts(dropna=False)

    # Older pandas versions may not set these labels; bigframes tries to
    # emulate pandas 2.0+, so align the expected result explicitly.
    expected.name = "count"
    expected.index.name = column

    assert_series_equal(
        actual,
        expected,
        # bigframes value_counts does not honor the ordering of the original data
        ignore_order=True,
    )


def test_value_counts_w_cut(scalars_dfs):
if pd.__version__.startswith("1."):
pytest.skip("value_counts results different in pandas 1.x.")
Expand Down
56 changes: 56 additions & 0 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4216,6 +4216,62 @@ def value_counts(
"""
Return a Series containing counts of unique rows in the DataFrame.

**Examples:**

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None

>>> df = bpd.DataFrame({'num_legs': [2, 4, 4, 6, 7],
... 'num_wings': [2, 0, 0, 0, bpd.NA]},
... index=['falcon', 'dog', 'cat', 'ant', 'octopus'],
... dtype='Int64')
>>> df
num_legs num_wings
falcon 2 2
dog 4 0
cat 4 0
ant 6 0
octopus 7 <NA>
<BLANKLINE>
[5 rows x 2 columns]

``value_counts`` sorts the result by counts in descending order by default:

>>> df.value_counts()
num_legs num_wings
4 0 2
2 2 1
6 0 1
Name: count, dtype: Int64

You can normalize the counts to return relative frequencies by setting ``normalize=True``:

>>> df.value_counts(normalize=True)
num_legs num_wings
4 0 0.5
2 2 0.25
6 0 0.25
Name: proportion, dtype: Float64

You can get the rows in ascending order of the counts by setting ``ascending=True``:

>>> df.value_counts(ascending=True)
num_legs num_wings
2 2 1
6 0 1
4 0 2
Name: count, dtype: Int64

You can include the counts of the rows with ``NA`` values by setting ``dropna=False``:

>>> df.value_counts(dropna=False)
num_legs num_wings
4 0 2
2 2 1
6 0 1
7 <NA> 1
Name: count, dtype: Int64

Args:
subset (label or list of labels, optional):
Columns to use when counting unique combinations.
Expand Down
75 changes: 75 additions & 0 deletions third_party/bigframes_vendored/pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2035,6 +2035,59 @@ def value_counts(
first element is the most frequently-occurring element.
Excludes NA values by default.

**Examples:**

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None

>>> s = bpd.Series([3, 1, 2, 3, 4, bpd.NA], dtype="Int64")

>>> s
0 3
1 1
2 2
3 3
4 4
5 <NA>
dtype: Int64

``value_counts`` sorts the result by counts in descending order by default:

>>> s.value_counts()
3 2
1 1
2 1
4 1
Name: count, dtype: Int64

You can normalize the counts to return relative frequencies by setting ``normalize=True``:

>>> s.value_counts(normalize=True)
3 0.4
1 0.2
2 0.2
4 0.2
Name: proportion, dtype: Float64

You can get the values in ascending order of the counts by setting ``ascending=True``:

>>> s.value_counts(ascending=True)
1 1
2 1
4 1
3 2
Name: count, dtype: Int64

You can include the counts of the ``NA`` values by setting ``dropna=False``:

>>> s.value_counts(dropna=False)
3 2
1 1
2 1
4 1
<NA> 1
Name: count, dtype: Int64

Args:
normalize (bool, default False):
If True then the object returned will contain the relative
Expand Down Expand Up @@ -2167,3 +2220,25 @@ def iat(self):
def at(self):
"""Access a single value for a row/column label pair."""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

@property
def values(self):
    """
    Get the Series data as an ndarray or an ndarray-like object.

    Which of the two is returned depends on the dtype of the Series.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None

        >>> bpd.Series([1, 2, 3]).values
        array([1, 2, 3], dtype=object)

        >>> bpd.Series(list('aabc')).values
        array(['a', 'a', 'b', 'c'], dtype=object)

    Returns:
        numpy.ndarray or ndarray-like: Values in the Series.
    """
    # Abstract stub: this definition only raises.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)