From 36fc6281686ea2dce3c184079163fd65d189038d Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 1 Dec 2023 21:40:31 +0000 Subject: [PATCH 01/10] docs: add example for dataframe.melt, dataframe.pivot, dataframe.stack, dataframe.unstack --- .../bigframes_vendored/pandas/core/frame.py | 167 ++++++++++++++++-- 1 file changed, 155 insertions(+), 12 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 08fe8e2de0..ecab3d2ba4 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3414,18 +3414,75 @@ def melt(self, id_vars, value_vars, var_name, value_name): the row axis, leaving just two non-identifier columns, 'variable' and 'value'. - Parameters - ---------- - id_vars (tuple, list, or ndarray, optional): - Column(s) to use as identifier variables. - value_vars (tuple, list, or ndarray, optional): - Column(s) to unpivot. If not specified, uses all columns that - are not set as `id_vars`. - var_name (scalar): - Name to use for the 'variable' column. If None it uses - ``frame.columns.name`` or 'variable'. - value_name (scalar, default 'value'): - Name to use for the 'value' column. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, None, 3, 4, 5], + ... "B": [1, 2, 3, 4, 5], + ... 
"C": [None, 3.5, None, 4.5, 5.0]}) + >>> df + A B C + 0 1.0 1 + 1 2 3.5 + 2 3.0 3 + 3 4.0 4 4.5 + 4 5.0 5 5.0 + + [5 rows x 3 columns] + + Using `melt` without optional arguments: + + >>> df.melt() + variable value + 0 A 1.0 + 1 A + 2 A 3.0 + 3 A 4.0 + 4 A 5.0 + 5 B 1.0 + 6 B 2.0 + 7 B 3.0 + 8 B 4.0 + 9 B 5.0 + 10 C + 11 C 3.5 + 12 C + 13 C 4.5 + 14 C 5.0 + + [15 rows x 2 columns] + + Using `melt` with `id_vars` and `value_vars`: + + >>> df.melt(id_vars='A', value_vars=['B', 'C']) + A variable value + 0 1.0 B 1 + 1 B 2 + 2 3.0 B 3 + 3 4.0 B 4 + 4 5.0 B 5 + 5 1.0 C + 6 C 3 + 7 3.0 C + 8 4.0 C 4 + 9 5.0 C 5 + + [10 rows x 3 columns] + + + Args: + id_vars (tuple, list, or ndarray, optional): + Column(s) to use as identifier variables. + value_vars (tuple, list, or ndarray, optional): + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name (scalar): + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. + value_name (scalar, default 'value'): + Name to use for the 'value' column. Returns: DataFrame: Unpivoted DataFrame. @@ -3647,6 +3704,52 @@ def pivot(self, *, columns, index=None, values=None): do not together uniquely identify input rows, the output will be silently non-deterministic. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... "foo": ["one", "one", "one", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B"], + ... "baz": [1, 2, 3, 4, 5], + ... "zoo": ['x', 'y', 'z', 'q', 'w'] + ... 
}) + + >>> df + foo bar baz zoo + 0 one A 1 x + 1 one B 2 y + 2 one C 3 z + 3 two A 4 q + 4 two B 5 w + + [5 rows x 4 columns] + + Using `pivot` without optional arguments: + + >>> df.pivot(columns='foo') + bar baz zoo + foo one two one two one two + 0 A 1 x + 1 B 2 y + 2 C 3 z + 3 A 4 q + 4 B 5 w + + [5 rows x 6 columns] + + Using `pivot` with `index` and `values`: + + >>> df.pivot(columns='foo', index='bar', values='baz') + foo one two + bar + A 1 4 + B 2 5 + C 3 + + [3 rows x 2 columns] + Args: columns (str or object or a list of str): Column to use to make new frame's columns. @@ -3682,6 +3785,26 @@ def stack(self): BigQuery DataFrames does not support stack operations that would combine columns of different dtypes. + **Example:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) + >>> df + A B + foo 1 2 + bar 3 4 + + [2 rows x 2 columns] + + >>> df.stack() + foo A 1 + B 2 + bar A 3 + B 4 + dtype: Int64 + Returns: DataFrame or Series: Stacked dataframe or series. """ @@ -3697,6 +3820,26 @@ def unstack(self): If the index is not a MultiIndex, the output will be a Series (the analogue of stack when the columns are not a MultiIndex). 
+ **Example:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [1, 3], 'B': [2, 4]}, index=['foo', 'bar']) + >>> df + A B + foo 1 2 + bar 3 4 + + [2 rows x 2 columns] + + >>> df.unstack() + A foo 1 + bar 3 + B foo 2 + bar 4 + dtype: Int64 + Returns: DataFrame or Series """ From 6b7aeec88c571ffd89f0fada462c184709b89fa3 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 1 Dec 2023 21:43:19 +0000 Subject: [PATCH 02/10] remove empty line --- third_party/bigframes_vendored/pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index ecab3d2ba4..9e10dab97a 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3471,7 +3471,6 @@ def melt(self, id_vars, value_vars, var_name, value_name): [10 rows x 3 columns] - Args: id_vars (tuple, list, or ndarray, optional): Column(s) to use as identifier variables. From 58ed6645a5131437971a6da4d870518519af123f Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 1 Dec 2023 22:05:58 +0000 Subject: [PATCH 03/10] docstring fix --- .../bigframes_vendored/pandas/core/frame.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 9e10dab97a..5382e16df3 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3766,7 +3766,7 @@ def pivot(self, *, columns, index=None, values=None): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def stack(self): + def stack(self, level=-1): """ Stack the prescribed level(s) from columns to index. 
@@ -3804,12 +3804,16 @@ def stack(self): B 4 dtype: Int64 + Args: + level (int, str, or list of these, default -1 (last level)): + Level(s) to stack from the column axis onto the index axis. + Returns: DataFrame or Series: Stacked dataframe or series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def unstack(self): + def unstack(self, level=-1): """ Pivot a level of the (necessarily hierarchical) index labels. @@ -3839,8 +3843,12 @@ def unstack(self): bar 4 dtype: Int64 + Args: + level (int, str, or list of these, default -1 (last level)): + Level(s) of index to unstack, can pass level name. + Returns: - DataFrame or Series + DataFrame or Series: Unstacked dataframe or series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From b0464203ad4d2f70b51dcde213d4b613621ece43 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 1 Dec 2023 22:34:24 +0000 Subject: [PATCH 04/10] spacing update --- third_party/bigframes_vendored/pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 5382e16df3..b95fc24c15 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3464,7 +3464,7 @@ def melt(self, id_vars, value_vars, var_name, value_name): 3 4.0 B 4 4 5.0 B 5 5 1.0 C - 6 C 3 + 6 C 3 7 3.0 C 8 4.0 C 4 9 5.0 C 5 From 16653a6ccfb5337e10779caee12e2f6b5a92821c Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Mon, 4 Dec 2023 16:18:16 -0800 Subject: [PATCH 05/10] docs: correct the params rendering for `ml.remote` and `ml.ensemble` modules (#248) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! 
Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) - `ensemble.RandomForestClassifier`: https://screenshot.googleplex.com/4Q88xgdm5hkaYXu - `ensemble.RandomForestRegressor`: https://screenshot.googleplex.com/3CU6pJBjYHQvnDo - `remote.VertexAIModel`: https://screenshot.googleplex.com/8SL2max6GfPMwFe Fixes internal issue 314150462 🦕 --- bigframes/ml/remote.py | 8 +-- docs/templates/toc.yml | 12 ++-- .../sklearn/ensemble/_forest.py | 72 +++++++++---------- 3 files changed, 46 insertions(+), 46 deletions(-) diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py index d4c34bbd0d..8da073802d 100644 --- a/bigframes/ml/remote.py +++ b/bigframes/ml/remote.py @@ -47,10 +47,10 @@ class VertexAIModel(base.BaseEstimator): Args: endpoint (str): Vertex AI https endpoint. - input ({column_name: column_type}): - Input schema. Supported types are "bool", "string", "int64", "float64", "array", "array", "array", "array". - output ({column_name: column_type}): - Output label schema. Supported the same types as the input. + input (Mapping): + Input schema: `{column_name: column_type}`. Supported types are "bool", "string", "int64", "float64", "array", "array", "array", "array". + output (Mapping): + Output label schema: `{column_name: column_type}`. Supported the same types as the input. session (bigframes.Session or None): BQ session to create the model. If None, use the global default session. 
connection_name (str or None): diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 58ac1c0efe..b680a5fc1a 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -108,12 +108,6 @@ - name: PaLM2TextEmbeddingGenerator uid: bigframes.ml.llm.PaLM2TextEmbeddingGenerator name: llm - - items: - - name: Overview - uid: bigframes.ml.remote - - name: VertexAIModel - uid: bigframes.ml.remote.VertexAIModel - name: remote - items: - name: metrics uid: bigframes.ml.metrics @@ -144,6 +138,12 @@ - name: OneHotEncoder uid: bigframes.ml.preprocessing.OneHotEncoder name: preprocessing + - items: + - name: Overview + uid: bigframes.ml.remote + - name: VertexAIModel + uid: bigframes.ml.remote.VertexAIModel + name: remote name: bigframes.ml name: BigQuery DataFrames status: beta diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py index 6be41bf9aa..63c62274fd 100644 --- a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py +++ b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py @@ -47,16 +47,16 @@ def fit(self, X, y): """Build a forest of trees from the training set (X, y). Args: - X: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): Series or DataFrame of shape (n_samples, n_features). Training data. - y: + y (bigframes.dataframe.DataFrame or bigframes.series.Series): Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. Returns: - Fitted Estimator. + ForestModel: Fitted Estimator. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -73,12 +73,12 @@ def predict(self, X): mean predicted regression targets of the trees in the forest. Args: - X: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): Series or DataFrame of shape (n_samples, n_features). The data matrix for which we want to get the predictions. Returns: - The predicted values. 
+ bigframes.dataframe.DataFrame: The predicted values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -91,38 +91,38 @@ class RandomForestRegressor(ForestRegressor): to improve the predictive accuracy and control over-fitting. Args: - num_parallel_tree: Optional[int] + num_parallel_tree (Optional[int]): Number of parallel trees constructed during each iteration. Default to 100. Minimum value is 2. - tree_method: Optional[str] + tree_method (Optional[str]): Specify which tree method to use. Default to "auto". If this parameter is set to default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx", "hist". - min_child_weight : Optional[float] + min_child_weight (Optional[float]): Minimum sum of instance weight(hessian) needed in a child. Default to 1. - colsample_bytree : Optional[float] + colsample_bytree (Optional[float]): Subsample ratio of columns when constructing each tree. Default to 1.0. The value should be between 0 and 1. - colsample_bylevel : Optional[float] + colsample_bylevel (Optional[float]): Subsample ratio of columns for each level. Default to 1.0. The value should be between 0 and 1. - colsample_bynode : Optional[float] + colsample_bynode (Optional[float]): Subsample ratio of columns for each split. Default to 0.8. The value should be between 0 and 1. - gamma : Optional[float] + gamma (Optional[float]): (min_split_loss) Minimum loss reduction required to make a further partition on a leaf node of the tree. Default to 0.0. - max_depth : Optional[int] + max_depth (Optional[int]): Maximum tree depth for base learners. Default to 15. The value should be greater than 0 and less than 1. - subsample : Optional[float] + subsample (Optional[float]): Subsample ratio of the training instance. Default to 0.8. The value should be greater than 0 and less than 1. - reg_alpha : Optional[float] + reg_alpha (Optional[float]): L1 regularization term on weights (xgb's alpha). Default to 0.0.
- reg_lambda : Optional[float] + reg_lambda (Optional[float]): L2 regularization term on weights (xgb's lambda). Default to 1.0. - early_stop: Optional[bool] + early_stop (Optional[bool]): Whether training should stop after the first iteration. Default to True. - min_rel_progress: Optional[float] + min_rel_progress (Optional[float]): Minimum relative loss improvement necessary to continue training when early_stop is set to True. Default to 0.01. - enable_global_explain: Optional[bool] + enable_global_explain (Optional[bool]): Whether to compute global explanations using explainable AI to evaluate global feature importance to the model. Default to False. - xgboost_version: Optional[str] + xgboost_version (Optional[str]): Specifies the Xgboost version for model training. Default to "0.9". Possible values: "0.9", "1.1". """ @@ -144,7 +144,7 @@ def predict(self, X): which we want to get the predictions. Returns: - The predicted values. + bigframes.dataframe.DataFrame: The predicted values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -158,37 +158,37 @@ class RandomForestClassifier(ForestClassifier): improve the predictive accuracy and control over-fitting. Args: - num_parallel_tree: Optional[int] + num_parallel_tree (Optional[int]): Number of parallel trees constructed during each iteration. Default to 100. Minimum value is 2. - tree_method: Optional[str] + tree_method (Optional[str]): Specify which tree method to use. Default to "auto". If this parameter is set to default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx", "hist". - min_child_weight : Optional[float] + min_child_weight (Optional[float]): Minimum sum of instance weight(hessian) needed in a child. Default to 1. - colsample_bytree : Optional[float] + colsample_bytree (Optional[float]): Subsample ratio of columns when constructing each tree. Default to 1.0. The value should be between 0 and 1. 
- colsample_bylevel : Optional[float] + colsample_bylevel (Optional[float]): Subsample ratio of columns for each level. Default to 1.0. The value should be between 0 and 1. - colsample_bynode : Optional[float] + colsample_bynode (Optional[float]): Subsample ratio of columns for each split. Default to 0.8. The value should be between 0 and 1. - gamma : Optional[float] + gamma (Optional[float]): (min_split_loss) Minimum loss reduction required to make a further partition on a leaf node of the tree. Default to 0.0. - max_depth : Optional[int] + max_depth (Optional[int]): Maximum tree depth for base learners. Default to 15. The value should be greater than 0 and less than 1. - subsample : Optional[float] + subsample (Optional[float]): Subsample ratio of the training instance. Default to 0.8. The value should be greater than 0 and less than 1. - reg_alpha : Optional[float] + reg_alpha (Optional[float]): L1 regularization term on weights (xgb's alpha). Default to 0.0. - reg_lambda : Optional[float] + reg_lambda (Optional[float]): L2 regularization term on weights (xgb's lambda). Default to 1.0. - early_stop: Optional[bool] + early_stop (Optional[bool]): Whether training should stop after the first iteration. Default to True. - min_rel_progress: Optional[float] + min_rel_progress (Optional[float]): Minimum relative loss improvement necessary to continue training when early_stop is set to True. Default to 0.01. - enable_global_explain: Optional[bool] + enable_global_explain (Optional[bool]): Whether to compute global explanations using explainable AI to evaluate global feature importance to the model. Default to False. - xgboost_version: Optional[str] + xgboost_version (Optional[str]): Specifies the Xgboost version for model training. Default to "0.9". 
Possible values: "0.9", "1.1".ß """ From de5138632cc1378d051a95111424649a67a5e19a Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 4 Dec 2023 17:15:14 -0800 Subject: [PATCH 06/10] =?UTF-8?q?docs:=20add=20examples=20for=20dataframe.?= =?UTF-8?q?nunique,=20dataframe.diff,=20dataframe.a=E2=80=A6=20(#251)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: add examples for dataframe.nunique, dataframe.diff, dataframe.agg, dataframe.describe * update spacing * update ordering --- .../bigframes_vendored/pandas/core/frame.py | 114 +++++++++++++++++- 1 file changed, 112 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index b95fc24c15..bd2f41abd1 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3490,7 +3490,26 @@ def melt(self, id_vars, value_vars, var_name, value_name): def nunique(self): """ - Count number of distinct elements in specified axis. + Count number of distinct elements in each column. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 2]}) + >>> df + A B + 0 3 1 + 1 1 2 + 2 2 2 + + [3 rows x 2 columns] + + >>> df.nunique() + A 3.0 + B 2.0 + dtype: Float64 Returns: bigframes.series.Series: Series with number of distinct elements. @@ -3634,6 +3653,40 @@ def diff( Calculates the difference of a DataFrame element compared with another element in the DataFrame (default is element in previous row). 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) + >>> df + A B + 0 3 1 + 1 1 2 + 2 2 3 + + [3 rows x 2 columns] + + Calculating difference with default periods=1: + + >>> df.diff() + A B + 0 + 1 -2 1 + 2 1 1 + + [3 rows x 2 columns] + + Calculating difference with periods=-1: + + >>> df.diff(periods=-1) + A B + 0 2 -1 + 1 -1 -1 + 2 + + [3 rows x 2 columns] + Args: periods (int, default 1): Periods to shift for calculating difference, accepts negative @@ -3646,7 +3699,37 @@ def diff( def agg(self, func): """ - Aggregate using one or more operations over the specified axis. + Aggregate using one or more operations over columns. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [1, 2, 3]}) + >>> df + A B + 0 3 1 + 1 1 2 + 2 2 3 + + [3 rows x 2 columns] + + Using a single function: + + >>> df.agg('sum') + A 6.0 + B 6.0 + dtype: Float64 + + Using a list of functions: + + >>> df.agg(['sum', 'mean']) + A B + sum 6.0 6.0 + mean 2.0 2.0 + + [2 rows x 2 columns] Args: func (function): @@ -3679,6 +3762,33 @@ def describe(self): upper percentile is ``75``. The ``50`` percentile is the same as the median. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [3, 1, 2], "B": [0, 2, 8]}) + >>> df + A B + 0 3 0 + 1 1 2 + 2 2 8 + + [3 rows x 2 columns] + + >>> df.describe() + A B + count 3.0 3.0 + mean 2.0 3.333333 + std 1.0 4.163332 + min 1.0 0.0 + 25% 1.0 0.0 + 50% 2.0 2.0 + 75% 3.0 8.0 + max 3.0 8.0 + + [8 rows x 2 columns] + Returns: bigframes.dataframe.DataFrame: Summary statistics of the Series or Dataframe provided. 
""" From 2e1091086cf75e83a1753550c45d91799cd848f6 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 5 Dec 2023 02:14:14 +0000 Subject: [PATCH 07/10] docs: Fix return annotation in API docstrings (#253) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 314367409 🦕 --- .../bigframes_vendored/pandas/core/frame.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index bd2f41abd1..3b622221b2 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -226,7 +226,7 @@ def from_dict( if used with ``orient='columns'`` or ``orient='tight'``. Returns: - DataFrame + DataFrame: DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -267,7 +267,7 @@ def from_records( Number of rows to read if data is an iterator. Returns: - DataFrame + DataFrame: DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -717,7 +717,7 @@ def to_markdown( These parameters will be passed to `tabulate `_. Returns: - DataFrame in Markdown-friendly format. + DataFrame: DataFrame in Markdown-friendly format. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1272,7 +1272,7 @@ def sort_values( if `first`; `last` puts NaNs at the end. Returns: - DataFrame with sorted values. + DataFrame: DataFrame with sorted values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1282,7 +1282,7 @@ def sort_index( """Sort object by labels (along an axis). Returns: - The original DataFrame sorted by the labels. + DataFrame: The original DataFrame sorted by the labels. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1330,7 +1330,7 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). Returns: - Result of the comparison. + DataFrame: Result of the comparison. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1861,7 +1861,7 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame result of the arithmetic operation. + DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2796,7 +2796,7 @@ def any(self, *, axis=0, bool_only: bool = False): Include only boolean columns. Returns: - Series + bigframes.series.Series: Series indicating if any element is True per column. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2843,7 +2843,7 @@ def all(self, axis=0, *, bool_only: bool = False): Include only boolean columns. Returns: - bigframes.series.Series: Series if all elements are True. + bigframes.series.Series: Series indicating if all elements are True per column. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3184,7 +3184,7 @@ def skew(self, *, numeric_only: bool = False): Include only float, int, boolean columns. Returns: - Series + Series: Series. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3225,7 +3225,7 @@ def kurt(self, *, numeric_only: bool = False): Include only float, int, boolean columns. Returns: - Series + Series: Series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3872,7 +3872,7 @@ def pivot(self, *, columns, index=None, values=None): have hierarchically indexed columns. Returns: - Returns reshaped DataFrame. + DataFrame: Returns reshaped DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3958,7 +3958,7 @@ def unstack(self, level=-1): Level(s) of index to unstack, can pass level name. Returns: - DataFrame or Series: Unstacked dataframe or series. + DataFrame or Series: DataFrame or Series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4016,7 +4016,7 @@ def index(self): dtype=object) Returns: - The index labels of the DataFrame. + Index: The index object of the DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4265,7 +4265,7 @@ def dot(self, other): The other object to compute the matrix product with. Returns: - Series or DataFrame + Series or DataFrame: If `other` is a Series, return the matrix product between self and other as a Series. If other is a DataFrame, return the matrix product of self and other in a DataFrame. From 00d30bf4acb62a6c88ebee9eb77b5285af0f7b8d Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 5 Dec 2023 11:34:15 -0800 Subject: [PATCH 08/10] feat: add nunique method to Series/DataFrameGroupby (#256) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/core/groupby/__init__.py | 6 ++++++ tests/system/small/test_groupby.py | 2 ++ .../pandas/core/groupby/__init__.py | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 18cb83fa18..a8b8afdae7 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -179,6 +179,9 @@ def any(self) -> df.DataFrame: def count(self) -> df.DataFrame: return self._aggregate_all(agg_ops.count_op) + def nunique(self) -> df.DataFrame: + return self._aggregate_all(agg_ops.nunique_op) + def cumsum(self, *args, numeric_only: bool = False, **kwargs) -> df.DataFrame: if not numeric_only: self._raise_on_non_numeric("cumsum") @@ -442,6 +445,9 @@ def max(self, *args) -> series.Series: def count(self) -> series.Series: return self._aggregate(agg_ops.count_op) + def nunique(self) -> series.Series: + return self._aggregate(agg_ops.nunique_op) + def sum(self, *args) -> series.Series: return self._aggregate(agg_ops.sum_op) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index a24713c2b3..5214905186 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -69,11 +69,13 @@ def test_dataframe_groupby_median(scalars_df_index, scalars_pandas_df_index): ("operator"), [ (lambda x: x.count()), + (lambda x: x.nunique()), (lambda x: x.any()), (lambda x: x.all()), ], ids=[ "count", + "nunique", "any", "all", ], diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index b05319b4f7..8730cf0007 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py 
+++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -363,6 +363,15 @@ def agg(self, func): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def nunique(self): + """ + Return number of unique elements in the group. + + Returns: + Series: Number of unique values within each group. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + class DataFrameGroupBy(GroupBy): def agg(self, func, **kwargs): @@ -391,3 +400,12 @@ def agg(self, func, **kwargs): DataFrame """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def nunique(self): + """ + Return DataFrame with counts of unique elements in each position. + + Returns: + DataFrame + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 1793427d070d3a301d5a2978bc68ecd904ee5555 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 1 Dec 2023 21:40:31 +0000 Subject: [PATCH 09/10] docs: add example for dataframe.melt, dataframe.pivot, dataframe.stack, dataframe.unstack --- third_party/bigframes_vendored/pandas/core/frame.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 3b622221b2..cc3c3546bd 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3464,13 +3464,14 @@ def melt(self, id_vars, value_vars, var_name, value_name): 3 4.0 B 4 4 5.0 B 5 5 1.0 C - 6 C 3 + 6 C 3 7 3.0 C 8 4.0 C 4 9 5.0 C 5 [10 rows x 3 columns] + Args: id_vars (tuple, list, or ndarray, optional): Column(s) to use as identifier variables. @@ -3914,10 +3915,6 @@ def stack(self, level=-1): B 4 dtype: Int64 - Args: - level (int, str, or list of these, default -1 (last level)): - Level(s) to stack from the column axis onto the index axis. - Returns: DataFrame or Series: Stacked dataframe or series. 
""" @@ -3953,10 +3950,6 @@ def unstack(self, level=-1): bar 4 dtype: Int64 - Args: - level (int, str, or list of these, default -1 (last level)): - Level(s) of index to unstack, can pass level name. - Returns: DataFrame or Series: DataFrame or Series. """ From 39abefe171a6771872d88b208c361313788d7479 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 1 Dec 2023 22:05:58 +0000 Subject: [PATCH 10/10] docstring fix --- third_party/bigframes_vendored/pandas/core/frame.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index cc3c3546bd..5b00385eb8 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3915,6 +3915,10 @@ def stack(self, level=-1): B 4 dtype: Int64 + Args: + level (int, str, or list of these, default -1 (last level)): + Level(s) to stack from the column axis onto the index axis. + Returns: DataFrame or Series: Stacked dataframe or series. """ @@ -3950,6 +3954,10 @@ def unstack(self, level=-1): bar 4 dtype: Int64 + Args: + level (int, str, or list of these, default -1 (last level)): + Level(s) of index to unstack, can pass level name. + Returns: DataFrame or Series: DataFrame or Series. """