2 changes: 1 addition & 1 deletion .kokoro/docs/common.cfg
@@ -20,7 +20,7 @@ env_vars: {
}
env_vars: {
key: "TRAMPOLINE_BUILD_FILE"
value: "git/bigframes/.kokoro/publish-docs.sh"
value: ".kokoro/publish-docs.sh"
}

env_vars: {
.kokoro/publish-docs.sh: file mode changed 100644 → 100755 (no content changes shown)
9 changes: 6 additions & 3 deletions README.rst
@@ -41,6 +41,7 @@ method accepts either a fully-qualified table ID or a SQL query.

import bigframes.pandas as bpd

bpd.options.bigquery.project = your_gcp_project_id
df1 = bpd.read_gbq("project.dataset.table")
df2 = bpd.read_gbq("SELECT a, b, c, FROM `project.dataset.table`")

@@ -260,7 +261,7 @@ To view and manage Cloud Functions functions, use the
`Functions <https://console.cloud.google.com/functions/list?env=gen2>`_
page and use the project picker to select the project in which you
created the function. For easy identification, the names of the functions
created by BigQuery DataFrames are prefixed by ``bigframes-``.
created by BigQuery DataFrames are prefixed by ``bigframes``.

**Requirements**

@@ -283,7 +284,9 @@ following IAM roles:
* BigQuery Data Editor (roles/bigquery.dataEditor)
* BigQuery Connection Admin (roles/bigquery.connectionAdmin)
* Cloud Functions Developer (roles/cloudfunctions.developer)
* Service Account User (roles/iam.serviceAccountUser)
* Service Account User (roles/iam.serviceAccountUser) on the
`service account <https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration>`_
``PROJECT_NUMBER-compute@developer.gserviceaccount.com``
* Storage Object Viewer (roles/storage.objectViewer)
* Project IAM Admin (roles/resourcemanager.projectIamAdmin)

@@ -330,7 +333,7 @@ Data processing location

BigQuery DataFrames is designed for scale, which it achieves by keeping data
and processing on the BigQuery service. However, you can bring data into the
memory of your client machine by calling ``.execute()`` on a DataFrame or Series
memory of your client machine by calling ``.to_pandas()`` on a DataFrame or Series
object. If you choose to do this, the memory limitation of your client machine
applies.
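
As a hedged illustration (``df`` and the query below are placeholders, not part of this change), pulling results into client memory looks like this::

    import bigframes.pandas as bpd

    bpd.options.bigquery.project = your_gcp_project_id
    df = bpd.read_gbq("SELECT a, b FROM `project.dataset.table`")
    local_df = df.to_pandas()  # materializes the result as a local pandas DataFrame
    print(local_df.shape)      # subject to the memory limits of the client machine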

2 changes: 2 additions & 0 deletions bigframes/constants.py
@@ -21,3 +21,5 @@
"Share your usecase with the BigQuery DataFrames team at the "
"https://bit.ly/bigframes-feedback survey."
)

ABSTRACT_METHOD_ERROR_MESSAGE = f"Abstract method. You have likely encountered a bug. Please share this stacktrace and how you reached it with the BigQuery DataFrames team. {FEEDBACK_LINK}"
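
As a hedged aside (the ``Node`` class and its ``compile`` method below are hypothetical, used only for illustration), a constant like this is typically raised from an abstract method so that reaching it surfaces the feedback link:

    import bigframes.constants as constants

    class Node:  # hypothetical abstract base class, not part of this change
        def compile(self):
            # Subclasses must override; reaching this default indicates a bug.
            raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)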
102 changes: 86 additions & 16 deletions bigframes/core/__init__.py
@@ -35,6 +35,7 @@
reencode_order_string,
StringEncoding,
)
import bigframes.core.utils as utils
import bigframes.dtypes
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
@@ -562,6 +563,36 @@ def aggregate(
ordering=ordering,
)

def corr_aggregate(
self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]]
) -> ArrayValue:
"""
Get correlations between each left_column_id and right_column_id, stored in the respective output_column_id.
This uses BigQuery's CORR under the hood, and thus only Pearson's method is used.
Arguments:
corr_aggregations: left_column_id, right_column_id, output_column_id tuples
"""
table = self.to_ibis_expr(ordering_mode="unordered")
stats = {
col_out: table[col_left].corr(table[col_right], how="pop")
for col_left, col_right, col_out in corr_aggregations
}
aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)}
result = table.aggregate(**aggregates)
# Ordering is irrelevant for single-row output, but set the ordering id regardless, as other ops (join, etc.) expect it.
ordering = ExpressionOrdering(
ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)],
total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True),
)
return ArrayValue(
self._session,
result,
columns=[result[col_id] for col_id in [*stats.keys()]],
hidden_ordering_columns=[result[ORDER_ID_COLUMN]],
ordering=ordering,
)
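
# --- Illustrative usage sketch, not part of this diff: `array_value` is assumed to be an
# --- existing ArrayValue and the column ids are hypothetical. Each (left, right, output)
# --- tuple requests BigQuery CORR(left, right) -- Pearson only -- and the single output row
# --- stores each correlation under its output column id.
corr_aggregations = [
    ("height", "weight", "corr_height_weight"),
    ("height", "age", "corr_height_age"),
]
correlated = array_value.corr_aggregate(corr_aggregations)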

def project_window_op(
self,
column_name: str,
@@ -852,52 +883,91 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = Fal
group_by=group_by,
)

def unpivot_single_row(
def unpivot(
self,
row_labels: typing.Sequence[typing.Hashable],
unpivot_columns: typing.Sequence[typing.Tuple[str, typing.Sequence[str]]],
unpivot_columns: typing.Sequence[
typing.Tuple[str, typing.Sequence[typing.Optional[str]]]
],
*,
passthrough_columns: typing.Sequence[str] = (),
index_col_id: str = "index",
dtype=pandas.Float64Dtype(),
dtype: typing.Union[
bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype]
] = pandas.Float64Dtype(),
) -> ArrayValue:
"""Unpivot a single row."""
# TODO: Generalize to multiple row input
table = self.to_ibis_expr(ordering_mode="unordered")
"""
Unpivot ArrayValue columns.

Args:
row_labels: Identifies the source of the row. Must be equal in length to the source column lists in the unpivot_columns argument.
unpivot_columns: Mapping of output column id to a list of input column ids. Lists of input columns may use None.
passthrough_columns: Columns that will not be unpivoted. Column id will be preserved.
index_col_id (str): The column id to be used for the row labels.
dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns.

Returns:
ArrayValue: The unpivoted ArrayValue
"""
table = self.to_ibis_expr(ordering_mode="offset_col")
sub_expressions = []

# TODO: validate all columns are equal length, as well as row labels
# Use ibis memtable to infer the type of row_labels (if possible)
# TODO: Allow caller to specify dtype
labels_ibis_type = ibis.memtable({"col": row_labels})["col"].type()
labels_dtype = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(labels_ibis_type)

row_n = len(row_labels)
if not all(
len(source_columns) == row_n for _, source_columns in unpivot_columns
):
raise ValueError("Columns and row labels must all be same length.")

# Select each column
for i in range(row_n):
values = []
for result_col, source_cols in unpivot_columns:
values.append(
ops.AsTypeOp(dtype)._as_ibis(table[source_cols[i]]).name(result_col)
)

for j in range(len(unpivot_columns)):
result_col, source_cols = unpivot_columns[j]
col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype
if source_cols[i] is not None:
values.append(
ops.AsTypeOp(col_dtype)
._as_ibis(table[source_cols[i]])
.name(result_col)
)
else:
values.append(
bigframes.dtypes.literal_to_ibis_scalar(
None, force_dtype=col_dtype
).name(result_col)
)
offsets_value = (
((table[ORDER_ID_COLUMN] * row_n) + i)
.cast(ibis_dtypes.int64)
.name(ORDER_ID_COLUMN),
)
sub_expr = table.select(
ibis_types.literal(row_labels[i]).name(index_col_id),
passthrough_columns,
bigframes.dtypes.literal_to_ibis_scalar(
row_labels[i], force_dtype=labels_dtype # type:ignore
).name(index_col_id),
*values,
ibis_types.literal(i).name(ORDER_ID_COLUMN),
offsets_value,
)
sub_expressions.append(sub_expr)
rotated_table = ibis.union(*sub_expressions)

value_columns = [
rotated_table[value_col_id] for value_col_id, _ in unpivot_columns
]
passthrough_values = [rotated_table[col] for col in passthrough_columns]
return ArrayValue(
session=self._session,
table=rotated_table,
columns=[rotated_table[index_col_id], *value_columns],
columns=[rotated_table[index_col_id], *value_columns, *passthrough_values],
hidden_ordering_columns=[rotated_table[ORDER_ID_COLUMN]],
ordering=ExpressionOrdering(
ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)],
integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True),
total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
),
)
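
# --- Illustrative usage sketch, not part of this diff: `array_value` and the column ids are
# --- assumed. Two row labels unpivot two source columns into one value column, while an id
# --- column is passed through unchanged on every output row.
unpivoted = array_value.unpivot(
    row_labels=["min", "max"],
    unpivot_columns=[("stat_value", ["min_col", "max_col"])],
    passthrough_columns=["row_id"],
    index_col_id="statistic",
    dtype=pandas.Float64Dtype(),
)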
32 changes: 32 additions & 0 deletions bigframes/core/block_transforms.py
@@ -197,3 +197,35 @@ def rank(
)

return block.select_columns(rownum_col_ids).with_column_labels(labels)


def dropna(block: blocks.Block, how: typing.Literal["all", "any"] = "any"):
"""
Drop na entries from block
"""
if how == "any":
filtered_block = block
for column in block.value_columns:
filtered_block, result_id = filtered_block.apply_unary_op(
column, ops.notnull_op
)
filtered_block = filtered_block.filter(result_id)
filtered_block = filtered_block.drop_columns([result_id])
return filtered_block
else: # "all"
filtered_block = block
predicate = None
for column in block.value_columns:
filtered_block, partial_predicate = filtered_block.apply_unary_op(
column, ops.notnull_op
)
if predicate:
filtered_block, predicate = filtered_block.apply_binary_op(
partial_predicate, predicate, ops.or_op
)
else:
predicate = partial_predicate
if predicate:
filtered_block = filtered_block.filter(predicate)
filtered_block = filtered_block.select_columns(block.value_columns)
return filtered_block
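
# --- Pure-Python mirror of the predicate logic above (illustrative only, not bigframes code):
# --- how="any" keeps a row only when every value is non-null; how="all" keeps it when at
# --- least one value is non-null.
rows = [{"a": 1, "b": None}, {"a": None, "b": None}, {"a": 2, "b": 3}]
kept_for_any = [r for r in rows if all(v is not None for v in r.values())]  # only {"a": 2, "b": 3}
kept_for_all = [r for r in rows if any(v is not None for v in r.values())]  # first and third rows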