diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6e0fd8b98f..517176da89 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,4 +38,4 @@ repos: rev: v1.1.1 hooks: - id: mypy - additional_dependencies: [types-requests] + additional_dependencies: [types-requests, types-tabulate] diff --git a/bigframes/_config/display_options.py b/bigframes/_config/display_options.py index ad3ea3f68c..afa36aa84c 100644 --- a/bigframes/_config/display_options.py +++ b/bigframes/_config/display_options.py @@ -32,6 +32,10 @@ class DisplayOptions: progress_bar: Optional[str] = "auto" repr_mode: Literal["head", "deferred"] = "head" + max_info_columns: int = 100 + max_info_rows: Optional[int] = 200000 + memory_usage: bool = True + @contextlib.contextmanager def pandas_repr(display_options: DisplayOptions): diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 6c66c36062..fc7cf167d4 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -155,6 +155,14 @@ def _block(self) -> blocks.Block: def T(self) -> Index: return self.transpose() + def _memory_usage(self) -> int: + (n_rows,) = self.shape + return sum( + self.dtypes.map( + lambda dtype: bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * n_rows + ) + ) + def transpose(self) -> Index: return self @@ -326,7 +334,10 @@ def _apply_aggregation(self, op: agg_ops.AggregateOp) -> typing.Any: def __getitem__(self, key: int) -> typing.Any: if isinstance(key, int): - result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas() + if key != -1: + result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas() + else: # special case, want [-1:] instead of [-1:0] + result_pd_df, _ = self._block.slice(key).to_pandas() if result_pd_df.empty: raise IndexError("single positional indexer is out-of-bounds") return result_pd_df.index[0] diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 8567296e29..f7796291b9 100644 --- 
a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -18,6 +18,7 @@ import datetime import re +import sys import textwrap import typing from typing import ( @@ -36,6 +37,7 @@ import google.cloud.bigquery as bigquery import numpy import pandas +import tabulate import bigframes import bigframes._config.display_options as display_options @@ -350,6 +352,88 @@ def query_job(self) -> Optional[bigquery.QueryJob]: self._set_internal_query_job(self._compute_dry_run()) return self._query_job + def memory_usage(self, index: bool = True): + n_rows, _ = self.shape + # like pandas, treat all variable-size objects as just 8-byte pointers, ignoring actual object + column_sizes = self.dtypes.map( + lambda dtype: bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * n_rows + ) + if index: + index_size = pandas.Series([self.index._memory_usage()], index=["Index"]) + column_sizes = pandas.concat([index_size, column_sizes]) + return column_sizes + + def info( + self, + verbose: Optional[bool] = None, + buf=None, + max_cols: Optional[int] = None, + memory_usage: Optional[bool] = None, + show_counts: Optional[bool] = None, + ): + obuf = buf or sys.stdout + + n_rows, n_columns = self.shape + + max_cols = ( + max_cols + if max_cols is not None + else bigframes.options.display.max_info_columns + ) + + show_all_columns = verbose if verbose is not None else (n_columns < max_cols) + + obuf.write(f"{type(self)}\n") + + index_type = "MultiIndex" if self.index.nlevels > 1 else "Index" + + # These accesses are kind of expensive, maybe should try to skip? 
+ first_indice = self.index[0] + last_indice = self.index[-1] + obuf.write(f"{index_type}: {n_rows} entries, {first_indice} to {last_indice}\n") + + dtype_strings = self.dtypes.astype("string") + if show_all_columns: + obuf.write(f"Data columns (total {n_columns} columns):\n") + column_info = self.columns.to_frame(name="Column") + + max_rows = bigframes.options.display.max_info_rows + too_many_rows = n_rows > max_rows if max_rows is not None else False + + if show_counts if show_counts is not None else (not too_many_rows): + non_null_counts = self.count().to_pandas() + column_info["Non-Null Count"] = non_null_counts.map( + lambda x: f"{int(x)} non-null" + ) + + column_info["Dtype"] = dtype_strings + + column_info = column_info.reset_index(drop=True) + column_info.index.name = "#" + + column_info_formatted = tabulate.tabulate(column_info, headers="keys") # type: ignore + obuf.write(column_info_formatted) + obuf.write("\n") + + else: # Just number of columns and first, last + obuf.write( + f"Columns: {n_columns} entries, {self.columns[0]} to {self.columns[-1]}\n" + ) + dtype_counts = dtype_strings.value_counts().sort_index(ascending=True).items() + dtype_counts_formatted = ", ".join( + f"{dtype}({count})" for dtype, count in dtype_counts + ) + obuf.write(f"dtypes: {dtype_counts_formatted}\n") + + show_memory = ( + memory_usage + if memory_usage is not None + else bigframes.options.display.memory_usage + ) + if show_memory: + # TODO: Convert to different units (kb, mb, etc.) 
+ obuf.write(f"memory usage: {self.memory_usage().sum()} bytes\n") + def _set_internal_query_job(self, query_job: bigquery.QueryJob): self._query_job = query_job diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index cd35e380c0..774eb74d06 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -143,6 +143,19 @@ # "string" and "string[pyarrow] are accepted" BIGFRAMES_STRING_TO_BIGFRAMES["string[pyarrow]"] = pd.StringDtype(storage="pyarrow") +# For the purposes of dataframe.memory_usage +# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data_type_sizes +DTYPE_BYTE_SIZES = { + pd.BooleanDtype(): 1, + pd.Int64Dtype(): 8, + pd.Float32Dtype(): 8, + pd.StringDtype(): 8, + pd.ArrowDtype(pa.time64("us")): 8, + pd.ArrowDtype(pa.timestamp("us")): 8, + pd.ArrowDtype(pa.timestamp("us", tz="UTC")): 8, + pd.ArrowDtype(pa.date32()): 8, +} + def ibis_dtype_to_bigframes_dtype( ibis_dtype: ibis_dtypes.DataType, diff --git a/noxfile.py b/noxfile.py index 8d6d641fc1..c1fb53f794 100644 --- a/noxfile.py +++ b/noxfile.py @@ -228,6 +228,7 @@ def mypy(session): "types-python-dateutil", "types-requests", "types-setuptools", + "types-tabulate", ] ) | set(SYSTEM_TEST_STANDARD_DEPENDENCIES) diff --git a/setup.py b/setup.py index 29eacb74a9..abf165b3df 100644 --- a/setup.py +++ b/setup.py @@ -50,6 +50,7 @@ "requests >=2.27.1", "scikit-learn >=1.2.2", "sqlalchemy >=1.4,<3.0dev", + "tabulate >= 0.9", "ipywidgets >=7.7.1", "humanize >= 4.6.0", ] diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9b9567418b..9744d3f6e9 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import io import operator import tempfile import typing @@ -255,6 +256,47 @@ def test_drop_with_custom_column_labels(scalars_dfs): assert_pandas_df_equal(bf_result, pd_result) +def test_df_memory_usage(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.memory_usage() + bf_result = scalars_df.memory_usage() + + pd.testing.assert_series_equal(pd_result, bf_result, rtol=1.5) + + +def test_df_info(scalars_dfs): + expected = ( + "\n" + "Index: 9 entries, 0 to 8\n" + "Data columns (total 13 columns):\n" + " # Column Non-Null Count Dtype\n" + "--- ------------- ---------------- ------------------------------\n" + " 0 bool_col 8 non-null boolean\n" + " 1 bytes_col 6 non-null object\n" + " 2 date_col 7 non-null date32[day][pyarrow]\n" + " 3 datetime_col 6 non-null timestamp[us][pyarrow]\n" + " 4 geography_col 4 non-null geometry\n" + " 5 int64_col 8 non-null Int64\n" + " 6 int64_too 9 non-null Int64\n" + " 7 numeric_col 6 non-null object\n" + " 8 float64_col 7 non-null Float64\n" + " 9 rowindex_2 9 non-null Int64\n" + " 10 string_col 8 non-null string\n" + " 11 time_col 6 non-null time64[us][pyarrow]\n" + " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" + "dtypes: Float64(1), Int64(3), boolean(1), date32[day][pyarrow](1), geometry(1), object(2), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" + "memory usage: 945 bytes\n" + ) + + scalars_df, _ = scalars_dfs + bf_result = io.StringIO() + + scalars_df.info(buf=bf_result) + + assert expected == bf_result.getvalue() + + def test_drop_index(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py index 198654015e..dfb91dfeb8 100644 --- a/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/third_party/bigframes_vendored/pandas/core/config_init.py @@ -33,6 +33,17 @@ Instead 
estimated bytes processed will be shown. Dataframe and Series objects can still be computed with methods that explicitly execute and download results. + max_info_columns (int): + max_info_columns is used in DataFrame.info method to decide if + per column information will be printed. + max_info_rows (int or None): + df.info() will usually show null-counts for each column. + For large frames this can be quite slow. max_info_rows and max_info_columns + limit this null check only to frames with smaller dimensions than + specified. + memory_usage (bool): + This specifies if the memory usage of a DataFrame should be displayed when + df.info() is called. Valid values are True and False. """ sampling_options_doc = """ diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 2a8972f2e5..099d8b8e66 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -92,6 +92,72 @@ def values(self) -> np.ndarray: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def info( + self, + verbose: bool | None = None, + buf=None, + max_cols: int | None = None, + memory_usage: bool | None = None, + show_counts: bool | None = None, + ) -> None: + """ + Print a concise summary of a DataFrame. + + This method prints information about a DataFrame including + the index dtype and columns, non-null values and memory usage. + + Args: + verbose (bool, optional): + Whether to print the full summary. By default, the setting in + ``pandas.options.display.max_info_columns`` is followed. + buf (writable buffer, defaults to sys.stdout): + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + max_cols (int, optional): + When to switch from the verbose to the truncated output. If the + DataFrame has more than `max_cols` columns, the truncated output + is used. 
By default, the setting in + ``pandas.options.display.max_info_columns`` is used. + memory_usage (bool, optional): + Specifies whether total memory usage of the DataFrame + elements (including the index) should be displayed. By default, + this follows the ``pandas.options.display.memory_usage`` setting. + True always shows memory usage. False never shows memory usage. + Memory estimation is made based on column dtype and number of rows + assuming values consume the same memory amount for corresponding dtypes. + show_counts (bool, optional): + Whether to show the non-null counts. By default, this is shown + only if the DataFrame is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. + + Returns: + None: This method prints a summary of a DataFrame and returns None.""" + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def memory_usage(self, index: bool = True): + """ + Return the memory usage of each column in bytes. + + The memory usage can optionally include the contribution of + the index and elements of `object` dtype. + + This value is displayed in `DataFrame.info` by default. This can be + suppressed by setting ``pandas.options.display.memory_usage`` to False. + + Args: + index (bool, default True): + Specifies whether to include the memory usage of the DataFrame's + index in returned Series. If ``index=True``, the memory usage of + the index is the first item in the output. + + Returns: + Series: A Series whose index is the original column names and whose values are the memory usage of each column in bytes. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + # ---------------------------------------------------------------------- # IO methods (to / from other formats) def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray: