diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index e8ac8c1d0f..13b0562092 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -27,6 +27,7 @@ import bigframes.core.nodes as nodes from bigframes.core.ordering import OrderingColumnReference import bigframes.core.ordering as orderings +import bigframes.core.utils from bigframes.core.window_spec import WindowSpec import bigframes.dtypes import bigframes.operations as ops @@ -69,10 +70,14 @@ def from_ibis( @classmethod def from_pandas(cls, pd_df: pandas.DataFrame): iobytes = io.BytesIO() - # Discard row labels and use simple string ids for columns - column_ids = tuple(str(label) for label in pd_df.columns) - pd_df.reset_index(drop=True).set_axis(column_ids, axis=1).to_feather(iobytes) - node = nodes.ReadLocalNode(iobytes.getvalue(), column_ids=column_ids) + # Use alphanumeric identifiers, to avoid downstream problems with escaping. + as_ids = [ + bigframes.core.utils.label_to_identifier(label, strict=True) + for label in pd_df.columns + ] + unique_ids = tuple(bigframes.core.utils.disambiguate_ids(as_ids)) + pd_df.reset_index(drop=True).set_axis(unique_ids, axis=1).to_feather(iobytes) + node = nodes.ReadLocalNode(iobytes.getvalue()) return cls(node) @property diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 30444f5565..62cd7373d0 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -122,7 +122,6 @@ def __hash__(self): @dataclass(frozen=True) class ReadLocalNode(BigFrameNode): feather_bytes: bytes - column_ids: typing.Tuple[str, ...] def __hash__(self): return self._node_hash diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index dc7c709011..4331999dd6 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re import typing from typing import Hashable, Iterable, List @@ -84,26 +85,42 @@ def get_standardized_ids( Tuple of (standardized_column_ids, standardized_index_ids) """ col_ids = [ - UNNAMED_COLUMN_ID if col_label is None else str(col_label) + UNNAMED_COLUMN_ID if col_label is None else label_to_identifier(col_label) for col_label in col_labels ] idx_ids = [ - UNNAMED_INDEX_ID if idx_label is None else str(idx_label) + UNNAMED_INDEX_ID if idx_label is None else label_to_identifier(idx_label) for idx_label in idx_labels ] - ids = idx_ids + col_ids + ids = disambiguate_ids(idx_ids + col_ids) + + idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :] + + return col_ids, idx_ids + + +def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str: + """ + Convert pandas label to make legal bigquery identifier. May create collisions (should deduplicate after). + Strict mode might not be necessary, but ibis seems to escape non-alphanumeric characters inconsistently. + """ # Column values will be loaded as null if the column name has spaces. # https://github.com/googleapis/python-bigquery/issues/1566 - ids = [id.replace(" ", "_") for id in ids] + identifier = str(label).replace(" ", "_") + if strict: + identifier = re.sub(r"[^a-zA-Z0-9_]", "", identifier) + if not identifier: + identifier = "id" + return identifier + - ids = typing.cast( +def disambiguate_ids(ids: typing.Sequence[str]) -> typing.List[str]: + """Disambiguate list of ids by adding suffixes where needed. If inputs are legal sql ids, outputs should be as well.""" + return typing.cast( List[str], vendored_pandas_io_common.dedup_names(ids, is_potential_multiindex=False), ) - idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :] - - return col_ids, idx_ids def merge_column_labels( diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 2d4e1f0204..058adb9390 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -16,7 +16,37 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas + + +@skip_legacy_pandas +def test_read_pandas_multi_index_axes(): + index = pandas.MultiIndex.from_arrays( + [ + pandas.Index([4, 99], dtype=pandas.Int64Dtype()), + pandas.Index( + [" Hello, World!", "_some_new_string"], + dtype=pandas.StringDtype(storage="pyarrow"), + ), + ], + names=[" 1index 1", "_1index 2"], + ) + columns = pandas.MultiIndex.from_arrays( + [ + pandas.Index([6, 87], dtype=pandas.Int64Dtype()), + pandas.Index( + [" Bonjour le monde!", "_une_chaîne_de_caractères"], + dtype=pandas.StringDtype(storage="pyarrow"), + ), + ], + names=[" 1columns 1", "_1new_index 2"], + ) + pandas_df = pandas.DataFrame( + [[1, 2], [3, 4]], index=index, columns=columns, dtype=pandas.Int64Dtype() + ) + bf_df = bpd.DataFrame(pandas_df) + + pandas.testing.assert_frame_equal(bf_df.to_pandas(), pandas_df) # Row Multi-index tests