Codestin Search App

History

5521 lines (4842 loc) · 214 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

# Licensed to the Apache Software Foundation (ASF) under one or more

# contributor license agreements. See the NOTICE file distributed with

# this work for additional information regarding copyright ownership.

# The ASF licenses this file to You under the Apache License, Version 2.0

# (the "License"); you may not use this file except in compliance with

# the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

"""Analogs for :class:`pandas.DataFrame` and :class:`pandas.Series`:

:class:`DeferredDataFrame` and :class:`DeferredSeries`.

These classes are effectively wrappers around a `schema-aware`_

:class:`~apache_beam.pvalue.PCollection` that provide a set of operations

compatible with the `pandas`_ API.

Note that we aim for the Beam DataFrame API to be completely compatible with

the pandas API, but there are some features that are currently unimplemented

for various reasons. Pay particular attention to the **'Differences from

pandas'** section for each operation to understand where we diverge.

.. _schema-aware:

https://beam.apache.org/documentation/programming-guide/#what-is-a-schema

.. _pandas:

https://pandas.pydata.org/

"""

import collections

import inspect

import itertools

import math

import re

import warnings

from typing import Optional

import numpy as np

import pandas as pd

from pandas._libs import lib

from pandas.api.types import is_float_dtype

from pandas.api.types import is_int64_dtype

from pandas.api.types import is_list_like

from pandas.core.groupby.generic import DataFrameGroupBy

from apache_beam.dataframe import convert

from apache_beam.dataframe import expressions

from apache_beam.dataframe import frame_base

from apache_beam.dataframe import io

from apache_beam.dataframe import partitionings

from apache_beam.transforms import PTransform

# Public names exported by a star-import of this module.
__all__ = ['DeferredSeries', 'DeferredDataFrame']

# (major, minor) of the installed pandas, as a tuple of ints.
PD_VERSION = tuple(int(part) for part in pd.__version__.split('.')[:2])

def populate_not_implemented(pd_type):
  """Build a class decorator that stubs out the unimplemented pandas API.

  For each public attribute of ``pd_type`` that the decorated class does not
  already have, a placeholder produced by
  ``frame_base.not_implemented_method`` is installed under the same name.
  Properties and class-valued attributes are installed as properties, other
  callables are installed directly, and non-callable attributes are skipped.

  Args:
    pd_type: The pandas type whose public attributes should be mirrored.

  Returns:
    A decorator that takes the deferred type, mutates it in place and
    returns it.
  """
  def wrapper(deferred_type):
    # Don't auto-define hidden methods or dunders.
    public_names = (name for name in dir(pd_type) if not name.startswith('_'))
    for name in public_names:
      if hasattr(deferred_type, name):
        # Already provided by the deferred type; nothing to stub.
        continue

      pd_value = getattr(pd_type, name)
      # Some of the properties on pandas types (cat, dt, sparse) are actually
      # attributes with class values, not properties, so both are handled the
      # same way.
      is_property_like = (
          isinstance(pd_value, property) or inspect.isclass(pd_value))
      if not is_property_like and not callable(pd_value):
        continue

      stub = frame_base.not_implemented_method(name, base_type=pd_type)
      setattr(
          deferred_type, name, property(stub) if is_property_like else stub)

    return deferred_type

  return wrapper

def _fillna_alias(method):
  """Create a method named ``method`` that forwards to ``fillna``.

  The generated method calls ``self.fillna(*args, method=method, **kwargs)``.
  It is then wrapped, innermost first, with ``populate_defaults``,
  ``args_to_kwargs`` and ``with_docs_from`` from ``frame_base``, all bound to
  ``pd.DataFrame``.
  """
  def wrapper(self, *args, **kwargs):
    return self.fillna(*args, method=method, **kwargs)

  wrapper.__doc__ = (
      f'{method} is only supported for axis="columns". '
      'axis="index" is order-sensitive.')
  wrapper.__name__ = method

  with_defaults = frame_base.populate_defaults(pd.DataFrame)(wrapper)
  with_kwargs = frame_base.args_to_kwargs(pd.DataFrame)(with_defaults)
  return frame_base.with_docs_from(pd.DataFrame)(with_kwargs)

# Commutative and associative aggregations. They can be trivially "lifted",
# i.e. we can pre-aggregate on partitions, group, then post-aggregate.
LIFTABLE_AGGREGATIONS = ['all', 'any', 'max', 'min', 'prod', 'sum']

# Aggregations that can be lifted if post-aggregated with "sum".
LIFTABLE_WITH_SUM_AGGREGATIONS = ['size', 'count']

UNLIFTABLE_AGGREGATIONS = [
    'mean', 'median', 'quantile', 'describe', 'sem', 'skew', 'kurt',
    'kurtosis', 'std', 'var', 'corr', 'cov', 'nunique',
]
if PD_VERSION < (2, 0):
  # mad was removed in Pandas 2.0.
  UNLIFTABLE_AGGREGATIONS += ['mad']

ALL_AGGREGATIONS = [
    *LIFTABLE_AGGREGATIONS,
    *LIFTABLE_WITH_SUM_AGGREGATIONS,
    *UNLIFTABLE_AGGREGATIONS,
]

# Aggregations with specialized distributed implementations on
# DeferredSeries, which are re-used in DeferredFrame. Note they are *not* used
# for grouped aggregations, since they generally require tracking multiple
# intermediate series, which is difficult to lift in groupby.
HAND_IMPLEMENTED_GLOBAL_AGGREGATIONS = {
    'quantile', 'std', 'var', 'mean', 'nunique', 'corr', 'cov', 'skew',
    'kurt', 'kurtosis',
}

# Unliftable aggregations that have no hand-written global implementation.
UNLIFTABLE_GLOBAL_AGGREGATIONS = set(UNLIFTABLE_AGGREGATIONS).difference(
    HAND_IMPLEMENTED_GLOBAL_AGGREGATIONS)

def _agg_method(base, func):
  """Create a method named ``func`` that forwards to ``self.agg(func, ...)``.

  When ``func`` is one of ``UNLIFTABLE_GLOBAL_AGGREGATIONS`` the generated
  method gets a docstring warning that it cannot be parallelized. The result
  is wrapped with ``frame_base.with_docs_from(base)``.
  """
  def wrapper(self, *args, **kwargs):
    return self.agg(func, *args, **kwargs)

  wrapper.__name__ = func
  if func in UNLIFTABLE_GLOBAL_AGGREGATIONS:
    wrapper.__doc__ = (
        f"``{func}`` cannot currently be parallelized. It will "
        "require collecting all data on a single node.")

  attach_docs = frame_base.with_docs_from(base)
  return attach_docs(wrapper)

# Shared docstring fragment for head and tail, which are commonly used to peek
# at datasets.
_PEEK_METHOD_EXPLANATION = (
    "because it is `order-sensitive <https://s.apache.org/"
    "dataframe-order-sensitive-operations>`_.\n\n"
    "If you want to peek at a large dataset consider using interactive "
    "Beam's :func:`ib.collect <apache_beam.runners.interactive."
    "interactive_beam.collect>` with ``n`` specified, or :meth:`sample`. "
    "If you want to find the N largest elements, consider using "
    ":meth:`DeferredDataFrame.nlargest`.")

class DeferredDataFrameOrSeries(frame_base.DeferredFrame):

def _render_indexes(self):
  """Return a short text description of this frame's index name(s).

  A single-level index renders as ``index=<name>``; a multi-level index
  renders as ``indexes=[<name>, ...]``. Levels without a name are shown as
  ``<unnamed>``.
  """
  def render_name(name):
    return '<unnamed>' if name is None else repr(name)

  index = self.index
  if index.nlevels == 1:
    return 'index=' + render_name(index.name)
  return 'indexes=[' + ', '.join(map(render_name, index.names)) + ']'

# Array conversion is registered as a "won't implement" stub, with the reason
# recorded as "non-deferred-result".
__array__ = frame_base.wont_implement_method(
    pd.Series, '__array__', reason="non-deferred-result")

@frame_base.with_docs_from(pd.DataFrame)

@frame_base.args_to_kwargs(pd.DataFrame)

@frame_base.populate_defaults(pd.DataFrame)

@frame_base.maybe_inplace

def drop(self, labels, axis, index, columns, errors, **kwargs):
  """drop is not parallelizable when dropping from the index and
  ``errors="raise"`` is specified. It requires collecting all data on a single
  node in order to detect if one of the index values is missing."""
  if labels is not None:
    if not (index is None and columns is None):
      raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
    # Translate the (labels, axis) spelling into the index/columns spelling.
    if axis in (0, 'index'):
      index, columns = labels, None
    elif axis in (1, 'columns'):
      index, columns = None, labels
    else:
      raise ValueError(
          "axis must be one of (0, 1, 'index', 'columns'), "
          "got '%s'" % axis)

  proxy = self._expr.proxy()
  if columns is not None:
    # Compute the proxy based on just the columns that are dropped.
    proxy = proxy.drop(columns=columns, errors=errors)

  must_check_index = index is not None and errors == 'raise'
  if must_check_index:
    # In order to raise an error about missing index values, we'll
    # need to collect the entire dataframe.
    # TODO: This could be parallelized by putting index values in a
    # ConstantExpression and partitioning by index.
    requires = partitionings.Singleton(
        reason=(
            "drop(errors='raise', axis='index') is not currently "
            "parallelizable. This requires collecting all data on a single "
            f"node in order to detect if one of {index!r} is missing."))
  else:
    requires = partitionings.Arbitrary()

  def drop_from(df):
    return df.drop(
        axis=axis, index=index, columns=columns, errors=errors, **kwargs)

  dropped = expressions.ComputedExpression(
      'drop',
      drop_from, [self._expr],
      proxy=proxy,
      requires_partition_by=requires)
  return frame_base.DeferredFrame.wrap(dropped)

@frame_base.with_docs_from(pd.DataFrame)

@frame_base.args_to_kwargs(pd.DataFrame)

@frame_base.populate_defaults(pd.DataFrame)

def droplevel(self, level, axis):
  # Dropping a *column* level leaves the row index untouched, so any existing
  # partitioning is preserved. Dropping an *index* level changes the index,
  # so only Singleton partitioning survives.
  # pandas spells the column axis as 1 or 'columns'; the previous check for
  # 'column' never matched the string form.
  drops_column_level = axis in (1, 'columns')
  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'droplevel',
          lambda df: df.droplevel(level, axis=axis), [self._expr],
          requires_partition_by=partitionings.Arbitrary(),
          preserves_partition_by=(
              partitionings.Arbitrary()
              if drops_column_level else partitionings.Singleton())))

@frame_base.with_docs_from(pd.DataFrame)

@frame_base.args_to_kwargs(pd.DataFrame)

def swaplevel(self, **kwargs):
  # Applied independently to each partition; every keyword argument is
  # forwarded to the pandas implementation unchanged.
  def swap(df):
    return df.swaplevel(**kwargs)

  swapped = expressions.ComputedExpression(
      'swaplevel',
      swap, [self._expr],
      requires_partition_by=partitionings.Arbitrary(),
      preserves_partition_by=partitionings.Arbitrary())
  return frame_base.DeferredFrame.wrap(swapped)

@frame_base.with_docs_from(pd.DataFrame)

@frame_base.args_to_kwargs(pd.DataFrame)

@frame_base.populate_defaults(pd.DataFrame)

@frame_base.maybe_inplace

def fillna(self, value, method, axis, limit, **kwargs):
  """When ``axis="index"``, both ``method`` and ``limit`` must be ``None``,
  otherwise this operation is order-sensitive."""
  # Default value is None, but is overridden with index.
  axis = axis or 'index'

  if axis in (0, 'index'):
    if method is not None:
      raise frame_base.WontImplementError(
          f"fillna(method={method!r}, axis={axis!r}) is not supported "
          "because it is order-sensitive. Only fillna(method=None) is "
          f"supported with axis={axis!r}.",
          reason="order-sensitive")
    if limit is not None:
      # Report the offending ``limit`` value (this message previously
      # interpolated ``method``, which is always None at this point).
      raise frame_base.WontImplementError(
          f"fillna(limit={limit!r}, axis={axis!r}) is not supported because "
          "it is order-sensitive. Only fillna(limit=None) is supported with "
          f"axis={axis!r}.",
          reason="order-sensitive")

  if isinstance(self, DeferredDataFrame) and isinstance(value,
                                                        DeferredSeries):
    # If self is a DataFrame and value is a Series we want to broadcast value
    # to all partitions of self.
    # This is OK, as its index must be the same size as the columns set of
    # self, so cannot be too large.
    class AsScalar(object):
      def __init__(self, value):
        self.value = value

    with expressions.allow_non_parallel_operations():
      value_expr = expressions.ComputedExpression(
          'as_scalar',
          lambda df: AsScalar(df), [value._expr],
          requires_partition_by=partitionings.Singleton())

    # Unwrap the AsScalar holder again inside the fillna computation.
    get_value = lambda x: x.value
    requires = partitionings.Arbitrary()
  elif isinstance(value, frame_base.DeferredBase):
    # For other DeferredBase combinations, use Index partitioning to
    # co-locate on the Index
    value_expr = value._expr
    get_value = lambda x: x
    requires = partitionings.Index()
  else:
    # Default case, pass value through as a constant, no particular
    # partitioning requirement
    value_expr = expressions.ConstantExpression(value)
    get_value = lambda x: x
    requires = partitionings.Arbitrary()

  return frame_base.DeferredFrame.wrap(
      # yapf: disable
      expressions.ComputedExpression(
          'fillna',
          lambda df, value: df.fillna(
              get_value(value), method=method, axis=axis, limit=limit,
              **kwargs),
          [self._expr, value_expr],
          preserves_partition_by=partitionings.Arbitrary(),
          requires_partition_by=requires))

# Each fill alias is only defined when pd.DataFrame itself has an attribute of
# that name; every alias forwards to fillna via _fillna_alias.
if hasattr(pd.DataFrame, 'ffill'):
  ffill = _fillna_alias('ffill')
if hasattr(pd.DataFrame, 'bfill'):
  bfill = _fillna_alias('bfill')
if hasattr(pd.DataFrame, 'backfill'):
  backfill = _fillna_alias('backfill')
if hasattr(pd.DataFrame, 'pad'):
  pad = _fillna_alias('pad')

@frame_base.with_docs_from(pd.DataFrame)

def first(self, offset):
  # Both stages run the same pandas computation: each partition is trimmed
  # independently, then the survivors are combined on a single node and
  # trimmed once more.
  def take_first(df):
    return df.sort_index().first(offset=offset)

  per_partition = expressions.ComputedExpression(
      'first-per-partition',
      take_first, [self._expr],
      preserves_partition_by=partitionings.Arbitrary(),
      requires_partition_by=partitionings.Arbitrary())

  with expressions.allow_non_parallel_operations(True):
    combined = expressions.ComputedExpression(
        'first',
        take_first, [per_partition],
        preserves_partition_by=partitionings.Arbitrary(),
        requires_partition_by=partitionings.Singleton())
    return frame_base.DeferredFrame.wrap(combined)

@frame_base.with_docs_from(pd.DataFrame)

def last(self, offset):
  # Both stages run the same pandas computation: each partition is trimmed
  # independently, then the survivors are combined on a single node and
  # trimmed once more.
  def take_last(df):
    return df.sort_index().last(offset=offset)

  per_partition = expressions.ComputedExpression(
      'last-per-partition',
      take_last, [self._expr],
      preserves_partition_by=partitionings.Arbitrary(),
      requires_partition_by=partitionings.Arbitrary())

  with expressions.allow_non_parallel_operations(True):
    combined = expressions.ComputedExpression(
        'last',
        take_last, [per_partition],
        preserves_partition_by=partitionings.Arbitrary(),
        requires_partition_by=partitionings.Singleton())
    return frame_base.DeferredFrame.wrap(combined)

@frame_base.with_docs_from(pd.DataFrame)

@frame_base.args_to_kwargs(pd.DataFrame)

@frame_base.populate_defaults(pd.DataFrame)

def groupby(self, by, level, axis, as_index, group_keys, **kwargs):
  """``as_index`` must be ``True``.

  Aggregations grouping by a categorical column with ``observed=False`` set
  are not currently parallelizable
  (`Issue 21827 <https://github.com/apache/beam/issues/21827>`_).
  """
  if not as_index:
    raise NotImplementedError('groupby(as_index=False)')

  # Grouping along the columns needs no repartitioning; it is handed to
  # _DeferredGroupByCols directly.
  if axis in (1, 'columns'):
    return _DeferredGroupByCols(
        expressions.ComputedExpression(
            'groupbycols',
            lambda df: df.groupby(
                by, axis=axis, group_keys=group_keys, **kwargs), [self._expr],
            requires_partition_by=partitionings.Arbitrary(),
            preserves_partition_by=partitionings.Arbitrary()),
        group_keys=group_keys)

  # Every branch below produces four values:
  #   to_group: expression whose index holds only the grouping keys.
  #   to_group_with_index: expression whose index holds the grouping keys
  #     together with the original index levels.
  #   grouping_columns: column labels used as grouping keys (may be empty).
  #   grouping_indexes: index levels/names used as grouping keys.
  if level is None and by is None:
    raise TypeError("You have to supply one of 'by' and 'level'")

  elif level is not None:
    # Group by one or more existing index levels.
    if isinstance(level, (list, tuple)):
      grouping_indexes = level
    else:
      grouping_indexes = [level]

    grouping_columns = []

    index = self._expr.proxy().index

    # Translate to level numbers only
    grouping_indexes = [
        l if isinstance(l, int) else index.names.index(l)
        for l in grouping_indexes
    ]

    if index.nlevels == 1:
      to_group_with_index = self._expr
      to_group = self._expr
    else:
      levels_to_drop = [
          i for i in range(index.nlevels) if i not in grouping_indexes
      ]

      # Reorder so the grouped indexes are first
      to_group_with_index = self.reorder_levels(
          grouping_indexes + levels_to_drop)

      # After reordering, the grouped levels occupy positions
      # 0..len(grouping_indexes)-1 and the rest follow them.
      grouping_indexes = list(range(len(grouping_indexes)))
      levels_to_drop = list(range(len(grouping_indexes), index.nlevels))
      if levels_to_drop:
        to_group = to_group_with_index.droplevel(levels_to_drop)._expr
      else:
        to_group = to_group_with_index._expr
      to_group_with_index = to_group_with_index._expr

  elif callable(by):
    # Group by the result of applying ``by`` to each index value.

    def map_index(df):
      df = df.copy()
      df.index = df.index.map(by)
      return df

    to_group = expressions.ComputedExpression(
        'map_index',
        map_index, [self._expr],
        requires_partition_by=partitionings.Arbitrary(),
        preserves_partition_by=partitionings.Singleton())

    orig_nlevels = self._expr.proxy().index.nlevels

    def prepend_mapped_index(df):
      df = df.copy()

      index = df.index.to_frame()
      index.insert(0, None, df.index.map(by))

      df.index = pd.MultiIndex.from_frame(
          index, names=[None] + list(df.index.names))
      return df

    to_group_with_index = expressions.ComputedExpression(
        'map_index_keep_orig',
        prepend_mapped_index,
        [self._expr],
        requires_partition_by=partitionings.Arbitrary(),
        # Partitioning by the original indexes is preserved
        preserves_partition_by=partitionings.Index(
            list(range(1, orig_nlevels + 1))))

    grouping_columns = []
    # The mapped index was inserted at position 0, so that is the level we
    # need to group by.
    grouping_indexes = [0]

  elif isinstance(by, DeferredSeries):
    # Group by the values of another deferred series, aligned on the index
    # with an inner join.

    if isinstance(self, DeferredSeries):

      def set_index(s, by):
        df = pd.DataFrame(s)
        df, by = df.align(by, axis=0, join='inner')
        return df.set_index(by).iloc[:, 0]

      def prepend_index(s, by):
        df = pd.DataFrame(s)
        df, by = df.align(by, axis=0, join='inner')
        return df.set_index([by, df.index]).iloc[:, 0]

    else:

      def set_index(df, by):  # type: ignore
        df, by = df.align(by, axis=0, join='inner')
        return df.set_index(by)

      def prepend_index(df, by):  # type: ignore
        df, by = df.align(by, axis=0, join='inner')
        return df.set_index([by, df.index])

    to_group = expressions.ComputedExpression(
        'set_index',
        set_index, [self._expr, by._expr],
        requires_partition_by=partitionings.Index(),
        preserves_partition_by=partitionings.Singleton())

    orig_nlevels = self._expr.proxy().index.nlevels
    to_group_with_index = expressions.ComputedExpression(
        'prependindex',
        prepend_index, [self._expr, by._expr],
        requires_partition_by=partitionings.Index(),
        preserves_partition_by=partitionings.Index(
            list(range(1, orig_nlevels + 1))))

    grouping_columns = []
    grouping_indexes = [0]

  elif isinstance(by, np.ndarray):
    raise frame_base.WontImplementError(
        "Grouping by a concrete ndarray is order sensitive.",
        reason="order-sensitive")

  elif isinstance(self, DeferredDataFrame):
    # Group by column labels and/or index level names.
    if not isinstance(by, list):
      by = [by]
    # Find the columns that we need to move into the index so we can group by
    # them
    column_names = self._expr.proxy().columns
    grouping_columns = list(set(by).intersection(column_names))
    index_names = self._expr.proxy().index.names
    # Every label must name either an index level or a column.
    for label in by:
      if label not in index_names and label not in self._expr.proxy().columns:
        raise KeyError(label)
    grouping_indexes = list(set(by).intersection(index_names))

    if grouping_indexes:
      if set(by) == set(index_names):
        # Grouping by exactly the existing index levels.
        to_group = self._expr
      elif set(by).issubset(index_names):
        # Grouping by a subset of the index levels: drop the others.
        to_group = self.droplevel(index_names.difference(by))._expr
      else:
        # Mix of index levels and columns.
        to_group = self.reset_index(grouping_indexes).set_index(by)._expr
    else:
      to_group = self.set_index(by)._expr

    if grouping_columns:
      # TODO(https://github.com/apache/beam/issues/20759):
      # It should be possible to do this without creating
      # an expression manually, by using DeferredDataFrame.set_index, i.e.:
      #   to_group_with_index = self.set_index([self.index] +
      #                                        grouping_columns)._expr
      to_group_with_index = expressions.ComputedExpression(
          'move_grouped_columns_to_index',
          lambda df: df.set_index([df.index] + grouping_columns, drop=False),
          [self._expr],
          requires_partition_by=partitionings.Arbitrary(),
          preserves_partition_by=partitionings.Index(
              list(range(self._expr.proxy().index.nlevels))))
    else:
      to_group_with_index = self._expr

  else:
    raise NotImplementedError(by)

  # The actual groupby always groups on every level of to_group's index and
  # requires the data to be partitioned by index.
  return DeferredGroupBy(
      expressions.ComputedExpression(
          'groupbyindex',
          lambda df: df.groupby(
              level=list(range(df.index.nlevels)), group_keys=group_keys,
              **kwargs), [to_group],
          requires_partition_by=partitionings.Index(),
          preserves_partition_by=partitionings.Arbitrary()),
      kwargs,
      to_group,
      to_group_with_index,
      grouping_columns=grouping_columns,
      grouping_indexes=grouping_indexes,
      group_keys=group_keys)

@property # type: ignore

@frame_base.with_docs_from(pd.DataFrame)

def loc(self):
  # Indexing through ``loc`` is delegated to the _DeferredLoc helper, which
  # wraps this frame.
  return _DeferredLoc(self)

@property # type: ignore

@frame_base.with_docs_from(pd.DataFrame)

def iloc(self):
  """Position-based indexing with `iloc` is order-sensitive in almost every
  case. Beam DataFrame users should prefer label-based indexing with `loc`.
  """
  # Indexing through ``iloc`` is delegated to the _DeferredILoc helper, which
  # wraps this frame.
  return _DeferredILoc(self)

@frame_base.with_docs_from(pd.DataFrame)

@frame_base.args_to_kwargs(pd.DataFrame)

@frame_base.populate_defaults(pd.DataFrame)

@frame_base.maybe_inplace

def reset_index(self, level=None, **kwargs):
  """Dropping the entire index (e.g. with ``reset_index(level=None)``) is
  not parallelizable. It is also only guaranteed that the newly generated
  index values will be unique. The Beam DataFrame API makes no guarantee
  that the same index values as the equivalent pandas operation will be
  generated, because that implementation is order-sensitive."""
  # Normalize a single level into a one-element list.
  if not (level is None or isinstance(level, (tuple, list))):
    level = [level]

  drops_whole_index = (
      level is None or len(level) == self._expr.proxy().index.nlevels)
  if drops_whole_index:
    # TODO(https://github.com/apache/beam/issues/20859):
    # Could do distributed re-index with offsets.
    requires_partition_by = partitionings.Singleton(
        reason=(
            f"reset_index(level={level!r}) drops the entire index and "
            "creates a new one, so it cannot currently be parallelized "
            "(https://github.com/apache/beam/issues/20859)."))
  else:
    requires_partition_by = partitionings.Arbitrary()

  def reset(df):
    return df.reset_index(level=level, **kwargs)

  result = expressions.ComputedExpression(
      'reset_index',
      reset, [self._expr],
      preserves_partition_by=partitionings.Singleton(),
      requires_partition_by=requires_partition_by)
  return frame_base.DeferredFrame.wrap(result)

# ``abs`` is generated by frame_base._elementwise_method from the pandas
# NDFrame method of the same name.
abs = frame_base._elementwise_method('abs', base=pd.core.generic.NDFrame)

@frame_base.with_docs_from(pd.core.generic.NDFrame)

@frame_base.args_to_kwargs(pd.core.generic.NDFrame)

@frame_base.populate_defaults(pd.core.generic.NDFrame)

def astype(self, dtype, copy, errors):
  """astype is not parallelizable when ``errors="ignore"`` is specified.

  ``copy=False`` is not supported because it relies on memory-sharing
  semantics.

  ``dtype="category"`` is not supported because the type of the output column
  depends on the data. Please use ``pd.CategoricalDtype`` with explicit
  categories instead.
  """
  if errors == "ignore":
    # We need all data in order to ignore errors and propagate the original
    # data.
    requires = partitionings.Singleton(
        reason=(
            f"astype(errors={errors!r}) is currently not parallelizable, "
            "because all data must be collected on one node to determine if "
            "the original data should be propagated instead."))
  else:
    requires = partitionings.Arbitrary()

  if not copy:
    raise frame_base.WontImplementError(
        f"astype(copy={copy!r}) is not supported because it relies on "
        "memory-sharing semantics that are not compatible with the Beam "
        "model.")

  # An instance of CategoricalDtype is actually considered equal to the
  # string 'category', so we have to explicitly check if dtype is an instance
  # of CategoricalDtype, and allow it.
  # See https://github.com/apache/beam/issues/23276
  is_bare_category = (
      dtype == 'category' and not isinstance(dtype, pd.CategoricalDtype))
  if is_bare_category:
    raise frame_base.WontImplementError(
        "astype(dtype='category') is not supported because the type of the "
        "output column depends on the data. Please use pd.CategoricalDtype "
        "with explicit categories instead.",
        reason="non-deferred-columns")

  def convert_dtype(df):
    return df.astype(dtype=dtype, copy=copy, errors=errors)

  converted = expressions.ComputedExpression(
      'astype',
      convert_dtype, [self._expr],
      requires_partition_by=requires,
      preserves_partition_by=partitionings.Arbitrary())
  return frame_base.DeferredFrame.wrap(converted)

# These three methods are generated by frame_base._elementwise_method from
# the pandas NDFrame methods of the same names.
at_time = frame_base._elementwise_method(
    'at_time', base=pd.core.generic.NDFrame)
between_time = frame_base._elementwise_method(
    'between_time', base=pd.core.generic.NDFrame)
copy = frame_base._elementwise_method('copy', base=pd.core.generic.NDFrame)

@frame_base.with_docs_from(pd.DataFrame)

@frame_base.args_to_kwargs(pd.DataFrame)

@frame_base.populate_defaults(pd.DataFrame)

@frame_base.maybe_inplace

def replace(self, to_replace, value, limit, method, **kwargs):
  """``method`` is not supported in the Beam DataFrame API because it is
  order-sensitive. It cannot be specified.

  If ``limit`` is specified this operation is not parallelizable."""
  # The "value not given" sentinel depends on the pandas version.
  # pylint: disable-next=c-extension-no-member
  value_compare = lib.no_default if PD_VERSION >= (1, 4) else None

  # pandas only relies on method if to_replace is not a dictionary, and
  # value is the <no_default> value. This is different than
  # if ``None`` is explicitly passed for ``value``. In this case, it will be
  # respected
  relies_on_method = (
      method is not None and not isinstance(to_replace, dict) and
      value is value_compare)
  if relies_on_method:
    raise frame_base.WontImplementError(
        f"replace(method={method!r}) is not supported because it is "
        "order sensitive. Only replace(method=None) is supported.",
        reason="order-sensitive")

  if limit is not None:
    requires_partition_by = partitionings.Singleton(
        reason=(
            f"replace(limit={limit!r}) cannot currently be parallelized. It "
            "requires collecting all data on a single node."))
  else:
    requires_partition_by = partitionings.Arbitrary()

  def do_replace(df):
    return df.replace(
        to_replace=to_replace,
        value=value,
        limit=limit,
        method=method,
        **kwargs)

  replaced = expressions.ComputedExpression(
      'replace',
      do_replace, [self._expr],
      preserves_partition_by=partitionings.Arbitrary(),
      requires_partition_by=requires_partition_by)
  return frame_base.DeferredFrame.wrap(replaced)

@frame_base.with_docs_from(pd.DataFrame)

@frame_base.args_to_kwargs(pd.DataFrame)

@frame_base.populate_defaults(pd.DataFrame)

def tz_localize(self, ambiguous, **kwargs):
  """``ambiguous`` cannot be set to ``"infer"`` as its semantics are
  order-sensitive. Similarly, specifying ``ambiguous`` as an
  :class:`~numpy.ndarray` is order-sensitive, but you can achieve similar
  functionality by specifying ``ambiguous`` as a Series."""
  if isinstance(ambiguous, np.ndarray):
    raise frame_base.WontImplementError(
        "tz_localize(ambiguous=ndarray) is not supported because it makes "
        "this operation sensitive to the order of the data. Please use a "
        "DeferredSeries instead.",
        reason="order-sensitive")

  if isinstance(ambiguous, frame_base.DeferredFrame):
    # A deferred ``ambiguous`` is passed in as a second input and co-located
    # with this frame by index.
    def localize_with(df, ambiguous_values):
      return df.tz_localize(ambiguous=ambiguous_values, **kwargs)

    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'tz_localize',
            localize_with, [self._expr, ambiguous._expr],
            requires_partition_by=partitionings.Index(),
            preserves_partition_by=partitionings.Singleton()))

  if ambiguous == 'infer':
    # infer attempts to infer based on the order of the timestamps
    raise frame_base.WontImplementError(
        f"tz_localize(ambiguous={ambiguous!r}) is not allowed because it "
        "makes this operation sensitive to the order of the data.",
        reason="order-sensitive")

  def localize(df):
    return df.tz_localize(ambiguous=ambiguous, **kwargs)

  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'tz_localize',
          localize, [self._expr],
          requires_partition_by=partitionings.Arbitrary(),
          preserves_partition_by=partitionings.Singleton()))

@property # type: ignore

@frame_base.with_docs_from(pd.DataFrame)

def size(self):
  # Stage 1: the size of each partition, computed independently.
  def partition_size(df):
    # Wrap scalar results in a Series for easier concatenation later
    return pd.Series(df.size)

  sizes = expressions.ComputedExpression(
      'get_sizes',
      partition_size, [self._expr],
      requires_partition_by=partitionings.Arbitrary(),
      preserves_partition_by=partitionings.Singleton())

  # Stage 2: add up the per-partition sizes on a single node.
  with expressions.allow_non_parallel_operations(True):
    total = expressions.ComputedExpression(
        'sum_sizes',
        lambda all_sizes: all_sizes.sum(), [sizes],
        requires_partition_by=partitionings.Singleton(),
        preserves_partition_by=partitionings.Singleton())
    return frame_base.DeferredFrame.wrap(total)

def length(self):
  """Alternative to ``len(df)`` which returns a deferred result that can be
  used in arithmetic with :class:`DeferredSeries` or
  :class:`DeferredDataFrame` instances."""
  # Stage 1: the length of each partition, computed independently.
  def partition_length(df):
    # Wrap scalar results in a Series for easier concatenation later
    return pd.Series(len(df))

  lengths = expressions.ComputedExpression(
      'get_lengths',
      partition_length, [self._expr],
      requires_partition_by=partitionings.Arbitrary(),
      preserves_partition_by=partitionings.Singleton())

  # Stage 2: add up the per-partition lengths on a single node.
  with expressions.allow_non_parallel_operations(True):
    total = expressions.ComputedExpression(
        'sum_lengths',
        lambda all_lengths: all_lengths.sum(), [lengths],
        requires_partition_by=partitionings.Singleton(),
        preserves_partition_by=partitionings.Singleton())
    return frame_base.DeferredFrame.wrap(total)

def __len__(self):
  # len() must return a concrete int, which a deferred frame cannot provide,
  # so this always raises and the message points callers at length().
  raise frame_base.WontImplementError(
      "len(df) is not currently supported because it produces a non-deferred "
      "result. Consider using df.length() instead.",
      reason="non-deferred-result")

@property # type: ignore

@frame_base.with_docs_from(pd.DataFrame)

def empty(self):
  # Stage 1: whether each partition is empty, computed independently.
  def partition_empty(df):
    # Wrap scalar results in a Series for easier concatenation later
    return pd.Series(df.empty)

  empties = expressions.ComputedExpression(
      'get_empties',
      partition_empty, [self._expr],
      requires_partition_by=partitionings.Arbitrary(),
      preserves_partition_by=partitionings.Singleton())

  # Stage 2: the whole dataset is empty only if every partition is.
  with expressions.allow_non_parallel_operations(True):
    all_empty = expressions.ComputedExpression(
        'check_all_empty',
        lambda flags: flags.all(), [empties],
        requires_partition_by=partitionings.Singleton(),
        preserves_partition_by=partitionings.Singleton())
    return frame_base.DeferredFrame.wrap(all_empty)

@frame_base.with_docs_from(pd.DataFrame)
def bool(self):
  # TODO: Documentation about DeferredScalar
  def partition_bool(partition):
    # Will throw if any partition has >1 element.
    # Results are wrapped in a Series for easier concatenation later; an
    # empty partition contributes an empty boolean Series.
    if partition.empty:
      return pd.Series([], dtype=bool)
    return pd.Series([partition.bool()])

  def collapse(collected):
    # Will throw if overall dataset has != 1 element.
    return collected.bool()

  partition_bools = expressions.ComputedExpression(
      'get_bools',
      partition_bool,
      [self._expr],
      requires_partition_by=partitionings.Arbitrary(),
      preserves_partition_by=partitionings.Singleton())

  with expressions.allow_non_parallel_operations(True):
    combined = expressions.ComputedExpression(
        'combine_all_bools',
        collapse,
        [partition_bools],
        proxy=bool(),
        requires_partition_by=partitionings.Singleton(),
        preserves_partition_by=partitionings.Singleton())
    return frame_base.DeferredFrame.wrap(combined)

@frame_base.with_docs_from(pd.DataFrame)
def equals(self, other):
  # Stage 1: compare the two inputs partition by partition (both are required
  # to be partitioned by index). Each scalar verdict is wrapped in a Series
  # for easier concatenation later.
  def partitions_match(left, right):
    return pd.Series(left.equals(right))

  # Stage 2: the inputs are equal only if every per-partition verdict is True.
  def all_match(verdicts):
    return verdicts.all()

  per_partition = expressions.ComputedExpression(
      'equals_partitioned',
      partitions_match,
      [self._expr, other._expr],
      requires_partition_by=partitionings.Index(),
      preserves_partition_by=partitionings.Singleton())

  with expressions.allow_non_parallel_operations(True):
    combined = expressions.ComputedExpression(
        'aggregate_equals',
        all_match,
        [per_partition],
        requires_partition_by=partitionings.Singleton(),
        preserves_partition_by=partitionings.Singleton())
    return frame_base.DeferredFrame.wrap(combined)

@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
def sort_values(self, axis, **kwargs):
  """``sort_values`` is not implemented.

  With ``axis=index`` it would impose an ordering on the dataset, and that
  ordering likely will not be maintained (see
  https://s.apache.org/dataframe-order-sensitive-operations).

  With ``axis=columns`` it would make the order of the columns depend on the
  data (see https://s.apache.org/dataframe-non-deferred-columns)."""
  if axis in (0, 'index'):
    # Sorting rows imposes an ordering on the DataFrame rows, which is not
    # supported.
    message = (
        "sort_values(axis=index) is not supported because it imposes an "
        "ordering on the dataset which likely will not be preserved.")
    reason = "order-sensitive"
  else:
    # Sorting columns would reorder the columns based on the data.
    message = (
        "sort_values(axis=columns) is not supported because the order of the "
        "columns in the result depends on the data.")
    reason = "non-deferred-columns"

  raise frame_base.WontImplementError(message, reason=reason)

@frame_base.with_docs_from(pd.DataFrame)
@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
@frame_base.maybe_inplace
def sort_index(self, axis, **kwargs):
  """``axis=index`` is not allowed because it imposes an ordering on the
  dataset, and we cannot guarantee it will be maintained (see
  https://s.apache.org/dataframe-order-sensitive-operations). Only
  ``axis=columns`` is allowed."""
  sorting_rows = axis in (0, 'index')
  if sorting_rows:
    # Sorting along the rows imposes an ordering on the DataFrame, which is
    # not supported.
    message = (
        "sort_index(axis=index) is not supported because it imposes an "
        "ordering on the dataset which we cannot guarantee will be "
        "preserved.")
    raise frame_base.WontImplementError(message, reason="order-sensitive")

  # Otherwise the columns are reordered by name, applied to each partition.
  def reorder_columns(df):
    return df.sort_index(axis=axis, **kwargs)

  sorted_expr = expressions.ComputedExpression(
      'sort_index',
      reorder_columns,
      [self._expr],
      requires_partition_by=partitionings.Arbitrary(),
      preserves_partition_by=partitionings.Arbitrary(),
  )
  return frame_base.DeferredFrame.wrap(sorted_expr)

@frame_base.with_docs_from(pd.DataFrame)
@frame_base.args_to_kwargs(
    pd.DataFrame, removed_args=["errors"] if PD_VERSION >= (2, 0) else None)
@frame_base.populate_defaults(
    pd.DataFrame, removed_args=["errors"] if PD_VERSION >= (2, 0) else None)
@frame_base.maybe_inplace
def where(self, cond, other, errors, **kwargs):
  """where is not parallelizable when ``errors="ignore"`` is specified."""
  requires = partitionings.Arbitrary()
  # Arguments that are themselves deferred frames; their values are only
  # available at execution time and are passed positionally to
  # ``where_execution`` in this dict's insertion order.
  deferred_args = {}
  # Arguments whose concrete values are already known at construction time.
  actual_args = {}

  # TODO(bhulette): This is very similar to the logic in
  # frame_base.elementwise_method, can we unify it?
  for arg_name, arg_value in (('cond', cond), ('other', other)):
    if isinstance(arg_value, frame_base.DeferredFrame):
      deferred_args[arg_name] = arg_value
      # A deferred argument forces Index partitioning of the inputs.
      requires = partitionings.Index()
    else:
      actual_args[arg_name] = arg_value

  # For Pandas 2.0, errors was removed as an argument.
  if PD_VERSION < (2, 0):
    # ``errors`` is a named parameter of this method, so it is never present
    # in ``kwargs``; the parameter itself must be inspected and forwarded.
    if errors == "ignore":
      # We need all data in order to ignore errors and propagate the original
      # data.
      requires = partitionings.Singleton(
          reason=(
              f"where(errors={errors!r}) is currently not "
              "parallelizable, because all data must be collected on one "
              "node to determine if the original data should be propagated "
              "instead."))

    actual_args['errors'] = errors

  def where_execution(df, *args):
    # Pair each deferred argument name with its runtime value.
    runtime_values = dict(zip(deferred_args, args))
    return df.where(**runtime_values, **actual_args, **kwargs)

  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          "where",
          where_execution,
          [self._expr] + [df._expr for df in deferred_args.values()],
          requires_partition_by=requires,
          preserves_partition_by=partitionings.Index(),
      ))

@frame_base.with_docs_from(pd.DataFrame)
@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
@frame_base.maybe_inplace
def mask(self, cond, **kwargs):
  """mask is not parallelizable when ``errors="ignore"`` is specified."""
  # Implemented by delegating to ``where`` with the inverted condition; all
  # remaining keyword arguments are forwarded unchanged.
  inverted_cond = ~cond
  return self.where(inverted_cond, **kwargs)

@frame_base.with_docs_from(pd.DataFrame)
@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
def truncate(self, before, after, axis):
  # When truncating along the rows, each partition's index is sorted before
  # the truncation is applied; column truncation is applied directly.
  along_rows = axis in (None, 0, 'index')

  def truncate_partition(df):
    source = df.sort_index() if along_rows else df
    return source.truncate(before=before, after=after, axis=axis)

  truncated = expressions.ComputedExpression(
      'truncate',
      truncate_partition,
      [self._expr],
      requires_partition_by=partitionings.Arbitrary(),
      preserves_partition_by=partitionings.Arbitrary())
  return frame_base.DeferredFrame.wrap(truncated)

@frame_base.with_docs_from(pd.DataFrame)

@frame_base.args_to_kwargs(pd.DataFrame)

@frame_base.populate_defaults(pd.DataFrame)

def unstack(self, **kwargs):

level = kwargs.get('level', -1)

if self._expr.proxy().index.nlevels == 1:

if PD_VERSION < (1, 2):

raise frame_base.WontImplementError(

"unstack() is not supported when using pandas < 1.2.0\n"

"Please upgrade to pandas 1.2.0 or higher to use this operation.")

return frame_base.DeferredFrame.wrap(

expressions.ComputedExpression(

'unstack', lambda s: s.unstack(**kwargs), [self._expr],

requires_partition_by=partitionings.Index()))

else:

# Unstacking MultiIndex objects

idx = self._expr.proxy().index

# Converting level (int, str, or combination) to a list of number levels

level_list = level if isinstance(level, list) else [level]

level_number_list = [idx._get_level_number(l) for l in level_list]

# Checking if levels provided are of CategoricalDtype

if not all(isinstance(idx.levels[l].dtype, (pd.CategoricalDtype,

pd.BooleanDtype))

for l in level_number_list):

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

frames.py

Latest commit

History

frames.py

File metadata and controls