-
Notifications
You must be signed in to change notification settings - Fork 4.5k
Expand file tree
/
Copy pathconvert.py
More file actions
303 lines (253 loc) · 11.1 KB
/
convert.py
File metadata and controls
303 lines (253 loc) · 11.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import warnings
import weakref
from collections.abc import Iterable
from typing import Any
from typing import Optional
from typing import Union
import pandas as pd
import apache_beam as beam
from apache_beam import pvalue
from apache_beam.dataframe import expressions
from apache_beam.dataframe import frame_base
from apache_beam.dataframe import transforms
from apache_beam.dataframe.schemas import element_typehint_from_dataframe_proxy
from apache_beam.dataframe.schemas import generate_proxy
from apache_beam.typehints.pandas_type_compatibility import dtype_to_fieldtype
# TODO: Or should this be called as_dataframe?
def to_dataframe(
    pcoll: pvalue.PCollection,
    proxy: Optional[pd.core.generic.NDFrame] = None,
    label: Optional[str] = None,
) -> frame_base.DeferredFrame:
  """Wraps a PCollection in a deferred, dataframe-like object.

  The returned object can be manipulated with pandas methods like `filter`
  and `groupby`. For example, one might write::

    pcoll = ...
    df = to_dataframe(pcoll, proxy=...)
    result = df.groupby('col').sum()
    pcoll_result = to_pcollection(result)

  A proxy object must be given if the schema for the PCollection is not known.

  Args:
    pcoll: the PCollection to wrap.
    proxy: (optional) a pandas object describing the data in `pcoll`. When
      given, `pcoll` is wrapped directly and no transform is applied. When
      omitted, a proxy is generated from `pcoll.element_type` and a batching
      ParDo is applied to `pcoll` first.
    label: (optional) label for the batching ParDo; only used when no proxy
      is given. Defaults to "BatchElements(<name>)", where <name> is the
      caller's variable name for `pcoll` ('...' if it cannot be found).

  Returns:
    A DeferredFrame wrapping a placeholder expression that refers to the
    (possibly batched) PCollection.

  Raises:
    ValueError: if no proxy is given and `pcoll.element_type` is None.
    AssertionError: if the generated proxy is neither a DataFrame nor a
      Series.
  """
  if proxy is not None:
    # Caller-supplied proxy: use the PCollection exactly as it is.
    return frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(proxy, pcoll))

  # No proxy given: assume this is an element-wise schema-aware PCollection
  # that needs to be batched.
  element_type = pcoll.element_type
  if element_type is None:
    raise ValueError(
        "Cannot infer a proxy because the input PCollection does not have a "
        "schema defined. Please make sure a schema type is specified for "
        "the input PCollection, or provide a proxy.")

  if label is None:
    # Attempt to come up with a reasonable, stable label by retrieving the
    # name of this variable in the calling context. The level (2) is counted
    # from _var_name's own frame, so this call must stay directly in this
    # function body.
    label = 'BatchElements(%s)' % _var_name(pcoll, 2)

  generated_proxy = generate_proxy(element_type)

  batching_fn: beam.DoFn
  if isinstance(generated_proxy, pd.DataFrame):
    batching_fn = RowsToDataFrameFn()
  elif isinstance(generated_proxy, pd.Series):
    batching_fn = ElementsToSeriesFn()
  else:
    raise AssertionError("Unknown proxy type: %s" % generated_proxy)

  batched = pcoll | label >> beam.ParDo(batching_fn)
  return frame_base.DeferredFrame.wrap(
      expressions.PlaceholderExpression(generated_proxy, batched))
# PCollections generated by to_pcollection are memoized, keyed by expression id.
# A WeakValueDictionary is used so the caches are cleaned up along with their
# parent pipelines.
#
# Note that the pipeline (indirectly) holds references to the transforms, which
# keeps both the PCollections and expressions alive. This ensures the
# expressions' ids are never accidentally re-used.
TO_PCOLLECTION_CACHE: 'weakref.WeakValueDictionary[str, pvalue.PCollection]' = (
    weakref.WeakValueDictionary())
# PCollections produced by _make_unbatched_pcoll, keyed by the label of the
# unbatching transform (which embeds the expression id and whether indexes
# were requested).
UNBATCHED_CACHE: 'weakref.WeakValueDictionary[str, pvalue.PCollection]' = (
    weakref.WeakValueDictionary())
class RowsToDataFrameFn(beam.DoFn):
  """DoFn whose `process_batch` yields each incoming DataFrame batch unchanged.

  Applied by `to_dataframe` when the generated proxy is a `pd.DataFrame`.
  """

  # NOTE(review): the `yields_elements` decorator together with the
  # `pd.DataFrame` annotations is presumably what makes Beam hand this DoFn
  # whole DataFrame batches and treat each yielded DataFrame as a single
  # output element -- confirm against Beam's batched DoFn documentation.
  @beam.DoFn.yields_elements
  def process_batch(self, batch: pd.DataFrame) -> Iterable[pd.DataFrame]:
    yield batch
class ElementsToSeriesFn(beam.DoFn):
  """DoFn whose `process_batch` yields each incoming Series batch unchanged.

  Applied by `to_dataframe` when the generated proxy is a `pd.Series`.
  """

  # NOTE(review): the `yields_elements` decorator together with the
  # `pd.Series` annotations is presumably what makes Beam hand this DoFn
  # whole Series batches and treat each yielded Series as a single output
  # element -- confirm against Beam's batched DoFn documentation.
  @beam.DoFn.yields_elements
  def process_batch(self, batch: pd.Series) -> Iterable[pd.Series]:
    yield batch
def _make_unbatched_pcoll(
    pc: pvalue.PCollection, expr: expressions.Expression,
    include_indexes: bool):
  """Returns the (memoized) unbatched PCollection for `expr`'s results.

  Applies a ParDo of `DataFrameToRowsFn` or `SeriesToElementsFn` to `pc`,
  chosen by the type of `expr.proxy()`, and caches the result in
  `UNBATCHED_CACHE`.

  Args:
    pc: the PCollection holding the pandas results computed for `expr`.
    expr: the expression that produced `pc`; its `_id` and proxy are used.
    include_indexes: whether index columns are requested in the output.
      Only honoured for DataFrame proxies; a warning is emitted if it is
      set for a Series proxy.

  Returns:
    The PCollection produced by the unbatching ParDo.

  Raises:
    TypeError: if the proxy is neither a `pd.DataFrame` nor a `pd.Series`.
  """
  # The cache is keyed by the expression id as well as the parameters for the
  # unbatching (i.e. include_indexes), both of which are encoded in the label.
  label = f"Unbatch '{expr._id}'"
  if include_indexes:
    label += " with indexes"

  # Look the entry up once and hold on to it: entries of a
  # WeakValueDictionary can disappear between a membership test and a
  # subsequent lookup, so a local strong reference is the safe pattern.
  unbatched = UNBATCHED_CACHE.get(label)
  if unbatched is None:
    proxy = expr.proxy()
    shim_dofn: beam.DoFn
    if isinstance(proxy, pd.DataFrame):
      shim_dofn = DataFrameToRowsFn(proxy, include_indexes)
    elif isinstance(proxy, pd.Series):
      if include_indexes:
        warnings.warn(
            "Pipeline is converting a DeferredSeries to PCollection "
            "with include_indexes=True. Note that this parameter is "
            "_not_ respected for DeferredSeries conversion. To "
            "include the index with your data, produce a "
            "DeferredDataFrame instead.")
      shim_dofn = SeriesToElementsFn(proxy)
    else:
      raise TypeError(f"Proxy '{proxy}' has unsupported type '{type(proxy)}'")

    unbatched = pc | label >> beam.ParDo(shim_dofn)
    UNBATCHED_CACHE[label] = unbatched

  return unbatched
class DataFrameToRowsFn(beam.DoFn):
  """DoFn whose `process` yields each incoming DataFrame unchanged.

  Applied by `_make_unbatched_pcoll` when the expression proxy is a
  `pd.DataFrame`.
  """
  def __init__(self, proxy, include_indexes):
    # proxy: pandas object passed to the output type inference below.
    # include_indexes: forwarded to the output type inference below.
    self._proxy = proxy
    self._include_indexes = include_indexes

  # NOTE(review): `yields_batches` presumably tells Beam that each yielded
  # DataFrame is a batch to be expanded into individual elements -- confirm
  # against Beam's batched DoFn documentation.
  @beam.DoFn.yields_batches
  def process(self, element: pd.DataFrame) -> Iterable[pd.DataFrame]:
    yield element

  def infer_output_type(self, input_element_type):
    # The output type is derived from the proxy (and include_indexes) only;
    # `input_element_type` is not consulted.
    return element_typehint_from_dataframe_proxy(
        self._proxy, self._include_indexes)
class SeriesToElementsFn(beam.DoFn):
  """DoFn whose `process` yields each incoming Series unchanged.

  Applied by `_make_unbatched_pcoll` when the expression proxy is a
  `pd.Series`.
  """
  def __init__(self, proxy):
    # proxy: pandas object whose dtype drives the output type inference below.
    self._proxy = proxy

  # NOTE(review): `yields_batches` presumably tells Beam that each yielded
  # Series is a batch to be expanded into individual elements -- confirm
  # against Beam's batched DoFn documentation.
  @beam.DoFn.yields_batches
  def process(self, element: pd.Series) -> Iterable[pd.Series]:
    yield element

  def infer_output_type(self, input_element_type):
    # The output type is derived from the proxy's dtype only;
    # `input_element_type` is not consulted.
    return dtype_to_fieldtype(self._proxy.dtype)
# TODO: Or should this be called from_dataframe?
def to_pcollection(
    *dataframes: Union[frame_base.DeferredFrame, pd.DataFrame, pd.Series],
    label=None,
    always_return_tuple=False,
    yield_elements='schemas',
    include_indexes=False,
    pipeline=None) -> Union[pvalue.PCollection, tuple[pvalue.PCollection, ...]]:
  """Converts one or more deferred dataframe-like objects back to a PCollection.

  This method creates and applies the actual Beam operations that compute
  the given deferred dataframes, returning a PCollection of their results. By
  default the resulting PCollections are schema-aware PCollections where each
  element is one row from the output dataframes, excluding indexes. This
  behavior can be modified with the `yield_elements` and `include_indexes`
  arguments.

  Also accepts non-deferred pandas dataframes, which are converted to deferred,
  schema'd PCollections. In this case the contents of the entire dataframe are
  serialized into the graph, so for large amounts of data it is preferable to
  write them to disk and read them with one of the read methods.

  If more than one (related) result is desired, it can be more efficient to
  pass them all at the same time to this method.

  Results are memoized by expression id, so converting the same deferred
  object more than once re-uses the PCollection created the first time.

  Args:
    *dataframes: the deferred objects (or non-deferred pandas DataFrames and
      Series) to convert.
    label: (optional, default "ToPCollection(...)") the label to use for the
      conversion transform.
    always_return_tuple: (optional, default: False) If true, always return
      a tuple of PCollections, even if there's only one output.
    yield_elements: (optional, default: "schemas") If set to "pandas", return
      PCollections containing the raw Pandas objects (DataFrames or Series),
      if set to "schemas", return an element-wise PCollection, where DataFrame
      and Series instances are expanded to one element per row. DataFrames are
      converted to schema-aware PCollections, where column values can be
      accessed by attribute.
    include_indexes: (optional, default: False) When yield_elements="schemas",
      if include_indexes=True, attempt to include index columns in the output
      schema for expanded DataFrames. Raises an error if any of the index
      levels are unnamed (name=None), or if any of the names are not unique
      among all column and index names.
    pipeline: (optional, unless non-deferred dataframes are passed) Used when
      creating a PCollection from a non-deferred dataframe.

  Returns:
    A single PCollection when exactly one input was given and
    `always_return_tuple` is false; otherwise a tuple of PCollections in the
    same order as the inputs.

  Raises:
    ValueError: if `yield_elements` is not "pandas" or "schemas", or if a
      non-deferred dataframe is passed without `pipeline`.
    TypeError: if an input is neither a deferred object nor a pandas
      DataFrame/Series, or if an expression root does not refer to a
      PCollection.
  """
  if yield_elements not in ("pandas", "schemas"):
    raise ValueError(
        "Invalid value for yield_elements argument, '%s'. "
        "Allowed values are 'pandas' and 'schemas'" % yield_elements)
  if label is None:
    # Attempt to come up with a reasonable, stable label by retrieving the name
    # of these variables in the calling context. The level (3) accounts for
    # _var_name's own frame, the generator expression's frame and this
    # function's frame, so this must remain a generator expression evaluated
    # directly in this function body.
    label = 'ToPCollection(%s)' % ', '.join(_var_name(e, 3) for e in dataframes)

  # Support for non-deferred dataframes.
  deferred_dataframes = []
  for ix, df in enumerate(dataframes):
    if isinstance(df, frame_base.DeferredBase):
      # TODO(robertwb): Maybe extract pipeline object?
      deferred_dataframes.append(df)
    elif isinstance(df, (pd.Series, pd.DataFrame)):
      if pipeline is None:
        raise ValueError(
            'Pipeline keyword required for non-deferred dataframe conversion.')
      deferred = pipeline | '%s_Defer%s' % (label, ix) >> beam.Create([df])
      # An empty slice of the data serves as the proxy.
      deferred_dataframes.append(
          frame_base.DeferredFrame.wrap(
              expressions.PlaceholderExpression(df.iloc[:0], deferred)))
    else:
      raise TypeError(
          'Unable to convert objects of type %s to a PCollection' % type(df))
  dataframes = tuple(deferred_dataframes)

  def extract_input(placeholder):
    if not isinstance(placeholder._reference, pvalue.PCollection):
      raise TypeError(
          'Expression roots must have been created with to_dataframe.')
    return placeholder._reference

  placeholders = frozenset.union(
      frozenset(), *[df._expr.placeholders() for df in dataframes])

  # Exclude any dataframes that have already been converted to PCollections.
  # We only want to convert each DF expression once, then re-use.
  new_dataframes = [
      df for df in dataframes if df._expr._id not in TO_PCOLLECTION_CACHE
  ]
  if new_dataframes:
    new_results: dict[Any, pvalue.PCollection] = {
        p: extract_input(p)
        for p in placeholders
    } | label >> transforms._DataframeExpressionsTransform(
        {ix: df._expr
         for (ix, df) in enumerate(new_dataframes)})

    # new_results is keyed by position in new_dataframes; re-key by
    # expression id for the cache.
    TO_PCOLLECTION_CACHE.update({
        new_dataframes[ix]._expr._id: pc
        for ix, pc in new_results.items()
    })

  raw_results = {
      ix: TO_PCOLLECTION_CACHE[df._expr._id]
      for ix, df in enumerate(dataframes)
  }

  if yield_elements == "schemas":

    def maybe_unbatch(pc, value):
      # Deferred scalars are returned as computed; only frames are unbatched.
      if isinstance(value, frame_base._DeferredScalar):
        return pc
      return _make_unbatched_pcoll(pc, value._expr, include_indexes)

    results = {
        ix: maybe_unbatch(pc, dataframes[ix])
        for (ix, pc) in raw_results.items()
    }
  else:
    results = raw_results

  if len(results) == 1 and not always_return_tuple:
    return results[0]
  return tuple(pc for _, pc in sorted(results.items()))
def _var_name(obj, level):
frame = inspect.currentframe()
for _ in range(level):
if frame is None:
return '...'
frame = frame.f_back
for key, value in frame.f_locals.items():
if obj is value:
return key
for key, value in frame.f_globals.items():
if obj is value:
return key
return '...'