Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit b94dca2

Browse files
author
Boyuan Zhang
committed
Update SDF APIs
1 parent fc77c31 commit b94dca2

17 files changed

Lines changed: 591 additions & 231 deletions

File tree

model/fn-execution/src/main/proto/beam_fn_api.proto

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ import "beam_runner_api.proto";
4242
import "endpoints.proto";
4343
import "google/protobuf/descriptor.proto";
4444
import "google/protobuf/timestamp.proto";
45+
import "google/protobuf/duration.proto";
4546
import "google/protobuf/wrappers.proto";
4647
import "metrics.proto";
4748

@@ -203,13 +204,21 @@ message BundleApplication {
203204
}
204205

205206
// An Application should be scheduled for execution after a delay.
207+
// Either an absolute timestamp or a relative timestamp can represent a
208+
// scheduled execution time.
206209
message DelayedBundleApplication {
207210
// Recommended time at which the application should be scheduled to execute
208211
// by the runner. Times in the past may be scheduled to execute immediately.
212+
// TODO(BEAM-8536): Migrate usage of absolute time to requested_time_delay.
209213
google.protobuf.Timestamp requested_execution_time = 1;
210214

211215
// (Required) The application that should be scheduled.
212216
BundleApplication application = 2;
217+
218+
// Recommended time delay at which the application should be scheduled to
219+
// execute by the runner. Time delay that equals 0 may be scheduled to execute
220+
// immediately. The unit of time delay should be microseconds.
221+
google.protobuf.Duration requested_time_delay = 3;
213222
}
214223

215224
// A request to process a given bundle.

sdks/python/apache_beam/io/iobase.py

Lines changed: 144 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import logging
3636
import math
3737
import random
38+
import threading
3839
import uuid
3940
from builtins import object
4041
from builtins import range
@@ -1104,13 +1105,17 @@ def process(self, element):
11041105
class RestrictionTracker(object):
11051106
"""Manages concurrent access to a restriction.
11061107
1107-
Experimental; no backwards-compatibility guarantees.
1108-
11091108
Keeps track of the claimed part of the restriction for a Splittable DoFn.
11101109
1110+
The restriction may be modified by different threads, however the system will
1111+
ensure sufficient locking such that no methods on the restriction tracker
1112+
will be called concurrently.
1113+
11111114
See following documents for more details.
11121115
* https://s.apache.org/splittable-do-fn
11131116
* https://s.apache.org/splittable-do-fn-python-sdk
1117+
1118+
Experimental; no backwards-compatibility guarantees.
11141119
"""
11151120

11161121
def current_restriction(self):
@@ -1121,52 +1126,20 @@ def current_restriction(self):
11211126
11221127
The current restriction returned by method may be updated dynamically due
11231128
to concurrent invocation of other methods of the
1124-
``RestrictionTracker``, For example, ``checkpoint()``.
1125-
1126-
** Thread safety **
1129+
``RestrictionTracker``, for example ``try_split()``.
11271130
1128-
Methods of the class ``RestrictionTracker`` including this method may get
1129-
invoked by different threads, hence must be made thread-safe, e.g. by using
1130-
a single lock object.
1131+
This API is required to be implemented.
11311132
1132-
TODO(BEAM-7473): Remove thread safety requirements from API implementation.
1133+
Returns: a restriction object.
11331134
"""
11341135
raise NotImplementedError
11351136

11361137
def current_progress(self):
11371138
"""Returns a RestrictionProgress object representing the current progress.
1138-
"""
1139-
raise NotImplementedError
1140-
1141-
def current_watermark(self):
1142-
"""Returns current watermark. By default, not report watermark.
1143-
1144-
TODO(BEAM-7473): Provide synchronization guarantee by using a wrapper.
1145-
"""
1146-
return None
1147-
1148-
def checkpoint(self):
1149-
"""Performs a checkpoint of the current restriction.
1150-
1151-
Signals that the current ``DoFn.process()`` call should terminate as soon as
1152-
possible. After this method returns, the tracker MUST refuse all future
1153-
claim calls, and ``RestrictionTracker.check_done()`` MUST succeed.
1154-
1155-
This invocation modifies the value returned by ``current_restriction()``
1156-
invocation and returns a restriction representing the rest of the work. The
1157-
old value of ``current_restriction()`` is equivalent to the new value of
1158-
``current_restriction()`` and the return value of this method invocation
1159-
combined.
11601139
1161-
** Thread safety **
1162-
1163-
Methods of the class ``RestrictionTracker`` including this method may get
1164-
invoked by different threads, hence must be made thread-safe, e.g. by using
1165-
a single lock object.
1166-
1167-
TODO(BEAM-7473): Remove thread safety requirements from API implementation.
1140+
This API is recommended to be implemented. The runner can do a better job
1141+
at parallel processing with better progress signals.
11681142
"""
1169-
11701143
raise NotImplementedError
11711144

11721145
def check_done(self):
@@ -1179,13 +1152,8 @@ def check_done(self):
11791152
remaining in the restriction when this method is invoked. Exception raised
11801153
must have an informative error message.
11811154
1182-
** Thread safety **
1183-
1184-
Methods of the class ``RestrictionTracker`` including this method may get
1185-
invoked by different threads, hence must be made thread-safe, e.g. by using
1186-
a single lock object.
1187-
1188-
TODO(BEAM-7473): Remove thread safety requirements from API implementation.
1155+
This API is required to be implemented in order to make sure no data is lost
1156+
during SDK processing.
11891157
11901158
Returns: ``True`` if current restriction has been fully processed.
11911159
Raises:
@@ -1215,8 +1183,12 @@ def try_split(self, fraction_of_remainder):
12151183
restrictions returned would be [100, 179), [179, 200) (note: current_offset
12161184
+ fraction_of_remainder * remaining_work = 130 + 0.7 * 70 = 179).
12171185
1218-
It is very important for pipeline scaling and end to end pipeline execution
1219-
that try_split is implemented well.
1186+
``fraction_of_remainder`` = 0 means a checkpoint is required.
1187+
1188+
The API is recommended to be implemented for batch pipelines given that it is
1189+
very important for pipeline scaling and end to end pipeline execution.
1190+
1191+
The API is required to be implemented for a streaming pipeline.
12201192
12211193
Args:
12221194
fraction_of_remainder: A hint as to the fraction of work the primary
@@ -1226,19 +1198,11 @@ def try_split(self, fraction_of_remainder):
12261198
Returns:
12271199
(primary_restriction, residual_restriction) if a split was possible,
12281200
otherwise returns ``None``.
1229-
1230-
** Thread safety **
1231-
1232-
Methods of the class ``RestrictionTracker`` including this method may get
1233-
invoked by different threads, hence must be made thread-safe, e.g. by using
1234-
a single lock object.
1235-
1236-
TODO(BEAM-7473): Remove thread safety requirements from API implementation.
12371201
"""
12381202
raise NotImplementedError
12391203

12401204
def try_claim(self, position):
1241-
""" Attempts to claim the block of work in the current restriction
1205+
"""Attempts to claim the block of work in the current restriction
12421206
identified by the given position.
12431207
12441208
If this succeeds, the DoFn MUST execute the entire block of work. If it
@@ -1247,40 +1211,137 @@ def try_claim(self, position):
12471211
work from ``DoFn.process()`` is also not allowed before the first call of
12481212
this method).
12491213
1214+
The API is required to be implemented.
1215+
12501216
Args:
12511217
position: current position that wants to be claimed.
12521218
12531219
Returns: ``True`` if the position can be claimed as current_position.
12541220
Otherwise, returns ``False``.
1221+
"""
1222+
raise NotImplementedError
12551223

1256-
** Thread safety **
12571224

1258-
Methods of the class ``RestrictionTracker`` including this method may get
1259-
invoked by different threads, hence must be made thread-safe, e.g. by using
1260-
a single lock object.
1225+
class ThreadsafeRestrictionTracker(object):
1226+
"""A thread-safe wrapper which wraps a `RestrictionTracker`.
12611227
1262-
TODO(BEAM-7473): Remove thread safety requirements from API implementation.
1263-
"""
1264-
raise NotImplementedError
1228+
This wrapper guarantees synchronization of modifying restrictions across
1229+
multiple threads.
1230+
"""
1231+
1232+
def __init__(self, restriction_tracker):
1233+
if not isinstance(restriction_tracker, RestrictionTracker):
1234+
raise ValueError(
1235+
'Initialize ThreadsafeRestrictionTracker requires'
1236+
'RestrictionTracker.')
1237+
self._restriction_tracker = restriction_tracker
1238+
# Records an absolute timestamp when defer_remainder is called.
1239+
self._deferred_timestamp = None
1240+
self._lock = threading.RLock()
1241+
self._deferred_residual = None
1242+
self._deferred_watermark = None
12651243

1266-
def defer_remainder(self, watermark=None):
1267-
""" Invokes checkpoint() in an SDF.process().
1244+
def current_restriction(self):
1245+
with self._lock:
1246+
return self._restriction_tracker.current_restriction()
12681247

1269-
TODO(BEAM-7472): Remove defer_remainder() once SDF.process() uses
1270-
``ProcessContinuation``.
1248+
def try_claim(self, position):
1249+
with self._lock:
1250+
return self._restriction_tracker.try_claim(position)
1251+
1252+
def defer_remainder(self, deferred_time=None):
1253+
"""Performs self-checkpoint on current processing restriction with an
1254+
expected resuming time.
1255+
1256+
Self-checkpoint could happen during processing elements. When executing a
1257+
DoFn.process(), you may want to stop processing an element and resume
1258+
later if the current element has been processed for quite a long time or you also
1259+
want to have some outputs from other elements. ``defer_remainder()`` can be
1260+
called on per element if needed.
12711261
12721262
Args:
1273-
watermark
1263+
deferred_time: A relative ``timestamp.Duration`` that indicates the ideal
1264+
time gap between now and resuming, or an absolute ``timestamp.Timestamp``
1265+
for resuming execution time. If deferred_time is None, the deferred work
1266+
will be executed as soon as possible.
12741267
"""
1275-
raise NotImplementedError
1268+
1269+
# Record current time for calculating deferred_time later.
1270+
self._deferred_timestamp = timestamp.Timestamp.now()
1271+
if (deferred_time and
1272+
not isinstance(deferred_time, timestamp.Duration) and
1273+
not isinstance(deferred_time, timestamp.Timestamp)):
1274+
raise ValueError('The timestamp of deter_remainder() should be a '
1275+
'Duration or a Timestamp, or None.')
1276+
self._deferred_watermark = deferred_time
1277+
checkpoint = self.try_split(0)
1278+
if checkpoint:
1279+
_, self._deferred_residual = checkpoint
1280+
1281+
def check_done(self):
1282+
with self._lock:
1283+
return self._restriction_tracker.check_done()
1284+
1285+
def current_progress(self):
1286+
with self._lock:
1287+
return self._restriction_tracker.current_progress()
1288+
1289+
def try_split(self, fraction_of_remainder):
1290+
with self._lock:
1291+
return self._restriction_tracker.try_split(fraction_of_remainder)
12761292

12771293
def deferred_status(self):
1278-
""" Returns deferred_residual with deferred_watermark.
1294+
"""Returns deferred work which is produced by ``defer_remainder()``.
1295+
1296+
When there is a self-checkpoint performed, the system needs to fulfill the
1297+
DelayedBundleApplication with deferred_work for a ProcessBundleResponse.
1298+
The system calls this API to get deferred_residual with watermark together
1299+
to help the runner schedule future work.
12791300
1280-
TODO(BEAM-7472): Remove defer_status() once SDF.process() uses
1281-
``ProcessContinuation``.
1301+
Returns: (deferred_residual, time_delay) if there is any residual, else None.
12821302
"""
1283-
raise NotImplementedError
1303+
if self._deferred_residual:
1304+
# If _deferred_watermark is None, create Duration(0).
1305+
if not self._deferred_watermark:
1306+
self._deferred_watermark = timestamp.Duration()
1307+
# If an absolute timestamp is provided, calculate the delta between
1308+
# the absolute time and the time deferred_status() is called.
1309+
elif isinstance(self._deferred_watermark, timestamp.Timestamp):
1310+
self._deferred_watermark = (self._deferred_watermark -
1311+
timestamp.Timestamp.now())
1312+
# If a Duration is provided, the deferred time should be:
1313+
# provided duration - the spent time since the defer_remainder() is
1314+
# called.
1315+
elif isinstance(self._deferred_watermark, timestamp.Duration):
1316+
self._deferred_watermark -= (timestamp.Timestamp.now() -
1317+
self._deferred_timestamp)
1318+
return self._deferred_residual, self._deferred_watermark
1319+
1320+
1321+
class RestrictionTrackerView(object):
1322+
"""A DoFn view of thread-safe RestrictionTracker.
1323+
1324+
The RestrictionTrackerView wraps a ThreadsafeRestrictionTracker and only
1325+
exposes APIs that will be called by a ``DoFn.process()``. During execution
1326+
time, the RestrictionTrackerView will be fed into the ``DoFn.process`` as a
1327+
restriction_tracker.
1328+
"""
1329+
1330+
def __init__(self, threadsafe_restriction_tracker):
1331+
if not isinstance(threadsafe_restriction_tracker,
1332+
ThreadsafeRestrictionTracker):
1333+
raise ValueError('Initialize RestrictionTrackerView requires '
1334+
'ThreadsafeRestrictionTracker.')
1335+
self._threadsafe_restriction_tracker = threadsafe_restriction_tracker
1336+
1337+
def current_restriction(self):
1338+
return self._threadsafe_restriction_tracker.current_restriction()
1339+
1340+
def try_claim(self, position):
1341+
return self._threadsafe_restriction_tracker.try_claim(position)
1342+
1343+
def defer_remainder(self, deferred_time=None):
1344+
self._threadsafe_restriction_tracker.defer_remainder(deferred_time)
12841345

12851346

12861347
class RestrictionProgress(object):
@@ -1400,17 +1461,8 @@ def try_split(self, fraction_of_remainder):
14001461
SourceBundle(residual_weight, self._source, split_pos,
14011462
stop_pos))
14021463

1403-
def deferred_status(self):
1404-
return None
1405-
1406-
def current_watermark(self):
1407-
return None
1408-
1409-
def get_delegate_range_tracker(self):
1410-
return self._delegate_range_tracker
1411-
1412-
def get_tracking_source(self):
1413-
return self._source
1464+
def check_done(self):
1465+
return self._delegate_range_tracker.fraction_consumed() >= 1.0
14141466

14151467
class _SDFBoundedSourceRestrictionProvider(core.RestrictionProvider):
14161468
"""A `RestrictionProvider` that is used by SDF for `BoundedSource`."""
@@ -1463,8 +1515,13 @@ def process(
14631515
restriction_tracker=core.DoFn.RestrictionParam(
14641516
_SDFBoundedSourceWrapper._SDFBoundedSourceRestrictionProvider(
14651517
source, chunk_size))):
1466-
return restriction_tracker.get_tracking_source().read(
1467-
restriction_tracker.get_delegate_range_tracker())
1518+
current_restriction = restriction_tracker.current_restriction()
1519+
assert isinstance(current_restriction, SourceBundle)
1520+
tracking_source = current_restriction.source
1521+
start = current_restriction.start_position
1522+
stop = current_restriction.stop_position
1523+
return tracking_source.read(tracking_source.get_range_tracker(start,
1524+
stop))
14681525

14691526
return SDFBoundedSourceDoFn(self.source)
14701527

0 commit comments

Comments
 (0)