3535import logging
3636import math
3737import random
38+ import threading
3839import uuid
3940from builtins import object
4041from builtins import range
@@ -1104,13 +1105,17 @@ def process(self, element):
11041105class RestrictionTracker (object ):
11051106 """Manages concurrent access to a restriction.
11061107
1107- Experimental; no backwards-compatibility guarantees.
1108-
11091108 Keeps track of the restrictions claimed part for a Splittable DoFn.
11101109
1110+ The restriction may be modified by different threads, however the system will
1111+ ensure sufficient locking such that no methods on the restriction tracker
1112+ will be called concurrently.
1113+
11111114 See following documents for more details.
11121115 * https://s.apache.org/splittable-do-fn
11131116 * https://s.apache.org/splittable-do-fn-python-sdk
1117+
1118+ Experimental; no backwards-compatibility guarantees.
11141119 """
11151120
11161121 def current_restriction (self ):
@@ -1121,52 +1126,20 @@ def current_restriction(self):
11211126
11221127 The current restriction returned by method may be updated dynamically due
11231128 to concurrent invocation of other methods of the
1124- ``RestrictionTracker``, For example, ``checkpoint()``.
1125-
1126- ** Thread safety **
1129+ ``RestrictionTracker``, for example ``try_split()``.
11271130
1128- Methods of the class ``RestrictionTracker`` including this method may get
1129- invoked by different threads, hence must be made thread-safe, e.g. by using
1130- a single lock object.
1131+ This API is required to be implemented.
11311132
1132- TODO(BEAM-7473): Remove thread safety requirements from API implementation .
1133+ Returns: a restriction object .
11331134 """
11341135 raise NotImplementedError
11351136
11361137 def current_progress (self ):
11371138 """Returns a RestrictionProgress object representing the current progress.
1138- """
1139- raise NotImplementedError
1140-
1141- def current_watermark (self ):
1142- """Returns current watermark. By default, not report watermark.
1143-
1144- TODO(BEAM-7473): Provide synchronization guarantee by using a wrapper.
1145- """
1146- return None
1147-
1148- def checkpoint (self ):
1149- """Performs a checkpoint of the current restriction.
1150-
1151- Signals that the current ``DoFn.process()`` call should terminate as soon as
1152- possible. After this method returns, the tracker MUST refuse all future
1153- claim calls, and ``RestrictionTracker.check_done()`` MUST succeed.
1154-
1155- This invocation modifies the value returned by ``current_restriction()``
1156- invocation and returns a restriction representing the rest of the work. The
1157- old value of ``current_restriction()`` is equivalent to the new value of
1158- ``current_restriction()`` and the return value of this method invocation
1159- combined.
11601139
1161- ** Thread safety **
1162-
1163- Methods of the class ``RestrictionTracker`` including this method may get
1164- invoked by different threads, hence must be made thread-safe, e.g. by using
1165- a single lock object.
1166-
1167- TODO(BEAM-7473): Remove thread safety requirements from API implementation.
1140+ This API is recommended to be implemented. The runner can do a better job
1141+ at parallel processing with better progress signals.
11681142 """
1169-
11701143 raise NotImplementedError
11711144
11721145 def check_done (self ):
@@ -1179,13 +1152,8 @@ def check_done(self):
11791152 remaining in the restriction when this method is invoked. Exception raised
11801153 must have an informative error message.
11811154
1182- ** Thread safety **
1183-
1184- Methods of the class ``RestrictionTracker`` including this method may get
1185- invoked by different threads, hence must be made thread-safe, e.g. by using
1186- a single lock object.
1187-
1188- TODO(BEAM-7473): Remove thread safety requirements from API implementation.
1155+ This API is required to be implemented in order to make sure no data loss
1156+ during SDK processing.
11891157
11901158 Returns: ``True`` if current restriction has been fully processed.
11911159 Raises:
@@ -1215,8 +1183,12 @@ def try_split(self, fraction_of_remainder):
12151183 restrictions returned would be [100, 179), [179, 200) (note: current_offset
12161184 + fraction_of_remainder * remaining_work = 130 + 0.7 * 70 = 179).
12171185
1218- It is very important for pipeline scaling and end to end pipeline execution
1219- that try_split is implemented well.
1186+ ``fraction_of_remainder`` = 0 means a checkpoint is required.
1187+
1188+ The API is recommended to be implemented for batch pipeline given that it is
1189+ very important for pipeline scaling and end to end pipeline execution.
1190+
1191+ The API is required to be implemented for a streaming pipeline.
12201192
12211193 Args:
12221194 fraction_of_remainder: A hint as to the fraction of work the primary
@@ -1226,19 +1198,11 @@ def try_split(self, fraction_of_remainder):
12261198 Returns:
12271199 (primary_restriction, residual_restriction) if a split was possible,
12281200 otherwise returns ``None``.
1229-
1230- ** Thread safety **
1231-
1232- Methods of the class ``RestrictionTracker`` including this method may get
1233- invoked by different threads, hence must be made thread-safe, e.g. by using
1234- a single lock object.
1235-
1236- TODO(BEAM-7473): Remove thread safety requirements from API implementation.
12371201 """
12381202 raise NotImplementedError
12391203
12401204 def try_claim (self , position ):
1241- """ Attempts to claim the block of work in the current restriction
1205+ """Attempts to claim the block of work in the current restriction
12421206 identified by the given position.
12431207
12441208 If this succeeds, the DoFn MUST execute the entire block of work. If it
@@ -1247,40 +1211,137 @@ def try_claim(self, position):
12471211 work from ``DoFn.process()`` is also not allowed before the first call of
12481212 this method).
12491213
1214+ The API is required to be implemented.
1215+
12501216 Args:
12511217 position: current position that wants to be claimed.
12521218
12531219 Returns: ``True`` if the position can be claimed as current_position.
12541220 Otherwise, returns ``False``.
1221+ """
1222+ raise NotImplementedError
12551223
1256- ** Thread safety **
12571224
1258- Methods of the class ``RestrictionTracker`` including this method may get
1259- invoked by different threads, hence must be made thread-safe, e.g. by using
1260- a single lock object.
class ThreadsafeRestrictionTracker(object):
  """A thread-safe wrapper which wraps a ``RestrictionTracker``.

  Every call forwarded to the wrapped tracker is made while holding a single
  re-entrant lock, so methods of the wrapped tracker are never run
  concurrently through this wrapper. The wrapper also stores the residual and
  the requested resume time produced by ``defer_remainder()`` so that they can
  be read back through ``deferred_status()``.
  """

  def __init__(self, restriction_tracker):
    """Wraps ``restriction_tracker``.

    Args:
      restriction_tracker: the ``RestrictionTracker`` to guard.

    Raises:
      ValueError: if ``restriction_tracker`` is not a ``RestrictionTracker``.
    """
    if not isinstance(restriction_tracker, RestrictionTracker):
      raise ValueError(
          'Initialize ThreadsafeRestrictionTracker requires '
          'RestrictionTracker.')
    self._restriction_tracker = restriction_tracker
    # Records an absolute timestamp when defer_remainder is called.
    self._deferred_timestamp = None
    # Re-entrant so that defer_remainder() can call try_split() while already
    # holding the lock.
    self._lock = threading.RLock()
    self._deferred_residual = None
    self._deferred_watermark = None

  def current_restriction(self):
    """Returns the wrapped tracker's current restriction, under the lock."""
    with self._lock:
      return self._restriction_tracker.current_restriction()

  def try_claim(self, position):
    """Forwards ``try_claim(position)`` to the wrapped tracker, under the lock.
    """
    with self._lock:
      return self._restriction_tracker.try_claim(position)

  def defer_remainder(self, deferred_time=None):
    """Performs self-checkpoint on current processing restriction with an
    expected resuming time.

    Self-checkpoint could happen during processing elements. When executing a
    DoFn.process(), you may want to stop processing an element and resume
    later if the current element has been processed for quite a long time or
    you also want to have some outputs from other elements.
    ``defer_remainder()`` can be called on per element if needed.

    Args:
      deferred_time: A relative ``timestamp.Duration`` that indicates the ideal
        time gap between now and resuming, or an absolute
        ``timestamp.Timestamp`` for resuming execution time. If
        ``deferred_time`` is None, the deferred work will be executed as soon
        as possible.

    Raises:
      ValueError: if ``deferred_time`` is truthy and is neither a
        ``timestamp.Duration`` nor a ``timestamp.Timestamp``.
    """
    # Validate before touching any state so that a rejected argument leaves
    # the tracker exactly as it was.
    if (deferred_time and
        not isinstance(deferred_time, timestamp.Duration) and
        not isinstance(deferred_time, timestamp.Timestamp)):
      raise ValueError('The timestamp of defer_remainder() should be a '
                       'Duration or a Timestamp, or None.')
    # The three deferred_* fields are read together by deferred_status(), so
    # update them atomically with respect to the other locked methods.
    with self._lock:
      # Record current time for calculating deferred_time later.
      self._deferred_timestamp = timestamp.Timestamp.now()
      self._deferred_watermark = deferred_time
      # A split at fraction 0 is a checkpoint: everything not yet claimed
      # becomes the residual.
      checkpoint = self.try_split(0)
      if checkpoint:
        _, self._deferred_residual = checkpoint

  def check_done(self):
    """Forwards ``check_done()`` to the wrapped tracker, under the lock."""
    with self._lock:
      return self._restriction_tracker.check_done()

  def current_progress(self):
    """Forwards ``current_progress()`` to the wrapped tracker, under the lock.
    """
    with self._lock:
      return self._restriction_tracker.current_progress()

  def try_split(self, fraction_of_remainder):
    """Forwards ``try_split(fraction_of_remainder)`` to the wrapped tracker,
    under the lock.
    """
    with self._lock:
      return self._restriction_tracker.try_split(fraction_of_remainder)

  def deferred_status(self):
    """Returns deferred work which is produced by ``defer_remainder()``.

    When there is a self-checkpoint performed, the system needs to fulfill the
    DelayedBundleApplication with deferred_work for a ProcessBundleResponse.
    The system calls this API to get deferred_residual with watermark together
    to help the runner to schedule a future work.

    The stored resume time is not modified, so calling this method repeatedly
    always measures the remaining delay from the original request.

    Returns: (deferred_residual, time_delay) if having any residual, else None.
    """
    with self._lock:
      if not self._deferred_residual:
        return None
      # Work on a local copy: writing the computed delay back into
      # _deferred_watermark would make a second call subtract the elapsed
      # time twice (or treat an already-converted Timestamp as a Duration).
      time_delay = self._deferred_watermark
      if not time_delay:
        # No resume time was requested: resume as soon as possible.
        time_delay = timestamp.Duration()
      elif isinstance(time_delay, timestamp.Timestamp):
        # An absolute timestamp was provided: the delay is the gap between
        # that time and the moment deferred_status() is called.
        time_delay = time_delay - timestamp.Timestamp.now()
      elif isinstance(time_delay, timestamp.Duration):
        # A Duration was provided: subtract the time already spent since
        # defer_remainder() was called.
        time_delay = time_delay - (
            timestamp.Timestamp.now() - self._deferred_timestamp)
      return self._deferred_residual, time_delay
1319+
1320+
class RestrictionTrackerView(object):
  """The face of a ``ThreadsafeRestrictionTracker`` shown to a DoFn.

  Only the operations a ``DoFn.process()`` is expected to call are exposed:
  ``current_restriction()``, ``try_claim()`` and ``defer_remainder()``. Each
  one is forwarded unchanged to the wrapped ``ThreadsafeRestrictionTracker``.
  At execution time an instance of this class is what ``DoFn.process``
  receives as its restriction_tracker.
  """

  def __init__(self, threadsafe_restriction_tracker):
    """Stores the tracker to forward to.

    Raises:
      ValueError: if the argument is not a ``ThreadsafeRestrictionTracker``.
    """
    tracker = threadsafe_restriction_tracker
    if isinstance(tracker, ThreadsafeRestrictionTracker):
      self._threadsafe_restriction_tracker = tracker
      return
    raise ValueError('Initialize RestrictionTrackerView requires '
                     'ThreadsafeRestrictionTracker.')

  def current_restriction(self):
    """Returns the wrapped tracker's current restriction."""
    delegate = self._threadsafe_restriction_tracker
    return delegate.current_restriction()

  def try_claim(self, position):
    """Returns the wrapped tracker's answer to ``try_claim(position)``."""
    delegate = self._threadsafe_restriction_tracker
    return delegate.try_claim(position)

  def defer_remainder(self, deferred_time=None):
    """Asks the wrapped tracker to self-checkpoint; returns nothing."""
    delegate = self._threadsafe_restriction_tracker
    delegate.defer_remainder(deferred_time)
12841345
12851346
12861347class RestrictionProgress (object ):
@@ -1400,17 +1461,8 @@ def try_split(self, fraction_of_remainder):
14001461 SourceBundle (residual_weight , self ._source , split_pos ,
14011462 stop_pos ))
14021463
1403- def deferred_status (self ):
1404- return None
1405-
1406- def current_watermark (self ):
1407- return None
1408-
1409- def get_delegate_range_tracker (self ):
1410- return self ._delegate_range_tracker
1411-
1412- def get_tracking_source (self ):
1413- return self ._source
def check_done(self):
  """Reports whether the delegate range tracker has consumed everything.

  Returns: ``True`` when the delegate range tracker's ``fraction_consumed()``
    is at least 1.0, otherwise ``False``.
  """
  consumed = self._delegate_range_tracker.fraction_consumed()
  return consumed >= 1.0
14141466
14151467 class _SDFBoundedSourceRestrictionProvider (core .RestrictionProvider ):
14161468 """A `RestrictionProvider` that is used by SDF for `BoundedSource`."""
@@ -1463,8 +1515,13 @@ def process(
14631515 restriction_tracker = core .DoFn .RestrictionParam (
14641516 _SDFBoundedSourceWrapper ._SDFBoundedSourceRestrictionProvider (
14651517 source , chunk_size ))):
1466- return restriction_tracker .get_tracking_source ().read (
1467- restriction_tracker .get_delegate_range_tracker ())
1518+ current_restriction = restriction_tracker .current_restriction ()
1519+ assert isinstance (current_restriction , SourceBundle )
1520+ tracking_source = current_restriction .source
1521+ start = current_restriction .start_position
1522+ stop = current_restriction .stop_position
1523+ return tracking_source .read (tracking_source .get_range_tracker (start ,
1524+ stop ))
14681525
14691526 return SDFBoundedSourceDoFn (self .source )
14701527
0 commit comments