3232from apache_beam .runners .interactive import interactive_environment as ie
3333from apache_beam .runners .interactive import pipeline_fragment as pf
3434from apache_beam .runners .interactive import background_caching_job
35- from apache_beam .runners .interactive .utils import obfuscate
35+ from apache_beam .runners .interactive .caching .cacheable import Cacheable
36+ from apache_beam .runners .interactive .caching .cacheable import CacheKey
3637from apache_beam .testing import test_stream
3738from apache_beam .transforms .window import WindowedValue
3839
3940READ_CACHE = "_ReadCache_"
4041WRITE_CACHE = "_WriteCache_"
4142
4243
43- # TODO: turn this into a dataclass object when we finally get off of Python2.
44- class Cacheable :
45- def __init__ (self , pcoll_id , var , version , pcoll , producer_version ):
46- self .pcoll_id = pcoll_id
47- self .var = var
48- self .version = version
49- self .pcoll = pcoll
50- self .producer_version = producer_version
51-
52- def __eq__ (self , other ):
53- return (
54- self .pcoll_id == other .pcoll_id and self .var == other .var and
55- self .version == other .version and self .pcoll == other .pcoll and
56- self .producer_version == other .producer_version )
57-
58- def __hash__ (self ):
59- return hash ((
60- self .pcoll_id ,
61- self .var ,
62- self .version ,
63- self .pcoll ,
64- self .producer_version ))
65-
66- def to_key (self ):
67- return CacheKey (
68- self .var ,
69- self .version ,
70- self .producer_version ,
71- str (id (self .pcoll .pipeline )))
72-
73-
74- # TODO: turn this into a dataclass object when we finally get off of Python2.
75- class CacheKey :
76- def __init__ (self , var , version , producer_version , pipeline_id ):
77- # Makes sure that the variable name is obfuscated and only first 10
78- # characters taken so that the CacheKey has a constant length.
79- self .var = obfuscate (var )[:10 ]
80- self .version = version
81- self .producer_version = producer_version
82- self .pipeline_id = pipeline_id
83-
84- @staticmethod
85- def from_str (r ):
86- split = r .split ('-' )
87- return CacheKey (split [0 ], split [1 ], split [2 ], split [3 ])
88-
89- def __repr__ (self ):
90- return '-' .join (
91- [self .var , self .version , self .producer_version , self .pipeline_id ])
92-
93-
9444class PipelineInstrument (object ):
9545 """A pipeline instrument for pipeline to be executed by interactive runner.
9646
@@ -103,36 +53,34 @@ class PipelineInstrument(object):
10353 """
10454 def __init__ (self , pipeline , options = None ):
10555 self ._pipeline = pipeline
106- # The cache manager per user-defined pipeline is lazily initiated the first
107- # time accessed. It is owned by interactive_environment module. This
108- # shortcut reference will be initialized when the user pipeline associated
109- # to the given pipeline is identified.
110- self ._cache_manager = None
111-
112- # Invoke a round trip through the runner API. This makes sure the Pipeline
113- # proto is stable. The snapshot of pipeline will not be mutated within this
114- # module and can be used to recover original pipeline if needed .
115- self . _pipeline_snap = beam . pipeline . Pipeline . from_runner_api (
116- pipeline . to_runner_api ( use_fake_coders = True ), pipeline . runner , options )
117- ie . current_env (). add_derived_pipeline ( self . _pipeline , self ._pipeline_snap )
56+
57+ self . _user_pipeline = ie . current_env (). user_pipeline ( pipeline )
58+ if not self . _user_pipeline :
59+ self . _user_pipeline = pipeline
60+ self ._cache_manager = ie . current_env (). get_cache_manager (
61+ self . _user_pipeline , create_if_absent = True )
62+ # Check if the user-defined pipeline contains any source to cache.
63+ # If so, during the check, the cache manager is converted into a
64+ # streaming cache manager, thus re-assign the reference.
65+ if background_caching_job . has_source_to_cache ( self . _user_pipeline ):
66+ self . _cache_manager = ie . current_env (). get_cache_manager (
67+ self ._user_pipeline )
11868
11969 self ._background_caching_pipeline = beam .pipeline .Pipeline .from_runner_api (
12070 pipeline .to_runner_api (use_fake_coders = True ), pipeline .runner , options )
12171 ie .current_env ().add_derived_pipeline (
12272 self ._pipeline , self ._background_caching_pipeline )
12373
12474 # Snapshot of original pipeline information.
125- (self ._original_pipeline_proto ,
126- self ._original_context ) = self ._pipeline_snap .to_runner_api (
127- return_context = True , use_fake_coders = True )
75+ (self ._original_pipeline_proto , context ) = self ._pipeline .to_runner_api (
76+ return_context = True , use_fake_coders = True )
12877
12978 # All compute-once-against-original-pipeline fields.
13079 self ._unbounded_sources = unbounded_sources (
13180 self ._background_caching_pipeline )
13281 # TODO(BEAM-7760): once cache scope changed, this is not needed to manage
13382 # relationships across pipelines, runners, and jobs.
134- self ._pcolls_to_pcoll_id = pcolls_to_pcoll_id (
135- self ._pipeline_snap , self ._original_context )
83+ self ._pcolls_to_pcoll_id = pcolls_to_pcoll_id (self ._pipeline , context )
13684
13785 # A mapping from PCollection id to python id() value in user defined
13886 # pipeline instance.
@@ -149,11 +97,6 @@ def __init__(self, pipeline, options=None):
14997 # (Dict[str, AppliedPTransform]).
15098 self ._cached_pcoll_read = {}
15199
152- # Reference to the user defined pipeline instance based on the given
153- # pipeline. The class never mutates it.
154- # Note: the original pipeline is not the user pipeline.
155- self ._user_pipeline = None
156-
157100 # A dict from PCollections in the runner pipeline instance to their
158101 # corresponding PCollections in the user pipeline instance. Populated
159102 # after preprocess().
@@ -421,15 +364,9 @@ def pcolls_to_pcoll_id(self):
421364
422365 @property
423366 def original_pipeline_proto (self ):
424- """Returns the portable proto representation of the pipeline before
425- instrumentation."""
367+ """Returns a snapshot of the pipeline proto before instrumentation."""
426368 return self ._original_pipeline_proto
427369
428- @property
429- def original_pipeline (self ):
430- """Returns a snapshot of the pipeline before instrumentation."""
431- return self ._pipeline_snap
432-
433370 @property
434371 def user_pipeline (self ):
435372 """Returns a reference to the pipeline instance defined by the user. If a
@@ -571,29 +508,11 @@ def _process(self, pcoll):
571508 cacheable_key = self ._pin ._cacheable_key (pcoll )
572509 user_pcoll = self ._pin .cacheables [cacheable_key ].pcoll
573510 if (cacheable_key in self ._pin .cacheables and user_pcoll != pcoll ):
574- if not self ._pin ._user_pipeline :
575- # Retrieve a reference to the user defined pipeline instance.
576- self ._pin ._user_pipeline = user_pcoll .pipeline
577- # Retrieve a reference to the cache manager for the user defined
578- # pipeline instance.
579- self ._pin ._cache_manager = ie .current_env ().get_cache_manager (
580- self ._pin ._user_pipeline , create_if_absent = True )
581- # Check if the user defined pipeline contains any source to cache.
582- # If so, during the check, the cache manager is converted into a
583- # streaming cache manager, thus re-assign the reference.
584- if background_caching_job .has_source_to_cache (
585- self ._pin ._user_pipeline ):
586- self ._pin ._cache_manager = ie .current_env ().get_cache_manager (
587- self ._pin ._user_pipeline )
588511 self ._pin ._runner_pcoll_to_user_pcoll [pcoll ] = user_pcoll
589512 self ._pin .cacheables [cacheable_key ].pcoll = pcoll
590513
591514 v = PreprocessVisitor (self )
592515 self ._pipeline .visit (v )
593- if not self ._user_pipeline :
594- self ._user_pipeline = self ._pipeline
595- self ._cache_manager = ie .current_env ().get_cache_manager (
596- self ._user_pipeline , create_if_absent = True )
597516
598517 def _write_cache (
599518 self ,
@@ -679,7 +598,6 @@ def _read_cache(self, pipeline, pcoll, is_unbounded_source_output):
679598 key = self .cache_key (pcoll )
680599 # Can only read from cache when the cache with expected key exists and its
681600 # computation has been completed.
682-
683601 is_cached = self ._cache_manager .exists ('full' , key )
684602 is_computed = (
685603 pcoll in self ._runner_pcoll_to_user_pcoll and
@@ -886,7 +804,6 @@ def cacheables(pcolls_to_pcoll_id):
886804 for watching in ie .current_env ().watching ():
887805 for key , val in watching :
888806 if isinstance (val , beam .pvalue .PCollection ):
889-
890807 pcoll_id = pcolls_to_pcoll_id .get (str (val ), None )
891808 # It's highly possible that PCollection str is not unique across
892809 # multiple pipelines, further check during instrument is needed.
0 commit comments