Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 33453c2

Browse files
committed
Revert "Revert "Optimize several Python coder implementations.""
This reverts commit 4a6527d.
1 parent fc373df commit 33453c2

8 files changed

Lines changed: 242 additions & 71 deletions

File tree

sdks/python/apache_beam/coders/coder_impl.pxd

Lines changed: 56 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,14 @@ cimport libc.stdint
2525
cimport libc.stdlib
2626
cimport libc.string
2727

28+
cdef extern from "math.h":
29+
libc.stdint.int64_t abs "llabs"(libc.stdint.int64_t)
30+
2831
from .stream cimport InputStream, OutputStream
2932
from apache_beam.utils cimport windowed_value
3033

3134

32-
cdef object loads, dumps, create_InputStream, create_OutputStream, ByteCountingOutputStream, get_varint_size
35+
cdef object loads, dumps, create_InputStream, create_OutputStream, ByteCountingOutputStream, get_varint_size, past_unicode
3336
# Temporarily untyped to allow monkeypatching on failed import.
3437
#cdef type WindowedValue
3538

@@ -75,8 +78,11 @@ cdef unsigned char SET_TYPE
7578

7679
cdef class FastPrimitivesCoderImpl(StreamCoderImpl):
7780
cdef CoderImpl fallback_coder_impl
78-
@cython.locals(dict_value=dict, int_value=libc.stdint.int64_t)
81+
@cython.locals(dict_value=dict, int_value=libc.stdint.int64_t,
82+
unicode_value=unicode)
7983
cpdef encode_to_stream(self, value, OutputStream stream, bint nested)
84+
@cython.locals(t=int)
85+
cpdef decode_from_stream(self, InputStream stream, bint nested)
8086

8187

8288
cdef class BytesCoderImpl(CoderImpl):
@@ -123,6 +129,9 @@ cdef class TupleCoderImpl(AbstractComponentCoderImpl):
123129
cdef class SequenceCoderImpl(StreamCoderImpl):
124130
cdef CoderImpl _elem_coder
125131
cpdef _construct_from_sequence(self, values)
132+
@cython.locals(buffer=OutputStream, target_buffer_size=libc.stdint.int64_t,
133+
index=libc.stdint.int64_t)
134+
cpdef encode_to_stream(self, value, OutputStream stream, bint nested)
126135

127136

128137
cdef class TupleSequenceCoderImpl(SequenceCoderImpl):
@@ -133,8 +142,41 @@ cdef class IterableCoderImpl(SequenceCoderImpl):
133142
pass
134143

135144

145+
cdef object IntervalWindow
146+
147+
cdef class IntervalWindowCoderImpl(StreamCoderImpl):
148+
cdef libc.stdint.uint64_t _to_normal_time(self, libc.stdint.int64_t value)
149+
cdef libc.stdint.int64_t _from_normal_time(self, libc.stdint.uint64_t value)
150+
151+
@cython.locals(typed_value=windowed_value._IntervalWindowBase,
152+
span_millis=libc.stdint.int64_t)
153+
cpdef encode_to_stream(self, value, OutputStream stream, bint nested)
154+
155+
@cython.locals(typed_value=windowed_value._IntervalWindowBase)
156+
cpdef decode_from_stream(self, InputStream stream, bint nested)
157+
158+
@cython.locals(typed_value=windowed_value._IntervalWindowBase,
159+
span_millis=libc.stdint.int64_t)
160+
cpdef estimate_size(self, value, bint nested=?)
161+
162+
163+
cdef int PaneInfoTiming_UNKNOWN
164+
cdef int PaneInfoEncoding_FIRST
165+
166+
136167
cdef class PaneInfoCoderImpl(StreamCoderImpl):
137-
cdef int _choose_encoding(self, value)
168+
cdef int _choose_encoding(self, windowed_value.PaneInfo value)
169+
170+
@cython.locals(pane_info=windowed_value.PaneInfo, encoding_type=int)
171+
cpdef encode_to_stream(self, value, OutputStream stream, bint nested)
172+
173+
@cython.locals(encoded_first_byte=int, encoding_type=int)
174+
cpdef decode_from_stream(self, InputStream stream, bint nested)
175+
176+
177+
cdef libc.stdint.uint64_t _TIME_SHIFT
178+
cdef libc.stdint.int64_t MIN_TIMESTAMP_micros
179+
cdef libc.stdint.int64_t MAX_TIMESTAMP_micros
138180

139181

140182
cdef class WindowedValueCoderImpl(StreamCoderImpl):
@@ -144,8 +186,18 @@ cdef class WindowedValueCoderImpl(StreamCoderImpl):
144186
cdef CoderImpl _windows_coder
145187
cdef CoderImpl _pane_info_coder
146188

189+
cdef libc.stdint.uint64_t _to_normal_time(self, libc.stdint.int64_t value)
190+
cdef libc.stdint.int64_t _from_normal_time(self, libc.stdint.uint64_t value)
191+
147192
@cython.locals(c=CoderImpl)
148193
cpdef get_estimated_size_and_observables(self, value, bint nested=?)
149194

150-
@cython.locals(wv=windowed_value.WindowedValue)
195+
@cython.locals(timestamp=libc.stdint.int64_t)
196+
cpdef decode_from_stream(self, InputStream stream, bint nested)
197+
198+
@cython.locals(wv=windowed_value.WindowedValue, restore_sign=int)
151199
cpdef encode_to_stream(self, value, OutputStream stream, bint nested)
200+
201+
202+
cdef class LengthPrefixCoderImpl(StreamCoderImpl):
203+
cdef CoderImpl _value_coder

sdks/python/apache_beam/coders/coder_impl.py

Lines changed: 56 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@
3737
from builtins import chr
3838
from builtins import object
3939

40+
from past.builtins import unicode as past_unicode
4041
from past.builtins import long
41-
from past.builtins import unicode
4242

4343
from apache_beam.coders import observable
4444
from apache_beam.utils import windowed_value
@@ -71,6 +71,11 @@
7171
# pylint: enable=wrong-import-order, wrong-import-position, ungrouped-imports
7272

7373

74+
_TIME_SHIFT = 1 << 63
75+
MIN_TIMESTAMP_micros = MIN_TIMESTAMP.micros
76+
MAX_TIMESTAMP_micros = MAX_TIMESTAMP.micros
77+
78+
7479
class CoderImpl(object):
7580
"""For internal use only; no backwards-compatibility guarantees."""
7681

@@ -216,7 +221,7 @@ def __init__(self, coder, step_label):
216221
self._step_label = step_label
217222

218223
def _check_safe(self, value):
219-
if isinstance(value, (bytes, unicode, long, int, float)):
224+
if isinstance(value, (bytes, past_unicode, long, int, float)):
220225
pass
221226
elif value is None:
222227
pass
@@ -321,10 +326,10 @@ def encode_to_stream(self, value, stream, nested):
321326
elif t is bytes:
322327
stream.write_byte(BYTES_TYPE)
323328
stream.write(value, nested)
324-
elif t is unicode:
325-
text_value = value # for typing
329+
elif t is past_unicode:
330+
unicode_value = value # for typing
326331
stream.write_byte(UNICODE_TYPE)
327-
stream.write(text_value.encode('utf-8'), nested)
332+
stream.write(unicode_value.encode('utf-8'), nested)
328333
elif t is list or t is tuple or t is set:
329334
stream.write_byte(
330335
LIST_TYPE if t is list else TUPLE_TYPE if t is tuple else SET_TYPE)
@@ -413,37 +418,47 @@ def estimate_size(self, unused_value, nested=False):
413418
return 8
414419

415420

421+
IntervalWindow = None
422+
423+
416424
class IntervalWindowCoderImpl(StreamCoderImpl):
417425
"""For internal use only; no backwards-compatibility guarantees."""
418426

419427
# TODO: Fn Harness only supports millis. Is this important enough to fix?
420428
def _to_normal_time(self, value):
421429
"""Convert "lexicographically ordered unsigned" to signed."""
422-
return value - (1 << 63)
430+
return value - _TIME_SHIFT
423431

424432
def _from_normal_time(self, value):
425433
"""Convert signed to "lexicographically ordered unsigned"."""
426-
return value + (1 << 63)
434+
return value + _TIME_SHIFT
427435

428436
def encode_to_stream(self, value, out, nested):
429-
span_micros = value.end.micros - value.start.micros
437+
typed_value = value
438+
span_millis = (typed_value._end_micros // 1000
439+
- typed_value._start_micros // 1000)
430440
out.write_bigendian_uint64(
431-
self._from_normal_time(value.end.micros // 1000))
432-
out.write_var_int64(span_micros // 1000)
441+
self._from_normal_time(typed_value._end_micros // 1000))
442+
out.write_var_int64(span_millis)
433443

434444
def decode_from_stream(self, in_, nested):
435-
end_millis = self._to_normal_time(in_.read_bigendian_uint64())
436-
start_millis = end_millis - in_.read_var_int64()
437-
from apache_beam.transforms.window import IntervalWindow
438-
ret = IntervalWindow(start=Timestamp(micros=start_millis * 1000),
439-
end=Timestamp(micros=end_millis * 1000))
440-
return ret
445+
global IntervalWindow
446+
if IntervalWindow is None:
447+
from apache_beam.transforms.window import IntervalWindow
448+
typed_value = IntervalWindow(None, None)
449+
typed_value._end_micros = (
450+
1000 * self._to_normal_time(in_.read_bigendian_uint64()))
451+
typed_value._start_micros = (
452+
typed_value._end_micros - 1000 * in_.read_var_int64())
453+
return typed_value
441454

442455
def estimate_size(self, value, nested=False):
443456
# An IntervalWindow is context-insensitive, with a timestamp (8 bytes)
444457
# and a varint timespan.
445-
span = value.end.micros - value.start.micros
446-
return 8 + get_varint_size(span // 1000)
458+
typed_value = value
459+
span_millis = (typed_value._end_micros // 1000
460+
- typed_value._start_micros // 1000)
461+
return 8 + get_varint_size(span_millis)
447462

448463

449464
class TimestampCoderImpl(StreamCoderImpl):
@@ -647,10 +662,11 @@ def encode_to_stream(self, value, out, nested):
647662
# -1 to indicate that the length is not known.
648663
out.write_bigendian_int32(-1)
649664
buffer = create_OutputStream()
665+
target_buffer_size = self._DEFAULT_BUFFER_SIZE
650666
prev_index = index = -1
651667
for index, elem in enumerate(value):
652668
self._elem_coder.encode_to_stream(elem, buffer, True)
653-
if out.size() > self._DEFAULT_BUFFER_SIZE:
669+
if buffer.size() > target_buffer_size:
654670
out.write_var_int64(index - prev_index)
655671
out.write(buffer.get())
656672
prev_index = index
@@ -739,25 +755,31 @@ class PaneInfoEncoding(object):
739755
TWO_INDICES = 2
740756

741757

758+
# These are cdef'd to ints to optimize the common case.
759+
PaneInfoTiming_UNKNOWN = windowed_value.PaneInfoTiming.UNKNOWN
760+
PaneInfoEncoding_FIRST = PaneInfoEncoding.FIRST
761+
762+
742763
class PaneInfoCoderImpl(StreamCoderImpl):
743764
"""For internal use only; no backwards-compatibility guarantees.
744765
745766
Coder for a PaneInfo descriptor."""
746767

747768
def _choose_encoding(self, value):
748-
if ((value.index == 0 and value.nonspeculative_index == 0) or
749-
value.timing == windowed_value.PaneInfoTiming.UNKNOWN):
750-
return PaneInfoEncoding.FIRST
751-
elif (value.index == value.nonspeculative_index or
752-
value.timing == windowed_value.PaneInfoTiming.EARLY):
769+
if ((value._index == 0 and value._nonspeculative_index == 0) or
770+
value._timing == PaneInfoTiming_UNKNOWN):
771+
return PaneInfoEncoding_FIRST
772+
elif (value._index == value._nonspeculative_index or
773+
value._timing == windowed_value.PaneInfoTiming.EARLY):
753774
return PaneInfoEncoding.ONE_INDEX
754775
else:
755776
return PaneInfoEncoding.TWO_INDICES
756777

757778
def encode_to_stream(self, value, out, nested):
758-
encoding_type = self._choose_encoding(value)
759-
out.write_byte(value.encoded_byte | (encoding_type << 4))
760-
if encoding_type == PaneInfoEncoding.FIRST:
779+
pane_info = value # cast
780+
encoding_type = self._choose_encoding(pane_info)
781+
out.write_byte(pane_info._encoded_byte | (encoding_type << 4))
782+
if encoding_type == PaneInfoEncoding_FIRST:
761783
return
762784
elif encoding_type == PaneInfoEncoding.ONE_INDEX:
763785
out.write_var_int64(value.index)
@@ -772,7 +794,7 @@ def decode_from_stream(self, in_stream, nested):
772794
base = windowed_value._BYTE_TO_PANE_INFO[encoded_first_byte & 0xF]
773795
assert base is not None
774796
encoding_type = encoded_first_byte >> 4
775-
if encoding_type == PaneInfoEncoding.FIRST:
797+
if encoding_type == PaneInfoEncoding_FIRST:
776798
return base
777799
elif encoding_type == PaneInfoEncoding.ONE_INDEX:
778800
index = in_stream.read_var_int64()
@@ -811,11 +833,11 @@ class WindowedValueCoderImpl(StreamCoderImpl):
811833
# byte representation of timestamps.
812834
def _to_normal_time(self, value):
813835
"""Convert "lexicographically ordered unsigned" to signed."""
814-
return value - (1 << 63)
836+
return value - _TIME_SHIFT
815837

816838
def _from_normal_time(self, value):
817839
"""Convert signed to "lexicographically ordered unsigned"."""
818-
return value + (1 << 63)
840+
return value + _TIME_SHIFT
819841

820842
def __init__(self, value_coder, timestamp_coder, window_coder):
821843
# TODO(lcwik): Remove the timestamp coder field
@@ -849,16 +871,12 @@ def decode_from_stream(self, in_stream, nested):
849871
# were indeed MIN/MAX timestamps.
850872
# TODO(BEAM-1524): Clean this up once we have a BEAM wide consensus on
851873
# precision of timestamps.
852-
if timestamp == -(abs(MIN_TIMESTAMP.micros) // 1000):
853-
timestamp = MIN_TIMESTAMP.micros
854-
elif timestamp == (MAX_TIMESTAMP.micros // 1000):
855-
timestamp = MAX_TIMESTAMP.micros
874+
if timestamp <= -(abs(MIN_TIMESTAMP_micros) // 1000):
875+
timestamp = MIN_TIMESTAMP_micros
876+
elif timestamp >= MAX_TIMESTAMP_micros // 1000:
877+
timestamp = MAX_TIMESTAMP_micros
856878
else:
857879
timestamp *= 1000
858-
if timestamp > MAX_TIMESTAMP.micros:
859-
timestamp = MAX_TIMESTAMP.micros
860-
if timestamp < MIN_TIMESTAMP.micros:
861-
timestamp = MIN_TIMESTAMP.micros
862880

863881
windows = self._windows_coder.decode_from_stream(in_stream, True)
864882
# Read PaneInfo encoded byte.

sdks/python/apache_beam/coders/coders.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -995,7 +995,7 @@ def __init__(self, value_coder):
995995
self._value_coder = value_coder
996996

997997
def _create_impl(self):
998-
return coder_impl.LengthPrefixCoderImpl(self._value_coder)
998+
return coder_impl.LengthPrefixCoderImpl(self._value_coder.get_impl())
999999

10001000
def is_deterministic(self):
10011001
return self._value_coder.is_deterministic()

0 commit comments

Comments
 (0)