-
Notifications
You must be signed in to change notification settings - Fork 37
[WIP] Protobuf Performance Refactor #230
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a085789
ff3c6f0
1109927
d04f72e
f57c932
fe17aae
d723f18
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,11 +12,24 @@ | |
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import datetime | ||
| from enum import EnumMeta | ||
| from re import L | ||
| from proto.datetime_helpers import DatetimeWithNanoseconds | ||
| from proto.marshal.rules import wrappers | ||
| from proto.marshal.rules.dates import DurationRule | ||
| from proto.marshal.collections.maps import MapComposite | ||
| from proto.utils import cached_property | ||
| from typing import Any, Callable, Optional, Union | ||
|
|
||
| from google.protobuf import descriptor_pb2 | ||
| from google.protobuf import duration_pb2 | ||
| from google.protobuf import struct_pb2 | ||
| from google.protobuf import timestamp_pb2 | ||
| from google.protobuf import wrappers_pb2 | ||
| from google.protobuf.internal.enum_type_wrapper import EnumTypeWrapper | ||
|
|
||
| import proto | ||
| from proto.primitives import ProtoType | ||
|
|
||
|
|
||
|
|
@@ -140,6 +153,88 @@ def pb_type(self): | |
| return self.message.pb() | ||
| return self.message | ||
|
|
||
@property
def can_represent_natively(self) -> bool:
    """Return ``False`` only for message fields typed as ``struct_pb2.Value``.

    Every other combination of ``proto_type`` and ``message`` yields ``True``.
    """
    is_struct_value = (
        self.proto_type == ProtoType.MESSAGE
        and self.message == struct_pb2.Value
    )
    return not is_struct_value
|
|
||
def contribute_to_class(self, cls, name: str):
    """Attach a `_FieldDescriptor` for this field to the proto.Message class.

    Attribute reads and writes on instances of `cls` are then handled by
    `_FieldDescriptor.__get__` and `_FieldDescriptor.__set__`.

    Also selects the write-time type coercion, if any, used to translate
    special cases between pure Pythonic objects and pb2-compatible structs
    or values.

    Args:
        cls: The class that receives the descriptor.
        name: The attribute name under which the descriptor is installed.
    """
    # Stays None when no branch below matches; the descriptor then stores
    # assigned values without coercing them.
    set_coercion = None
    if self.proto_type == ProtoType.STRING:
        # Bytes are accepted for string values, but strings are not accepted for byte values.
        # This is an artifact of older Python2 implementations.
        set_coercion = self._bytes_to_str
    elif self.pb_type == timestamp_pb2.Timestamp:
        set_coercion = self._timestamp_to_datetime
    elif self.proto_type == ProtoType.MESSAGE and self.message == duration_pb2.Duration:
        set_coercion = self._duration_to_timedelta
    elif self.proto_type == ProtoType.MESSAGE and self.message == wrappers_pb2.BoolValue:
        set_coercion = self._bool_value_to_bool
    elif self.enum:
        # Checked last: only reached when none of the type tests above matched.
        set_coercion = self._literal_to_enum
    setattr(cls, name, _FieldDescriptor(name, cls=cls, set_coercion=set_coercion))
|
|
||
@cached_property
def reverse_enum_map(self):
    """Build a value -> member lookup table for ``self.enum``.

    Lets a literal enum value (often an int) supplied by a developer be
    resolved to its enum member with a single dict lookup. Returns ``None``
    when ``self.enum`` is falsy.
    """
    if not self.enum:
        return None
    return {member.value: member for member in self.enum}
|
|
||
@cached_property
def reverse_enum_names_map(self):
    """Build a name -> member lookup table for ``self.enum``.

    Lets the string name of an enum member supplied by a developer be
    resolved to the member with a single dict lookup. Returns ``None`` when
    ``self.enum`` is falsy.
    """
    if not self.enum:
        return None
    return {member.name: member for member in self.enum}
|
|
||
def _literal_to_enum(self, val: Any):
    """Resolve a literal value or member name to a member of ``self.enum``.

    Args:
        val: An existing member of ``self.enum``, a member's value, or a
            member's name.

    Returns:
        The matching enum member, or ``None`` when ``val`` matches neither a
        value nor a name.
    """
    if isinstance(val, self.enum):
        return val

    # Compare against None explicitly rather than chaining with `or`: for an
    # int-based enum the zero-valued member is falsy, so `or` would discard
    # a successful value lookup and fall through to the name map.
    member = self.reverse_enum_map.get(val)
    if member is None:
        member = self.reverse_enum_names_map.get(val)
    return member
|
|
||
@staticmethod
def _bytes_to_str(val: Union[bytes, str]) -> str:
    """Decode ``bytes`` (including subclasses) as UTF-8; return anything else unchanged.

    Args:
        val: The value being assigned to a string field.

    Returns:
        The decoded text when ``val`` is bytes, otherwise ``val`` itself.

    Raises:
        UnicodeDecodeError: If ``val`` is bytes that are not valid UTF-8.
    """
    # NOTE(review): a reviewer timed `isinstance('', bytes)`,
    # `type('') == bytes` and `type('') is bytes` on Python 3.9 at
    # 127 / 129 / 123 nsec per loop, and suggested profiling the same choice
    # for the other conversions as well.
    if isinstance(val, bytes):
        return val.decode('utf-8')
    return val
|
|
||
@staticmethod
def _timestamp_to_datetime(val: Union[timestamp_pb2.Timestamp, datetime.datetime]) -> datetime.datetime:
    """Convert a ``timestamp_pb2.Timestamp`` via ``DatetimeWithNanoseconds``.

    Args:
        val: The value being assigned to a timestamp field.

    Returns:
        ``DatetimeWithNanoseconds.from_timestamp_pb(val)`` when ``val`` is a
        ``Timestamp``; otherwise ``val`` unchanged.
    """
    # isinstance rather than an exact type comparison, matching the other
    # write-time coercions.
    if isinstance(val, timestamp_pb2.Timestamp):
        return DatetimeWithNanoseconds.from_timestamp_pb(val)
    return val
|
|
||
@staticmethod
def _duration_to_timedelta(val: Union[duration_pb2.Duration, datetime.timedelta]) -> datetime.timedelta:
    """Convert a ``duration_pb2.Duration`` using ``DurationRule().to_python``.

    Args:
        val: The value being assigned to a duration field.

    Returns:
        The result of ``DurationRule().to_python(val)`` when ``val`` is a
        ``Duration``; otherwise ``val`` unchanged.
    """
    # The return annotation previously read `datetime.datetime`, which
    # contradicted both the method name and the accepted input types.
    if isinstance(val, duration_pb2.Duration):
        return DurationRule().to_python(val)
    return val
|
|
||
@staticmethod
def _bool_value_to_bool(val: Union[wrappers_pb2.BoolValue, bool, None]) -> Optional[bool]:
    """Unwrap a ``wrappers_pb2.BoolValue`` to the plain value it carries.

    Args:
        val: The value being assigned to a ``BoolValue`` field; may be
            ``None``.

    Returns:
        ``val.value`` when ``val`` is a ``BoolValue``; otherwise ``val``
        unchanged, so ``None`` is returned as ``None``.
    """
    # No separate `None` branch is needed: None is not a BoolValue and so
    # falls through to the pass-through return below.
    if isinstance(val, wrappers_pb2.BoolValue):
        return val.value
    return val
|
|
||
|
|
||
| class RepeatedField(Field): | ||
| """A representation of a repeated field in protocol buffers.""" | ||
|
|
@@ -155,6 +250,192 @@ def __init__(self, key_type, value_type, *, number: int, message=None, enum=None | |
| self.map_key_type = key_type | ||
|
|
||
|
|
||
class _FieldDescriptor:
    """Handler for proto.Field access on any proto.Message object.

    Wraps each proto.Field instance within a given proto.Message subclass's definition
    with getters and setters that allow for caching of values on the proto-plus object,
    deferment of syncing to the underlying pb2 object, and tracking of the current state.

    Special treatment is given to MapFields, nested Messages, and certain data types, as
    their various implementations within pb2 (which for our purposes is mostly a black box)
    sometimes mandate immediate syncing. This is usually because proto-plus objects are not
    long-lived, and thus information about which fields are stale would be lost if syncing
    was left for serialization time.
    """

    # Namespace for attributes where we will store the Pythonic values of
    # various `proto.Field` classes on instantiated `proto.Message` objects.
    # For example, in the following scenario, this attribute is involved in
    # saving the value "Homer Simpson" to `my_message._cached_fields__name`.
    #
    #     class MyMessage(proto.Message):
    #         name = proto.Field(proto.STRING, ...)
    #
    #     my_message = MyMessage()
    #     my_message.name = "Homer Simpson"  # saves to `_cached_fields__name`
    cached_fields_prefix = '_cached_fields__'

    def __init__(self, name: str, *, cls, set_coercion: Optional[Callable] = None):
        """Record the field name, its owning class and the optional write coercion.

        Args:
            name: The field's attribute name on the message class.
            cls: The message class the descriptor is attached to.
            set_coercion: Optional one-argument callable applied to every
                assigned value before it is stored.
        """
        # NOTE(review): reviewers asked for the signature of `set_coercion`
        # to be spelled out. From usage it takes one value and returns one
        # value, with the types depending on the field type. A no-op default
        # (`lambda v: v`) was also suggested; the author kept `None` because
        # another reviewer thought that pattern might be cleaner.

        # something like "id". required whenever we reach back to the pb2 object.
        self.original_name = name
        # something like "_cached_fields__id" (prefix + field name)
        self.instance_attr_name = f'{self.cached_fields_prefix}{name}'

        # simple types coercion for setting attributes
        # (e.g., bytes -> str if our type is string, but we are supplied bytes)
        # the signature of `set_coercion` is dependent on the field's data types
        # and is always handled by `contribute_to_class` which pairs data types
        # to appropriate write-time coercions.
        self._set_coercion: Optional[Callable] = set_coercion
        self.cls = cls

    @property
    def field(self):
        """Look up this descriptor's field object in ``cls._meta.fields`` by name."""
        return self.cls._meta.fields[self.original_name]

    def _hydrate_dicts(self, value: Any):
        """Turns a dictionary assigned to a nested Message into a full instance of
        that Message type.

        Non-dict values are returned unchanged, as are dicts assigned to
        fields that are not of message type.
        """
        if not isinstance(value, dict):
            return value

        if self.field.proto_type == proto.MESSAGE:
            _pb = self.field.message._meta.pb(**value)
            value = self.field.message.wrap(_pb)

        return value

    def _clear_oneofs(self, instance):
        """Delete every other field on ``instance`` that shares this field's oneof.

        Does nothing when this field has no oneof.
        """
        if not self.field.oneof:
            return

        for field_name, field in self.cls._meta.fields.items():
            # Don't clear this field
            if field_name == self.original_name:
                continue

            # Don't clear other fields with different oneof values, or with
            # no such values at all
            if field.oneof != self.field.oneof:
                continue

            # presumably dispatches to the sibling field's own
            # `_FieldDescriptor.__delete__` -- confirm against the class setup
            delattr(instance, field_name)

    def __set__(self, instance, value):
        """Called whenever a value is assigned to a `proto.Field` attribute on an instantiated
        `proto.Message` object.

        Usage:

            class MyMessage(proto.Message):
                name = proto.Field(proto.STRING, number=1)

            my_message = MyMessage()
            my_message.name = "Frodo"

        In the above scenario, `__set__` is called with "Frodo" passed as `value` and
        `my_message` passed as `instance`.
        """
        # NOTE(review): the naming of the `instance` parameter was raised in
        # review (the comment text is partly lost); the author replied that
        # the name follows the usual convention for attribute descriptors.
        value = self._set_coercion(value) if self._set_coercion is not None else value
        value = self._hydrate_dicts(value)

        # Warning: `always_commit` is hacky!
        # Some contexts, particularly instances created from MapFields, require immediate syncing.
        # It is impossible to deduce such a scenario purely from logic available to this function,
        # so instead we set a flag on instances when a MapField yields them, and then when those
        # instances receive attribute updates, immediately syncing those values to the underlying
        # pb2 instance is sufficient.
        # NOTE(review): the author acknowledges this is hacky but describes it
        # as a linchpin of the whole refactor.
        always_commit: bool = getattr(instance, '_always_commit', False)
        if always_commit or not self.field.can_represent_natively:
            # Immediate path: marshal the value and replace the field on the
            # underlying pb2 object right away.
            pb_value = instance._meta.marshal.to_proto(self.field.pb_type, value)
            _pb = instance._meta.pb(**{self.original_name: pb_value})
            instance._pb.ClearField(self.original_name)
            instance._pb.MergeFrom(_pb)
        else:

            # Deferred path: assigning None clears the field and returns
            # before anything is cached.
            if value is None:
                self.__delete__(instance)
                instance._pb.MergeFrom(instance._meta._pb(**{self.original_name: None}))
                return

            instance._meta.marshal.validate_primitives(self.field, value)
            instance._mark_pb_stale(self.original_name)

        setattr(instance, self.instance_attr_name, value)
        self._clear_oneofs(instance)

    def __get__(self, instance: 'proto.Message', _):  # type: ignore
        """Called whenever a value is read from a proto.Field attribute on an instantiated
        proto.Message object.

        Usage:

            class MyMessage(proto.Message):
                name = proto.Field(proto.STRING, number=1)

            my_message = MyMessage(name="Frodo")
            print(my_message.name)

        In the above scenario, `__get__` is called with "my_message" passed as
        `instance`.
        """
        # If `instance` is None, then we are accessing this field directly
        # off the class itself instead of off an instance.
        if instance is None:
            return self.original_name

        # `_none` (not None) marks "nothing cached", so a cached None is
        # still distinguishable from a missing cache entry.
        value = getattr(instance, self.instance_attr_name, _none)
        is_map: bool = isinstance(value, MapComposite)

        # Return any values that do not require immediate rehydration.
        # A few notes:
        #   * primitives are simple, and so can be returned
        #   * `Messages` are already Pythonic, and so can be returned
        #   * `Values` are wrappers and so have to be unwrapped
        #   * The exception to this is MapComposites, which have the same
        #     types as Values, but which handle their own field caching and
        #     thus can be returned when pulled off the `instance`.
        if value is not _none and (is_map or self.field.can_represent_natively):
            return value

        # For the most part, only primitive values can be returned natively,
        # meaning this is either a Message itself, in which case, since we're
        # dealing with the underlying pb object, we need to sync all deferred
        # fields. This is functionally a no-op if no fields have been deferred.
        if hasattr(value, '_update_pb'):
            value._update_pb()

        pb_value = getattr(instance._pb, self.original_name, None)
        value = instance._meta.marshal.to_python(
            self.field.pb_type, pb_value,
            absent=self.original_name not in instance,
        )

        # Cache the freshly marshalled value for subsequent reads.
        setattr(instance, self.instance_attr_name, value)
        return value

    def __delete__(self, instance):
        """Drop the cached value, clear the pb2 field, and unmark it as stale."""
        if hasattr(instance, self.instance_attr_name):
            delattr(instance, self.instance_attr_name)
        instance._pb.ClearField(self.original_name)
        # `_stale_fields` may not exist on the instance, hence the default.
        if self.original_name in getattr(instance, '_stale_fields', []):
            instance._stale_fields.remove(self.original_name)
|
|
||
|
|
||
class _NoneType:
    """Falsy sentinel type, distinct from ``None``.

    Used as the ``getattr`` default when reading a cached field value, so
    that "nothing cached" can be told apart from a cached ``None``.
    """

    def __bool__(self):
        return False

    def __eq__(self, other):
        """All _NoneType instances are equal"""
        return isinstance(other, _NoneType)

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None and makes instances
        # unhashable. Since all instances compare equal, they must also
        # share a single hash value.
        return hash(_NoneType)

    def __repr__(self):
        return '_none'


_none = _NoneType()
|
Comment on lines
+427
to
+436
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Outrageous nitpicking: we're not checking for _canary = object()
...
value = getattr(instance, self.instance_attr_name, _canary)
if self.field.can_get_natively and value is not _canary: |
||
|
|
||
|
|
||
| __all__ = ( | ||
| "Field", | ||
| "MapField", | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.