LARGE_CHARACTER_NAMESPACE = "LargeCharacterEntity"
LARGE_CHARACTER_KIND = "LargeCharacter"

+MERGEJOIN_QUERY_NUM_RESULTS = 7
+MERGEJOIN_DATASET_INTERMEDIATE_OBJECTS = 20000
+MERGEJOIN_DATASET_NAMESPACE = "MergejoinNamespace"
+MERGEJOIN_DATASET_KIND = "Mergejoin"
+


def get_system_test_db():
    return os.getenv("SYSTEM_TESTS_DATABASE") or "system-tests-named-db"
@@ -179,12 +184,92 @@ def add_timestamp_keys(client=None):
            batch.put(entity)


+def add_mergejoin_dataset_entities(client=None):
+    """
+    Dataset to account for a bug seen in
+    https://github.com/googleapis/python-datastore/issues/547.
+    The root cause was setting a subsequent query's start_cursor to
+    skipped_cursor instead of end_cursor. In niche scenarios involving
+    merge joins, skipped_cursor becomes empty and the query starts back
+    at the beginning, returning duplicate items.
+
+    The bug can be reproduced with the dataset shown in b/352377540:
+    7 items with a=1, b=1, followed by 20k items alternating between
+    a=1, b=0 and a=0, b=1, then 7 more with a=1, b=1, then querying for
+    all items with a=1, b=1 and an offset of 8.
+    """
+    client.namespace = MERGEJOIN_DATASET_NAMESPACE
+
+    # Query used for all tests
+    page_query = client.query(
+        kind=MERGEJOIN_DATASET_KIND, namespace=MERGEJOIN_DATASET_NAMESPACE
+    )
+
+    def create_entity(id, a, b):
+        key = client.key(MERGEJOIN_DATASET_KIND, id)
+        entity = datastore.Entity(key=key)
+        entity["a"] = a
+        entity["b"] = b
+        return entity
+
+    def put_objects(count):
+        id = 1
+        curr_intermediate_entries = 0
+
+        # Can only do 500 operations in a transaction with an overall
+        # size limit.
+        ENTITIES_TO_BATCH = 500
+
+        with client.transaction() as xact:
+            for _ in range(0, MERGEJOIN_QUERY_NUM_RESULTS):
+                entity = create_entity(id, 1, 1)
+                id += 1
+                xact.put(entity)
+
+        while curr_intermediate_entries < count - MERGEJOIN_QUERY_NUM_RESULTS:
+            start = curr_intermediate_entries
+            end = min(curr_intermediate_entries + ENTITIES_TO_BATCH, count)
+            with client.transaction() as xact:
+                # The name/ID for the new entity
+                for i in range(start, end):
+                    if id % 2:
+                        entity = create_entity(id, 0, 1)
+                    else:
+                        entity = create_entity(id, 1, 0)
+                    id += 1
+
+                    # Saves the entity
+                    xact.put(entity)
+            curr_intermediate_entries += ENTITIES_TO_BATCH
+
+        with client.transaction() as xact:
+            for _ in range(0, MERGEJOIN_QUERY_NUM_RESULTS):
+                entity = create_entity(id, 1, 1)
+                id += 1
+                xact.put(entity)
+
+    # If anything exists in this namespace, delete it, since we need to
+    # set up something very specific.
+    all_entities = [e for e in page_query.fetch()]
+    if len(all_entities) > 0:
+        # Clean up the collection if it is not an exact match
+        while all_entities:
+            entities = all_entities[:500]
+            all_entities = all_entities[500:]
+            client.delete_multi([e.key for e in entities])
+    # Put objects
+    put_objects(MERGEJOIN_DATASET_INTERMEDIATE_OBJECTS)
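+    # Optional sanity check (illustrative, not part of this change): after
+    # seeding, the namespace should hold at least the intermediate entities
+    # plus the two groups of seven matching entities at either end.
+    #
+    #     total = len(list(page_query.fetch()))
+    #     assert total >= MERGEJOIN_DATASET_INTERMEDIATE_OBJECTS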
+
+
def run(database):
    client = datastore.Client(database=database)
    flags = sys.argv[1:]

    if len(flags) == 0:
-        flags = ["--characters", "--uuid", "--timestamps"]
+        flags = [
+            "--characters",
+            "--uuid",
+            "--timestamps",
+            "--large-characters",
+            "--mergejoin",
+        ]

    if "--characters" in flags:
        add_characters(client)
@@ -195,6 +280,12 @@ def run(database):
    if "--timestamps" in flags:
        add_timestamp_keys(client)

+    if "--large-characters" in flags:
+        add_large_character_entities(client)
+
+    if "--mergejoin" in flags:
+        add_mergejoin_dataset_entities(client)
+

def main():
    for database in ["", get_system_test_db()]: