Commit ba20019

test: Added system test for query offset issue (#557)
* test: Added system test for query offset issue
* linting
* 🦉 Updates from OwlBot post-processor (see https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md)
* fixed test
* Removed testing the default database

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 5e773cb commit ba20019

File tree: 3 files changed (+132, -5 lines changed)

  tests/system/test_query.py
  tests/system/utils/clear_datastore.py
  tests/system/utils/populate_datastore.py

tests/system/test_query.py

Lines changed: 34 additions & 0 deletions
@@ -337,6 +337,17 @@ def large_query_client(datastore_client):
     return large_query_client
 
 
+@pytest.fixture(scope="session")
+def mergejoin_query_client(datastore_client):
+    mergejoin_query_client = _helpers.clone_client(
+        datastore_client,
+        namespace=populate_datastore.MERGEJOIN_DATASET_NAMESPACE,
+    )
+    populate_datastore.add_mergejoin_dataset_entities(client=mergejoin_query_client)
+
+    return mergejoin_query_client
+
+
 @pytest.fixture(scope="function")
 def large_query(large_query_client):
     # Use the client for this test instead of the global.

@@ -346,6 +357,15 @@ def large_query(large_query_client):
     )
 
 
+@pytest.fixture(scope="function")
+def mergejoin_query(mergejoin_query_client):
+    # Use the client for this test instead of the global.
+    return mergejoin_query_client.query(
+        kind=populate_datastore.MERGEJOIN_DATASET_KIND,
+        namespace=populate_datastore.MERGEJOIN_DATASET_NAMESPACE,
+    )
+
+
 @pytest.mark.parametrize(
     "limit,offset,expected",
     [

@@ -385,6 +405,20 @@ def test_large_query(large_query, limit, offset, expected, database_id):
     assert len(entities) == expected
 
 
+@pytest.mark.parametrize("database_id", [_helpers.TEST_DATABASE], indirect=True)
+def test_mergejoin_query(mergejoin_query, database_id):
+    query = mergejoin_query
+    query.add_filter(filter=PropertyFilter("a", "=", 1))
+    query.add_filter(filter=PropertyFilter("b", "=", 1))
+
+    # There should be 2 * MERGEJOIN_QUERY_NUM_RESULTS results total
+    expected_total = 2 * populate_datastore.MERGEJOIN_QUERY_NUM_RESULTS
+    for offset in range(0, expected_total + 1):
+        iterator = query.fetch(offset=offset)
+        num_entities = len([e for e in iterator])
+        assert num_entities == expected_total - offset
+
+
 @pytest.mark.parametrize("database_id", [None, _helpers.TEST_DATABASE], indirect=True)
 def test_query_add_property_filter(ancestor_query, database_id):
     query = ancestor_query
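
For reference, the merge-join query shape exercised by test_mergejoin_query can be reproduced outside the test harness. Below is a minimal sketch, assuming google-cloud-datastore is installed, credentials are configured, and the dataset has already been populated by populate_datastore.py; the kind and namespace literals mirror the MERGEJOIN_* constants added further down.

from google.cloud import datastore
from google.cloud.datastore.query import PropertyFilter

client = datastore.Client(namespace="MergejoinNamespace")
query = client.query(kind="Mergejoin")
# Two equality filters on different properties are served by a merge
# join on the backend, the scenario where issue #547 surfaced.
query.add_filter(filter=PropertyFilter("a", "=", 1))
query.add_filter(filter=PropertyFilter("b", "=", 1))

# With 14 matching entities (7 at each end of the dataset), an offset
# of 8 should leave exactly 6 entities, with no duplicates.
entities = list(query.fetch(offset=8))
assert len(entities) == 6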

tests/system/utils/clear_datastore.py

Lines changed: 6 additions & 4 deletions
@@ -31,6 +31,8 @@
     "Post",
     "uuid_key",
     "timestamp_key",
+    "LargeCharacter",
+    "Mergejoin",
 )
 TRANSACTION_MAX_GROUPS = 5
 MAX_DEL_ENTITIES = 500

@@ -90,12 +92,10 @@ def remove_all_entities(client):
 
 
 def run(database):
-    client = datastore.Client(database=database)
     kinds = sys.argv[1:]
 
     if len(kinds) == 0:
         kinds = ALL_KINDS
-
     print_func(
         "This command will remove all entities from the database "
         + database

@@ -105,8 +105,10 @@ def run(database):
     response = input("Is this OK [y/n]? ")
 
     if response.lower() == "y":
-        for kind in kinds:
-            remove_kind(kind, client)
+        for namespace in ["", "LargeCharacterEntity", "MergejoinNamespace"]:
+            client = datastore.Client(database=database, namespace=namespace)
+            for kind in kinds:
+                remove_kind(kind, client)
 
     else:
         print_func("Doing nothing.")

tests/system/utils/populate_datastore.py

Lines changed: 92 additions & 1 deletion
@@ -58,6 +58,11 @@
 LARGE_CHARACTER_NAMESPACE = "LargeCharacterEntity"
 LARGE_CHARACTER_KIND = "LargeCharacter"
 
+MERGEJOIN_QUERY_NUM_RESULTS = 7
+MERGEJOIN_DATASET_INTERMEDIATE_OBJECTS = 20000
+MERGEJOIN_DATASET_NAMESPACE = "MergejoinNamespace"
+MERGEJOIN_DATASET_KIND = "Mergejoin"
+
 
 def get_system_test_db():
     return os.getenv("SYSTEM_TESTS_DATABASE") or "system-tests-named-db"

@@ -179,12 +184,92 @@ def add_timestamp_keys(client=None):
         batch.put(entity)
 
 
+def add_mergejoin_dataset_entities(client=None):
+    """
+    Dataset covering a bug seen in https://github.com/googleapis/python-datastore/issues/547
+
+    The root cause was setting a subsequent query's start_cursor to
+    skipped_cursor instead of end_cursor. In niche scenarios involving
+    merge joins, skipped_cursor is empty and the query restarts from the
+    beginning, returning duplicate items.
+
+    The bug can be reproduced with the dataset shown in b/352377540: 7 items
+    with a=1, b=1, followed by 20k items alternating between a=1, b=0 and
+    a=0, b=1, then 7 more with a=1, b=1, querying for all items with
+    a=1, b=1 and an offset of 8.
+    """
+    client.namespace = MERGEJOIN_DATASET_NAMESPACE
+
+    # Query used for all tests
+    page_query = client.query(
+        kind=MERGEJOIN_DATASET_KIND, namespace=MERGEJOIN_DATASET_NAMESPACE
+    )
+
+    def create_entity(id, a, b):
+        key = client.key(MERGEJOIN_DATASET_KIND, id)
+        entity = datastore.Entity(key=key)
+        entity["a"] = a
+        entity["b"] = b
+        return entity
+
+    def put_objects(count):
+        id = 1
+        curr_intermediate_entries = 0
+
+        # Can only do 500 operations in a transaction with an overall
+        # size limit.
+        ENTITIES_TO_BATCH = 500
+
+        with client.transaction() as xact:
+            for _ in range(0, MERGEJOIN_QUERY_NUM_RESULTS):
+                entity = create_entity(id, 1, 1)
+                id += 1
+                xact.put(entity)
+
+        while curr_intermediate_entries < count - MERGEJOIN_QUERY_NUM_RESULTS:
+            start = curr_intermediate_entries
+            end = min(curr_intermediate_entries + ENTITIES_TO_BATCH, count)
+            with client.transaction() as xact:
+                # Alternate between a=0, b=1 and a=1, b=0 for the
+                # intermediate entities
+                for i in range(start, end):
+                    if id % 2:
+                        entity = create_entity(id, 0, 1)
+                    else:
+                        entity = create_entity(id, 1, 0)
+                    id += 1
+
+                    # Saves the entity
+                    xact.put(entity)
+            curr_intermediate_entries += ENTITIES_TO_BATCH
+
+        with client.transaction() as xact:
+            for _ in range(0, MERGEJOIN_QUERY_NUM_RESULTS):
+                entity = create_entity(id, 1, 1)
+                id += 1
+                xact.put(entity)
+
+    # If anything exists in this namespace, delete it, since we need to
+    # set up something very specific.
+    all_entities = [e for e in page_query.fetch()]
+    if len(all_entities) > 0:
+        # Clean up the namespace if it is not an exact match
+        while all_entities:
+            entities = all_entities[:500]
+            all_entities = all_entities[500:]
+            client.delete_multi([e.key for e in entities])
+    # Put objects
+    put_objects(MERGEJOIN_DATASET_INTERMEDIATE_OBJECTS)
+
+
 def run(database):
     client = datastore.Client(database=database)
     flags = sys.argv[1:]
 
     if len(flags) == 0:
-        flags = ["--characters", "--uuid", "--timestamps"]
+        flags = [
+            "--characters",
+            "--uuid",
+            "--timestamps",
+            "--large-characters",
+            "--mergejoin",
+        ]
 
     if "--characters" in flags:
         add_characters(client)

@@ -195,6 +280,12 @@ def run(database):
     if "--timestamps" in flags:
         add_timestamp_keys(client)
 
+    if "--large-characters" in flags:
+        add_large_character_entities(client)
+
+    if "--mergejoin" in flags:
+        add_mergejoin_dataset_entities(client)
+
 
 def main():
     for database in ["", get_system_test_db()]:
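
The docstring above pins down the failure mode: a follow-up request that resumes from skipped_cursor instead of end_cursor can restart a merge-join query from the beginning. The sketch below shows the cursor-resumption pattern from the client's point of view (the buggy cursor selection happened inside the library, not in user code); it assumes the Mergejoin dataset is already populated, and uses the iterator's public pages and next_page_token attributes.

from google.cloud import datastore
from google.cloud.datastore.query import PropertyFilter

client = datastore.Client(namespace="MergejoinNamespace")
query = client.query(kind="Mergejoin")
query.add_filter(filter=PropertyFilter("a", "=", 1))
query.add_filter(filter=PropertyFilter("b", "=", 1))

# Fetch one page past the offset, then resume from the end cursor.
iterator = query.fetch(offset=8, limit=3)
first_page = list(next(iterator.pages))

# next_page_token is the cursor positioned after the last *returned*
# result (the end cursor). Issue #547 resumed from the cursor after the
# *skipped* results instead; under a merge join that cursor can come
# back empty, restarting the scan and yielding duplicates.
resumed = query.fetch(start_cursor=iterator.next_page_token)
overlap = {e.key for e in first_page} & {e.key for e in resumed}
assert not overlap  # a correct resume never re-reads entities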
