Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 4171ce9

Browse files
author
David Read
authored
Merge pull request #4562 from ckan/4561-limit-datastore_search
Add a hard upper limit to datastore_search(_sql) rows returned
2 parents b8568db + 8bc80d7 commit 4171ce9

9 files changed

Lines changed: 483 additions & 113 deletions

File tree

CHANGELOG.rst

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@ Minor changes:
1616
the value is False, 0, [] or {} (#4448)
1717
* If you've customized the schema for package_search, you'll need to add to it
1818
the limiting of ``row``, as default_package_search_schema now does (#4484)
19-
* Several logic functions now have new limits to how many items can be
20-
returned, notably ``group_list`` and ``organization_list`` when
21-
``all_fields=true``. These are all configurable. (#4484)
19+
* Several logic functions now have new upper limits to how many items can be
20+
returned, notably ``group_list``, ``organization_list`` when
21+
``all_fields=true``, ``datastore_search`` and ``datastore_search_sql``.
22+
These are all configurable. (#4484, #4562)
2223

2324
Bugfixes:
2425

ckanext/datastore/backend/postgres.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1375,7 +1375,7 @@ def _execute_single_statement_copy_to(context, sql_string, where_values, buf):
13751375
cursor.close()
13761376

13771377

1378-
def format_results(context, results, data_dict):
1378+
def format_results(context, results, data_dict, rows_max):
13791379
result_fields = []
13801380
for field in results.cursor.description:
13811381
result_fields.append({
@@ -1391,6 +1391,8 @@ def format_results(context, results, data_dict):
13911391
field['type'])
13921392
records.append(converted_row)
13931393
data_dict['records'] = records
1394+
if data_dict.get('records_truncated', False):
1395+
data_dict['records'].pop()
13941396
data_dict['fields'] = result_fields
13951397

13961398
return _unrename_json_field(data_dict)
@@ -1550,6 +1552,11 @@ def search_sql(context, data_dict):
15501552

15511553
sql = data_dict['sql'].replace('%', '%%')
15521554

1555+
# limit the number of results to ckan.datastore.search.rows_max + 1
1556+
# (the +1 is so that we know if the results went over the limit or not)
1557+
rows_max = int(config.get('ckan.datastore.search.rows_max', 32000))
1558+
sql = 'SELECT * FROM ({0}) AS blah LIMIT {1} ;'.format(sql, rows_max + 1)
1559+
15531560
try:
15541561

15551562
context['connection'].execute(
@@ -1566,7 +1573,10 @@ def search_sql(context, data_dict):
15661573

15671574
results = context['connection'].execute(sql)
15681575

1569-
return format_results(context, results, data_dict)
1576+
if results.rowcount == rows_max + 1:
1577+
data_dict['records_truncated'] = True
1578+
1579+
return format_results(context, results, data_dict, rows_max)
15701580

15711581
except ProgrammingError as e:
15721582
if e.orig.pgcode == _PG_ERR_CODE['permission_denied']:
@@ -1718,6 +1728,11 @@ def configure(self, config):
17181728
else:
17191729
self._check_urls_and_permissions()
17201730

1731+
# check rows_max is valid on CKAN start-up
1732+
rows_max = config.get('ckan.datastore.search.rows_max')
1733+
if rows_max is not None:
1734+
int(rows_max)
1735+
17211736
def datastore_delete(self, context, data_dict, fields_types, query_dict):
17221737
query_dict['where'] += _where_clauses(data_dict, fields_types)
17231738
return query_dict
@@ -1739,6 +1754,7 @@ def datastore_search(self, context, data_dict, fields_types, query_dict):
17391754
else:
17401755
field_ids = fields_types.keys()
17411756

1757+
# add default limit here just in case - already defaulted in the schema
17421758
limit = data_dict.get('limit', 100)
17431759
offset = data_dict.get('offset', 0)
17441760

ckanext/datastore/controller.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
render,
1515
c,
1616
h,
17+
config,
1718
)
1819
from ckanext.datastore.writer import (
1920
csv_writer,
@@ -175,6 +176,15 @@ def result_page(offs, lim):
175176

176177
result = result_page(offset, limit)
177178

179+
if result['limit'] != limit:
180+
# `limit` (from PAGINATE_BY) must have been more than
181+
# ckan.datastore.search.rows_max, so datastore_search responded with a
182+
# limit matching ckan.datastore.search.rows_max. So we need to paginate
183+
# by that amount instead, otherwise we'll have gaps in the records.
184+
paginate_by = result['limit']
185+
else:
186+
paginate_by = PAGINATE_BY
187+
178188
with start_writer(result['fields']) as wr:
179189
while True:
180190
if limit is not None and limit <= 0:
@@ -185,14 +195,14 @@ def result_page(offs, lim):
185195
wr.write_records(records)
186196

187197
if records_format == 'objects' or records_format == 'lists':
188-
if len(records) < PAGINATE_BY:
198+
if len(records) < paginate_by:
189199
break
190200
elif not records:
191201
break
192202

193-
offset += PAGINATE_BY
203+
offset += paginate_by
194204
if limit is not None:
195-
limit -= PAGINATE_BY
205+
limit -= paginate_by
196206
if limit <= 0:
197207
break
198208

ckanext/datastore/logic/action.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -399,8 +399,10 @@ def datastore_delete(context, data_dict):
399399
def datastore_search(context, data_dict):
400400
'''Search a DataStore resource.
401401
402-
The datastore_search action allows you to search data in a resource.
403-
DataStore resources that belong to private CKAN resource can only be
402+
The datastore_search action allows you to search data in a resource. By
403+
default 100 rows are returned - see the `limit` parameter for more info.
404+
405+
A DataStore resource that belongs to a private CKAN resource can only be
404406
read by you if you have access to the CKAN resource and send the
405407
appropriate authorization.
406408
@@ -420,7 +422,10 @@ def datastore_search(context, data_dict):
420422
:param language: language of the full text query
421423
(optional, default: english)
422424
:type language: string
423-
:param limit: maximum number of rows to return (optional, default: 100)
425+
:param limit: maximum number of rows to return
426+
(optional, default: ``100``, unless set in the site's configuration
427+
``ckan.datastore.search.rows_default``, upper limit: ``32000`` unless
428+
set in the site's configuration ``ckan.datastore.search.rows_max``)
424429
:type limit: int
425430
:param offset: offset this number of rows (optional)
426431
:type offset: int
@@ -471,7 +476,9 @@ def datastore_search(context, data_dict):
471476
:type fields: list of dictionaries
472477
:param offset: query offset value
473478
:type offset: int
474-
:param limit: query limit value
479+
:param limit: queried limit value (if the requested ``limit`` was above the
480+
``ckan.datastore.search.rows_max`` value then this response ``limit``
481+
will be set to the value of ``ckan.datastore.search.rows_max``)
475482
:type limit: int
476483
:param filters: query filters
477484
:type filters: list of dictionaries
@@ -522,13 +529,15 @@ def datastore_search_sql(context, data_dict):
522529
engine is the
523530
`PostgreSQL engine <http://www.postgresql.org/docs/9.1/interactive/>`_.
524531
There is an enforced timeout on SQL queries to avoid an unintended DOS.
532+
The number of results returned is limited to 32000, unless set in the
533+
site's configuration ``ckan.datastore.search.rows_max``.
525534
Queries are only allowed if you have access to all the CKAN resources
526535
in the query and send the appropriate authorization.
527536
528537
.. note:: This action is not available when
529538
:ref:`ckan.datastore.sqlsearch.enabled` is set to false
530539
531-
.. note:: When source data columns (i.e. CSV) heading names are provdied
540+
.. note:: When source data columns (i.e. CSV) heading names are provided
532541
in all UPPERCASE you need to double quote them in the SQL select
533542
statement to avoid returning null results.
534543
@@ -544,6 +553,12 @@ def datastore_search_sql(context, data_dict):
544553
:type fields: list of dictionaries
545554
:param records: list of matching results
546555
:type records: list of dictionaries
556+
:param records_truncated: indicates whether the number of records returned
557+
was limited by the internal limit, which is 32000 records (or other
558+
value set in the site's configuration
559+
``ckan.datastore.search.rows_max``). If records are truncated by this,
560+
this key has value True, otherwise the key is not returned at all.
561+
:type records_truncated: bool
547562
548563
'''
549564
backend = DatastoreBackend.get_active_backend()

ckanext/datastore/logic/schema.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
OneOf = get_validator('OneOf')
2222
unicode_only = get_validator('unicode_only')
2323
default = get_validator('default')
24+
natural_number_validator = get_validator('natural_number_validator')
25+
configured_default = get_validator('configured_default')
26+
limit_to_configured_maximum = get_validator('limit_to_configured_maximum')
2427

2528

2629
def rename(old, new):
@@ -167,7 +170,11 @@ def datastore_search_schema():
167170
'plain': [ignore_missing, boolean_validator],
168171
'filters': [ignore_missing, json_validator],
169172
'language': [ignore_missing, text_type],
170-
'limit': [ignore_missing, int_validator],
173+
'limit': [
174+
configured_default('ckan.datastore.search.rows_default', 100),
175+
natural_number_validator,
176+
limit_to_configured_maximum('ckan.datastore.search.rows_max',
177+
32000)],
171178
'offset': [ignore_missing, int_validator],
172179
'fields': [ignore_missing, list_of_strings_or_string],
173180
'sort': [ignore_missing, list_of_strings_or_string],

ckanext/datastore/tests/test_dump.py

Lines changed: 100 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from nose.tools import assert_equals, assert_in
44
import mock
5+
import json
56

67
from ckanext.datastore.tests.helpers import DatastoreFunctionalTestBase
78
import ckan.tests.helpers as helpers
@@ -428,6 +429,21 @@ def test_dump_xml(self):
428429
)
429430
assert_equals(content, expected_content)
430431

432+
@helpers.change_config('ckan.datastore.search.rows_max', '3')
433+
def test_dump_with_low_rows_max(self):
434+
resource = factories.Resource()
435+
data = {
436+
'resource_id': resource['id'],
437+
'force': True,
438+
'records': [{u'record': str(num)} for num in range(12)],
439+
}
440+
helpers.call_action('datastore_create', **data)
441+
442+
app = self._get_test_app()
443+
response = app.get('/datastore/dump/{0}'.format(str(resource['id'])))
444+
assert_equals(get_csv_record_values(response.body),
445+
range(12))
446+
431447
@mock.patch('ckanext.datastore.controller.PAGINATE_BY', 5)
432448
def test_dump_pagination(self):
433449
resource = factories.Resource()
@@ -440,12 +456,10 @@ def test_dump_pagination(self):
440456

441457
app = self._get_test_app()
442458
response = app.get('/datastore/dump/{0}'.format(str(resource['id'])))
443-
assert_equals(
444-
'_id,record\r\n'
445-
'1,0\n2,1\n3,2\n4,3\n5,4\n6,5\n7,6\n8,7\n9,8\n10,9\n'
446-
'11,10\n12,11\n',
447-
response.body)
459+
assert_equals(get_csv_record_values(response.body),
460+
range(12))
448461

462+
@helpers.change_config('ckan.datastore.search.rows_max', '7')
449463
@mock.patch('ckanext.datastore.controller.PAGINATE_BY', 5)
450464
def test_dump_pagination_csv_with_limit(self):
451465
resource = factories.Resource()
@@ -456,14 +470,62 @@ def test_dump_pagination_csv_with_limit(self):
456470
}
457471
helpers.call_action('datastore_create', **data)
458472

473+
app = self._get_test_app()
474+
response = app.get('/datastore/dump/{0}?limit=11'.format(
475+
str(resource['id'])))
476+
assert_equals(get_csv_record_values(response.body),
477+
range(11))
478+
479+
@helpers.change_config('ckan.datastore.search.rows_max', '7')
480+
@mock.patch('ckanext.datastore.controller.PAGINATE_BY', 6)
481+
def test_dump_pagination_csv_with_limit_same_as_paginate(self):
482+
resource = factories.Resource()
483+
data = {
484+
'resource_id': resource['id'],
485+
'force': True,
486+
'records': [{u'record': str(num)} for num in range(12)],
487+
}
488+
helpers.call_action('datastore_create', **data)
489+
459490
app = self._get_test_app()
460491
response = app.get('/datastore/dump/{0}?limit=6'.format(
461492
str(resource['id'])))
462-
assert_equals(
463-
'_id,record\r\n'
464-
'1,0\n2,1\n3,2\n4,3\n5,4\n6,5\n',
465-
response.body)
493+
assert_equals(get_csv_record_values(response.body),
494+
range(6))
466495

496+
@helpers.change_config('ckan.datastore.search.rows_max', '6')
497+
@mock.patch('ckanext.datastore.controller.PAGINATE_BY', 5)
498+
def test_dump_pagination_with_rows_max(self):
499+
resource = factories.Resource()
500+
data = {
501+
'resource_id': resource['id'],
502+
'force': True,
503+
'records': [{u'record': str(num)} for num in range(12)],
504+
}
505+
helpers.call_action('datastore_create', **data)
506+
507+
app = self._get_test_app()
508+
response = app.get('/datastore/dump/{0}?limit=7'.format(str(resource['id'])))
509+
assert_equals(get_csv_record_values(response.body),
510+
range(7))
511+
512+
@helpers.change_config('ckan.datastore.search.rows_max', '6')
513+
@mock.patch('ckanext.datastore.controller.PAGINATE_BY', 6)
514+
def test_dump_pagination_with_rows_max_same_as_paginate(self):
515+
resource = factories.Resource()
516+
data = {
517+
'resource_id': resource['id'],
518+
'force': True,
519+
'records': [{u'record': str(num)} for num in range(12)],
520+
}
521+
helpers.call_action('datastore_create', **data)
522+
523+
app = self._get_test_app()
524+
response = app.get('/datastore/dump/{0}?limit=7'.format(str(resource['id'])))
525+
assert_equals(get_csv_record_values(response.body),
526+
range(7))
527+
528+
@helpers.change_config('ckan.datastore.search.rows_max', '7')
467529
@mock.patch('ckanext.datastore.controller.PAGINATE_BY', 5)
468530
def test_dump_pagination_json_with_limit(self):
469531
resource = factories.Resource()
@@ -477,9 +539,32 @@ def test_dump_pagination_json_with_limit(self):
477539
app = self._get_test_app()
478540
response = app.get('/datastore/dump/{0}?limit=6&format=json'.format(
479541
str(resource['id'])))
480-
assert_equals(
481-
'{\n "fields": [{"type":"int","id":"_id"},'
482-
'{"type":"int4","id":"record"}],\n'
483-
' "records": [\n [1,0],\n [2,1],\n [3,2],\n [4,3],\n'
484-
' [5,4],\n [6,5]\n]}\n',
485-
response.body)
542+
assert_equals(get_json_record_values(response.body),
543+
range(6))
544+
545+
@helpers.change_config('ckan.datastore.search.rows_max', '6')
546+
@mock.patch('ckanext.datastore.controller.PAGINATE_BY', 5)
547+
def test_dump_pagination_json_with_rows_max(self):
548+
resource = factories.Resource()
549+
data = {
550+
'resource_id': resource['id'],
551+
'force': True,
552+
'records': [{u'record': str(num)} for num in range(12)],
553+
}
554+
helpers.call_action('datastore_create', **data)
555+
556+
app = self._get_test_app()
557+
response = app.get('/datastore/dump/{0}?limit=7&format=json'.format(
558+
str(resource['id'])))
559+
assert_equals(get_json_record_values(response.body),
560+
range(7))
561+
562+
563+
def get_csv_record_values(response_body):
564+
return [int(record.split(',')[1])
565+
for record in response_body.split()[1:]]
566+
567+
568+
def get_json_record_values(response_body):
569+
return [record[1]
570+
for record in json.loads(response_body)['records']]

0 commit comments

Comments
 (0)