diff --git a/README.md b/README.md index ed263c78..88ef1be2 100644 --- a/README.md +++ b/README.md @@ -2,15 +2,15 @@ [![npm (scoped)](https://img.shields.io/npm/v/@gen3/guppy)](https://www.npmjs.com/package/@gen3/guppy) -Server that support GraphQL queries on data from elasticsearch. +Server that support GraphQL queries on data from elasticsearch. Please see [this doc](https://github.com/uc-cdis/guppy/blob/master/doc/queries.md) for syntax Guppy supports. -Run `npm start` to start server at port 80. +Run `npm start` to start server at port 80. -### Configurations: -Before launch, we need to write config and tell Guppy which elasticsearch indices and which auth control field to use. -You could put following as your config files: +### Configurations: +Before launch, we need to write config and tell Guppy which elasticsearch indices and which auth control field to use. +You could put following as your config files: ``` { @@ -32,7 +32,7 @@ You could put following as your config files: } ``` -Following script will start server using at port 3000, using config file `example_config.json`: +Following script will start server using at port 3000, using config file `example_config.json`: ``` export GUPPY_PORT=3000 @@ -41,9 +41,9 @@ npm start ``` #### Authorization -Guppy connects Arborist for authorization. -The `auth_filter_field` item in your config file is the field used for authorization. -You could set the endpoint by: +Guppy connects Arborist for authorization. +The `auth_filter_field` item in your config file is the field used for authorization. +You could set the endpoint by: ``` export GEN3_ARBORIST_ENDPOINT=${arborist_service} @@ -55,7 +55,7 @@ behavior for local test without Arborist, just set `INTERNAL_LOCAL_TEST=true`. P look into `/src/server/auth/utils.js` for more details. 
#### Tier access -Guppy also support 3 different levels of tier access, by setting `TIER_ACCESS_LEVEL`: +Guppy also support 3 different levels of tier access, by setting `TIER_ACCESS_LEVEL`: - `private` by default: only allows access to authorized resources - `regular`: allows all kind of aggregation (with limitation for unauthorized resources), but forbid access to raw data without authorization - `libre`: access to all data @@ -65,7 +65,7 @@ For `regular` level, there's another configuration environment variable `TIER_AC `regular` level commons could also take in a whitelist of values that won't be encrypted. It is set by `config.encrypt_whitelist`. By default the whitelist contains missing values: ['\_\_missing\_\_', 'unknown', 'not reported', 'no data']. Also the whitelist is disabled by default due to security reasons. If you would like to enable whitelist, simply put `enable_encrypt_whitelist: true` in your config. -For example `regular` leveled commons with config looks like this will skip encrypting value `do-not-encrypt-me` even if its count is less than `TIER_ACCESS_LIMIT`: +For example `regular` leveled commons with config looks like this will skip encrypting value `do-not-encrypt-me` even if its count is less than `TIER_ACCESS_LIMIT`: ``` { @@ -86,7 +86,7 @@ For example `regular` leveled commons with config looks like this will skip encr } ``` -For example following script will start a Guppy server with `regular` tier access level, and minimum visible count set to 100: +For example following script will start a Guppy server with `regular` tier access level, and minimum visible count set to 100: ``` export TIER_ACCESS_LEVEL=regular @@ -94,5 +94,15 @@ export TIER_ACCESS_LIMIT=100 npm start ``` +> ##### Tier Access Sensitive Record Exclusion +> It is possible to configure Guppy to hide some records from being returned in `_aggregation` queries when Tiered Access is enabled (tierAccessLevel: "regular"). 
+> The purpose of this is to "hide" information about certain sensitive resources, essentially making this an escape hatch from Tiered Access. +> Crucially, Sensitive Record Exclusion only applies to records which the user does not have access to. If the user has access to a record, it will +> be counted in the aggregation query whether or not it is sensitive. +> +> To enable Sensitive Record Exclusion, set `guppy.tier_access_sensitive_record_exclusion_field: "fieldname"` in the commons' `manifest.json`. `"fieldname"` should match a boolean field in the Elasticsearch index that indicates whether or not a record is sensitive. +> +> (E.g., `"tier_access_sensitive_record_exclusion_field": "sensitive"` in the Guppy config tells Guppy to look for a field in the ES index called `sensitive`, and to exclude records in the ES index which have `sensitive: "true"`.) + #### Download endpoint -Guppy has another special endpoint `/download` for just fetching raw data from elasticsearch. please see [here](https://github.com/uc-cdis/guppy/blob/master/doc/download.md) for more details. +Guppy has another special endpoint `/download` for just fetching raw data from elasticsearch. Please see [here](https://github.com/uc-cdis/guppy/blob/master/doc/download.md) for more details. 
diff --git a/devHelper/scripts/commands.sh b/devHelper/scripts/commands.sh index 52d9813f..1151ecff 100644 --- a/devHelper/scripts/commands.sh +++ b/devHelper/scripts/commands.sh @@ -81,7 +81,8 @@ curl -iv -X PUT "${ESHOST}/${indexName}" \ "whatever_lab_result_value": { "type": "float" }, "some_string_field": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } }, "some_integer_field": { "type": "integer" }, - "some_long_field": { "type": "long" } + "some_long_field": { "type": "long" }, + "sensitive": { "type": "keyword" } } } } @@ -102,7 +103,8 @@ curl -iv -X PUT "${ESHOST}/${fileIndexName}" \ "properties": { "file_id": { "type": "keyword" }, "gen3_resource_path": { "type": "keyword" }, - "subject_id": { "type": "keyword" } + "subject_id": { "type": "keyword" }, + "sensitive": { "type": "keyword" } } } } @@ -162,6 +164,7 @@ fileTypeList=( "mRNA Array" "Unaligned Reads" "Lipdomic MS" "Protionic MS" "1Gs fileFormatList=( "BEM" "BAM" "BED" "CSV" "FASTQ" "RAW" "TAR" "TSV" "TXT" "IDAT" "__missing__" ) resourceList=( "/programs/jnkns/projects/jenkins" "/programs/DEV/projects/test" "/programs/external/projects/test") projectList=( "jnkns-jenkins" "DEV-test" "external-test" ) +sensitiveList=( "true" "false" ) COUNT=$startIndex XDG_RUNTIME_DIR="${XDG_RUNTIME_DIR:-/tmp}" @@ -182,6 +185,7 @@ while [[ $COUNT -lt $endIndex ]]; do stringArray='["1", "2"]' intArray='[1, 2]' longNumber="10737418240" + sensitive="${sensitiveList[$(( $RANDOM % ${#sensitiveList[@]} ))]}" cat - > "$tmpName" < -## Queries +## Queries Guppy allows you to query the raw data with offset, maximum number of rows, sorting and filters (see the end of the document for how filter syntax looks). 
-Example query: +Example query: ``` { @@ -39,7 +39,7 @@ Example query: } ``` -Example result: +Example result: ``` { @@ -72,7 +72,7 @@ Example result: } ``` -Arguments: +Arguments: | argument | description | type | default | |---------------|-----------------------------------------------------------------|-------------------------------------|---------| @@ -85,12 +85,12 @@ Arguments: -## Aggregations -Aggregation query is wrapped within `_aggregation` keyword. Three possible aggregations available: +## Aggregations +Aggregation query is wrapped within `_aggregation` keyword. Three possible aggregations available: -### 1. Total count aggregation +### 1. Total count aggregation By using `_totalCount` keyword, return total count of the result. Example: ``` @@ -103,7 +103,7 @@ Aggregation query is wrapped within `_aggregation` keyword. Three possible aggre } ``` -Example result: +Example result: ``` { @@ -119,8 +119,8 @@ Example result: -### 2. Text aggregation -Text aggregation returns histogram for a text field, results are wrapped by keywords `key` and `count`, example: +### 2. Text aggregation +Text aggregation returns histogram for a text field, results are wrapped by keywords `key` and `count`, example: ``` query { @@ -164,10 +164,10 @@ Example result: -### 3. Numeric aggregation -For numeric field, aggregation can calculate ***statistical summary*** or ***histogram***. +### 3. Numeric aggregation +For numeric field, aggregation can calculate ***statistical summary*** or ***histogram***. -***Statistical summary*** includes minimum, maximum, average, sum and count for the data. Example: +***Statistical summary*** includes minimum, maximum, average, sum and count for the data. Example: ``` query($filter: JSON) { @@ -187,7 +187,7 @@ query($filter: JSON) { } ``` -Result: +Result: ``` { "data": { @@ -210,9 +210,9 @@ Result: } ``` -***Histogram*** could be built by 2 methods: giving bin width, or giving bin counts. 
+***Histogram*** could be built by 2 methods: giving bin width, or giving bin counts. - - Giving "bin width" means giving start and end value of histogram, and giving a step as bin width: + - Giving "bin width" means giving start and end value of histogram, and giving a step as bin width: | argument | description | type | default | |------------|-----------------------------|------------------|-----------| @@ -220,7 +220,7 @@ Result: | rangeEnd | ending value of histogram | integer or float | max value | | rangeStep | step for each histogram bin | integer or float | max - min | -Example: +Example: ``` query($filter: JSON) { @@ -289,7 +289,7 @@ Result: | rangeEnd | ending value of histogram | integer or float | max value | | binCount | how many bins in histogram | integer | 1 | -Example: +Example: ``` query { @@ -306,7 +306,7 @@ query { } ``` -Result: +Result: ``` { @@ -558,14 +558,14 @@ Result: -## Filters +## Filters ### Basic filter unit Currently Guppy uses `JSON`-based syntax for filters. -The JSON object key could be an operation like `=`, `>`. -A very basic filter unit would look like: `{: { : }}`. +The JSON object key could be an operation like `=`, `>`. +A very basic filter unit would look like: `{: { : }}`. One simple example could look like: ``` @@ -579,7 +579,7 @@ One simple example could look like: ``` -Currently we support following operators: +Currently we support following operators: | operator | meaning | support field type | example | @@ -598,7 +598,7 @@ Currently we support following operators: ### A search unit in filter -You could add a search unit into your filter, the syntax looks like: +You could add a search unit into your filter, the syntax looks like: ``` { @@ -609,13 +609,13 @@ You could add a search unit into your filter, the syntax looks like: } ``` -Notice that `keyword` is required. 
But `fields` is optional, -and if not set, guppy will search thru all analyzed text fields that matched the suffix pattern set in `ANALYZED_TEXT_FIELD_SUFFIX` (by default `.analyzed`, which means search thru all `*.analyzed` fields). +Notice that `keyword` is required. But `fields` is optional, +and if not set, guppy will search thru all analyzed text fields that matched the suffix pattern set in `ANALYZED_TEXT_FIELD_SUFFIX` (by default `.analyzed`, which means search thru all `*.analyzed` fields). #### Matched results and highlights -Guppy will return matched fields and highlight partial in `_matched` keyword, -with the matched field name, and highlighted partial words wrapped inside `` tags. -A example search filter: +Guppy will return matched fields and highlight partial in `_matched` keyword, +with the matched field name, and highlighted partial words wrapped inside `` tags. +A example search filter: ``` query ($filter: JSON) { @@ -743,10 +743,10 @@ In future Guppy will support `SQL` like syntax for filter, like ` -## Some other queries and arguments +## Some other queries and arguments ### Mapping query -Mapping query simply returns all fields under a doc type. Example: +Mapping query simply returns all fields under a doc type. Example: ``` { _mapping { @@ -756,7 +756,7 @@ Mapping query simply returns all fields under a doc type. Example: } ``` -Result: +Result: ``` { @@ -789,10 +789,10 @@ Result: } ``` - + ### "accessibility" argument for "regular" tier access level -When choose "regular" mode for for tier access level, `accessibility` argument will be valid for raw data or aggregation query. It support 3 enum values: `all`, `accessible`, and `unaccessible`. And will return data by those three accessibility types. By default it is set to `all`. Below are the different behaviors for each enum value. +When choose "regular" mode for for tier access level, `accessibility` argument will be valid for raw data or aggregation query. 
It support 3 enum values: `all`, `accessible`, and `unaccessible`. And will return data by those three accessibility types. By default it is set to `all`. Below are the different behaviors for each enum value. | enum | description | when query raw data | when query aggregation | |--------------|----------------------------|----------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------| @@ -800,7 +800,7 @@ When choose "regular" mode for for tier access level, `accessibility` argument w | accessible | return aggregation for accessible data | Only returns data that user has access to. | Only returns aggregation result that user has access to. | | unaccessible | return aggregation for unaccessible data | Always returns 401 | Returns aggregation result.Maximum visible number may apply according to `TIER_ACCESS_LIMIT` | -Example 1 (trying to get raw data for unaccessible resources is forbidden): +Example 1 (trying to get raw data for unaccessible resources is forbidden): ``` query { subject (accessibility: unaccessible) { @@ -809,7 +809,7 @@ query { } ``` -Result: +Result: ``` { "errors": [ @@ -833,7 +833,7 @@ Result: } ``` -Example 2 (trying to get aggregation for unaccessible resources): +Example 2 (trying to get aggregation for unaccessible resources): ``` query { @@ -850,7 +850,7 @@ query { } ``` -Result: +Result: ``` { @@ -871,10 +871,48 @@ Result: } ``` +### Tiered Access Sensitive Record Exclusion +It is possible to configure Guppy to hide some records from being returned in `_aggregation` queries when Tiered Access is enabled (tierAccessLevel: "regular"). +The purpose of this is to "hide" information about certain sensitive resources, essentially making this an escape hatch from Tiered Access. 
Specifically, this feature is used in a Gen3 data commons to hide the existence of some studies from clients who do not have access, while keeping the features of Tiered Access for the non-sensitive studies. + +Crucially, Sensitive Record Exclusion only applies to records which the user does not have access to. If the user has access to a record, it will +be counted in the aggregation query whether or not it is sensitive. + +To enable Sensitive Record Exclusion, set `guppy.tier_access_sensitive_record_exclusion_field: "fieldname"` in the commons' `manifest.json`. `"fieldname"` should match a boolean field in the Elasticsearch index that indicates whether or not a record is sensitive. +(E.g., `"tier_access_sensitive_record_exclusion_field": "sensitive"` in the Guppy config tells Guppy to look for a field in the ES index called `sensitive`, and to exclude records in the ES index which have `sensitive: "true"`.) + + +> Example: We have an index called "subject" with `100` records in it. Of those, `55` are inaccessible to this user. +> Of the inaccessible records, `15` records are sensitive. There are also `5` other sensitive records which are accessible to the user. +> +> __What will Guppy return when we ask for a total count of all records in the index?__ +> ``` +> query { +> _aggregation{ +> $indexName(accessibility: all) { +> _totalCount +> } +> } +> } +>``` +> * Expected output: +> ``` +> { +> "data": { +> "_aggregation": { +> "$indexName": { +> "_totalCount": 85 +> } +> } +> } +> } +>``` +> If Sensitive Record Exclusion is enabled, Guppy will return `85`, instead of `100`. This is because Guppy excludes the `15` sensitive records that are not accessible to the user. Importantly, Guppy does not exclude the `5` sensitive records which are accessible to the user. +> ### `filterSelf` -In some UI scenarios, there's need that aggregation should skip applying filters on those fields that appear in filter object. 
For example, in Guppy's filter UI component, when user select `gender=female`, the aggregation (with filter object include `gender=female`) should return all gender values including "female", "male", and "unknown" etc., because filter UI still need to render those options. +In some UI scenarios, there's need that aggregation should skip applying filters on those fields that appear in filter object. For example, in Guppy's filter UI component, when user select `gender=female`, the aggregation (with filter object include `gender=female`) should return all gender values including "female", "male", and "unknown" etc., because filter UI still need to render those options. -In order to skip applying filters for those fields, simply add `filterSelf=false`. +In order to skip applying filters for those fields, simply add `filterSelf=false`. Example without setting `filterSelf` (default is `true`): ``` @@ -892,7 +930,7 @@ query { } ``` -Result: +Result: ``` { @@ -913,7 +951,7 @@ Result: } ``` -Example with `filterSelf: false`: +Example with `filterSelf: false`: ``` query { diff --git a/src/server/config.js b/src/server/config.js index f117ecb9..8a930bf1 100644 --- a/src/server/config.js +++ b/src/server/config.js @@ -32,6 +32,7 @@ const config = { arboristEndpoint: 'http://arborist-service', tierAccessLevel: 'private', tierAccessLimit: 1000, + tierAccessSensitiveRecordExclusionField: inputConfig.tier_access_sensitive_record_exclusion_field, logLevel: 'INFO', enableEncryptWhiteList: typeof inputConfig.enable_encrypt_whitelist === 'undefined' ? 
false : inputConfig.enable_encrypt_whitelist, encryptWhitelist: inputConfig.encrypt_whitelist || ['__missing__', 'unknown', 'not reported', 'no data'], diff --git a/src/server/middlewares/tierAccessMiddleware/index.js b/src/server/middlewares/tierAccessMiddleware/index.js index 7c24e0ca..72eb896e 100644 --- a/src/server/middlewares/tierAccessMiddleware/index.js +++ b/src/server/middlewares/tierAccessMiddleware/index.js @@ -4,7 +4,7 @@ import log from '../../logger'; import config from '../../config'; import esInstance from '../../es/index'; import CodedError from '../../utils/error'; -import { firstLetterUpperCase, isWhitelisted } from '../../utils/utils'; +import { firstLetterUpperCase, isWhitelisted, addTwoFilters } from '../../utils/utils'; const ENCRYPT_COUNT = -1; @@ -76,15 +76,77 @@ const tierAccessResolver = ( * For `accessible`, we will apply auth filter on top of filter argument * For `unaccessible`, we apply unaccessible filters on top of filter argument */ + const sensitiveRecordExclusionEnabled = !!config.tierAccessSensitiveRecordExclusionField; if (accessibility === 'all') { - return resolve(root, { ...args, needEncryptAgg: true }, context, info); + if (sensitiveRecordExclusionEnabled) { + // Sensitive study exclusion is enabled: For all of the projects user does + // not have access to, hide the studies marked 'sensitive' from the aggregation. 
+ // (See doc/queries.md#Tiered_Access_sensitive_record_exclusion) + const projectsUserHasAccessTo = authHelper.getAccessibleResources(); + const sensitiveStudiesFilter = { + OR: [ + { + IN: { + [config.esConfig.authFilterField]: projectsUserHasAccessTo, + }, + }, + { + '!=': { + [config.tierAccessSensitiveRecordExclusionField]: 'true', + }, + }, + ], + }; + return resolve( + root, + { + ...args, + filter: addTwoFilters(filter, sensitiveStudiesFilter), + needEncryptAgg: true, + }, + context, + info, + ); + } + + return resolve( + root, + { + ...args, + filter, + needEncryptAgg: true, + }, + context, + info, + ); } if (accessibility === 'accessible') { + // We do not need to apply sensitive studies filter here, because + // user has access to all of these projects. log.debug('[tierAccessResolver] applying "accessible" to resolver'); return resolverWithAccessibleFilterApplied( resolve, root, args, context, info, authHelper, filter, ); } + // The below code executes if accessibility === 'unaccessible'. + if (sensitiveRecordExclusionEnabled) { + // Apply sensitive studies filter. Hide the studies marked 'sensitive' from + // the aggregation. + const sensitiveStudiesFilter = { + '!=': { + [config.tierAccessSensitiveRecordExclusionField]: 'true', + }, + }; + return resolverWithUnaccessibleFilterApplied( + resolve, + root, + args, + context, + info, + authHelper, + addTwoFilters(filter, sensitiveStudiesFilter), + ); + } return resolverWithUnaccessibleFilterApplied( resolve, root, args, context, info, authHelper, filter, );