36 changes: 23 additions & 13 deletions README.md

[![npm (scoped)](https://img.shields.io/npm/v/@gen3/guppy)](https://www.npmjs.com/package/@gen3/guppy)

A server that supports GraphQL queries on data from Elasticsearch.

Please see [this doc](https://github.com/uc-cdis/guppy/blob/master/doc/queries.md) for the syntax Guppy supports.

Run `npm start` to start the server on port 80.

### Configurations:
Before launch, we need to write a config file that tells Guppy which Elasticsearch indices and which auth control field to use.
You could put the following in your config file:

```
{
  ... (config contents collapsed in the diff view) ...
}
```
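
The actual config contents are collapsed in the diff view above. For orientation only, here is a minimal sketch of what such a file might look like; the index and type names are hypothetical placeholders, and only `auth_filter_field` is taken from this README:

```
{
  "indices": [
    {
      "index": "gen3-dev-subject",
      "type": "subject"
    },
    {
      "index": "gen3-dev-file",
      "type": "file"
    }
  ],
  "auth_filter_field": "gen3_resource_path"
}
```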

The following script will start the server on port 3000, using the config file `example_config.json`:

```
export GUPPY_PORT=3000
# ... (one line collapsed in the diff view) ...
npm start
```
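
One line is collapsed in the diff view above: the export that points Guppy at the config file. As a sketch, assuming the environment variable is named `GUPPY_CONFIG_FILEPATH` (an assumption, not shown in this excerpt), the full startup would be:

```
export GUPPY_PORT=3000
export GUPPY_CONFIG_FILEPATH=./example_config.json
npm start
```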

#### Authorization
Guppy connects to Arborist for authorization.
The `auth_filter_field` item in your config file is the field used for authorization.
You can set the Arborist endpoint with:

```
export GEN3_ARBORIST_ENDPOINT=${arborist_service}
```

To mock this behavior for a local test without Arborist, just set `INTERNAL_LOCAL_TEST=true`. Please look into `/src/server/auth/utils.js` for more details.
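
For instance, a local run that bypasses Arborist entirely just combines that flag with the usual start command:

```
export INTERNAL_LOCAL_TEST=true
npm start
```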

#### Tier access
Guppy also supports 3 different levels of tier access, set by `TIER_ACCESS_LEVEL`:
- `private` (default): only allows access to authorized resources
- `regular`: allows all kinds of aggregation (with limitations for unauthorized resources), but forbids access to raw data without authorization
- `libre`: access to all data

For the `regular` level, there is another configuration environment variable, `TIER_ACCESS_LIMIT`, which sets the minimum visible count for aggregation results.
A `regular`-level commons can also take in a whitelist of values that won't be encrypted, set by `config.encrypt_whitelist`.
By default the whitelist contains missing values: ['\_\_missing\_\_', 'unknown', 'not reported', 'no data'].
The whitelist is also disabled by default for security reasons; if you would like to enable it, simply put `enable_encrypt_whitelist: true` in your config.
For example, a `regular`-level commons with a config like the following will skip encrypting the value `do-not-encrypt-me` even if its count is less than `TIER_ACCESS_LIMIT`:

```
{
  ... (config contents collapsed in the diff view) ...
}
```
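
The example config is collapsed in the diff view above; a minimal sketch of just the whitelist keys (the `indices` and `auth_filter_field` entries described earlier would sit alongside them), reusing the `do-not-encrypt-me` value from the sentence above:

```
{
  "enable_encrypt_whitelist": true,
  "encrypt_whitelist": [ "do-not-encrypt-me" ]
}
```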

For example, the following script will start a Guppy server with `regular` tier access level and the minimum visible count set to 100:

```
export TIER_ACCESS_LEVEL=regular
export TIER_ACCESS_LIMIT=100
npm start
```

> ##### Tier Access Sensitive Record Exclusion
> It is possible to configure Guppy to hide some records from being returned in `_aggregation` queries when Tiered Access is enabled (tierAccessLevel: "regular").
> The purpose of this is to "hide" information about certain sensitive resources, essentially making this an escape hatch from Tiered Access.
> Crucially, Sensitive Record Exclusion only applies to records which the user does not have access to. If the user has access to a record, it will
> be counted in the aggregation query whether or not it is sensitive.
>
> To enable Sensitive Record Exclusion, set `guppy.tier_access_sensitive_record_exclusion_field: "fieldname"` in the commons' `manifest.json`. The field name should match a boolean field in the Elasticsearch index that indicates whether or not a record is sensitive.
>
> (E.g., `"tier_access_sensitive_record_exclusion_field": "sensitive"` in the Guppy config tells Guppy to look for a field in the ES index called `sensitive`, and to exclude records in the ES index which have `sensitive: "true"`)
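>
> For example, such a setting might sit in `manifest.json` roughly as follows (the surrounding manifest structure is an assumption; the key and field name come from this note):
>
> ```
> "guppy": {
>   "tier_access_sensitive_record_exclusion_field": "sensitive"
> }
> ```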

#### Download endpoint
Guppy has another special endpoint, `/download`, for fetching raw data from Elasticsearch. Please see [here](https://github.com/uc-cdis/guppy/blob/master/doc/download.md) for more details.
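
As a rough illustration only (the request body shape below is an assumption; the linked doc is authoritative), a `/download` call for the subject index might look like:

```
curl -X POST http://localhost:3000/download \
  -H 'Content-Type: application/json' \
  -d '{ "type": "subject", "fields": ["subject_id", "gen3_resource_path"], "filter": {} }'
```
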
15 changes: 11 additions & 4 deletions devHelper/scripts/commands.sh
@@ -81,7 +81,8 @@ curl -iv -X PUT "${ESHOST}/${indexName}" \
"whatever_lab_result_value": { "type": "float" },
"some_string_field": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
"some_integer_field": { "type": "integer" },
"some_long_field": { "type": "long" }
"some_long_field": { "type": "long" },
"sensitive": { "type": "keyword" }
}
}
}
@@ -102,7 +103,8 @@ curl -iv -X PUT "${ESHOST}/${fileIndexName}" \
"properties": {
"file_id": { "type": "keyword" },
"gen3_resource_path": { "type": "keyword" },
"subject_id": { "type": "keyword" }
"subject_id": { "type": "keyword" },
"sensitive": { "type": "keyword" }
}
}
}
@@ -162,6 +164,7 @@ fileTypeList=( "mRNA Array" "Unaligned Reads" "Lipdomic MS" "Protionic MS" "1Gs
fileFormatList=( "BEM" "BAM" "BED" "CSV" "FASTQ" "RAW" "TAR" "TSV" "TXT" "IDAT" "__missing__" )
resourceList=( "/programs/jnkns/projects/jenkins" "/programs/DEV/projects/test" "/programs/external/projects/test")
projectList=( "jnkns-jenkins" "DEV-test" "external-test" )
sensitiveList=( "true" "false" )

COUNT=$startIndex
XDG_RUNTIME_DIR="${XDG_RUNTIME_DIR:-/tmp}"
@@ -182,6 +185,7 @@ while [[ $COUNT -lt $endIndex ]]; do
stringArray='["1", "2"]'
intArray='[1, 2]'
longNumber="10737418240"
sensitive="${sensitiveList[$(( $RANDOM % ${#sensitiveList[@]} ))]}"

cat - > "$tmpName" <<EOM
{
@@ -200,7 +204,9 @@ while [[ $COUNT -lt $endIndex ]]; do
"whatever_lab_result_value": $randomFloatNumber,
"some_string_field": $stringArray,
"some_integer_field": $intArray,
"some_long_field": $longNumber
"some_long_field": $longNumber,
"sensitive": $sensitive

}
EOM
cat - $tmpName <<EOM
@@ -214,7 +220,8 @@
{
"subject_id": "$COUNT",
"gen3_resource_path": "${resourceName}",
"file_id": "file_id_$(( $RANDOM % 1000 ))"
"file_id": "file_id_$(( $RANDOM % 1000 ))",
"sensitive": $sensitive
}
EOM
cat - $tmpName <<EOM