diff --git a/devHelper/scripts/commands.sh b/devHelper/scripts/commands.sh
index 22b6def1..52d9813f 100644
--- a/devHelper/scripts/commands.sh
+++ b/devHelper/scripts/commands.sh
@@ -34,28 +34,52 @@ curl -iv -X PUT "${ESHOST}/${indexName}" \
 -H 'Content-Type: application/json' -d'
 {
     "settings" : {
-        "index" : {
-            "number_of_shards" : 1,
-            "number_of_replicas" : 0
+      "index" : {
+        "number_of_shards" : 1,
+        "number_of_replicas" : 0,
+        "analysis": {
+          "tokenizer": {
+            "ngram_tokenizer": {
+              "type": "ngram",
+              "min_gram": 2,
+              "max_gram": 20,
+              "token_chars": [ "letter", "digit" ]
+            }
+          },
+          "analyzer": {
+            "ngram_analyzer": {
+              "type": "custom",
+              "tokenizer": "ngram_tokenizer",
+              "filter": [
+                "lowercase"
+              ]
+            },
+            "search_analyzer": {
+              "type": "custom",
+              "tokenizer": "keyword",
+              "filter": "lowercase"
+            }
+          }
         }
+      }
     },
     "mappings": {
       "subject": {
         "properties": {
-          "subject_id": { "type": "keyword" },
-          "name": { "type": "text" },
-          "project": { "type": "keyword" },
-          "study": { "type": "keyword" },
-          "gender": { "type": "keyword" },
-          "race": { "type": "keyword" },
-          "ethnicity": { "type": "keyword" },
-          "vital_status": { "type": "keyword" },
-          "file_type": { "type": "keyword" },
-          "file_format": { "type": "keyword" },
-          "gen3_resource_path": { "type": "keyword" },
+          "subject_id": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+          "name": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+          "project": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+          "study": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+          "gender": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+          "race": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+          "ethnicity": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+          "vital_status": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+          "file_type": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+          "file_format": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+          "gen3_resource_path": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
           "file_count": { "type": "integer" },
           "whatever_lab_result_value": { "type": "float" },
-          "some_string_field": { "type": "keyword" },
+          "some_string_field": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
"keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } }, "some_integer_field": { "type": "integer" }, "some_long_field": { "type": "long" } } diff --git a/doc/queries.md b/doc/queries.md index 19654315..0678d6da 100644 --- a/doc/queries.md +++ b/doc/queries.md @@ -8,6 +8,9 @@ Table of Contents - [Numeric Aggregation](#aggs-numeric) - [Nested Aggregation](#aggs-nested) - [Filters](#filter) + - [Basic Filter Unit](#filter-unit) + - [Text Search Unit in Filter](#filter-search) + - [Combined Filters](#filter-comb) - [Some other queries and arguments](#other) @@ -556,15 +559,136 @@ Result: ## Filters -Currently Guppy uses `JSON`-based syntax for filters. The JSON object key could be an operation like `=`, `>`. One simple example could be: + + + +### Basic filter unit +Currently Guppy uses `JSON`-based syntax for filters. +The JSON object key could be an operation like `=`, `>`. +A very basic filter unit would look like: `{: { : }}`. +One simple example could look like: + +``` +{ + "filter": { + "=": { + "subject_id": "sbj_69" + } + } +} +``` + + +Currently we support following operators: + + +| operator | meaning | support field type | example | +|--------------|--------------------------|--------------------|------------------------------------------------------------------| +| eq, EQ, = | equal | string, number | {"eq": {"gender": "female"}} | +| in, IN | inside | string, number | {"in": {"gender": ["female", "F"]}} | +| != | is not | string, number | {"!=": {"gender": "male"}} | +| gt, GT, > | greater than | number | {">": {"age": 50}} | +| gte, GTE, >= | greater than or equal to | number | {">=": {"age": 50}} | +| lt, LT, < | less then | number | {"<": {"age": 50}} | +| lte, LTE, <= | less than or equal to | number | {"<=": {"age": 50}} | +| search | [search text](#filter-search) | text | {"search": {"keyword": "asian","fields": ["race", "ethnicity"]}} | + + + + + +### A search unit in filter +You could add a search unit into your filter, the syntax looks like: + +``` +{ + "search": { + "keyword": , + "fields": + } +} +``` + +Notice that `keyword` is required. But `fields` is optional, +and if not set, guppy will search thru all analyzed text fields that matched the suffix pattern set in `ANALYZED_TEXT_FIELD_SUFFIX` (by default `.analyzed`, which means search thru all `*.analyzed` fields). + +#### Matched results and highlights +Guppy will return matched fields and highlight partial in `_matched` keyword, +with the matched field name, and highlighted partial words wrapped inside `` tags. +A example search filter: + +``` +query ($filter: JSON) { + subject (filter: $filter, first: 20) { + gender + race + ethnicity + _matched { + field + highlights + } + } +} +``` + +with variable: ``` { - "filter": {"=": {"subject_id": "69"}} + "filter": { + "search": { + "keyword": "asia", + "fields": "race" + } + } } ``` -Or you could use binary combination (`AND` or `OR`)to combine simple filter units into more complicated big filters. Example: +example result: + +``` +{ + "data": { + "subject": [ + { + "gender": "female", + "race": "asian", + "ethnicity": "__missing__", + "_matched": [ + { + "field": "race", + "highlights": [ + "asian" + ] + } + ] + }, + { + "gender": "male", + "race": "asian", + "ethnicity": "White", + "_matched": [ + { + "field": "race", + "highlights": [ + "asian" + ] + } + ] + }, + ... 
+ ] + } +} +``` + + + + + + +### Combine into advanced filters +You could use binary combination (`AND` or `OR`) to combine simple filter units into more complicated big filters. Example: ``` { diff --git a/src/server/__mocks__/config.js b/src/server/__mocks__/config.js index ceb10289..4b66f9da 100644 --- a/src/server/__mocks__/config.js +++ b/src/server/__mocks__/config.js @@ -22,6 +22,8 @@ const config = { tierAccessLevel: 'regular', tierAccessLimit: 20, arboristEndpoint: 'http://mock-arborist', + analyzedTextFieldSuffix: '.analyzed', + matchedTextHighlightTagName: 'em', }; export default config; diff --git a/src/server/__mocks__/mockDataFromES.js b/src/server/__mocks__/mockDataFromES.js index 64228310..0078d2ac 100644 --- a/src/server/__mocks__/mockDataFromES.js +++ b/src/server/__mocks__/mockDataFromES.js @@ -32,6 +32,17 @@ const mockResourcePath = () => { }, }, }, + highlight: { + pre_tags: [ + '', + ], + post_tags: [ + '', + ], + fields: { + '*.analyzed': {}, + }, + }, }; const fakeResource = { aggregations: { @@ -101,6 +112,17 @@ const mockResourcePath = () => { }, }, }, + highlight: { + pre_tags: [ + '', + ], + post_tags: [ + '', + ], + fields: { + '*.analyzed': {}, + }, + }, }; const fakeResourceWithFilter1 = { aggregations: { @@ -155,6 +177,17 @@ const mockResourcePath = () => { }, }, }, + highlight: { + pre_tags: [ + '', + ], + post_tags: [ + '', + ], + fields: { + '*.analyzed': {}, + }, + }, }; const fakeResourceWithFilter2 = { aggregations: { diff --git a/src/server/__mocks__/mockESData/utils.js b/src/server/__mocks__/mockESData/utils.js index aced2e92..9b28f5d8 100644 --- a/src/server/__mocks__/mockESData/utils.js +++ b/src/server/__mocks__/mockESData/utils.js @@ -2,9 +2,23 @@ import nock from 'nock'; import config from '../config'; const mockSearchEndpoint = (mockRequest, mockResult) => { + const mockRequestPatched = { + ...mockRequest, + highlight: { + pre_tags: [ + '', + ], + post_tags: [ + '', + ], + fields: { + '*.analyzed': {}, + }, + }, + }; nock(config.esConfig.host) .persist() - .post(/_search$/, mockRequest) + .post(/_search$/, mockRequestPatched) .reply(200, mockResult); }; diff --git a/src/server/__tests__/schema.test.js b/src/server/__tests__/schema.test.js index 81d6968a..590dd9ff 100644 --- a/src/server/__tests__/schema.test.js +++ b/src/server/__tests__/schema.test.js @@ -58,12 +58,14 @@ describe('Schema', () => { some_array_integer_field: [Int], some_array_string_field: [String], whatever_lab_result_value: Float, + _matched:[MatchedItem] } type File { gen3_resource_path: String, file_id: String, file_size: Float, subject_id: String, + _matched:[MatchedItem] }`; test('could create type schemas', async () => { await esInstance.initialize(); diff --git a/src/server/config.js b/src/server/config.js index 43fdac20..f117ecb9 100644 --- a/src/server/config.js +++ b/src/server/config.js @@ -35,6 +35,9 @@ const config = { logLevel: 'INFO', enableEncryptWhiteList: typeof inputConfig.enable_encrypt_whitelist === 'undefined' ? 
     false : inputConfig.enable_encrypt_whitelist,
   encryptWhitelist: inputConfig.encrypt_whitelist || ['__missing__', 'unknown', 'not reported', 'no data'],
+  analyzedTextFieldSuffix: '.analyzed',
+  matchedTextHighlightTagName: 'em',
+  allowedMinimumSearchLen: 2,
 };
 
 if (process.env.GEN3_ES_ENDPOINT) {
@@ -64,6 +67,10 @@ if (process.env.LOG_LEVEL) {
   config.logLevel = process.env.LOG_LEVEL;
 }
 
+if (process.env.ANALYZED_TEXT_FIELD_SUFFIX) {
+  config.analyzedTextFieldSuffix = process.env.ANALYZED_TEXT_FIELD_SUFFIX;
+}
+
 // only three options for tier access level: 'private' (default), 'regular', and 'libre'
 if (process.env.TIER_ACCESS_LEVEL) {
   if (process.env.TIER_ACCESS_LEVEL !== 'private'
diff --git a/src/server/es/__tests__/filter.test.js b/src/server/es/__tests__/filter.test.js
index d08ec32c..18583511 100644
--- a/src/server/es/__tests__/filter.test.js
+++ b/src/server/es/__tests__/filter.test.js
@@ -4,6 +4,7 @@ import { UserInputError } from 'apollo-server';
 import getFilterObj from '../filter';
 import esInstance from '../index';
 import setupMockDataEndpoint from '../../__mocks__/mockDataFromES';
+import config from '../../config';
 
 jest.mock('../../config');
 jest.mock('../../logger');
@@ -181,6 +182,28 @@ describe('Transfer GraphQL filter to ES filter, filter unit', () => {
     expect(resultESFilter3).toEqual(expectedESFilter);
   });
 
+  test('could transfer graphql filter to ES filter object, "search" operator', async () => {
+    await esInstance.initialize();
+    const keyword = 'male';
+    const gqlFilter1 = { search: { keyword } };
+    const resultESFilter1 = getFilterObj(esInstance, esIndex, esType, gqlFilter1);
+    const expectedESFilter1 = { multi_match: { query: keyword, fields: [`*${config.analyzedTextFieldSuffix}`] } };
+    expect(resultESFilter1).toEqual(expectedESFilter1);
+
+    const targetFields = ['gender', 'name'];
+    const gqlFilter2 = { search: { keyword, fields: targetFields } };
+    const resultESFilter2 = getFilterObj(esInstance, esIndex, esType, gqlFilter2);
+    const expectedTargetFields = targetFields.map(f => `${f}${config.analyzedTextFieldSuffix}`);
+    const expectedESFilter2 = { multi_match: { query: keyword, fields: expectedTargetFields } };
+    expect(resultESFilter2).toEqual(expectedESFilter2);
+
+    const targetField = 'gender';
+    const gqlFilter3 = { search: { keyword, fields: targetField } };
+    const resultESFilter3 = getFilterObj(esInstance, esIndex, esType, gqlFilter3);
+    const expectedESFilter3 = { multi_match: { query: keyword, fields: [`${targetField}${config.analyzedTextFieldSuffix}`] } };
+    expect(resultESFilter3).toEqual(expectedESFilter3);
+  });
+
   test('could throw err for invalid operator', async () => {
     await esInstance.initialize();
 
diff --git a/src/server/es/filter.js b/src/server/es/filter.js
index 9a2aac59..b9d51aac 100644
--- a/src/server/es/filter.js
+++ b/src/server/es/filter.js
@@ -182,6 +182,30 @@ const getFilterItemForNumbers = (op, field, value) => {
   throw new UserInputError(`Invalid numeric operation "${op}" for field "${field}" in filter argument`);
 };
 
+const getESSearchFilterFragment = (esInstance, esIndex, fields, keyword) => {
+  let analyzedFields = [`*${config.analyzedTextFieldSuffix}`]; // search all fields by default
+  if (typeof fields !== 'undefined') {
+    if (typeof fields === 'string') {
+      fields = [fields]; // eslint-disable-line no-param-reassign
+    }
+    // Check fields are valid
+    fields.forEach((f) => {
+      if (!esInstance.fieldTypes[esIndex]) {
+        throw new UserInputError(`es index ${esIndex} doesn't exist`);
+      } else if (!esInstance.fieldTypes[esIndex][f]) {
+        throw new UserInputError(`invalid field ${f} in "filter" variable`);
+      }
+    });
+    analyzedFields = fields.map(f => `${f}${config.analyzedTextFieldSuffix}`);
+  }
+  return {
+    multi_match: {
+      query: keyword,
+      fields: analyzedFields,
+    },
+  };
+};
+
 /**
  * This function transfer graphql filter arg to ES filter object
  * It first parse graphql filter object recursively from top to down,
@@ -236,6 +260,23 @@ const getFilterObj = (
         },
       },
     };
   }
+  } else if (topLevelOpLowerCase === 'search') {
+    if (!('keyword' in graphqlFilterObj[topLevelOp])) { // "keyword" required
+      throw new UserInputError('Invalid search filter syntax: missing \'keyword\' field');
+    }
+    Object.keys(graphqlFilterObj[topLevelOp]).forEach((o) => { // check filter syntax
+      if (o !== 'keyword' && o !== 'fields') {
+        throw new UserInputError(`Invalid search filter syntax: unrecognized field '${o}'`);
+      }
+    });
+    const targetSearchKeyword = graphqlFilterObj[topLevelOp].keyword;
+    if (targetSearchKeyword.length < config.allowedMinimumSearchLen) {
+      throw new UserInputError(`Keyword too short (length < ${config.allowedMinimumSearchLen})`);
+    }
+    const targetSearchFields = graphqlFilterObj[topLevelOp].fields;
+    resultFilterObj = getESSearchFilterFragment(
+      esInstance, esIndex, targetSearchFields, targetSearchKeyword,
+    );
   } else {
     const field = Object.keys(graphqlFilterObj[topLevelOp])[0];
     if (aggsField === field && !filterSelf) {
diff --git a/src/server/es/index.js b/src/server/es/index.js
index 7dbfad49..456378d6 100644
--- a/src/server/es/index.js
+++ b/src/server/es/index.js
@@ -40,6 +40,17 @@ class ES {
         validatedQueryBody[key] = queryBody[key];
       }
     });
+    validatedQueryBody.highlight = {
+      pre_tags: [
+        `<${config.matchedTextHighlightTagName}>`,
+      ],
+      post_tags: [
+        `</${config.matchedTextHighlightTagName}>`,
+      ],
+      fields: {
+        [`*${config.analyzedTextFieldSuffix}`]: {},
+      },
+    };
     log.info('[ES.query] query body: ', JSON.stringify(validatedQueryBody));
     return this.client.search({
       index: esIndex,
@@ -348,7 +359,32 @@ class ES {
         filter, fields, sort, offset, size,
       },
     );
-    return result.hits.hits.map(item => item._source);
+    const { hits } = result.hits;
+    const hitsWithMatchedResults = hits.map((h) => {
+      if (!('highlight' in h)) {
+        // ES doesn't return "highlight"
+        return h._source;
+      }
+      // ES returns highlight, transform it into the "_matched" schema
+      const matchedList = Object.keys(h.highlight).map((f) => {
+        let field = f;
+        if (f.endsWith(config.analyzedTextFieldSuffix)) {
+          // remove ".analyzed" suffix from field name
+          field = f.substr(0, f.length - config.analyzedTextFieldSuffix.length);
+        }
+        return {
+          field,
+          // just use ES highlights' format: a list of strings,
+          // with the matched parts emphasized with `<em>` tags
+          highlights: h.highlight[f],
+        };
+      });
+      return {
+        ...h._source,
+        _matched: matchedList,
+      };
+    });
+    return hitsWithMatchedResults;
   }
 
   downloadData({
diff --git a/src/server/resolvers.js b/src/server/resolvers.js
index a60ecf32..5281768c 100644
--- a/src/server/resolvers.js
+++ b/src/server/resolvers.js
@@ -9,9 +9,11 @@ import { firstLetterUpperCase } from './utils/utils';
  * @returns {string[]} parsed fields
  */
 const parseFieldsFromTypeResolveInfo = (resolveInfo) => {
+  const reservedNames = ['_matched']; // This is for search results
   const parsedInfo = parseResolveInfo(resolveInfo);
   const typeName = firstLetterUpperCase(parsedInfo.name);
-  const fields = Object.keys(parsedInfo.fieldsByTypeName[typeName]);
+  const fields = Object.keys(parsedInfo.fieldsByTypeName[typeName])
+    .filter(f => !reservedNames.includes(f));
   return fields;
 };
 
diff --git a/src/server/schema.js b/src/server/schema.js
index 24e3df98..f22be10b 100644
--- a/src/server/schema.js
+++ b/src/server/schema.js
@@ -74,6 +74,7 @@ const getTypeSchemaForOneIndex = (esInstance, esIndex, esType) => {
   const typeSchema = `
     type ${esTypeObjName} {
       ${fieldGQLTypeMap.map(entry => `${entry.field}: ${entry.type},`).join('\n')}
+      _matched: [MatchedItem]
     }
   `;
   return typeSchema;
@@ -126,6 +127,13 @@ export const getMappingSchema = esConfig => `
 export const buildSchemaString = (esConfig, esInstance) => {
   const querySchema = getQuerySchema(esConfig);
 
+  const matchedItemSchema = `
+    type MatchedItem {
+      field: String
+      highlights: [String]
+    }
+  `;
+
   const typesSchemas = getTypesSchemas(esConfig, esInstance);
 
   const accessibilityEnum = `
@@ -206,6 +214,7 @@ export const buildSchemaString = (esConfig, esInstance) => {
   const schemaStr = `
     scalar JSON
 
+    ${matchedItemSchema}
     ${querySchema}
     ${accessibilityEnum}
     ${typesSchemas}
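
Putting the pieces above together: per `getESSearchFilterFragment` in `src/server/es/filter.js` and the `highlight` block that `ES.query` now appends in `src/server/es/index.js`, a Guppy filter such as `{"search": {"keyword": "asia", "fields": ["race"]}}` should end up in an Elasticsearch `_search` request body roughly like the sketch below. Only the `multi_match` and `highlight` parts are taken directly from this diff; the surrounding `size`/`query` wrapping (here matching the `first: 20` example in `doc/queries.md`) is simplified for illustration and may differ from the exact body Guppy builds.

```
{
  "size": 20,
  "query": {
    "multi_match": {
      "query": "asia",
      "fields": ["race.analyzed"]
    }
  },
  "highlight": {
    "pre_tags": ["<em>"],
    "post_tags": ["</em>"],
    "fields": {
      "*.analyzed": {}
    }
  }
}
```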