diff --git a/devHelper/scripts/commands.sh b/devHelper/scripts/commands.sh
index 22b6def1..52d9813f 100644
--- a/devHelper/scripts/commands.sh
+++ b/devHelper/scripts/commands.sh
@@ -34,28 +34,52 @@ curl -iv -X PUT "${ESHOST}/${indexName}" \
-H 'Content-Type: application/json' -d'
{
"settings" : {
- "index" : {
- "number_of_shards" : 1,
- "number_of_replicas" : 0
+ "index" : {
+ "number_of_shards" : 1,
+ "number_of_replicas" : 0,
+ "analysis": {
+ "tokenizer": {
+ "ngram_tokenizer": {
+ "type": "ngram",
+ "min_gram": 2,
+ "max_gram": 20,
+ "token_chars": [ "letter", "digit" ]
+ }
+ },
+ "analyzer": {
+ "ngram_analyzer": {
+ "type": "custom",
+ "tokenizer": "ngram_tokenizer",
+ "filter": [
+ "lowercase"
+ ]
+ },
+ "search_analyzer": {
+ "type": "custom",
+ "tokenizer": "keyword",
+ "filter": "lowercase"
+ }
+ }
}
+ }
},
"mappings": {
"subject": {
"properties": {
- "subject_id": { "type": "keyword" },
- "name": { "type": "text" },
- "project": { "type": "keyword" },
- "study": { "type": "keyword" },
- "gender": { "type": "keyword" },
- "race": { "type": "keyword" },
- "ethnicity": { "type": "keyword" },
- "vital_status": { "type": "keyword" },
- "file_type": { "type": "keyword" },
- "file_format": { "type": "keyword" },
- "gen3_resource_path": { "type": "keyword" },
+ "subject_id": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+ "name": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+ "project": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+ "study": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+ "gender": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+ "race": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+ "ethnicity": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+ "vital_status": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+ "file_type": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+ "file_format": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
+ "gen3_resource_path": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
"file_count": { "type": "integer" },
"whatever_lab_result_value": { "type": "float" },
- "some_string_field": { "type": "keyword" },
+ "some_string_field": { "type": "keyword", "fields": { "analyzed": {"type": "text", "analyzer": "ngram_analyzer", "search_analyzer": "search_analyzer", "term_vector": "with_positions_offsets"} } },
"some_integer_field": { "type": "integer" },
"some_long_field": { "type": "long" }
}
diff --git a/doc/queries.md b/doc/queries.md
index 19654315..0678d6da 100644
--- a/doc/queries.md
+++ b/doc/queries.md
@@ -8,6 +8,9 @@ Table of Contents
- [Numeric Aggregation](#aggs-numeric)
- [Nested Aggregation](#aggs-nested)
- [Filters](#filter)
+ - [Basic Filter Unit](#filter-unit)
+ - [Text Search Unit in Filter](#filter-search)
+ - [Combined Filters](#filter-comb)
- [Some other queries and arguments](#other)
@@ -556,15 +559,136 @@ Result:
## Filters
-Currently Guppy uses `JSON`-based syntax for filters. The JSON object key could be an operation like `=`, `>`. One simple example could be:
+
+
+
+### Basic filter unit
+Currently Guppy uses `JSON`-based syntax for filters.
+The JSON object key could be an operation like `=`, `>`.
+A very basic filter unit would look like: `{<operator>: {<field_name>: <value>}}`.
+One simple example could look like:
+
+```
+{
+ "filter": {
+ "=": {
+ "subject_id": "sbj_69"
+ }
+ }
+}
+```
+
+
+Currently we support following operators:
+
+
+| operator | meaning | support field type | example |
+|--------------|--------------------------|--------------------|------------------------------------------------------------------|
+| eq, EQ, = | equal | string, number | {"eq": {"gender": "female"}} |
+| in, IN | inside | string, number | {"in": {"gender": ["female", "F"]}} |
+| != | is not | string, number | {"!=": {"gender": "male"}} |
+| gt, GT, > | greater than | number | {">": {"age": 50}} |
+| gte, GTE, >= | greater than or equal to | number | {">=": {"age": 50}} |
+| lt, LT, <    | less than                | number             | {"<": {"age": 50}}                                               |
+| lte, LTE, <= | less than or equal to | number | {"<=": {"age": 50}} |
+| search | [search text](#filter-search) | text | {"search": {"keyword": "asian","fields": ["race", "ethnicity"]}} |
+
+
+
+
+
+### A search unit in filter
+You could add a search unit into your filter, the syntax looks like:
+
+```
+{
+ "search": {
+    "keyword": <string: keyword to search for>,
+    "fields": <optional: list of field names to search within>
+ }
+}
+```
+
+Notice that `keyword` is required. But `fields` is optional,
+and if not set, Guppy will search through all analyzed text fields that match the suffix pattern set in `ANALYZED_TEXT_FIELD_SUFFIX` (by default `.analyzed`, which means searching through all `*.analyzed` fields).
+
+#### Matched results and highlights
+Guppy will return matched fields and highlight partial in `_matched` keyword,
+with the matched field name, and highlighted partial words wrapped inside `<em>` tags.
+An example search filter:
+
+```
+query ($filter: JSON) {
+ subject (filter: $filter, first: 20) {
+ gender
+ race
+ ethnicity
+ _matched {
+ field
+ highlights
+ }
+ }
+}
+```
+
+with variable:
```
{
- "filter": {"=": {"subject_id": "69"}}
+ "filter": {
+ "search": {
+ "keyword": "asia",
+ "fields": "race"
+ }
+ }
}
```
-Or you could use binary combination (`AND` or `OR`)to combine simple filter units into more complicated big filters. Example:
+example result:
+
+```
+{
+ "data": {
+ "subject": [
+ {
+ "gender": "female",
+ "race": "asian",
+ "ethnicity": "__missing__",
+ "_matched": [
+ {
+ "field": "race",
+ "highlights": [
+ "asian"
+ ]
+ }
+ ]
+ },
+ {
+ "gender": "male",
+ "race": "asian",
+ "ethnicity": "White",
+ "_matched": [
+ {
+ "field": "race",
+ "highlights": [
+ "asian"
+ ]
+ }
+ ]
+ },
+ ...
+ ]
+ }
+}
+```
+
+
+
+
+
+
+### Combine into advanced filters
+You could use binary combination (`AND` or `OR`) to combine simple filter units into more complicated big filters. Example:
```
{
diff --git a/src/server/__mocks__/config.js b/src/server/__mocks__/config.js
index ceb10289..4b66f9da 100644
--- a/src/server/__mocks__/config.js
+++ b/src/server/__mocks__/config.js
@@ -22,6 +22,8 @@ const config = {
tierAccessLevel: 'regular',
tierAccessLimit: 20,
arboristEndpoint: 'http://mock-arborist',
+ analyzedTextFieldSuffix: '.analyzed',
+ matchedTextHighlightTagName: 'em',
};
export default config;
diff --git a/src/server/__mocks__/mockDataFromES.js b/src/server/__mocks__/mockDataFromES.js
index 64228310..0078d2ac 100644
--- a/src/server/__mocks__/mockDataFromES.js
+++ b/src/server/__mocks__/mockDataFromES.js
@@ -32,6 +32,17 @@ const mockResourcePath = () => {
},
},
},
+ highlight: {
+      pre_tags: [
+        '<em>',
+      ],
+      post_tags: [
+        '</em>',
+      ],
+ fields: {
+ '*.analyzed': {},
+ },
+ },
};
const fakeResource = {
aggregations: {
@@ -101,6 +112,17 @@ const mockResourcePath = () => {
},
},
},
+ highlight: {
+      pre_tags: [
+        '<em>',
+      ],
+      post_tags: [
+        '</em>',
+      ],
+ fields: {
+ '*.analyzed': {},
+ },
+ },
};
const fakeResourceWithFilter1 = {
aggregations: {
@@ -155,6 +177,17 @@ const mockResourcePath = () => {
},
},
},
+ highlight: {
+      pre_tags: [
+        '<em>',
+      ],
+      post_tags: [
+        '</em>',
+      ],
+ fields: {
+ '*.analyzed': {},
+ },
+ },
};
const fakeResourceWithFilter2 = {
aggregations: {
diff --git a/src/server/__mocks__/mockESData/utils.js b/src/server/__mocks__/mockESData/utils.js
index aced2e92..9b28f5d8 100644
--- a/src/server/__mocks__/mockESData/utils.js
+++ b/src/server/__mocks__/mockESData/utils.js
@@ -2,9 +2,23 @@ import nock from 'nock';
import config from '../config';
const mockSearchEndpoint = (mockRequest, mockResult) => {
+ const mockRequestPatched = {
+ ...mockRequest,
+ highlight: {
+      pre_tags: [
+        '<em>',
+      ],
+      post_tags: [
+        '</em>',
+      ],
+ fields: {
+ '*.analyzed': {},
+ },
+ },
+ };
nock(config.esConfig.host)
.persist()
- .post(/_search$/, mockRequest)
+ .post(/_search$/, mockRequestPatched)
.reply(200, mockResult);
};
diff --git a/src/server/__tests__/schema.test.js b/src/server/__tests__/schema.test.js
index 81d6968a..590dd9ff 100644
--- a/src/server/__tests__/schema.test.js
+++ b/src/server/__tests__/schema.test.js
@@ -58,12 +58,14 @@ describe('Schema', () => {
some_array_integer_field: [Int],
some_array_string_field: [String],
whatever_lab_result_value: Float,
+ _matched:[MatchedItem]
}
type File {
gen3_resource_path: String,
file_id: String,
file_size: Float,
subject_id: String,
+ _matched:[MatchedItem]
}`;
test('could create type schemas', async () => {
await esInstance.initialize();
diff --git a/src/server/config.js b/src/server/config.js
index 43fdac20..f117ecb9 100644
--- a/src/server/config.js
+++ b/src/server/config.js
@@ -35,6 +35,9 @@ const config = {
logLevel: 'INFO',
enableEncryptWhiteList: typeof inputConfig.enable_encrypt_whitelist === 'undefined' ? false : inputConfig.enable_encrypt_whitelist,
encryptWhitelist: inputConfig.encrypt_whitelist || ['__missing__', 'unknown', 'not reported', 'no data'],
+ analyzedTextFieldSuffix: '.analyzed',
+ matchedTextHighlightTagName: 'em',
+ allowedMinimumSearchLen: 2,
};
if (process.env.GEN3_ES_ENDPOINT) {
@@ -64,6 +67,10 @@ if (process.env.LOG_LEVEL) {
config.logLevel = process.env.LOG_LEVEL;
}
+if (process.env.ANALYZED_TEXT_FIELD_SUFFIX) {
+ config.analyzedTextFieldSuffix = process.env.ANALYZED_TEXT_FIELD_SUFFIX;
+}
+
// only three options for tier access level: 'private' (default), 'regular', and 'libre'
if (process.env.TIER_ACCESS_LEVEL) {
if (process.env.TIER_ACCESS_LEVEL !== 'private'
diff --git a/src/server/es/__tests__/filter.test.js b/src/server/es/__tests__/filter.test.js
index d08ec32c..18583511 100644
--- a/src/server/es/__tests__/filter.test.js
+++ b/src/server/es/__tests__/filter.test.js
@@ -4,6 +4,7 @@ import { UserInputError } from 'apollo-server';
import getFilterObj from '../filter';
import esInstance from '../index';
import setupMockDataEndpoint from '../../__mocks__/mockDataFromES';
+import config from '../../config';
jest.mock('../../config');
jest.mock('../../logger');
@@ -181,6 +182,28 @@ describe('Transfer GraphQL filter to ES filter, filter unit', () => {
expect(resultESFilter3).toEqual(expectedESFilter);
});
+ test('could transfer graphql filter to ES filter object, "search" operator', async () => {
+ await esInstance.initialize();
+ const keyword = 'male';
+ const gqlFilter1 = { search: { keyword } };
+ const resultESFilter1 = getFilterObj(esInstance, esIndex, esType, gqlFilter1);
+ const expectedESFilter1 = { multi_match: { query: keyword, fields: [`*${config.analyzedTextFieldSuffix}`] } };
+ expect(resultESFilter1).toEqual(expectedESFilter1);
+
+ const targetFields = ['gender', 'name'];
+ const gqlFilter2 = { search: { keyword, fields: targetFields } };
+ const resultESFilter2 = getFilterObj(esInstance, esIndex, esType, gqlFilter2);
+ const expectedTargetFields = targetFields.map(f => `${f}${config.analyzedTextFieldSuffix}`);
+ const expectedESFilter2 = { multi_match: { query: keyword, fields: expectedTargetFields } };
+ expect(resultESFilter2).toEqual(expectedESFilter2);
+
+ const targetField = 'gender';
+ const gqlFilter3 = { search: { keyword, fields: targetField } };
+ const resultESFilter3 = getFilterObj(esInstance, esIndex, esType, gqlFilter3);
+ const expectedESFilter3 = { multi_match: { query: keyword, fields: [`${targetField}${config.analyzedTextFieldSuffix}`] } };
+ expect(resultESFilter3).toEqual(expectedESFilter3);
+ });
+
test('could throw err for invalid operator', async () => {
await esInstance.initialize();
diff --git a/src/server/es/filter.js b/src/server/es/filter.js
index 9a2aac59..b9d51aac 100644
--- a/src/server/es/filter.js
+++ b/src/server/es/filter.js
@@ -182,6 +182,30 @@ const getFilterItemForNumbers = (op, field, value) => {
throw new UserInputError(`Invalid numeric operation "${op}" for field "${field}" in filter argument`);
};
+const getESSearchFilterFragment = (esInstance, esIndex, fields, keyword) => {
+ let analyzedFields = [`*${config.analyzedTextFieldSuffix}`]; // search all fields by default
+ if (typeof fields !== 'undefined') {
+ if (typeof fields === 'string') {
+ fields = [fields]; // eslint-disable-line no-param-reassign
+ }
+ // Check fields are valid
+ fields.forEach((f) => {
+ if (!esInstance.fieldTypes[esIndex]) {
+ throw new UserInputError(`es index ${esIndex} doesn't exist`);
+ } else if (!esInstance.fieldTypes[esIndex][f]) {
+ throw new UserInputError(`invalid field ${f} in "filter" variable`);
+ }
+ });
+ analyzedFields = fields.map(f => `${f}${config.analyzedTextFieldSuffix}`);
+ }
+ return {
+ multi_match: {
+ query: keyword,
+ fields: analyzedFields,
+ },
+ };
+};
+
/**
* This function transfer graphql filter arg to ES filter object
* It first parse graphql filter object recursively from top to down,
@@ -236,6 +260,23 @@ const getFilterObj = (
},
};
}
+ } else if (topLevelOpLowerCase === 'search') {
+ if (!('keyword' in graphqlFilterObj[topLevelOp])) { // "keyword" required
+ throw new UserInputError('Invalid search filter syntax: missing \'keyword\' field');
+ }
+ Object.keys(graphqlFilterObj[topLevelOp]).forEach((o) => { // check filter syntax
+ if (o !== 'keyword' && o !== 'fields') {
+        throw new UserInputError(`Invalid search filter syntax: unrecognized field '${o}'`);
+ }
+ });
+ const targetSearchKeyword = graphqlFilterObj[topLevelOp].keyword;
+ if (targetSearchKeyword.length < config.allowedMinimumSearchLen) {
+      throw new UserInputError(`Keyword too short (length < ${config.allowedMinimumSearchLen})`);
+ }
+ const targetSearchFields = graphqlFilterObj[topLevelOp].fields;
+ resultFilterObj = getESSearchFilterFragment(
+ esInstance, esIndex, targetSearchFields, targetSearchKeyword,
+ );
} else {
const field = Object.keys(graphqlFilterObj[topLevelOp])[0];
if (aggsField === field && !filterSelf) {
diff --git a/src/server/es/index.js b/src/server/es/index.js
index 7dbfad49..456378d6 100644
--- a/src/server/es/index.js
+++ b/src/server/es/index.js
@@ -40,6 +40,17 @@ class ES {
validatedQueryBody[key] = queryBody[key];
}
});
+ validatedQueryBody.highlight = {
+ pre_tags: [
+ `<${config.matchedTextHighlightTagName}>`,
+ ],
+ post_tags: [
+        `</${config.matchedTextHighlightTagName}>`,
+ ],
+ fields: {
+ [`*${config.analyzedTextFieldSuffix}`]: {},
+ },
+ };
log.info('[ES.query] query body: ', JSON.stringify(validatedQueryBody));
return this.client.search({
index: esIndex,
@@ -348,7 +359,32 @@ class ES {
filter, fields, sort, offset, size,
},
);
- return result.hits.hits.map(item => item._source);
+ const { hits } = result.hits;
+ const hitsWithMatchedResults = hits.map((h) => {
+ if (!('highlight' in h)) {
+        // ES didn't return a "highlight" section for this hit
+ return h._source;
+ }
+ // ES returns highlight, transfer them into "_matched" schema
+ const matchedList = Object.keys(h.highlight).map((f) => {
+ let field = f;
+ if (f.endsWith(config.analyzedTextFieldSuffix)) {
+ // remove ".analyzed" suffix from field name
+ field = f.substr(0, f.length - config.analyzedTextFieldSuffix.length);
+ }
+ return {
+ field,
+          // just use ES highlights' format,
+          // should be a list of strings, with matched parts wrapped in highlight tags (e.g. <em>...</em>)
+ highlights: h.highlight[f],
+ };
+ });
+ return {
+ ...h._source,
+ _matched: matchedList,
+ };
+ });
+ return hitsWithMatchedResults;
}
downloadData({
diff --git a/src/server/resolvers.js b/src/server/resolvers.js
index a60ecf32..5281768c 100644
--- a/src/server/resolvers.js
+++ b/src/server/resolvers.js
@@ -9,9 +9,11 @@ import { firstLetterUpperCase } from './utils/utils';
* @returns {string[]} parsed fields
*/
const parseFieldsFromTypeResolveInfo = (resolveInfo) => {
+ const reservedNames = ['_matched']; // This is for search results
const parsedInfo = parseResolveInfo(resolveInfo);
const typeName = firstLetterUpperCase(parsedInfo.name);
- const fields = Object.keys(parsedInfo.fieldsByTypeName[typeName]);
+ const fields = Object.keys(parsedInfo.fieldsByTypeName[typeName])
+ .filter(f => !reservedNames.includes(f));
return fields;
};
diff --git a/src/server/schema.js b/src/server/schema.js
index 24e3df98..f22be10b 100644
--- a/src/server/schema.js
+++ b/src/server/schema.js
@@ -74,6 +74,7 @@ const getTypeSchemaForOneIndex = (esInstance, esIndex, esType) => {
const typeSchema = `
type ${esTypeObjName} {
${fieldGQLTypeMap.map(entry => `${entry.field}: ${entry.type},`).join('\n')}
+ _matched: [MatchedItem]
}
`;
return typeSchema;
@@ -126,6 +127,13 @@ export const getMappingSchema = esConfig => `
export const buildSchemaString = (esConfig, esInstance) => {
const querySchema = getQuerySchema(esConfig);
+ const matchedItemSchema = `
+ type MatchedItem {
+ field: String
+ highlights: [String]
+ }
+ `;
+
const typesSchemas = getTypesSchemas(esConfig, esInstance);
const accessibilityEnum = `
@@ -206,6 +214,7 @@ export const buildSchemaString = (esConfig, esInstance) => {
const schemaStr = `
scalar JSON
+ ${matchedItemSchema}
${querySchema}
${accessibilityEnum}
${typesSchemas}