NLPchina#114 csv - aggregation support for singleNumerics (sum,avg,count etc) and buckets (terms,date_histogram etc)

eliranmoyal · eliranmoyal · commit b2fb76cdacfb · 2015-12-29T10:39:28.000+02:00
diff --git a/src/main/java/org/elasticsearch/plugin/nlpcn/executors/CSVResultRestExecutor.java b/src/main/java/org/elasticsearch/plugin/nlpcn/executors/CSVResultRestExecutor.java
@@ -26,7 +26,7 @@ public void execute(Client client, Map<String, String> params, QueryAction query
         if(params.containsKey("separator")){
          separator = params.get("separator");
         }
-        CSVResult result  = CSVResultsExtractor.extractResults(queryResult,flat,separator);
+        CSVResult result  = new CSVResultsExtractor().extractResults(queryResult,flat,separator);
         String newLine = "\n";
         if(params.containsKey("newLine")){
          newLine = params.get("newLine");
diff --git a/src/main/java/org/elasticsearch/plugin/nlpcn/executors/CSVResultsExtractor.java b/src/main/java/org/elasticsearch/plugin/nlpcn/executors/CSVResultsExtractor.java
@@ -1,41 +1,187 @@
 package org.elasticsearch.plugin.nlpcn.executors;
 
+import com.sun.org.apache.xpath.internal.operations.Mult;
+import org.elasticsearch.cluster.routing.allocation.decider.Decision;
+import org.elasticsearch.common.base.Joiner;
 import org.elasticsearch.search.SearchHit;
 import org.elasticsearch.search.SearchHits;
+import org.elasticsearch.search.aggregations.Aggregation;
+import org.elasticsearch.search.aggregations.AggregationBuilder;
+import org.elasticsearch.search.aggregations.Aggregations;
+import org.elasticsearch.search.aggregations.bucket.MultiBucketsAggregation;
+import org.elasticsearch.search.aggregations.bucket.SingleBucketAggregation;
+import org.elasticsearch.search.aggregations.bucket.geogrid.GeoHashGrid;
+import org.elasticsearch.search.aggregations.metrics.MetricsAggregator;
+import org.elasticsearch.search.aggregations.metrics.NumericMetricsAggregation;
+import org.elasticsearch.search.aggregations.metrics.scripted.ScriptedMetric;
+import org.elasticsearch.search.aggregations.metrics.stats.extended.ExtendedStats;
 
 import java.util.*;
 
 /**
  * Created by Eliran on 27/12/2015.
  */
 public class CSVResultsExtractor {
-    public static CSVResult extractResults(Object queryResult, boolean flat, String separator) {
+    private int currentLineIndex;
+
+    public CSVResultsExtractor() {
+        this.currentLineIndex = 0;
+    }
+
+    public CSVResult extractResults(Object queryResult, boolean flat, String separator) {
         if(queryResult instanceof SearchHits){
             SearchHit[] hits = ((SearchHits) queryResult).getHits();
-            Set<String> csvHeaders = new HashSet<>();
             List<Map<String,Object>> docsAsMap = new ArrayList<>();
-            for(SearchHit hit : hits){
-                Map<String, Object> doc = hit.sourceAsMap();
-                mergeHeaders(csvHeaders,doc,flat);
-                docsAsMap.add(doc);
-            }
-            List<String> headers = new ArrayList<>(csvHeaders);
+            List<String> headers = createHeadersAndFillDocsMap(flat, hits, docsAsMap);
+            List<String> csvLines = createCSVLinesFromDocs(flat, separator, docsAsMap, headers);
+            return new CSVResult(headers,csvLines);
+        }
+        if(queryResult instanceof Aggregations){
+            List<String> headers = new ArrayList<>();
+            List<List<String>> lines = new ArrayList<>();
+            lines.add(new ArrayList<String>());
+            handleAggregations((Aggregations) queryResult, headers, lines);
 
-            List<String> csvLines = new ArrayList<>();
-            for(Map<String,Object> doc : docsAsMap){
-                String line = "";
-                for(String header : headers){
-                    line += findFieldValue(header, doc, flat, separator);
-                }
-                csvLines.add(line.substring(0, line.length() - 1));
+            List<String> csvLines  = new ArrayList<>();
+            for(List<String> simpleLine : lines){
+                csvLines.add(Joiner.on(separator).join(simpleLine));
             }
 
+            //todo: still need to handle more kinds of aggregations:
+            //NumericMetricsAggregation.MultiValue : ExtendedStats,Stats,Percentiles
+            //Aggregations that inherit from base
+            //ScriptedMetric
+            //TopHits
+            //GeoBounds
+
             return new CSVResult(headers,csvLines);
+
         }
         return null;
     }
 
-    private static String findFieldValue(String header, Map<String, Object> doc, boolean flat, String separator) {
+    private  void handleAggregations(Aggregations aggregations, List<String> headers, List<List<String>> lines) {
+        if(allNumericAggregations(aggregations)){
+            lines.get(this.currentLineIndex).addAll(fillHeaderAndCreateLineForNumericAggregations(aggregations, headers));
+            return;
+        }
+        //when this level is not made only of numeric metrics, just one aggregation is supported: the first in the list is processed.
+        List<Aggregation> aggregationList = aggregations.asList();
+        if(aggregationList.size() > 1){
+            //todo: throw exception - until then every aggregation after the first one is silently ignored
+        }
+        Aggregation aggregation = aggregationList.get(0);
+        //single bucket aggregations (nested,reverse_nested,filters) add no column of their own: recurse straight into their sub aggregations
+        if(aggregation instanceof SingleBucketAggregation){
+            Aggregations singleBucketAggs = ((SingleBucketAggregation) aggregation).getAggregations();
+            handleAggregations(singleBucketAggs,headers,lines);
+            return;
+        }
+        if(aggregation instanceof NumericMetricsAggregation){
+            handleNumericMetricAggregation(headers,lines.get(currentLineIndex),aggregation);
+            return;
+        }
+        if(aggregation instanceof MultiBucketsAggregation){
+            MultiBucketsAggregation bucketsAggregation = (MultiBucketsAggregation) aggregation;
+            String name = bucketsAggregation.getName();
+            //a sub aggregation is reached again for every parent bucket, so its header may already be present - add it only once
+            if(!headers.contains(name)){
+                headers.add(name);
+            }
+            Collection<? extends MultiBucketsAggregation.Bucket> buckets = bucketsAggregation.getBuckets();
+
+            //snapshot the cells collected so far: every bucket after the first starts its line from this copy
+            List<String> currentLine = lines.get(this.currentLineIndex);
+            List<String> clonedLine = new ArrayList<>(currentLine);
+
+            //recurse into each bucket; currentLineIndex is advanced for every bucket except the first
+            boolean firstLine = true;
+            for (MultiBucketsAggregation.Bucket bucket : buckets) {
+                //the first bucket keeps writing into the current line, each later bucket gets a new line copied from the snapshot
+                String key = bucket.getKeyAsText().string();
+                if(firstLine){
+                    firstLine = false;
+                }
+                else {
+                    currentLineIndex++;
+                    currentLine = new ArrayList<String>(clonedLine);
+                    lines.add(currentLine);
+                }
+                currentLine.add(key);
+                handleAggregations(bucket.getAggregations(),headers,lines);
+
+            }
+        }
+
+    }
+
+    private  List<String> fillHeaderAndCreateLineForNumericAggregations(Aggregations aggregations, List<String> header) {
+        List<String> line = new ArrayList<>();
+        List<Aggregation> aggregationList = aggregations.asList();
+        for(Aggregation aggregation : aggregationList){
+            handleNumericMetricAggregation(header, line, aggregation);
+        }
+        return line;
+    }
+
+    private  void handleNumericMetricAggregation(List<String> header, List<String> line, Aggregation aggregation) {
+        String name = aggregation.getName();
+        if(!header.contains(name)){
+            header.add(aggregation.getName());
+        }
+        if(aggregation instanceof NumericMetricsAggregation.SingleValue){
+            line.add(((NumericMetricsAggregation.SingleValue) aggregation).getValueAsString());
+        }
+        //todo:Numeric MultiValue - Stats,ExtendedStats,Percentile... (for now the header is registered above but no cell is added to the line)
+        else {
+
+        }
+    }
+
+    private  boolean allNumericAggregations(Aggregations aggregations) {
+        List<Aggregation> aggregationList = aggregations.asList();
+        for(Aggregation aggregation : aggregationList){
+            if(!(aggregation instanceof NumericMetricsAggregation)){
+                return false;
+            }
+        }
+        return true;
+    }
+
+    private  Aggregation skipAggregations(Aggregation firstAggregation) {
+        while(firstAggregation instanceof SingleBucketAggregation){
+            firstAggregation = getFirstAggregation(((SingleBucketAggregation) firstAggregation).getAggregations());
+        }
+        return firstAggregation;
+    }
+
+    private Aggregation getFirstAggregation(Aggregations aggregations){
+        return aggregations.asList().get(0);
+    }
+
+    private List<String> createCSVLinesFromDocs(boolean flat, String separator, List<Map<String, Object>> docsAsMap, List<String> headers) {
+        List<String> csvLines = new ArrayList<>(); //one CSV line per document, cells in the order of headers
+        for(Map<String,Object> doc : docsAsMap){
+            StringBuilder line = new StringBuilder(); //cells are appended as returned by findFieldValue, which is expected to terminate each one with the separator
+            for(String header : headers){
+                line.append(findFieldValue(header, doc, flat, separator));
+            }
+            csvLines.add(line.substring(0, Math.max(0, line.length() - separator.length()))); //drop the whole trailing separator (may be longer than one char); a line without cells stays empty instead of throwing
+        }
+        return csvLines;
+    }
+
+    private List<String> createHeadersAndFillDocsMap(boolean flat, SearchHit[] hits, List<Map<String, Object>> docsAsMap) {
+        Set<String> csvHeaders = new HashSet<>();
+        for(SearchHit hit : hits){
+            Map<String, Object> doc = hit.sourceAsMap();
+            mergeHeaders(csvHeaders,doc,flat);
+            docsAsMap.add(doc);
+        }
+        return new ArrayList<>(csvHeaders);
+    }
+
+    private String findFieldValue(String header, Map<String, Object> doc, boolean flat, String separator) {
         if(flat && header.contains(".")){
             String[] split = header.split("\\.");
             Object innerDoc = doc;
@@ -59,15 +205,15 @@ private static String findFieldValue(String header, Map<String, Object> doc, boo
         return separator;
     }
 
-    private static void mergeHeaders(Set<String> headers, Map<String, Object> doc, boolean flat) {
+    private void mergeHeaders(Set<String> headers, Map<String, Object> doc, boolean flat) {
         if (!flat) {
             headers.addAll(doc.keySet());
             return;
         }
         mergeFieldNamesRecursive(headers, doc, "");
     }
 
-    private static void mergeFieldNamesRecursive(Set<String> headers, Map<String, Object> doc, String prefix) {
+    private void mergeFieldNamesRecursive(Set<String> headers, Map<String, Object> doc, String prefix) {
         for(Map.Entry<String,Object> field : doc.entrySet()){
             Object value = field.getValue();
             if(value instanceof Map){
diff --git a/src/test/java/org/nlpcn/es4sql/CSVResultsExtractorTests.java b/src/test/java/org/nlpcn/es4sql/CSVResultsExtractorTests.java
@@ -5,6 +5,9 @@
 import org.elasticsearch.plugin.nlpcn.executors.CSVResult;
 import org.elasticsearch.plugin.nlpcn.executors.CSVResultsExtractor;
 import org.elasticsearch.search.SearchHits;
+import org.elasticsearch.search.aggregations.Aggregations;
+import org.elasticsearch.search.aggregations.bucket.terms.Terms;
+import org.elasticsearch.search.aggregations.metrics.valuecount.ValueCount;
 import org.junit.Assert;
 import org.junit.Test;
 import org.nlpcn.es4sql.exception.SqlParseException;
@@ -149,11 +152,140 @@ public void joinSearchResultNotNestedNotFlatNoAggs() throws SqlParseException, S
         );
 
     }
+
+    @Test
+    public void simpleNumericValueAgg() throws SqlParseException, SQLFeatureNotSupportedException, IOException {
+        String query = String.format("select count(*) from %s/dog ",TEST_INDEX);
+        CSVResult csvResult = getCsvResult(false, query);
+
+        List<String> headers = csvResult.getHeaders();
+        Assert.assertEquals(1, headers.size());
+        Assert.assertEquals("COUNT(*)", headers.get(0));
+
+
+        List<String> lines = csvResult.getLines();
+        Assert.assertEquals(1, lines.size());
+        Assert.assertEquals("2.0", lines.get(0));
+
+    }
+    @Test
+    public void simpleNumericValueAggWithAlias() throws SqlParseException, SQLFeatureNotSupportedException, IOException {
+        String query = String.format("select avg(age) as myAlias from %s/dog ",TEST_INDEX);
+        CSVResult csvResult = getCsvResult(false, query);
+
+        List<String> headers = csvResult.getHeaders();
+        Assert.assertEquals(1, headers.size());
+        Assert.assertEquals("myAlias", headers.get(0));
+
+
+        List<String> lines = csvResult.getLines();
+        Assert.assertEquals(1, lines.size());
+        Assert.assertEquals("3.0", lines.get(0));
+
+    }
+
+    @Test
+    public void twoNumericAggWithAlias() throws SqlParseException, SQLFeatureNotSupportedException, IOException {
+        String query = String.format("select count(*) as count, avg(age) as myAlias from %s/dog ",TEST_INDEX);
+        CSVResult csvResult = getCsvResult(false, query);
+
+        List<String> headers = csvResult.getHeaders();
+        Assert.assertEquals(2, headers.size());
+        Assert.assertEquals("count", headers.get(0));
+        Assert.assertEquals("myAlias", headers.get(1));
+
+
+        List<String> lines = csvResult.getLines();
+        Assert.assertEquals(1, lines.size());
+        Assert.assertEquals("2.0,3.0", lines.get(0));
+
+    }
+
+    @Test
+    public void aggAfterTermsGroupBy() throws SqlParseException, SQLFeatureNotSupportedException, IOException {
+        String query = String.format("SELECT COUNT(*) FROM %s/account GROUP BY gender",TEST_INDEX);
+        CSVResult csvResult = getCsvResult(false, query);
+        List<String> headers = csvResult.getHeaders();
+        Assert.assertEquals(2, headers.size());
+        Assert.assertEquals("gender", headers.get(0));
+        Assert.assertEquals("COUNT(*)", headers.get(1));
+
+        List<String> lines = csvResult.getLines();
+        Assert.assertEquals(2, lines.size());
+        Assert.assertTrue("m,507.0", lines.contains("m,507.0"));
+        Assert.assertTrue("f,493.0", lines.contains("f,493.0"));
+
+    }
+    @Test
+    public void aggAfterTwoTermsGroupBy() throws SqlParseException, SQLFeatureNotSupportedException, IOException {
+        String query = String.format("SELECT COUNT(*) FROM %s/account where age in (35,36) GROUP BY gender,age",TEST_INDEX);
+        CSVResult csvResult = getCsvResult(false, query);
+        List<String> headers = csvResult.getHeaders();
+        Assert.assertEquals(3, headers.size()); //one column per group by field plus the metric
+        Assert.assertEquals("gender", headers.get(0));
+        Assert.assertEquals("age", headers.get(1));
+        Assert.assertEquals("COUNT(*)", headers.get(2));
+
+        List<String> lines = csvResult.getLines();
+        Assert.assertEquals(4, lines.size()); //one line per gender/age combination
+        Assert.assertTrue("m,36,31.0", lines.contains("m,36,31.0"));
+        Assert.assertTrue("m,35,28.0", lines.contains("m,35,28.0")); //previously re-checked "m,36,31.0", leaving this row unverified
+        Assert.assertTrue("f,36,21.0", lines.contains("f,36,21.0"));
+        Assert.assertTrue("f,35,24.0", lines.contains("f,35,24.0"));
+
+    }
+    @Test
+    public void multipleAggAfterTwoTermsGroupBy() throws SqlParseException, SQLFeatureNotSupportedException, IOException {
+        String query = String.format("SELECT COUNT(*) , sum(balance) FROM %s/account where age in (35,36) GROUP BY gender,age",TEST_INDEX);
+        CSVResult csvResult = getCsvResult(false, query);
+        List<String> headers = csvResult.getHeaders();
+        Assert.assertEquals(4, headers.size());
+        Assert.assertEquals("gender", headers.get(0));
+        Assert.assertEquals("age", headers.get(1));
+        Assert.assertEquals("COUNT(*)", headers.get(2));
+        Assert.assertEquals("SUM(balance)", headers.get(3));
+
+        List<String> lines = csvResult.getLines();
+        Assert.assertEquals(4, lines.size());
+        Assert.assertTrue("m,36,31.0,647425.0", lines.contains("m,36,31.0,647425.0"));
+        Assert.assertTrue("m,35,28.0,678337.0", lines.contains("m,35,28.0,678337.0"));
+        Assert.assertTrue("f,36,21.0,505660.0", lines.contains("f,36,21.0,505660.0"));
+        Assert.assertTrue("f,35,24.0,472771.0", lines.contains("f,35,24.0,472771.0"));
+
+    }
+
+    @Test
+    public void dateHistogramTest() throws SqlParseException, SQLFeatureNotSupportedException, IOException {
+        String query = String.format("select count(*) from %s/online" +
+                " group by date_histogram('field'='insert_time','interval'='4d','alias'='days')",TEST_INDEX);
+        CSVResult csvResult = getCsvResult(false, query);
+        List<String> headers = csvResult.getHeaders();
+        Assert.assertEquals(2, headers.size());
+        Assert.assertEquals("days", headers.get(0));
+        Assert.assertEquals("COUNT(*)", headers.get(1));
+
+        List<String> lines = csvResult.getLines();
+        Assert.assertEquals(3, lines.size());
+        Assert.assertTrue("2014-08-14 00:00:00,477.0", lines.contains("2014-08-14 00:00:00,477.0"));
+        Assert.assertTrue("2014-08-18 00:00:00,5664.0", lines.contains("2014-08-18 00:00:00,5664.0"));
+        Assert.assertTrue("2014-08-22 00:00:00,3795.0", lines.contains("2014-08-22 00:00:00,3795.0"));
+
+    }
+
+
+    /* todo: more tests:
+    * multi value numeric metrics: extended_stats , stats , percentiles.
+    * filter/nested and then metric
+    * histogram
+    * geo
+     */
+
+
     private CSVResult getCsvResult(boolean flat, String query) throws SqlParseException, SQLFeatureNotSupportedException, IOException {
         SearchDao searchDao = MainTestSuite.getSearchDao();
         QueryAction queryAction = searchDao.explain(query);
-        SearchHits searchHits = (SearchHits) QueryActionElasticExecutor.executeAnyAction(searchDao.getClient(), queryAction);
-        return CSVResultsExtractor.extractResults(searchHits, flat, ",");
+        Object execution =  QueryActionElasticExecutor.executeAnyAction(searchDao.getClient(), queryAction);
+        return new CSVResultsExtractor().extractResults(execution, flat, ",");
     }
 
 

Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ public void execute(Client client, Map<String, String> params, QueryAction query`
`26`	`26`	`if(params.containsKey("separator")){`
`27`	`27`	`separator = params.get("separator");`
`28`	`28`	`}`
`29`		`- CSVResult result = CSVResultsExtractor.extractResults(queryResult,flat,separator);`
	`29`	`+ CSVResult result = new CSVResultsExtractor().extractResults(queryResult,flat,separator);`
`30`	`30`	`String newLine = "\n";`
`31`	`31`	`if(params.containsKey("newLine")){`
`32`	`32`	`newLine = params.get("newLine");`