apache · fjy · Mar 30, 2016 · Mar 29, 2016 · himanshug · Apr 7, 2016
diff --git a/docs/content/querying/multi-value-dimensions.md b/docs/content/querying/multi-value-dimensions.md
@@ -19,11 +19,72 @@ called `tags`.
 {"timestamp": "2011-01-12T00:00:00.000Z", "tags": ["t1","t2","t3"]}  #row1
 {"timestamp": "2011-01-13T00:00:00.000Z", "tags": ["t3","t4","t5"]}  #row2
 {"timestamp": "2011-01-14T00:00:00.000Z", "tags": ["t5","t6","t7"]}  #row3
+{"timestamp": "2011-01-14T00:00:00.000Z", "tags": []}                #row4
 ```
 
-All query types can filter on multi-value dimensions. Filters operate independently on each value of a multi-value
-dimension. For example, a `"t1" OR "t3"` filter would match row1 and row2 but not row3. A `"t1" AND "t3"` filter
-would only match row1.
+### Filtering
+
+All query types, as well as [filtered aggregators](aggregations.html#filtered-aggregator), can filter on multi-value
+dimensions. Filters follow these rules on multi-value dimensions:
+
+- Value filters (like "selector", "bound", and "in") match a row if any of the values of a multi-value dimension match
+  the filter.
+- Value filters that match `null` or `""` (empty string) will match empty cells in a multi-value dimension.
+- Logical expression filters behave the same way they do on single-value dimensions: "and" matches a row if all
+  underlying filters match that row; "or" matches a row if any underlying filters match that row; "not" matches a row
+  if the underlying filter does not match the row.
+
+For example, this "or" filter would match row1 and row2 of the dataset above, but not row3 or row4:
+
+```
+{
+  "type": "or",
+  "fields": [
+    {
+      "type": "selector",
+      "dimension": "tags",
+      "value": "t1"
+    },
+    {
+      "type": "selector",
+      "dimension": "tags",
+      "value": "t3"
+    }
+  ]
+}
+```
+
+This "and" filter would match only row1 of the dataset above:
+
+```
+{
+  "type": "and",
+  "fields": [
+    {
+      "type": "selector",
+      "dimension": "tags",
+      "value": "t1"
+    },
+    {
+      "type": "selector",
+      "dimension": "tags",
+      "value": "t3"
+    }
+  ]
+}
+```
+
+This "selector" filter would match row4 of the dataset above:
+
+```
+{
+  "type": "selector",
+  "dimension": "tags",
+  "value": null
+}
+```
+
+### Grouping
 
 topN and groupBy queries can group on multi-value dimensions. When grouping on a multi-value dimension, _all_ values
 from matching rows will be used to generate one group per value. It's possible for a query to return more groups than

diff --git a/processing/src/main/java/io/druid/query/aggregation/FilteredAggregatorFactory.java b/processing/src/main/java/io/druid/query/aggregation/FilteredAggregatorFactory.java
@@ -22,6 +22,7 @@
 import com.fasterxml.jackson.annotation.JsonProperty;
 import com.google.common.base.Preconditions;
 import com.google.common.base.Predicate;
+import com.google.common.base.Strings;
 import io.druid.query.dimension.DefaultDimensionSpec;
 import io.druid.query.filter.DimFilter;
 import io.druid.query.filter.ValueMatcher;
@@ -225,12 +226,11 @@ public ValueMatcher makeValueMatcher(final String dimension, final Comparable va
       );
 
       // Compare "value" as a String.
-      final String valueString = value == null ? null : value.toString();
-      final boolean isNullOrEmpty = valueString == null || valueString.isEmpty();
+      final String valueString = value == null ? null : Strings.emptyToNull(value.toString());
 
       // Missing columns match a null or empty string value, and don't match anything else.
       if (selector == null) {
-        return new BooleanValueMatcher(isNullOrEmpty);
+        return new BooleanValueMatcher(valueString == null);
       }
 
       final int valueId = selector.lookupId(valueString);
@@ -241,12 +241,17 @@ public boolean matches()
         {
           final IndexedInts row = selector.getRow();
           final int size = row.size();
-          for (int i = 0; i < size; ++i) {
-            if (row.get(i) == valueId) {
-              return true;
+          if (size == 0) {
+            // null should match empty rows in multi-value columns
+            return valueString == null;
+          } else {
+            for (int i = 0; i < size; ++i) {
+              if (row.get(i) == valueId) {
+                return true;
+              }
             }
+            return false;
           }
-          return false;
         }
       };
     }
@@ -258,8 +263,10 @@ public ValueMatcher makeValueMatcher(final String dimension, final Predicate pre
           new DefaultDimensionSpec(dimension, dimension)
       );
 
+      final boolean doesMatchNull = predicate.apply(null);
+
       if (selector == null) {
-        return new BooleanValueMatcher(predicate.apply(null));
+        return new BooleanValueMatcher(doesMatchNull);
       }
 
       // Check every value in the dimension, as a String.
@@ -278,12 +285,17 @@ public boolean matches()
         {
           final IndexedInts row = selector.getRow();
           final int size = row.size();
-          for (int i = 0; i < size; ++i) {
-            if (valueIds.get(row.get(i))) {
-              return true;
+          if (size == 0) {
+            // null should match empty rows in multi-value columns
+            return doesMatchNull;
+          } else {
+            for (int i = 0; i < size; ++i) {
+              if (valueIds.get(row.get(i))) {
+                return true;
+              }
             }
+            return false;
           }
-          return false;
         }
       };
     }

diff --git a/processing/src/main/java/io/druid/segment/IndexMerger.java b/processing/src/main/java/io/druid/segment/IndexMerger.java
@@ -659,16 +659,20 @@ public void close() throws IOException
       progress.progress();
       startTime = System.currentTimeMillis();
 
-      ArrayList<FileOutputSupplier> dimOuts = Lists.newArrayListWithCapacity(mergedDimensions.size());
-      Map<String, Integer> dimensionCardinalities = Maps.newHashMap();
-      ArrayList<Map<String, IntBuffer>> dimConversions = Lists.newArrayListWithCapacity(indexes.size());
+      final ArrayList<FileOutputSupplier> dimOuts = Lists.newArrayListWithCapacity(mergedDimensions.size());
+      final Map<String, Integer> dimensionCardinalities = Maps.newHashMap();
+      final ArrayList<Map<String, IntBuffer>> dimConversions = Lists.newArrayListWithCapacity(indexes.size());
       final ArrayList<Boolean> convertMissingDimsFlags = Lists.newArrayListWithCapacity(mergedDimensions.size());
+      final ArrayList<MutableBitmap> nullRowsList = Lists.newArrayListWithCapacity(mergedDimensions.size());
+      final ArrayList<Boolean> dimHasNullFlags = Lists.newArrayListWithCapacity(mergedDimensions.size());
 
       for (int i = 0; i < indexes.size(); ++i) {
         dimConversions.add(Maps.<String, IntBuffer>newHashMap());
       }
 
       for (String dimension : mergedDimensions) {
+        nullRowsList.add(indexSpec.getBitmapSerdeFactory().getBitmapFactory().makeEmptyMutableBitmap());
+
         final GenericIndexedWriter<String> writer = new GenericIndexedWriter<String>(
             ioPeon, dimension, GenericIndexed.STRING_STRATEGY
         );
@@ -704,6 +708,7 @@ public void close() throws IOException
        * rows from indexes with null/empty str values for that dimension.
        */
         if (convertMissingDims && !dimHasNull) {
+          dimHasNull = true;
           dimValueLookups[indexes.size()] = dimValueLookup = EMPTY_STR_DIM_VAL;
           numMergeIndex++;
         }
@@ -731,6 +736,9 @@ public void close() throws IOException
 
         dimensionCardinalities.put(dimension, cardinality);
 
+        // Mark if this dim has the null/empty str value in its dictionary, used for determining nullRowsList later.
+        dimHasNullFlags.add(dimHasNull);
+
         FileOutputSupplier dimOut = new FileOutputSupplier(IndexIO.makeDimFile(v8OutDir, dimension), true);
         dimOuts.add(dimOut);
 
@@ -821,6 +829,14 @@ public void close() throws IOException
                                       ? null
                                       : Ints.asList(dims[i]);
           forwardDimWriters.get(i).write(listToWrite);
+          if (listToWrite == null || listToWrite.isEmpty()) {
+            // empty row; add to the nullRows bitmap
+            nullRowsList.get(i).add(rowCount);
+          } else if (dimHasNullFlags.get(i) && listToWrite.size() == 1 && listToWrite.get(0) == 0) {
+            // If this dimension has the null/empty str in its dictionary, a row with a single-valued dimension
+            // that matches the null/empty str's dictionary ID should also be added to nullRowsList.
+            nullRowsList.get(i).add(rowCount);
+          }
         }
 
         for (Map.Entry<Integer, TreeSet<Integer>> comprisedRow : theRow.getComprisedRows().entrySet()) {
@@ -930,6 +946,9 @@ public void close() throws IOException
           }
 
           MutableBitmap bitset = bitmapSerdeFactory.getBitmapFactory().makeEmptyMutableBitmap();
+          if ((dictId == 0) && (Iterables.getFirst(dimVals, "") == null)) {
+            bitset.or(nullRowsList.get(i));
+          }
           for (Integer row : CombiningIterable.createSplatted(
               convertedInverteds,
               Ordering.<Integer>natural().nullsFirst()

diff --git a/processing/src/test/java/io/druid/segment/IndexBuilder.java b/processing/src/test/java/io/druid/segment/IndexBuilder.java
@@ -46,7 +46,7 @@
  */
 public class IndexBuilder
 {
-  private static final int ROWS_PER_INDEX_FOR_MERGING = 2;
+  private static final int ROWS_PER_INDEX_FOR_MERGING = 1;
   private static final int MAX_ROWS = 50_000;
 
   private IncrementalIndexSchema schema = new IncrementalIndexSchema.Builder().withMetrics(new AggregatorFactory[]{