[7.x][ML] Add optional source filtering during data frame reindexing (e…

…lastic#49690) This adds a `_source` setting under the `source` setting of a data frame analytics config. The new `_source` is reusing the structure of a `FetchSourceContext` like `analyzed_fields` does. Specifying includes and excludes for source allows selecting which fields will get reindexed and will be available in the destination index. Closes elastic#49531 Backport of elastic#49690
dimitris-athanasiou · Nov 29, 2019 · f412a0c · f412a0c
1 parent 90e9d61
commit f412a0c
Show file tree

Hide file tree

Showing 28 changed files with 525 additions and 123 deletions.
diff --git a/...h-level/src/main/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsSource.java b/...h-level/src/main/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsSource.java
@@ -26,6 +26,7 @@
 import org.elasticsearch.common.xcontent.ToXContentObject;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
 
 import java.io.IOException;
 import java.util.Arrays;
@@ -44,20 +45,27 @@ public static Builder builder() {
 
     private static final ParseField INDEX = new ParseField("index");
     private static final ParseField QUERY = new ParseField("query");
+    public static final ParseField _SOURCE = new ParseField("_source");
 
     private static ObjectParser<Builder, Void> PARSER = new ObjectParser<>("data_frame_analytics_source", true, Builder::new);
 
     static {
         PARSER.declareStringArray(Builder::setIndex, INDEX);
         PARSER.declareObject(Builder::setQueryConfig, (p, c) -> QueryConfig.fromXContent(p), QUERY);
+        PARSER.declareField(Builder::setSourceFiltering,
+            (p, c) -> FetchSourceContext.fromXContent(p),
+            _SOURCE,
+            ObjectParser.ValueType.OBJECT_ARRAY_BOOLEAN_OR_STRING);
     }
 
     private final String[] index;
     private final QueryConfig queryConfig;
+    private final FetchSourceContext sourceFiltering;
 
-    private DataFrameAnalyticsSource(String[] index, @Nullable QueryConfig queryConfig) {
+    private DataFrameAnalyticsSource(String[] index, @Nullable QueryConfig queryConfig, @Nullable FetchSourceContext sourceFiltering) {
         this.index = Objects.requireNonNull(index);
         this.queryConfig = queryConfig;
+        this.sourceFiltering = sourceFiltering;
     }
 
     public String[] getIndex() {
@@ -68,13 +76,20 @@ public QueryConfig getQueryConfig() {
         return queryConfig;
     }
 
+    public FetchSourceContext getSourceFiltering() {
+        return sourceFiltering;
+    }
+
     @Override
     public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
         builder.startObject();
         builder.field(INDEX.getPreferredName(), index);
         if (queryConfig != null) {
             builder.field(QUERY.getPreferredName(), queryConfig.getQuery());
         }
+        if (sourceFiltering != null) {
+            builder.field(_SOURCE.getPreferredName(), sourceFiltering);
+        }
         builder.endObject();
         return builder;
     }
@@ -86,12 +101,13 @@ public boolean equals(Object o) {
 
         DataFrameAnalyticsSource other = (DataFrameAnalyticsSource) o;
         return Arrays.equals(index, other.index)
-            && Objects.equals(queryConfig, other.queryConfig);
+            && Objects.equals(queryConfig, other.queryConfig)
+            && Objects.equals(sourceFiltering, other.sourceFiltering);
     }
 
     @Override
     public int hashCode() {
-        return Objects.hash(Arrays.asList(index), queryConfig);
+        return Objects.hash(Arrays.asList(index), queryConfig, sourceFiltering);
     }
 
     @Override
@@ -103,6 +119,7 @@ public static class Builder {
 
         private String[] index;
         private QueryConfig queryConfig;
+        private FetchSourceContext sourceFiltering;
 
         private Builder() {}
 
@@ -121,8 +138,13 @@ public Builder setQueryConfig(QueryConfig queryConfig) {
             return this;
         }
 
+        public Builder setSourceFiltering(FetchSourceContext sourceFiltering) {
+            this.sourceFiltering = sourceFiltering;
+            return this;
+        }
+
         public DataFrameAnalyticsSource build() {
-            return new DataFrameAnalyticsSource(index, queryConfig);
+            return new DataFrameAnalyticsSource(index, queryConfig, sourceFiltering);
         }
     }
 }
diff --git a/...h-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java b/...h-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java
@@ -2939,6 +2939,9 @@ public void testPutDataFrameAnalytics() throws Exception {
             DataFrameAnalyticsSource sourceConfig = DataFrameAnalyticsSource.builder() // <1>
                 .setIndex("put-test-source-index") // <2>
                 .setQueryConfig(queryConfig) // <3>
+                .setSourceFiltering(new FetchSourceContext(true,
+                    new String[] { "included_field_1", "included_field_2" },
+                    new String[] { "excluded_field" })) // <4>
                 .build();
             // end::put-data-frame-analytics-source-config
 

diff --git a/...el/src/test/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsSourceTests.java b/...el/src/test/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsSourceTests.java
@@ -23,6 +23,7 @@
 import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.search.SearchModule;
+import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
 import org.elasticsearch.test.AbstractXContentTestCase;
 
 import java.io.IOException;
@@ -35,9 +36,17 @@
 public class DataFrameAnalyticsSourceTests extends AbstractXContentTestCase<DataFrameAnalyticsSource> {
 
     public static DataFrameAnalyticsSource randomSourceConfig() {
+        FetchSourceContext sourceFiltering = null;
+        if (randomBoolean()) {
+            sourceFiltering = new FetchSourceContext(true,
+                generateRandomStringArray(10, 10, false, false),
+                generateRandomStringArray(10, 10, false, false));
+        }
+
         return DataFrameAnalyticsSource.builder()
             .setIndex(generateRandomStringArray(10, 10, false, false))
             .setQueryConfig(randomBoolean() ? null : randomQueryConfig())
+            .setSourceFiltering(sourceFiltering)
             .build();
     }
 

diff --git a/docs/java-rest/high-level/ml/put-data-frame-analytics.asciidoc b/docs/java-rest/high-level/ml/put-data-frame-analytics.asciidoc
@@ -52,6 +52,7 @@ include-tagged::{doc-tests-file}[{api}-source-config]
 <1> Constructing a new DataFrameAnalyticsSource
 <2> The source index
 <3> The query from which to gather the data. If query is not set, a `match_all` query is used by default.
+<4> Source filtering to select which fields will exist in the destination index.
 
 ===== QueryConfig
 

diff --git a/docs/reference/ml/df-analytics/apis/dfanalyticsresources.asciidoc b/docs/reference/ml/df-analytics/apis/dfanalyticsresources.asciidoc
@@ -16,17 +16,18 @@
   <<dfanalytics-types>>.
 
 `analyzed_fields`::
-  (object) You can specify both `includes` and/or `excludes` patterns. If 
-  `analyzed_fields` is not set, only the relevant fields will be included. For 
-  example, all the numeric fields for {oldetection}. For the supported field 
-  types, see <<ml-put-dfanalytics-supported-fields>>.
+  (Optional, object) Specify `includes` and/or `excludes` patterns to select
+  which fields will be included in the analysis. If `analyzed_fields` is not set,
+  only the relevant fields will be included. For example, all the numeric fields
+  for {oldetection}. For the supported field types, see <<ml-put-dfanalytics-supported-fields>>.
+  Also see the <<explain-dfanalytics>> which helps understand field selection.
 
   `includes`:::
-    (array) An array of strings that defines the fields that will be included in 
+    (Optional, array) An array of strings that defines the fields that will be included in 
     the analysis.
 
   `excludes`:::
-    (array) An array of strings that defines the fields that will be excluded 
+    (Optional, array) An array of strings that defines the fields that will be excluded 
     from the analysis.
 
 
@@ -81,8 +82,8 @@ PUT _ml/data_frame/analytics/loganalytics
   that setting. For more information, see <<ml-settings>>.
 
 `source`::
-  (object) The source configuration consisting an `index` and optionally a 
-  `query` object.
+  (object) The configuration of how to source the analysis data. It requires an `index`.
+  Optionally, `query` and `_source` may be specified.
 
   `index`:::
     (Required, string or array) Index or indices on which to perform the 
@@ -96,6 +97,19 @@ PUT _ml/data_frame/analytics/loganalytics
     as this object is passed verbatim to {es}. By default, this property has 
     the following value: `{"match_all": {}}`.
 
+  `_source`:::
+    (Optional, object) Specify `includes` and/or `excludes` patterns to select
+    which fields will be present in the destination. Fields that are excluded
+    cannot be included in the analysis.
+
+      `includes`::::
+        (array) An array of strings that defines the fields that will be included in 
+        the destination.
+
+      `excludes`::::
+        (array) An array of strings that defines the fields that will be excluded 
+        from the destination.
+
 [[dfanalytics-types]]
 ==== Analysis objects
 
@@ -277,4 +291,4 @@ improvement. If you override any parameters, then the optimization will
 calculate the value of the remaining parameters accordingly and use the value 
 you provided for the overridden parameter. The number of rounds are reduced 
 respectively. The validation error is estimated in each round by using 4-fold 
-cross validation.
+cross validation.
diff --git a/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc b/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc
@@ -101,13 +101,13 @@ single number. For example, in case of age ranges, you can model the values as
   <<dfanalytics-types>>.
 
 `analyzed_fields`::
-  (Optional, object) You can specify both `includes` and/or `excludes` patterns. 
-  If `analyzed_fields` is not set, only the relevant fields will be included. 
-  For example, all the numeric fields for {oldetection}. For the supported field 
-  types, see <<ml-put-dfanalytics-supported-fields>>. If you specify fields – 
-  either in `includes` or in `excludes` – that have a data type that is not 
-  supported, an error occurs.
-  
+  (Optional, object) Specify `includes` and/or `excludes` patterns to select
+  which fields will be included in the analysis. If `analyzed_fields` is not set,
+  only the relevant fields will be included. For example, all the numeric fields
+  for {oldetection}. For the supported field types, see <<ml-put-dfanalytics-supported-fields>>.
+  Also see the <<explain-dfanalytics>> which helps understand
+  field selection.
+
   `includes`:::
     (Optional, array) An array of strings that defines the fields that will be 
     included in the analysis.
@@ -142,20 +142,33 @@ single number. For example, in case of age ranges, you can model the values as
   that setting. For more information, see <<ml-settings>>.
 
 `source`::
-  (Required, object) The source configuration, consisting of `index` and 
-  optionally a `query`.
+  (object) The configuration of how to source the analysis data. It requires an `index`.
+  Optionally, `query` and `_source` may be specified.
 
-    `index`:::
-      (Required, string or array) Index or indices on which to perform the 
-      analysis. It can be a single index or index pattern as well as an array of 
-      indices or patterns.
-
-    `query`:::
-      (Optional, object) The {es} query domain-specific language 
-      (<<query-dsl,DSL>>). This value corresponds to the query object in an {es} 
-      search POST body. All the options that are supported by {es} can be used, 
-      as this object is passed verbatim to {es}. By default, this property has 
-      the following value: `{"match_all": {}}`.
+  `index`:::
+    (Required, string or array) Index or indices on which to perform the 
+    analysis. It can be a single index or index pattern as well as an array of 
+    indices or patterns.
+
+  `query`:::
+    (Optional, object) The {es} query domain-specific language 
+    (<<query-dsl,DSL>>). This value corresponds to the query object in an {es} 
+    search POST body. All the options that are supported by {es} can be used, 
+    as this object is passed verbatim to {es}. By default, this property has 
+    the following value: `{"match_all": {}}`.
+
+  `_source`:::
+    (Optional, object) Specify `includes` and/or `excludes` patterns to select
+    which fields will be present in the destination. Fields that are excluded
+    cannot be included in the analysis.
+
+      `includes`::::
+        (array) An array of strings that defines the fields that will be included in 
+        the destination.
+
+      `excludes`::::
+        (array) An array of strings that defines the fields that will be excluded 
+        from the destination.
 
 `allow_lazy_start`::
   (Optional, boolean) Whether this job should be allowed to start when there

diff --git a/...ore/src/main/java/org/elasticsearch/xpack/core/ml/action/PutDataFrameAnalyticsAction.java b/...ore/src/main/java/org/elasticsearch/xpack/core/ml/action/PutDataFrameAnalyticsAction.java
@@ -8,6 +8,7 @@
 import org.elasticsearch.action.ActionRequestValidationException;
 import org.elasticsearch.action.ActionResponse;
 import org.elasticsearch.action.ActionType;
+import org.elasticsearch.action.ValidateActions;
 import org.elasticsearch.action.support.master.AcknowledgedRequest;
 import org.elasticsearch.action.support.master.MasterNodeOperationRequestBuilder;
 import org.elasticsearch.client.ElasticsearchClient;
@@ -18,6 +19,7 @@
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
+import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsSource;
 import org.elasticsearch.xpack.core.ml.job.messages.Messages;
 
 import java.io.IOException;
@@ -87,6 +89,24 @@ public DataFrameAnalyticsConfig getConfig() {
 
         @Override
         public ActionRequestValidationException validate() {
+            ActionRequestValidationException error = null;
+            error = checkNoIncludedAnalyzedFieldsAreExcludedBySourceFiltering(config, error);
+            return error;
+        }
+
+        private ActionRequestValidationException checkNoIncludedAnalyzedFieldsAreExcludedBySourceFiltering(
+                DataFrameAnalyticsConfig config, ActionRequestValidationException error) {
+            if (config.getAnalyzedFields() == null) {
+                return null;
+            }
+            for (String analyzedInclude : config.getAnalyzedFields().includes()) {
+                if (config.getSource().isFieldExcluded(analyzedInclude)) {
+                    return ValidateActions.addValidationError("field [" + analyzedInclude + "] is included in ["
+                        + DataFrameAnalyticsConfig.ANALYZED_FIELDS.getPreferredName() + "] but not in ["
+                        + DataFrameAnalyticsConfig.SOURCE.getPreferredName() + "."
+                        + DataFrameAnalyticsSource._SOURCE.getPreferredName() + "]", error);
+                }
+            }
             return null;
         }
 

diff --git a/...ore/src/main/java/org/elasticsearch/xpack/core/ml/dataframe/DataFrameAnalyticsConfig.java b/...ore/src/main/java/org/elasticsearch/xpack/core/ml/dataframe/DataFrameAnalyticsConfig.java
@@ -127,7 +127,7 @@ private static DataFrameAnalysis parseAnalysis(XContentParser parser, boolean ig
     private final Version version;
     private final boolean allowLazyStart;
 
-    public DataFrameAnalyticsConfig(String id, String description, DataFrameAnalyticsSource source, DataFrameAnalyticsDest dest,
+    private DataFrameAnalyticsConfig(String id, String description, DataFrameAnalyticsSource source, DataFrameAnalyticsDest dest,
                                     DataFrameAnalysis analysis, Map<String, String> headers, ByteSizeValue modelMemoryLimit,
                                     FetchSourceContext analyzedFields, Instant createTime, Version version, boolean allowLazyStart) {
         this.id = ExceptionsHelper.requireNonNull(id, ID);