From 15e584fa5b31b010a073f0f7f22076e1cf061684 Mon Sep 17 00:00:00 2001 From: James Gowdy Date: Tue, 21 Jan 2020 19:13:16 +0000 Subject: [PATCH] [ML] Fixing categorization wizard example results (#54924) (#55438) * [ML] Fixing categorization wizard example results * moving validation results to class * cleaning up category analyzer types * small tweaks * removing commented out code * fixing string ids * small refactor * improving validation messages * fixing types * updating message text * fixing typo * adding privileges error * updating privilege message * changes based on review * removing old warning message * fixing translations * renaming enum --- .../plugins/ml/common/constants/new_job.ts | 7 + .../plugins/ml/common/types/categories.ts | 29 ++ .../ml/common/util/string_utils.test.ts | 24 +- .../plugins/ml/common/util/string_utils.ts | 5 + .../job_creator/categorization_job_creator.ts | 53 +-- .../common/job_validator/job_validator.ts | 4 +- .../categorization_examples_loader.ts | 25 +- .../new_job/common/results_loader/index.ts | 2 +- .../examples_valid_callout.tsx | 64 ++-- .../categorization_view/field_examples.tsx | 4 +- .../categorization_view/metric_selection.tsx | 34 +- .../services/ml_api_service/index.d.ts | 15 +- .../application/services/ml_server_info.ts | 8 +- .../ml/server/models/job_service/index.js | 7 +- .../job_service/new_job/categorization.ts | 314 ------------------ .../new_job/categorization/examples.ts | 206 ++++++++++++ .../new_job/categorization/index.ts | 8 + .../new_job/categorization/top_categories.ts | 164 +++++++++ .../categorization/validation_results.ts | 208 ++++++++++++ .../models/job_service/new_job/index.ts | 2 +- 20 files changed, 763 insertions(+), 420 deletions(-) delete mode 100644 x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization.ts create mode 100644 x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/examples.ts create mode 100644 x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/index.ts create mode 100644 x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/top_categories.ts create mode 100644 x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/validation_results.ts diff --git a/x-pack/legacy/plugins/ml/common/constants/new_job.ts b/x-pack/legacy/plugins/ml/common/constants/new_job.ts index 3c98b372afdf73..862fa72d11fdb7 100644 --- a/x-pack/legacy/plugins/ml/common/constants/new_job.ts +++ b/x-pack/legacy/plugins/ml/common/constants/new_job.ts @@ -26,7 +26,14 @@ export const DEFAULT_QUERY_DELAY = '60s'; export const SHARED_RESULTS_INDEX_NAME = 'shared'; +// Categorization export const NUMBER_OF_CATEGORY_EXAMPLES = 5; export const CATEGORY_EXAMPLES_SAMPLE_SIZE = 1000; export const CATEGORY_EXAMPLES_WARNING_LIMIT = 0.75; export const CATEGORY_EXAMPLES_ERROR_LIMIT = 0.02; + +export enum CATEGORY_EXAMPLES_VALIDATION_STATUS { + VALID = 'valid', + PARTIALLY_VALID = 'partially_valid', + INVALID = 'invalid', +} diff --git a/x-pack/legacy/plugins/ml/common/types/categories.ts b/x-pack/legacy/plugins/ml/common/types/categories.ts index 6ccd13ed9a39ec..765053ced52012 100644 --- a/x-pack/legacy/plugins/ml/common/types/categories.ts +++ b/x-pack/legacy/plugins/ml/common/types/categories.ts @@ -4,6 +4,8 @@ * you may not use this file except in compliance with the Elastic License. */ +import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../constants/new_job'; + export type CategoryId = number; export interface Category { @@ -23,3 +25,30 @@ export interface Token { type: string; position: number; } + +export interface CategorizationAnalyzer { + char_filter?: any[]; + tokenizer?: string; + filter?: any[]; + analyzer?: string; +} + +export interface CategoryFieldExample { + text: string; + tokens: Token[]; +} + +export enum VALIDATION_RESULT { + TOKEN_COUNT, + MEDIAN_LINE_LENGTH, + NULL_VALUES, + TOO_MANY_TOKENS, + FAILED_TO_TOKENIZE, + INSUFFICIENT_PRIVILEGES, +} + +export interface FieldExampleCheck { + id: VALIDATION_RESULT; + valid: CATEGORY_EXAMPLES_VALIDATION_STATUS; + message: string; +} diff --git a/x-pack/legacy/plugins/ml/common/util/string_utils.test.ts b/x-pack/legacy/plugins/ml/common/util/string_utils.test.ts index aba2dbd230ada3..026c8e6110c993 100644 --- a/x-pack/legacy/plugins/ml/common/util/string_utils.test.ts +++ b/x-pack/legacy/plugins/ml/common/util/string_utils.test.ts @@ -4,7 +4,17 @@ * you may not use this file except in compliance with the Elastic License. */ -import { renderTemplate } from './string_utils'; +import { renderTemplate, getMedianStringLength } from './string_utils'; + +const strings: string[] = [ + 'foo', + 'foofoofoofoofoo', + 'foofoofoo', + 'f', + 'f', + 'foofoofoofoofoofoofoo', +]; +const noStrings: string[] = []; describe('ML - string utils', () => { describe('renderTemplate', () => { @@ -24,4 +34,16 @@ describe('ML - string utils', () => { expect(result).toBe('string with 1 replacement, and a 2nd one.'); }); }); + + describe('getMedianStringLength', () => { + test('test median for string array', () => { + const result = getMedianStringLength(strings); + expect(result).toBe(9); + }); + + test('test median for no strings', () => { + const result = getMedianStringLength(noStrings); + expect(result).toBe(0); + }); + }); }); diff --git a/x-pack/legacy/plugins/ml/common/util/string_utils.ts b/x-pack/legacy/plugins/ml/common/util/string_utils.ts index 432baabe773cc1..9dd2ce3d74cd5d 100644 --- a/x-pack/legacy/plugins/ml/common/util/string_utils.ts +++ b/x-pack/legacy/plugins/ml/common/util/string_utils.ts @@ -17,3 +17,8 @@ export function renderTemplate(str: string, data?: Record): stri return str; } + +export function getMedianStringLength(strings: string[]) { + const sortedStringLengths = strings.map(s => s.length).sort((a, b) => a - b); + return sortedStringLengths[Math.floor(sortedStringLengths.length / 2)] || 0; +} diff --git a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/job_creator/categorization_job_creator.ts b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/job_creator/categorization_job_creator.ts index 71619311c4361d..0ff0ffb6f3bb39 100644 --- a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/job_creator/categorization_job_creator.ts +++ b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/job_creator/categorization_job_creator.ts @@ -16,25 +16,31 @@ import { CREATED_BY_LABEL, DEFAULT_BUCKET_SPAN, DEFAULT_RARE_BUCKET_SPAN, + CATEGORY_EXAMPLES_VALIDATION_STATUS, } from '../../../../../../common/constants/new_job'; import { ML_JOB_AGGREGATION } from '../../../../../../common/constants/aggregation_types'; +import { + CategorizationAnalyzer, + CategoryFieldExample, + FieldExampleCheck, +} from '../../../../../../common/types/categories'; import { getRichDetectors } from './util/general'; -import { CategorizationExamplesLoader, CategoryExample } from '../results_loader'; -import { CategorizationAnalyzer, getNewJobDefaults } from '../../../../services/ml_server_info'; - -type CategorizationAnalyzerType = CategorizationAnalyzer | null; +import { CategorizationExamplesLoader } from '../results_loader'; +import { getNewJobDefaults } from '../../../../services/ml_server_info'; export class CategorizationJobCreator extends JobCreator { protected _type: JOB_TYPE = JOB_TYPE.CATEGORIZATION; private _createCountDetector: () => void = () => {}; private _createRareDetector: () => void = () => {}; private _examplesLoader: CategorizationExamplesLoader; - private _categoryFieldExamples: CategoryExample[] = []; - private _categoryFieldValid: number = 0; + private _categoryFieldExamples: CategoryFieldExample[] = []; + private _validationChecks: FieldExampleCheck[] = []; + private _overallValidStatus: CATEGORY_EXAMPLES_VALIDATION_STATUS = + CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID; private _detectorType: ML_JOB_AGGREGATION.COUNT | ML_JOB_AGGREGATION.RARE = ML_JOB_AGGREGATION.COUNT; - private _categorizationAnalyzer: CategorizationAnalyzerType = null; - private _defaultCategorizationAnalyzer: CategorizationAnalyzerType; + private _categorizationAnalyzer: CategorizationAnalyzer = {}; + private _defaultCategorizationAnalyzer: CategorizationAnalyzer; constructor( indexPattern: IndexPattern, @@ -46,7 +52,7 @@ export class CategorizationJobCreator extends JobCreator { this._examplesLoader = new CategorizationExamplesLoader(this, indexPattern, query); const { anomaly_detectors: anomalyDetectors } = getNewJobDefaults(); - this._defaultCategorizationAnalyzer = anomalyDetectors.categorization_analyzer || null; + this._defaultCategorizationAnalyzer = anomalyDetectors.categorization_analyzer || {}; } public setDefaultDetectorProperties( @@ -93,7 +99,7 @@ export class CategorizationJobCreator extends JobCreator { } else { delete this._job_config.analysis_config.categorization_field_name; this._categoryFieldExamples = []; - this._categoryFieldValid = 0; + this._validationChecks = []; } } @@ -102,31 +108,38 @@ export class CategorizationJobCreator extends JobCreator { } public async loadCategorizationFieldExamples() { - const { valid, examples, sampleSize } = await this._examplesLoader.loadExamples(); + const { + examples, + sampleSize, + overallValidStatus, + validationChecks, + } = await this._examplesLoader.loadExamples(); this._categoryFieldExamples = examples; - this._categoryFieldValid = valid; - return { valid, examples, sampleSize }; + this._validationChecks = validationChecks; + this._overallValidStatus = overallValidStatus; + return { examples, sampleSize, overallValidStatus, validationChecks }; } public get categoryFieldExamples() { return this._categoryFieldExamples; } - public get categoryFieldValid() { - return this._categoryFieldValid; + public get validationChecks() { + return this._validationChecks; + } + + public get overallValidStatus() { + return this._overallValidStatus; } public get selectedDetectorType() { return this._detectorType; } - public set categorizationAnalyzer(analyzer: CategorizationAnalyzerType) { + public set categorizationAnalyzer(analyzer: CategorizationAnalyzer) { this._categorizationAnalyzer = analyzer; - if ( - analyzer === null || - isEqual(this._categorizationAnalyzer, this._defaultCategorizationAnalyzer) - ) { + if (isEqual(this._categorizationAnalyzer, this._defaultCategorizationAnalyzer)) { delete this._job_config.analysis_config.categorization_analyzer; } else { this._job_config.analysis_config.categorization_analyzer = analyzer; diff --git a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/job_validator/job_validator.ts b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/job_validator/job_validator.ts index 976e94b377ae8b..8f6b16c407fb66 100644 --- a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/job_validator/job_validator.ts +++ b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/job_validator/job_validator.ts @@ -16,7 +16,7 @@ import { JobCreator, JobCreatorType, isCategorizationJobCreator } from '../job_c import { populateValidationMessages, checkForExistingJobAndGroupIds } from './util'; import { ExistingJobsAndGroups } from '../../../../services/job_service'; import { cardinalityValidator, CardinalityValidatorResult } from './validators'; -import { CATEGORY_EXAMPLES_ERROR_LIMIT } from '../../../../../../common/constants/new_job'; +import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../../../../../../common/constants/new_job'; // delay start of validation to allow the user to make changes // e.g. if they are typing in a new value, try not to validate @@ -207,7 +207,7 @@ export class JobValidator { private _runAdvancedValidation() { if (isCategorizationJobCreator(this._jobCreator)) { this._advancedValidations.categorizationFieldValid.valid = - this._jobCreator.categoryFieldValid > CATEGORY_EXAMPLES_ERROR_LIMIT; + this._jobCreator.overallValidStatus !== CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID; } } diff --git a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/results_loader/categorization_examples_loader.ts b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/results_loader/categorization_examples_loader.ts index ce1ea0bdaf181c..62a4d070fec328 100644 --- a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/results_loader/categorization_examples_loader.ts +++ b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/results_loader/categorization_examples_loader.ts @@ -6,15 +6,12 @@ import { IndexPattern } from '../../../../../../../../../../src/plugins/data/public'; import { IndexPatternTitle } from '../../../../../../common/types/kibana'; -import { Token } from '../../../../../../common/types/categories'; import { CategorizationJobCreator } from '../job_creator'; import { ml } from '../../../../services/ml_api_service'; -import { NUMBER_OF_CATEGORY_EXAMPLES } from '../../../../../../common/constants/new_job'; - -export interface CategoryExample { - text: string; - tokens: Token[]; -} +import { + NUMBER_OF_CATEGORY_EXAMPLES, + CATEGORY_EXAMPLES_VALIDATION_STATUS, +} from '../../../../../../common/constants/new_job'; export class CategorizationExamplesLoader { private _jobCreator: CategorizationJobCreator; @@ -36,20 +33,22 @@ export class CategorizationExamplesLoader { const analyzer = this._jobCreator.categorizationAnalyzer; const categorizationFieldName = this._jobCreator.categorizationFieldName; if (categorizationFieldName === null) { - return { valid: 0, examples: [], sampleSize: 0 }; + return { + examples: [], + sampleSize: 0, + overallValidStatus: CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID, + validationChecks: [], + }; } - const start = Math.floor( - this._jobCreator.start + (this._jobCreator.end - this._jobCreator.start) / 2 - ); const resp = await ml.jobs.categorizationFieldExamples( this._indexPatternTitle, this._query, NUMBER_OF_CATEGORY_EXAMPLES, categorizationFieldName, this._timeFieldName, - start, - 0, + this._jobCreator.start, + this._jobCreator.end, analyzer ); return resp; diff --git a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/results_loader/index.ts b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/results_loader/index.ts index 724c62f22e469f..e15d859f8e6c31 100644 --- a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/results_loader/index.ts +++ b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/common/results_loader/index.ts @@ -5,4 +5,4 @@ */ export { ResultsLoader, Results, ModelItem, Anomaly } from './results_loader'; -export { CategorizationExamplesLoader, CategoryExample } from './categorization_examples_loader'; +export { CategorizationExamplesLoader } from './categorization_examples_loader'; diff --git a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/examples_valid_callout.tsx b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/examples_valid_callout.tsx index 270ba99d938cdc..ac886a3aea61a7 100644 --- a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/examples_valid_callout.tsx +++ b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/examples_valid_callout.tsx @@ -9,27 +9,24 @@ import { EuiCallOut, EuiSpacer, EuiCallOutProps } from '@elastic/eui'; import { i18n } from '@kbn/i18n'; import { FormattedMessage } from '@kbn/i18n/react'; -import { CategorizationAnalyzer } from '../../../../../../../services/ml_server_info'; -import { EditCategorizationAnalyzerFlyout } from '../../../common/edit_categorization_analyzer_flyout'; import { - CATEGORY_EXAMPLES_ERROR_LIMIT, - CATEGORY_EXAMPLES_WARNING_LIMIT, -} from '../../../../../../../../../common/constants/new_job'; - -type CategorizationAnalyzerType = CategorizationAnalyzer | null; + CategorizationAnalyzer, + FieldExampleCheck, +} from '../../../../../../../../../common/types/categories'; +import { EditCategorizationAnalyzerFlyout } from '../../../common/edit_categorization_analyzer_flyout'; +import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../../../../../../../../../common/constants/new_job'; interface Props { - examplesValid: number; - sampleSize: number; - categorizationAnalyzer: CategorizationAnalyzerType; + validationChecks: FieldExampleCheck[]; + overallValidStatus: CATEGORY_EXAMPLES_VALIDATION_STATUS; + categorizationAnalyzer: CategorizationAnalyzer; } export const ExamplesValidCallout: FC = ({ - examplesValid, + overallValidStatus, + validationChecks, categorizationAnalyzer, - sampleSize, }) => { - const percentageText = ; const analyzerUsed = ; let color: EuiCallOutProps['color'] = 'success'; @@ -40,7 +37,7 @@ export const ExamplesValidCallout: FC = ({ } ); - if (examplesValid < CATEGORY_EXAMPLES_ERROR_LIMIT) { + if (overallValidStatus === CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID) { color = 'danger'; title = i18n.translate( 'xpack.ml.newJob.wizard.pickFieldsStep.categorizationFieldCalloutTitle.invalid', @@ -48,7 +45,7 @@ export const ExamplesValidCallout: FC = ({ defaultMessage: 'Selected category field is invalid', } ); - } else if (examplesValid < CATEGORY_EXAMPLES_WARNING_LIMIT) { + } else if (overallValidStatus === CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID) { color = 'warning'; title = i18n.translate( 'xpack.ml.newJob.wizard.pickFieldsStep.categorizationFieldCalloutTitle.possiblyInvalid', @@ -60,45 +57,24 @@ export const ExamplesValidCallout: FC = ({ return ( - {percentageText} + {validationChecks.map((v, i) => ( +
{v.message}
+ ))} {analyzerUsed}
); }; -const PercentageText: FC<{ examplesValid: number; sampleSize: number }> = ({ - examplesValid, - sampleSize, -}) => ( -
- -
-); - -const AnalyzerUsed: FC<{ categorizationAnalyzer: CategorizationAnalyzerType }> = ({ +const AnalyzerUsed: FC<{ categorizationAnalyzer: CategorizationAnalyzer }> = ({ categorizationAnalyzer, }) => { let analyzer = ''; - if (typeof categorizationAnalyzer === null) { - return null; - } - if (typeof categorizationAnalyzer === 'string') { - analyzer = categorizationAnalyzer; - } else { - if (categorizationAnalyzer?.tokenizer !== undefined) { - analyzer = categorizationAnalyzer?.tokenizer!; - } else if (categorizationAnalyzer?.analyzer !== undefined) { - analyzer = categorizationAnalyzer?.analyzer!; - } + if (categorizationAnalyzer?.tokenizer !== undefined) { + analyzer = categorizationAnalyzer.tokenizer; + } else if (categorizationAnalyzer?.analyzer !== undefined) { + analyzer = categorizationAnalyzer.analyzer; } return ( diff --git a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/field_examples.tsx b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/field_examples.tsx index 7f9b2e43b90050..51cea179a6c0d9 100644 --- a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/field_examples.tsx +++ b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/field_examples.tsx @@ -7,10 +7,10 @@ import React, { FC } from 'react'; import { i18n } from '@kbn/i18n'; import { EuiBasicTable, EuiText } from '@elastic/eui'; -import { CategoryExample } from '../../../../../common/results_loader'; +import { CategoryFieldExample } from '../../../../../../../../../common/types/categories'; interface Props { - fieldExamples: CategoryExample[] | null; + fieldExamples: CategoryFieldExample[] | null; } const TOKEN_HIGHLIGHT_COLOR = '#b0ccf7'; diff --git a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/metric_selection.tsx b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/metric_selection.tsx index 52b5c61e70fe5e..411f6e898bd486 100644 --- a/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/metric_selection.tsx +++ b/x-pack/legacy/plugins/ml/public/application/jobs/new_job/pages/components/pick_fields_step/components/categorization_view/metric_selection.tsx @@ -14,7 +14,11 @@ import { CategorizationField } from '../categorization_field'; import { CategorizationDetector } from '../categorization_detector'; import { FieldExamples } from './field_examples'; import { ExamplesValidCallout } from './examples_valid_callout'; -import { CategoryExample } from '../../../../../common/results_loader'; +import { + CategoryFieldExample, + FieldExampleCheck, +} from '../../../../../../../../../common/types/categories'; +import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../../../../../../../../../common/constants/new_job'; import { LoadingWrapper } from '../../../charts/loading_wrapper'; interface Props { @@ -31,9 +35,11 @@ export const CategorizationDetectors: FC = ({ setIsValid }) => { const [categorizationAnalyzerString, setCategorizationAnalyzerString] = useState( JSON.stringify(jobCreator.categorizationAnalyzer) ); - const [fieldExamples, setFieldExamples] = useState(null); - const [examplesValid, setExamplesValid] = useState(0); - const [sampleSize, setSampleSize] = useState(0); + const [fieldExamples, setFieldExamples] = useState(null); + const [overallValidStatus, setOverallValidStatus] = useState( + CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID + ); + const [validationChecks, setValidationChecks] = useState([]); const [categorizationFieldName, setCategorizationFieldName] = useState( jobCreator.categorizationFieldName @@ -73,28 +79,32 @@ export const CategorizationDetectors: FC = ({ setIsValid }) => { setLoadingData(true); try { const { - valid, examples, - sampleSize: tempSampleSize, + overallValidStatus: tempOverallValidStatus, + validationChecks: tempValidationChecks, } = await jobCreator.loadCategorizationFieldExamples(); setFieldExamples(examples); - setExamplesValid(valid); + setOverallValidStatus(tempOverallValidStatus); + setValidationChecks(tempValidationChecks); setLoadingData(false); - setSampleSize(tempSampleSize); } catch (error) { setLoadingData(false); + setFieldExamples(null); + setValidationChecks([]); + setOverallValidStatus(CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID); mlMessageBarService.notify.error(error); } } else { setFieldExamples(null); - setExamplesValid(0); + setValidationChecks([]); + setOverallValidStatus(CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID); } setIsValid(categorizationFieldName !== null); } useEffect(() => { jobCreatorUpdate(); - }, [examplesValid]); + }, [overallValidStatus]); return ( <> @@ -109,8 +119,8 @@ export const CategorizationDetectors: FC = ({ setIsValid }) => { {fieldExamples !== null && loadingData === false && ( <> diff --git a/x-pack/legacy/plugins/ml/public/application/services/ml_api_service/index.d.ts b/x-pack/legacy/plugins/ml/public/application/services/ml_api_service/index.d.ts index db9d158c0ead99..6420b60e4c8380 100644 --- a/x-pack/legacy/plugins/ml/public/application/services/ml_api_service/index.d.ts +++ b/x-pack/legacy/plugins/ml/public/application/services/ml_api_service/index.d.ts @@ -22,6 +22,12 @@ import { PartitionFieldsDefinition } from '../results_service/result_service_rx' import { annotations } from './annotations'; import { Calendar, CalendarId, UpdateCalendar } from '../../../../common/types/calendars'; import { CombinedJob, JobId } from '../../jobs/new_job/common/job_creator/configs'; +import { + CategorizationAnalyzer, + CategoryFieldExample, + FieldExampleCheck, +} from '../../../../common/types/categories'; +import { CATEGORY_EXAMPLES_VALIDATION_STATUS } from '../../../../common/constants/new_job'; // TODO This is not a complete representation of all methods of `ml.*`. // It just satisfies needs for other parts of the code area which use @@ -184,8 +190,13 @@ declare interface Ml { timeField: string | undefined, start: number, end: number, - analyzer: any - ): Promise<{ valid: number; examples: any[]; sampleSize: number }>; + analyzer: CategorizationAnalyzer + ): Promise<{ + examples: CategoryFieldExample[]; + sampleSize: number; + overallValidStatus: CATEGORY_EXAMPLES_VALIDATION_STATUS; + validationChecks: FieldExampleCheck[]; + }>; topCategories( jobId: string, count: number diff --git a/x-pack/legacy/plugins/ml/public/application/services/ml_server_info.ts b/x-pack/legacy/plugins/ml/public/application/services/ml_server_info.ts index 6bf5a7b0c97433..304778281c2f2f 100644 --- a/x-pack/legacy/plugins/ml/public/application/services/ml_server_info.ts +++ b/x-pack/legacy/plugins/ml/public/application/services/ml_server_info.ts @@ -5,6 +5,7 @@ */ import { ml } from './ml_api_service'; +import { CategorizationAnalyzer } from '../../../common/types/categories'; export interface MlServerDefaults { anomaly_detectors: { @@ -16,13 +17,6 @@ export interface MlServerDefaults { datafeeds: { scroll_size?: number }; } -export interface CategorizationAnalyzer { - char_filter?: any[]; - tokenizer?: string; - filter?: any[]; - analyzer?: string; -} - export interface MlServerLimits { max_model_memory_limit?: string; } diff --git a/x-pack/legacy/plugins/ml/server/models/job_service/index.js b/x-pack/legacy/plugins/ml/server/models/job_service/index.js index 186bcbae84546f..5c0eff3112a53e 100644 --- a/x-pack/legacy/plugins/ml/server/models/job_service/index.js +++ b/x-pack/legacy/plugins/ml/server/models/job_service/index.js @@ -8,7 +8,11 @@ import { datafeedsProvider } from './datafeeds'; import { jobsProvider } from './jobs'; import { groupsProvider } from './groups'; import { newJobCapsProvider } from './new_job_caps'; -import { newJobChartsProvider, categorizationExamplesProvider } from './new_job'; +import { + newJobChartsProvider, + categorizationExamplesProvider, + topCategoriesProvider, +} from './new_job'; export function jobServiceProvider(callWithRequest, request) { return { @@ -18,5 +22,6 @@ export function jobServiceProvider(callWithRequest, request) { ...newJobCapsProvider(callWithRequest, request), ...newJobChartsProvider(callWithRequest, request), ...categorizationExamplesProvider(callWithRequest, request), + ...topCategoriesProvider(callWithRequest, request), }; } diff --git a/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization.ts b/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization.ts deleted file mode 100644 index b3c70bf589cd04..00000000000000 --- a/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization.ts +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License; - * you may not use this file except in compliance with the Elastic License. - */ - -import { chunk } from 'lodash'; -import { ML_RESULTS_INDEX_PATTERN } from '../../../../common/constants/index_patterns'; -import { CATEGORY_EXAMPLES_SAMPLE_SIZE } from '../../../../common/constants/new_job'; -import { CategoryId, Category, Token } from '../../../../common/types/categories'; -import { callWithRequestType } from '../../../../common/types/kibana'; - -const VALID_TOKEN_COUNT = 3; -const CHUNK_SIZE = 100; - -export function categorizationExamplesProvider(callWithRequest: callWithRequestType) { - async function categorizationExamples( - indexPatternTitle: string, - query: any, - size: number, - categorizationFieldName: string, - timeField: string | undefined, - start: number, - end: number, - analyzer?: any - ) { - if (timeField !== undefined) { - const range = { - range: { - [timeField]: { - gte: start, - format: 'epoch_millis', - }, - }, - }; - - if (query.bool === undefined) { - query.bool = {}; - } - if (query.bool.filter === undefined) { - query.bool.filter = range; - } else { - if (Array.isArray(query.bool.filter)) { - query.bool.filter.push(range); - } else { - query.bool.filter.range = range; - } - } - } - - const results = await callWithRequest('search', { - index: indexPatternTitle, - size, - body: { - _source: categorizationFieldName, - query, - }, - }); - const examples: string[] = results.hits?.hits - ?.map((doc: any) => doc._source[categorizationFieldName]) - .filter((example: string | null | undefined) => example !== undefined && example !== null); - - async function loadTokens(chunkSize: number) { - const exampleChunks = chunk(examples, chunkSize); - const tokensPerChunks = await Promise.all(exampleChunks.map(c => getTokens(c, analyzer))); - const tokensPerExample = tokensPerChunks.flat(); - return examples.map((e, i) => ({ text: e, tokens: tokensPerExample[i] })); - } - try { - return loadTokens(CHUNK_SIZE); - } catch (error) { - // if an error is thrown when loading the tokens, lower the chunk size by half and try again - // the error may have been caused by too many tokens being found. - // the _analyze endpoint has a maximum of 10000 tokens. - return loadTokens(CHUNK_SIZE / 2); - } - } - - async function getTokens(examples: string[], analyzer?: any) { - const { tokens }: { tokens: Token[] } = await callWithRequest('indices.analyze', { - body: { - ...getAnalyzer(analyzer), - text: examples, - }, - }); - - const lengths = examples.map(e => e.length); - const sumLengths = lengths.map((s => (a: number) => (s += a))(0)); - - const tokensPerExample: Token[][] = examples.map(e => []); - - tokens.forEach((t, i) => { - for (let g = 0; g < sumLengths.length; g++) { - if (t.start_offset <= sumLengths[g] + g) { - const offset = g > 0 ? sumLengths[g - 1] + g : 0; - tokensPerExample[g].push({ - ...t, - start_offset: t.start_offset - offset, - end_offset: t.end_offset - offset, - }); - break; - } - } - }); - return tokensPerExample; - } - - function getAnalyzer(analyzer: any) { - if (typeof analyzer === 'object' && analyzer.tokenizer !== undefined) { - return analyzer; - } else { - return { analyzer: 'standard' }; - } - } - - async function validateCategoryExamples( - indexPatternTitle: string, - query: any, - size: number, - categorizationFieldName: string, - timeField: string | undefined, - start: number, - end: number, - analyzer?: any - ) { - const resp = await categorizationExamples( - indexPatternTitle, - query, - CATEGORY_EXAMPLES_SAMPLE_SIZE, - categorizationFieldName, - timeField, - start, - end, - analyzer - ); - - const sortedExamples = resp - .map((e, i) => ({ ...e, origIndex: i })) - .sort((a, b) => b.tokens.length - a.tokens.length); - const validExamples = sortedExamples.filter(e => e.tokens.length >= VALID_TOKEN_COUNT); - const sampleSize = sortedExamples.length; - - const multiple = Math.floor(sampleSize / size) || sampleSize; - const filteredExamples = []; - let i = 0; - while (filteredExamples.length < size && i < sortedExamples.length) { - filteredExamples.push(sortedExamples[i]); - i += multiple; - } - const examples = filteredExamples - .sort((a, b) => a.origIndex - b.origIndex) - .map(e => ({ text: e.text, tokens: e.tokens })); - - return { - sampleSize, - valid: sortedExamples.length === 0 ? 0 : validExamples.length / sortedExamples.length, - examples, - }; - } - - async function getTotalCategories(jobId: string): Promise<{ total: number }> { - const totalResp = await callWithRequest('search', { - index: ML_RESULTS_INDEX_PATTERN, - size: 0, - body: { - query: { - bool: { - filter: [ - { - term: { - job_id: jobId, - }, - }, - { - exists: { - field: 'category_id', - }, - }, - ], - }, - }, - }, - }); - return totalResp?.hits?.total?.value ?? 0; - } - - async function getTopCategoryCounts(jobId: string, numberOfCategories: number) { - const top = await callWithRequest('search', { - index: ML_RESULTS_INDEX_PATTERN, - size: 0, - body: { - query: { - bool: { - filter: [ - { - term: { - job_id: jobId, - }, - }, - { - term: { - result_type: 'model_plot', - }, - }, - { - term: { - by_field_name: 'mlcategory', - }, - }, - ], - }, - }, - aggs: { - cat_count: { - terms: { - field: 'by_field_value', - size: numberOfCategories, - }, - }, - }, - }, - }); - - const catCounts: Array<{ - id: CategoryId; - count: number; - }> = top.aggregations?.cat_count?.buckets.map((c: any) => ({ - id: c.key, - count: c.doc_count, - })); - return catCounts || []; - } - - async function getCategories( - jobId: string, - catIds: CategoryId[], - size: number - ): Promise { - const categoryFilter = catIds.length - ? { - terms: { - category_id: catIds, - }, - } - : { - exists: { - field: 'category_id', - }, - }; - const result = await callWithRequest('search', { - index: ML_RESULTS_INDEX_PATTERN, - size, - body: { - query: { - bool: { - filter: [ - { - term: { - job_id: jobId, - }, - }, - categoryFilter, - ], - }, - }, - }, - }); - - return result.hits.hits?.map((c: { _source: Category }) => c._source) || []; - } - - async function topCategories(jobId: string, numberOfCategories: number) { - const catCounts = await getTopCategoryCounts(jobId, numberOfCategories); - const categories = await getCategories( - jobId, - catCounts.map(c => c.id), - catCounts.length || numberOfCategories - ); - - const catsById = categories.reduce((p, c) => { - p[c.category_id] = c; - return p; - }, {} as { [id: number]: Category }); - - const total = await getTotalCategories(jobId); - - if (catCounts.length) { - return { - total, - categories: catCounts.map(({ id, count }) => { - return { - count, - category: catsById[id] ?? null, - }; - }), - }; - } else { - return { - total, - categories: categories.map(category => { - return { - category, - }; - }), - }; - } - } - - return { - categorizationExamples, - validateCategoryExamples, - topCategories, - }; -} diff --git a/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/examples.ts b/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/examples.ts new file mode 100644 index 00000000000000..76473bd55db7fb --- /dev/null +++ b/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/examples.ts @@ -0,0 +1,206 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +import { chunk } from 'lodash'; +import { SearchResponse } from 'elasticsearch'; +import { CATEGORY_EXAMPLES_SAMPLE_SIZE } from '../../../../../common/constants/new_job'; +import { + Token, + CategorizationAnalyzer, + CategoryFieldExample, +} from '../../../../../common/types/categories'; +import { callWithRequestType } from '../../../../../common/types/kibana'; +import { ValidationResults } from './validation_results'; + +const CHUNK_SIZE = 100; + +export function categorizationExamplesProvider(callWithRequest: callWithRequestType) { + const validationResults = new ValidationResults(); + + async function categorizationExamples( + indexPatternTitle: string, + query: any, + size: number, + categorizationFieldName: string, + timeField: string | undefined, + start: number, + end: number, + analyzer: CategorizationAnalyzer + ): Promise<{ examples: CategoryFieldExample[]; error?: any }> { + if (timeField !== undefined) { + const range = { + range: { + [timeField]: { + gte: start, + lt: end, + format: 'epoch_millis', + }, + }, + }; + if (query.bool === undefined) { + query.bool = {}; + } + if (query.bool.filter === undefined) { + query.bool.filter = range; + } else { + if (Array.isArray(query.bool.filter)) { + query.bool.filter.push(range); + } else { + query.bool.filter.range = range; + } + } + } + + const results: SearchResponse<{ [id: string]: string }> = await callWithRequest('search', { + index: indexPatternTitle, + size, + body: { + _source: categorizationFieldName, + query, + sort: ['_doc'], + }, + }); + + const tempExamples = results.hits.hits.map(({ _source }) => _source[categorizationFieldName]); + + validationResults.createNullValueResult(tempExamples); + + const allExamples = tempExamples.filter( + (example: string | null | undefined) => example !== undefined && example !== null + ); + + validationResults.createMedianMessageLengthResult(allExamples); + + try { + const examplesWithTokens = await getTokens(CHUNK_SIZE, allExamples, analyzer); + return { examples: examplesWithTokens }; + } catch (err) { + // console.log('dropping to 50 chunk size'); + // if an error is thrown when loading the tokens, lower the chunk size by half and try again + // the error may have been caused by too many tokens being found. + // the _analyze endpoint has a maximum of 10000 tokens. + const halfExamples = allExamples.splice(0, Math.ceil(allExamples.length / 2)); + const halfChunkSize = CHUNK_SIZE / 2; + try { + const examplesWithTokens = await getTokens(halfChunkSize, halfExamples, analyzer); + return { examples: examplesWithTokens }; + } catch (error) { + validationResults.createTooManyTokensResult(error, halfChunkSize); + return { examples: halfExamples.map(e => ({ text: e, tokens: [] })) }; + } + } + } + + async function getTokens( + chunkSize: number, + examples: string[], + analyzer: CategorizationAnalyzer + ): Promise { + const exampleChunks = chunk(examples, chunkSize); + const tokensPerExampleChunks: Token[][][] = []; + for (const c of exampleChunks) { + tokensPerExampleChunks.push(await loadTokens(c, analyzer)); + } + const tokensPerExample = tokensPerExampleChunks.flat(); + return examples.map((e, i) => ({ text: e, tokens: tokensPerExample[i] })); + } + + async function loadTokens(examples: string[], analyzer: CategorizationAnalyzer) { + const { tokens }: { tokens: Token[] } = await callWithRequest('indices.analyze', { + body: { + ...getAnalyzer(analyzer), + text: examples, + }, + }); + + const lengths = examples.map(e => e.length); + const sumLengths = lengths.map((s => (a: number) => (s += a))(0)); + + const tokensPerExample: Token[][] = examples.map(e => []); + + tokens.forEach((t, i) => { + for (let g = 0; g < sumLengths.length; g++) { + if (t.start_offset <= sumLengths[g] + g) { + const offset = g > 0 ? sumLengths[g - 1] + g : 0; + tokensPerExample[g].push({ + ...t, + start_offset: t.start_offset - offset, + end_offset: t.end_offset - offset, + }); + break; + } + } + }); + return tokensPerExample; + } + + function getAnalyzer(analyzer: CategorizationAnalyzer) { + if (typeof analyzer === 'object' && analyzer.tokenizer !== undefined) { + return analyzer; + } else { + return { analyzer: 'standard' }; + } + } + + async function validateCategoryExamples( + indexPatternTitle: string, + query: any, + size: number, + categorizationFieldName: string, + timeField: string | undefined, + start: number, + end: number, + analyzer: CategorizationAnalyzer + ) { + const resp = await categorizationExamples( + indexPatternTitle, + query, + CATEGORY_EXAMPLES_SAMPLE_SIZE, + categorizationFieldName, + timeField, + start, + end, + analyzer + ); + + const { examples } = resp; + const sampleSize = examples.length; + validationResults.createTokenCountResult(examples, sampleSize); + + // sort examples by number of tokens, keeping track of their original order + // with an origIndex property + const sortedExamples = examples + .map((e, i) => ({ ...e, origIndex: i })) + .sort((a, b) => b.tokens.length - a.tokens.length); + + // we only want 'size' (e.g. 5) number of examples, + // so loop through the sorted examples, taking 5 at evenly + // spread intervals + const multiple = Math.floor(sampleSize / size) || sampleSize; + const filteredExamples = []; + let i = 0; + while (filteredExamples.length < size && i < sampleSize) { + filteredExamples.push(sortedExamples[i]); + i += multiple; + } + + // sort back into original order and remove origIndex property + const processedExamples = filteredExamples + .sort((a, b) => a.origIndex - b.origIndex) + .map(e => ({ text: e.text, tokens: e.tokens })); + + return { + overallValidStatus: validationResults.overallResult, + validationChecks: validationResults.results, + sampleSize, + examples: processedExamples, + }; + } + + return { + validateCategoryExamples, + }; +} diff --git a/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/index.ts b/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/index.ts new file mode 100644 index 00000000000000..be32b99b5e527e --- /dev/null +++ b/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/index.ts @@ -0,0 +1,8 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +export { categorizationExamplesProvider } from './examples'; +export { topCategoriesProvider } from './top_categories'; diff --git a/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/top_categories.ts b/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/top_categories.ts new file mode 100644 index 00000000000000..3361cc454e2b7b --- /dev/null +++ b/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/top_categories.ts @@ -0,0 +1,164 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +import { SearchResponse } from 'elasticsearch'; +import { ML_RESULTS_INDEX_PATTERN } from '../../../../../common/constants/index_patterns'; +import { CategoryId, Category } from '../../../../../common/types/categories'; +import { callWithRequestType } from '../../../../../common/types/kibana'; + +export function topCategoriesProvider(callWithRequest: callWithRequestType) { + async function getTotalCategories(jobId: string): Promise<{ total: number }> { + const totalResp = await callWithRequest('search', { + index: ML_RESULTS_INDEX_PATTERN, + size: 0, + body: { + query: { + bool: { + filter: [ + { + term: { + job_id: jobId, + }, + }, + { + exists: { + field: 'category_id', + }, + }, + ], + }, + }, + }, + }); + return totalResp?.hits?.total?.value ?? 0; + } + + async function getTopCategoryCounts(jobId: string, numberOfCategories: number) { + const top: SearchResponse = await callWithRequest('search', { + index: ML_RESULTS_INDEX_PATTERN, + size: 0, + body: { + query: { + bool: { + filter: [ + { + term: { + job_id: jobId, + }, + }, + { + term: { + result_type: 'model_plot', + }, + }, + { + term: { + by_field_name: 'mlcategory', + }, + }, + ], + }, + }, + aggs: { + cat_count: { + terms: { + field: 'by_field_value', + size: numberOfCategories, + }, + }, + }, + }, + }); + + const catCounts: Array<{ + id: CategoryId; + count: number; + }> = top.aggregations?.cat_count?.buckets.map((c: any) => ({ + id: c.key, + count: c.doc_count, + })); + return catCounts || []; + } + + async function getCategories( + jobId: string, + catIds: CategoryId[], + size: number + ): Promise { + const categoryFilter = catIds.length + ? { + terms: { + category_id: catIds, + }, + } + : { + exists: { + field: 'category_id', + }, + }; + const result: SearchResponse = await callWithRequest('search', { + index: ML_RESULTS_INDEX_PATTERN, + size, + body: { + query: { + bool: { + filter: [ + { + term: { + job_id: jobId, + }, + }, + categoryFilter, + ], + }, + }, + }, + }); + + return result.hits.hits?.map((c: { _source: Category }) => c._source) || []; + } + + async function topCategories(jobId: string, numberOfCategories: number) { + const catCounts = await getTopCategoryCounts(jobId, numberOfCategories); + const categories = await getCategories( + jobId, + catCounts.map(c => c.id), + catCounts.length || numberOfCategories + ); + + const catsById = categories.reduce((p, c) => { + p[c.category_id] = c; + return p; + }, {} as { [id: number]: Category }); + + const total = await getTotalCategories(jobId); + + if (catCounts.length) { + return { + total, + categories: catCounts.map(({ id, count }) => { + return { + count, + category: catsById[id] ?? null, + }; + }), + }; + } else { + return { + total, + categories: categories.map(category => { + return { + category, + }; + }), + }; + } + } + + return { + topCategories, + }; +} diff --git a/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/validation_results.ts b/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/validation_results.ts new file mode 100644 index 00000000000000..e173b893dfbfa1 --- /dev/null +++ b/x-pack/legacy/plugins/ml/server/models/job_service/new_job/categorization/validation_results.ts @@ -0,0 +1,208 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +import { i18n } from '@kbn/i18n'; +import { + CATEGORY_EXAMPLES_VALIDATION_STATUS, + CATEGORY_EXAMPLES_ERROR_LIMIT, + CATEGORY_EXAMPLES_WARNING_LIMIT, +} from '../../../../../common/constants/new_job'; +import { + FieldExampleCheck, + CategoryFieldExample, + VALIDATION_RESULT, +} from '../../../../../common/types/categories'; +import { getMedianStringLength } from '../../../../../common/util/string_utils'; + +const VALID_TOKEN_COUNT = 3; +const MEDIAN_LINE_LENGTH_LIMIT = 400; +const NULL_COUNT_PERCENT_LIMIT = 0.75; + +export class ValidationResults { + private _results: FieldExampleCheck[] = []; + + public get results() { + return this._results; + } + + public get overallResult() { + if (this._results.some(c => c.valid === CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID)) { + return CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID; + } + if (this._results.some(c => c.valid === CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID)) { + return CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID; + } + return CATEGORY_EXAMPLES_VALIDATION_STATUS.VALID; + } + + private _resultExists(id: VALIDATION_RESULT) { + return this._results.some(r => r.id === id); + } + + public createTokenCountResult(examples: CategoryFieldExample[], sampleSize: number) { + if (examples.length === 0) { + this.createNoExamplesResult(); + return; + } + + if (this._resultExists(VALIDATION_RESULT.INSUFFICIENT_PRIVILEGES) === true) { + // if tokenizing has failed due to insufficient privileges, don't show + // the message about token count + return; + } + + const validExamplesSize = examples.filter(e => e.tokens.length >= VALID_TOKEN_COUNT).length; + const percentValid = sampleSize === 0 ? 0 : validExamplesSize / sampleSize; + + let valid = CATEGORY_EXAMPLES_VALIDATION_STATUS.VALID; + if (percentValid < CATEGORY_EXAMPLES_ERROR_LIMIT) { + valid = CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID; + } else if (percentValid < CATEGORY_EXAMPLES_WARNING_LIMIT) { + valid = CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID; + } + + const message = i18n.translate( + 'xpack.ml.models.jobService.categorization.messages.tokenLengthValidation', + { + defaultMessage: + '{number} field {number, plural, zero {value} one {value} other {values}} analyzed, {percentage}% contain {validTokenCount} or more tokens.', + values: { + number: sampleSize, + percentage: Math.floor(percentValid * 100), + validTokenCount: VALID_TOKEN_COUNT, + }, + } + ); + + if ( + this._resultExists(VALIDATION_RESULT.TOO_MANY_TOKENS) === false && + this._resultExists(VALIDATION_RESULT.FAILED_TO_TOKENIZE) === false + ) { + this._results.unshift({ + id: VALIDATION_RESULT.TOKEN_COUNT, + valid, + message, + }); + } + } + + public createMedianMessageLengthResult(examples: string[]) { + const median = getMedianStringLength(examples); + + if (median > MEDIAN_LINE_LENGTH_LIMIT) { + this._results.push({ + id: VALIDATION_RESULT.MEDIAN_LINE_LENGTH, + valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID, + message: i18n.translate( + 'xpack.ml.models.jobService.categorization.messages.medianLineLength', + { + defaultMessage: + 'The median length for the field values analyzed is over {medianLimit} characters.', + values: { medianLimit: MEDIAN_LINE_LENGTH_LIMIT }, + } + ), + }); + } + } + + public createNoExamplesResult() { + this._results.push({ + id: VALIDATION_RESULT.NULL_VALUES, + valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID, + message: i18n.translate('xpack.ml.models.jobService.categorization.messages.noDataFound', { + defaultMessage: + 'No examples for this field could be found. Please ensure the selected date range contains data.', + }), + }); + } + + public createNullValueResult(examples: Array) { + const nullCount = examples.filter(e => e === null).length; + + if (nullCount / examples.length >= NULL_COUNT_PERCENT_LIMIT) { + this._results.push({ + id: VALIDATION_RESULT.NULL_VALUES, + valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID, + message: i18n.translate('xpack.ml.models.jobService.categorization.messages.nullValues', { + defaultMessage: 'More than {percent}% of field values are null.', + values: { percent: NULL_COUNT_PERCENT_LIMIT * 100 }, + }), + }); + } + } + + public createTooManyTokensResult(error: any, sampleSize: number) { + // expecting error message: + // The number of tokens produced by calling _analyze has exceeded the allowed maximum of [10000]. + // This limit can be set by changing the [index.analyze.max_token_count] index level setting. + + if (error.statusCode === 403) { + this.createPrivilegesErrorResult(error); + return; + } + const message: string = error.message; + if (message) { + const rxp = /exceeded the allowed maximum of \[(\d+?)\]/; + const match = rxp.exec(message); + if (match?.length === 2) { + const tokenLimit = match[1]; + this._results.push({ + id: VALIDATION_RESULT.TOO_MANY_TOKENS, + valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID, + message: i18n.translate( + 'xpack.ml.models.jobService.categorization.messages.tooManyTokens', + { + defaultMessage: + 'Tokenization of field value examples has failed due to more than {tokenLimit} tokens being found in a sample of {sampleSize} values.', + values: { sampleSize, tokenLimit }, + } + ), + }); + return; + } + return; + } + this.createFailureToTokenize(message); + } + + public createPrivilegesErrorResult(error: any) { + const message: string = error.message; + if (message) { + this._results.push({ + id: VALIDATION_RESULT.INSUFFICIENT_PRIVILEGES, + valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID, + message: i18n.translate( + 'xpack.ml.models.jobService.categorization.messages.insufficientPrivileges', + { + defaultMessage: + 'Tokenization of field value examples could not be performed due to insufficient privileges. Field values cannot therefore be checked to see if they are appropriate for use in a categorization job.', + } + ), + }); + this._results.push({ + id: VALIDATION_RESULT.INSUFFICIENT_PRIVILEGES, + valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.PARTIALLY_VALID, + message, + }); + return; + } + } + + public createFailureToTokenize(message: string | undefined) { + this._results.push({ + id: VALIDATION_RESULT.FAILED_TO_TOKENIZE, + valid: CATEGORY_EXAMPLES_VALIDATION_STATUS.INVALID, + message: i18n.translate( + 'xpack.ml.models.jobService.categorization.messages.failureToGetTokens', + { + defaultMessage: + 'It was not possible to tokenize a sample of example field values. {message}', + values: { message: message || '' }, + } + ), + }); + } +} diff --git a/x-pack/legacy/plugins/ml/server/models/job_service/new_job/index.ts b/x-pack/legacy/plugins/ml/server/models/job_service/new_job/index.ts index da23efa67d0b5c..da60a90f4bfbc1 100644 --- a/x-pack/legacy/plugins/ml/server/models/job_service/new_job/index.ts +++ b/x-pack/legacy/plugins/ml/server/models/job_service/new_job/index.ts @@ -5,4 +5,4 @@ */ export { newJobChartsProvider } from './charts'; -export { categorizationExamplesProvider } from './categorization'; +export { categorizationExamplesProvider, topCategoriesProvider } from './categorization';