From ea7a3b7851794e30c3e9d4e66f73cf85c84f6c7c Mon Sep 17 00:00:00 2001 From: James Rucker Date: Mon, 11 Jul 2022 14:16:27 -0700 Subject: [PATCH] [Enterprise Search] Create API Index API (#135877) * Getting started with an index create API * Added default mappings and filter settings * Added analysis settings, fixed type errors, added simple tests. * [CI] Auto-commit changed files from 'node scripts/eslint --no-cache --fix' * PascalCase it is. * Bubble up the ability to use the default language. * [CI] Auto-commit changed files from 'node scripts/precommit_hook.js --ref HEAD~1..HEAD --fix' * Clean up types * [CI] Auto-commit changed files from 'node scripts/precommit_hook.js --ref HEAD~1..HEAD --fix' * Fix jest specs Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com> --- .../server/lib/indices/create_index.test.ts | 188 +++++++++++ .../server/lib/indices/create_index.ts | 76 +++++ .../server/lib/indices/text_analysis.test.ts | 263 ++++++++++++++++ .../server/lib/indices/text_analysis.ts | 295 ++++++++++++++++++ .../routes/enterprise_search/indices.ts | 29 ++ 5 files changed, 851 insertions(+) create mode 100644 x-pack/plugins/enterprise_search/server/lib/indices/create_index.test.ts create mode 100644 x-pack/plugins/enterprise_search/server/lib/indices/create_index.ts create mode 100644 x-pack/plugins/enterprise_search/server/lib/indices/text_analysis.test.ts create mode 100644 x-pack/plugins/enterprise_search/server/lib/indices/text_analysis.ts diff --git a/x-pack/plugins/enterprise_search/server/lib/indices/create_index.test.ts b/x-pack/plugins/enterprise_search/server/lib/indices/create_index.test.ts new file mode 100644 index 000000000000000..877fbc1dcc12e2b --- /dev/null +++ b/x-pack/plugins/enterprise_search/server/lib/indices/create_index.test.ts @@ -0,0 +1,188 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { elasticsearchServiceMock } from '@kbn/core/server/mocks'; + +import { createApiIndex } from './create_index'; + +describe('createApiIndex lib function', () => { + const mockClient = elasticsearchServiceMock.createScopedClusterClient(); + + beforeEach(() => { + jest.clearAllMocks(); + }); + + it('successfully creates an index', async () => { + await expect(createApiIndex(mockClient, 'index_name', 'en')).resolves.toEqual({ + body: {}, + headers: { + 'x-elastic-product': 'Elasticsearch', + }, + meta: {}, + statusCode: 200, + warnings: [], + }); + expect(mockClient.asCurrentUser.indices.create).toHaveBeenCalledWith({ + body: { + mappings: { + dynamic: true, + dynamic_templates: [ + { + all_text_fields: { + mapping: { + analyzer: 'iq_text_base', + fields: { + delimiter: { + analyzer: 'iq_text_delimiter', + index_options: 'freqs', + type: 'text', + }, + enum: { + ignore_above: 2048, + type: 'keyword', + }, + joined: { + analyzer: 'i_text_bigram', + index_options: 'freqs', + search_analyzer: 'q_text_bigram', + type: 'text', + }, + prefix: { + analyzer: 'i_prefix', + index_options: 'docs', + search_analyzer: 'q_prefix', + type: 'text', + }, + stem: { + analyzer: 'iq_text_stem', + type: 'text', + }, + }, + }, + match_mapping_type: 'string', + }, + }, + ], + }, + settings: { + analysis: { + analyzer: { + i_prefix: { + filter: ['cjk_width', 'lowercase', 'asciifolding', 'front_ngram'], + tokenizer: 'standard', + type: 'custom', + }, + i_text_bigram: { + filter: [ + 'cjk_width', + 'lowercase', + 'asciifolding', + 'en-stem-filter', + 'bigram_joiner', + 'bigram_max_size', + ], + tokenizer: 'standard', + type: 'custom', + }, + iq_text_base: { + filter: ['cjk_width', 'lowercase', 'asciifolding', 'en-stop-words-filter'], + tokenizer: 'standard', + type: 'custom', + }, + iq_text_delimiter: { + filter: [ + 'delimiter', + 'cjk_width', + 'lowercase', + 'asciifolding', + 'en-stop-words-filter', + 'en-stem-filter', + ], + tokenizer: 'whitespace', + type: 'custom', + }, + iq_text_stem: { + filter: [ + 'cjk_width', + 'lowercase', + 'asciifolding', + 'en-stop-words-filter', + 'en-stem-filter', + ], + tokenizer: 'standard', + type: 'custom', + }, + q_prefix: { + filter: ['cjk_width', 'lowercase', 'asciifolding'], + tokenizer: 'standard', + type: 'custom', + }, + q_text_bigram: { + filter: [ + 'cjk_width', + 'lowercase', + 'asciifolding', + 'en-stem-filter', + 'bigram_joiner_unigrams', + 'bigram_max_size', + ], + tokenizer: 'standard', + type: 'custom', + }, + }, + filter: { + bigram_joiner: { + max_shingle_size: 2, + output_unigrams: false, + token_separator: '', + type: 'shingle', + }, + bigram_joiner_unigrams: { + max_shingle_size: 2, + output_unigrams: true, + token_separator: '', + type: 'shingle', + }, + bigram_max_size: { + max: 16, + min: 0, + type: 'length', + }, + delimiter: { + catenate_all: true, + catenate_numbers: true, + catenate_words: true, + generate_number_parts: true, + generate_word_parts: true, + preserve_original: false, + split_on_case_change: true, + split_on_numerics: true, + stem_english_possessive: true, + type: 'word_delimiter_graph', + }, + 'en-stem-filter': { + name: 'light_english', + language: 'light_english', + type: 'stemmer', + }, + 'en-stop-words-filter': { + stopwords: '_english_', + type: 'stop', + }, + front_ngram: { + max_gram: 12, + min_gram: 1, + type: 'edge_ngram', + }, + }, + }, + }, + }, + index: 'index_name', + }); + }); +}); diff --git a/x-pack/plugins/enterprise_search/server/lib/indices/create_index.ts b/x-pack/plugins/enterprise_search/server/lib/indices/create_index.ts new file mode 100644 index 000000000000000..be6748f0a9bdfaa --- /dev/null +++ b/x-pack/plugins/enterprise_search/server/lib/indices/create_index.ts @@ -0,0 +1,76 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { MappingKeywordProperty, MappingTextProperty } from '@elastic/elasticsearch/lib/api/types'; +import { IScopedClusterClient } from '@kbn/core/server'; + +import { textAnalysisSettings } from './text_analysis'; + +const prefixMapping: MappingTextProperty = { + search_analyzer: 'q_prefix', + analyzer: 'i_prefix', + type: 'text', + index_options: 'docs', +}; + +const delimiterMapping: MappingTextProperty = { + analyzer: 'iq_text_delimiter', + type: 'text', + index_options: 'freqs', +}; + +const joinedMapping: MappingTextProperty = { + search_analyzer: 'q_text_bigram', + analyzer: 'i_text_bigram', + type: 'text', + index_options: 'freqs', +}; + +const enumMapping: MappingKeywordProperty = { + ignore_above: 2048, + type: 'keyword', +}; + +const stemMapping: MappingTextProperty = { + analyzer: 'iq_text_stem', + type: 'text', +}; + +const defaultMappings = { + dynamic: true, + dynamic_templates: [ + { + all_text_fields: { + match_mapping_type: 'string', + mapping: { + analyzer: 'iq_text_base', + fields: { + prefix: prefixMapping, + delimiter: delimiterMapping, + joined: joinedMapping, + enum: enumMapping, + stem: stemMapping, + }, + }, + }, + }, + ], +}; + +export const createApiIndex = async ( + client: IScopedClusterClient, + indexName: string, + language: string | undefined +) => { + return await client.asCurrentUser.indices.create({ + index: indexName, + body: { + mappings: defaultMappings, + settings: textAnalysisSettings(language), + }, + }); +}; diff --git a/x-pack/plugins/enterprise_search/server/lib/indices/text_analysis.test.ts b/x-pack/plugins/enterprise_search/server/lib/indices/text_analysis.test.ts new file mode 100644 index 000000000000000..ff9a1266bf3f7bd --- /dev/null +++ b/x-pack/plugins/enterprise_search/server/lib/indices/text_analysis.test.ts @@ -0,0 +1,263 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { textAnalysisSettings } from './text_analysis'; + +describe('textAnalysisSettings lib function', () => { + it('supports a default language', async () => { + expect(textAnalysisSettings()).toEqual({ + analysis: { + analyzer: { + i_prefix: { + filter: ['cjk_width', 'lowercase', 'asciifolding', 'front_ngram'], + tokenizer: 'standard', + type: 'custom', + }, + i_text_bigram: { + filter: [ + 'cjk_width', + 'lowercase', + 'asciifolding', + 'en-stem-filter', + 'bigram_joiner', + 'bigram_max_size', + ], + tokenizer: 'standard', + type: 'custom', + }, + iq_text_base: { + filter: ['cjk_width', 'lowercase', 'asciifolding', 'en-stop-words-filter'], + tokenizer: 'standard', + type: 'custom', + }, + iq_text_delimiter: { + filter: [ + 'delimiter', + 'cjk_width', + 'lowercase', + 'asciifolding', + 'en-stop-words-filter', + 'en-stem-filter', + ], + tokenizer: 'whitespace', + type: 'custom', + }, + iq_text_stem: { + filter: [ + 'cjk_width', + 'lowercase', + 'asciifolding', + 'en-stop-words-filter', + 'en-stem-filter', + ], + tokenizer: 'standard', + type: 'custom', + }, + q_prefix: { + filter: ['cjk_width', 'lowercase', 'asciifolding'], + tokenizer: 'standard', + type: 'custom', + }, + q_text_bigram: { + filter: [ + 'cjk_width', + 'lowercase', + 'asciifolding', + 'en-stem-filter', + 'bigram_joiner_unigrams', + 'bigram_max_size', + ], + tokenizer: 'standard', + type: 'custom', + }, + }, + filter: { + bigram_joiner: { + max_shingle_size: 2, + output_unigrams: false, + token_separator: '', + type: 'shingle', + }, + bigram_joiner_unigrams: { + max_shingle_size: 2, + output_unigrams: true, + token_separator: '', + type: 'shingle', + }, + bigram_max_size: { + max: 16, + min: 0, + type: 'length', + }, + delimiter: { + catenate_all: true, + catenate_numbers: true, + catenate_words: true, + generate_number_parts: true, + generate_word_parts: true, + preserve_original: false, + split_on_case_change: true, + split_on_numerics: true, + stem_english_possessive: true, + type: 'word_delimiter_graph', + }, + 'en-stem-filter': { + name: 'light_english', + language: 'light_english', + type: 'stemmer', + }, + 'en-stop-words-filter': { + stopwords: '_english_', + type: 'stop', + }, + front_ngram: { + max_gram: 12, + min_gram: 1, + type: 'edge_ngram', + }, + }, + }, + }); + }); + + it('returns settings for another language', async () => { + expect(textAnalysisSettings('fr')).toEqual({ + analysis: { + analyzer: { + i_prefix: { + filter: ['cjk_width', 'lowercase', 'asciifolding', 'front_ngram'], + tokenizer: 'standard', + type: 'custom', + }, + i_text_bigram: { + filter: [ + 'cjk_width', + 'lowercase', + 'asciifolding', + 'fr-stem-filter', + 'bigram_joiner', + 'bigram_max_size', + ], + tokenizer: 'standard', + type: 'custom', + }, + iq_text_base: { + filter: ['cjk_width', 'lowercase', 'asciifolding', 'fr-stop-words-filter'], + tokenizer: 'standard', + type: 'custom', + }, + iq_text_delimiter: { + filter: [ + 'fr-elision', + 'delimiter', + 'cjk_width', + 'lowercase', + 'asciifolding', + 'fr-stop-words-filter', + 'fr-stem-filter', + ], + tokenizer: 'whitespace', + type: 'custom', + }, + iq_text_stem: { + filter: [ + 'fr-elision', + 'cjk_width', + 'lowercase', + 'asciifolding', + 'fr-stop-words-filter', + 'fr-stem-filter', + ], + tokenizer: 'standard', + type: 'custom', + }, + q_prefix: { + filter: ['cjk_width', 'lowercase', 'asciifolding'], + tokenizer: 'standard', + type: 'custom', + }, + q_text_bigram: { + filter: [ + 'cjk_width', + 'lowercase', + 'asciifolding', + 'fr-stem-filter', + 'bigram_joiner_unigrams', + 'bigram_max_size', + ], + tokenizer: 'standard', + type: 'custom', + }, + }, + filter: { + bigram_joiner: { + max_shingle_size: 2, + output_unigrams: false, + token_separator: '', + type: 'shingle', + }, + bigram_joiner_unigrams: { + max_shingle_size: 2, + output_unigrams: true, + token_separator: '', + type: 'shingle', + }, + bigram_max_size: { + max: 16, + min: 0, + type: 'length', + }, + delimiter: { + catenate_all: true, + catenate_numbers: true, + catenate_words: true, + generate_number_parts: true, + generate_word_parts: true, + preserve_original: false, + split_on_case_change: true, + split_on_numerics: true, + stem_english_possessive: true, + type: 'word_delimiter_graph', + }, + 'fr-elision': { + articles: [ + 'l', + 'm', + 't', + 'qu', + 'n', + 's', + 'j', + 'd', + 'c', + 'jusqu', + 'quoiqu', + 'lorsqu', + 'puisqu', + ], + articles_case: true, + type: 'elision', + }, + 'fr-stem-filter': { + name: 'light_french', + language: 'light_french', + type: 'stemmer', + }, + 'fr-stop-words-filter': { + stopwords: '_french_', + type: 'stop', + }, + front_ngram: { + max_gram: 12, + min_gram: 1, + type: 'edge_ngram', + }, + }, + }, + }); + }); +}); diff --git a/x-pack/plugins/enterprise_search/server/lib/indices/text_analysis.ts b/x-pack/plugins/enterprise_search/server/lib/indices/text_analysis.ts new file mode 100644 index 000000000000000..87f0b2cfca4ed95 --- /dev/null +++ b/x-pack/plugins/enterprise_search/server/lib/indices/text_analysis.ts @@ -0,0 +1,295 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { AnalysisTokenFilter } from '@elastic/elasticsearch/lib/api/types'; + +interface LanguageDataEntry { + name: string; + stemmer: string; + stop_words: string; + custom_filter_definitions?: object; + prepended_filters?: string[]; + postpended_filters?: string[]; +} + +const languageData: Record = { + da: { + name: 'Danish', + stemmer: 'danish', + stop_words: '_danish_', + }, + de: { + name: 'German', + stemmer: 'light_german', + stop_words: '_german_', + }, + en: { + name: 'English', + stemmer: 'light_english', + stop_words: '_english_', + }, + es: { + name: 'Spanish', + stemmer: 'light_spanish', + stop_words: '_spanish_', + }, + fr: { + name: 'French', + stemmer: 'light_french', + stop_words: '_french_', + custom_filter_definitions: { + 'fr-elision': { + type: 'elision' as const, + articles: [ + 'l', + 'm', + 't', + 'qu', + 'n', + 's', + 'j', + 'd', + 'c', + 'jusqu', + 'quoiqu', + 'lorsqu', + 'puisqu', + ], + articles_case: true, + }, + }, + prepended_filters: ['fr-elision'], + }, + it: { + name: 'Italian', + stemmer: 'light_italian', + stop_words: '_italian_', + custom_filter_definitions: { + 'it-elision': { + type: 'elision' as const, + articles: [ + 'c', + 'l', + 'all', + 'dall', + 'dell', + 'nell', + 'sull', + 'coll', + 'pell', + 'gl', + 'agl', + 'dagl', + 'degl', + 'negl', + 'sugl', + 'un', + 'm', + 't', + 's', + 'v', + 'd', + ], + articles_case: true, + }, + }, + prepended_filters: ['it-elision'], + }, + ja: { + name: 'Japanese', + stemmer: 'light_english', + stop_words: '_english_', + postpended_filters: ['cjk_bigram'], + }, + ko: { + name: 'Korean', + stemmer: 'light_english', + stop_words: '_english_', + postpended_filters: ['cjk_bigram'], + }, + nl: { + name: 'Dutch', + stemmer: 'dutch', + stop_words: '_dutch_', + }, + pt: { + name: 'Portuguese', + stemmer: 'light_portuguese', + stop_words: '_portuguese_', + }, + 'pt-br': { + name: 'Portuguese (Brazil)', + stemmer: 'brazilian', + stop_words: '_brazilian_', + }, + ru: { + name: 'Russian', + stemmer: 'russian', + stop_words: '_russian_', + }, + th: { + name: 'Thai', + stemmer: 'light_english', + stop_words: '_thai_', + }, + zh: { + name: 'Chinese', + stemmer: 'light_english', + stop_words: '_english_', + postpended_filters: ['cjk_bigram'], + }, +}; + +const FRONT_NGRAM_MAX_GRAM = 12; + +const genericFilters: Record = { + front_ngram: { + type: 'edge_ngram' as const, + min_gram: 1, + max_gram: FRONT_NGRAM_MAX_GRAM, + }, + delimiter: { + type: 'word_delimiter_graph' as const, + generate_word_parts: true, + generate_number_parts: true, + catenate_words: true, + catenate_numbers: true, + catenate_all: true, + preserve_original: false, + split_on_case_change: true, + split_on_numerics: true, + stem_english_possessive: true, + }, + bigram_joiner: { + type: 'shingle' as const, + token_separator: '', + max_shingle_size: 2, + output_unigrams: false, + }, + bigram_joiner_unigrams: { + type: 'shingle' as const, + token_separator: '', + max_shingle_size: 2, + output_unigrams: true, + }, + bigram_max_size: { + type: 'length' as const, + min: 0, + max: 16, + }, +}; + +export const textAnalysisSettings = (language: string = 'en') => { + return { + analysis: { + analyzer: analyzerDefinitions(language), + filter: filterDefinitions(language), + }, + }; +}; + +const stemFilterName = (languageCode: string) => { + return `${languageCode}-stem-filter`; +}; + +const stopWordsFilterName = (languageCode: string) => { + return `${languageCode}-stop-words-filter`; +}; + +const analyzerDefinitions = (language: string) => { + const prependedFilters = languageData[language].prepended_filters || []; + const postpendedFilters = languageData[language].postpended_filters || []; + + return { + i_prefix: { + type: 'custom' as const, + tokenizer: 'standard', + filter: ['cjk_width', 'lowercase', 'asciifolding', 'front_ngram'], + }, + q_prefix: { + type: 'custom' as const, + tokenizer: 'standard', + filter: ['cjk_width', 'lowercase', 'asciifolding'], + }, + iq_text_base: { + type: 'custom' as const, + tokenizer: 'standard', + filter: ['cjk_width', 'lowercase', 'asciifolding', stopWordsFilterName(language)], + }, + iq_text_stem: { + type: 'custom' as const, + tokenizer: 'standard', + filter: [ + ...prependedFilters, + 'cjk_width', + 'lowercase', + 'asciifolding', + stopWordsFilterName(language), + stemFilterName(language), + ...postpendedFilters, + ], + }, + iq_text_delimiter: { + type: 'custom' as const, + tokenizer: 'whitespace', + filter: [ + ...prependedFilters, + 'delimiter', + 'cjk_width', + 'lowercase', + 'asciifolding', + stopWordsFilterName(language), + stemFilterName(language), + ...postpendedFilters, + ], + }, + i_text_bigram: { + type: 'custom' as const, + tokenizer: 'standard', + filter: [ + 'cjk_width', + 'lowercase', + 'asciifolding', + stemFilterName(language), + 'bigram_joiner', + 'bigram_max_size', + ], + }, + q_text_bigram: { + type: 'custom' as const, + tokenizer: 'standard', + filter: [ + 'cjk_width', + 'lowercase', + 'asciifolding', + stemFilterName(language), + 'bigram_joiner_unigrams', + 'bigram_max_size', + ], + }, + }; +}; + +const filterDefinitions = (language: string) => { + const stemmerName = languageData[language].stemmer; + const stopWordsName = languageData[language].stop_words; + const customFilterDefinitions = languageData[language].custom_filter_definitions || {}; + + return { + ...genericFilters, + [stemFilterName(language)]: { + type: 'stemmer' as const, + name: stemmerName, + language: stemmerName, + }, + [stopWordsFilterName(language)]: { + type: 'stop' as const, + stopwords: stopWordsName, + }, + ...customFilterDefinitions, + }; +}; diff --git a/x-pack/plugins/enterprise_search/server/routes/enterprise_search/indices.ts b/x-pack/plugins/enterprise_search/server/routes/enterprise_search/indices.ts index 7636acdfa26ad3b..6a0fd3935628ca5 100644 --- a/x-pack/plugins/enterprise_search/server/routes/enterprise_search/indices.ts +++ b/x-pack/plugins/enterprise_search/server/routes/enterprise_search/indices.ts @@ -7,6 +7,7 @@ import { schema } from '@kbn/config-schema'; +import { createApiIndex } from '../../lib/indices/create_index'; import { fetchIndex } from '../../lib/indices/fetch_index'; import { fetchIndices } from '../../lib/indices/fetch_indices'; import { generateApiKey } from '../../lib/indices/generate_api_key'; @@ -124,4 +125,32 @@ export function registerIndexRoutes({ router }: RouteDependencies) { } } ); + + router.post( + { + path: '/internal/enterprise_search/indices', + validate: { + body: schema.object({ + indexName: schema.string(), + language: schema.maybe(schema.string()), + }), + }, + }, + async (context, request, response) => { + const { indexName, language } = request.body; + const { client } = (await context.core).elasticsearch; + try { + const createIndexResponse = await createApiIndex(client, indexName, language); + return response.ok({ + body: createIndexResponse, + headers: { 'content-type': 'application/json' }, + }); + } catch (error) { + return response.customError({ + body: 'Error fetching data from Enterprise Search', + statusCode: 502, + }); + } + } + ); }