From 75e40c3c26fcdd8b323bea12d31f090fcb6dca57 Mon Sep 17 00:00:00 2001 From: Ramon Bartl Date: Mon, 13 Mar 2023 12:30:11 +0100 Subject: [PATCH] Improve performance for sample listing index (#2273) * Removed stopword remover from lexicon * Explicit values for listing_searchable_text index * Upgrade step added * Removed unused import * Index also client ID and Name * Changelog updated * Ensure unicodes * Use getId instead * Do not split on `-` * Allow certain characters * Fixed test * Allow more special characters * Do not split on : and / * Support for wildcards * URL unquote the searchterm first * Fix regex * Percentages are filtered out * No literals --- CHANGES.rst | 1 + src/senaite/core/api/catalog.py | 31 +++++++--- src/senaite/core/catalog/indexer/sample.py | 36 ++++++----- .../core/profiles/default/metadata.xml | 2 +- .../core/tests/doctests/API_catalog.rst | 60 ++++++++++++++----- src/senaite/core/upgrade/v02_05_000.py | 21 +++++++ src/senaite/core/upgrade/v02_05_000.zcml | 8 +++ 7 files changed, 121 insertions(+), 38 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index e6d6b20a6f..6b486dd92d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,7 @@ Changelog 2.5.0 (unreleased) ------------------ +- #2273 Improve performance for sample listing index - #2272 Allow to configure the position of additional value columns diff --git a/src/senaite/core/api/catalog.py b/src/senaite/core/api/catalog.py index 4d94a88736..4e1156c78c 100644 --- a/src/senaite/core/api/catalog.py +++ b/src/senaite/core/api/catalog.py @@ -21,6 +21,7 @@ import re import six +from six.moves.urllib.parse import unquote_plus from bika.lims.api import APIError from bika.lims.api import get_tool @@ -28,7 +29,6 @@ from Products.CMFPlone.UnicodeSplitter import CaseNormalizer from Products.CMFPlone.UnicodeSplitter import Splitter from Products.ZCatalog.interfaces import IZCatalog -from Products.ZCTextIndex.Lexicon import StopWordAndSingleCharRemover from Products.ZCTextIndex.ZCTextIndex import PLexicon @@ -149,8 +149,7 @@ def add_zc_text_index(catalog, index, lex_id="Lexicon", indexed_attrs=None): # create the lexicon first splitter = Splitter() casenormalizer = CaseNormalizer() - stopwordremover = StopWordAndSingleCharRemover() - pipeline = [splitter, casenormalizer, stopwordremover] + pipeline = [splitter, casenormalizer] lexicon = PLexicon(lex_id, "Lexicon", *pipeline) catalog._setObject(lex_id, lexicon) @@ -228,6 +227,7 @@ def to_searchable_text_qs(qs, op="AND", wildcard=True): :returns: sarchable text string """ OPERATORS = ["AND", "OR"] + WILDCARDS = ["*", "?"] if op not in OPERATORS: op = "AND" @@ -237,6 +237,9 @@ def to_searchable_text_qs(qs, op="AND", wildcard=True): def is_op(token): return token.upper() in OPERATORS + def is_wc(char): + return char in WILDCARDS + def append_op_after(index, token, tokens): # do not append an operator after the last token if index == len(tokens) - 1: @@ -249,10 +252,21 @@ def append_op_after(index, token, tokens): return True # convert to unicode - term = safe_unicode(qs) + term = unquote_plus(safe_unicode(qs)) + + # Wildcards at the beginning are not allowed and therefore removed! + first_char = term[0] if len(term) > 0 else "" + if is_wc(first_char): + term = term.replace(first_char, "", 1) + + # splits the string on all characters that do not match the regex + regex = r"[^\w\-\_\.\<\>\+\{\}\:\/\?\$]" - # splits the string on all non alphanumeric characters - tokens = re.split(r"[^\w]", term, flags=re.U | re.I) + # allow only words when searching just a single character + if len(term) == 1: + regex = r"[^\w]" + + tokens = re.split(regex, term, flags=re.U | re.I) # filter out all empty tokens tokens = filter(None, tokens) @@ -269,13 +283,16 @@ def append_op_after(index, token, tokens): for num, token in enumerate(tokens): + # retain wildcards at the end of a token + last_token_char = token[-1] if len(token) > 0 else "" + # append operators without changes and continue if is_op(token): parts.append(token.upper()) continue # append wildcard to token - if wildcard and not is_op(token): + if wildcard and not is_op(token) and not is_wc(last_token_char): token = token + "*" # append the token diff --git a/src/senaite/core/catalog/indexer/sample.py b/src/senaite/core/catalog/indexer/sample.py index 299d95bf35..d7ea9e7d84 100644 --- a/src/senaite/core/catalog/indexer/sample.py +++ b/src/senaite/core/catalog/indexer/sample.py @@ -21,8 +21,6 @@ from bika.lims import api from bika.lims.interfaces import IAnalysisRequest from plone.indexer import indexer -from senaite.core.catalog import SAMPLE_CATALOG -from senaite.core.catalog.utils import get_searchable_text_tokens from senaite.core.interfaces import ISampleCatalog @@ -91,20 +89,30 @@ def is_received(instance): @indexer(IAnalysisRequest, ISampleCatalog) def listing_searchable_text(instance): - """Retrieves all the values of metadata columns in the catalog for - wildcard searches - :return: all metadata values joined in a string + """Retrieves most commonly searched values for samples + + :returns: string with search terms """ entries = set() - catalog = SAMPLE_CATALOG - # add searchable text tokens for the root sample - tokens = get_searchable_text_tokens(instance, catalog) - entries.update(tokens) + for obj in [instance] + instance.getDescendants(): + entries.add(obj.getId()) + entries.add(obj.getClientOrderNumber()) + entries.add(obj.getClientReference()) + entries.add(obj.getClientSampleID()) + + # we use this approach to bypass the computed fields + client = obj.getClient() + entries.add(client.getName()) + entries.add(client.getClientID()) + + sampletype = obj.getSampleType() + entries.add(sampletype.Title() if sampletype else '') + + samplepoint = obj.getSamplePoint() + entries.add(samplepoint.Title() if samplepoint else '') - # add searchable text tokens for descendant samples - for descendant in instance.getDescendants(): - tokens = get_searchable_text_tokens(descendant, catalog) - entries.update(tokens) + batch = obj.getBatch() + entries.add(batch.getId() if batch else '') - return u" ".join(list(entries)) + return u" ".join(map(api.safe_unicode, entries)) diff --git a/src/senaite/core/profiles/default/metadata.xml b/src/senaite/core/profiles/default/metadata.xml index 3da64c6b1f..9975121cb2 100644 --- a/src/senaite/core/profiles/default/metadata.xml +++ b/src/senaite/core/profiles/default/metadata.xml @@ -1,6 +1,6 @@ - 2500 + 2501 profile-Products.ATContentTypes:base profile-Products.CMFEditions:CMFEditions diff --git a/src/senaite/core/tests/doctests/API_catalog.rst b/src/senaite/core/tests/doctests/API_catalog.rst index 316a262f8f..5c791893fa 100644 --- a/src/senaite/core/tests/doctests/API_catalog.rst +++ b/src/senaite/core/tests/doctests/API_catalog.rst @@ -163,6 +163,30 @@ Without wildcard: >>> capi.to_searchable_text_qs("sample", wildcard=False) u'sample' +Wildcards at the beginning of the searchterms are not supported: + + >>> capi.to_searchable_text_qs("?H2O") + u'H2O*' + + >>> capi.to_searchable_text_qs("*H2O") + u'H2O*' + +Wildcards at the end of the searchterms are retained: + + >>> capi.to_searchable_text_qs("H2O?") + u'H2O?' + + >>> capi.to_searchable_text_qs("H2O*") + u'H2O*' + +If the search contains only a single character, it needs to be a word: + + >>> capi.to_searchable_text_qs("W") + u'W*' + + >>> capi.to_searchable_text_qs("$") + u'' + Searching for a unicode word: >>> capi.to_searchable_text_qs("AäOöUüZ") @@ -176,7 +200,7 @@ Searching for multiple unicode words: Searching for a concatenated word: >>> capi.to_searchable_text_qs("H2O-0001") - u'H2O* AND 0001*' + u'H2O-0001*' Searching for two words: @@ -188,38 +212,42 @@ Tricky query strings (with and/or in words or in between): >>> capi.to_searchable_text_qs("Fresh and Funky Oranges from Andorra") u'Fresh* AND Funky* AND Oranges* AND from* AND Andorra*' -All wildcards are removed and replaced with `*` to avoid parse errors: +Search with special characters: + + >>> capi.to_searchable_text_qs("H2O_0001") + u'H2O_0001*' - >>> capi.to_searchable_text_qs("Ca? OR Mg?") - u'Ca* OR Mg*' + >>> capi.to_searchable_text_qs("H2O.0001") + u'H2O.0001*' -Search with special characters: + >>> capi.to_searchable_text_qs("H2O<>0001") + u'H2O<>0001*' + + >>> capi.to_searchable_text_qs("H2O:0001") + u'H2O:0001*' + + >>> capi.to_searchable_text_qs("H2O/0001") + u'H2O/0001*' >>> capi.to_searchable_text_qs("'H2O-0001'") - u'H2O* AND 0001*' + u'H2O-0001*' >>> capi.to_searchable_text_qs("\'H2O-0001\'") - u'H2O* AND 0001*' + u'H2O-0001*' >>> capi.to_searchable_text_qs("(H2O-0001)*") - u'H2O* AND 0001*' + u'H2O-0001*' >>> capi.to_searchable_text_qs("****([H2O-0001])****") - u'H2O* AND 0001*' + u'H2O-0001*' >>> capi.to_searchable_text_qs("********************") u'' - >>> capi.to_searchable_text_qs("????????????????????") - u'' - - >>> capi.to_searchable_text_qs("?H2O?") - u'H2O*' - >>> capi.to_searchable_text_qs("*H2O*") u'H2O*' - >>> capi.to_searchable_text_qs("And the question is: AND OR maybe NOT AND") + >>> capi.to_searchable_text_qs("And the question is AND OR maybe NOT AND") u'the* AND question* AND is* AND OR maybe* AND NOT*' >>> capi.to_searchable_text_qs("AND OR") diff --git a/src/senaite/core/upgrade/v02_05_000.py b/src/senaite/core/upgrade/v02_05_000.py index 275ff97da7..30290a7241 100644 --- a/src/senaite/core/upgrade/v02_05_000.py +++ b/src/senaite/core/upgrade/v02_05_000.py @@ -18,7 +18,12 @@ # Copyright 2018-2023 by it's authors. # Some rights reserved, see README and LICENSE. +from bika.lims import api from senaite.core import logger +from senaite.core.api.catalog import add_index +from senaite.core.api.catalog import del_index +from senaite.core.api.catalog import reindex_index +from senaite.core.catalog import SAMPLE_CATALOG from senaite.core.config import PROJECTNAME as product from senaite.core.upgrade import upgradestep from senaite.core.upgrade.utils import UpgradeUtils @@ -44,3 +49,19 @@ def upgrade(tool): logger.info("{0} upgraded to version {1}".format(product, version)) return True + + +def rebuild_sample_zctext_index_and_lexicon(tool): + """Recreate sample listing_searchable_text ZCText index and Lexicon + """ + # remove the existing index + index = "listing_searchable_text" + del_index(SAMPLE_CATALOG, index) + # remove the Lexicon + catalog = api.get_tool(SAMPLE_CATALOG) + if "Lexicon" in catalog.objectIds(): + catalog.manage_delObjects("Lexicon") + # recreate the index + lexicon + add_index(SAMPLE_CATALOG, index, "ZCTextIndex") + # reindex + reindex_index(SAMPLE_CATALOG, index) diff --git a/src/senaite/core/upgrade/v02_05_000.zcml b/src/senaite/core/upgrade/v02_05_000.zcml index 03f96c3355..e5067eac7c 100644 --- a/src/senaite/core/upgrade/v02_05_000.zcml +++ b/src/senaite/core/upgrade/v02_05_000.zcml @@ -3,6 +3,14 @@ xmlns:genericsetup="http://namespaces.zope.org/genericsetup" i18n_domain="senaite.core"> + +