From 75e40c3c26fcdd8b323bea12d31f090fcb6dca57 Mon Sep 17 00:00:00 2001
From: Ramon Bartl <rb@ridingbytes.com>
Date: Mon, 13 Mar 2023 12:30:11 +0100
Subject: [PATCH] Improve performance for sample listing index (#2273)

* Removed stopword remover from lexicon

* Explicit values for listing_searchable_text index

* Upgrade step added

* Removed unused import

* Index also client ID and Name

* Changelog updated

* Ensure unicodes

* Use getId instead

* Do not split on `-`

* Allow certain characters

* Fixed test

* Allow more special characters

* Do not split on : and /

* Support for wildcards

* URL unquote the searchterm first

* Fix regex

* Percentages are filtered out

* No literals
---
 CHANGES.rst                                   |  1 +
 src/senaite/core/api/catalog.py               | 31 +++++++---
 src/senaite/core/catalog/indexer/sample.py    | 36 ++++++-----
 .../core/profiles/default/metadata.xml        |  2 +-
 .../core/tests/doctests/API_catalog.rst       | 60 ++++++++++++++-----
 src/senaite/core/upgrade/v02_05_000.py        | 21 +++++++
 src/senaite/core/upgrade/v02_05_000.zcml      |  8 +++
 7 files changed, 121 insertions(+), 38 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index e6d6b20a6f..6b486dd92d 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -4,6 +4,7 @@ Changelog
 2.5.0 (unreleased)
 ------------------
 
+- #2273 Improve performance for sample listing index
 - #2272 Allow to configure the position of additional value columns
 
 
diff --git a/src/senaite/core/api/catalog.py b/src/senaite/core/api/catalog.py
index 4d94a88736..4e1156c78c 100644
--- a/src/senaite/core/api/catalog.py
+++ b/src/senaite/core/api/catalog.py
@@ -21,6 +21,7 @@
 import re
 
 import six
+from six.moves.urllib.parse import unquote_plus
 
 from bika.lims.api import APIError
 from bika.lims.api import get_tool
@@ -28,7 +29,6 @@
 from Products.CMFPlone.UnicodeSplitter import CaseNormalizer
 from Products.CMFPlone.UnicodeSplitter import Splitter
 from Products.ZCatalog.interfaces import IZCatalog
-from Products.ZCTextIndex.Lexicon import StopWordAndSingleCharRemover
 from Products.ZCTextIndex.ZCTextIndex import PLexicon
 
 
@@ -149,8 +149,7 @@ def add_zc_text_index(catalog, index, lex_id="Lexicon", indexed_attrs=None):
         # create the lexicon first
         splitter = Splitter()
         casenormalizer = CaseNormalizer()
-        stopwordremover = StopWordAndSingleCharRemover()
-        pipeline = [splitter, casenormalizer, stopwordremover]
+        pipeline = [splitter, casenormalizer]
         lexicon = PLexicon(lex_id, "Lexicon", *pipeline)
         catalog._setObject(lex_id, lexicon)
 
@@ -228,6 +227,7 @@ def to_searchable_text_qs(qs, op="AND", wildcard=True):
     :returns: sarchable text string
     """
     OPERATORS = ["AND", "OR"]
+    WILDCARDS = ["*", "?"]
 
     if op not in OPERATORS:
         op = "AND"
@@ -237,6 +237,9 @@ def to_searchable_text_qs(qs, op="AND", wildcard=True):
     def is_op(token):
         return token.upper() in OPERATORS
 
+    def is_wc(char):
+        return char in WILDCARDS
+
     def append_op_after(index, token, tokens):
         # do not append an operator after the last token
         if index == len(tokens) - 1:
@@ -249,10 +252,21 @@ def append_op_after(index, token, tokens):
         return True
 
     # convert to unicode
-    term = safe_unicode(qs)
+    term = unquote_plus(safe_unicode(qs))
+
+    # Wildcards at the beginning are not allowed and therefore removed!
+    first_char = term[0] if len(term) > 0 else ""
+    if is_wc(first_char):
+        term = term.replace(first_char, "", 1)
+
+    # split the string on every character that is not in the allowed set
+    regex = r"[^\w\-\_\.\<\>\+\{\}\:\/\?\$]"
 
-    # splits the string on all non alphanumeric characters
-    tokens = re.split(r"[^\w]", term, flags=re.U | re.I)
+    # allow only word characters when the search term is a single character
+    if len(term) == 1:
+        regex = r"[^\w]"
+
+    tokens = re.split(regex, term, flags=re.U | re.I)
 
     # filter out all empty tokens
     tokens = filter(None, tokens)
@@ -269,13 +283,16 @@ def append_op_after(index, token, tokens):
 
     for num, token in enumerate(tokens):
 
+        # retain wildcards at the end of a token
+        last_token_char = token[-1] if len(token) > 0 else ""
+
         # append operators without changes and continue
         if is_op(token):
             parts.append(token.upper())
             continue
 
         # append wildcard to token
-        if wildcard and not is_op(token):
+        if wildcard and not is_op(token) and not is_wc(last_token_char):
             token = token + "*"
 
         # append the token
diff --git a/src/senaite/core/catalog/indexer/sample.py b/src/senaite/core/catalog/indexer/sample.py
index 299d95bf35..d7ea9e7d84 100644
--- a/src/senaite/core/catalog/indexer/sample.py
+++ b/src/senaite/core/catalog/indexer/sample.py
@@ -21,8 +21,6 @@
 from bika.lims import api
 from bika.lims.interfaces import IAnalysisRequest
 from plone.indexer import indexer
-from senaite.core.catalog import SAMPLE_CATALOG
-from senaite.core.catalog.utils import get_searchable_text_tokens
 from senaite.core.interfaces import ISampleCatalog
 
 
@@ -91,20 +89,30 @@ def is_received(instance):
 
 @indexer(IAnalysisRequest, ISampleCatalog)
 def listing_searchable_text(instance):
-    """Retrieves all the values of metadata columns in the catalog for
-    wildcard searches
-    :return: all metadata values joined in a string
+    """Retrieves most commonly searched values for samples
+
+    :returns: string with search terms
     """
     entries = set()
-    catalog = SAMPLE_CATALOG
 
-    # add searchable text tokens for the root sample
-    tokens = get_searchable_text_tokens(instance, catalog)
-    entries.update(tokens)
+    for obj in [instance] + instance.getDescendants():
+        entries.add(obj.getId())
+        entries.add(obj.getClientOrderNumber())
+        entries.add(obj.getClientReference())
+        entries.add(obj.getClientSampleID())
+
+        # we use this approach to bypass the computed fields
+        client = obj.getClient()
+        entries.add(client.getName())
+        entries.add(client.getClientID())
+
+        sampletype = obj.getSampleType()
+        entries.add(sampletype.Title() if sampletype else '')
+
+        samplepoint = obj.getSamplePoint()
+        entries.add(samplepoint.Title() if samplepoint else '')
 
-    # add searchable text tokens for descendant samples
-    for descendant in instance.getDescendants():
-        tokens = get_searchable_text_tokens(descendant, catalog)
-        entries.update(tokens)
+        batch = obj.getBatch()
+        entries.add(batch.getId() if batch else '')
 
-    return u" ".join(list(entries))
+    return u" ".join(map(api.safe_unicode, entries))
diff --git a/src/senaite/core/profiles/default/metadata.xml b/src/senaite/core/profiles/default/metadata.xml
index 3da64c6b1f..9975121cb2 100644
--- a/src/senaite/core/profiles/default/metadata.xml
+++ b/src/senaite/core/profiles/default/metadata.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0"?>
 <metadata>
-  <version>2500</version>
+  <version>2501</version>
   <dependencies>
     <dependency>profile-Products.ATContentTypes:base</dependency>
     <dependency>profile-Products.CMFEditions:CMFEditions</dependency>
diff --git a/src/senaite/core/tests/doctests/API_catalog.rst b/src/senaite/core/tests/doctests/API_catalog.rst
index 316a262f8f..5c791893fa 100644
--- a/src/senaite/core/tests/doctests/API_catalog.rst
+++ b/src/senaite/core/tests/doctests/API_catalog.rst
@@ -163,6 +163,30 @@ Without wildcard:
     >>> capi.to_searchable_text_qs("sample", wildcard=False)
     u'sample'
 
+Wildcards at the beginning of the search term are not supported and get removed:
+
+    >>> capi.to_searchable_text_qs("?H2O")
+    u'H2O*'
+
+    >>> capi.to_searchable_text_qs("*H2O")
+    u'H2O*'
+
+Wildcards at the end of the search term are retained:
+
+    >>> capi.to_searchable_text_qs("H2O?")
+    u'H2O?'
+
+    >>> capi.to_searchable_text_qs("H2O*")
+    u'H2O*'
+
+If the search contains only a single character, it needs to be a word character:
+
+    >>> capi.to_searchable_text_qs("W")
+    u'W*'
+
+    >>> capi.to_searchable_text_qs("$")
+    u''
+
 Searching for a unicode word:
 
     >>> capi.to_searchable_text_qs("AäOöUüZ")
@@ -176,7 +200,7 @@ Searching for multiple unicode words:
 Searching for a concatenated word:
 
     >>> capi.to_searchable_text_qs("H2O-0001")
-    u'H2O* AND 0001*'
+    u'H2O-0001*'
 
 Searching for two words:
 
@@ -188,38 +212,42 @@ Tricky query strings (with and/or in words or in between):
     >>> capi.to_searchable_text_qs("Fresh and Funky Oranges from Andorra")
     u'Fresh* AND Funky* AND Oranges* AND from* AND Andorra*'
 
-All wildcards are removed and replaced with `*` to avoid parse errors:
+Search with special characters:
+
+    >>> capi.to_searchable_text_qs("H2O_0001")
+    u'H2O_0001*'
 
-    >>> capi.to_searchable_text_qs("Ca? OR Mg?")
-    u'Ca* OR Mg*'
+    >>> capi.to_searchable_text_qs("H2O.0001")
+    u'H2O.0001*'
 
-Search with special characters:
+    >>> capi.to_searchable_text_qs("H2O<>0001")
+    u'H2O<>0001*'
+
+    >>> capi.to_searchable_text_qs("H2O:0001")
+    u'H2O:0001*'
+
+    >>> capi.to_searchable_text_qs("H2O/0001")
+    u'H2O/0001*'
 
     >>> capi.to_searchable_text_qs("'H2O-0001'")
-    u'H2O* AND 0001*'
+    u'H2O-0001*'
 
     >>> capi.to_searchable_text_qs("\'H2O-0001\'")
-    u'H2O* AND 0001*'
+    u'H2O-0001*'
 
     >>> capi.to_searchable_text_qs("(H2O-0001)*")
-    u'H2O* AND 0001*'
+    u'H2O-0001*'
 
     >>> capi.to_searchable_text_qs("****([H2O-0001])****")
-    u'H2O* AND 0001*'
+    u'H2O-0001*'
 
     >>> capi.to_searchable_text_qs("********************")
     u''
 
-    >>> capi.to_searchable_text_qs("????????????????????")
-    u''
-
-    >>> capi.to_searchable_text_qs("?H2O?")
-    u'H2O*'
-
     >>> capi.to_searchable_text_qs("*H2O*")
     u'H2O*'
 
-    >>> capi.to_searchable_text_qs("And the question is: AND OR maybe NOT AND")
+    >>> capi.to_searchable_text_qs("And the question is AND OR maybe NOT AND")
     u'the* AND question* AND is* AND OR maybe* AND NOT*'
 
     >>> capi.to_searchable_text_qs("AND OR")
diff --git a/src/senaite/core/upgrade/v02_05_000.py b/src/senaite/core/upgrade/v02_05_000.py
index 275ff97da7..30290a7241 100644
--- a/src/senaite/core/upgrade/v02_05_000.py
+++ b/src/senaite/core/upgrade/v02_05_000.py
@@ -18,7 +18,12 @@
 # Copyright 2018-2023 by it's authors.
 # Some rights reserved, see README and LICENSE.
 
+from bika.lims import api
 from senaite.core import logger
+from senaite.core.api.catalog import add_index
+from senaite.core.api.catalog import del_index
+from senaite.core.api.catalog import reindex_index
+from senaite.core.catalog import SAMPLE_CATALOG
 from senaite.core.config import PROJECTNAME as product
 from senaite.core.upgrade import upgradestep
 from senaite.core.upgrade.utils import UpgradeUtils
@@ -44,3 +49,19 @@ def upgrade(tool):
 
     logger.info("{0} upgraded to version {1}".format(product, version))
     return True
+
+
+def rebuild_sample_zctext_index_and_lexicon(tool):
+    """Recreate sample listing_searchable_text ZCText index and Lexicon
+    """
+    # remove the existing index
+    index = "listing_searchable_text"
+    del_index(SAMPLE_CATALOG, index)
+    # remove the Lexicon
+    catalog = api.get_tool(SAMPLE_CATALOG)
+    if "Lexicon" in catalog.objectIds():
+        catalog.manage_delObjects("Lexicon")
+    # recreate the index + lexicon
+    add_index(SAMPLE_CATALOG, index, "ZCTextIndex")
+    # reindex
+    reindex_index(SAMPLE_CATALOG, index)
diff --git a/src/senaite/core/upgrade/v02_05_000.zcml b/src/senaite/core/upgrade/v02_05_000.zcml
index 03f96c3355..e5067eac7c 100644
--- a/src/senaite/core/upgrade/v02_05_000.zcml
+++ b/src/senaite/core/upgrade/v02_05_000.zcml
@@ -3,6 +3,14 @@
     xmlns:genericsetup="http://namespaces.zope.org/genericsetup"
     i18n_domain="senaite.core">
 
+  <genericsetup:upgradeStep
+      title="SENAITE.CORE 2.5.0: Recreate listing_searchable_text ZCText index and Lexicon in Sample Catalog"
+      description="Rebuild listing_searchable_text and Lexicon for better performance"
+      source="2500"
+      destination="2501"
+      handler=".v02_05_000.rebuild_sample_zctext_index_and_lexicon"
+      profile="senaite.core:default"/>
+
   <genericsetup:upgradeStep
       title="Upgrade to SENAITE.CORE 2.5.0"
       source="2423"