Skip to content

Commit

Permalink
Improve performance for sample listing index (#2273)
Browse files Browse the repository at this point in the history
* Removed stopword remover from lexicon

* Explicit values for listing_searchable_text index

* Upgrade step added

* Removed unused import

* Index also client ID and Name

* Changelog updated

* Ensure unicodes

* Use getId instead

* Do not split on `-`

* Allow certain characters

* Fixed test

* Allow more special characters

* Do not split on : and /

* Support for wildcards

* URL unquote the searchterm first

* Fix regex

* Percentages are filtered out

* No literals
  • Loading branch information
ramonski authored Mar 13, 2023
1 parent c8e7513 commit 75e40c3
Show file tree
Hide file tree
Showing 7 changed files with 121 additions and 38 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Changelog
2.5.0 (unreleased)
------------------

- #2273 Improve performance for sample listing index
- #2272 Allow to configure the position of additional value columns


Expand Down
31 changes: 24 additions & 7 deletions src/senaite/core/api/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@
import re

import six
from six.moves.urllib.parse import unquote_plus

from bika.lims.api import APIError
from bika.lims.api import get_tool
from bika.lims.api import safe_unicode
from Products.CMFPlone.UnicodeSplitter import CaseNormalizer
from Products.CMFPlone.UnicodeSplitter import Splitter
from Products.ZCatalog.interfaces import IZCatalog
from Products.ZCTextIndex.Lexicon import StopWordAndSingleCharRemover
from Products.ZCTextIndex.ZCTextIndex import PLexicon


Expand Down Expand Up @@ -149,8 +149,7 @@ def add_zc_text_index(catalog, index, lex_id="Lexicon", indexed_attrs=None):
# create the lexicon first
splitter = Splitter()
casenormalizer = CaseNormalizer()
stopwordremover = StopWordAndSingleCharRemover()
pipeline = [splitter, casenormalizer, stopwordremover]
pipeline = [splitter, casenormalizer]
lexicon = PLexicon(lex_id, "Lexicon", *pipeline)
catalog._setObject(lex_id, lexicon)

Expand Down Expand Up @@ -228,6 +227,7 @@ def to_searchable_text_qs(qs, op="AND", wildcard=True):
:returns: searchable text string
"""
OPERATORS = ["AND", "OR"]
WILDCARDS = ["*", "?"]

if op not in OPERATORS:
op = "AND"
Expand All @@ -237,6 +237,9 @@ def to_searchable_text_qs(qs, op="AND", wildcard=True):
def is_op(token):
return token.upper() in OPERATORS

def is_wc(char):
return char in WILDCARDS

def append_op_after(index, token, tokens):
# do not append an operator after the last token
if index == len(tokens) - 1:
Expand All @@ -249,10 +252,21 @@ def append_op_after(index, token, tokens):
return True

# convert to unicode
term = safe_unicode(qs)
term = unquote_plus(safe_unicode(qs))

# Wildcards at the beginning are not allowed and therefore removed!
first_char = term[0] if len(term) > 0 else ""
if is_wc(first_char):
term = term.replace(first_char, "", 1)

# splits the string on all characters that do not match the regex
regex = r"[^\w\-\_\.\<\>\+\{\}\:\/\?\$]"

# splits the string on all non alphanumeric characters
tokens = re.split(r"[^\w]", term, flags=re.U | re.I)
# allow only words when searching just a single character
if len(term) == 1:
regex = r"[^\w]"

tokens = re.split(regex, term, flags=re.U | re.I)

# filter out all empty tokens
tokens = filter(None, tokens)
Expand All @@ -269,13 +283,16 @@ def append_op_after(index, token, tokens):

for num, token in enumerate(tokens):

# retain wildcards at the end of a token
last_token_char = token[-1] if len(token) > 0 else ""

# append operators without changes and continue
if is_op(token):
parts.append(token.upper())
continue

# append wildcard to token
if wildcard and not is_op(token):
if wildcard and not is_op(token) and not is_wc(last_token_char):
token = token + "*"

# append the token
Expand Down
36 changes: 22 additions & 14 deletions src/senaite/core/catalog/indexer/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
from bika.lims import api
from bika.lims.interfaces import IAnalysisRequest
from plone.indexer import indexer
from senaite.core.catalog import SAMPLE_CATALOG
from senaite.core.catalog.utils import get_searchable_text_tokens
from senaite.core.interfaces import ISampleCatalog


Expand Down Expand Up @@ -91,20 +89,30 @@ def is_received(instance):

@indexer(IAnalysisRequest, ISampleCatalog)
def listing_searchable_text(instance):
"""Retrieves all the values of metadata columns in the catalog for
wildcard searches
:return: all metadata values joined in a string
"""Retrieves most commonly searched values for samples
:returns: string with search terms
"""
entries = set()
catalog = SAMPLE_CATALOG

# add searchable text tokens for the root sample
tokens = get_searchable_text_tokens(instance, catalog)
entries.update(tokens)
for obj in [instance] + instance.getDescendants():
entries.add(obj.getId())
entries.add(obj.getClientOrderNumber())
entries.add(obj.getClientReference())
entries.add(obj.getClientSampleID())

# we use this approach to bypass the computed fields
client = obj.getClient()
entries.add(client.getName())
entries.add(client.getClientID())

sampletype = obj.getSampleType()
entries.add(sampletype.Title() if sampletype else '')

samplepoint = obj.getSamplePoint()
entries.add(samplepoint.Title() if samplepoint else '')

# add searchable text tokens for descendant samples
for descendant in instance.getDescendants():
tokens = get_searchable_text_tokens(descendant, catalog)
entries.update(tokens)
batch = obj.getBatch()
entries.add(batch.getId() if batch else '')

return u" ".join(list(entries))
return u" ".join(map(api.safe_unicode, entries))
2 changes: 1 addition & 1 deletion src/senaite/core/profiles/default/metadata.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version="1.0"?>
<metadata>
<version>2500</version>
<version>2501</version>
<dependencies>
<dependency>profile-Products.ATContentTypes:base</dependency>
<dependency>profile-Products.CMFEditions:CMFEditions</dependency>
Expand Down
60 changes: 44 additions & 16 deletions src/senaite/core/tests/doctests/API_catalog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,30 @@ Without wildcard:
>>> capi.to_searchable_text_qs("sample", wildcard=False)
u'sample'

Wildcards at the beginning of the search terms are not supported:

>>> capi.to_searchable_text_qs("?H2O")
u'H2O*'

>>> capi.to_searchable_text_qs("*H2O")
u'H2O*'

Wildcards at the end of the searchterms are retained:

>>> capi.to_searchable_text_qs("H2O?")
u'H2O?'

>>> capi.to_searchable_text_qs("H2O*")
u'H2O*'

If the search contains only a single character, it needs to be a word:

>>> capi.to_searchable_text_qs("W")
u'W*'

>>> capi.to_searchable_text_qs("$")
u''

Searching for a unicode word:

>>> capi.to_searchable_text_qs("AäOöUüZ")
Expand All @@ -176,7 +200,7 @@ Searching for multiple unicode words:
Searching for a concatenated word:

>>> capi.to_searchable_text_qs("H2O-0001")
u'H2O* AND 0001*'
u'H2O-0001*'

Searching for two words:

Expand All @@ -188,38 +212,42 @@ Tricky query strings (with and/or in words or in between):
>>> capi.to_searchable_text_qs("Fresh and Funky Oranges from Andorra")
u'Fresh* AND Funky* AND Oranges* AND from* AND Andorra*'

All wildcards are removed and replaced with `*` to avoid parse errors:
Search with special characters:

>>> capi.to_searchable_text_qs("H2O_0001")
u'H2O_0001*'

>>> capi.to_searchable_text_qs("Ca? OR Mg?")
u'Ca* OR Mg*'
>>> capi.to_searchable_text_qs("H2O.0001")
u'H2O.0001*'

Search with special characters:
>>> capi.to_searchable_text_qs("H2O<>0001")
u'H2O<>0001*'

>>> capi.to_searchable_text_qs("H2O:0001")
u'H2O:0001*'

>>> capi.to_searchable_text_qs("H2O/0001")
u'H2O/0001*'

>>> capi.to_searchable_text_qs("'H2O-0001'")
u'H2O* AND 0001*'
u'H2O-0001*'

>>> capi.to_searchable_text_qs("\'H2O-0001\'")
u'H2O* AND 0001*'
u'H2O-0001*'

>>> capi.to_searchable_text_qs("(H2O-0001)*")
u'H2O* AND 0001*'
u'H2O-0001*'

>>> capi.to_searchable_text_qs("****([H2O-0001])****")
u'H2O* AND 0001*'
u'H2O-0001*'

>>> capi.to_searchable_text_qs("********************")
u''

>>> capi.to_searchable_text_qs("????????????????????")
u''

>>> capi.to_searchable_text_qs("?H2O?")
u'H2O*'

>>> capi.to_searchable_text_qs("*H2O*")
u'H2O*'

>>> capi.to_searchable_text_qs("And the question is: AND OR maybe NOT AND")
>>> capi.to_searchable_text_qs("And the question is AND OR maybe NOT AND")
u'the* AND question* AND is* AND OR maybe* AND NOT*'

>>> capi.to_searchable_text_qs("AND OR")
Expand Down
21 changes: 21 additions & 0 deletions src/senaite/core/upgrade/v02_05_000.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@
# Copyright 2018-2023 by its authors.
# Some rights reserved, see README and LICENSE.

from bika.lims import api
from senaite.core import logger
from senaite.core.api.catalog import add_index
from senaite.core.api.catalog import del_index
from senaite.core.api.catalog import reindex_index
from senaite.core.catalog import SAMPLE_CATALOG
from senaite.core.config import PROJECTNAME as product
from senaite.core.upgrade import upgradestep
from senaite.core.upgrade.utils import UpgradeUtils
Expand All @@ -44,3 +49,19 @@ def upgrade(tool):

logger.info("{0} upgraded to version {1}".format(product, version))
return True


def rebuild_sample_zctext_index_and_lexicon(tool):
    """Recreate sample listing_searchable_text ZCText index and Lexicon

    Drops the existing index and its Lexicon object from the sample
    catalog and builds them again, then reindexes the new index.
    """
    index_id = "listing_searchable_text"
    # drop the old index first, before removing the lexicon it relies on
    del_index(SAMPLE_CATALOG, index_id)
    # delete the stale Lexicon object from the catalog, if it exists
    sample_catalog = api.get_tool(SAMPLE_CATALOG)
    if "Lexicon" in sample_catalog.objectIds():
        sample_catalog.manage_delObjects("Lexicon")
    # re-adding the index creates a fresh Lexicon alongside it
    add_index(SAMPLE_CATALOG, index_id, "ZCTextIndex")
    # rebuild the index contents for all cataloged objects
    reindex_index(SAMPLE_CATALOG, index_id)
8 changes: 8 additions & 0 deletions src/senaite/core/upgrade/v02_05_000.zcml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@
xmlns:genericsetup="http://namespaces.zope.org/genericsetup"
i18n_domain="senaite.core">

<genericsetup:upgradeStep
title="SENAITE.CORE 2.5.0: Recreate listing_searchable_text ZCText index and Lexicon in Sample Catalog"
description="Rebuild listing_searchable_text and Lexicon for better performance"
source="2500"
destination="2501"
handler=".v02_05_000.rebuild_sample_zctext_index_and_lexicon"
profile="senaite.core:default"/>

<genericsetup:upgradeStep
title="Upgrade to SENAITE.CORE 2.5.0"
source="2423"
Expand Down

0 comments on commit 75e40c3

Please sign in to comment.