Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance for sample listing index #2273

Merged
merged 18 commits into from
Mar 13, 2023
Merged
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Changelog
2.5.0 (unreleased)
------------------

- #2273 Improve performance for sample listing index
- #2272 Allow configuring the position of additional value columns


Expand Down
31 changes: 24 additions & 7 deletions src/senaite/core/api/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@
import re

import six
from six.moves.urllib.parse import unquote_plus

from bika.lims.api import APIError
from bika.lims.api import get_tool
from bika.lims.api import safe_unicode
from Products.CMFPlone.UnicodeSplitter import CaseNormalizer
from Products.CMFPlone.UnicodeSplitter import Splitter
from Products.ZCatalog.interfaces import IZCatalog
from Products.ZCTextIndex.Lexicon import StopWordAndSingleCharRemover
from Products.ZCTextIndex.ZCTextIndex import PLexicon


Expand Down Expand Up @@ -149,8 +149,7 @@ def add_zc_text_index(catalog, index, lex_id="Lexicon", indexed_attrs=None):
# create the lexicon first
splitter = Splitter()
casenormalizer = CaseNormalizer()
stopwordremover = StopWordAndSingleCharRemover()
pipeline = [splitter, casenormalizer, stopwordremover]
pipeline = [splitter, casenormalizer]
lexicon = PLexicon(lex_id, "Lexicon", *pipeline)
catalog._setObject(lex_id, lexicon)

Expand Down Expand Up @@ -228,6 +227,7 @@ def to_searchable_text_qs(qs, op="AND", wildcard=True):
:returns: searchable text string
"""
OPERATORS = ["AND", "OR"]
WILDCARDS = ["*", "?"]

if op not in OPERATORS:
op = "AND"
Expand All @@ -237,6 +237,9 @@ def to_searchable_text_qs(qs, op="AND", wildcard=True):
def is_op(token):
return token.upper() in OPERATORS

def is_wc(char):
return char in WILDCARDS

def append_op_after(index, token, tokens):
# do not append an operator after the last token
if index == len(tokens) - 1:
Expand All @@ -249,10 +252,21 @@ def append_op_after(index, token, tokens):
return True

# convert to unicode
term = safe_unicode(qs)
term = unquote_plus(safe_unicode(qs))

# Wildcards at the beginning are not allowed and therefore removed!
first_char = term[0] if len(term) > 0 else ""
if is_wc(first_char):
term = term.replace(first_char, "", 1)

# splits the string on every character that is neither a word character
# nor one of the explicitly allowed ones: - _ . < > + { } : / ? $
regex = r"[^\w\-\_\.\<\>\+\{\}\:\/\?\$]"

# splits the string on all non alphanumeric characters
tokens = re.split(r"[^\w]", term, flags=re.U | re.I)
# allow only words when searching just a single character
if len(term) == 1:
regex = r"[^\w]"

tokens = re.split(regex, term, flags=re.U | re.I)

# filter out all empty tokens
tokens = filter(None, tokens)
Expand All @@ -269,13 +283,16 @@ def append_op_after(index, token, tokens):

for num, token in enumerate(tokens):

# retain wildcards at the end of a token
last_token_char = token[-1] if len(token) > 0 else ""

# append operators without changes and continue
if is_op(token):
parts.append(token.upper())
continue

# append wildcard to token
if wildcard and not is_op(token):
if wildcard and not is_op(token) and not is_wc(last_token_char):
token = token + "*"

# append the token
Expand Down
36 changes: 22 additions & 14 deletions src/senaite/core/catalog/indexer/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
from bika.lims import api
from bika.lims.interfaces import IAnalysisRequest
from plone.indexer import indexer
from senaite.core.catalog import SAMPLE_CATALOG
from senaite.core.catalog.utils import get_searchable_text_tokens
from senaite.core.interfaces import ISampleCatalog


Expand Down Expand Up @@ -91,20 +89,30 @@ def is_received(instance):

@indexer(IAnalysisRequest, ISampleCatalog)
def listing_searchable_text(instance):
"""Retrieves all the values of metadata columns in the catalog for
wildcard searches
:return: all metadata values joined in a string
"""Retrieves most commonly searched values for samples

:returns: string with search terms
"""
entries = set()
catalog = SAMPLE_CATALOG

# add searchable text tokens for the root sample
tokens = get_searchable_text_tokens(instance, catalog)
entries.update(tokens)
for obj in [instance] + instance.getDescendants():
entries.add(obj.getId())
entries.add(obj.getClientOrderNumber())
entries.add(obj.getClientReference())
entries.add(obj.getClientSampleID())

# we use this approach to bypass the computed fields
client = obj.getClient()
entries.add(client.getName())
entries.add(client.getClientID())

sampletype = obj.getSampleType()
entries.add(sampletype.Title() if sampletype else '')

samplepoint = obj.getSamplePoint()
entries.add(samplepoint.Title() if samplepoint else '')

# add searchable text tokens for descendant samples
for descendant in instance.getDescendants():
tokens = get_searchable_text_tokens(descendant, catalog)
entries.update(tokens)
batch = obj.getBatch()
entries.add(batch.getId() if batch else '')

return u" ".join(list(entries))
return u" ".join(map(api.safe_unicode, entries))
2 changes: 1 addition & 1 deletion src/senaite/core/profiles/default/metadata.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version="1.0"?>
<metadata>
<version>2500</version>
<version>2501</version>
<dependencies>
<dependency>profile-Products.ATContentTypes:base</dependency>
<dependency>profile-Products.CMFEditions:CMFEditions</dependency>
Expand Down
60 changes: 44 additions & 16 deletions src/senaite/core/tests/doctests/API_catalog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,30 @@ Without wildcard:
>>> capi.to_searchable_text_qs("sample", wildcard=False)
u'sample'

Wildcards at the beginning of the search term are not supported and get removed:

>>> capi.to_searchable_text_qs("?H2O")
u'H2O*'

>>> capi.to_searchable_text_qs("*H2O")
u'H2O*'

Wildcards at the end of the search term are retained:

>>> capi.to_searchable_text_qs("H2O?")
u'H2O?'

>>> capi.to_searchable_text_qs("H2O*")
u'H2O*'

If the search term consists of a single character, it must be a word character:

>>> capi.to_searchable_text_qs("W")
u'W*'

>>> capi.to_searchable_text_qs("$")
u''

Searching for a unicode word:

>>> capi.to_searchable_text_qs("AäOöUüZ")
Expand All @@ -176,7 +200,7 @@ Searching for multiple unicode words:
Searching for a concatenated word:

>>> capi.to_searchable_text_qs("H2O-0001")
u'H2O* AND 0001*'
u'H2O-0001*'

Searching for two words:

Expand All @@ -188,38 +212,42 @@ Tricky query strings (with and/or in words or in between):
>>> capi.to_searchable_text_qs("Fresh and Funky Oranges from Andorra")
u'Fresh* AND Funky* AND Oranges* AND from* AND Andorra*'

All wildcards are removed and replaced with `*` to avoid parse errors:
Search with special characters:

>>> capi.to_searchable_text_qs("H2O_0001")
u'H2O_0001*'

>>> capi.to_searchable_text_qs("Ca? OR Mg?")
u'Ca* OR Mg*'
>>> capi.to_searchable_text_qs("H2O.0001")
u'H2O.0001*'

Search with special characters:
>>> capi.to_searchable_text_qs("H2O<>0001")
u'H2O<>0001*'

>>> capi.to_searchable_text_qs("H2O:0001")
u'H2O:0001*'

>>> capi.to_searchable_text_qs("H2O/0001")
u'H2O/0001*'

>>> capi.to_searchable_text_qs("'H2O-0001'")
u'H2O* AND 0001*'
u'H2O-0001*'

>>> capi.to_searchable_text_qs("\'H2O-0001\'")
u'H2O* AND 0001*'
u'H2O-0001*'

>>> capi.to_searchable_text_qs("(H2O-0001)*")
u'H2O* AND 0001*'
u'H2O-0001*'

>>> capi.to_searchable_text_qs("****([H2O-0001])****")
u'H2O* AND 0001*'
u'H2O-0001*'

>>> capi.to_searchable_text_qs("********************")
u''

>>> capi.to_searchable_text_qs("????????????????????")
u''

>>> capi.to_searchable_text_qs("?H2O?")
u'H2O*'

>>> capi.to_searchable_text_qs("*H2O*")
u'H2O*'

>>> capi.to_searchable_text_qs("And the question is: AND OR maybe NOT AND")
>>> capi.to_searchable_text_qs("And the question is AND OR maybe NOT AND")
u'the* AND question* AND is* AND OR maybe* AND NOT*'

>>> capi.to_searchable_text_qs("AND OR")
Expand Down
21 changes: 21 additions & 0 deletions src/senaite/core/upgrade/v02_05_000.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@
# Copyright 2018-2023 by its authors.
# Some rights reserved, see README and LICENSE.

from bika.lims import api
from senaite.core import logger
from senaite.core.api.catalog import add_index
from senaite.core.api.catalog import del_index
from senaite.core.api.catalog import reindex_index
from senaite.core.catalog import SAMPLE_CATALOG
from senaite.core.config import PROJECTNAME as product
from senaite.core.upgrade import upgradestep
from senaite.core.upgrade.utils import UpgradeUtils
Expand All @@ -44,3 +49,19 @@ def upgrade(tool):

logger.info("{0} upgraded to version {1}".format(product, version))
return True


def rebuild_sample_zctext_index_and_lexicon(tool):
    """Drop and rebuild the sample catalog's text index and its Lexicon

    The `listing_searchable_text` index is deleted from the sample catalog,
    the catalog's "Lexicon" object is removed if one exists, and the index is
    then added again as a ZCTextIndex and reindexed.

    :param tool: argument passed in by the upgrade machinery; not used here
    """
    index_name = "listing_searchable_text"
    lexicon_id = "Lexicon"

    # drop the current index before touching the lexicon
    del_index(SAMPLE_CATALOG, index_name)

    # drop the lexicon, but only when the catalog actually contains one
    sample_catalog = api.get_tool(SAMPLE_CATALOG)
    has_lexicon = lexicon_id in sample_catalog.objectIds()
    if has_lexicon:
        sample_catalog.manage_delObjects(lexicon_id)

    # add the index again (expected to set up a fresh lexicon as well)
    add_index(SAMPLE_CATALOG, index_name, "ZCTextIndex")

    # populate the freshly created index
    reindex_index(SAMPLE_CATALOG, index_name)
8 changes: 8 additions & 0 deletions src/senaite/core/upgrade/v02_05_000.zcml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@
xmlns:genericsetup="http://namespaces.zope.org/genericsetup"
i18n_domain="senaite.core">

<genericsetup:upgradeStep
title="SENAITE.CORE 2.5.0: Recreate listing_searchable_text ZCText index and Lexicon in Sample Catalog"
description="Rebuild listing_searchable_text and Lexicon for better performance"
source="2500"
destination="2501"
handler=".v02_05_000.rebuild_sample_zctext_index_and_lexicon"
profile="senaite.core:default"/>

<genericsetup:upgradeStep
title="Upgrade to SENAITE.CORE 2.5.0"
source="2423"
Expand Down