Skip to content

Commit

Permalink
Merge pull request #121 from ecmwf-projects/copds-1777-fts
Browse files Browse the repository at this point in the history
COPDS-1777: custom ranking on catalogue search
  • Loading branch information
alex75 authored Jul 2, 2024
2 parents b2175c5 + 19c9652 commit 2a88c3e
Show file tree
Hide file tree
Showing 13 changed files with 162 additions and 3 deletions.
46 changes: 46 additions & 0 deletions alembic/versions/63827287c182_fields_for_advanced_fts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""fields for advanced fts.
Revision ID: 63827287c182
Revises: 654a874249a8
Create Date: 2024-06-27 09:34:31.278052
"""

import sqlalchemy as sa
import sqlalchemy_utils

from alembic import op
from cads_catalogue import database

# revision identifiers, used by Alembic.
revision = "63827287c182"
down_revision = "654a874249a8"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Add the fields and the SQL function used by the advanced full-text search.

    Creates on table ``resources``:

    * ``fts``: a stored generated ``tsvector`` column computed from
      ``high_priority_terms``, together with its GIN index ``idx_resources_fts``;
    * ``popularity``: an integer column defaulting to 1.

    Finally installs the rank function defined by
    ``database.add_rank_function_sql``.
    """
    op.add_column(
        "resources",
        sa.Column(
            "fts",
            sqlalchemy_utils.types.ts_vector.TSVectorType(regconfig="english"),
            sa.Computed(
                "to_tsvector('english', coalesce(high_priority_terms, ''))",
                persisted=True,
            ),
        ),
    )
    op.create_index(
        "idx_resources_fts",
        "resources",
        ["fts"],
        postgresql_using="gin",
    )
    # ``default`` is a client-side default applied only on SQLAlchemy inserts:
    # it is not rendered in the ALTER TABLE statement, so rows already present
    # would be left with a NULL popularity. The rank function installed below
    # is declared STRICT, hence a NULL popularity would produce a NULL rank.
    # ``server_default`` makes the database fill existing rows with 1.
    op.add_column(
        "resources",
        sa.Column("popularity", sa.Integer, default=1, server_default="1"),
    )
    op.execute(database.add_rank_function_sql)


def downgrade() -> None:
    """Revert every change applied by ``upgrade``.

    Objects are removed in the reverse order of their creation: the rank
    function, the ``popularity`` column, the ``idx_resources_fts`` index and
    finally the ``fts`` column.
    """
    op.execute(database.drop_rank_function_sql)
    # ``upgrade`` adds this column too: leaving it in place would make a
    # subsequent upgrade fail because the column already exists.
    op.drop_column("resources", "popularity")
    # dropped explicitly to mirror ``upgrade``
    op.drop_index("idx_resources_fts", table_name="resources")
    op.drop_column("resources", "fts")
40 changes: 39 additions & 1 deletion cads_catalogue/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,27 @@
BaseModel = sa.orm.declarative_base(metadata=metadata)


# SQL statement creating (or replacing) the PostgreSQL function ``ts_rank2``,
# the custom rank used by the catalogue search.
# The function returns ``ts_rank(v2, q) * n * 10 + ts_rank(w, v1, q)``: the rank
# of query ``q`` against ``v2`` is scaled by ``n`` and by a fixed factor 10,
# then added to the rank of ``q`` against ``v1`` computed with weights ``w``.
# It is declared STRICT, so it yields NULL as soon as one argument is NULL.
# NOTE(review): presumably ``v1`` is ``resources.search_field``, ``v2`` is
# ``resources.fts`` and ``n`` is ``resources.popularity`` - confirm against
# the caller.
add_rank_function_sql = """
CREATE OR REPLACE FUNCTION ts_rank2(w real[], v1 tsvector, v2 tsvector, q tsquery, n integer) RETURNS real
LANGUAGE plpgsql
IMMUTABLE PARALLEL SAFE STRICT
AS $function$
DECLARE
original_rank REAL;
htp_rank REAL;
BEGIN
SELECT INTO original_rank ts_rank(w,v1,q);
SELECT INTO htp_rank ts_rank(v2,q);
RETURN htp_rank*n*10 + original_rank;
END;
$function$;
"""

# SQL statement removing the ``ts_rank2`` function; the argument list must stay
# aligned with the signature declared in ``add_rank_function_sql``.
# There is no ``IF EXISTS`` clause, so the statement fails when the function
# is not defined.
drop_rank_function_sql = """
DROP FUNCTION ts_rank2(w real[], v1 tsvector, v2 tsvector, q tsquery, n integer);
"""


class CatalogueUpdate(BaseModel):
"""Catalogue manager update information ORM model."""

Expand Down Expand Up @@ -232,6 +253,7 @@ class Resource(BaseModel):
# fulltextsearch-related
fulltext = sa.Column(sa.String)
high_priority_terms = sa.Column(sa.String)
popularity = sa.Column(sa.Integer, default=1)
search_field: str = sa.Column(
sqlalchemy_utils.types.ts_vector.TSVectorType(regconfig="english"),
sa.Computed(
Expand All @@ -242,7 +264,13 @@ class Resource(BaseModel):
persisted=True,
),
)

fts: str = sa.Column(
sqlalchemy_utils.types.ts_vector.TSVectorType(regconfig="english"),
sa.Computed(
"to_tsvector('english', coalesce(high_priority_terms, ''))",
persisted=True,
),
)
# relationship attributes
resource_data = sa.orm.relationship(
ResourceData, uselist=False, back_populates="resource", lazy="select"
Expand Down Expand Up @@ -274,6 +302,7 @@ class Resource(BaseModel):

__table_args__ = (
sa.Index("idx_resources_search_field", search_field, postgresql_using="gin"),
sa.Index("idx_resources_fts", fts, postgresql_using="gin"),
)


Expand Down Expand Up @@ -332,6 +361,13 @@ def ensure_session_obj(read_only: bool = False) -> sa.orm.sessionmaker:
return session_obj


def create_catalogue_functions(engine):
    """Install the custom SQL functions of the catalogue database.

    Executes ``add_rank_function_sql`` on a connection obtained from
    ``engine`` and commits it.
    """
    statement = sa.text(add_rank_function_sql)
    with engine.connect() as connection:
        connection.execute(statement)
        connection.commit()


def init_database(connection_string: str, force: bool = False) -> sa.engine.Engine:
"""Make sure the db located at URI `connection_string` exists updated and return the engine object.
Expand All @@ -358,6 +394,7 @@ def init_database(connection_string: str, force: bool = False) -> sa.engine.Engi
# cleanup and create the schema
BaseModel.metadata.drop_all(engine)
BaseModel.metadata.create_all(engine)
create_catalogue_functions(engine)
alembic.command.stamp(alembic_cfg, "head")
else:
# check the structure is empty or incomplete
Expand All @@ -372,6 +409,7 @@ def init_database(connection_string: str, force: bool = False) -> sa.engine.Engi
# NOTE: tables no longer in metadata are not removed with drop_all
BaseModel.metadata.drop_all(engine)
BaseModel.metadata.create_all(engine)
create_catalogue_functions(engine)
alembic.command.stamp(alembic_cfg, "head")
else:
# update db structure
Expand Down
1 change: 1 addition & 0 deletions cads_catalogue/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ def load_resource_metadata_file(folder_path: str | pathlib.Path) -> dict[str, An
metadata["licence_uids"] = data.get("licences", [])

metadata["lineage"] = data.get("lineage")
metadata["popularity"] = data.get("popularity", 1)
default_public_date = "2017-01-01"
metadata["publication_date"] = data.get("publication_date")
if not metadata["publication_date"]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
"Variable domain: Land (biosphere)",
"Provider: Copernicus C3S"
],
"popularity": 500,
"qos_tags": ["tag1", "tag2", "tag3"],
"api_enforce_constraints": true,
"title": "ERA5-Land hourly data from 1950 to present",
Expand Down
4 changes: 4 additions & 0 deletions tests/data/dumped_resources1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
{
"resource_id": 1,
"resource_uid": "reanalysis-era5-land",
"popularity": 500,
"constraints": "an url",
"form": "an url for form.json",
"layout": "an url for layout.json",
Expand Down Expand Up @@ -55,11 +56,13 @@
"variables": [],
"fulltext": null,
"high_priority_terms": "reanalysis ERA5 land",
"fts": "'era5':2 'land':3 'reanalysi':1",
"search_field": "'1950':11A 'accur':92B 'across':65B 'back':87B 'climat':56B,96B 'combin':59B 'compar':38B 'complet':71B 'compon':51B 'consist':23B,73B 'data':9A,61B,82B 'dataset':20B,74B 'decad':33B,86B 'descript':93B 'ecmwf':54B 'enhanc':36B 'era5':3A,6A,15B,40B,42B,55B,101 'era5-land':5A,14B,41B 'evolut':27B 'global':70B 'goe':84B 'hour':8A 'land':4A,7A,16B,29B,43B,50B,102 'law':77B 'model':60B 'observ':63B 'past':99B 'physic':79B 'present':13A 'produc':46B,81B 'provid':21B,90B 'reanalysi':2A,19B,57B,58B,80B,100 'reanalysis-era5-land':1A 'replay':48B 'resolut':37B 'sever':32B,85B 'time':89B 'use':75B 'variabl':30B 'view':24B 'world':67B"
},
{
"resource_id": 2,
"resource_uid": "reanalysis-era5-land-monthly-means",
"popularity": 1,
"constraints": "an url",
"form": "an url for form.json",
"layout": "an url for layout.json",
Expand Down Expand Up @@ -109,6 +112,7 @@
"variables": [],
"fulltext": "climate reanalysis past land era5 hydrology physics biosphere copernicus c3s conditions variables monthly means",
"high_priority_terms": "",
"fts": "",
"search_field": "'1950':14A 'accur':95B 'across':68B 'averag':11A 'back':90B 'biospher':110C 'c3s':112C 'climat':59B,99B,103C 'combin':62B 'compar':41B 'complet':74B 'compon':54B 'condit':113C 'consist':26B,76B 'copernicus':111C 'data':12A,64B,85B 'dataset':23B,77B 'decad':36B,89B 'descript':96B 'ecmwf':57B 'enhanc':39B 'era5':3A,8A,18B,43B,45B,58B,107C 'era5-land':7A,17B,44B 'evolut':30B 'global':73B 'goe':87B 'hydrolog':108C 'land':4A,9A,19B,32B,46B,53B,106C 'law':80B 'mean':6A,116C 'model':63B 'month':5A,10A,115C 'observ':66B 'past':102B,105C 'physic':82B,109C 'present':16A 'produc':49B,84B 'provid':24B,93B 'reanalysi':2A,22B,60B,61B,83B,104C 'reanalysis-era5-land-monthly-means':1A 'replay':51B 'resolut':40B 'sever':35B,88B 'time':92B 'use':78B 'variabl':33B,114C 'view':27B 'world':70B"
}
]
4 changes: 4 additions & 0 deletions tests/data/dumped_resources2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
{
"resource_id": 1,
"resource_uid": "reanalysis-era5-land",
"popularity": 500,
"constraints": "an url",
"api_enforce_constraints": true,
"form": "an url for form.json",
Expand Down Expand Up @@ -55,11 +56,13 @@
"variables": [],
"fulltext": null,
"high_priority_terms": "reanalysis ERA5 land",
"fts": "'era5':2 'land':3 'reanalysi':1",
"search_field": "'1950':11A 'accur':92B 'across':65B 'back':87B 'climat':56B,96B 'combin':59B 'compar':38B 'complet':71B 'compon':51B 'consist':23B,73B 'data':9A,61B,82B 'dataset':20B,74B 'decad':33B,86B 'descript':93B 'ecmwf':54B 'enhanc':36B 'era5':3A,6A,15B,40B,42B,55B,101 'era5-land':5A,14B,41B 'evolut':27B 'global':70B 'goe':84B 'hour':8A 'land':4A,7A,16B,29B,43B,50B,102 'law':77B 'model':60B 'observ':63B 'past':99B 'physic':79B 'present':13A 'produc':46B,81B 'provid':21B,90B 'reanalysi':2A,19B,57B,58B,80B,100 'reanalysis-era5-land':1A 'replay':48B 'resolut':37B 'sever':32B,85B 'time':89B 'use':75B 'variabl':30B 'view':24B 'world':67B"
},
{
"resource_id": 2,
"resource_uid": "reanalysis-era5-land-monthly-means",
"popularity": 1,
"constraints": "an url",
"form": "a new url for form.json",
"layout": "a new url for layout.json",
Expand Down Expand Up @@ -109,6 +112,7 @@
"variables": [],
"fulltext": "climate reanalysis past land era5 hydrology physics biosphere copernicus c3s conditions variables monthly means",
"high_priority_terms": "",
"fts": "",
"search_field": "'1950':14A 'accur':95B 'across':68B 'averag':11A 'back':90B 'biospher':110C 'c3s':112C 'climat':59B,99B,103C 'combin':62B 'compar':41B 'complet':74B 'compon':54B 'condit':113C 'consist':26B,76B 'copernicus':111C 'data':12A,64B,85B 'dataset':23B,77B 'decad':36B,89B 'descript':96B 'ecmwf':57B 'enhanc':39B 'era5':3A,8A,18B,43B,45B,58B,107C 'era5-land':7A,17B,44B 'evolut':30B 'global':73B 'goe':87B 'hydrolog':108C 'land':4A,9A,19B,32B,46B,53B,106C 'law':80B 'mean':6A,116C 'model':63B 'month':5A,10A,115C 'observ':66B 'past':102B,105C 'physic':82B,109C 'present':16A 'produc':49B,84B 'provid':24B,93B 'reanalysi':2A,22B,60B,61B,83B,104C 'reanalysis-era5-land-monthly-means':1A 'replay':51B 'resolut':40B 'sever':35B,88B 'time':92B 'use':78B 'variabl':33B,114C 'view':27B 'world':70B"
}
]
2 changes: 2 additions & 0 deletions tests/data/dumped_resources3.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
{
"resource_id": 1,
"resource_uid": "reanalysis-era5-land",
"popularity": 500,
"api_enforce_constraints": true,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -36,6 +37,7 @@
"format_version": null,
"hidden": false,
"high_priority_terms": "reanalysis ERA5 land",
"fts": "'era5':2 'land':3 'reanalysi':1",
"lineage": "EC Copernicus program",
"representative_fraction": 0.25,
"responsible_organisation": "ECMWF",
Expand Down
14 changes: 14 additions & 0 deletions tests/data/dumped_resources4.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
{
"resource_id": 4,
"resource_uid": "cams-global-reanalysis-eac4",
"popularity": 1,
"api_enforce_constraints": false,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -103,6 +104,7 @@
"format_version": "1",
"hidden": false,
"high_priority_terms": "",
"fts": "",
"lineage": "Copernicus Atmospheric Monitoring Service",
"representative_fraction": null,
"responsible_organisation": "ECMWF",
Expand Down Expand Up @@ -899,6 +901,7 @@
{
"resource_id": 3,
"resource_uid": "cams-global-reanalysis-eac4-monthly",
"popularity": 1,
"api_enforce_constraints": false,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -1000,6 +1003,7 @@
"format_version": "1",
"hidden": false,
"high_priority_terms": "",
"fts": "",
"lineage": "Copernicus Atmospheric Monitoring Service",
"representative_fraction": null,
"responsible_organisation": "ECMWF",
Expand Down Expand Up @@ -1426,6 +1430,7 @@
{
"resource_id": 5,
"resource_uid": "derived-near-surface-meteorological-variables",
"popularity": 1,
"api_enforce_constraints": false,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -1460,6 +1465,7 @@
"format_version": null,
"hidden": false,
"high_priority_terms": "",
"fts": "",
"lineage": "EC Copernicus program",
"representative_fraction": 0.25,
"responsible_organisation": "ECMWF",
Expand All @@ -1480,6 +1486,7 @@
{
"resource_id": 1,
"resource_uid": "reanalysis-era5-land",
"popularity": 500,
"api_enforce_constraints": true,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -1514,6 +1521,7 @@
"format_version": null,
"hidden": false,
"high_priority_terms": "reanalysis ERA5 land",
"fts": "'era5':2 'land':3 'reanalysi':1",
"lineage": "EC Copernicus program",
"representative_fraction": 0.25,
"responsible_organisation": "ECMWF",
Expand All @@ -1538,6 +1546,7 @@
{
"resource_id": 6,
"resource_uid": "reanalysis-era5-land-monthly-means",
"popularity": 1,
"api_enforce_constraints": false,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -1572,6 +1581,7 @@
"format_version": null,
"hidden": false,
"high_priority_terms": "",
"fts": "",
"lineage": "EC Copernicus program",
"representative_fraction": 0.25,
"responsible_organisation": "ECMWF",
Expand All @@ -1592,6 +1602,7 @@
{
"resource_id": 7,
"resource_uid": "reanalysis-era5-pressure-levels",
"popularity": 1,
"api_enforce_constraints": false,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -1626,6 +1637,7 @@
"format_version": null,
"hidden": false,
"high_priority_terms": "",
"fts": "",
"lineage": "EC Copernicus program",
"representative_fraction": 0.25,
"responsible_organisation": "ECMWF",
Expand All @@ -1646,6 +1658,7 @@
{
"resource_id": 8,
"resource_uid": "satellite-surface-radiation-budget",
"popularity": 1,
"api_enforce_constraints": false,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -1680,6 +1693,7 @@
"format_version": "3",
"hidden": false,
"high_priority_terms": "",
"fts": "",
"lineage": "EC Copernicus program",
"representative_fraction": 0.25,
"responsible_organisation": "ECMWF",
Expand Down
Loading

0 comments on commit 2a88c3e

Please sign in to comment.