Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

COPDS-1777: custom ranking on catalogue search #121

Merged
merged 5 commits into from
Jul 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions alembic/versions/63827287c182_fields_for_advanced_fts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""fields for advanced fts.

Revision ID: 63827287c182
Revises: 654a874249a8
Create Date: 2024-06-27 09:34:31.278052

"""

import sqlalchemy as sa
import sqlalchemy_utils

from alembic import op
from cads_catalogue import database

# revision identifiers, used by Alembic.
revision = "63827287c182"
down_revision = "654a874249a8"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Add the advanced full-text-search fields to the ``resources`` table.

    Steps, in order:

    * add ``fts``, a persisted computed tsvector built from ``high_priority_terms``;
    * index ``fts`` with a GIN index named ``idx_resources_fts``;
    * add the integer ``popularity`` column and backfill existing rows with 1;
    * run ``database.add_rank_function_sql`` to install the ranking function.
    """
    op.add_column(
        "resources",
        sa.Column(
            "fts",
            sqlalchemy_utils.types.ts_vector.TSVectorType(regconfig="english"),
            sa.Computed(
                "to_tsvector('english', coalesce(high_priority_terms, ''))",
                persisted=True,
            ),
        ),
    )
    op.create_index(
        "idx_resources_fts",
        "resources",
        ["fts"],
        postgresql_using="gin",
    )
    # ``default=1`` is a client-side default only: it is not part of the
    # emitted DDL, so rows that already exist get NULL in the new column.
    op.add_column("resources", sa.Column("popularity", sa.Integer, default=1))
    # Backfill existing rows so they carry the same value new rows receive.
    # The ranking function is declared STRICT, so a NULL popularity would
    # make the whole rank NULL for those rows.
    op.execute("UPDATE resources SET popularity = 1 WHERE popularity IS NULL")
    op.execute(database.add_rank_function_sql)


def downgrade() -> None:
    """Revert everything ``upgrade`` created on the ``resources`` table.

    Drops the ``fts`` and ``popularity`` columns and then runs
    ``database.drop_rank_function_sql`` to remove the ranking function.
    """
    # PostgreSQL drops indexes that involve a dropped column, so
    # ``idx_resources_fts`` goes away together with ``fts``.
    op.drop_column("resources", "fts")
    # ``popularity`` is added by upgrade(); without this drop the downgrade is
    # incomplete and a subsequent upgrade fails on the already-existing column.
    op.drop_column("resources", "popularity")
    op.execute(database.drop_rank_function_sql)
40 changes: 39 additions & 1 deletion cads_catalogue/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,27 @@
BaseModel = sa.orm.declarative_base(metadata=metadata)


# SQL statement that creates (or replaces) the custom ranking function
# ``ts_rank2(w, v1, v2, q, n)``.
#   * ``original_rank`` is ``ts_rank(w, v1, q)``: weighted rank of ``q`` on ``v1``.
#   * ``htp_rank`` is ``ts_rank(v2, q)``: unweighted rank of ``q`` on ``v2``.
#   * the result is ``htp_rank * n * 10 + original_rank``.
# The function is declared STRICT, so it returns NULL when any argument is NULL.
# NOTE(review): callers presumably pass the resource popularity as ``n`` and the
# high-priority-terms vector as ``v2`` -- confirm against the search query code.
add_rank_function_sql = """
CREATE OR REPLACE FUNCTION ts_rank2(w real[], v1 tsvector, v2 tsvector, q tsquery, n integer) RETURNS real
LANGUAGE plpgsql
IMMUTABLE PARALLEL SAFE STRICT
AS $function$
DECLARE
original_rank REAL;
htp_rank REAL;
BEGIN
SELECT INTO original_rank ts_rank(w,v1,q);
SELECT INTO htp_rank ts_rank(v2,q);
RETURN htp_rank*n*10 + original_rank;
END;
$function$;
"""

# SQL statement that removes the ``ts_rank2`` function; the argument list
# matches the signature used in the CREATE statement of the same function.
# There is no ``IF EXISTS`` clause, so executing it fails when the function
# is not present.
drop_rank_function_sql = """
DROP FUNCTION ts_rank2(w real[], v1 tsvector, v2 tsvector, q tsquery, n integer);
"""


class CatalogueUpdate(BaseModel):
"""Catalogue manager update information ORM model."""

Expand Down Expand Up @@ -232,6 +253,7 @@ class Resource(BaseModel):
# fulltextsearch-related
fulltext = sa.Column(sa.String)
high_priority_terms = sa.Column(sa.String)
popularity = sa.Column(sa.Integer, default=1)
search_field: str = sa.Column(
sqlalchemy_utils.types.ts_vector.TSVectorType(regconfig="english"),
sa.Computed(
Expand All @@ -242,7 +264,13 @@ class Resource(BaseModel):
persisted=True,
),
)

fts: str = sa.Column(
sqlalchemy_utils.types.ts_vector.TSVectorType(regconfig="english"),
sa.Computed(
"to_tsvector('english', coalesce(high_priority_terms, ''))",
persisted=True,
),
)
# relationship attributes
resource_data = sa.orm.relationship(
ResourceData, uselist=False, back_populates="resource", lazy="select"
Expand Down Expand Up @@ -274,6 +302,7 @@ class Resource(BaseModel):

__table_args__ = (
sa.Index("idx_resources_search_field", search_field, postgresql_using="gin"),
sa.Index("idx_resources_fts", fts, postgresql_using="gin"),
)


Expand Down Expand Up @@ -332,6 +361,13 @@ def ensure_session_obj(read_only: bool = False) -> sa.orm.sessionmaker:
return session_obj


def create_catalogue_functions(engine):
    """Install the custom SQL functions used by the catalogue database.

    Executes the ``add_rank_function_sql`` statement on a connection obtained
    from ``engine`` and commits it.
    """
    statement = sa.text(add_rank_function_sql)
    with engine.connect() as connection:
        connection.execute(statement)
        connection.commit()


def init_database(connection_string: str, force: bool = False) -> sa.engine.Engine:
"""Make sure the db located at URI `connection_string` exists updated and return the engine object.

Expand All @@ -358,6 +394,7 @@ def init_database(connection_string: str, force: bool = False) -> sa.engine.Engi
# cleanup and create the schema
BaseModel.metadata.drop_all(engine)
BaseModel.metadata.create_all(engine)
create_catalogue_functions(engine)
alembic.command.stamp(alembic_cfg, "head")
else:
# check the structure is empty or incomplete
Expand All @@ -372,6 +409,7 @@ def init_database(connection_string: str, force: bool = False) -> sa.engine.Engi
# NOTE: tables no more in metadata are not removed with drop_all
BaseModel.metadata.drop_all(engine)
BaseModel.metadata.create_all(engine)
create_catalogue_functions(engine)
alembic.command.stamp(alembic_cfg, "head")
else:
# update db structure
Expand Down
1 change: 1 addition & 0 deletions cads_catalogue/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ def load_resource_metadata_file(folder_path: str | pathlib.Path) -> dict[str, An
metadata["licence_uids"] = data.get("licences", [])

metadata["lineage"] = data.get("lineage")
metadata["popularity"] = data.get("popularity", 1)
default_public_date = "2017-01-01"
metadata["publication_date"] = data.get("publication_date")
if not metadata["publication_date"]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
"Variable domain: Land (biosphere)",
"Provider: Copernicus C3S"
],
"popularity": 500,
"qos_tags": ["tag1", "tag2", "tag3"],
"api_enforce_constraints": true,
"title": "ERA5-Land hourly data from 1950 to present",
Expand Down
4 changes: 4 additions & 0 deletions tests/data/dumped_resources1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
{
"resource_id": 1,
"resource_uid": "reanalysis-era5-land",
"popularity": 500,
"constraints": "an url",
"form": "an url for form.json",
"layout": "an url for layout.json",
Expand Down Expand Up @@ -55,11 +56,13 @@
"variables": [],
"fulltext": null,
"high_priority_terms": "reanalysis ERA5 land",
"fts": "'era5':2 'land':3 'reanalysi':1",
"search_field": "'1950':11A 'accur':92B 'across':65B 'back':87B 'climat':56B,96B 'combin':59B 'compar':38B 'complet':71B 'compon':51B 'consist':23B,73B 'data':9A,61B,82B 'dataset':20B,74B 'decad':33B,86B 'descript':93B 'ecmwf':54B 'enhanc':36B 'era5':3A,6A,15B,40B,42B,55B,101 'era5-land':5A,14B,41B 'evolut':27B 'global':70B 'goe':84B 'hour':8A 'land':4A,7A,16B,29B,43B,50B,102 'law':77B 'model':60B 'observ':63B 'past':99B 'physic':79B 'present':13A 'produc':46B,81B 'provid':21B,90B 'reanalysi':2A,19B,57B,58B,80B,100 'reanalysis-era5-land':1A 'replay':48B 'resolut':37B 'sever':32B,85B 'time':89B 'use':75B 'variabl':30B 'view':24B 'world':67B"
},
{
"resource_id": 2,
"resource_uid": "reanalysis-era5-land-monthly-means",
"popularity": 1,
"constraints": "an url",
"form": "an url for form.json",
"layout": "an url for layout.json",
Expand Down Expand Up @@ -109,6 +112,7 @@
"variables": [],
"fulltext": "climate reanalysis past land era5 hydrology physics biosphere copernicus c3s conditions variables monthly means",
"high_priority_terms": "",
"fts": "",
"search_field": "'1950':14A 'accur':95B 'across':68B 'averag':11A 'back':90B 'biospher':110C 'c3s':112C 'climat':59B,99B,103C 'combin':62B 'compar':41B 'complet':74B 'compon':54B 'condit':113C 'consist':26B,76B 'copernicus':111C 'data':12A,64B,85B 'dataset':23B,77B 'decad':36B,89B 'descript':96B 'ecmwf':57B 'enhanc':39B 'era5':3A,8A,18B,43B,45B,58B,107C 'era5-land':7A,17B,44B 'evolut':30B 'global':73B 'goe':87B 'hydrolog':108C 'land':4A,9A,19B,32B,46B,53B,106C 'law':80B 'mean':6A,116C 'model':63B 'month':5A,10A,115C 'observ':66B 'past':102B,105C 'physic':82B,109C 'present':16A 'produc':49B,84B 'provid':24B,93B 'reanalysi':2A,22B,60B,61B,83B,104C 'reanalysis-era5-land-monthly-means':1A 'replay':51B 'resolut':40B 'sever':35B,88B 'time':92B 'use':78B 'variabl':33B,114C 'view':27B 'world':70B"
}
]
4 changes: 4 additions & 0 deletions tests/data/dumped_resources2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
{
"resource_id": 1,
"resource_uid": "reanalysis-era5-land",
"popularity": 500,
"constraints": "an url",
"api_enforce_constraints": true,
"form": "an url for form.json",
Expand Down Expand Up @@ -55,11 +56,13 @@
"variables": [],
"fulltext": null,
"high_priority_terms": "reanalysis ERA5 land",
"fts": "'era5':2 'land':3 'reanalysi':1",
"search_field": "'1950':11A 'accur':92B 'across':65B 'back':87B 'climat':56B,96B 'combin':59B 'compar':38B 'complet':71B 'compon':51B 'consist':23B,73B 'data':9A,61B,82B 'dataset':20B,74B 'decad':33B,86B 'descript':93B 'ecmwf':54B 'enhanc':36B 'era5':3A,6A,15B,40B,42B,55B,101 'era5-land':5A,14B,41B 'evolut':27B 'global':70B 'goe':84B 'hour':8A 'land':4A,7A,16B,29B,43B,50B,102 'law':77B 'model':60B 'observ':63B 'past':99B 'physic':79B 'present':13A 'produc':46B,81B 'provid':21B,90B 'reanalysi':2A,19B,57B,58B,80B,100 'reanalysis-era5-land':1A 'replay':48B 'resolut':37B 'sever':32B,85B 'time':89B 'use':75B 'variabl':30B 'view':24B 'world':67B"
},
{
"resource_id": 2,
"resource_uid": "reanalysis-era5-land-monthly-means",
"popularity": 1,
"constraints": "an url",
"form": "a new url for form.json",
"layout": "a new url for layout.json",
Expand Down Expand Up @@ -109,6 +112,7 @@
"variables": [],
"fulltext": "climate reanalysis past land era5 hydrology physics biosphere copernicus c3s conditions variables monthly means",
"high_priority_terms": "",
"fts": "",
"search_field": "'1950':14A 'accur':95B 'across':68B 'averag':11A 'back':90B 'biospher':110C 'c3s':112C 'climat':59B,99B,103C 'combin':62B 'compar':41B 'complet':74B 'compon':54B 'condit':113C 'consist':26B,76B 'copernicus':111C 'data':12A,64B,85B 'dataset':23B,77B 'decad':36B,89B 'descript':96B 'ecmwf':57B 'enhanc':39B 'era5':3A,8A,18B,43B,45B,58B,107C 'era5-land':7A,17B,44B 'evolut':30B 'global':73B 'goe':87B 'hydrolog':108C 'land':4A,9A,19B,32B,46B,53B,106C 'law':80B 'mean':6A,116C 'model':63B 'month':5A,10A,115C 'observ':66B 'past':102B,105C 'physic':82B,109C 'present':16A 'produc':49B,84B 'provid':24B,93B 'reanalysi':2A,22B,60B,61B,83B,104C 'reanalysis-era5-land-monthly-means':1A 'replay':51B 'resolut':40B 'sever':35B,88B 'time':92B 'use':78B 'variabl':33B,114C 'view':27B 'world':70B"
}
]
2 changes: 2 additions & 0 deletions tests/data/dumped_resources3.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
{
"resource_id": 1,
"resource_uid": "reanalysis-era5-land",
"popularity": 500,
"api_enforce_constraints": true,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -36,6 +37,7 @@
"format_version": null,
"hidden": false,
"high_priority_terms": "reanalysis ERA5 land",
"fts": "'era5':2 'land':3 'reanalysi':1",
"lineage": "EC Copernicus program",
"representative_fraction": 0.25,
"responsible_organisation": "ECMWF",
Expand Down
14 changes: 14 additions & 0 deletions tests/data/dumped_resources4.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
{
"resource_id": 4,
"resource_uid": "cams-global-reanalysis-eac4",
"popularity": 1,
"api_enforce_constraints": false,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -103,6 +104,7 @@
"format_version": "1",
"hidden": false,
"high_priority_terms": "",
"fts": "",
"lineage": "Copernicus Atmospheric Monitoring Service",
"representative_fraction": null,
"responsible_organisation": "ECMWF",
Expand Down Expand Up @@ -899,6 +901,7 @@
{
"resource_id": 3,
"resource_uid": "cams-global-reanalysis-eac4-monthly",
"popularity": 1,
"api_enforce_constraints": false,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -1000,6 +1003,7 @@
"format_version": "1",
"hidden": false,
"high_priority_terms": "",
"fts": "",
"lineage": "Copernicus Atmospheric Monitoring Service",
"representative_fraction": null,
"responsible_organisation": "ECMWF",
Expand Down Expand Up @@ -1426,6 +1430,7 @@
{
"resource_id": 5,
"resource_uid": "derived-near-surface-meteorological-variables",
"popularity": 1,
"api_enforce_constraints": false,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -1460,6 +1465,7 @@
"format_version": null,
"hidden": false,
"high_priority_terms": "",
"fts": "",
"lineage": "EC Copernicus program",
"representative_fraction": 0.25,
"responsible_organisation": "ECMWF",
Expand All @@ -1480,6 +1486,7 @@
{
"resource_id": 1,
"resource_uid": "reanalysis-era5-land",
"popularity": 500,
"api_enforce_constraints": true,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -1514,6 +1521,7 @@
"format_version": null,
"hidden": false,
"high_priority_terms": "reanalysis ERA5 land",
"fts": "'era5':2 'land':3 'reanalysi':1",
"lineage": "EC Copernicus program",
"representative_fraction": 0.25,
"responsible_organisation": "ECMWF",
Expand All @@ -1538,6 +1546,7 @@
{
"resource_id": 6,
"resource_uid": "reanalysis-era5-land-monthly-means",
"popularity": 1,
"api_enforce_constraints": false,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -1572,6 +1581,7 @@
"format_version": null,
"hidden": false,
"high_priority_terms": "",
"fts": "",
"lineage": "EC Copernicus program",
"representative_fraction": 0.25,
"responsible_organisation": "ECMWF",
Expand All @@ -1592,6 +1602,7 @@
{
"resource_id": 7,
"resource_uid": "reanalysis-era5-pressure-levels",
"popularity": 1,
"api_enforce_constraints": false,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -1626,6 +1637,7 @@
"format_version": null,
"hidden": false,
"high_priority_terms": "",
"fts": "",
"lineage": "EC Copernicus program",
"representative_fraction": 0.25,
"responsible_organisation": "ECMWF",
Expand All @@ -1646,6 +1658,7 @@
{
"resource_id": 8,
"resource_uid": "satellite-surface-radiation-budget",
"popularity": 1,
"api_enforce_constraints": false,
"constraints": "an url",
"form": "an url",
Expand Down Expand Up @@ -1680,6 +1693,7 @@
"format_version": "3",
"hidden": false,
"high_priority_terms": "",
"fts": "",
"lineage": "EC Copernicus program",
"representative_fraction": 0.25,
"responsible_organisation": "ECMWF",
Expand Down
Loading