From c815475f5453a19e7b143e811bb563065a1bc966 Mon Sep 17 00:00:00 2001 From: Denis Laxalde Date: Thu, 9 Mar 2023 10:06:56 +0100 Subject: [PATCH] Fix query decode when PostgreSQL encoding name does not match Python one PostgreSQL and Python have different names for encodings. Since we retrieve PostgreSQL encoding name from 'pg_catalog.pg_encoding_to_char(pg_database.encoding)' and then use this to decode Python bytes, we need to convert the former into a Python encoding name. We thus use psycopg._encodings.pg2pyenc() to translate encoding names; this is not part of the public API of psycopg at the moment, but should be in the future. For psycopg2, we implement a fallback mechanism in pg.decode(); not all PostgreSQL encodings will then be accounted for. Typically, from https://www.postgresql.org/docs/current/multibyte.html#CHARSET-TABLE, KOI8R, KOI8U and WIN* are not handled (as far as I can tell). (SQL_ASCII is not handled either, but the fallback is enough.) The test previously introduced using SQL_ASCII now passes. Additionally, we add another test using a PostgreSQL encoding for which there is no mapping to a Python encoding name: EUC_TW (with the zh_TW locale). In CI, we thus add zh_TW to the host locales.
--- .github/workflows/tests.yml | 4 ++-- CHANGELOG.md | 6 +++++- pgactivity/data.py | 4 ++-- pgactivity/pg.py | 25 +++++++++++++++++++++++++ pgactivity/types.py | 12 ++++++------ tests/test_data.py | 12 +++++++----- 6 files changed, 47 insertions(+), 16 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7343f246..cf07c47e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -24,9 +24,9 @@ jobs: python-version: ${{ matrix.python }} - name: Install tox run: pip install tox - - name: Add fr_FR ISO-8859-1 for test purposes + - name: Add fr_FR and zh_TW for test purposes run: | - sudo locale-gen fr_FR + sudo locale-gen fr_FR zh_TW.EUC-TW sudo update-locale - name: Test run: tox -e py-${{ matrix.psycopg }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a813064..1d09644c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,13 @@ # Change log -## pg\_activity 3.1.2 - UNRELEASED +## pg\_activity 3.2.0 - UNRELEASED ### Fixed +* Handle conversion between PostgreSQL encoding names and Python ones while + decoding the `query` column from `pg_stat_activity` with + `pg_database.encoding` (#348). + * Fix typos in man pages, spotted by codespell. 
### Added diff --git a/pgactivity/data.py b/pgactivity/data.py index 27e08559..f4f56b2e 100644 --- a/pgactivity/data.py +++ b/pgactivity/data.py @@ -69,7 +69,7 @@ class Data: pg_conn: Connection pg_version: str pg_num_version: int - server_encoding: str + server_encoding: bytes min_duration: float filters: Filters dsn_parameters: Dict[str, str] @@ -111,7 +111,7 @@ def pg_connect( pg_conn, pg_version, pg.server_version(pg_conn), - server_encoding, + server_encoding.encode(), min_duration=min_duration, failed_queries=FailedQueriesInfo(), filters=filters, diff --git a/pgactivity/pg.py b/pgactivity/pg.py index 1759a064..d6c7cb69 100644 --- a/pgactivity/pg.py +++ b/pgactivity/pg.py @@ -20,10 +20,12 @@ import psycopg from psycopg import sql as sql + from psycopg._encodings import pg2pyenc from psycopg.adapt import Buffer, Loader from psycopg.conninfo import make_conninfo, conninfo_to_dict from psycopg.rows import dict_row from psycopg.errors import ( + NotSupportedError, FeatureNotSupported as FeatureNotSupported, InterfaceError as InterfaceError, InvalidPassword as InvalidPassword, @@ -170,7 +172,19 @@ def fetchall( with cursor(conn, mkrow, text_as_bytes) as cur: return cur.execute(query, args, prepare=True).fetchall() + def decode(value: bytes, pgenc: bytes, *, errors: str) -> str: + """Decode 'value' with PostgreSQL encoding 'pgenc' converted to Python encoding + name if available. + """ + try: + pyenc = pg2pyenc(pgenc) + except NotSupportedError: + pyenc = "utf-8" + return value.decode(pyenc, errors=errors) + except ImportError: + import codecs + import psycopg2 import psycopg2.extensions from psycopg2.extras import DictCursor @@ -247,6 +261,16 @@ def fetchall( # type: ignore[no-redef] return [mkrow(**row) for row in rows] return rows + def decode(value: bytes, pgenc: bytes, *, errors: str) -> str: + """Decode 'value' with PostgreSQL encoding 'pgenc' converted to Python encoding + name if available. 
+ """ + try: + pyenc = codecs.lookup(pgenc.decode()).name + except LookupError: + pyenc = "utf-8" + return value.decode(pyenc, errors=errors) + __all__ = [ "__version__", @@ -260,6 +284,7 @@ def fetchall( # type: ignore[no-redef] "QueryCanceled", "connect", "connection_parameters", + "decode", "execute", "fetchall", "fetchone", diff --git a/pgactivity/types.py b/pgactivity/types.py index 4f9fc332..d00b32bf 100644 --- a/pgactivity/types.py +++ b/pgactivity/types.py @@ -23,7 +23,7 @@ import psutil from attr import validators -from . import compat, colors, utils +from . import compat, colors, utils, pg class Pct(float): @@ -917,7 +917,7 @@ class BaseProcess: @classmethod def from_bytes( cls: Type[_P], - server_encoding: str, + server_encoding: bytes, *, encoding: Optional[Union[str, bytes]], **kwargs: Any, @@ -925,12 +925,12 @@ def from_bytes( if encoding is None: enc = server_encoding elif isinstance(encoding, bytes): # psycopg2 - enc = encoding = encoding.decode() - else: - enc = encoding + enc, encoding = encoding, encoding.decode() + elif isinstance(encoding, str): + enc = encoding.encode() for name, value in kwargs.items(): if isinstance(value, bytes): - kwargs[name] = value.decode(enc, errors="replace") + kwargs[name] = pg.decode(value, enc, errors="replace") return cls(encoding=encoding, **kwargs) diff --git a/tests/test_data.py b/tests/test_data.py index 3fdb4f9c..238cd272 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,4 +1,5 @@ import time +from typing import Optional import attr import pytest @@ -154,17 +155,18 @@ def test_client_encoding(postgresql, encoding: str) -> None: @pytest.mark.parametrize( - "pyenc, pgenc", + "pyenc, pgenc, locale", [ - ("utf8", "UTF8"), - pytest.param("ascii", "SQL_ASCII", marks=pytest.mark.xfail), + ("utf8", "UTF8", None), + ("ascii", "SQL_ASCII", None), + ("unknown", "EUC_TW", "zh_TW.euctw"), ], ) def test_postgres_and_python_encoding( - database_factory, pyenc: str, pgenc: str, data, postgresql + 
database_factory, pyenc: str, pgenc: str, locale: Optional[str], data, postgresql ) -> None: dbname = pyenc - database_factory(dbname, encoding=pgenc) + database_factory(dbname, encoding=pgenc, locale=locale) with psycopg.connect( postgresql.info.dsn, dbname=dbname, client_encoding="utf-8" ) as conn: