Skip to content

Commit

Permalink
Fix query decode when PostgreSQL encoding name does not match Python one
Browse files Browse the repository at this point in the history
PostgreSQL and Python have different names for encodings. Since we
retrieve PostgreSQL encoding name from
'pg_catalog.pg_encoding_to_char(pg_database.encoding)' and then use this
to decode Python bytes, we need to convert the former into a Python
encoding name.

We thus use psycopg._encodings.pg2pyenc() to translate encoding names;
this is not part of the public API of psycopg at the moment, but should
be in the future.

For psycopg2, we implement a fallback mechanism in pg.decode(); not
all PostgreSQL encodings will then be accounted for. Typically, from
https://www.postgresql.org/docs/current/multibyte.html#CHARSET-TABLE,
KOI8R, KOI8U and WIN* are not handled (as far as I can tell). (SQL_ASCII
is not handled either, but the fallback is enough.)

The previously introduced test using SQL_ASCII now passes. Additionally, we
add another test for an encoding that has no mapping between PostgreSQL and
Python: EUC_TW (database created with the zh_TW locale). In CI, we thus add
zh_TW to the host locales.
  • Loading branch information
dlax committed Mar 15, 2023
1 parent e365adf commit c815475
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 16 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ jobs:
python-version: ${{ matrix.python }}
- name: Install tox
run: pip install tox
- name: Add fr_FR ISO-8859-1 for test purposes
- name: Add fr_FR and zh_TW for test purposes
run: |
sudo locale-gen fr_FR
sudo locale-gen fr_FR zh_TW.EUC-TW
sudo update-locale
- name: Test
run: tox -e py-${{ matrix.psycopg }}
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
# Change log

## pg\_activity 3.1.2 - UNRELEASED
## pg\_activity 3.2.0 - UNRELEASED

### Fixed

* Handle conversion between PostgreSQL encoding names and Python ones while
decoding the `query` column from `pg_stat_activity` with
`pg_database.encoding` (#348).

* Fix typos in man pages, spotted by codespell.

### Added
Expand Down
4 changes: 2 additions & 2 deletions pgactivity/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class Data:
pg_conn: Connection
pg_version: str
pg_num_version: int
server_encoding: str
server_encoding: bytes
min_duration: float
filters: Filters
dsn_parameters: Dict[str, str]
Expand Down Expand Up @@ -111,7 +111,7 @@ def pg_connect(
pg_conn,
pg_version,
pg.server_version(pg_conn),
server_encoding,
server_encoding.encode(),
min_duration=min_duration,
failed_queries=FailedQueriesInfo(),
filters=filters,
Expand Down
25 changes: 25 additions & 0 deletions pgactivity/pg.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@

import psycopg
from psycopg import sql as sql
from psycopg._encodings import pg2pyenc
from psycopg.adapt import Buffer, Loader
from psycopg.conninfo import make_conninfo, conninfo_to_dict
from psycopg.rows import dict_row
from psycopg.errors import (
NotSupportedError,
FeatureNotSupported as FeatureNotSupported,
InterfaceError as InterfaceError,
InvalidPassword as InvalidPassword,
Expand Down Expand Up @@ -170,7 +172,19 @@ def fetchall(
with cursor(conn, mkrow, text_as_bytes) as cur:
return cur.execute(query, args, prepare=True).fetchall()

def decode(value: bytes, pgenc: bytes, *, errors: str) -> str:
    """Return 'value' decoded to text according to PostgreSQL encoding 'pgenc'.

    The PostgreSQL encoding name is first translated to a Python codec name
    with psycopg's pg2pyenc(). When that translation raises
    NotSupportedError, 'value' is decoded as UTF-8 instead.

    'errors' is forwarded unchanged to bytes.decode().
    """
    try:
        codec_name = pg2pyenc(pgenc)
    except NotSupportedError:
        # No usable Python name for this PostgreSQL encoding: use UTF-8.
        return value.decode("utf-8", errors=errors)
    return value.decode(codec_name, errors=errors)

except ImportError:
import codecs

import psycopg2
import psycopg2.extensions
from psycopg2.extras import DictCursor
Expand Down Expand Up @@ -247,6 +261,16 @@ def fetchall( # type: ignore[no-redef]
return [mkrow(**row) for row in rows]
return rows

def decode(value: bytes, pgenc: bytes, *, errors: str) -> str:
    """Decode 'value' with PostgreSQL encoding 'pgenc' converted to Python encoding
    name if available.

    The PostgreSQL name is looked up in Python's codec registry; when no usable
    codec name can be obtained from it, UTF-8 is used instead.

    :param value: raw bytes to decode.
    :param pgenc: PostgreSQL encoding name, as bytes.
    :param errors: error handler forwarded to bytes.decode() (e.g. "replace").
    :return: the decoded text.
    """
    try:
        pyenc = codecs.lookup(pgenc.decode()).name
    except (LookupError, ValueError):
        # LookupError: the name is unknown to Python's codec registry.
        # ValueError (including UnicodeDecodeError): the name itself is not
        # valid text. Either way there is no usable codec, so fall back.
        pyenc = "utf-8"
    return value.decode(pyenc, errors=errors)


__all__ = [
"__version__",
Expand All @@ -260,6 +284,7 @@ def fetchall( # type: ignore[no-redef]
"QueryCanceled",
"connect",
"connection_parameters",
"decode",
"execute",
"fetchall",
"fetchone",
Expand Down
12 changes: 6 additions & 6 deletions pgactivity/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import psutil
from attr import validators

from . import compat, colors, utils
from . import compat, colors, utils, pg


class Pct(float):
Expand Down Expand Up @@ -917,20 +917,20 @@ class BaseProcess:
@classmethod
def from_bytes(
cls: Type[_P],
server_encoding: str,
server_encoding: bytes,
*,
encoding: Optional[Union[str, bytes]],
**kwargs: Any,
) -> _P:
if encoding is None:
enc = server_encoding
elif isinstance(encoding, bytes): # psycopg2
enc = encoding = encoding.decode()
else:
enc = encoding
enc, encoding = encoding, encoding.decode()
elif isinstance(encoding, str):
enc = encoding.encode()
for name, value in kwargs.items():
if isinstance(value, bytes):
kwargs[name] = value.decode(enc, errors="replace")
kwargs[name] = pg.decode(value, enc, errors="replace")
return cls(encoding=encoding, **kwargs)


Expand Down
12 changes: 7 additions & 5 deletions tests/test_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import time
from typing import Optional

import attr
import pytest
Expand Down Expand Up @@ -154,17 +155,18 @@ def test_client_encoding(postgresql, encoding: str) -> None:


@pytest.mark.parametrize(
"pyenc, pgenc",
"pyenc, pgenc, locale",
[
("utf8", "UTF8"),
pytest.param("ascii", "SQL_ASCII", marks=pytest.mark.xfail),
("utf8", "UTF8", None),
("ascii", "SQL_ASCII", None),
("unknown", "EUC_TW", "zh_TW.euctw"),
],
)
def test_postgres_and_python_encoding(
database_factory, pyenc: str, pgenc: str, data, postgresql
database_factory, pyenc: str, pgenc: str, locale: Optional[str], data, postgresql
) -> None:
dbname = pyenc
database_factory(dbname, encoding=pgenc)
database_factory(dbname, encoding=pgenc, locale=locale)
with psycopg.connect(
postgresql.info.dsn, dbname=dbname, client_encoding="utf-8"
) as conn:
Expand Down

0 comments on commit c815475

Please sign in to comment.