def _ensure_utf8_stdout(stream: object) -> None:
    """Reconfigure a non-UTF-8 text stream so that it emits UTF-8 bytes.

    Switching from a locale-dependent encoding to UTF-8 keeps ordinary
    Unicode output (emoji, typographic apostrophes) from raising
    ``UnicodeEncodeError`` on:

    * Windows ``cmd.exe`` defaulting to ``cp1252`` (the historical case),
    * Linux/macOS terminals with a non-UTF-8 ``LANG`` such as
      ``de_CH.ISO8859-1`` (#956).

    ``errors="replace"`` is a defensive fallback for genuinely un-encodable
    input, such as lone surrogates produced by buggy callers. It does not
    make terminals render bytes; it only guarantees that the encoder can
    always produce UTF-8 output.

    Args:
        stream: Candidate stream, normally ``sys.stdout``. Anything that is
            not an ``io.TextIOWrapper`` is left untouched, as is a wrapper
            whose encoding already resolves to the UTF-8 codec.
    """
    # Imported locally so this helper does not depend on the module's
    # top-level import block.
    import codecs

    if not isinstance(stream, io.TextIOWrapper):
        return

    # Resolve the name through the codec registry rather than comparing
    # strings, so every registered spelling of UTF-8 ("UTF-8", "utf8",
    # "utf_8", "U8", ...) is recognised as already correct.
    try:
        already_utf8 = codecs.lookup(stream.encoding or "").name == "utf-8"
    except LookupError:
        # Missing or unknown encoding name: treat it as non-UTF-8 and
        # attempt the reconfiguration below.
        already_utf8 = False
    if already_utf8:
        return

    try:
        stream.reconfigure(encoding="utf-8", errors="replace")
    except (AttributeError, ValueError):  # pragma: no cover - safety net
        # Deliberately best-effort: a stream that cannot be reconfigured
        # keeps its current settings instead of breaking module import.
        pass


_ensure_utf8_stdout(sys.stdout)
"""Tests for ``commitizen.out``.

The focus is the stdout-encoding helper added for #956. It has to switch
non-UTF-8 streams over to UTF-8 so that commitizen output (emoji,
typographic quotes) no longer fails with ``UnicodeEncodeError`` on terminals
whose encoding follows the locale, for example ``cp1252`` (Windows) or
``ISO8859-1`` (Linux/macOS). The helper additionally passes
``errors="replace"`` as a fallback for input that cannot be encoded at all,
such as lone surrogates.
"""

from __future__ import annotations

import io
from typing import Any

from commitizen.out import _ensure_utf8_stdout


class _StubStream(io.TextIOWrapper):
    """``TextIOWrapper`` over an in-memory buffer that logs ``reconfigure``.

    Being a real ``TextIOWrapper`` subclass satisfies the ``isinstance``
    check inside ``_ensure_utf8_stdout`` without touching ``sys.stdout``.
    """

    # Keyword arguments of every ``reconfigure`` call, in call order.
    reconfigure_calls: list[dict[str, Any]]
    # Underlying byte buffer, kept so tests can inspect the encoded output.
    output: io.BytesIO

    def __init__(self, encoding: str) -> None:
        sink = io.BytesIO()
        super().__init__(sink, encoding=encoding)
        self.output = sink
        self.reconfigure_calls = []

    def reconfigure(self, **kwargs: Any) -> None:
        # Record first so the call is logged even if the real one raises.
        self.reconfigure_calls.append(kwargs)
        super().reconfigure(**kwargs)


def _expected_reconfigure_kwargs() -> dict[str, Any]:
    """Return the keyword arguments the helper is expected to pass."""
    return {"encoding": "utf-8", "errors": "replace"}


def test_ensure_utf8_stdout_noop_when_already_utf8():
    stub = _StubStream(encoding="utf-8")
    _ensure_utf8_stdout(stub)
    assert stub.reconfigure_calls == []


def test_ensure_utf8_stdout_noop_for_dashless_utf8_alias():
    stub = _StubStream(encoding="UTF8")
    _ensure_utf8_stdout(stub)
    assert stub.reconfigure_calls == []


def test_ensure_utf8_stdout_reconfigures_iso8859_1_terminal():
    """Regression guard for #956 (``LANG=de_CH.ISO8859-1`` on Linux/macOS)."""
    stub = _StubStream(encoding="latin-1")
    _ensure_utf8_stdout(stub)
    assert stub.reconfigure_calls == [_expected_reconfigure_kwargs()]


def test_ensure_utf8_stdout_reconfigures_windows_cp1252():
    """Regression guard for the historical Windows ``cmd.exe`` situation."""
    stub = _StubStream(encoding="cp1252")
    _ensure_utf8_stdout(stub)
    assert stub.reconfigure_calls == [_expected_reconfigure_kwargs()]


def test_ensure_utf8_stdout_skips_non_textio_streams():
    class NotATextIO:
        encoding = "latin-1"
        reconfigure_calls: list[dict[str, Any]] = []

        def reconfigure(self, **kwargs: Any) -> None:  # pragma: no cover - unused
            self.reconfigure_calls.append(kwargs)

    impostor = NotATextIO()
    _ensure_utf8_stdout(impostor)
    assert impostor.reconfigure_calls == []


def test_ensure_utf8_stdout_after_reconfigure_can_emit_emoji():
    """End-to-end check: an emoji can be written once the stream is fixed."""
    stub = _StubStream(encoding="latin-1")
    _ensure_utf8_stdout(stub)

    # Main regression guard: with UTF-8 in place, ordinary Unicode output
    # such as an emoji must be written without UnicodeEncodeError.
    stub.write("Configuration complete \U0001f680")
    stub.flush()


def test_ensure_utf8_stdout_replaces_lone_surrogate_on_write():
    """``errors="replace"`` covers input that has no valid encoding."""
    stub = _StubStream(encoding="latin-1")
    _ensure_utf8_stdout(stub)

    stub.write("ok\udc00ok")
    stub.flush()

    written = stub.output.getvalue()
    assert written == b"ok?ok"