From a18a13098e3f15c2cf39d1f5167fc5537c0cb552 Mon Sep 17 00:00:00 2001 From: Mario Penterman Date: Sat, 20 Jun 2026 09:55:38 +0200 Subject: [PATCH] Emit lone surrogates as \uXXXX escapes in unicode strings A lone surrogate (U+D800..U+DFFF) in a unicode string is marshalled as the 3-byte WTF-8/CESU-8 sequence 0xED 0xA0-0xBF 0x80-0xBF, which is invalid UTF-8. Writing it raw produces an undecodable .py. Detect that sequence in the string renderer and emit the code point as a \uXXXX escape, which Python parses back to the identical surrogate. Well-formed multi-byte UTF-8 (accents, astral characters) is still passed through unchanged. Signed-off-by: Mario Penterman --- pyc_string.cpp | 18 +++++++++++++++--- tests/compiled/surrogate_escape.3.11.pyc | Bin 0 -> 194 bytes tests/input/surrogate_escape.py | 3 +++ tests/tokenized/surrogate_escape.txt | 3 +++ 4 files changed, 21 insertions(+), 3 deletions(-) create mode 100644 tests/compiled/surrogate_escape.3.11.pyc create mode 100644 tests/input/surrogate_escape.py create mode 100644 tests/tokenized/surrogate_escape.txt diff --git a/pyc_string.cpp b/pyc_string.cpp index 5dd78065a..803c321ba 100644 --- a/pyc_string.cpp +++ b/pyc_string.cpp @@ -111,7 +111,8 @@ void PycString::print(std::ostream &pyc_output, PycModule* mod, bool triple, else pyc_output << (useQuotes ? '"' : '\''); } - for (char ch : m_value) { + for (size_t i = 0; i < m_value.size(); ++i) { + char ch = m_value[i]; if (static_cast(ch) < 0x20 || ch == 0x7F) { if (ch == '\r') { pyc_output << "\\r"; @@ -127,8 +128,19 @@ void PycString::print(std::ostream &pyc_output, PycModule* mod, bool triple, } } else if (static_cast(ch) >= 0x80) { if (type() == TYPE_UNICODE) { - // Unicode stored as UTF-8... Let the stream interpret it - pyc_output << ch; + unsigned char b0 = ch; + if (b0 == 0xED && i + 2 < m_value.size() + && (unsigned char)m_value[i + 1] >= 0xA0 + && (unsigned char)m_value[i + 1] <= 0xBF + && ((unsigned char)m_value[i + 2] & 0xC0) == 0x80) { + unsigned cp = ((b0 & 0x0F) << 12) + | (((unsigned char)m_value[i + 1] & 0x3F) << 6) + | ((unsigned char)m_value[i + 2] & 0x3F); + formatted_print(pyc_output, "\\u%04x", cp); + i += 2; + } else { + pyc_output << ch; + } } else { formatted_print(pyc_output, "\\x%02x", (ch & 0xFF)); } diff --git a/tests/compiled/surrogate_escape.3.11.pyc b/tests/compiled/surrogate_escape.3.11.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93a215cb8e297b6dd86e24a8c49c86460735f046 GIT binary patch literal 194 zcmZ3^%ge<81WE2@Svo-aF^B^Lj8MjBAs}NqLkdF_LkeRQV+vCgQwno1gCmS5Wzj!zMRBr8Fniu80Gu9%Oy71d#Z^%*e=igN3I7NbrIP Nfd)npEMfyn0svxTFZBQb literal 0 HcmV?d00001 diff --git a/tests/input/surrogate_escape.py b/tests/input/surrogate_escape.py new file mode 100644 index 000000000..3a2f1d8a4 --- /dev/null +++ b/tests/input/surrogate_escape.py @@ -0,0 +1,3 @@ +s = '\ud800' +t = 'caf\xe9' +u = '\U0001f600' diff --git a/tests/tokenized/surrogate_escape.txt b/tests/tokenized/surrogate_escape.txt new file mode 100644 index 000000000..a7156c51d --- /dev/null +++ b/tests/tokenized/surrogate_escape.txt @@ -0,0 +1,3 @@ +s = '\ud800' +t = 'café' +u = '😀'