diff --git a/pyc_string.cpp b/pyc_string.cpp
index 5dd78065a..803c321ba 100644
--- a/pyc_string.cpp
+++ b/pyc_string.cpp
@@ -111,7 +111,8 @@ void PycString::print(std::ostream &pyc_output, PycModule* mod, bool triple,
         else
             pyc_output << (useQuotes ? '"' : '\'');
     }
-    for (char ch : m_value) {
+    for (size_t i = 0; i < m_value.size(); ++i) {
+        char ch = m_value[i];
         if (static_cast<unsigned char>(ch) < 0x20 || ch == 0x7F) {
             if (ch == '\r') {
                 pyc_output << "\\r";
@@ -127,8 +128,19 @@ void PycString::print(std::ostream &pyc_output, PycModule* mod, bool triple,
             }
         } else if (static_cast<unsigned char>(ch) >= 0x80) {
             if (type() == TYPE_UNICODE) {
-                // Unicode stored as UTF-8... Let the stream interpret it
-                pyc_output << ch;
+                unsigned char b0 = ch;
+                if (b0 == 0xED && i + 2 < m_value.size()
+                        && (unsigned char)m_value[i + 1] >= 0xA0
+                        && (unsigned char)m_value[i + 1] <= 0xBF
+                        && ((unsigned char)m_value[i + 2] & 0xC0) == 0x80) {
+                    unsigned cp = ((b0 & 0x0F) << 12)
+                            | (((unsigned char)m_value[i + 1] & 0x3F) << 6)
+                            | ((unsigned char)m_value[i + 2] & 0x3F);
+                    formatted_print(pyc_output, "\\u%04x", cp);
+                    i += 2;
+                } else {
+                    pyc_output << ch;
+                }
             } else {
                 formatted_print(pyc_output, "\\x%02x", (ch & 0xFF));
             }
diff --git a/tests/compiled/surrogate_escape.3.11.pyc b/tests/compiled/surrogate_escape.3.11.pyc
new file mode 100644
index 000000000..93a215cb8
Binary files /dev/null and b/tests/compiled/surrogate_escape.3.11.pyc differ
diff --git a/tests/input/surrogate_escape.py b/tests/input/surrogate_escape.py
new file mode 100644
index 000000000..3a2f1d8a4
--- /dev/null
+++ b/tests/input/surrogate_escape.py
@@ -0,0 +1,3 @@
+s = '\ud800'
+t = 'caf\xe9'
+u = '\U0001f600'
diff --git a/tests/tokenized/surrogate_escape.txt b/tests/tokenized/surrogate_escape.txt
new file mode 100644
index 000000000..a7156c51d
--- /dev/null
+++ b/tests/tokenized/surrogate_escape.txt
@@ -0,0 +1,3 @@
+s = '\ud800'
+t = 'café'
+u = '😀'