From b6d21f27d3ed543b164bcdd96ae2ffa050c922b8 Mon Sep 17 00:00:00 2001 From: Daniel Grunwald Date: Sun, 20 Dec 2020 18:53:43 +0100 Subject: [PATCH 1/6] python3-sys: Add PyUnicode_READY, PyUnicode_KIND and PyUnicode_DATA functions for access to raw data of unicode objects. --- python3-sys/src/unicodeobject.rs | 113 ++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 1 deletion(-) diff --git a/python3-sys/src/unicodeobject.rs b/python3-sys/src/unicodeobject.rs index fb604815..3d1fddc5 100644 --- a/python3-sys/src/unicodeobject.rs +++ b/python3-sys/src/unicodeobject.rs @@ -2,6 +2,8 @@ use libc::{c_char, c_int, c_void, wchar_t}; use crate::object::*; use crate::pyport::Py_ssize_t; +#[cfg(not(Py_LIMITED_API))] +use crate::pyport::Py_hash_t; #[cfg(not(Py_LIMITED_API))] #[deprecated(since = "0.2.1", note = "Deprecated since Python 3.3 / PEP 393")] @@ -123,7 +125,7 @@ extern "C" { pub fn PyUnicode_FromOrdinal(ordinal: c_int) -> *mut PyObject; #[cfg(not(Py_3_9))] pub fn PyUnicode_ClearFreeList() -> c_int; - #[cfg(not(Py_LIMITED_API))] + #[cfg(any(not(Py_LIMITED_API), Py_3_10))] pub fn PyUnicode_AsUTF8AndSize(unicode: *mut PyObject, size: *mut Py_ssize_t) -> *const c_char; #[cfg(not(Py_LIMITED_API))] pub fn PyUnicode_AsUTF8(unicode: *mut PyObject) -> *const c_char; @@ -429,4 +431,113 @@ extern "C" { pub fn PyUnicode_IsIdentifier(s: *mut PyObject) -> c_int; #[cfg(not(Py_LIMITED_API))] pub fn PyUnicode_AsUnicodeCopy(unicode: *mut PyObject) -> *mut Py_UNICODE; + + #[cfg(not(Py_LIMITED_API))] + fn _PyUnicode_Ready(o: *mut PyObject) -> c_int; +} + +#[repr(C)] +#[cfg(not(Py_LIMITED_API))] +pub struct PyASCIIObject { + pub ob_base: PyObject, + pub length: Py_ssize_t, + pub hash: Py_hash_t, + pub state: u32, + pub wstr: *mut c_void +} + +#[repr(C)] +#[cfg(not(Py_LIMITED_API))] +pub struct PyCompactUnicodeObject { + _base: PyASCIIObject, + utf8_length: Py_ssize_t, + utf8: *mut u8, + wstr_length: Py_ssize_t +} + +#[repr(C)] +#[cfg(not(Py_LIMITED_API))] +pub struct 
PyUnicodeObject { + _base: PyASCIIObject, + data: *mut c_void +} + +#[cfg(not(Py_LIMITED_API))] +#[inline] +unsafe fn PyUnicode_IS_ASCII(o: *mut PyObject) -> bool { + let ascii_bit = 1 << 6; + let state = (*(o as *mut PyASCIIObject)).state; + (state & ascii_bit) != 0 +} + +#[cfg(not(Py_LIMITED_API))] +#[inline] +unsafe fn PyUnicode_IS_COMPACT(o: *mut PyObject) -> bool { + let compact_bit = 1 << 5; + let state = (*(o as *mut PyASCIIObject)).state; + (state & compact_bit) != 0 +} + +#[cfg(not(Py_LIMITED_API))] +pub const PyUnicode_WCHAR_KIND: u32 = 0; +#[cfg(not(Py_LIMITED_API))] +pub const PyUnicode_1BYTE_KIND: u32 = 1; +#[cfg(not(Py_LIMITED_API))] +pub const PyUnicode_2BYTE_KIND: u32 = 2; +#[cfg(not(Py_LIMITED_API))] +pub const PyUnicode_4BYTE_KIND: u32 = 4; + +#[cfg(not(Py_LIMITED_API))] +#[inline] +pub unsafe fn PyUnicode_KIND(o: *mut PyObject) -> u32 { + debug_assert!(PyUnicode_Check(o) > 0); + debug_assert!(PyUnicode_IS_READY(o)); + let state = (*(o as *mut PyASCIIObject)).state; + (state >> 2) & 7 +} + +#[cfg(not(Py_LIMITED_API))] +pub unsafe fn PyUnicode_DATA(o: *mut PyObject) -> *mut c_void { + debug_assert!(PyUnicode_Check(o) > 0); + debug_assert!(PyUnicode_IS_READY(o)); + if PyUnicode_IS_COMPACT(o) { + // fn _PyUnicode_COMPACT_DATA + if PyUnicode_IS_ASCII(o) { + (o as *mut PyASCIIObject).offset(1) as *mut c_void + } else { + (o as *mut PyCompactUnicodeObject).offset(1) as *mut c_void + } + } else { + // fn _PyUnicode_NONCOMPACT_DATA + let data = (*(o as *mut PyUnicodeObject)).data; + debug_assert!(!data.is_null()); + data + } +} + +#[cfg(not(Py_LIMITED_API))] +#[inline] +pub unsafe fn PyUnicode_GET_LENGTH(o: *mut PyObject) -> Py_ssize_t { + debug_assert!(PyUnicode_Check(o) > 0); + debug_assert!(PyUnicode_IS_READY(o)); + (*(o as *mut PyASCIIObject)).length +} + +#[cfg(not(Py_LIMITED_API))] +#[inline] +unsafe fn PyUnicode_IS_READY(o: *mut PyObject) -> bool { + let ready_bit = 1 << 7; + let state = (*(o as *mut PyASCIIObject)).state; + (state & ready_bit) != 
0 +} + +#[cfg(not(Py_LIMITED_API))] +#[inline] +pub unsafe fn PyUnicode_READY(o: *mut PyObject) -> c_int { + debug_assert!(PyUnicode_Check(o) > 0); + if PyUnicode_IS_READY(o) { + 0 + } else { + _PyUnicode_Ready(o) + } } From 444070223abf8ce5c50d67b1a0bbed041f952503 Mon Sep 17 00:00:00 2001 From: Daniel Grunwald Date: Sun, 20 Dec 2020 19:55:07 +0100 Subject: [PATCH 2/6] `PyString::data()`: return the internal representation of the Python unicode object This fixes #246: panic in `PyString::to_string` and `PyString::to_string_lossy` when a Python3 unicode string contains unpaired surrogates. --- src/objects/string.rs | 63 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 9 deletions(-) diff --git a/src/objects/string.rs b/src/objects/string.rs index 05711b1f..c2c563d7 100644 --- a/src/objects/string.rs +++ b/src/objects/string.rs @@ -283,17 +283,24 @@ impl PyString { } #[cfg(feature = "python3-sys")] - fn data_impl(&self, py: Python) -> PyStringData { - // TODO: return the original representation instead - // of forcing the UTF-8 representation to be created. 
- let mut size: ffi::Py_ssize_t = 0; + fn data_impl(&self, _py: Python) -> PyStringData { + let ptr = self.as_ptr(); unsafe { - let data = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size) as *const u8; - if data.is_null() { - PyErr::fetch(py).print(py); - panic!("PyUnicode_AsUTF8AndSize failed"); + let ready = ffi::PyUnicode_READY(ptr); + if ready < 0 { + // should fail only on OOM + ffi::PyErr_Print(); + panic!("PyUnicode_READY failed"); + } + let size = ffi::PyUnicode_GET_LENGTH(ptr) as usize; + let data = ffi::PyUnicode_DATA(ptr); + let kind = ffi::PyUnicode_KIND(ptr); + match kind { + ffi::PyUnicode_1BYTE_KIND => PyStringData::Latin1(std::slice::from_raw_parts(data as *const u8, size)), + ffi::PyUnicode_2BYTE_KIND => PyStringData::Utf16(std::slice::from_raw_parts(data as *const u16, size)), + ffi::PyUnicode_4BYTE_KIND => PyStringData::Utf32(std::slice::from_raw_parts(data as *const u32, size)), + _ => panic!("Unknown PyUnicode_KIND") } - PyStringData::Utf8(std::slice::from_raw_parts(data, size as usize)) } } @@ -535,6 +542,7 @@ impl RefFromPyObject for [u8] { mod test { use crate::conversion::{RefFromPyObject, ToPyObject}; use crate::python::{Python, PythonObject}; + use super::{PyString, PyStringData}; #[test] fn test_non_bmp() { @@ -583,4 +591,41 @@ mod test { let v = py_bytes.extract::<Vec<u8>>(py).unwrap(); assert_eq!(b"Hello", &v[..]); } + + #[test] + fn test_extract_umlaut() { + let gil = Python::acquire_gil(); + let py = gil.python(); + let py_string = py.eval("u'x=\\u00e4'", None, None).unwrap(); + let data = py_string.cast_as::<PyString>(py).unwrap().data(py); + if let PyStringData::Latin1(s) = data { + assert_eq!([b'x', b'=', 0xe4], *s); + } else { + panic!("Expected PyStringData::Latin1"); + } + assert_eq!("x=ä", py_string.extract::<String>(py).unwrap()); + } + + #[test] + fn test_extract_lone_surrogate() { + let gil = Python::acquire_gil(); + let py = gil.python(); + let py_string = py.eval("u'x=\\ud800'", None, None).unwrap(); + let data = 
py_string.cast_as::<PyString>(py).unwrap().data(py); + if let PyStringData::Utf16(s) = data { + assert_eq!(['x' as u16, '=' as u16, 0xd800], *s); + } else { + panic!("Expected PyStringData::Utf16"); + } + assert!(py_string.extract::<String>(py).is_err()); + } + + #[test] + fn test_extract_lone_surrogate_lossy() { + let gil = Python::acquire_gil(); + let py = gil.python(); + let py_string = py.eval("u'x=\\ud800'", None, None).unwrap(); + let result = py_string.cast_as::<PyString>(py).unwrap().to_string_lossy(py); + assert_eq!("x=\u{fffd}", result); + } } From 0e26c7be88db3a058474dcac940668d3dc4c90a2 Mon Sep 17 00:00:00 2001 From: Daniel Grunwald Date: Mon, 21 Dec 2020 15:27:21 +0100 Subject: [PATCH 3/6] Don't expect a specific PyStringData representation on Python2. --- src/objects/string.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/objects/string.rs b/src/objects/string.rs index c2c563d7..0df065d3 100644 --- a/src/objects/string.rs +++ b/src/objects/string.rs @@ -159,7 +159,7 @@ impl<'a> PyStringData<'a> { )), }, PyStringData::Latin1(data) => { - if data.iter().all(|&b| b.is_ascii()) { + if data.is_ascii() { Ok(Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) })) } else { Ok(Cow::Owned(data.iter().map(|&b| b as char).collect())) @@ -214,7 +214,7 @@ impl<'a> PyStringData<'a> { match self { PyStringData::Utf8(data) => String::from_utf8_lossy(data), PyStringData::Latin1(data) => { - if data.iter().all(|&b| b.is_ascii()) { + if data.is_ascii() { Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) }) } else { Cow::Owned(data.iter().map(|&b| b as char).collect()) @@ -598,6 +598,7 @@ mod test { let py = gil.python(); let py_string = py.eval("u'x=\\u00e4'", None, None).unwrap(); let data = py_string.cast_as::<PyString>(py).unwrap().data(py); + #[cfg(feature = "python3-sys")] if let PyStringData::Latin1(s) = data { assert_eq!([b'x', b'=', 0xe4], *s); } else { @@ -612,6 +613,7 @@ mod test { let py = gil.python(); let py_string = py.eval("u'x=\\ud800'", None, 
None).unwrap(); let data = py_string.cast_as::<PyString>(py).unwrap().data(py); + #[cfg(feature = "python3-sys")] if let PyStringData::Utf16(s) = data { assert_eq!(['x' as u16, '=' as u16, 0xd800], *s); } else { From 6f57d5a4edd2ce148e889a3c0102775e4e977b7c Mon Sep 17 00:00:00 2001 From: Daniel Grunwald Date: Mon, 21 Dec 2020 16:00:55 +0100 Subject: [PATCH 4/6] `PyString::to_string`: directly use `PyUnicode_AsUTF8AndSize`. --- src/objects/string.rs | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/objects/string.rs b/src/objects/string.rs index 0df065d3..5d9a13c3 100644 --- a/src/objects/string.rs +++ b/src/objects/string.rs @@ -313,7 +313,26 @@ impl PyString { /// (containing unpaired surrogates, or a Python 2.7 byte string that is /// not valid UTF-8). pub fn to_string(&self, py: Python) -> PyResult<Cow<str>> { - self.data(py).to_string(py) + #[cfg(feature = "python3-sys")] + unsafe { + // On Python 3, we can use the UTF-8 representation stored + // inside the Python string. + // This should produce identical results to + // `self.data(py).to_string(py)` but avoids + // re-encoding the string on every to_string call. + let mut size: ffi::Py_ssize_t = 0; + let data = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size); + if data.is_null() { + return Err(PyErr::fetch(py)); + } else { + let slice = std::slice::from_raw_parts(data as *const u8, size as usize); + return Ok(Cow::Borrowed(std::str::from_utf8_unchecked(slice))); + } + } + #[cfg(feature = "python27-sys")] + { + return self.data(py).to_string(py); + } } /// Convert the `PyString` into a Rust string. From 9638c8956b735bc17840fde381332a59c1c186b8 Mon Sep 17 00:00:00 2001 From: Daniel Grunwald Date: Mon, 21 Dec 2020 19:25:04 +0100 Subject: [PATCH 5/6] Fix build with Rust 1.32.0. 
--- src/objects/string.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/objects/string.rs b/src/objects/string.rs index 5d9a13c3..9a5480a0 100644 --- a/src/objects/string.rs +++ b/src/objects/string.rs @@ -618,10 +618,12 @@ mod test { let py_string = py.eval("u'x=\\u00e4'", None, None).unwrap(); let data = py_string.cast_as::<PyString>(py).unwrap().data(py); #[cfg(feature = "python3-sys")] - if let PyStringData::Latin1(s) = data { - assert_eq!([b'x', b'=', 0xe4], *s); - } else { - panic!("Expected PyStringData::Latin1"); + { + if let PyStringData::Latin1(s) = data { + assert_eq!([b'x', b'=', 0xe4], *s); + } else { + panic!("Expected PyStringData::Latin1"); + } } assert_eq!("x=ä", py_string.extract::<String>(py).unwrap()); } @@ -633,10 +635,12 @@ mod test { let py_string = py.eval("u'x=\\ud800'", None, None).unwrap(); let data = py_string.cast_as::<PyString>(py).unwrap().data(py); #[cfg(feature = "python3-sys")] - if let PyStringData::Utf16(s) = data { - assert_eq!(['x' as u16, '=' as u16, 0xd800], *s); - } else { - panic!("Expected PyStringData::Utf16"); + { + if let PyStringData::Utf16(s) = data { + assert_eq!(['x' as u16, '=' as u16, 0xd800], *s); + } else { + panic!("Expected PyStringData::Utf16"); + } } assert!(py_string.extract::<String>(py).is_err()); } From e3976d19863a262703a919caac7ac752d9abef3f Mon Sep 17 00:00:00 2001 From: Daniel Grunwald Date: Wed, 17 Feb 2021 22:15:31 +0100 Subject: [PATCH 6/6] Suppress compiling warning when building for Python 2.7. 
--- src/objects/string.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/objects/string.rs b/src/objects/string.rs index 9a5480a0..11e51aed 100644 --- a/src/objects/string.rs +++ b/src/objects/string.rs @@ -611,6 +611,7 @@ mod test { assert_eq!(b"Hello", &v[..]); } + #[allow(unused_variables)] // when compiling for py2.7 #[test] fn test_extract_umlaut() { let gil = Python::acquire_gil(); @@ -628,6 +629,7 @@ mod test { assert_eq!("x=ä", py_string.extract::<String>(py).unwrap()); } + #[allow(unused_variables)] // when compiling for py2.7 #[test] fn test_extract_lone_surrogate() { let gil = Python::acquire_gil();