From db879703b7edaf5d1200d8bc3784fa1460edc7a3 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Mon, 26 May 2025 12:25:07 -0600 Subject: [PATCH 1/6] BUG: add bounds-checking to in-place string multiply --- numpy/_core/src/umath/string_buffer.h | 12 ++++++++++++ numpy/_core/src/umath/string_ufuncs.cpp | 13 +++++++++++-- numpy/_core/tests/test_strings.py | 8 ++++++++ numpy/typing/tests/data/pass/ma.py | 3 ++- 4 files changed, 33 insertions(+), 3 deletions(-) diff --git a/numpy/_core/src/umath/string_buffer.h b/numpy/_core/src/umath/string_buffer.h index 554f9ece5197..dafedcbc03ff 100644 --- a/numpy/_core/src/umath/string_buffer.h +++ b/numpy/_core/src/umath/string_buffer.h @@ -297,6 +297,18 @@ struct Buffer { return num_codepoints; } + inline size_t + buffer_width() + { + switch (enc) { + case ENCODING::ASCII: + case ENCODING::UTF8: + return after - buf; + case ENCODING::UTF32: + return (after - buf) / sizeof(npy_ucs4); + } + } + inline Buffer& operator+=(npy_int64 rhs) { diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp index ffd757815364..7f7857c49c34 100644 --- a/numpy/_core/src/umath/string_ufuncs.cpp +++ b/numpy/_core/src/umath/string_ufuncs.cpp @@ -175,6 +175,12 @@ string_multiply(Buffer buf1, npy_int64 reps, Buffer out) return; } + size_t width = out.buffer_width(); + size_t pad = 0; + if (width < len1*reps) { + reps = width / len1; + pad = width % len1; + } if (len1 == 1) { out.buffer_memset(*buf1, reps); out.buffer_fill_with_zeros_after_index(reps); @@ -184,6 +190,8 @@ string_multiply(Buffer buf1, npy_int64 reps, Buffer out) buf1.buffer_memcpy(out, len1); out += len1; } + buf1.buffer_memcpy(out, pad); + out += pad; out.buffer_fill_with_zeros_after_index(0); } } @@ -752,10 +760,11 @@ string_multiply_resolve_descriptors( if (given_descrs[2] == NULL) { PyErr_SetString( PyExc_TypeError, - "The 'out' kwarg is necessary. Use numpy.strings.multiply without it."); + "The 'out' kwarg is necessary when using the string multiply ufunc " + "directly. Use numpy.strings.multiply to multiply strings without " + "specifying 'out'."); return _NPY_ERROR_OCCURRED_IN_CAST; } - loop_descrs[0] = NPY_DT_CALL_ensure_canonical(given_descrs[0]); if (loop_descrs[0] == NULL) { return _NPY_ERROR_OCCURRED_IN_CAST; diff --git a/numpy/_core/tests/test_strings.py b/numpy/_core/tests/test_strings.py index f6de208d7951..f87db9dcdd85 100644 --- a/numpy/_core/tests/test_strings.py +++ b/numpy/_core/tests/test_strings.py @@ -227,6 +227,14 @@ def test_multiply_raises(self, dt): with pytest.raises(MemoryError): np.strings.multiply(np.array("abc", dtype=dt), sys.maxsize) + def test_inplace_multiply(self, dt): + arr = np.array(['foo ', 'bar'], dtype=dt) + arr *= 2 + if dt != "T": + assert_array_equal(arr, np.array(['foo ', 'barb'], dtype=dt)) + else: + assert_array_equal(arr, ['foo foo ', 'barbar']) + @pytest.mark.parametrize("i_dt", [np.int8, np.int16, np.int32, np.int64, np.int_]) def test_multiply_integer_dtypes(self, i_dt, dt): diff --git a/numpy/typing/tests/data/pass/ma.py b/numpy/typing/tests/data/pass/ma.py index dc9474fe4069..e7915a583210 100644 --- a/numpy/typing/tests/data/pass/ma.py +++ b/numpy/typing/tests/data/pass/ma.py @@ -16,7 +16,8 @@ MAR_M_dt64: MaskedArray[np.datetime64] = np.ma.MaskedArray([np.datetime64(1, "D")]) MAR_S: MaskedArray[np.bytes_] = np.ma.MaskedArray([b'foo'], dtype=np.bytes_) MAR_U: MaskedArray[np.str_] = np.ma.MaskedArray(['foo'], dtype=np.str_) -MAR_T = cast(np.ma.MaskedArray[Any, np.dtypes.StringDType], np.ma.MaskedArray(["a"], "T")) +MAR_T = cast(np.ma.MaskedArray[Any, np.dtypes.StringDType], + np.ma.MaskedArray(["a"], dtype="T")) AR_b: npt.NDArray[np.bool] = np.array([True, False, True]) From 032c8cb3b68046f8b7654b01b3ae150d1676d184 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 27 May 2025 10:05:01 -0600 Subject: [PATCH 2/6] MNT: check for overflow and raise OverflowError --- doc/release/upcoming_changes/29060.change.rst | 3 ++ numpy/_core/src/umath/string_ufuncs.cpp | 49 ++++++++++++------- numpy/_core/src/umath/stringdtype_ufuncs.cpp | 6 +-- numpy/_core/strings.py | 2 +- numpy/_core/tests/test_strings.py | 5 +- 5 files changed, 43 insertions(+), 22 deletions(-) create mode 100644 doc/release/upcoming_changes/29060.change.rst diff --git a/doc/release/upcoming_changes/29060.change.rst b/doc/release/upcoming_changes/29060.change.rst new file mode 100644 index 000000000000..1561da7bf94e --- /dev/null +++ b/doc/release/upcoming_changes/29060.change.rst @@ -0,0 +1,3 @@ +* Multiplication between a string and integer now raises OverflowError instead + of MemoryError if the result of the multiplication would create a string that + is too large to be represented. This follows Python's behavior. diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp index 7f7857c49c34..502062095f2d 100644 --- a/numpy/_core/src/umath/string_ufuncs.cpp +++ b/numpy/_core/src/umath/string_ufuncs.cpp @@ -15,6 +15,7 @@ #include "dtypemeta.h" #include "convert_datatype.h" #include "gil_utils.h" +#include "templ_common.h" /* for npy_mul_size_with_overflow_size_t */ #include "string_ufuncs.h" #include "string_fastsearch.h" @@ -166,34 +167,44 @@ string_add(Buffer buf1, Buffer buf2, Buffer out) template -static inline void +static inline int string_multiply(Buffer buf1, npy_int64 reps, Buffer out) { size_t len1 = buf1.num_codepoints(); if (reps < 1 || len1 == 0) { out.buffer_fill_with_zeros_after_index(0); - return; + return 0; } size_t width = out.buffer_width(); - size_t pad = 0; - if (width < len1*reps) { - reps = width / len1; - pad = width % len1; - } if (len1 == 1) { out.buffer_memset(*buf1, reps); out.buffer_fill_with_zeros_after_index(reps); + return 0; } - else { - for (npy_int64 i = 0; i < reps; i++) { - buf1.buffer_memcpy(out, len1); - out += len1; - } - buf1.buffer_memcpy(out, pad); - out += pad; - out.buffer_fill_with_zeros_after_index(0); + + size_t newlen; + if (NPY_UNLIKELY(npy_mul_with_overflow_size_t(&newlen, reps, len1) < 0) || newlen > PY_SSIZE_T_MAX) { + return -1; } + + size_t pad = 0; + if (width < newlen) { + reps = width / len1; + pad = width % len1; + } + + for (npy_int64 i = 0; i < reps; i++) { + buf1.buffer_memcpy(out, len1); + out += len1; + } + + buf1.buffer_memcpy(out, pad); + out += pad; + + out.buffer_fill_with_zeros_after_index(0); + + return 0; } @@ -246,7 +257,9 @@ string_multiply_strint_loop(PyArrayMethod_Context *context, while (N--) { Buffer buf(in1, elsize); Buffer outbuf(out, outsize); - string_multiply(buf, *(npy_int64 *)in2, outbuf); + if (NPY_UNLIKELY(string_multiply(buf, *(npy_int64 *)in2, outbuf) < 0)) { + npy_gil_error(PyExc_OverflowError, "Overflow detected in string multiply"); + } in1 += strides[0]; in2 += strides[1]; @@ -275,7 +288,9 @@ string_multiply_intstr_loop(PyArrayMethod_Context *context, while (N--) { Buffer buf(in2, elsize); Buffer outbuf(out, outsize); - string_multiply(buf, *(npy_int64 *)in1, outbuf); + if (NPY_UNLIKELY(string_multiply(buf, *(npy_int64 *)in1, outbuf) < 0)) { + npy_gil_error(PyExc_OverflowError, "Overflow detected in string multiply"); + } in1 += strides[0]; in2 += strides[1]; diff --git a/numpy/_core/src/umath/stringdtype_ufuncs.cpp b/numpy/_core/src/umath/stringdtype_ufuncs.cpp index adcbfd3b7480..36418a489136 100644 --- a/numpy/_core/src/umath/stringdtype_ufuncs.cpp +++ b/numpy/_core/src/umath/stringdtype_ufuncs.cpp @@ -137,9 +137,9 @@ static int multiply_loop_core( size_t newsize; int overflowed = npy_mul_with_overflow_size_t( &newsize, cursize, factor); - if (overflowed) { - npy_gil_error(PyExc_MemoryError, - "Failed to allocate string in string multiply"); + if (overflowed || newsize > PY_SSIZE_T_MAX) { + npy_gil_error(PyExc_OverflowError, + "Overflow encountered in string multiply"); goto fail; } diff --git a/numpy/_core/strings.py b/numpy/_core/strings.py index fc0a2d0b4d1a..b4dc1656024f 100644 --- a/numpy/_core/strings.py +++ b/numpy/_core/strings.py @@ -218,7 +218,7 @@ def multiply(a, i): # Ensure we can do a_len * i without overflow. if np.any(a_len > sys.maxsize / np.maximum(i, 1)): - raise MemoryError("repeated string is too long") + raise OverflowError("Overflow encountered in string multiply") buffersizes = a_len * i out_dtype = f"{a.dtype.char}{buffersizes.max()}" diff --git a/numpy/_core/tests/test_strings.py b/numpy/_core/tests/test_strings.py index f87db9dcdd85..56e928df4d7b 100644 --- a/numpy/_core/tests/test_strings.py +++ b/numpy/_core/tests/test_strings.py @@ -224,7 +224,7 @@ def test_multiply_raises(self, dt): with pytest.raises(TypeError, match="unsupported type"): np.strings.multiply(np.array("abc", dtype=dt), 3.14) - with pytest.raises(MemoryError): + with pytest.raises(OverflowError): np.strings.multiply(np.array("abc", dtype=dt), sys.maxsize) def test_inplace_multiply(self, dt): @@ -235,6 +235,9 @@ def test_inplace_multiply(self, dt): else: assert_array_equal(arr, ['foo foo ', 'barbar']) + with pytest.raises(OverflowError): + arr *= sys.maxsize + @pytest.mark.parametrize("i_dt", [np.int8, np.int16, np.int32, np.int64, np.int_]) def test_multiply_integer_dtypes(self, i_dt, dt): From 1903aa545d4570ba87cb79520f6117a8de6e29d1 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 27 May 2025 11:00:04 -0600 Subject: [PATCH 3/6] MNT: respond to review suggestion --- numpy/_core/src/umath/string_ufuncs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp index 502062095f2d..8d71dadddca0 100644 --- a/numpy/_core/src/umath/string_ufuncs.cpp +++ b/numpy/_core/src/umath/string_ufuncs.cpp @@ -184,7 +184,7 @@ string_multiply(Buffer buf1, npy_int64 reps, Buffer out) } size_t newlen; - if (NPY_UNLIKELY(npy_mul_with_overflow_size_t(&newlen, reps, len1) < 0) || newlen > PY_SSIZE_T_MAX) { + if (NPY_UNLIKELY(npy_mul_with_overflow_size_t(&newlen, reps, len1) != 0) || newlen > PY_SSIZE_T_MAX) { return -1; } From 24643be593de91ab68a959048a30256bcee4b5a4 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 27 May 2025 11:00:15 -0600 Subject: [PATCH 4/6] MNT: handle overflow in one more spot --- numpy/_core/src/umath/stringdtype_ufuncs.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numpy/_core/src/umath/stringdtype_ufuncs.cpp b/numpy/_core/src/umath/stringdtype_ufuncs.cpp index 36418a489136..b0181d4186c9 100644 --- a/numpy/_core/src/umath/stringdtype_ufuncs.cpp +++ b/numpy/_core/src/umath/stringdtype_ufuncs.cpp @@ -1748,9 +1748,9 @@ center_ljust_rjust_strided_loop(PyArrayMethod_Context *context, width - num_codepoints); newsize += s1.size; - if (overflowed) { - npy_gil_error(PyExc_MemoryError, - "Failed to allocate string in %s", ufunc_name); + if (overflowed || newsize > PY_SSIZE_T_MAX) { + npy_gil_error(PyExc_OverflowError, + "Overflow encountered in %s", ufunc_name); goto fail; } From 03fa9292198bbcf2286620bfce306d241c0e9799 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 27 May 2025 11:00:29 -0600 Subject: [PATCH 5/6] MNT: make test behave the same on all architectures --- numpy/_core/tests/test_stringdtype.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/_core/tests/test_stringdtype.py b/numpy/_core/tests/test_stringdtype.py index 1c15c4895eaf..9bab810d4421 100644 --- a/numpy/_core/tests/test_stringdtype.py +++ b/numpy/_core/tests/test_stringdtype.py @@ -128,8 +128,8 @@ def test_null_roundtripping(): def test_string_too_large_error(): arr = np.array(["a", "b", "c"], dtype=StringDType()) - with pytest.raises(MemoryError): - arr * (2**63 - 2) + with pytest.raises(OverflowError): + arr * (sys.maxsize + 1) @pytest.mark.parametrize( From 3272c4834ce5a0fb586b5205d5e41b9db66b9bfd Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 27 May 2025 11:15:54 -0600 Subject: [PATCH 6/6] MNT: reorder to avoid work in some cases --- numpy/_core/src/umath/string_ufuncs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp index 8d71dadddca0..95f30ccb109e 100644 --- a/numpy/_core/src/umath/string_ufuncs.cpp +++ b/numpy/_core/src/umath/string_ufuncs.cpp @@ -176,7 +176,6 @@ string_multiply(Buffer buf1, npy_int64 reps, Buffer out) return 0; } - size_t width = out.buffer_width(); if (len1 == 1) { out.buffer_memset(*buf1, reps); out.buffer_fill_with_zeros_after_index(reps); @@ -189,6 +188,7 @@ string_multiply(Buffer buf1, npy_int64 reps, Buffer out) } size_t pad = 0; + size_t width = out.buffer_width(); if (width < newlen) { reps = width / len1; pad = width % len1;