@@ -1075,16 +1075,46 @@ def PyUnicode_EncodeUTF16(s, size, errors, byteorder="little"):
10751075 elif byteorder == "big" :
10761076 bom = "big"
10771077
1078- for c in s :
1079- ch = ord (c )
1080- ch2 = 0
1081- if ch >= 0x10000 :
1078+ i = 0
1079+ while i < len (s ):
1080+ ch = ord (s [i ])
1081+
1082+ # Check for surrogates - each surrogate is invalid in UTF-16
1083+ # regardless of whether it could form a pair
1084+ if 0xD800 <= ch <= 0xDFFF :
1085+ # Surrogate - handle with error handler
1086+ startinpos = i
1087+ endinpos = i + 1
1088+ res = unicode_call_errorhandler (
1089+ errors , "utf-16-le" if bom == "little" else "utf-16-be" ,
1090+ "surrogates not allowed" , s , startinpos , endinpos
1091+ )
1092+ # res[0] is the replacement string, res[1] is the new position
1093+ for replacement_char in res [0 ]:
1094+ rch = ord (replacement_char )
1095+ if rch >= 0x10000 :
1096+ # Encode as surrogate pair
1097+ rch2 = 0xDC00 | ((rch - 0x10000 ) & 0x3FF )
1098+ rch = 0xD800 | ((rch - 0x10000 ) >> 10 )
1099+ p += STORECHAR (rch , bom )
1100+ p += STORECHAR (rch2 , bom )
1101+ elif 0xD800 <= rch <= 0xDFFF :
1102+ # Don't encode surrogates in the replacement
1103+ pass
1104+ else :
1105+ p += STORECHAR (rch , bom )
1106+ i = res [1 ]
1107+ elif ch >= 0x10000 :
1108+ # Regular character above BMP - encode as surrogate pair
10821109 ch2 = 0xDC00 | ((ch - 0x10000 ) & 0x3FF )
10831110 ch = 0xD800 | ((ch - 0x10000 ) >> 10 )
1084-
1085- p += STORECHAR (ch , bom )
1086- if ch2 :
1111+ p += STORECHAR (ch , bom )
10871112 p += STORECHAR (ch2 , bom )
1113+ i += 1
1114+ else :
1115+ # Regular BMP character
1116+ p += STORECHAR (ch , bom )
1117+ i += 1
10881118
10891119 return p
10901120
@@ -1183,9 +1213,29 @@ def STORECHAR32(ch, byteorder):
11831213 if size == 0 :
11841214 return p
11851215
1186- for c in s :
1187- ch = ord (c )
1188- p += STORECHAR32 (ch , bom )
1216+ i = 0
1217+ while i < len (s ):
1218+ ch = ord (s [i ])
1219+
1220+ # Check for surrogates - they are not valid in UTF-32
1221+ if 0xD800 <= ch <= 0xDFFF :
1222+ # Surrogate - handle with error handler
1223+ startinpos = i
1224+ endinpos = i + 1
1225+ res = unicode_call_errorhandler (
1226+ errors , "utf-32-le" if bom == "little" else "utf-32-be" ,
1227+ "surrogates not allowed" , s , startinpos , endinpos , False
1228+ )
1229+ # res[0] is the replacement string, res[1] is the new position
1230+ for replacement_char in res [0 ]:
1231+ rch = ord (replacement_char )
1232+ # Don't encode surrogates in the replacement
1233+ if not (0xD800 <= rch <= 0xDFFF ):
1234+ p += STORECHAR32 (rch , bom )
1235+ i = res [1 ]
1236+ else :
1237+ p += STORECHAR32 (ch , bom )
1238+ i += 1
11891239
11901240 return p
11911241
0 commit comments