🌐 AI搜索 & 代理 主页
Skip to content

Commit 9303d62

Browse files
committed
Separate out bytea sort support from varlena.c
In the wake of commit b45242f, bytea_sortsupport() still called out to varstr_sortsupport(). Treating bytea as a kind of text/varchar required varstr_sortsupport() to allow for the possibility of NUL bytes, but only for C collation. This was confusing. For better separation of concerns, create an independent sortsupport implementation in bytea.c. The heuristics for bytea_abbrev_abort() remain the same as for varstr_abbrev_abort(). It's possible that the bytea case warrants different treatment, but that is left for future investigation. In passing, adjust some strange looking comparisons in varstr_abbrev_abort(). Author: Aleksander Alekseev <aleksander@tigerdata.com> Reviewed-by: John Naylor <johncnaylorls@gmail.com> Reviewed-by: Chao Li <li.evan.chao@gmail.com> Discussion: https://postgr.es/m/CAJ7c6TP1bAbEhUJa6+rgceN6QJWMSsxhg1=mqfSN=Nb-n6DAKg@mail.gmail.com
1 parent 15f68ce commit 9303d62

File tree

3 files changed

+242
-40
lines changed

3 files changed

+242
-40
lines changed

src/backend/utils/adt/bytea.c

Lines changed: 233 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,19 @@
1515
#include "postgres.h"
1616

1717
#include "access/detoast.h"
18-
#include "catalog/pg_collation_d.h"
19-
#include "catalog/pg_type_d.h"
18+
#include "common/hashfn.h"
2019
#include "common/int.h"
2120
#include "fmgr.h"
21+
#include "lib/hyperloglog.h"
2222
#include "libpq/pqformat.h"
2323
#include "port/pg_bitutils.h"
24+
#include "port/pg_bswap.h"
2425
#include "utils/builtins.h"
2526
#include "utils/bytea.h"
2627
#include "utils/fmgrprotos.h"
28+
#include "utils/guc.h"
2729
#include "utils/memutils.h"
2830
#include "utils/sortsupport.h"
29-
#include "utils/varlena.h"
3031
#include "varatt.h"
3132

3233
/* GUC variable */
@@ -37,6 +38,19 @@ static bytea *bytea_substring(Datum str, int S, int L,
3738
bool length_not_specified);
3839
static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
3940

41+
typedef struct
42+
{
43+
bool abbreviate; /* Should we abbreviate keys? */
44+
hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
45+
hyperLogLogState full_card; /* Full key cardinality state */
46+
double prop_card; /* Required cardinality proportion */
47+
} ByteaSortSupport;
48+
49+
/* Static function declarations for sort support */
50+
static int byteafastcmp(Datum x, Datum y, SortSupport ssup);
51+
static Datum bytea_abbrev_convert(Datum original, SortSupport ssup);
52+
static bool bytea_abbrev_abort(int memtupcount, SortSupport ssup);
53+
4054
/*
4155
* bytea_catenate
4256
* Guts of byteacat(), broken out so it can be used by other functions
@@ -1001,6 +1015,201 @@ bytea_smaller(PG_FUNCTION_ARGS)
10011015
PG_RETURN_BYTEA_P(result);
10021016
}
10031017

1018+
/*
1019+
* sortsupport comparison func
1020+
*/
1021+
static int
1022+
byteafastcmp(Datum x, Datum y, SortSupport ssup)
1023+
{
1024+
bytea *arg1 = DatumGetByteaPP(x);
1025+
bytea *arg2 = DatumGetByteaPP(y);
1026+
char *a1p,
1027+
*a2p;
1028+
int len1,
1029+
len2,
1030+
result;
1031+
1032+
a1p = VARDATA_ANY(arg1);
1033+
a2p = VARDATA_ANY(arg2);
1034+
1035+
len1 = VARSIZE_ANY_EXHDR(arg1);
1036+
len2 = VARSIZE_ANY_EXHDR(arg2);
1037+
1038+
result = memcmp(a1p, a2p, Min(len1, len2));
1039+
if ((result == 0) && (len1 != len2))
1040+
result = (len1 < len2) ? -1 : 1;
1041+
1042+
/* We can't afford to leak memory here. */
1043+
if (PointerGetDatum(arg1) != x)
1044+
pfree(arg1);
1045+
if (PointerGetDatum(arg2) != y)
1046+
pfree(arg2);
1047+
1048+
return result;
1049+
}
1050+
1051+
/*
1052+
* Conversion routine for sortsupport. Converts original to abbreviated key
1053+
* representation. Our encoding strategy is simple -- pack the first 8 bytes
1054+
* of the bytea data into a Datum (on little-endian machines, the bytes are
1055+
* stored in reverse order), and treat it as an unsigned integer.
1056+
*/
1057+
static Datum
1058+
bytea_abbrev_convert(Datum original, SortSupport ssup)
1059+
{
1060+
const size_t max_prefix_bytes = sizeof(Datum);
1061+
ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra;
1062+
bytea *authoritative = DatumGetByteaPP(original);
1063+
char *authoritative_data = VARDATA_ANY(authoritative);
1064+
Datum res;
1065+
char *pres;
1066+
int len;
1067+
uint32 hash;
1068+
1069+
pres = (char *) &res;
1070+
1071+
/* memset(), so any non-overwritten bytes are NUL */
1072+
memset(pres, 0, max_prefix_bytes);
1073+
len = VARSIZE_ANY_EXHDR(authoritative);
1074+
1075+
/*
1076+
* Short byteas will have terminating NUL bytes in the abbreviated datum.
1077+
* Abbreviated comparison need not make a distinction between these NUL
1078+
* bytes, and NUL bytes representing actual NULs in the authoritative
1079+
* representation.
1080+
*
1081+
* Hopefully a comparison at or past one abbreviated key's terminating NUL
1082+
* byte will resolve the comparison without consulting the authoritative
1083+
* representation; specifically, some later non-NUL byte in the longer
1084+
* bytea can resolve the comparison against a subsequent terminating NUL
1085+
* in the shorter bytea. There will usually be what is effectively a
1086+
* "length-wise" resolution there and then.
1087+
*
1088+
* If that doesn't work out -- if all bytes in the longer bytea positioned
1089+
* at or past the offset of the smaller bytea (first) terminating NUL are
1090+
* actually representative of NUL bytes in the authoritative binary bytea
1091+
* (perhaps with some *terminating* NUL bytes towards the end of the
1092+
* longer bytea iff it happens to still be small) -- then an authoritative
1093+
* tie-breaker will happen, and do the right thing: explicitly consider
1094+
* bytea length.
1095+
*/
1096+
memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
1097+
1098+
/*
1099+
* Maintain approximate cardinality of both abbreviated keys and original,
1100+
* authoritative keys using HyperLogLog. Used as cheap insurance against
1101+
* the worst case, where we do many string abbreviations for no saving in
1102+
* full memcmp()-based comparisons. These statistics are used by
1103+
* bytea_abbrev_abort().
1104+
*
1105+
* First, Hash key proper, or a significant fraction of it. Mix in length
1106+
* in order to compensate for cases where differences are past
1107+
* PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
1108+
*/
1109+
hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
1110+
Min(len, PG_CACHE_LINE_SIZE)));
1111+
1112+
if (len > PG_CACHE_LINE_SIZE)
1113+
hash ^= DatumGetUInt32(hash_uint32((uint32) len));
1114+
1115+
addHyperLogLog(&bss->full_card, hash);
1116+
1117+
/* Hash abbreviated key */
1118+
{
1119+
uint32 tmp;
1120+
1121+
tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32);
1122+
hash = DatumGetUInt32(hash_uint32(tmp));
1123+
}
1124+
1125+
addHyperLogLog(&bss->abbr_card, hash);
1126+
1127+
/*
1128+
* Byteswap on little-endian machines.
1129+
*
1130+
* This is needed so that ssup_datum_unsigned_cmp() works correctly on all
1131+
* platforms.
1132+
*/
1133+
res = DatumBigEndianToNative(res);
1134+
1135+
/* Don't leak memory here */
1136+
if (PointerGetDatum(authoritative) != original)
1137+
pfree(authoritative);
1138+
1139+
return res;
1140+
}
1141+
1142+
/*
1143+
* Callback for estimating effectiveness of abbreviated key optimization, using
1144+
* heuristic rules. Returns value indicating if the abbreviation optimization
1145+
* should be aborted, based on its projected effectiveness.
1146+
*
1147+
* This is based on varstr_abbrev_abort(), but some comments have been elided
1148+
* for brevity. See there for more details.
1149+
*/
1150+
static bool
1151+
bytea_abbrev_abort(int memtupcount, SortSupport ssup)
1152+
{
1153+
ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra;
1154+
double abbrev_distinct,
1155+
key_distinct;
1156+
1157+
Assert(ssup->abbreviate);
1158+
1159+
/* Have a little patience */
1160+
if (memtupcount < 100)
1161+
return false;
1162+
1163+
abbrev_distinct = estimateHyperLogLog(&bss->abbr_card);
1164+
key_distinct = estimateHyperLogLog(&bss->full_card);
1165+
1166+
/*
1167+
* Clamp cardinality estimates to at least one distinct value. While
1168+
* NULLs are generally disregarded, if only NULL values were seen so far,
1169+
* that might misrepresent costs if we failed to clamp.
1170+
*/
1171+
if (abbrev_distinct < 1.0)
1172+
abbrev_distinct = 1.0;
1173+
1174+
if (key_distinct < 1.0)
1175+
key_distinct = 1.0;
1176+
1177+
if (trace_sort)
1178+
{
1179+
double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
1180+
1181+
elog(LOG, "bytea_abbrev: abbrev_distinct after %d: %f "
1182+
"(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
1183+
memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
1184+
bss->prop_card);
1185+
}
1186+
1187+
/*
1188+
* If the number of distinct abbreviated keys approximately matches the
1189+
* number of distinct original keys, continue with abbreviation.
1190+
*/
1191+
if (abbrev_distinct > key_distinct * bss->prop_card)
1192+
{
1193+
/*
1194+
* Decay required cardinality aggressively after 10,000 tuples.
1195+
*/
1196+
if (memtupcount > 10000)
1197+
bss->prop_card *= 0.65;
1198+
1199+
return false;
1200+
}
1201+
1202+
/*
1203+
* Abort abbreviation strategy.
1204+
*/
1205+
if (trace_sort)
1206+
elog(LOG, "bytea_abbrev: aborted abbreviation at %d "
1207+
"(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
1208+
memtupcount, abbrev_distinct, key_distinct, bss->prop_card);
1209+
1210+
return true;
1211+
}
1212+
10041213
Datum
10051214
bytea_sortsupport(PG_FUNCTION_ARGS)
10061215
{
@@ -1009,8 +1218,27 @@ bytea_sortsupport(PG_FUNCTION_ARGS)
10091218

10101219
oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
10111220

1012-
/* Use generic string SortSupport, forcing "C" collation */
1013-
varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
1221+
ssup->comparator = byteafastcmp;
1222+
1223+
/*
1224+
* Set up abbreviation support if requested.
1225+
*/
1226+
if (ssup->abbreviate)
1227+
{
1228+
ByteaSortSupport *bss;
1229+
1230+
bss = palloc_object(ByteaSortSupport);
1231+
bss->abbreviate = true;
1232+
bss->prop_card = 0.20;
1233+
initHyperLogLog(&bss->abbr_card, 10);
1234+
initHyperLogLog(&bss->full_card, 10);
1235+
1236+
ssup->ssup_extra = bss;
1237+
ssup->abbrev_full_comparator = ssup->comparator;
1238+
ssup->comparator = ssup_datum_unsigned_cmp;
1239+
ssup->abbrev_converter = bytea_abbrev_convert;
1240+
ssup->abbrev_abort = bytea_abbrev_abort;
1241+
}
10141242

10151243
MemoryContextSwitchTo(oldcontext);
10161244

src/backend/utils/adt/varlena.c

Lines changed: 8 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ typedef struct
9292
int last_returned; /* Last comparison result (cache) */
9393
bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
9494
bool collate_c;
95-
Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
95+
Oid typid; /* Actual datatype (text/bpchar/name) */
9696
hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
9797
hyperLogLogState full_card; /* Full key cardinality state */
9898
double prop_card; /* Required cardinality proportion */
@@ -1617,10 +1617,8 @@ bttextsortsupport(PG_FUNCTION_ARGS)
16171617
* Includes locale support, and support for BpChar semantics (i.e. removing
16181618
* trailing spaces before comparison).
16191619
*
1620-
* Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1621-
* same representation. Callers that always use the C collation (e.g.
1622-
* non-collatable type callers like bytea) may have NUL bytes in their strings;
1623-
* this will not work with any other collation, though.
1620+
* Relies on the assumption that text, VarChar, and BpChar all have the
1621+
* same representation.
16241622
*/
16251623
void
16261624
varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
@@ -1983,7 +1981,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
19831981
* representation. Our encoding strategy is simple -- pack the first 8 bytes
19841982
* of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
19851983
* stored in reverse order), and treat it as an unsigned integer. When the "C"
1986-
* locale is used, or in case of bytea, just memcpy() from original instead.
1984+
* locale is used just memcpy() from original instead.
19871985
*/
19881986
static Datum
19891987
varstr_abbrev_convert(Datum original, SortSupport ssup)
@@ -2010,30 +2008,8 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
20102008

20112009
/*
20122010
* If we're using the C collation, use memcpy(), rather than strxfrm(), to
2013-
* abbreviate keys. The full comparator for the C locale is always
2014-
* memcmp(). It would be incorrect to allow bytea callers (callers that
2015-
* always force the C collation -- bytea isn't a collatable type, but this
2016-
* approach is convenient) to use strxfrm(). This is because bytea
2017-
* strings may contain NUL bytes. Besides, this should be faster, too.
2018-
*
2019-
* More generally, it's okay that bytea callers can have NUL bytes in
2020-
* strings because abbreviated cmp need not make a distinction between
2021-
* terminating NUL bytes, and NUL bytes representing actual NULs in the
2022-
* authoritative representation. Hopefully a comparison at or past one
2023-
* abbreviated key's terminating NUL byte will resolve the comparison
2024-
* without consulting the authoritative representation; specifically, some
2025-
* later non-NUL byte in the longer string can resolve the comparison
2026-
* against a subsequent terminating NUL in the shorter string. There will
2027-
* usually be what is effectively a "length-wise" resolution there and
2028-
* then.
2029-
*
2030-
* If that doesn't work out -- if all bytes in the longer string
2031-
* positioned at or past the offset of the smaller string's (first)
2032-
* terminating NUL are actually representative of NUL bytes in the
2033-
* authoritative binary string (perhaps with some *terminating* NUL bytes
2034-
* towards the end of the longer string iff it happens to still be small)
2035-
* -- then an authoritative tie-breaker will happen, and do the right
2036-
* thing: explicitly consider string length.
2011+
* abbreviate keys. The full comparator for the C locale is also
2012+
* memcmp(). This should be faster than strxfrm().
20372013
*/
20382014
if (sss->collate_c)
20392015
memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
@@ -2115,9 +2091,6 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
21152091
* strxfrm() blob is itself NUL terminated, leaving no danger of
21162092
* misinterpreting any NUL bytes not intended to be interpreted as
21172093
* logically representing termination.
2118-
*
2119-
* (Actually, even if there were NUL bytes in the blob it would be
2120-
* okay. See remarks on bytea case above.)
21212094
*/
21222095
memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
21232096
}
@@ -2198,10 +2171,10 @@ varstr_abbrev_abort(int memtupcount, SortSupport ssup)
21982171
* NULLs are generally disregarded, if only NULL values were seen so far,
21992172
* that might misrepresent costs if we failed to clamp.
22002173
*/
2201-
if (abbrev_distinct <= 1.0)
2174+
if (abbrev_distinct < 1.0)
22022175
abbrev_distinct = 1.0;
22032176

2204-
if (key_distinct <= 1.0)
2177+
if (key_distinct < 1.0)
22052178
key_distinct = 1.0;
22062179

22072180
/*

src/tools/pgindent/typedefs.list

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ BulkWriteBuffer
366366
BulkWriteState
367367
BumpBlock
368368
BumpContext
369+
ByteaSortSupport
369370
CACHESIGN
370371
CAC_state
371372
CCFastEqualFN

0 commit comments

Comments
 (0)