1515#include "postgres.h"
1616
1717#include "access/detoast.h"
18- #include "catalog/pg_collation_d.h"
19- #include "catalog/pg_type_d.h"
18+ #include "common/hashfn.h"
2019#include "common/int.h"
2120#include "fmgr.h"
21+ #include "lib/hyperloglog.h"
2222#include "libpq/pqformat.h"
2323#include "port/pg_bitutils.h"
24+ #include "port/pg_bswap.h"
2425#include "utils/builtins.h"
2526#include "utils/bytea.h"
2627#include "utils/fmgrprotos.h"
28+ #include "utils/guc.h"
2729#include "utils/memutils.h"
2830#include "utils/sortsupport.h"
29- #include "utils/varlena.h"
3031#include "varatt.h"
3132
3233/* GUC variable */
@@ -37,6 +38,19 @@ static bytea *bytea_substring(Datum str, int S, int L,
3738 bool length_not_specified );
3839static bytea * bytea_overlay (bytea * t1 , bytea * t2 , int sp , int sl );
3940
41+ typedef struct
42+ {
43+ bool abbreviate ; /* Should we abbreviate keys? */
44+ hyperLogLogState abbr_card ; /* Abbreviated key cardinality state */
45+ hyperLogLogState full_card ; /* Full key cardinality state */
46+ double prop_card ; /* Required cardinality proportion */
47+ } ByteaSortSupport ;
48+
49+ /* Static function declarations for sort support */
50+ static int byteafastcmp (Datum x , Datum y , SortSupport ssup );
51+ static Datum bytea_abbrev_convert (Datum original , SortSupport ssup );
52+ static bool bytea_abbrev_abort (int memtupcount , SortSupport ssup );
53+
4054/*
4155 * bytea_catenate
4256 * Guts of byteacat(), broken out so it can be used by other functions
@@ -1001,6 +1015,201 @@ bytea_smaller(PG_FUNCTION_ARGS)
10011015 PG_RETURN_BYTEA_P (result );
10021016}
10031017
1018+ /*
1019+ * sortsupport comparison func
1020+ */
1021+ static int
1022+ byteafastcmp (Datum x , Datum y , SortSupport ssup )
1023+ {
1024+ bytea * arg1 = DatumGetByteaPP (x );
1025+ bytea * arg2 = DatumGetByteaPP (y );
1026+ char * a1p ,
1027+ * a2p ;
1028+ int len1 ,
1029+ len2 ,
1030+ result ;
1031+
1032+ a1p = VARDATA_ANY (arg1 );
1033+ a2p = VARDATA_ANY (arg2 );
1034+
1035+ len1 = VARSIZE_ANY_EXHDR (arg1 );
1036+ len2 = VARSIZE_ANY_EXHDR (arg2 );
1037+
1038+ result = memcmp (a1p , a2p , Min (len1 , len2 ));
1039+ if ((result == 0 ) && (len1 != len2 ))
1040+ result = (len1 < len2 ) ? -1 : 1 ;
1041+
1042+ /* We can't afford to leak memory here. */
1043+ if (PointerGetDatum (arg1 ) != x )
1044+ pfree (arg1 );
1045+ if (PointerGetDatum (arg2 ) != y )
1046+ pfree (arg2 );
1047+
1048+ return result ;
1049+ }
1050+
1051+ /*
1052+ * Conversion routine for sortsupport. Converts original to abbreviated key
1053+ * representation. Our encoding strategy is simple -- pack the first 8 bytes
1054+ * of the bytea data into a Datum (on little-endian machines, the bytes are
1055+ * stored in reverse order), and treat it as an unsigned integer.
1056+ */
1057+ static Datum
1058+ bytea_abbrev_convert (Datum original , SortSupport ssup )
1059+ {
1060+ const size_t max_prefix_bytes = sizeof (Datum );
1061+ ByteaSortSupport * bss = (ByteaSortSupport * ) ssup -> ssup_extra ;
1062+ bytea * authoritative = DatumGetByteaPP (original );
1063+ char * authoritative_data = VARDATA_ANY (authoritative );
1064+ Datum res ;
1065+ char * pres ;
1066+ int len ;
1067+ uint32 hash ;
1068+
1069+ pres = (char * ) & res ;
1070+
1071+ /* memset(), so any non-overwritten bytes are NUL */
1072+ memset (pres , 0 , max_prefix_bytes );
1073+ len = VARSIZE_ANY_EXHDR (authoritative );
1074+
1075+ /*
1076+ * Short byteas will have terminating NUL bytes in the abbreviated datum.
1077+ * Abbreviated comparison need not make a distinction between these NUL
1078+ * bytes, and NUL bytes representing actual NULs in the authoritative
1079+ * representation.
1080+ *
1081+ * Hopefully a comparison at or past one abbreviated key's terminating NUL
1082+ * byte will resolve the comparison without consulting the authoritative
1083+ * representation; specifically, some later non-NUL byte in the longer
1084+ * bytea can resolve the comparison against a subsequent terminating NUL
1085+ * in the shorter bytea. There will usually be what is effectively a
1086+ * "length-wise" resolution there and then.
1087+ *
1088+ * If that doesn't work out -- if all bytes in the longer bytea positioned
1089+ * at or past the offset of the smaller bytea (first) terminating NUL are
1090+ * actually representative of NUL bytes in the authoritative binary bytea
1091+ * (perhaps with some *terminating* NUL bytes towards the end of the
1092+ * longer bytea iff it happens to still be small) -- then an authoritative
1093+ * tie-breaker will happen, and do the right thing: explicitly consider
1094+ * bytea length.
1095+ */
1096+ memcpy (pres , authoritative_data , Min (len , max_prefix_bytes ));
1097+
1098+ /*
1099+ * Maintain approximate cardinality of both abbreviated keys and original,
1100+ * authoritative keys using HyperLogLog. Used as cheap insurance against
1101+ * the worst case, where we do many string abbreviations for no saving in
1102+ * full memcmp()-based comparisons. These statistics are used by
1103+ * bytea_abbrev_abort().
1104+ *
1105+ * First, Hash key proper, or a significant fraction of it. Mix in length
1106+ * in order to compensate for cases where differences are past
1107+ * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
1108+ */
1109+ hash = DatumGetUInt32 (hash_any ((unsigned char * ) authoritative_data ,
1110+ Min (len , PG_CACHE_LINE_SIZE )));
1111+
1112+ if (len > PG_CACHE_LINE_SIZE )
1113+ hash ^= DatumGetUInt32 (hash_uint32 ((uint32 ) len ));
1114+
1115+ addHyperLogLog (& bss -> full_card , hash );
1116+
1117+ /* Hash abbreviated key */
1118+ {
1119+ uint32 tmp ;
1120+
1121+ tmp = DatumGetUInt32 (res ) ^ (uint32 ) (DatumGetUInt64 (res ) >> 32 );
1122+ hash = DatumGetUInt32 (hash_uint32 (tmp ));
1123+ }
1124+
1125+ addHyperLogLog (& bss -> abbr_card , hash );
1126+
1127+ /*
1128+ * Byteswap on little-endian machines.
1129+ *
1130+ * This is needed so that ssup_datum_unsigned_cmp() works correctly on all
1131+ * platforms.
1132+ */
1133+ res = DatumBigEndianToNative (res );
1134+
1135+ /* Don't leak memory here */
1136+ if (PointerGetDatum (authoritative ) != original )
1137+ pfree (authoritative );
1138+
1139+ return res ;
1140+ }
1141+
1142+ /*
1143+ * Callback for estimating effectiveness of abbreviated key optimization, using
1144+ * heuristic rules. Returns value indicating if the abbreviation optimization
1145+ * should be aborted, based on its projected effectiveness.
1146+ *
1147+ * This is based on varstr_abbrev_abort(), but some comments have been elided
1148+ * for brevity. See there for more details.
1149+ */
1150+ static bool
1151+ bytea_abbrev_abort (int memtupcount , SortSupport ssup )
1152+ {
1153+ ByteaSortSupport * bss = (ByteaSortSupport * ) ssup -> ssup_extra ;
1154+ double abbrev_distinct ,
1155+ key_distinct ;
1156+
1157+ Assert (ssup -> abbreviate );
1158+
1159+ /* Have a little patience */
1160+ if (memtupcount < 100 )
1161+ return false;
1162+
1163+ abbrev_distinct = estimateHyperLogLog (& bss -> abbr_card );
1164+ key_distinct = estimateHyperLogLog (& bss -> full_card );
1165+
1166+ /*
1167+ * Clamp cardinality estimates to at least one distinct value. While
1168+ * NULLs are generally disregarded, if only NULL values were seen so far,
1169+ * that might misrepresent costs if we failed to clamp.
1170+ */
1171+ if (abbrev_distinct < 1.0 )
1172+ abbrev_distinct = 1.0 ;
1173+
1174+ if (key_distinct < 1.0 )
1175+ key_distinct = 1.0 ;
1176+
1177+ if (trace_sort )
1178+ {
1179+ double norm_abbrev_card = abbrev_distinct / (double ) memtupcount ;
1180+
1181+ elog (LOG , "bytea_abbrev: abbrev_distinct after %d: %f "
1182+ "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)" ,
1183+ memtupcount , abbrev_distinct , key_distinct , norm_abbrev_card ,
1184+ bss -> prop_card );
1185+ }
1186+
1187+ /*
1188+ * If the number of distinct abbreviated keys approximately matches the
1189+ * number of distinct original keys, continue with abbreviation.
1190+ */
1191+ if (abbrev_distinct > key_distinct * bss -> prop_card )
1192+ {
1193+ /*
1194+ * Decay required cardinality aggressively after 10,000 tuples.
1195+ */
1196+ if (memtupcount > 10000 )
1197+ bss -> prop_card *= 0.65 ;
1198+
1199+ return false;
1200+ }
1201+
1202+ /*
1203+ * Abort abbreviation strategy.
1204+ */
1205+ if (trace_sort )
1206+ elog (LOG , "bytea_abbrev: aborted abbreviation at %d "
1207+ "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)" ,
1208+ memtupcount , abbrev_distinct , key_distinct , bss -> prop_card );
1209+
1210+ return true;
1211+ }
1212+
10041213Datum
10051214bytea_sortsupport (PG_FUNCTION_ARGS )
10061215{
@@ -1009,8 +1218,27 @@ bytea_sortsupport(PG_FUNCTION_ARGS)
10091218
10101219 oldcontext = MemoryContextSwitchTo (ssup -> ssup_cxt );
10111220
1012- /* Use generic string SortSupport, forcing "C" collation */
1013- varstr_sortsupport (ssup , BYTEAOID , C_COLLATION_OID );
1221+ ssup -> comparator = byteafastcmp ;
1222+
1223+ /*
1224+ * Set up abbreviation support if requested.
1225+ */
1226+ if (ssup -> abbreviate )
1227+ {
1228+ ByteaSortSupport * bss ;
1229+
1230+ bss = palloc_object (ByteaSortSupport );
1231+ bss -> abbreviate = true;
1232+ bss -> prop_card = 0.20 ;
1233+ initHyperLogLog (& bss -> abbr_card , 10 );
1234+ initHyperLogLog (& bss -> full_card , 10 );
1235+
1236+ ssup -> ssup_extra = bss ;
1237+ ssup -> abbrev_full_comparator = ssup -> comparator ;
1238+ ssup -> comparator = ssup_datum_unsigned_cmp ;
1239+ ssup -> abbrev_converter = bytea_abbrev_convert ;
1240+ ssup -> abbrev_abort = bytea_abbrev_abort ;
1241+ }
10141242
10151243 MemoryContextSwitchTo (oldcontext );
10161244
0 commit comments