@@ -773,7 +773,7 @@ intset_binsrch_leaf(uint64 item, leaf_item *arr, int arr_elems, bool nextkey)
773773 *
774774 * Simple-8b algorithm packs between 1 and 240 integers into 64-bit words,
775775 * called "codewords". The number of integers packed into a single codeword
776- * depends on the integers being packed: small integers are encoded using
776+ * depends on the integers being packed; small integers are encoded using
777777 * fewer bits than large integers. A single codeword can store a single
778778 * 60-bit integer, or two 30-bit integers, for example.
779779 *
@@ -783,11 +783,11 @@ intset_binsrch_leaf(uint64 item, leaf_item *arr, int arr_elems, bool nextkey)
783783 * of the absolute values.
784784 *
785785 * In Simple-8b, each codeword consists of a 4-bit selector, which indicates
786- * how many integers are encoded in the codeword, and the encoded integers
786+ * how many integers are encoded in the codeword, and the encoded integers are
787787 * packed into the remaining 60 bits. The selector allows for 16 different
788- * ways of using the remaining 60 bits, "modes". The number of integers
789- * packed into a single codeword is listed in the simple8b_modes table below.
790- * For example, consider the following codeword:
788+ * ways of using the remaining 60 bits, called "modes". The number of integers
789+ * packed into a single codeword in each mode is listed in the simple8b_modes
790+ * table below. For example, consider the following codeword:
791791 *
792792 * 20-bit integer 20-bit integer 20-bit integer
793793 * 1101 00000000000000010010 01111010000100100000 00000000000000010100
@@ -835,22 +835,28 @@ static const struct
835835 {20 , 3 }, /* mode 13: three 20-bit integers */
836836 {30 , 2 }, /* mode 14: two 30-bit integers */
837837 {60 , 1 }, /* mode 15: one 60-bit integer */
838+
838839 {0 , 0 } /* sentinel value */
839840};
840841
841842/*
842843 * EMPTY_CODEWORD is a special value, used to indicate "no values".
843844 * It is used if the next value is too large to be encoded with Simple-8b.
844845 *
845- * This value looks like a 0-mode codeword, but we check for it
846+ * This value looks like a 0-mode codeword, but we check for it
846847 * specifically. (In a real 0-mode codeword, all the unused bits are zero.)
847848 */
848- #define EMPTY_CODEWORD UINT64CONST(0xFFFFFFFFFFFFFFF0 )
849+ #define EMPTY_CODEWORD UINT64CONST(0x0FFFFFFFFFFFFFFF )
849850
850851/*
851852 * Encode a number of integers into a Simple-8b codeword.
852853 *
853- * Returns the number of integers that were encoded.
854+ * The input array must contain at least SIMPLE8B_MAX_VALUES_PER_CODEWORD
855+ * elements.
856+ *
857+ * Returns the encoded codeword, and sets *num_encoded to the number
858+ * input integers that were encoded. It can be zero, if the first input is
859+ * too large to be encoded.
854860 */
855861static uint64
856862simple8b_encode (uint64 * ints , int * num_encoded , uint64 base )
@@ -861,7 +867,6 @@ simple8b_encode(uint64 *ints, int *num_encoded, uint64 base)
861867 uint64 diff ;
862868 uint64 last_val ;
863869 uint64 codeword ;
864- uint64 diffs [60 ];
865870 int i ;
866871
867872 Assert (ints [0 ] > base );
@@ -891,13 +896,12 @@ simple8b_encode(uint64 *ints, int *num_encoded, uint64 base)
891896 selector ++ ;
892897 nints = simple8b_modes [selector ].num_ints ;
893898 bits = simple8b_modes [selector ].bits_per_int ;
899+
894900 if (i >= nints )
895901 break ;
896902 }
897903 else
898904 {
899- if (i < 60 )
900- diffs [i ] = diff ;
901905 i ++ ;
902906 if (i >= nints )
903907 break ;
@@ -910,7 +914,13 @@ simple8b_encode(uint64 *ints, int *num_encoded, uint64 base)
910914
911915 if (nints == 0 )
912916 {
913- /* The next value is too large and be encoded with Simple-8b */
917+ /*
918+ * The first value is too large to be encoded with Simple-8b.
919+ *
920+ * If there is at least one not-too-large integer in the input, we
921+ * will encode it using mode 15 (or a more compact mode). Hence, we
922+ * only get here, if the *first* input integer is >= 2^60.
923+ */
914924 Assert (i == 0 );
915925 * num_encoded = 0 ;
916926 return EMPTY_CODEWORD ;
@@ -924,16 +934,18 @@ simple8b_encode(uint64 *ints, int *num_encoded, uint64 base)
924934 codeword = 0 ;
925935 if (bits > 0 )
926936 {
927- for (i = nints - 1 ; i >= 0 ; i -- )
937+ for (i = nints - 1 ; i > 0 ; i -- )
928938 {
939+ diff = ints [i ] - ints [i - 1 ] - 1 ;
940+ codeword |= diff ;
929941 codeword <<= bits ;
930- codeword |= diffs [i ];
931942 }
943+ diff = ints [0 ] - base - 1 ;
944+ codeword |= diff ;
932945 }
933946
934947 /* add selector to the codeword, and return */
935- codeword <<= 4 ;
936- codeword |= selector ;
948+ codeword |= (uint64 ) selector << 60 ;
937949
938950 * num_encoded = nints ;
939951 return codeword ;
@@ -945,7 +957,7 @@ simple8b_encode(uint64 *ints, int *num_encoded, uint64 base)
945957static int
946958simple8b_decode (uint64 codeword , uint64 * decoded , uint64 base )
947959{
948- int selector = codeword & 0x0f ;
960+ int selector = ( codeword >> 60 ) ;
949961 int nints = simple8b_modes [selector ].num_ints ;
950962 uint64 bits = simple8b_modes [selector ].bits_per_int ;
951963 uint64 mask = (UINT64CONST (1 ) << bits ) - 1 ;
@@ -954,8 +966,6 @@ simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base)
954966 if (codeword == EMPTY_CODEWORD )
955967 return 0 ;
956968
957- codeword >>= 4 ; /* shift out the selector */
958-
959969 prev_value = base ;
960970 for (int i = 0 ; i < nints ; i ++ )
961971 {
@@ -976,15 +986,13 @@ simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base)
976986static bool
977987simple8b_contains (uint64 codeword , uint64 key , uint64 base )
978988{
979- int selector = codeword & 0x0f ;
989+ int selector = ( codeword >> 60 ) ;
980990 int nints = simple8b_modes [selector ].num_ints ;
981991 int bits = simple8b_modes [selector ].bits_per_int ;
982992
983993 if (codeword == EMPTY_CODEWORD )
984994 return false;
985995
986- codeword >>= 4 ; /* shift out the selector */
987-
988996 if (bits == 0 )
989997 {
990998 /* Special handling for 0-bit cases. */
0 commit comments