@@ -80,6 +80,51 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
8080 return false;
8181}
8282
83+ #ifndef USE_NO_SIMD
84+ /*
85+ * pg_lfind32_simd_helper
86+ *
87+ * Searches one 4-register-block of integers. The caller is responsible for
88+ * ensuring that there are at least 4-registers-worth of integers remaining.
89+ */
90+ static inline bool
91+ pg_lfind32_simd_helper (const Vector32 keys , uint32 * base )
92+ {
93+ const uint32 nelem_per_vector = sizeof (Vector32 ) / sizeof (uint32 );
94+ Vector32 vals1 ,
95+ vals2 ,
96+ vals3 ,
97+ vals4 ,
98+ result1 ,
99+ result2 ,
100+ result3 ,
101+ result4 ,
102+ tmp1 ,
103+ tmp2 ,
104+ result ;
105+
106+ /* load the next block into 4 registers */
107+ vector32_load (& vals1 , base );
108+ vector32_load (& vals2 , & base [nelem_per_vector ]);
109+ vector32_load (& vals3 , & base [nelem_per_vector * 2 ]);
110+ vector32_load (& vals4 , & base [nelem_per_vector * 3 ]);
111+
112+ /* compare each value to the key */
113+ result1 = vector32_eq (keys , vals1 );
114+ result2 = vector32_eq (keys , vals2 );
115+ result3 = vector32_eq (keys , vals3 );
116+ result4 = vector32_eq (keys , vals4 );
117+
118+ /* combine the results into a single variable */
119+ tmp1 = vector32_or (result1 , result2 );
120+ tmp2 = vector32_or (result3 , result4 );
121+ result = vector32_or (tmp1 , tmp2 );
122+
123+ /* return whether there was a match */
124+ return vector32_is_highbit_set (result );
125+ }
126+ #endif /* ! USE_NO_SIMD */
127+
83128/*
84129 * pg_lfind32
85130 *
@@ -95,8 +140,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
95140
96141 /*
97142 * For better instruction-level parallelism, each loop iteration operates
98- * on a block of four registers. Testing for SSE2 has showed this is ~40%
99- * faster than using a block of two registers.
143+ * on a block of four registers.
100144 */
101145 const Vector32 keys = vector32_broadcast (key ); /* load copies of key */
102146 const uint32 nelem_per_vector = sizeof (Vector32 ) / sizeof (uint32 );
@@ -109,57 +153,51 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
109153 bool assert_result = false;
110154
111155 /* pre-compute the result for assert checking */
112- for (i = 0 ; i < nelem ; i ++ )
156+ for (int j = 0 ; j < nelem ; j ++ )
113157 {
114- if (key == base [i ])
158+ if (key == base [j ])
115159 {
116160 assert_result = true;
117161 break ;
118162 }
119163 }
120164#endif
121165
122- for (i = 0 ; i < tail_idx ; i += nelem_per_iteration )
166+ /*
167+ * If there aren't enough elements for the SIMD code, jump to the standard
168+ * one-by-one linear search code.
169+ */
170+ if (nelem < nelem_per_iteration )
171+ goto one_by_one ;
172+
173+ /*
174+ * Process as many elements as possible with a block of 4 registers.
175+ */
176+ do
123177 {
124- Vector32 vals1 ,
125- vals2 ,
126- vals3 ,
127- vals4 ,
128- result1 ,
129- result2 ,
130- result3 ,
131- result4 ,
132- tmp1 ,
133- tmp2 ,
134- result ;
135-
136- /* load the next block into 4 registers */
137- vector32_load (& vals1 , & base [i ]);
138- vector32_load (& vals2 , & base [i + nelem_per_vector ]);
139- vector32_load (& vals3 , & base [i + nelem_per_vector * 2 ]);
140- vector32_load (& vals4 , & base [i + nelem_per_vector * 3 ]);
141-
142- /* compare each value to the key */
143- result1 = vector32_eq (keys , vals1 );
144- result2 = vector32_eq (keys , vals2 );
145- result3 = vector32_eq (keys , vals3 );
146- result4 = vector32_eq (keys , vals4 );
147-
148- /* combine the results into a single variable */
149- tmp1 = vector32_or (result1 , result2 );
150- tmp2 = vector32_or (result3 , result4 );
151- result = vector32_or (tmp1 , tmp2 );
152-
153- /* see if there was a match */
154- if (vector32_is_highbit_set (result ))
178+ if (pg_lfind32_simd_helper (keys , & base [i ]))
155179 {
156180 Assert (assert_result == true);
157181 return true;
158182 }
159- }
183+
184+ i += nelem_per_iteration ;
185+
186+ } while (i < tail_idx );
187+
188+ /*
189+ * Process the last 'nelem_per_iteration' elements in the array with a
190+ * 4-register block. This will cause us to check a subset of the elements
191+ * more than once, but that won't affect correctness, and testing has
192+ * demonstrated that this helps more cases than it harms.
193+ */
194+ Assert (assert_result == pg_lfind32_simd_helper (keys , & base [nelem - nelem_per_iteration ]));
195+ return pg_lfind32_simd_helper (keys , & base [nelem - nelem_per_iteration ]);
196+
160197#endif /* ! USE_NO_SIMD */
161198
162- /* Process the remaining elements one at a time. */
199+ one_by_one :
200+ /* Process the elements one at a time. */
163201 for (; i < nelem ; i ++ )
164202 {
165203 if (key == base [i ])
0 commit comments