99 * Copyright (c) 2001-2010, PostgreSQL Global Development Group
1010 * ALL RIGHTS RESERVED;
1111 *
12- * levenshtein()
13- * -------------
14- * Written based on a description of the algorithm by Michael Gilleland
15- * found at http://www.merriampark.com/ld.htm
16- * Also looked at levenshtein.c in the PHP 4.0.6 distribution for
17- * inspiration.
18- * Configurable penalty costs extension is introduced by Volkan
19- * YAZICI <volkan.yazici@gmail.com>.
20- *
2112 * metaphone()
2213 * -----------
2314 * Modified for PostgreSQL by Joe Conway.
@@ -61,6 +52,8 @@ PG_MODULE_MAGIC;
6152 */
6253extern Datum levenshtein_with_costs (PG_FUNCTION_ARGS );
6354extern Datum levenshtein (PG_FUNCTION_ARGS );
55+ extern Datum levenshtein_less_equal_with_costs (PG_FUNCTION_ARGS );
56+ extern Datum levenshtein_less_equal (PG_FUNCTION_ARGS );
6457extern Datum metaphone (PG_FUNCTION_ARGS );
6558extern Datum soundex (PG_FUNCTION_ARGS );
6659extern Datum difference (PG_FUNCTION_ARGS );
@@ -85,16 +78,6 @@ soundex_code(char letter)
8578 return letter ;
8679}
8780
88-
89- /*
90- * Levenshtein
91- */
92- #define MAX_LEVENSHTEIN_STRLEN 255
93-
94- static int levenshtein_internal (text * s , text * t ,
95- int ins_c , int del_c , int sub_c );
96-
97-
9881/*
9982 * Metaphone
10083 */
@@ -197,224 +180,59 @@ rest_of_char_same(const char *s1, const char *s2, int len)
197180 return true;
198181}
199182
200- /*
201- * levenshtein_internal - Calculates Levenshtein distance metric
202- * between supplied strings. Generally
203- * (1, 1, 1) penalty costs suffices common
204- * cases, but your mileage may vary.
205- */
206- static int
207- levenshtein_internal (text * s , text * t ,
208- int ins_c , int del_c , int sub_c )
209- {
210- int m ,
211- n ,
212- s_bytes ,
213- t_bytes ;
214- int * prev ;
215- int * curr ;
216- int * s_char_len = NULL ;
217- int i ,
218- j ;
219- const char * s_data ;
220- const char * t_data ;
221- const char * y ;
222-
223- /* Extract a pointer to the actual character data. */
224- s_data = VARDATA_ANY (s );
225- t_data = VARDATA_ANY (t );
226-
227- /* Determine length of each string in bytes and characters. */
228- s_bytes = VARSIZE_ANY_EXHDR (s );
229- t_bytes = VARSIZE_ANY_EXHDR (t );
230- m = pg_mbstrlen_with_len (s_data , s_bytes );
231- n = pg_mbstrlen_with_len (t_data , t_bytes );
232-
233- /*
234- * We can transform an empty s into t with n insertions, or a non-empty t
235- * into an empty s with m deletions.
236- */
237- if (!m )
238- return n * ins_c ;
239- if (!n )
240- return m * del_c ;
241-
242- /*
243- * For security concerns, restrict excessive CPU+RAM usage. (This
244- * implementation uses O(m) memory and has O(mn) complexity.)
245- */
246- if (m > MAX_LEVENSHTEIN_STRLEN ||
247- n > MAX_LEVENSHTEIN_STRLEN )
248- ereport (ERROR ,
249- (errcode (ERRCODE_INVALID_PARAMETER_VALUE ),
250- errmsg ("argument exceeds the maximum length of %d bytes" ,
251- MAX_LEVENSHTEIN_STRLEN )));
252-
253- /*
254- * In order to avoid calling pg_mblen() repeatedly on each character in s,
255- * we cache all the lengths before starting the main loop -- but if all the
256- * characters in both strings are single byte, then we skip this and use
257- * a fast-path in the main loop. If only one string contains multi-byte
258- * characters, we still build the array, so that the fast-path needn't
259- * deal with the case where the array hasn't been initialized.
260- */
261- if (m != s_bytes || n != t_bytes )
262- {
263- int i ;
264- const char * cp = s_data ;
265-
266- s_char_len = (int * ) palloc ((m + 1 ) * sizeof (int ));
267- for (i = 0 ; i < m ; ++ i )
268- {
269- s_char_len [i ] = pg_mblen (cp );
270- cp += s_char_len [i ];
271- }
272- s_char_len [i ] = 0 ;
273- }
274-
275- /* One more cell for initialization column and row. */
276- ++ m ;
277- ++ n ;
278-
279- /*
280- * One way to compute Levenshtein distance is to incrementally construct
281- * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
282- * of operations required to transform the first i characters of s into
283- * the first j characters of t. The last column of the final row is the
284- * answer.
285- *
286- * We use that algorithm here with some modification. In lieu of holding
287- * the entire array in memory at once, we'll just use two arrays of size
288- * m+1 for storing accumulated values. At each step one array represents
289- * the "previous" row and one is the "current" row of the notional large
290- * array.
291- */
292- prev = (int * ) palloc (2 * m * sizeof (int ));
293- curr = prev + m ;
294-
295- /*
296- * To transform the first i characters of s into the first 0 characters
297- * of t, we must perform i deletions.
298- */
299- for (i = 0 ; i < m ; i ++ )
300- prev [i ] = i * del_c ;
301-
302- /* Loop through rows of the notional array */
303- for (y = t_data , j = 1 ; j < n ; j ++ )
304- {
305- int * temp ;
306- const char * x = s_data ;
307- int y_char_len = n != t_bytes + 1 ? pg_mblen (y ) : 1 ;
308-
309- /*
310- * To transform the first 0 characters of s into the first j
311- * characters of t, we must perform j insertions.
312- */
313- curr [0 ] = j * ins_c ;
314-
315- /*
316- * This inner loop is critical to performance, so we include a
317- * fast-path to handle the (fairly common) case where no multibyte
318- * characters are in the mix. The fast-path is entitled to assume
319- * that if s_char_len is not initialized then BOTH strings contain
320- * only single-byte characters.
321- */
322- if (s_char_len != NULL )
323- {
324- for (i = 1 ; i < m ; i ++ )
325- {
326- int ins ;
327- int del ;
328- int sub ;
329- int x_char_len = s_char_len [i - 1 ];
330-
331- /*
332- * Calculate costs for insertion, deletion, and substitution.
333- *
334- * When calculating cost for substitution, we compare the last
335- * character of each possibly-multibyte character first,
336- * because that's enough to rule out most mis-matches. If we
337- * get past that test, then we compare the lengths and the
338- * remaining bytes.
339- */
340- ins = prev [i ] + ins_c ;
341- del = curr [i - 1 ] + del_c ;
342- if (x [x_char_len - 1 ] == y [y_char_len - 1 ]
343- && x_char_len == y_char_len &&
344- (x_char_len == 1 || rest_of_char_same (x , y , x_char_len )))
345- sub = prev [i - 1 ];
346- else
347- sub = prev [i - 1 ] + sub_c ;
348-
349- /* Take the one with minimum cost. */
350- curr [i ] = Min (ins , del );
351- curr [i ] = Min (curr [i ], sub );
352-
353- /* Point to next character. */
354- x += x_char_len ;
355- }
356- }
357- else
358- {
359- for (i = 1 ; i < m ; i ++ )
360- {
361- int ins ;
362- int del ;
363- int sub ;
183+ #include "levenshtein.c"
184+ #define LEVENSHTEIN_LESS_EQUAL
185+ #include "levenshtein.c"
364186
365- /* Calculate costs for insertion, deletion, and substitution. */
366- ins = prev [i ] + ins_c ;
367- del = curr [i - 1 ] + del_c ;
368- sub = prev [i - 1 ] + ((* x == * y ) ? 0 : sub_c );
369-
370- /* Take the one with minimum cost. */
371- curr [i ] = Min (ins , del );
372- curr [i ] = Min (curr [i ], sub );
187+ PG_FUNCTION_INFO_V1 (levenshtein_with_costs );
188+ Datum
189+ levenshtein_with_costs (PG_FUNCTION_ARGS )
190+ {
191+ text * src = PG_GETARG_TEXT_PP (0 );
192+ text * dst = PG_GETARG_TEXT_PP (1 );
193+ int ins_c = PG_GETARG_INT32 (2 );
194+ int del_c = PG_GETARG_INT32 (3 );
195+ int sub_c = PG_GETARG_INT32 (4 );
373196
374- /* Point to next character. */
375- x ++ ;
376- }
377- }
197+ PG_RETURN_INT32 (levenshtein_internal (src , dst , ins_c , del_c , sub_c ));
198+ }
378199
379- /* Swap current row with previous row. */
380- temp = curr ;
381- curr = prev ;
382- prev = temp ;
383200
384- /* Point to next character. */
385- y += y_char_len ;
386- }
201+ PG_FUNCTION_INFO_V1 (levenshtein );
202+ Datum
203+ levenshtein (PG_FUNCTION_ARGS )
204+ {
205+ text * src = PG_GETARG_TEXT_PP (0 );
206+ text * dst = PG_GETARG_TEXT_PP (1 );
387207
388- /*
389- * Because the final value was swapped from the previous row to the
390- * current row, that's where we'll find it.
391- */
392- return prev [m - 1 ];
208+ PG_RETURN_INT32 (levenshtein_internal (src , dst , 1 , 1 , 1 ));
393209}
394210
395211
396- PG_FUNCTION_INFO_V1 (levenshtein_with_costs );
212+ PG_FUNCTION_INFO_V1 (levenshtein_less_equal_with_costs );
397213Datum
398- levenshtein_with_costs (PG_FUNCTION_ARGS )
214+ levenshtein_less_equal_with_costs (PG_FUNCTION_ARGS )
399215{
400216 text * src = PG_GETARG_TEXT_PP (0 );
401217 text * dst = PG_GETARG_TEXT_PP (1 );
402218 int ins_c = PG_GETARG_INT32 (2 );
403219 int del_c = PG_GETARG_INT32 (3 );
404220 int sub_c = PG_GETARG_INT32 (4 );
221+ int max_d = PG_GETARG_INT32 (5 );
405222
406- PG_RETURN_INT32 (levenshtein_internal (src , dst , ins_c , del_c , sub_c ));
223+ PG_RETURN_INT32 (levenshtein_less_equal_internal (src , dst , ins_c , del_c , sub_c , max_d ));
407224}
408225
409226
410- PG_FUNCTION_INFO_V1 (levenshtein );
227+ PG_FUNCTION_INFO_V1 (levenshtein_less_equal );
411228Datum
412- levenshtein (PG_FUNCTION_ARGS )
229+ levenshtein_less_equal (PG_FUNCTION_ARGS )
413230{
414231 text * src = PG_GETARG_TEXT_PP (0 );
415232 text * dst = PG_GETARG_TEXT_PP (1 );
233+ int max_d = PG_GETARG_INT32 (2 );
416234
417- PG_RETURN_INT32 (levenshtein_internal (src , dst , 1 , 1 , 1 ));
235+ PG_RETURN_INT32 (levenshtein_less_equal_internal (src , dst , 1 , 1 , 1 , max_d ));
418236}
419237
420238
0 commit comments