|
| 1 | +/* |
| 2 | + This edit distance code is taken from trn3.6. A few minor |
| 3 | + modifications have been made by Andrew Tridgell <tridge@samba.org> |
| 4 | + for use in spamsum. |
| 5 | +*/ |
| 6 | + |
| 7 | + |
| 8 | +/***************************************************************************/ |
| 9 | + |
| 10 | + |
| 11 | +/* The authors make no claims as to the fitness or correctness of this software |
| 12 | + * for any use whatsoever, and it is provided as is. Any use of this software |
| 13 | + * is at the user's own risk. |
| 14 | + */ |
| 15 | + |
| 16 | +#include <stdio.h> |
| 17 | +#include <stdlib.h> |
| 18 | + |
| 19 | +/* edit_dist -- returns the minimum edit distance between two strings |
| 20 | +
|
| 21 | + Program by: Mark Maimone CMU Computer Science 13 Nov 89 |
| 22 | + Last Modified: 28 Jan 90 |
| 23 | +
|
| 24 | + If the input strings have length n and m, the algorithm runs in time |
| 25 | + O(nm) and space O(min(m,n)). |
| 26 | +
|
| 27 | +HISTORY |
| 28 | + 13 Nov 89 (mwm) Created edit_dist() and set_costs(). |
| 29 | +
|
| 30 | + 28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD |
| 31 | + computations will work even when THRESHOLD is not a multiple of |
| 32 | + sizeof(int). |
| 33 | +
|
| 34 | + 17 May 93 (mwm) Improved performance when used with trn's newsgroup |
| 35 | + processing; assume all costs are 1, and you can terminate when a |
| 36 | + threshold is exceeded. |
| 37 | +*/ |
| 38 | + |
/* Row-minimum bound used by edit_distn() for early termination when
   TRN_SPEEDUP is defined. */
#define MIN_DIST 100

/* Select the less-general variant of the routine, the one that is better
   for trn: change costs are fixed, and it is okay to stop once the edit
   distance is known to exceed MIN_DIST. */
#define TRN_SPEEDUP

/* Only worry about allocating more memory when this number of bytes is
   exceeded. */
#define THRESHOLD 4000

/* Longest "from" string whose 2 * len + 3 row-buffer words still fit in
   THRESHOLD bytes. */
#define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))

/* Store y through x when x is a non-null pointer; the expression yields y
   either way. */
#define SAFE_ASSIGN(x,y) ((x) ? (*(x) = (y)) : (y))

/* Exchange two int lvalues. */
#define swap_int(x,y) \
    do { int swap_tmp_i = (x); (x) = (y); (y) = swap_tmp_i; } while (0)

/* Exchange two const char * lvalues. */
#define swap_char(x,y) \
    do { const char *swap_tmp_s = (x); (x) = (y); (y) = swap_tmp_s; } while (0)
| 55 | + |
/* Return the smallest of three ints. */
static inline int min3(int x, int y, int z)
{
    int best = (y < x) ? y : x;

    return (z < best) ? z : best;
}
/* Return the smaller of two ints. */
static inline int min2(int x, int y)
{
    if (y <= x)
        return y;
    return x;
}
| 63 | + |
/* Per-operation edit costs.  The change and swap costs only exist as
   variables when the general (non-TRN_SPEEDUP) routine is compiled. */
static int insert_cost = 1, delete_cost = 1;
#if !defined(TRN_SPEEDUP)
static int change_cost = 1, swap_cost = 1;
#endif
| 70 | + |
| 71 | +/* edit_distn -- returns the edit distance between two strings, or -1 on |
| 72 | + failure */ |
| 73 | + |
| 74 | +int |
| 75 | +edit_distn(const char *from, int from_len, const char *to, int to_len) |
| 76 | +{ |
| 77 | +#ifndef TRN_SPEEDUP |
| 78 | + register int ins, del, ch; /* local copies of edit costs */ |
| 79 | +#endif |
| 80 | + register int row, col, index; /* dynamic programming counters */ |
| 81 | + register int radix; /* radix for modular indexing */ |
| 82 | +#ifdef TRN_SPEEDUP |
| 83 | + register int low; |
| 84 | +#endif |
| 85 | + int *buffer; /* pointer to storage for one row |
| 86 | + of the d.p. array */ |
| 87 | + int store[THRESHOLD / sizeof (int)]; |
| 88 | + /* a small amount of static |
| 89 | + storage, to be used when the |
| 90 | + input strings are small enough */ |
| 91 | + |
| 92 | +/* Handle trivial cases when one string is empty */ |
| 93 | + |
| 94 | + if (from == NULL || !from_len) |
| 95 | + if (to == NULL || !to_len) |
| 96 | + return 0; |
| 97 | + else |
| 98 | + return to_len * insert_cost; |
| 99 | + else if (to == NULL || !to_len) |
| 100 | + return from_len * delete_cost; |
| 101 | + |
| 102 | +/* Initialize registers */ |
| 103 | + |
| 104 | + radix = 2 * from_len + 3; |
| 105 | +#ifdef TRN_SPEEDUP |
| 106 | +#define ins 1 |
| 107 | +#define del 1 |
| 108 | +#define ch 3 |
| 109 | +#define swap_cost 5 |
| 110 | +#else |
| 111 | + ins = insert_cost; |
| 112 | + del = delete_cost; |
| 113 | + ch = change_cost; |
| 114 | +#endif |
| 115 | + |
| 116 | +/* Make from short enough to fit in the static storage, if it's at all |
| 117 | + possible */ |
| 118 | + |
| 119 | + if (from_len > to_len && from_len > STRLENTHRESHOLD) { |
| 120 | + swap_int(from_len, to_len); |
| 121 | + swap_char(from, to); |
| 122 | +#ifndef TRN_SPEEDUP |
| 123 | + swap_int(ins, del); |
| 124 | +#endif |
| 125 | + } /* if from_len > to_len */ |
| 126 | + |
| 127 | +/* Allocate the array storage (from the heap if necessary) */ |
| 128 | + |
| 129 | + if (from_len <= STRLENTHRESHOLD) |
| 130 | + buffer = store; |
| 131 | + else |
| 132 | + buffer = (int *) malloc(radix * sizeof (int)); |
| 133 | + |
| 134 | +/* Here's where the fun begins. We will find the minimum edit distance |
| 135 | + using dynamic programming. We only need to store two rows of the matrix |
| 136 | + at a time, since we always progress down the matrix. For example, |
| 137 | + given the strings "one" and "two", and insert, delete and change costs |
| 138 | + equal to 1: |
| 139 | +
|
| 140 | + _ o n e |
| 141 | + _ 0 1 2 3 |
| 142 | + t 1 1 2 3 |
| 143 | + w 2 2 2 3 |
| 144 | + o 3 2 3 3 |
| 145 | +
|
| 146 | + The dynamic programming recursion is defined as follows: |
| 147 | +
|
| 148 | + ar(x,0) := x * insert_cost |
| 149 | + ar(0,y) := y * delete_cost |
| 150 | + ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change), |
| 151 | + a(x - 1, y) + insert_cost, |
| 152 | + a(x, y - 1) + delete_cost, |
| 153 | + a(x - 2, y - 2) + (from[x] == to[y-1] && |
| 154 | + from[x-1] == to[y] ? swap_cost : |
| 155 | + infinity)) |
| 156 | +
|
| 157 | + Since this only looks at most two rows and three columns back, we need |
| 158 | + only store the values for the two preceeding rows. In this |
| 159 | + implementation, we do not explicitly store the zero column, so only 2 * |
| 160 | + from_len + 2 words are needed. However, in the implementation of the |
| 161 | + swap_cost check, the current matrix value is used as a buffer; we |
| 162 | + can't overwrite the earlier value until the swap_cost check has |
| 163 | + been performed. So we use 2 * from_len + 3 elements in the buffer. |
| 164 | +*/ |
| 165 | + |
| 166 | +#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \ |
| 167 | + buffer[mod(index)])) |
| 168 | +#define NW(x,y) ar(x, y, index + from_len + 2) |
| 169 | +#define N(x,y) ar(x, y, index + from_len + 3) |
| 170 | +#define W(x,y) ar(x, y, index + radix - 1) |
| 171 | +#define NNWW(x,y) ar(x, y, index + 1) |
| 172 | +#define mod(x) ((x) % radix) |
| 173 | + |
| 174 | + index = 0; |
| 175 | + |
| 176 | +#ifdef DEBUG_EDITDIST |
| 177 | + printf(" "); |
| 178 | + for (col = 0; col < from_len; col++) |
| 179 | + printf(" %c ", from[col]); |
| 180 | + printf("\n "); |
| 181 | + |
| 182 | + for (col = 0; col <= from_len; col++) |
| 183 | + printf("%2d ", col * del); |
| 184 | +#endif |
| 185 | + |
| 186 | +/* Row 0 is handled implicitly; its value at a given column is col*del. |
| 187 | + The loop below computes the values for Row 1. At this point we know the |
| 188 | + strings are nonempty. We also don't need to consider swap costs in row |
| 189 | + 1. |
| 190 | +
|
| 191 | + COMMENT: the indicies row and col below point into the STRING, so |
| 192 | + the corresponding MATRIX indicies are row+1 and col+1. |
| 193 | +*/ |
| 194 | + |
| 195 | + buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch)); |
| 196 | +#ifdef TRN_SPEEDUP |
| 197 | + low = buffer[mod(index + radix - 1)]; |
| 198 | +#endif |
| 199 | + |
| 200 | +#ifdef DEBUG_EDITDIST |
| 201 | + printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]); |
| 202 | +#endif |
| 203 | + |
| 204 | + for (col = 1; col < from_len; col++) { |
| 205 | + buffer[index] = min3( |
| 206 | + col * del + ((from[col] == to[0]) ? 0 : ch), |
| 207 | + (col + 1) * del + ins, |
| 208 | + buffer[index - 1] + del); |
| 209 | +#ifdef TRN_SPEEDUP |
| 210 | + if (buffer[index] < low) |
| 211 | + low = buffer[index]; |
| 212 | +#endif |
| 213 | + index++; |
| 214 | + |
| 215 | +#ifdef DEBUG_EDITDIST |
| 216 | + printf("%2d ", buffer[index - 1]); |
| 217 | +#endif |
| 218 | + |
| 219 | + } /* for col = 1 */ |
| 220 | + |
| 221 | +#ifdef DEBUG_EDITDIST |
| 222 | + printf("\n %c %2d ", to[1], 2 * ins); |
| 223 | +#endif |
| 224 | + |
| 225 | +/* Now handle the rest of the matrix */ |
| 226 | + |
| 227 | + for (row = 1; row < to_len; row++) { |
| 228 | + for (col = 0; col < from_len; col++) { |
| 229 | + buffer[index] = min3( |
| 230 | + NW(row, col) + ((from[col] == to[row]) ? 0 : ch), |
| 231 | + N(row, col + 1) + ins, |
| 232 | + W(row + 1, col) + del); |
| 233 | + if (from[col] == to[row - 1] && col > 0 && |
| 234 | + from[col - 1] == to[row]) |
| 235 | + buffer[index] = min2(buffer[index], |
| 236 | + NNWW(row - 1, col - 1) + swap_cost); |
| 237 | + |
| 238 | +#ifdef DEBUG_EDITDIST |
| 239 | + printf("%2d ", buffer[index]); |
| 240 | +#endif |
| 241 | +#ifdef TRN_SPEEDUP |
| 242 | + if (buffer[index] < low || col == 0) |
| 243 | + low = buffer[index]; |
| 244 | +#endif |
| 245 | + |
| 246 | + index = mod(index + 1); |
| 247 | + } /* for col = 1 */ |
| 248 | +#ifdef DEBUG_EDITDIST |
| 249 | + if (row < to_len - 1) |
| 250 | + printf("\n %c %2d ", to[row+1], (row + 2) * ins); |
| 251 | + else |
| 252 | + printf("\n"); |
| 253 | +#endif |
| 254 | +#ifdef TRN_SPEEDUP |
| 255 | + if (low > MIN_DIST) |
| 256 | + break; |
| 257 | +#endif |
| 258 | + } /* for row = 1 */ |
| 259 | + |
| 260 | + row = buffer[mod(index + radix - 1)]; |
| 261 | + if (buffer != store) |
| 262 | + free((char *) buffer); |
| 263 | + return row; |
| 264 | +} /* edit_distn */ |
| 265 | + |
| 266 | + |
0 commit comments