Skip to content
This repository was archived by the owner on Apr 20, 2023. It is now read-only.

Commit 716fbcc

Browse files
author
boddumanohar
committed
new design
1 parent 61467aa commit 716fbcc

15 files changed

+3126
-1372
lines changed

Makefile

+1-4
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,15 @@ PLATFORM_OS := $(shell uname | cut -d_ -f1)
77

88
srcdir = .
99
prefix = /usr/local
10-
1110
exec_prefix = $(prefix)
1211
sysconfdir = $(prefix)/etc
1312
includedir = $(prefix)/include
1413
datarootdir = $(prefix)/share
1514
localstatedir = $(prefix)/var
16-
1715
bindir = $(exec_prefix)/bin
1816
libdir = $(exec_prefix)/lib
1917
libexecdir = $(exec_prefix)/libexec
20-
sbindir = $(exec_prefix)/sbin
21-
18+
# Directory for system-administration executables. No trailing ':' -- the
# value is used verbatim as an install path, like the sibling *dir variables.
sbindir = $(exec_prefix)/sbin
2219
datadir = $(datarootdir)
2320
docdir = $(datarootdir)/doc/pev
2421
infodir = $(datarootdir)/info

edit_dist.c

+266
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,266 @@
1+
/*
2+
This edit distance code is taken from trn3.6. A few minor
3+
modifications have been made by Andrew Tridgell <tridge@samba.org>
4+
for use in spamsum.
5+
*/
6+
7+
8+
/***************************************************************************/
9+
10+
11+
/* The authors make no claims as to the fitness or correctness of this software
12+
* for any use whatsoever, and it is provided as is. Any use of this software
13+
* is at the user's own risk.
14+
*/
15+
16+
#include <stdio.h>
17+
#include <stdlib.h>
18+
19+
/* edit_dist -- returns the minimum edit distance between two strings
20+
21+
Program by: Mark Maimone CMU Computer Science 13 Nov 89
22+
Last Modified: 28 Jan 90
23+
24+
If the input strings have length n and m, the algorithm runs in time
25+
O(nm) and space O(min(m,n)).
26+
27+
HISTORY
28+
13 Nov 89 (mwm) Created edit_dist() and set_costs().
29+
30+
28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD
31+
computations will work even when THRESHOLD is not a multiple of
32+
sizeof(int).
33+
34+
17 May 93 (mwm) Improved performance when used with trn's newsgroup
35+
processing; assume all costs are 1, and you can terminate when a
36+
threshold is exceeded.
37+
*/
38+
39+
#define MIN_DIST 100            /* with TRN_SPEEDUP: give up once every entry
                                   of a row exceeds this distance */

#define TRN_SPEEDUP             /* Use a less-general version of the routine:
                                   the edit costs are compile-time constants
                                   (insert 1, delete 1, change 3, swap 5) and
                                   it's okay to terminate early if the edit
                                   distance is known to exceed MIN_DIST */

#define THRESHOLD 4000          /* worry about allocating more memory only
                                   when this # of bytes is exceeded */

/* Longest `from` string whose 2 * len + 3 working cells fit in THRESHOLD bytes */
#define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))

#define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))

#define swap_int(x,y)  do { int _iswap = (x); (x) = (y); (y) = _iswap; } while (0)
#define swap_char(x,y) do { const char *_cswap = (x); (x) = (y); (y) = _cswap; } while (0)

/* Smallest of three ints. */
static inline int min3(int x, int y, int z)
{
    return x < y ? (x < z ? x : z) : (z < y) ? z : y;
}

/* Smaller of two ints. */
static inline int min2(int x, int y)
{
    return x < y ? x : y;
}

/* Costs used for the trivial (empty string) cases and, when TRN_SPEEDUP is
   not defined, for the whole computation. */
static int insert_cost = 1;
static int delete_cost = 1;
#ifndef TRN_SPEEDUP
static int change_cost = 1;
static int swap_cost = 1;
#endif

/* edit_distn -- returns the edit distance between two strings, or -1 on
   failure (the working buffer could not be allocated).

   from, from_len   first string and its length (need not be NUL-terminated)
   to, to_len       second string and its length

   A NULL pointer or a length <= 0 is treated as the empty string.

   With TRN_SPEEDUP the computation stops after the first row whose smallest
   entry exceeds MIN_DIST; the value returned is then the last entry of that
   row rather than the exact distance. */

int
edit_distn(const char *from, int from_len, const char *to, int to_len)
{
#ifndef TRN_SPEEDUP
    int ins, del, ch;           /* local copies of edit costs */
#endif
    int row, col, index;        /* dynamic programming counters */
    int radix;                  /* radix for modular indexing */
#ifdef TRN_SPEEDUP
    int low;                    /* smallest entry of the current row */
#endif
    int *buffer;                /* pointer to storage for the rows of the
                                   d.p. array that are kept around */
    int store[THRESHOLD / sizeof (int)];
                                /* a small amount of on-stack storage, to be
                                   used when the input strings are small
                                   enough */
    int result;

    /* Handle trivial cases when one string is empty */

    if (from == NULL || from_len <= 0) {
        if (to == NULL || to_len <= 0)
            return 0;
        return to_len * insert_cost;
    }
    if (to == NULL || to_len <= 0)
        return from_len * delete_cost;

    /* Initialize costs */

#ifdef TRN_SPEEDUP
#define ins 1
#define del 1
#define ch 3
#define swap_cost 5
#else
    ins = insert_cost;
    del = delete_cost;
    ch = change_cost;
#endif

    /* Make from short enough to fit in the on-stack storage, if it's at all
       possible */

    if (from_len > to_len && from_len > STRLENTHRESHOLD) {
        swap_int(from_len, to_len);
        swap_char(from, to);
#ifndef TRN_SPEEDUP
        swap_int(ins, del);
#endif
    }

    /* The radix must be derived from the *final* from_len: every modular
       index below assumes radix == 2 * from_len + 3.  Computing it before
       the swap left it too large, which broke the indexing and let index
       run past the end of store. */

    radix = 2 * from_len + 3;

    /* Allocate the array storage (from the heap if necessary) */

    if (from_len <= STRLENTHRESHOLD) {
        buffer = store;
    } else {
        buffer = malloc((size_t) radix * sizeof (int));
        if (buffer == NULL)
            return -1;
    }

    /* Here's where the fun begins.  We will find the minimum edit distance
       using dynamic programming.  We only need to store two rows of the
       matrix at a time, since we always progress down the matrix.  For
       example, given the strings "one" and "two", and insert, delete and
       change costs equal to 1:

              _  o  n  e
           _  0  1  2  3
           t  1  1  2  3
           w  2  2  2  3
           o  3  2  3  3

       The dynamic programming recursion is defined as follows:

           ar(x,0) := x * insert_cost
           ar(0,y) := y * delete_cost
           ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
                          a(x - 1, y) + insert_cost,
                          a(x, y - 1) + delete_cost,
                          a(x - 2, y - 2) + (from[x] == to[y-1] &&
                                             from[x-1] == to[y] ? swap_cost :
                                             infinity))

       Since this only looks at most two rows and three columns back, we need
       only store the values for the two preceding rows.  In this
       implementation, we do not explicitly store the zero column, so only
       2 * from_len + 2 words are needed.  However, in the implementation of
       the swap_cost check, the current matrix value is used as a buffer; we
       can't overwrite the earlier value until the swap_cost check has been
       performed.  So we use 2 * from_len + 3 elements in the buffer.
    */

#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \
        buffer[mod(index)]))
#define NW(x,y)   ar(x, y, index + from_len + 2)
#define N(x,y)    ar(x, y, index + from_len + 3)
#define W(x,y)    ar(x, y, index + radix - 1)
#define NNWW(x,y) ar(x, y, index + 1)
#define mod(x)    ((x) % radix)

    index = 0;

#ifdef DEBUG_EDITDIST
    printf(" ");
    for (col = 0; col < from_len; col++)
        printf(" %c ", from[col]);
    printf("\n ");

    for (col = 0; col <= from_len; col++)
        printf("%2d ", col * del);
#endif

    /* Row 0 is handled implicitly; its value at a given column is col*del.
       The loop below computes the values for Row 1.  At this point we know
       the strings are nonempty.  We also don't need to consider swap costs
       in row 1.

       COMMENT: the indices row and col below point into the STRING, so
       the corresponding MATRIX indices are row+1 and col+1.
    */

    buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch));
#ifdef TRN_SPEEDUP
    low = buffer[mod(index + radix - 1)];
#endif

#ifdef DEBUG_EDITDIST
    printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);
#endif

    for (col = 1; col < from_len; col++) {
        buffer[index] = min3(
                col * del + ((from[col] == to[0]) ? 0 : ch),
                (col + 1) * del + ins,
                buffer[index - 1] + del);
#ifdef TRN_SPEEDUP
        if (buffer[index] < low)
            low = buffer[index];
#endif
        index++;

#ifdef DEBUG_EDITDIST
        printf("%2d ", buffer[index - 1]);
#endif

    } /* for col = 1 */

#ifdef DEBUG_EDITDIST
    /* to[1] only exists when there is a second row to print */
    if (to_len > 1)
        printf("\n %c %2d ", to[1], 2 * ins);
    else
        printf("\n");
#endif

    /* Now handle the rest of the matrix */

    for (row = 1; row < to_len; row++) {
        for (col = 0; col < from_len; col++) {
            buffer[index] = min3(
                    NW(row, col) + ((from[col] == to[row]) ? 0 : ch),
                    N(row, col + 1) + ins,
                    W(row + 1, col) + del);
            if (from[col] == to[row - 1] && col > 0 &&
                    from[col - 1] == to[row])
                buffer[index] = min2(buffer[index],
                        NNWW(row - 1, col - 1) + swap_cost);

#ifdef DEBUG_EDITDIST
            printf("%2d ", buffer[index]);
#endif
#ifdef TRN_SPEEDUP
            /* col == 0 restarts the running minimum for this row */
            if (buffer[index] < low || col == 0)
                low = buffer[index];
#endif

            index = mod(index + 1);
        } /* for col = 0 */
#ifdef DEBUG_EDITDIST
        if (row < to_len - 1)
            printf("\n %c %2d ", to[row+1], (row + 2) * ins);
        else
            printf("\n");
#endif
#ifdef TRN_SPEEDUP
        /* every entry of this row already exceeds the cut-off */
        if (low > MIN_DIST)
            break;
#endif
    } /* for row = 1 */

    /* The most recently written cell is the answer */
    result = buffer[mod(index + radix - 1)];
    if (buffer != store)
        free(buffer);
    return result;

    /* Keep the function-local helper macros from leaking into later code */
#undef ar
#undef NW
#undef N
#undef W
#undef NNWW
#undef mod
#ifdef TRN_SPEEDUP
#undef ins
#undef del
#undef ch
#undef swap_cost
#endif
} /* edit_distn */
265+
266+

0 commit comments

Comments
 (0)