/* * aegis - project change supervisor * Copyright (C) 1991, 1993, 1994 Peter Miller. * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA. * * MANIFEST: functions to make fuzzy comparisons between strings * * This code is based on the heart of a file comparison program * written by David I. Bell, and used by kind permission. * This notice must be retained in all copies and derivatives. * Contact the author of aegis for a copy of the file comparison program. * * This code is based on the algorithm in: * An O(ND) Difference Algorithm and Its Variations * Eugene W. Myers * (TR 85-6, April 10, 1985) * Department of Computer Science * The University of Arizona * Tuscon, Arizona 85721 * * Also see: * A File Comparison Program * Webb Miller and Eugene W. Myers * Software Practice and Experience * (Volume 15, No. 11, November 1985) */ #include #include #include #include #include typedef struct snake_t snake_t; struct snake_t { long line1; long line2; long count; snake_t *next; }; static long tablesize; /* needed table size */ static long tablesize_max; /* allocated table size */ static long *V1; /* the row containing the last d */ static long *V1_table; static long *V2; /* another row */ static long *V2_table; static snake_t *nextsnake; /* next allocable snake structure */ static snake_t *snake_table; /* allocable snake structures */ typedef struct file file; struct file { char *f_lines; long f_linecount; }; typedef struct fc_t fc_t; struct fc_t { file fileA; file fileB; long maxlines; long minlines; long inserts; long deletes; long matches; }; static fc_t fc; /* * Routine to find the middle snake of an optimial D-path spanning * lines A to A+N in file A to lines B to B+N in file B. Returns the * length D of the D-path as a return value, and the upper left and * lower right relative coordinates of a snake midway through the D-path. */ static long midsnake _((int depth, long A, long N, long B, long M, long *ulx, long *uly, long *lrx, long *lry)); static long midsnake(depth, A, N, B, M, ulx, uly, lrx, lry) int depth; long A; long N; long B; long M; long *ulx; long *uly; long *lrx; long *lry; { long x; long y; long k; long oldx; char *lp1; char *lp2; long DELTA; long odd; long MAXD; long changes; long D; trace(("midsnake(depth = %d, A = %ld, N = %ld, B = %ld, M = %ld)\n{\n" /*}*/, depth, A, N, B, M)); trace(("searching: %ld,%ld to %ld,%ld\n", A, B, A + N, B + M)); DELTA = N - M; odd = DELTA & 1; MAXD = (M + N + 1) / 2; V1[1] = 0; V2[-1] = 0; changes = -odd - 2; /* * This is the main loop for searching for the snake. * D is the distance off the diagonals, and is the number * of changes needed to get from the upper left to the * lower right corner of the region. */ for (D = 0; D <= MAXD; D++) { changes += 2; /* * Examine all diagonals within current distance. * First search from upper left to lower right, * and then search from lower right to upper left. */ for (k = -D; k <= D; k += 2) { /* * Find the end of the furthest forward D-path * in diagonal k. */ if (k == -D || (k != D && (V1[k-1] < V1[k+1]))) x = V1[k+1]; else x = V1[k-1] + 1; y = x - k; lp1 = &fc.fileA.f_lines[A+x]; lp2 = &fc.fileB.f_lines[B+y]; oldx = x; while (x < N && y < M && *lp1 == *lp2) { x++; y++; lp1++; lp2++; } V1[k] = x; /* * See if path overlaps furthest reverse D-path. * If so, then we have found the snake. */ if (odd && (k >= (DELTA - (D-1))) && (k <= (DELTA + (D-1)))) { if ((x + V2[k-DELTA]) >= N) { *ulx = oldx; *uly = oldx - k; *lrx = x; *lry = y; trace(("midsnake: %ld,%ld to %ld,%ld (odd)\n", *ulx, *uly, *lrx, *lry)); trace(("return %ld;\n", changes)); trace((/*{*/"}\n")); return changes; } } } for (k = -D; k <= D; k += 2) { /* * Find the end of the furthest reaching reverse * path in diagonal k+DELTA. */ if (k == D || (k != -D && (V2[k+1] < V2[k-1]))) x = V2[k-1]; else x = V2[k+1] + 1; y = x + k; lp1 = &fc.fileA.f_lines[A+N-x-1]; lp2 = &fc.fileB.f_lines[B+M-y-1]; oldx = x; while (x < N && y < M && *lp1 == *lp2) { x++; y++; lp1--; lp2--; } V2[k] = x; /* * See if path overlaps furthest forward D-path. * If so, then we have found the snake. */ if (!odd && (k <= D-DELTA) && (k >= -D-DELTA)) { if ((x + V1[k+DELTA]) >= N) { *ulx = N - x; *uly = M - y; *lrx = N - oldx; *lry = *lrx + *uly - *ulx; trace(("midsnake: %ld,%ld to %ld,%ld (even)\n", *ulx, *uly, *lrx, *lry)); trace(("return %ld;\n", changes)); trace((/*{*/"}\n")); return changes; } } } } /* * Middle snake procedure failed! */ assert(0); return 0; } /* * Recursive routine to find a minimal D-path through the edit graph * of the two input files. Arguments are the beginning line numbers in * the files, and the number of lines to examine. This is basically a * divide-and-conquer routine which finds the middle snake of an optimal * D-path, then calls itself to find the remainder of the path before the * snake and after the snake. */ static void findsnake _((int depth, long A, long N, long B, long M)); static void findsnake(depth, A, N, B, M) int depth; long A; long N; long B; long M; { snake_t *sp; long ulx; long uly; long lrx; long lry; long D; long count; trace(("findsnake(depth = %d, A = %ld, N = %ld, B = %ld, M = %ld)\n{\n" /*}*/, depth, A, N, B, M)); /* * If more than one change needed, then call ourself for each part. */ D = midsnake(depth, A, N, B, M, &ulx, &uly, &lrx, &lry); if (D > 1) { if (ulx > 0 && uly > 0) findsnake(depth + 1, A, ulx, B, uly); count = lrx - ulx; sp = nextsnake++; sp->line1 = A + ulx; sp->line2 = B + uly; sp->count = count; N -= lrx; M -= lry; if (N > 0 && M > 0) findsnake(depth + 1, A + lrx, N, B + lry, M); trace((/*{*/"}\n")); return; } /* * Only 0 or 1 change needed, so we can compute the result directly. * First compute the snake coming from the upper left corner if any. */ if (N > M) count = uly; else count = ulx; sp = nextsnake++; sp->line1 = A; sp->line2 = B; sp->count = count; /* * Finally compute the snake coming from the lower right corner if any. */ count = lrx - ulx; sp = nextsnake++; sp->line1 = A + ulx; sp->line2 = B + uly; sp->count = count; trace((/*{*/"}\n")); } double fstrcmp(s1, s2) char *s1; char *s2; { double result; snake_t *sp; /* current snake element */ long line1; /* current line in file A */ long line2; /* current line in file B */ trace(("fstrcmp(s1 = %08lX, s2 = %08lX)\n{\n"/*}*/, s1, s2)); trace(("s1 = \"%s\";\n", s1)); trace(("s2 = \"%s\";\n", s2)); fc.fileA.f_lines = s1; fc.fileA.f_linecount = strlen(s1); fc.fileB.f_lines = s2; fc.fileB.f_linecount = strlen(s2); /* * Check for trivial case of two empty strings. * This also avoids a division by zero at the end of is function. */ if (!fc.fileA.f_linecount && !fc.fileB.f_linecount) { trace(("return 1;\n")); trace((/*{*/"}\n")); return 1; } if (fc.fileA.f_linecount < fc.fileB.f_linecount) { fc.minlines = fc.fileA.f_linecount; fc.maxlines = fc.fileB.f_linecount; } else { fc.minlines = fc.fileB.f_linecount; fc.maxlines = fc.fileA.f_linecount; } tablesize = fc.maxlines * 2 + 1; if (tablesize > tablesize_max) { tablesize_max = tablesize; V1_table = mem_change_size ( V1_table, sizeof(long) * tablesize_max ); V2_table = mem_change_size ( V2_table, sizeof(long) * tablesize_max ); snake_table = mem_change_size ( snake_table, sizeof(snake_t) * tablesize_max ); } V1 = V1_table + fc.maxlines; V2 = V2_table + fc.maxlines; nextsnake = snake_table; if (fc.fileA.f_linecount > 0 && fc.fileB.f_linecount > 0) { findsnake ( 0, 0L, fc.fileA.f_linecount, 0L, fc.fileB.f_linecount ); } /* * End the list with the lower right endpoint */ sp = nextsnake++; sp->line1 = fc.fileA.f_linecount; sp->line2 = fc.fileB.f_linecount; sp->count = 0; /* * print out the snake list */ #ifdef DEBUG for (sp = snake_table; sp < nextsnake; sp++) { trace (( "%d: line1 = %ld; line2 = %ld; count = %ld;\n", sp - snake_table, sp->line1, sp->line2, sp->count )); } #endif /* * Scan the snake list and calculate the number of inserted, * deleted, and matching lines. */ line1 = 0; line2 = 0; fc.deletes = 0; fc.inserts = 0; fc.matches = 0; for (sp = snake_table; sp < nextsnake; sp++) { fc.deletes += (sp->line1 - line1); fc.inserts += (sp->line2 - line2); fc.matches += sp->count; line1 = sp->line1 + sp->count; line2 = sp->line2 + sp->count; } /* * the result is 0 if the strings are entirely unalike, * and 1 if the strings are identical, and somewhere in between * if the are in any way similar. */ result = ( 1 - (double)(fc.inserts + fc.deletes) / (fc.fileA.f_linecount + fc.fileB.f_linecount) ); trace(("return %.6f;\n", result)); trace((/*{*/"}\n")); return result; }