// // aegis - project change supervisor // Copyright (C) 1991, 1993, 1994, 2002-2008, 2012 Peter Miller. // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see // . // // This code is based on the heart of a file comparison program // written by David I. Bell, and used by kind permission. // This notice must be retained in all copies and derivatives. // Contact the author of aegis for a copy of the file comparison program. // // This code is based on the algorithm in: // An O(ND) Difference Algorithm and Its Variations // Eugene W. Myers // (TR 85-6, April 10, 1985) // Department of Computer Science // The University of Arizona // Tuscon, Arizona 85721 // // Also see: // A File Comparison Program // Webb Miller and Eugene W. Myers // Software Practice and Experience // (Volume 15, No. 11, November 1985) // #include #include #include #include #include #include struct snake_t { long line1; long line2; long count; snake_t *next; }; static long tablesize; // needed table size static long tablesize_max; // allocated table size static long *V1; // the row containing the last d static long *V1_table; static long *V2; // another row static long *V2_table; static snake_t *nextsnake; // next allocable snake structure static snake_t *snake_table; // allocable snake structures struct file { const char *f_lines; long f_linecount; }; struct fc_t { file fileA; file fileB; long maxlines; long minlines; long inserts; long deletes; long matches; }; static fc_t fc; // // Routine to find the middle snake of an optimial D-path spanning // lines A to A+N in file A to lines B to B+N in file B. Returns the // length D of the D-path as a return value, and the upper left and // lower right relative coordinates of a snake midway through the D-path. // static long midsnake(long A, long N, long B, long M, long *ulx, long *uly, long *lrx, long *lry) { long x; long y; long k; long oldx; const char *lp1; const char *lp2; long DELTA; long odd; long MAXD; long changes; long D; trace(("midsnake(A = %ld, N = %ld, B = %ld, M = %ld)\n{\n", A, N, B, M)); trace(("searching: %ld,%ld to %ld,%ld\n", A, B, A + N, B + M)); DELTA = N - M; odd = DELTA & 1; MAXD = (M + N + 1) / 2; V1[1] = 0; V2[-1] = 0; changes = -odd - 2; // // This is the main loop for searching for the snake. // D is the distance off the diagonals, and is the number // of changes needed to get from the upper left to the // lower right corner of the region. // for (D = 0; D <= MAXD; D++) { changes += 2; // // Examine all diagonals within current distance. // First search from upper left to lower right, // and then search from lower right to upper left. // for (k = -D; k <= D; k += 2) { // // Find the end of the furthest forward D-path // in diagonal k. // if (k == -D || (k != D && (V1[k - 1] < V1[k + 1]))) x = V1[k + 1]; else x = V1[k - 1] + 1; y = x - k; lp1 = &fc.fileA.f_lines[A + x]; lp2 = &fc.fileB.f_lines[B + y]; oldx = x; while (x < N && y < M && *lp1 == *lp2) { x++; y++; lp1++; lp2++; } V1[k] = x; // // See if path overlaps furthest reverse D-path. // If so, then we have found the snake. // if (odd && (k >= (DELTA - (D - 1))) && (k <= (DELTA + (D - 1)))) { if ((x + V2[k - DELTA]) >= N) { *ulx = oldx; *uly = oldx - k; *lrx = x; *lry = y; trace(("midsnake: %ld,%ld to %ld,%ld (odd)\n", *ulx, *uly, *lrx, *lry)); trace(("return %ld;\n", changes)); trace(("}\n")); return changes; } } } for (k = -D; k <= D; k += 2) { // // Find the end of the furthest reaching reverse // path in diagonal k+DELTA. // if (k == D || (k != -D && (V2[k + 1] < V2[k - 1]))) x = V2[k - 1]; else x = V2[k + 1] + 1; y = x + k; lp1 = &fc.fileA.f_lines[A + N - x - 1]; lp2 = &fc.fileB.f_lines[B + M - y - 1]; oldx = x; while (x < N && y < M && *lp1 == *lp2) { x++; y++; lp1--; lp2--; } V2[k] = x; // // See if path overlaps furthest forward D-path. // If so, then we have found the snake. // if (!odd && (k <= D - DELTA) && (k >= -D - DELTA)) { if ((x + V1[k + DELTA]) >= N) { *ulx = N - x; *uly = M - y; *lrx = N - oldx; *lry = *lrx + *uly - *ulx; trace(("midsnake: %ld,%ld to %ld,%ld (even)\n", *ulx, *uly, *lrx, *lry)); trace(("return %ld;\n", changes)); trace(("}\n")); return changes; } } } } // // Middle snake procedure failed! // assert(!"Middle snake procedure failed"); return 0; } // // Recursive routine to find a minimal D-path through the edit graph // of the two input files. Arguments are the beginning line numbers in // the files, and the number of lines to examine. This is basically a // divide-and-conquer routine which finds the middle snake of an optimal // D-path, then calls itself to find the remainder of the path before the // snake and after the snake. // static void findsnake(int depth, long A, long N, long B, long M) { snake_t *sp; long ulx = 0; long uly = 0; long lrx = 0; long lry = 0; long D; long count; trace(("findsnake(depth = %d, A = %ld, N = %ld, B = %ld, M = %ld)\n{\n", depth, A, N, B, M)); // // If more than one change needed, then call ourself for each part. // D = midsnake(A, N, B, M, &ulx, &uly, &lrx, &lry); if (D > 1) { if (ulx > 0 && uly > 0) findsnake(depth + 1, A, ulx, B, uly); count = lrx - ulx; sp = nextsnake++; sp->line1 = A + ulx; sp->line2 = B + uly; sp->count = count; N -= lrx; M -= lry; if (N > 0 && M > 0) findsnake(depth + 1, A + lrx, N, B + lry, M); trace(("}\n")); return; } // // Only 0 or 1 change needed, so we can compute the result directly. // First compute the snake coming from the upper left corner if any. // if (N > M) count = uly; else count = ulx; sp = nextsnake++; sp->line1 = A; sp->line2 = B; sp->count = count; // // Finally compute the snake coming from the lower right corner if any. // count = lrx - ulx; sp = nextsnake++; sp->line1 = A + ulx; sp->line2 = B + uly; sp->count = count; trace(("}\n")); } double fstrcmp(const char *s1, const char *s2) { double result; snake_t *sp; // current snake element long line1; // current line in file A long line2; // current line in file B trace(("fstrcmp(s1 = %p, s2 = %p)\n{\n", s1, s2)); trace(("s1 = \"%s\";\n", s1)); trace(("s2 = \"%s\";\n", s2)); fc.fileA.f_lines = s1; fc.fileA.f_linecount = strlen(s1); fc.fileB.f_lines = s2; fc.fileB.f_linecount = strlen(s2); // // Check for trivial case of two empty strings. // This also avoids a division by zero at the end of is function. // if (!fc.fileA.f_linecount && !fc.fileB.f_linecount) { trace(("return 1;\n")); trace(("}\n")); return 1; } if (fc.fileA.f_linecount < fc.fileB.f_linecount) { fc.minlines = fc.fileA.f_linecount; fc.maxlines = fc.fileB.f_linecount; } else { fc.minlines = fc.fileB.f_linecount; fc.maxlines = fc.fileA.f_linecount; } tablesize = fc.maxlines * 2 + 1; if (tablesize > tablesize_max) { tablesize_max = tablesize; delete [] V1_table; V1_table = new long [tablesize_max]; delete [] V2_table; V2_table = new long [tablesize_max]; delete [] snake_table; snake_table = new snake_t [tablesize_max]; } V1 = V1_table + fc.maxlines; V2 = V2_table + fc.maxlines; nextsnake = snake_table; if (fc.fileA.f_linecount > 0 && fc.fileB.f_linecount > 0) { findsnake(0, 0L, fc.fileA.f_linecount, 0L, fc.fileB.f_linecount); } // // End the list with the lower right endpoint // sp = nextsnake++; sp->line1 = fc.fileA.f_linecount; sp->line2 = fc.fileB.f_linecount; sp->count = 0; // // print out the snake list // #ifdef DEBUG for (sp = snake_table; sp < nextsnake; sp++) { trace(("%ld: line1 = %ld; line2 = %ld; count = %ld;\n", sp - snake_table, sp->line1, sp->line2, sp->count)); } #endif // // Scan the snake list and calculate the number of inserted, // deleted, and matching lines. // line1 = 0; line2 = 0; fc.deletes = 0; fc.inserts = 0; fc.matches = 0; for (sp = snake_table; sp < nextsnake; sp++) { fc.deletes += (sp->line1 - line1); fc.inserts += (sp->line2 - line2); fc.matches += sp->count; line1 = sp->line1 + sp->count; line2 = sp->line2 + sp->count; } // // the result is 0 if the strings are entirely unalike, // and 1 if the strings are identical, and somewhere in between // if the are in any way similar. // result = ( 1 - (double)(fc.inserts + fc.deletes) / (fc.fileA.f_linecount + fc.fileB.f_linecount) ); trace(("return %.6f;\n", result)); trace(("}\n")); return result; } // vim: set ts=8 sw=4 et :