### -*-awk-*-
### ====================================================================
###  @Awk-file{
###     author          = "Nelson H. F. Beebe",
###     version         = "1.00",
###     date            = "28 January 2000",
###     time            = "08:38:46 MST",
###     filename        = "ndiff.awk",
###     copyright       = "Copyright (c) 2000 Nelson H. F. Beebe. This
###                        code is licensed under the GNU General Public
###                        License, version 2 or later.",
###     address         = "Center for Scientific Computing
###                        University of Utah
###                        Department of Mathematics, 322 INSCC
###                        155 S 1400 E RM 233
###                        Salt Lake City, UT 84112-0090
###                        USA",
###     telephone       = "+1 801 581 5254",
###     FAX             = "+1 801 585 1640, +1 801 581 4148",
###     URL             = "http://www.math.utah.edu/~beebe",
###     checksum        = "10489 476 1904 14057",
###     email           = "beebe@math.utah.edu, beebe@acm.org,
###                        beebe@ieee.org (Internet)",
###     codetable       = "ISO/ASCII",
###     keywords        = "numerical file differencing",
###     supported       = "yes",
###     docstring       = "This program compares two putatively similar
###                        files, ignoring small numeric differences.
###                        Complete documentation can be found in the
###                        accompanying UNIX manual page file,
###                        ndiff.man.
###
###                        Usage:
###                       	awk -f ndiff.awk \
###                       		[-v ABSERR=x] \
###                       		[-v FIELDS=n1a-n1b,n2,n3a-n3b,...] \
###                       		[-v FS=regexp] \
###                       		[-v MINWIDTH=n] \
###                       		[-v QUIET=n] \
###                       		[-v RELERR=x] \
###                       		[-v SILENT=n] \
###                       		infile1 infile2
###
###                        The checksum field above contains a CRC-16
###                        checksum as the first value, followed by the
###                        equivalent of the standard UNIX wc (word
###                        count) utility output of lines, words, and
###                        characters.  This is produced by Robert
###                        Solovay's checksum utility.",
###  }
### ====================================================================

BEGIN \
{
    ## Program entry point.  All of the work is done inside this BEGIN
    ## rule: the two input files are read explicitly by compare_files(),
    ## and the exit statement below ends the program before awk's
    ## normal input loop would start.

    initialize()
    compare_files(ARGV[1], ARGV[2])

    ## Exit status: 1 if at least one difference was counted in the
    ## global Ndiff, and 0 otherwise.
    if (Ndiff != 0)
        exit 1
    exit 0
}


function abs(a)
{
    ## Absolute value: negative arguments are negated, everything else
    ## is handed back unchanged.

    if (a < 0)
        return (-a)
    return (a)
}


function awkfloat(s, mantissa, exponent)
{
    ## Turn a numeric string into an awk floating-point number and
    ## return that number.
    ##
    ## awk and C accept only E or e as the exponent letter, whereas
    ## Fortran output may use E, e, D, d, Q, or q, or may omit the
    ## letter altogether (as in 1.5+3).  Ada additionally allows
    ## nonsignificant underscores between digits, so that
    ## 3.14159_26535_89793_23846 means 3.14159265358979323846.
    ## All of those variants are normalized here.
    ##
    ## The argument is expected to carry no leading or trailing
    ## whitespace, since the strings passed in come from splitting
    ## lines into whitespace-delimited fields.

    ## Drop Ada-style digit separators.
    gsub("_","",s)

    ## Map Fortran exponent letters onto the awk-style e.
    gsub("[DdQq]","e",s)

    ## A digit or point followed directly by a signed integer at the
    ## end of the string is a letter-less exponent: split there and
    ## put the missing e between the two pieces.
    if (match(s,"[0-9.][-+][0-9]+$") > 0)
    {
        mantissa = substr(s,1,RSTART)
        exponent = substr(s,RSTART+1)
        s = mantissa "e" exponent
    }

    ## Arithmetic forces the string-to-number conversion.
    return (s + 0)
}


function compare_all(f1line,f2line,f1parts,f2parts,n, k)
{
    ## Walk through fields 1..n of the two lines, which the caller has
    ## already split into f1parts[] and f2parts[].
    ##
    ## At the first field for which diff_field() reports a difference,
    ## hand both complete lines to report_difference() (which prints
    ## the diff-style report and increments the global Ndiff), and stop
    ## looking: each line pair is reported at most once.

    k = 1
    while (k <= n)
    {
        if (diff_field(f1parts[k], f2parts[k], k) != 0)
        {
            report_difference(f1line,f2line,k)
            return
        }
        ++k
    }
}


function compare_files(file1,file2, f1line,f2line,f1parts,f2parts,n1,n2,status1,status2)
{
    ## Compare all lines in two files, printing a diff-style report of
    ## differences.  If any numeric differences have been recorded,
    ## print (unless QUIET is nonzero) a one-line report of where the
    ## largest absolute and relative errors occurred.  Finally, print a
    ## diagnostic, and count a difference in Ndiff, if one file could
    ## not be read or if the files differ in length.
    ##
    ## Arguments:
    ##     file1, file2    names of the files to compare
    ##
    ## The remaining parameters are local variables.
    ##
    ## Globals set: NRLINE (current line number), and Ndiff (directly,
    ## and through report_difference()).

    NRLINE = 0
    for (;;)
    {
        ## Read one line from EACH file and remember both results.
        ## Testing the two reads in a single short-circuit condition
        ## would consume a line of file1 before discovering that file2
        ## has ended, and a later probe of file1 would then overlook a
        ## file1 that is exactly one line longer than file2.
        status1 = (getline f1line < file1)
        status2 = (getline f2line < file2)
        if ((status1 <= 0) || (status2 <= 0))
            break

        NRLINE++
        n1 = split(f1line,f1parts)
        n2 = split(f2line,f2parts)
        if (n1 == n2)
        {
            if (N_Fields == 0)
                compare_all(f1line,f2line,f1parts,f2parts,n1)
            else
                compare_some(f1line,f2line,f1parts,f2parts,n1)
        }
        else                    # differing field counts always differ
            report_difference(f1line,f2line,max(n1,n2))
    }

    if (QUIET == 0)
    {
        if (Max_Abserr > 0)
            printf("### Maximum absolute error in matching lines = %.2e at line %d field %d\n", \
                   Max_Abserr, Max_Abserr_NR, Max_Abserr_NF)
        if (Max_Relerr > 0)
            printf("### Maximum relative error in matching lines = %.2e at line %d field %d\n", \
                   Max_Relerr, Max_Relerr_NR, Max_Relerr_NF)
    }

    ## getline returns -1 on a read failure (e.g., a missing file):
    ## report that explicitly instead of treating it as end of file.
    if (status1 < 0)
    {
        warning("cannot read file " file1)
        Ndiff++
    }
    if (status2 < 0)
    {
        warning("cannot read file " file2)
        Ndiff++
    }

    ## One file delivered a line while the other was at end of file.
    if ((status1 > 0) && (status2 == 0))
    {
        warning("file " file2 " is short")
        Ndiff++
    }
    if ((status2 > 0) && (status1 == 0))
    {
        warning("file " file1 " is short")
        Ndiff++
    }

    close(file1)
    close(file2)
}


function compare_some(f1line,f2line,f1parts,f2parts,n, k,m)
{
    ## Compare selected fields in f1line and f2line, assuming that they
    ## have already been split into n parts in f1parts[] and f2parts[].
    ## The globals (N_Fields, Fields[]) define which fields are to be
    ## compared.
    ##
    ## If any selected fields differ, print a diff-style report and
    ## increment global variable Ndiff (both via report_difference()),
    ## then return: each line pair is reported at most once.
    ##
    ## Arguments:
    ##     f1line, f2line      the complete input lines
    ##     f1parts, f2parts    the fields of those lines
    ##     n                   the number of fields in each line
    ##
    ## k and m are local variables.

    ## Every entry of Fields[] must be examined: k indexes the list of
    ## selected fields, NOT the fields of the line, so it must not be
    ## bounded by n.  The list need not be in ascending order (e.g.,
    ## FIELDS=9,8,7,2), so stopping at k > n could skip selected fields
    ## that do exist on this line.
    for (k = 1; k <= N_Fields; ++k)
    {
        m = Fields[k]

        ## Selected fields beyond the end of this line are skipped.
        if ((m <= n) && (diff_field(f1parts[m], f2parts[m], m) != 0))
        {
            report_difference(f1line,f2line,m)
            return
        }
    }
}


function diff_field(field1,field2,nfield, exceeds)
{
    ## Decide whether two corresponding fields differ.
    ##
    ## Returns 0 when
    ##   * the fields compare equal, or
    ##   * both match NUMBER_PATTERN and both are ignorable according
    ##     to ignore() (the MINWIDTH test), or
    ##   * both match NUMBER_PATTERN and their difference is within the
    ##     tolerances given by the globals ABSERR and RELERR (a
    ##     tolerance that is the empty string is not applied).
    ## Returns 1 in every other case, including any pair in which at
    ## least one field is not numeric.
    ##
    ## The absolute and relative errors of a numeric pair are left in
    ## the globals This_Abserr and This_Relerr; both are reset to zero
    ## on entry, so they stay zero for equal, ignored, and nonnumeric
    ## fields.  When a numeric pair is out of tolerance, the running
    ## maxima Max_Abserr and Max_Relerr, together with their line
    ## (NRLINE) and field (nfield) positions, are updated.

    This_Abserr = 0
    This_Relerr = 0

    ## Commonest and cheapest case first.
    if (field1 == field2)
        return (0)

    ## Anything that is not a pair of numbers is simply different.
    if (!((field1 ~ NUMBER_PATTERN) && (field2 ~ NUMBER_PATTERN)))
        return (1)

    ## The MINWIDTH test needs the original text, so it must precede
    ## the conversion to numbers.
    if (ignore(field1) && ignore(field2))
        return (0)

    ## Convert to floating-point, normalizing Fortran-style exponents.
    field1 = awkfloat(field1)
    field2 = awkfloat(field2)

    This_Abserr = abs(field1 - field2)
    This_Relerr = maxrelerr(field1,field2)

    exceeds = ( ((ABSERR != "") && (This_Abserr > ABSERR)) || \
                ((RELERR != "") && (This_Relerr > RELERR)) )
    if (!exceeds)
        return (0)

    ## Out of tolerance: remember where the largest errors were seen.
    if (This_Abserr > Max_Abserr)
    {
        Max_Abserr = This_Abserr
        Max_Abserr_NR = NRLINE
        Max_Abserr_NF = nfield
    }
    if (This_Relerr > Max_Relerr)
    {
        Max_Relerr = This_Relerr
        Max_Relerr_NR = NRLINE
        Max_Relerr_NF = nfield
    }
    return (1)
}


function error(message)
{
    ## Report a fatal problem: the message, prefixed with "ERROR: ", is
    ## passed to warning(), and the program then exits with status 1.

    message = "ERROR: " message
    warning(message)
    exit 1
}


function ignore(field)
{
    ## Tell whether a field may be ignored.  The result is 1 only when
    ## the global MINWIDTH is positive, the field has fewer than
    ## MINWIDTH characters, and the field looks like a real number
    ## (it contains a decimal point or an exponent letter).  In all
    ## other cases the result is 0.

    if (!(MINWIDTH > 0))
        return (0)
    if (!(length(field) < MINWIDTH))
        return (0)
    return (field ~ "[.DdEeQq]")
}


function initialize()
{
    ## Process command-line options, and initialize global variables.
    ##
    ## Options arrive as awk variables (set with -v): ABSERR, FIELDS,
    ## MINWIDTH, QUIET, RELERR, and SILENT.
    ##
    ## Globals set here: Stderr, Macheps, ABSERR, RELERR, MINWIDTH,
    ## QUIET, SILENT, the Max_* and This_* error trackers, N_Fields
    ## (and Fields[], via initialize_fields()), NUMBER_PATTERN, and
    ## Ndiff.
    ##
    ## Terminates the program through error() unless exactly two file
    ## arguments were supplied (ARGC == 3).

    Stderr = "/dev/stderr"

    Macheps = machine_epsilon()

    if (ABSERR != "")
        ABSERR = abs(awkfloat(ABSERR)) # coerce to positive number

    if (RELERR != "")
    {
        RELERR = abs(awkfloat(RELERR)) # coerce to positive number
        if (RELERR < Macheps)
            warning("RELERR = " RELERR " is below machine epsilon " Macheps)
        else if (RELERR >= 1)   # RELERR=nnn means nnn*(machine epsilon)
            RELERR *= Macheps
    }

    if ((ABSERR == "") && (RELERR == "")) # supply default (see man pages)
        RELERR = max(1.0e-15, 8.0 * Macheps)

    ## printf( "RELERR is %15.10f\n", RELERR )

    ## Coerce remaining options to numbers
    MINWIDTH += 0
    QUIET += 0
    SILENT += 0

    Max_Relerr = 0
    Max_Relerr_NR = 0
    Max_Relerr_NF = 0

    Max_Abserr = 0
    Max_Abserr_NR = 0
    Max_Abserr_NF = 0

    This_Abserr = 0
    This_Relerr = 0

    if (FIELDS != "")
        initialize_fields()
    else
        N_Fields = 0            # zero means: compare every field

    ## The precise value of this regular expression to match both an
    ## integer and a floating-point number is critical, and documented
    ## in the accompanying manual page: it must match not only the
    ## awk- and C-style -nnn, -n.nnn, and -n.nnne+nn, but also the
    ## Fortran styles -nnn, -n.nnn, -n.D+nn, -.nnD+nn, -nD+nn,
    ## -n.nnnQ+nn, -n.nnnd+nn, and -n.nnn+nnn.  The Fortran forms will
    ## be converted by awkfloat() to awk-form.  Ada permits a
    ## nonsignificant underscore between digits, so we support that
    ## too.

    NUMBER_PATTERN = "^[-+]?([0-9](_?[0-9])*([.]?([0-9](_?[0-9])*)*)?|[.][0-9](_?[0-9])*)([DdEeQq]?[-+]?[0-9](_?[0-9])*)?$"

    Ndiff = 0

    ## ARGV[0] is the program name, so two file arguments give ARGC == 3.
    ## The usage text lists every supported option, including QUIET
    ## and SILENT.
    if (ARGC != 3)
        error("Incorrect argument count\n\tUsage: awk -f ndiff.awk [-v ABSERR=x] [-v FIELDS=n1a-n1b,n2,n3a-n3b,...] [-v FS='regexp'] [-v MINWIDTH=n] [-v QUIET=n] [-v RELERR=x] [-v SILENT=n] infile1 infile2")
}


function initialize_fields( j,k,m,n,numbers,parts)
{
    ## Expand the global FIELDS specification, a comma-separated list
    ## of single field numbers and ranges (n1a-n1b,n2,n3a-n3b,...),
    ## into the list Fields[1..N_Fields].
    ##
    ## Every list element must be either one positive integer or two
    ## positive integers forming a nondecreasing range of at most 100
    ## fields; anything else ends the program through error().

    N_Fields = 0
    n = split(FIELDS,parts,",")
    k = 0
    while (++k <= n)
    {
        ## Split the element at hyphens: one piece is a single field
        ## number, two pieces are the ends of a range.
        m = split(parts[k],numbers,"-+")

        if ((m != 1) && (m != 2))
            error("bad FIELDS range [" parts[k] "]")
        else if (m == 1)
        {
            if (parts[k] ~ "^[0-9]+$")
            {
                if (parts[k] == 0)
                    error("zero FIELDS value [" parts[k] "]: fields are numbered from 1")
                else
                    Fields[++N_Fields] = parts[k]
            }
            else
                error("non-numeric FIELDS value [" parts[k] "]")
        }
        else                    # m == 2: a range
        {
            if ((numbers[1] !~ "^[0-9]+$") || \
                (numbers[2] !~ "^[0-9]+$"))
                error("non-numeric FIELDS range [" parts[k] "]")
            else if ((numbers[1] == 0) || (numbers[2] == 0))
                error("zero value in FIELDS range [" parts[k] "]: fields are numbered from 1")
            else if (numbers[1] > numbers[2])
                error("bad FIELDS range [" parts[k] "]")
            else if ((numbers[2] - numbers[1] + 1) > 100)
                error("FIELDS range [" parts[k] "] exceeds 100")
            else
            {
                ## Append every field number of the range, in order.
                j = numbers[1]
                while (j <= numbers[2])
                {
                    Fields[++N_Fields] = j
                    ++j
                }
            }
        }
    }
    ## printf("DEBUG: Fields = [")
    ## for (k = 1; k <= N_Fields; ++k)
    ##     printf("%d,", Fields[k])
    ## print "]"
    ## exit(0)
}


function machine_epsilon( eps,half)
{
    ## Compute the machine epsilon by repeated halving: starting from
    ## 1.0, keep halving for as long as adding the next half to 1.0
    ## still changes the sum, and return the last value reached.
    ##
    ## Tests on these architectures with awk, gawk, mawk, and nawk all
    ## produced identical results:
    ##
    ##          Apple Macintosh PPC G3  Rhapsody 5.5
    ##          DEC Alpha               OSF/1 4.0F
    ##          HP 9000/735             HP-UX 10.01
    ##          IBM PowerPC             AIX 4.2
    ##          Intel Pentium III       GNU/Linux 2.2.12-20smp (Redhat 6.1)
    ##          NeXT Turbostation       Mach 3.3
    ##          SGI Indigo/2            IRIX 5.3
    ##          SGI Origin 200          IRIX 6.5
    ##          Sun SPARC               GNU/Linux 2.2.12-42smp (Redhat 6.1)
    ##          Sun SPARC               Solaris 2.6
    ##          Sun SPARC               Solaris 2.7
    ##
    ##          /usr/local/bin/awk:  2.22045e-16
    ##          /usr/local/bin/gawk: 2.22045e-16
    ##          /usr/local/bin/mawk: 2.22045e-16
    ##          /usr/local/bin/nawk: 2.22045e-16
    ##
    ## Thus, there does not appear to be concern for surprises from
    ## long registers, such as on the Intel x86 architecture.

    eps = 1.0
    half = eps / 2.0
    while ((1.0 + half) != 1.0)
    {
        eps = half
        half = eps / 2.0
    }
    return (eps)
}


function max(a,b)
{
    ## Larger of the two arguments, using awk's > comparison (numeric
    ## or lexicographic, depending on the operands).  The second
    ## argument is returned unless the first compares greater.

    if (a > b)
        return (a)
    return (b)
}


function maxrelerr(x,y)
{
    ## Maximum relative error of two values:
    ##
    ##     x equal to y             0
    ##     both nonzero             |x - y| / min(|x|, |y|)
    ##     exactly one is zero      1
    ##     anything else            0
    ##
    ## See the documentation of the -relerr option in ndiff.man for the
    ## explanation of this definition.

    if (x == y)
        return (0)

    if ((x != 0) && (y != 0))
        return (abs(x-y)/min(abs(x),abs(y)))

    ## Here at least one of the values is not nonzero.
    if (((x == 0) && (y != 0)) || ((y == 0) && (x != 0)))
        return (1)

    return (0)
}


function min(a,b)
{
    ## Smaller of the two arguments, using awk's < comparison (numeric
    ## or lexicographic, depending on the operands).  The second
    ## argument is returned unless the first compares smaller.

    if (a < b)
        return (a)
    return (b)
}


function report_difference(f1line,f2line,nfield, emult)
{
    ## Count one difference in the global Ndiff and, unless SILENT is
    ## nonzero, print the two lines diff-style: an "NcN" header built
    ## from the current line number NRLINE, then the first line after
    ## "< " and the second line after "> ".
    ##
    ## The "---" separator line, which would show the field number
    ## nfield and the errors This_Abserr and This_Relerr, is currently
    ## disabled; the code for it is retained below in comment form, so
    ## nfield and emult are unused at present.

    Ndiff++

    if (SILENT != 0)
        return

    printf("%dc%d\n", NRLINE, NRLINE)
    printf("< %s\n", f1line)
    ## if ((This_Abserr != 0) || (This_Relerr != 0))
    ## {
    ##     emult = This_Relerr / Macheps
    ##     if (emult >= 10000)
    ##         printf("--- field %d\tabsolute error %.2e\trelative error %.2e\n",
    ##                nfield, This_Abserr, This_Relerr)
    ##     else
    ##         printf("--- field %d\tabsolute error %.2e\trelative error %.2e [%d*(machine epsilon)]\n",
    ##                nfield, This_Abserr, This_Relerr, int(emult + 0.5))
    ## }
    ## else
    ##     printf("--- field %d\n", nfield)
    printf("> %s\n", f2line)
}


function warning(message, text)
{
    ## Write a diagnostic to the file named by the global Stderr.
    ##
    ## When awk has a current input record (FNR > 0), the message is
    ## prefixed emacs compile-command style with the file name and
    ## line number; otherwise (e.g., during initialization) the bare
    ## message is written.

    text = message
    if (FNR > 0)
        text = FILENAME ":" FNR ":%%" message
    print text >Stderr
}