//-----------------------------------------------------------------------------
// Copyright © 2004 - Philip Howard - All rights reserved
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//-----------------------------------------------------------------------------
// package	libh/misc
// homepage	http://libh.slashusr.org/
//-----------------------------------------------------------------------------
// author	Philip Howard
// email	libh at ipal dot org
// homepage	http://phil.ipal.org/
//-----------------------------------------------------------------------------
// This file is best viewed using a fixed spaced font such as Courier
// and in a display at least 120 columns wide.
//-----------------------------------------------------------------------------

//-----------------------------------------------------------------------------
// program	crosslink
//
// purpose	Make hard links between files to share data pages.  Unless
//		specified otherwise, the permission mode, owning userid,
//		owning groupid, and date/time metadata must also be the same
//		for a file to be hardlinked.  Specified metadata can also be
//		ignored for this purpose.
//
// usage	Shell command
//
// syntax	crosslink  [options]  [names]
//
// options	-n	Scan and checksum everything, but take no action
//		-t	IGNORE date AND time when checking for match
//		-p	IGNORE permission mode when checking for match
//		-u	IGNORE owning userid when checking for match
//		-g	IGNORE owning groupid when checking for match
//		-x	DO NOT cross filesystem boundary (see notes 1 and 2)
//		-q	Decrease verbosity
//		-v	Increase verbosity
//		--md4		Use md4 hashing of file contents
//		--md5		Use md5 hashing of file contents (default)
//		--ripemd160	Use ripemd160 hashing of file contents
//		--sha1		Use sha1 hashing of file contents
//
// requires	OpenSSL (libcrypto), LIBH (avl)
//
// compile	gcc -O3 -o crosslink crosslink.c -lcrypto -lh
//
// note 1	Although files in a different filesystem cannot be hardlinked,
//		crosslink will descend across filesystem boundaries and just
//		link those of the same filesystem.  The -x option specifies
//		that descending is confined to the same filesystem only.  This
//		restriction is applied separately for each argument name given.
//
// note 2	Files in different bind mounted file trees that reside on the
//		same filesystem will simply be treated as being on the same
//		filesystem.  However, if -x is specified to avoid descending
//		a different filesystem, then a bind mounted tree deeper down
//		may not be processed even if it is the same filesystem as
//		the top directory crosslink starts with.
//
// WARNING	Ignoring permissions, userid, and groupid can result in even
//		more space savings, but will result in this file meta data
//		being altered in ways that could disrupt system operations
//		and cause security exposures.  Use with EXTREME CAUTION.
//
// Warning	Ignoring date and time may cause some problems in some systems
//		or with some programs that detect what files are to be used in
//		preference over others.  Use care when ignoring date and time.
//
// warning	Some systems and programs do not deal well with certain files
//		being hardlinked, even if they are exactly identical in all
//		ways.  Use good judgement where you apply this program.
//-----------------------------------------------------------------------------

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#include <fcntl.h>
#include <fts.h>
#include <limits.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

#include <libh/avl.h>


//-----------------------------------------------------------------------------
// Set up MD5 parameters.
//-----------------------------------------------------------------------------
#ifndef CROSSLINK_NO_MD5
#ifndef DEFAULT_DIGEST_FUNC
#define DEFAULT_DIGEST_FUNC	md5_digest
#define DEFAULT_DIGEST_NAME	"md5"
#define DEFAULT_DIGEST_SIZE	MD5_DIGEST_LENGTH
#endif
#include <openssl/md5.h>
#endif

//-----------------------------------------------------------------------------
// Set up SHA1 parameters.
//-----------------------------------------------------------------------------
#ifndef CROSSLINK_NO_SHA
#ifndef DEFAULT_DIGEST_FUNC
#define DEFAULT_DIGEST_FUNC	sha1_digest
#define DEFAULT_DIGEST_NAME	"sha1"
#define DEFAULT_DIGEST_SIZE	SHA_DIGEST_LENGTH
#endif
#include <openssl/sha.h>
#endif

//-----------------------------------------------------------------------------
// Set up RIPEMD160 parameters.
//-----------------------------------------------------------------------------
#ifndef CROSSLINK_NO_RIPEMD
#ifndef DEFAULT_DIGEST_FUNC
#define DEFAULT_DIGEST_FUNC	ripemd160_digest
#define DEFAULT_DIGEST_NAME	"ripemd160"
#define DEFAULT_DIGEST_SIZE	RIPEMD160_DIGEST_LENGTH
#endif
#include <openssl/ripemd.h>
#endif

//-----------------------------------------------------------------------------
// Set up MD4 parameters.
//-----------------------------------------------------------------------------
#ifndef CROSSLINK_NO_MD4
#ifndef DEFAULT_DIGEST_FUNC
#define DEFAULT_DIGEST_FUNC	md4_digest
#define DEFAULT_DIGEST_NAME	"md4"
#define DEFAULT_DIGEST_SIZE	MD4_DIGEST_LENGTH
#endif
#include <openssl/md4.h>
#endif

//-----------------------------------------------------------------------------
// Make sure at least one algorithm is compiled in.
//-----------------------------------------------------------------------------
#ifndef DEFAULT_DIGEST_FUNC
#error All checksum algorithms are disabled but at least one is needed.
#endif

//-----------------------------------------------------------------------------
// Define macros to select which madvise appears to be available.
// If none appears available, then just define them as zero.
//-----------------------------------------------------------------------------
#ifdef MADV_SEQUENTIAL
#define madvise_sequential(p,s)	madvise((p),(s),MADV_SEQUENTIAL)
#else
#ifdef POSIX_MADV_SEQUENTIAL
#define madvise_sequential(p,s) posix_madvise((p),(s),POSIX_MADV_SEQUENTIAL)
#else
#define madvise_sequential(p,s)	(0)
#endif
#endif

#ifdef MADV_DONTNEED
#define madvise_dontneed(p,s)	madvise((p),(s),MADV_DONTNEED)
#else
#ifdef POSIX_MADV_DONTNEED
#define madvise_dontneed(p,s)	posix_madvise((p),(s),POSIX_MADV_DONTNEED)
#else
#define madvise_dontneed(p,s)	(0)
#endif
#endif


//-----------------------------------------------------------------------------
// Define compile time configurations.
//-----------------------------------------------------------------------------
#define CHUNK_SIZE	1048576

//-----------------------------------------------------------------------------
// struct	file_node
// purpose	Hold the state of a file
//
// In addition to the information shown in the coded struct definition,
// the nodes will be allocated additional space to hold the actual content
// digest bytes and the full relative path name of the file.  Because the
// size of the additional space is determined at run time, it is not coded
// as a part of the struct definition here.
//-----------------------------------------------------------------------------
struct file_node {
    union links {
	struct file_node *	sll		; // for link list
	avl_link		avl		; // for AVL tree
    }				link		;
    dev_t		fn_dev			;
    off_t		fn_size			;
    mode_t		fn_mode			;
    time_t		fn_mtime		;
    uid_t		fn_uid			;
    gid_t		fn_gid			;
    int			flag			;
    uint32_t		digest	[1]		; // actual usage is 4 or 5
};

#define node_path(p)	(((char *)(p))+node_size)

//-----------------------------------------------------------------------------
// Define common local variables.
//-----------------------------------------------------------------------------
static unsigned long long	digest_bytes		= 0;
static int		( *	digest_func	)( struct file_node * );
static long			count_digested		= 0;
static size_t			chunk_size		= 0;
static unsigned int		node_size		= 0;
static unsigned int		digest_size		= 0;
static int			use_perm		= 1;
static int			use_uid			= 1;
static int			use_gid			= 1;
static int			use_time		= 1;


//-----------------------------------------------------------------------------
// function	cmp_key
//
// purpose	Compare file meta data, and if necessary file content digest
//		data, between two nodes, for AVL tree operations.  If the
//		digest data is not available, obtain it now and cache it in
//		the node for later comparisons.
//
// arguments	1 (avl_link *) (struct file_node *) 1st node to compare
//		2 (avl_link *) (struct file_node *) 2nd node to compare
//
// returns	(int)  < 0 : 1st  < 2nd
//		(int) == 0 : 1st == 2nd
//		(int)  > 0 : 1st  > 2nd
//
// note		Normally the arguments are the (avl_link *) link structure
//		pointers.  Since the link structure member is the first
//		member, the pointers will always be the same.  It is easier
//		to work with the node from the perspective of its own pointer.
//-----------------------------------------------------------------------------
static
int
cmp_key (
    struct file_node *	arg_1st
    ,
    struct file_node *	arg_2nd
    ,
    AVL *		arg_tree
    )
{
    //-- Files with different sizes are obviously different.
    if ( arg_1st->fn_size < arg_2nd->fn_size ) return -1;
    if ( arg_1st->fn_size > arg_2nd->fn_size ) return 1;

    //-- Files on different filesystems cannot be linked anyway.
    if ( arg_1st->fn_dev < arg_2nd->fn_dev ) return -1;
    if ( arg_1st->fn_dev > arg_2nd->fn_dev ) return 1;

    //-- Unless overridden, files with different permissions are different.
    if ( use_perm ) {
	if ( arg_1st->fn_mode < arg_2nd->fn_mode ) return -1;
	if ( arg_1st->fn_mode > arg_2nd->fn_mode ) return 1;
    }

    //-- Unless overridden, files with different times are different.
    if ( use_time ) {
	if ( arg_1st->fn_mtime < arg_2nd->fn_mtime ) return -1;
	if ( arg_1st->fn_mtime > arg_2nd->fn_mtime ) return 1;
    }

    //-- Unless overridden, files with different owning userids are different.
    if ( use_uid ) {
	if ( arg_1st->fn_uid < arg_2nd->fn_uid ) return -1;
	if ( arg_1st->fn_uid > arg_2nd->fn_uid ) return 1;
    }

    //-- Unless overridden, files with different owning groups are different.
    if ( use_gid ) {
	if ( arg_1st->fn_gid < arg_2nd->fn_gid ) return -1;
	if ( arg_1st->fn_gid > arg_2nd->fn_gid ) return 1;
    }

    //-- If the 1st file has no digest, yet, get it now.
    if ( arg_1st->flag == 0 ) arg_1st->flag = ( * digest_func )( arg_1st );
    if ( arg_1st->flag < 0 ) return -1;

    //-- If the 2nd file has no digest, yet, get it now.
    if ( arg_2nd->flag == 0 ) arg_2nd->flag = ( * digest_func )( arg_2nd );
    if ( arg_2nd->flag < 0 ) return -1;

    //-- Compare the first 16 bytes of file digest.
    if ( arg_1st->digest[0] < arg_2nd->digest[0] ) return -1;
    if ( arg_1st->digest[0] > arg_2nd->digest[0] ) return 1;
    if ( arg_1st->digest[1] < arg_2nd->digest[1] ) return -1;
    if ( arg_1st->digest[1] > arg_2nd->digest[1] ) return 1;
    if ( arg_1st->digest[2] < arg_2nd->digest[2] ) return -1;
    if ( arg_1st->digest[2] > arg_2nd->digest[2] ) return 1;
    if ( arg_1st->digest[3] < arg_2nd->digest[3] ) return -1;
    if ( arg_1st->digest[3] > arg_2nd->digest[3] ) return 1;
    if ( digest_size == 16 ) return 0;

    //-- Compare the next 4 bytes of a 20 byte file digest.
    if ( arg_1st->digest[4] < arg_2nd->digest[4] ) return -1;
    if ( arg_1st->digest[4] > arg_2nd->digest[4] ) return 1;
    if ( digest_size == 20 ) return 0;

    //-- If any more than this, just memcmp() the rest.
    return memcmp( ( (unsigned char *) (arg_1st->digest) ) + 20,
		   ( (unsigned char *) (arg_2nd->digest) ) + 20,
		   digest_size - 20 );
}


#ifndef CROSSLINK_NO_MD5
//-----------------------------------------------------------------------------
// function	md5_digest
//
// purpose	Calculate MD5 checksum of the file referenced by a given node.
//
// arguments	1 (file_node *) pointer to file node
//
// returns	(int) == -1 : error
//		(int) ==  1 : OK
//-----------------------------------------------------------------------------
static
int
md5_digest (
    struct file_node *	arg_node
    )
{
    MD5_CTX		ctx		;
    off_t		map_off		;
    char *		map_ptr		;
    size_t		map_size	;
    int			fd		;

    if ( ( fd = open( node_path( arg_node ), O_RDONLY ) ) < 0 ) goto md5_error_do_return;
    MD5_Init( & ctx );
    map_off = 0;
    while ( map_off < arg_node->fn_size ) {
	map_size = chunk_size;
	if ( map_off + map_size > arg_node->fn_size ) map_size = arg_node->fn_size - map_off;
	map_ptr = mmap( 0, map_size, PROT_READ, MAP_SHARED, fd, map_off );
	if ( map_ptr == MAP_FAILED ) goto md5_error_do_close;
	madvise_sequential( map_ptr, map_size );
	MD5_Update( & ctx, map_ptr, map_size );
	madvise_dontneed( map_ptr, map_size );
	if ( munmap( map_ptr, map_size ) < 0 ) goto md5_error_do_close;
	map_off += map_size;
	digest_bytes += map_size;
    }
    close( fd );
    MD5_Final( (unsigned char *) arg_node->digest, & ctx );
    ++ count_digested;
    return 1;

 md5_error_do_close:
    close( fd );
 md5_error_do_return:
    return -1;
}
#endif /* CROSSLINK_NO_MD5 */


#ifndef CROSSLINK_NO_SHA
//-----------------------------------------------------------------------------
// function	sha1_digest
//
// purpose	Calculate SHA1 checksum of the file referenced by a given node.
//
// arguments	1 (file_node *) pointer to file node
//
// returns	(int) == -1 : error
//		(int) ==  1 : OK
//-----------------------------------------------------------------------------
static
int
sha1_digest (
    struct file_node *	arg_node
    )
{
    SHA_CTX		ctx		;
    off_t		map_off		;
    char *		map_ptr		;
    size_t		map_size	;
    int			fd		;

    if ( ( fd = open( node_path( arg_node ), O_RDONLY ) ) < 0 ) goto sha1_error_do_return;
    SHA1_Init( & ctx );
    map_off = 0;
    while ( map_off < arg_node->fn_size ) {
	map_size = chunk_size;
	if ( map_off + map_size > arg_node->fn_size ) map_size = arg_node->fn_size - map_off;
	map_ptr = mmap( 0, map_size, PROT_READ, MAP_SHARED, fd, map_off );
	if ( map_ptr == MAP_FAILED ) goto sha1_error_do_close;
	madvise_sequential( map_ptr, map_size );
	SHA1_Update( & ctx, map_ptr, map_size );
	madvise_dontneed( map_ptr, map_size );
	if ( munmap( map_ptr, map_size ) < 0 ) goto sha1_error_do_close;
	map_off += map_size;
	digest_bytes += map_size;
    }
    close( fd );
    SHA1_Final( (unsigned char *) arg_node->digest, & ctx );
    ++ count_digested;
    return 1;

 sha1_error_do_close:
    close( fd );
 sha1_error_do_return:
    return -1;
}
#endif /* CROSSLINK_NO_SHA */


#ifndef CROSSLINK_NO_RIPEMD
//-----------------------------------------------------------------------------
// function	ripemd160_digest
//
// purpose	Calculate RIPEMD160 checksum of the file referenced by a given node.
//
// arguments	1 (file_node *) pointer to file node
//
// returns	(int) == -1 : error
//		(int) ==  1 : OK
//-----------------------------------------------------------------------------
static
int
ripemd160_digest (
    struct file_node *	arg_node
    )
{
    RIPEMD160_CTX	ctx		;
    off_t		map_off		;
    char *		map_ptr		;
    size_t		map_size	;
    int			fd		;

    if ( ( fd = open( node_path( arg_node ), O_RDONLY ) ) < 0 ) goto ripemd160_error_do_return;
    RIPEMD160_Init( & ctx );
    map_off = 0;
    while ( map_off < arg_node->fn_size ) {
	map_size = chunk_size;
	if ( map_off + map_size > arg_node->fn_size ) map_size = arg_node->fn_size - map_off;
	map_ptr = mmap( 0, map_size, PROT_READ, MAP_SHARED, fd, map_off );
	if ( map_ptr == MAP_FAILED ) goto ripemd160_error_do_close;
	madvise_sequential( map_ptr, map_size );
	RIPEMD160_Update( & ctx, map_ptr, map_size );
	madvise_dontneed( map_ptr, map_size );
	if ( munmap( map_ptr, map_size ) < 0 ) goto ripemd160_error_do_close;
	map_off += map_size;
	digest_bytes += map_size;
    }
    close( fd );
    RIPEMD160_Final( (unsigned char *) arg_node->digest, & ctx );
    ++ count_digested;
    return 1;

 ripemd160_error_do_close:
    close( fd );
 ripemd160_error_do_return:
    return -1;
}
#endif /* CROSSLINK_NO_RIPEMD */


#ifndef CROSSLINK_NO_MD4
//-----------------------------------------------------------------------------
// function	md4_digest
//
// purpose	Calculate MD4 checksum of the file referenced by a given node.
//
// arguments	1 (file_node *) pointer to file node
//
// returns	(int) == -1 : error
//		(int) ==  1 : OK
//-----------------------------------------------------------------------------
static
int
md4_digest (
    struct file_node *	arg_node
    )
{
    MD4_CTX		ctx		;
    off_t		map_off		;
    char *		map_ptr		;
    size_t		map_size	;
    int			fd		;

    if ( ( fd = open( node_path( arg_node ), O_RDONLY ) ) < 0 ) goto md4_error_do_return;
    MD4_Init( & ctx );
    map_off = 0;
    while ( map_off < arg_node->fn_size ) {
	map_size = chunk_size;
	if ( map_off + map_size > arg_node->fn_size ) map_size = arg_node->fn_size - map_off;
	map_ptr = mmap( 0, map_size, PROT_READ, MAP_SHARED, fd, map_off );
	if ( map_ptr == MAP_FAILED ) goto md4_error_do_close;
	madvise_sequential( map_ptr, map_size );
	MD4_Update( & ctx, map_ptr, map_size );
	madvise_dontneed( map_ptr, map_size );
	if ( munmap( map_ptr, map_size ) < 0 ) goto md4_error_do_close;
	map_off += map_size;
	digest_bytes += map_size;
    }
    close( fd );
    MD4_Final( (unsigned char *) arg_node->digest, & ctx );
    ++ count_digested;
    return 1;

 md4_error_do_close:
    close( fd );
 md4_error_do_return:
    return -1;
}
#endif /* CROSSLINK_NO_MD4 */


//-----------------------------------------------------------------------------
// function	main
//-----------------------------------------------------------------------------
int
main (
    int		argc
    ,
    char * *	argv
    )
{
    unsigned long long		saved_bytes	;
    unsigned long long		total_bytes	;

    struct stat			stat_buf	;
    struct file_node		anchor_node	;

    AVL				file_tree	;
    FTS *			fts_p		;
    FTSENT *			fts_ent_p	;

    struct file_node *		that_node	;
    struct file_node *		this_node	;
    struct file_node *		match_node	;

    char *			fts_array	[2]	;

    const char *		digest_name	;
    const char *		this_path	;

    time_t			this_time	;
    time_t			prev_time	;

    size_t			len		;

    unsigned long		count_errors	;
    unsigned long		count_failed	;
    unsigned long		count_linked	;
    unsigned long		count_names	;
    unsigned long		count_unique	;

    int				arg_num		;
    int				no_action	;
    int				same_fs		;
    int				verbose		;


    //-----------------------------------------
    // Initialize variables that need it first.
    //-----------------------------------------
    digest_bytes	= 0;
    saved_bytes		= 0;
    total_bytes		= 0;

    count_errors	= 0;
    count_failed	= 0;
    count_linked	= 0;
    count_names		= 0;
    count_unique	= 0;
    no_action		= 0;
    prev_time		= 0;
    same_fs		= 0;

    digest_name		= NULL;
    digest_size		= 0;

    use_perm		= 1;
    use_uid		= 1;
    use_gid		= 1;
    use_time		= 1;

    verbose		= 1;


    //--------------
    // Scan options.
    //--------------
    while ( -- argc && * ++ argv && argv[0][0] == '-' ) {

	if ( argv[0][1] == '-' ) {

	    if ( argv[0][2] == 0 ) break;

#ifndef CROSSLINK_NO_MD5
	    else if ( strcmp( argv[0]+2, "md5" ) == 0 ) {
		if ( ! digest_func ) {
		    digest_func = md5_digest;
		    digest_name = argv[0]+2;
		    digest_size = MD5_DIGEST_LENGTH;
		} else if ( digest_func != md5_digest ) {
		    fprintf( stderr, "Digest method \"%s\" already specified.\n", digest_name );
		    ++ count_errors;
		}
	    }
#endif /* CROSSLINK_NO_MD5 */

#ifndef CROSSLINK_NO_SHA
	    else if ( strcmp( argv[0]+2, "sha1" ) == 0 ) {
		if ( ! digest_func ) {
		    digest_func = sha1_digest;
		    digest_name = argv[0]+2;
		    digest_size = SHA_DIGEST_LENGTH;
		} else if ( digest_func != sha1_digest ) {
		    fprintf( stderr, "Digest method \"%s\" already specified.\n", digest_name );
		    ++ count_errors;
		}
	    }
#endif /* CROSSLINK_NO_SHA */

#ifndef CROSSLINK_NO_RIPEMD
	    else if ( strcmp( argv[0]+2, "ripemd160" ) == 0 ) {
		if ( ! digest_func ) {
		    digest_func = ripemd160_digest;
		    digest_name = argv[0]+2;
		    digest_size = RIPEMD160_DIGEST_LENGTH;
		} else if ( digest_func != ripemd160_digest ) {
		    fprintf( stderr, "Digest method \"%s\" already specified.\n", digest_name );
		    ++ count_errors;
		}
	    }
#endif /* CROSSLINK_NO_RIPEMD */

#ifndef CROSSLINK_NO_MD4
	    else if ( strcmp( argv[0]+2, "md4" ) == 0 ) {
		if ( ! digest_func ) {
		    digest_func = md4_digest;
		    digest_name = argv[0]+2;
		    digest_size = MD4_DIGEST_LENGTH;
		} else if ( digest_func != md4_digest ) {
		    fprintf( stderr, "Digest method \"%s\" already specified.\n", digest_name );
		    ++ count_errors;
		}
	    }
#endif /* CROSSLINK_NO_MD4 */
	}

	//-------------------------------
	// Scan single character options.
	//-------------------------------
	{
	    const char *arg_str;

	    arg_str = argv[0];
	    while ( * ++ arg_str ) {
		switch ( * arg_str ) {
		case 'n':
		    no_action = 1;
		    break;
		case 'p':
		    use_perm = 0;
		    break;
		case 'u':
		    use_uid = 0;
		    break;
		case 'g':
		    use_gid = 0;
		    break;
		case 't':
		    use_time = 0;
		    break;
		case 'q':
		    -- verbose;
		    break;
		case 'v':
		    ++ verbose;
		    break;
		case 'x':
		    same_fs = FTS_XDEV;
		    break;
		default:
		    fprintf( stderr, "Unknown option '%s'\n", argv[0] );
		    ++ count_errors;
		}
	    }
	}
    }

    //------------------------
    // Stop now if any errors.
    //------------------------
    if ( count_errors ) return 1;

    //----------------------------
    // Do setups based on options.
    //----------------------------
    if ( ! digest_func ) {
	digest_func = DEFAULT_DIGEST_FUNC;
	digest_name = DEFAULT_DIGEST_NAME;
	digest_size = DEFAULT_DIGEST_SIZE;
    }

    //------------------------------------------------------
    // The actual size of a node is the size of the struct
    // up to the digest member, plus the size of the digest,
    // plus the path string length to be added in later.
    //------------------------------------------------------
    node_size = digest_size + (size_t) ( (struct file_node *) 0 )->digest;

    //------------------------------------------------------
    // Determine the digesting memory map chunk size to use.
    //------------------------------------------------------
    len = sysconf( _SC_PAGESIZE );
    chunk_size = CHUNK_SIZE / len;
    chunk_size = ( chunk_size == 0 ) ? len : chunk_size * len;

    //-------------------------
    // Initialize the AVL tree.
    //-------------------------
    avl_init( & file_tree, struct file_node, link.avl, (int (*)(avl_link *, avl_link *, AVL * )) cmp_key );

    //------------------------------------------------------
    // Collect all file paths in advance into a linked list.
    //------------------------------------------------------
    if ( verbose > 1 ) fputs( "Collecting .. ", stderr );

    //-- Start the linked list at the anchor dummy node.
    that_node = & anchor_node;

    //-- Go through each name given on the command line.
    fts_array[1] = NULL;
    arg_num = 0;
    for ( ; argc ; -- argc, ++ argv ) {
	++ arg_num;

	//-- Make sure the specified name exists.
	if ( lstat( argv[0], & stat_buf ) < 0 ) {
	    fprintf( stderr, "%s: %s\n", argv[0], strerror( errno ) );
	    ++ count_errors;
	    continue;
	}

	//-- Skip anything that is not a file or directory.
	if ( ! S_ISREG( stat_buf.st_mode ) && ! S_ISDIR( stat_buf.st_mode ) ) continue;

	//-- Start a new file tree for this argument.
	fts_array[0] = argv[0];
	if ( ! ( fts_p = fts_open( fts_array, same_fs, NULL ) ) ) {
	    fprintf( stderr, "fts_open( [\"%s\",NULL], %d, NULL ): %s\n", fts_array[0], same_fs, strerror( errno ) );
	    continue;
	}

	//-- Walk through the tree and collect file name paths.
	while ( ( fts_ent_p = fts_read( fts_p ) ) ) {

	    //-- If not a regular file, skip it.
	    if ( ! ( S_ISREG( fts_ent_p->fts_statp->st_mode ) ) ) continue;

	    //-- Check the path string length.
	    len = strlen( fts_ent_p->fts_path );
	    if ( len > PATH_MAX ) {
		fprintf( stderr, "file path too long (%lu > %lu): %s\n",
			 (unsigned long) len, (unsigned long) PATH_MAX, fts_ent_p->fts_path );
		continue;
	    }

	    //-- Allocate space for a node.
	    len += node_size + 1;
	    if ( ! ( this_node = malloc( len ) ) ) {
		fprintf( stderr, "malloc( %lu ): %s\n", (unsigned long) len, strerror( errno ) );
		return 1;
	    }

	    //-- Fill in the node, except for the digest.
	    this_node->fn_dev	= fts_ent_p->fts_statp->st_dev;
	    total_bytes +=
	    this_node->fn_size	= fts_ent_p->fts_statp->st_size;
	    this_node->fn_mode	= fts_ent_p->fts_statp->st_mode;
	    this_node->fn_mtime	= fts_ent_p->fts_statp->st_mtime;
	    this_node->fn_uid	= fts_ent_p->fts_statp->st_uid;
	    this_node->fn_gid	= fts_ent_p->fts_statp->st_gid;
	    this_node->flag	= 0;
	    strcpy( node_path( this_node ), fts_ent_p->fts_path );

	    //-- Link this node into the list.
	    that_node->link.sll = this_node;
	    that_node = this_node;
	    ++ count_names;

	    //-- Output progress.
	    if ( verbose > 1 ) {
		this_time = time( NULL );
		if ( this_time > prev_time ) {
		    fprintf( stderr, "\rCollecting .. total:%8lu", count_names );
		    prev_time = this_time;
		}
	    }
	}

	//-- Clean up this completed file tree walk.
	fts_close( fts_p );

    }

    //-- Last node has a NULL link so we know it is last.
    that_node->link.sll = NULL;

    //-- Finish the file collection message.
    if ( verbose > 1 ) {
	fprintf( stderr, "\rCollecting .. total:%8lu\n", count_names );
    }

    //-- Start file matching progress message.
    if ( verbose == 2 ) {
	fputs( "Matching .... ", stderr );
    }

    //---------------------------------------
    // Scan all files for duplicate contents.
    //---------------------------------------
    that_node = anchor_node.link.sll;
    while ( ( this_node = that_node ) ) {
	that_node = this_node->link.sll;

	this_path = node_path( this_node );
	if ( verbose > 2 ) fputs( this_path, stdout );

	//-- Search for a file with the exact same key.
	if ( ( match_node = avl_find_exact( & file_tree, this_node ) ) ) {
	    if ( verbose > 2 ) printf( " => %s\n", node_path( match_node ) );

	    //-- Matching file is found, so unlink old file and relink to common one.
	    if ( ! no_action ) {
		if ( unlink( this_path ) < 0 ) {
		    ++ count_failed;
		    continue;
		}
		if ( link( node_path( match_node ), this_path ) < 0 ) {
		    ++ count_failed;
		    continue;
		}
	    }
	    //-- Do not bother to free the node.
	    ++ count_linked;
	    saved_bytes += this_node->fn_size;
	} else {
	    if ( verbose > 2 ) fputs( " [new]\n", stdout );

	    //-- Matching file not found, so insert this file node in tree.
	    avl_insert( & file_tree, this_node );
	    ++ count_unique;
	}

	//-- Output progress.
	if ( verbose == 2 ) {
	    this_time = time( NULL );
	    if ( this_time > prev_time ) {
		fprintf( stderr, "\rMatching .... total:%8lu  unique:%7lu  linked:%7lu",
			 count_unique + count_linked, count_unique, count_linked );
		prev_time = this_time;
	    }
	}

	if ( verbose > 0 ) fflush( stdout );
    }
    if ( verbose == 2 ) {
	fprintf( stderr, "\rMatching .... total:%8lu  unique:%7lu  linked:%7lu\n",
		 count_unique + count_linked, count_unique, count_linked );
    }

    //--------------------------------
    // Output statistics if requested.
    //--------------------------------
    if ( verbose > 0 ) {
	unsigned long long	percentage	;
	unsigned long		count_total	;

	count_total = count_failed + count_linked + count_unique;

	fprintf( stderr, "Total number of files:   %11lu\n", count_total );

	if ( count_total ) {

	    if ( count_failed ) {
		percentage = count_failed;
		percentage *= 10000ULL;
		percentage /= count_total;
		fprintf( stderr, "Number of files failed:  %11lu  (%3lu.%02lu%%)\n",
			 count_failed,
			 (unsigned long) ( percentage / 100ULL ),
			 (unsigned long) ( percentage % 100ULL ) );
	    }

	    percentage = count_linked;
	    percentage *= 10000ULL;
	    percentage /= count_total;
	    fprintf( stderr, "Number of files linked:   %10lu  (%3lu.%02lu%%)%s\n",
		     count_linked,
		     (unsigned long) ( percentage / 100ULL ),
		     (unsigned long) ( percentage % 100ULL ),
		     no_action ? " N/A" : "" );

	    percentage = count_unique;
	    percentage *= 10000ULL;
	    percentage /= count_total;
	    fprintf( stderr, "Number of files unique:   %10lu  (%3lu.%02lu%%)\n",
		     count_unique,
		     (unsigned long) ( percentage / 100ULL ),
		     (unsigned long) ( percentage % 100ULL ) );

	    percentage = count_digested;
	    percentage *= 10000ULL;
	    percentage /= count_total;
	    fprintf( stderr, "Number of files digested: %10lu  (%3lu.%02lu%%)\n",
		     count_digested,
		     (unsigned long) ( percentage / 100ULL ),
		     (unsigned long) ( percentage % 100ULL ) );

	    fprintf( stderr, "Total bytes:    %20llu\n", total_bytes );

	    percentage = saved_bytes;
	    percentage *= 10000ULL;
	    percentage /= total_bytes;
	    fprintf( stderr, "Bytes saved:    %20llu  (%3lu.%02lu%%)\n",
		     saved_bytes,
		     (unsigned long) ( percentage / 100ULL ),
		     (unsigned long) ( percentage % 100ULL ) );

	    percentage = digest_bytes;
	    percentage *= 10000ULL;
	    percentage /= total_bytes;
	    fprintf( stderr, "Bytes digested: %20llu  (%3lu.%02lu%%)\n",
		     digest_bytes,
		     (unsigned long) ( percentage / 100ULL ),
		     (unsigned long) ( percentage % 100ULL ) );
	}

    }

    //-------------------------------------------------
    // Why bother deallocating a bunch of tree nodes in
    // a virtual address space that is about to vanish.
    //-------------------------------------------------
    return 0;
}

