//-----------------------------------------------------------------------------
// Copyright © 2003 - Philip Howard - All rights reserved
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//-----------------------------------------------------------------------------
// package	libh/avl
// homepage	http://libh.slashusr.org/
//-----------------------------------------------------------------------------
// author	Philip Howard
// email	libh at ipal dot org
// homepage	http://phil.ipal.org/
//-----------------------------------------------------------------------------
// This file is best viewed using a fixed spaced font such as Courier
// and in a display at least 120 columns wide.
//-----------------------------------------------------------------------------

//-----------------------------------------------------------------------------
// program	wordfreq
//
// purpose	Count the frequency of different word items in the input.
//
//		Demonstrate how single node can be defined to have 2 tree
//		links allowing each instance to be in 2 tree classes at
//		the same time.
//
// syntax	wordfreq ["sepchars"] < infile
//-----------------------------------------------------------------------------
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <libh/avl.h>

#define WORDSIZE	256

//-----------------------------------------------------------------------------
// Define a struct that implements a link for a character string key.
// Define a struct that implements a link for an unsigned long key.
// Define a struct that implements a node to carry both links together.
// Define a function that compares the character string keys in two links.
// Define a function that compares the unsigned long keys in two links.
//-----------------------------------------------------------------------------
struct link_str {
    avl_link		link		;
    char *		key		;
};

struct link_ul {
    avl_link		link		;
    unsigned long	key		;
};

struct node {
    struct link_str	word		;
    struct link_ul	count		;
    char		data	[16]	;
};

static int cmp_str( avl_link * arg_one, avl_link * arg_two, AVL * arg_tree )
{
    return strcmp(
	((struct link_str *)arg_one)->key,
	((struct link_str *)arg_two)->key
    );
}

static int cmp_ul( avl_link * arg_one, avl_link * arg_two, AVL * arg_tree )
{
    return ( ((struct link_ul *)arg_one)->key < ((struct link_ul *)arg_two)->key ) ?
	-1 : ( ((struct link_ul *)arg_one)->key > ((struct link_ul *)arg_two)->key ) ? 1 : 0;
}

//-----------------------------------------------------------------------------
// function	readword
//-----------------------------------------------------------------------------
static
int
readword (
    char *	arg_buffer
    ,
    size_t	arg_length
    ,
    char *	arg_seps
    )
{
    char *	p	;
    char *	q	;
    int		c	;

    //-- Note where the end of the buffer is.
    q = ( p = arg_buffer ) + arg_length - 1;

    //-- Skip all leading separator characters.
    while ( (c = getc( stdin )) != EOF && arg_seps[ c ] ) {
    }

    //-- Get characters until end of word, store until full.
    while ( c != EOF && arg_seps[ c ] == 0 ) {
	if ( p < q ) * p ++ = c;
	c = getc( stdin );
    }
    * p = 0;

    //-- Return 0 if EOF, 1 if a word found.
    return ( c == EOF );
}

//-----------------------------------------------------------------------------
// function	main
//-----------------------------------------------------------------------------
int
main (
    int		argc
    ,
    char * *	argv
    )
{
    AVL			by_word		;
    AVL			by_count	;
    struct link_str	key_word	;
    struct node *	node_p		;
    char		seps		[ 256 ];
    char		word		[ WORDSIZE ];


    //------------------------------------------------------------------------
    // Initialize the separator table that is given to readword().  This table
    // tells it what characters separate each word.  Control characters are
    // always placed in the table.  If no explicit characters are given in the
    // third argument then all space and punctuation characters are added by
    // default.  Otherwise just the given characters are added.
    //------------------------------------------------------------------------
    {
	int i;

	for ( i = 0; i < 256; ++ i ) seps[ i ] = iscntrl( i );
	if ( argc > 1 && argv[ 1 ] ) {
	    char * p;
	    p = argv[ 1 ];
	    while ( * p ) seps[ 255 & * p ++ ] = 1;
	} else {
	    for ( i = 0; i < 256; ++ i ) {
		if ( ispunct( i ) || isspace( i ) ) seps[ i ] = 1;
	    }
	}
    }

    //------------------------------
    // Initialize both binary trees.
    //------------------------------
    avl_init( & by_word, struct node, word, cmp_str );
    avl_init( & by_count, struct node, count, cmp_ul );

    //-------------------------------------------------------------------
    // Initialize a struct used as the lookup key with the array address.
    //-------------------------------------------------------------------
    key_word.key = word;

    //--------------------------------------------------------------
    // Read every word from standard input to count them one by one.
    //--------------------------------------------------------------
    while ( 0 == readword( word, WORDSIZE, seps ) ) {

        //------------------------------------------------------------
        // If the node is found, increment the count for another word.
        // If not, create a new one with a count of 1 and insert it.
        //------------------------------------------------------------
	if ( ( node_p = avl_find_exact( & by_word, & key_word ) ) ) {
	    node_p->count.key ++;
	} else {
	    if ( ! ( node_p = malloc( sizeof (struct node) - 16 + strlen( word ) + 1 ) ) ) {
		perror( "malloc" );
		return 1;
	    }
	    node_p->count.key = 1;
	    strcpy( ( node_p->word.key = node_p->data ), word );
	    avl_insert( & by_word, node_p );
	}
    }

    //-----------------------------------------------------------------------
    // Step through all the words in the "by word" tree and insert them into
    // the "by count" tree.  Duplicates have to be allowed because many words
    // may have the same count.
    //-----------------------------------------------------------------------
    avl_loop_forward( & by_word, node_p ) {
	avl_insert_dup( & by_count, node_p );
    }

    //---------------------------------------------------------
    // Print all the words with count form the "by count" tree.
    //---------------------------------------------------------
    avl_loop_reverse( & by_count, node_p ) {
	printf( "%11lu %s\n", node_p->count.key, node_p->word.key );
    }

    return 0;
}

