//----------------------------------------------------------------------------- // Copyright © 2003 - Philip Howard - All rights reserved // // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. //----------------------------------------------------------------------------- // package libh/avl // homepage http://libh.slashusr.org/ //----------------------------------------------------------------------------- // author Philip Howard // email libh at ipal dot org // homepage http://phil.ipal.org/ //----------------------------------------------------------------------------- // This file is best viewed using a fixed spaced font such as Courier // and in a display at least 120 columns wide. //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- // program wordfreq // // purpose Count the frequency of different word items in the input. // // Demonstrate how single node can be defined to have 2 tree // links allowing each instance to be in 2 tree classes at // the same time. // // syntax wordfreq ["sepchars"] < infile //----------------------------------------------------------------------------- #include #include #include #include #include #define WORDSIZE 256 //----------------------------------------------------------------------------- // Define a struct that implements a link for a character string key. // Define a struct that implements a link for an unsigned long key. // Define a struct that implements a node to carry both links together. // Define a function that compares the character string keys in two links. // Define a function that compares the unsigned long keys in two links. //----------------------------------------------------------------------------- struct link_str { avl_link link ; char * key ; }; struct link_ul { avl_link link ; unsigned long key ; }; struct node { struct link_str word ; struct link_ul count ; char data [16] ; }; static int cmp_str( avl_link * arg_one, avl_link * arg_two, AVL * arg_tree ) { return strcmp( ((struct link_str *)arg_one)->key, ((struct link_str *)arg_two)->key ); } static int cmp_ul( avl_link * arg_one, avl_link * arg_two, AVL * arg_tree ) { return ( ((struct link_ul *)arg_one)->key < ((struct link_ul *)arg_two)->key ) ? -1 : ( ((struct link_ul *)arg_one)->key > ((struct link_ul *)arg_two)->key ) ? 1 : 0; } //----------------------------------------------------------------------------- // function readword //----------------------------------------------------------------------------- static int readword ( char * arg_buffer , size_t arg_length , char * arg_seps ) { char * p ; char * q ; int c ; //-- Note where the end of the buffer is. q = ( p = arg_buffer ) + arg_length - 1; //-- Skip all leading separator characters. while ( (c = getc( stdin )) != EOF && arg_seps[ c ] ) { } //-- Get characters until end of word, store until full. while ( c != EOF && arg_seps[ c ] == 0 ) { if ( p < q ) * p ++ = c; c = getc( stdin ); } * p = 0; //-- Return 0 if EOF, 1 if a word found. return ( c == EOF ); } //----------------------------------------------------------------------------- // function main //----------------------------------------------------------------------------- int main ( int argc , char * * argv ) { AVL by_word ; AVL by_count ; struct link_str key_word ; struct node * node_p ; char seps [ 256 ]; char word [ WORDSIZE ]; //------------------------------------------------------------------------ // Initialize the separator table that is given to readword(). This table // tells it what characters separate each word. Control characters are // always placed in the table. If no explicit characters are given in the // third argument then all space and punctuation characters are added by // default. Otherwise just the given characters are added. //------------------------------------------------------------------------ { int i; for ( i = 0; i < 256; ++ i ) seps[ i ] = iscntrl( i ); if ( argc > 1 && argv[ 1 ] ) { char * p; p = argv[ 1 ]; while ( * p ) seps[ 255 & * p ++ ] = 1; } else { for ( i = 0; i < 256; ++ i ) { if ( ispunct( i ) || isspace( i ) ) seps[ i ] = 1; } } } //------------------------------ // Initialize both binary trees. //------------------------------ avl_init( & by_word, struct node, word, cmp_str ); avl_init( & by_count, struct node, count, cmp_ul ); //------------------------------------------------------------------- // Initialize a struct used as the lookup key with the array address. //------------------------------------------------------------------- key_word.key = word; //-------------------------------------------------------------- // Read every word from standard input to count them one by one. //-------------------------------------------------------------- while ( 0 == readword( word, WORDSIZE, seps ) ) { //------------------------------------------------------------ // If the node is found, increment the count for another word. // If not, create a new one with a count of 1 and insert it. //------------------------------------------------------------ if ( ( node_p = avl_find_exact( & by_word, & key_word ) ) ) { node_p->count.key ++; } else { if ( ! ( node_p = malloc( sizeof (struct node) - 16 + strlen( word ) + 1 ) ) ) { perror( "malloc" ); return 1; } node_p->count.key = 1; strcpy( ( node_p->word.key = node_p->data ), word ); avl_insert( & by_word, node_p ); } } //----------------------------------------------------------------------- // Step through all the words in the "by word" tree and insert them into // the "by count" tree. Duplicates have to be allowed because many words // may have the same count. //----------------------------------------------------------------------- avl_loop_forward( & by_word, node_p ) { avl_insert_dup( & by_count, node_p ); } //--------------------------------------------------------- // Print all the words with count form the "by count" tree. //--------------------------------------------------------- avl_loop_reverse( & by_count, node_p ) { printf( "%11lu %s\n", node_p->count.key, node_p->word.key ); } return 0; }