/* opqcp - Makes opaque copies of a group of sources to support a flavor
 * of distribution with the security of an object-only version but the ability
 * to recompile on a variety of machines like a source distribution.
 * 
 * The idea is that there is a lot of information in good sources which is
 * not needed by the compiler or linker, but which conveys the meaning of the
 * program to a programmer.  We try to write the most understandable programs
 * possible; opqcp aspires to translate them into the least understandable.
 * 
 *   Comments, #includes, #define'ed constants and macros - stripped or 
 * 	expanded by pouring the source through the C preprocessor.
 * 
 *   Global symbols - translated into unreadable equivalents or preserved
 * 	if needed to link to the program or function.  A global dictionary
 * 	must have previously been derived from the .o file symbol tables.
 * 	(All C reserved keywords are preserved automatically.)
 * 
 *   Local identifiers - Translated into unreadable equivalents within
 * 	a single source file.  This applies to everything not in the global
 * 	table (variables, typedefs, struct fields, etc.)
 * 
 *   Indentation/Whitespace - The tokens are packed on screen-width lines with
 * 	all whitespace removed except the blanks needed to keep tokens apart.
 * 
 * Arguments (dash args precede file and destdir args.):
 *   -d dictfile - (Optional, may be repeated.) Contains a wordlist that is
 * 	entered into a symbol table of global names.  Lines with just a name
 *  	mark that identifier as a clear global to preserve in the output.  
 *	(All C reserved words are automatically preserved in this way.)
 * 	Lines with a name and its translation separated by a blank character
 * 	denote an opaque global which is translated the same everywhere.
 * 	All other symbols will be locally translated and may be different in
 * 	each source file.
 * 
 *   -Idir - (Optional, may be repeated.) Include directory arguments passed
 * 	on to the C preprocessor.
 * 
 *   -f - Filter mode, just opacifies stdin to stdout.  File and
 * 	directory arguments are ignored and the input must already have
 * 	been run through "cc -E" if preprocessing is desired.
 * 
 *   -t - Token trace (for testing).  Forces filter mode, prints token
 *	types and values.
 * 
 *   filenames.c - (Required except in filter mode, may be repeated.) 
 * 	Source files to obfuscate.
 * 
 *   destdir - (One required as the last argument except in filter mode.)
 *  	Where to put the opaque copies of the source files.
 * 	The copies have the same names as the originals, but opqcp
 * 	refuses to overwrite a file with itself.
 */

#include "misc.h"
#include "symtab.h"
#include "sys/types.h"
#include "sys/stat.h"
#include "ctype.h"

/* Words that are entered into the global dictionary without a translation,
 * so they pass through to the output unchanged.  The base list is from K&R,
 * appendix A, section 2.3; the ANSI C additions are listed after it so that
 * sources using them are not broken by having a keyword renamed.
 */
string c_keywords[] =
{
    "int", "char", "float", "double", "struct", "union", "enum", "long",
    "short", "unsigned", "auto", "extern", "register", "typedef", "static",
    "goto", "return", "sizeof", "break", "continue", "if", "else", "for",
    "do", "while", "switch", "case", "default", "entry", "fortran", "asm",
    "main",			    /* Reserved for main program functions. */
    "void",			    /* Not in K&R, added since then. */
    "const", "signed", "volatile"   /* Added by ANSI C. */
};
/* Number of entries, computed from the array's own element size. */
int n_keywords = sizeof c_keywords / sizeof c_keywords[0];

/* Abort when a token outgrows the token buffer instead of overrunning it. */
static void
token_overflow()
{
    fprintf( stderr, "opqcp: Token too long (limit %d characters).\n",
             BUFSIZ - 1 );
    exit( 1 );
}

/* opqcp main program.
 *
 * Parses the dash arguments (-d dictfile, -Idir, -t, -f), then either
 * filters stdin to stdout (filter mode) or, for each source file argument,
 * pipes the file through "cc -E" and writes a packed, identifier-translated
 * copy under the destination directory given as the last argument.
 *
 * Exit status: 2 after the usage message, 1 on a fatal error (unreadable
 * dictionary, bad destination directory, table or buffer overflow), 0
 * otherwise.  A problem with an individual source file is reported on
 * stderr and that file is skipped.
 *
 * NOTE(review): new_hash_table, new_symbol, find_symbol, fr_hash_table and
 * NEW come from the project headers; their exact contracts are not visible
 * here and are used exactly as the original code used them.
 */
int
main( argc, argv )
int argc;
string argv[];
{
    boolean filter_mode = FALSE, token_trace = FALSE;
    int i, arg_num, length;
    hash_table * globals, * locals;
    id * symbol;
    string *name_ptr, *arg_ptr, arg;
    static char I_args[BUFSIZ] = { '\0' };
    char buffer[BUFSIZ], buffer2[BUFSIZ];
    struct stat stat_buffer;
    FILE * dict_in, * input, * output, * popen();
    dev_t input_dev;
    ino_t input_inode;
    string directory, name, translation, index();

    /* chr must be an int: it holds getc() results, and EOF has to stay
     * distinguishable from every valid character value.
     */
    int chr;
    char token[BUFSIZ], * token_end, string_type;
    char prev_last;	/* Last character written on the current line. */
    int token_length, line_length;
    enum { NONE, SYMBOL, STRING, NUMBER, OTHER }
        token_type, prev_token_type;
    static string token_type_strings[] =
         { "none", "symbol", "string", "number", "other" };
    boolean whitespace;
    /* These characters will not be separated if they occur together. */
    string op_diphthong_chars = "=!<>&|+-*/%&^";

    /* Some simple macros to stream characters through the token recognizer.
     * OUT refuses to write past the end of the token buffer.
     */
#   define NEW_TOKEN ( token_end = token, *token_end = '\0' )
#   define NEXT ( OUT, IN )
#   define OUT ( ( token_end >= token + sizeof token - 1 \
                        ? token_overflow() : (void) 0 ), \
                 *token_end++ = chr, *token_end = '\0' )
#   define IN ( chr = getc( input ) )
#   define NEW_LINE ( line_length = 0 )
    /* True for a character that can be part of a multi-character operator.
     * NUL is excluded explicitly because index() matches the terminator.
     */
#   define IS_OP( c ) \
        ( (c) != '\0' && index( op_diphthong_chars, (c) ) != NULL )

    /* Output lines are broken before they would exceed this many columns. */
#   define OUT_WIDTH 77
    /* Longest name copied into an error message built in a BUFSIZ buffer;
     * leaves room for the fixed text of the longest message.
     */
#   define MSG_NAME_MAX ( BUFSIZ - 40 )

    int local_counter = 0;
#   define NLOCALS 5000
    static char local_names[NLOCALS][6];

    /* Save a lot of hassle by putting the local name strings in a static
     * array where they can be re-used.  The var_value cells of the ids in
     * a hashed symbol table are second class citizens in that they are not
     * freed when the hash table is freed, although the var_name strings are.
     * The names are "l0" through "l4999": at most six bytes with the NUL.
     */
    for ( i = 0; i < NLOCALS; i++ )
        sprintf( local_names[i], "l%d", i );

    /* Start out the global dictionary with the C keywords. */
    globals = new_hash_table( 1000 );
    for ( i=0, name_ptr = c_keywords; i < n_keywords; i++, name_ptr++ )
        new_symbol( *name_ptr, globals );

    /* Process dash arguments.  Note that arg becomes argv[argc], which is
     * NULL, once every argument has been consumed.
     */
    for ( arg_num=1, arg_ptr = &argv[1], arg = *arg_ptr;
          arg_num < argc;
          arg_num++, arg_ptr++, arg = *arg_ptr )
    {
        if ( arg[0] != '-' )
            break;	/* Only do dash args here. */

        /* Process -d (global dictionary) args. */
        if ( strcmp( arg, "-d" ) == 0 )
        {
            if ( ++arg_num >= argc-1 ) break;	/* Ignore -d at the end. */
            arg = *++arg_ptr;

            if ( (dict_in = fopen( arg, "r" )) == NULL )
            {
                sprintf( buffer, "opqcp: Can't open dictionary %.*s",
                         (int) MSG_NAME_MAX, arg );
                perror( buffer );
                exit( 1 );
            }

            while( fgets( buffer, BUFSIZ, dict_in ) != NULL )
            {
                /* Trash the newline fgets puts in the buffer, but only if
                 * it is really there (the last line may lack one).
                 */
                length = strlen( buffer );
                if ( length > 0 && buffer[length-1] == '\n' )
                    buffer[length-1] = '\0';

                if ( buffer[0] == '\0' )
                    continue;		/* Skip blank lines. */

                /* Optional translation is separated from name by a blank. */
                if ( (translation = index( buffer, ' ' )) != NULL )
                    *translation++ = '\0';    /* Terminate the name string. */

                /* Link in a copy of the name string. */
                name = NEW( char, strlen(buffer)+1 );
                strcpy( name, buffer );
                symbol = new_symbol( name, globals );

                /* Translation will be NULL if symbol is to be left alone. */
                if ( translation != NULL )
                {
                    /* Link in a copy of the translation string. */
                    symbol->var_value = NEW( char, strlen(translation)+1 );
                    strcpy( symbol->var_value, translation );
                }
            }
            fclose( dict_in );
        }
        /* Process -Idir (cpp include directory) argument(s). */
        else if ( strncmp( arg, "-I", 2 ) == 0 )
        {
            /* Room is needed for the blank, the argument and the NUL. */
            if ( strlen( I_args ) + strlen( arg ) + 2 > sizeof I_args )
            {
                fprintf( stderr, "opqcp: Too many -I arguments.\n" );
                exit( 1 );
            }
            strcat( I_args, " " );
            strcat( I_args, arg );
        }
        /* Process a -t (token trace) argument. */
        else if ( strcmp( arg, "-t" ) == 0 )
        {
            token_trace = TRUE;		/* For testing. */
            goto filter;		/* Forces filter mode. */
        }
        /* Process a -f (filter mode) argument. */
        else if ( strcmp( arg, "-f" ) == 0 )
        {
        filter:
            filter_mode = TRUE;		/* No-ops all of the file handling. */
            input = stdin;
            output = stdout;
        }
        else
            break; 			/* Out if invalid dash arg. */
    }

    /* Usage message if bad dash flag or no file and dir args . */
    if ( ( arg != NULL && arg[0] == '-' ) ||
         ( !filter_mode && argc-arg_num < 2 ) )
    {
        fprintf( stderr, "%s\n%s\n",
            "usage: opqcp [-d dict]* [-Idir]* [srcfile]+ destdir",
            "  or   opqcp [-d dict]* -f" );
        exit( 2 );
    }

    /* Check that the last argument is a directory. */
    if ( !filter_mode )
    {
        if ( stat( directory = argv[argc-1], &stat_buffer ) == -1 )
        {
            sprintf( buffer, "opqcp: Can't stat directory %.*s",
                     (int) MSG_NAME_MAX, directory );
            perror( buffer );
            exit( 1 );
        }
        /* The file type is a field of st_mode, not a single bit. */
        if ( (stat_buffer.st_mode & S_IFMT) != S_IFDIR )
        {
            fprintf( stderr, "opqcp: %s is not a directory.\n", directory );
            exit( 1 );
        }
    }

    /* Loop through the file arguments. */
    for ( ; filter_mode || arg_num < argc-1; arg_num++, arg_ptr++ )
    {
        if ( filter_mode )
            arg = "(stdin)";
        else
        {
            arg = *arg_ptr;

            /* Before opening the output file, check that it isn't the same
             * as the input file.  (This would result in clearing the file
             * before it was read!)
             */
            if ( stat( arg, &stat_buffer ) == -1 )
            {
                sprintf( buffer, "opqcp: Can't stat input file %.*s",
                         (int) MSG_NAME_MAX, arg );
                perror( buffer );
                continue;		/* Go do next file arg. */
            }
            /* Might as well check that it's a plain file while we're here. */
            if ( (stat_buffer.st_mode & S_IFMT) != S_IFREG )
            {
                fprintf( stderr, "opqcp: %s is not a plain file.\n", arg );
                continue;		/* Go do next file arg. */
            }
            input_dev = stat_buffer.st_dev;
            input_inode = stat_buffer.st_ino;

            /* Make sure the output path ("dir/file") and the preprocessor
             * command ("cc -E" + I_args + " " + file) fit their buffers.
             */
            if ( strlen( directory ) + strlen( arg ) + 2 > sizeof buffer ||
                 strlen( I_args ) + strlen( arg ) + 8 > sizeof buffer2 )
            {
                fprintf( stderr, "opqcp: File name %s is too long.\n", arg );
                continue;		/* Go do next file arg. */
            }

            sprintf( buffer, "%s/%s", directory, arg ); /* Output file path. */
            if ( stat( buffer, &stat_buffer ) == 0 )
            {	       /* We can only stat an already existing output file. */
                /* Check for inode collision to guard against overwriting. */
                if ( stat_buffer.st_dev == input_dev &&
                     stat_buffer.st_ino == input_inode )
                {
                    fprintf( stderr, "opqcp: Can't copy file %s to itself.\n",
                             arg );
                    continue;		/* Go on to next file arg. */
                }
                /* Check that it's a plain file while we're here. */
                if ( (stat_buffer.st_mode & S_IFMT) != S_IFREG )
                {
                    fprintf( stderr, "opqcp: %s is not a plain file.\n",
                             buffer );
                    continue;		/* Go on to next file arg. */
                }
            }

            /* Open the opaque source file on the destination directory. */
            if ( (output = fopen( buffer, "w" )) == NULL )
            {
                sprintf( buffer, "opqcp: Can't write file %.*s",
                         (int) MSG_NAME_MAX, arg );
                perror( buffer );
                continue;		/* Go on to the next file. */
            }

            /* Copyright notice. */
            fputs( "/* Licensed material, Copyright ", output );
            fputs( "(c) 1985, University of Utah. */\n", output );

            /* Read the source file through the C preprocessor.
             * NOTE(review): the file name reaches the shell unquoted, so
             * names containing shell metacharacters will misbehave.
             */
            sprintf( buffer2, "cc -E %s %s", I_args, arg );
            if ( (input = popen( buffer2, "r" )) == NULL )
            {
                sprintf( buffer2, "opqcp: Couldn't start cpp on %.*s.",
                         (int) MSG_NAME_MAX, arg );
                perror( buffer2 );
                fclose( output );	/* It was already opened. */
                unlink( buffer );	/* Wipe out the empty output file. */
                continue;		/* Go on to the next file. */
            }
        }

        /* Local dictionary lasts only through a single source file. */
        locals = new_hash_table( 1000 );
        local_counter = 0;

        /* Transfer tokens from the cpp stream to a packed output file. */
        chr = '\n'; /* In effect, the end of the line before the first line. */
        NEW_LINE;			/* Start first line of output. */
        prev_last = '\n';
        prev_token_type = NONE;
        /* Back here at the beginning of each token. */
        while( chr != EOF )
        {
            NEW_TOKEN;		/* Set up to collect a token. */
            token_type = NONE;
            whitespace = FALSE;

            /* Flush whitespace before a token. */
            while( isspace( chr ) )
            {
                whitespace = TRUE;

                if ( chr != '\n' )
                    IN;		/* Character after blank or tab. */
                else
                {
                    IN;		/* First character on a line. */
                    if ( chr == '#' )
                    {
                        /* Flush C preprocessor linenumber statements. */
                        while ( chr != '\n' && chr != EOF )
                            IN;		/* Go until the newline is found. */
                    }
                }
            }

            /* Identifiers, keywords, etc.  Underscore is a letter. */
            if ( isalpha( chr ) || chr == '_' )
            {
                token_type = SYMBOL;

                /* Grab the alpha and then alphanumeric characters. */
                NEXT;
                while ( isalnum( chr ) || chr == '_' ) NEXT;

                /* Look the symbol up in the global dictionary. */
                if ( (symbol = find_symbol( token, globals )) != NULL )
                {
                    /* Substitute a global translation if there is one. */
                    if ( symbol->var_value != NULL )
                    {
                        strcpy( token, symbol->var_value );
                    }
                }
                else
                {
                    /* Look the symbol up in the local dictionary. */
                    if ( (symbol = find_symbol( token, locals )) == NULL )
                    {
                        /* New symbol, assign a local translation.  Names
                         * are handed out starting at "l1"; stop before
                         * running off the end of the name table.
                         */
                        if ( local_counter + 1 >= NLOCALS )
                        {
                            fprintf( stderr,
                                "opqcp: Too many local symbols in %s (limit %d).\n",
                                arg, NLOCALS - 1 );
                            exit( 1 );
                        }
                        name = NEW( char, strlen(token)+1 );
                        strcpy( name, token );
                        symbol = new_symbol( name, locals );
                        symbol->var_value = local_names[++local_counter];
                    }
                    /* Translate symbol to the local equivalent. */
                    strcpy( token, symbol->var_value );
                }
            }
            /* Character constants and strings. */
            else if ( chr == '"' || chr == '\'' )
            {
                token_type = STRING;
                string_type = chr;	/* Remember which kind of quote. */
                NEXT;	  /* Stash quote, get first char of string. */
                /* Stop at EOF too, so an unterminated literal can't hang. */
                while ( chr != string_type && chr != EOF )
                {
                    if ( chr != '\\' )
                        NEXT;    /* Stash non-backslash char, go ahead. */
                    else
                    /* Handle backslash escapes. */
                    {
                        NEXT;   /* Stash backslash, go to escaped char. */
                        if ( chr == EOF )
                            break;	/* Input ended after the backslash. */
                        if ( string_type != '"' || chr != '\n' )
                            NEXT;	/* Stash escaped char, go ahead. */
                        else
                        {
                            /* Weird special case.  Strings can be
                             * continued onto the next line if they
                             * precede the newline with a backslash.
                             */
                            NEXT;	/* Stash escaped newline, go ahead. */
                            if ( strlen(token) + line_length > OUT_WIDTH )
                                fputs( "\n", output ); /* Break before. */
                            /* Put out the string with escaped newline. */
                            fputs( token, output );
                            /* Start a new line and a new token. */
                            NEW_LINE; NEW_TOKEN;
                            prev_last = '\n';
                        }
                    }
                }
                if ( chr != EOF )
                    NEXT;		/* Stash closing quote. */
            }
            /* Numbers.  (This can be a bit simplified over a real
             * lexical analyzer, since it only has to correctly
             * recognize all valid number forms, not detect subtle
             * syntax errors.)
             */
            else if ( isdigit( chr ) )
            {
            number:
                token_type = NUMBER;

                /* Initial numeric part. */
                if ( chr == '0' )
                {  /* Could be an octal or hex constant, or just a "0". */
                    NEXT;
                    if ( chr == 'x' || chr == 'X' )
                    {
                        NEXT;	/* Get first char of hex constant. */
                        while( isdigit( chr ) || chr >= 'a' && chr <= 'f'
                                              || chr >= 'A' && chr <= 'F' )
                            NEXT;	/* Read through hex constant. */
                    }
                    else		/* Octal integer constant. */
                        while ( isdigit( chr ) ) NEXT;
                }
                else
                    /* Decimal integer, or part of a floating pt number. */
                    while ( isdigit( chr ) ) NEXT;

                /* Optional integer suffix, fraction, or exponent. */
                if ( chr == 'l' || chr == 'L' || chr == 'u' || chr == 'U' )
                {
                    /* Integer suffix letters, in any combination. */
                    while ( chr == 'l' || chr == 'L' ||
                            chr == 'u' || chr == 'U' )
                        NEXT;
                }
                else
                {
                    /* Optional fractional part on floats. */
                    if ( chr == '.' )
                    {
                        NEXT;	/* Get first char of fraction. */
                        while ( isdigit( chr ) ) NEXT;
                    }

                    /* Optional exponent. */
                    if ( chr == 'e' || chr == 'E' )
                    {
                        NEXT;	/* Get first char of exponent. */
                        /* Optional sign on exponent. */
                        if ( chr == '+' || chr == '-' ) NEXT;
                        while ( isdigit( chr ) ) NEXT;	/* Exponent. */
                    }

                    /* Optional floating suffix. */
                    if ( chr == 'f' || chr == 'F' ||
                         chr == 'l' || chr == 'L' )
                        NEXT;
                }
            }
            /* Operators and single-character tokens for punctuation. */
            else if( chr != EOF )
            {
                token_type = OTHER;

                /* Dot may be either a leading decimal point on a number,
                 * or a structure access operator. We have a number if the
                 * following character is a digit.
                 */
                if ( chr == '.' )
                {
                    NEXT;
                    if ( isdigit( chr ) )
                        goto number;	/* Yep, leading decimal point. */
                }
                else			/* Everything other than dots. */
                {
                    /* Keep operator diphthongs contiguous. */
                    if ( IS_OP( chr ) )
                        while ( IS_OP( chr ) )
                            NEXT;	/* Collect diphthong characters. */
                    else
                        NEXT;		/* Single-character token. */
                }
            }

            if ( token_trace )
                fprintf( output, "type = %s, token = `",
                    token_type_strings[ (int)token_type ] );

            /* Got a token in the buffer, pack it onto an output line.
             * Nothing is inserted at the very start of a line: there is
             * nothing to separate from, and after a backslash-continued
             * string any inserted character would land inside the string.
             */
            token_length = strlen( token );
            if ( line_length > 0 )
            {
                if ( token_length + line_length > OUT_WIDTH )
                {
                    fputs( "\n", output );	/* Line break. */
                    NEW_LINE;			/* Start next line. */
                }
                /* Need to preserve blanks between symbols or numbers,
                 * before = signs that had one, and between operators that
                 * were written apart, so they don't run together into a
                 * different token (e.g. "- -" must not become "--").
                 */
                else if ( ( (prev_token_type==SYMBOL ||
                             prev_token_type==NUMBER) &&
                            (token_type==SYMBOL || token_type==NUMBER) ) ||
                          ( token_type == OTHER && whitespace &&
                            ( token[0] == '=' ||
                              ( IS_OP( prev_last ) && IS_OP( token[0] ) ) ) ) )
                {
                    fputs( " ", output );
                    line_length++;
                }
            }

            /* Add token to the output stream. */
            fputs( token, output );
            line_length += token_length;

            /* The token is empty at EOF, so guard the last-character test. */
            if ( token_length > 0 )
            {
                prev_last = token[token_length-1];
                if ( prev_last == '=' && isspace( chr ) )
                {
                    fputs( " ", output );	/* Space after = sign. */
                    line_length++;
                    prev_last = ' ';
                }
            }

            if ( token_trace )
                fprintf( output, "'\n" );

            prev_token_type = token_type;
        }
        /* EOF. */
        fputs( "\n", output );		/* End of line at end of file. */

        if ( filter_mode )
            exit( 0 );			/* Done. */
        else
        {
            /* A nonzero status means the preprocessor could not be run or
             * reported errors, so the copy may be incomplete.
             */
            if ( pclose( input ) != 0 )
                fprintf( stderr,
                    "opqcp: Warning: preprocessor failed on %s.\n", arg );
            fclose( output );
            fr_hash_table( locals );
        }
    }					/* Next file. */

    return 0;
}