/* Sound Change Applier */


/*
**	SOUNDS.C
**
**	Sound Change Applier
**
**      Copyright (C) 2000 by Mark Rosenfelder.
**      This program may be freely used and modified for non-commercial purposes.
**      See http://www.zompist.com/sounds.htm for documentation.
*/

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>

#define TRUE 1
#define FALSE 0

static int printRules = 0;
static int bracketOut = 0;
static int printSourc = 1;
static int toScreen   = 1;

#define MAXRULE 200
#define MAXCAT  50


static int  nRule = 0;
static char *Rule[MAXRULE];

static int  nCat = 0;
static char *Cat[MAXCAT];

/*
**	ReadRules
**
**	Read in the rules file *.sc for a given project.
**
**	There are two types of rules: sound changes and category definitions.
**	The former are stored in Rule[], the latter in Cat[].
**
**	The format of these rules is given under Transform().
*/
int ReadRules( char *filestart )
{
	char filename[84];
	char buffer[129];
	char *s;
	int  n;
	FILE *f;
	
	nRule = 0;
	nCat  = 0;
	
	/* Open the file */
	
	sprintf( filename, "%s.sc", filestart );
	
	f = fopen( filename, "r" );
	if (!f)
	{
		printf( "File %s could not be read in.\n\n", filename );
		return(FALSE);
	}
	
	while (fgets( buffer, 129, f))
	{
		if (strlen(buffer))
			buffer[strlen(buffer)-1] = '\0';
		
		s = malloc( strlen(buffer) + 1);
		if (s)
			strcpy( s, buffer );
			
		if (buffer[0] != '*')
		{	
			if      (strchr( buffer, '/' ))
				Rule[nRule++] = s;
			else if (strchr( buffer, '='))
				Cat[ nCat++]  = s;
		}
	}
	
	fclose(f);
	
	if (nCat)
	{
		printf( "%i categories found\n", nCat );
		
		#ifdef PRINT_RULES
		for (n = 0; n < nCat; n++)
			printf( "%s\n", Cat[n] );
			
		printf( "\n" );
		#endif
	}
	else
		printf( "No rules were found.\n\n" );
	
	if (nRule)
	{
		printf( "%i rules found\n", nRule );
		
		#ifdef PRINT_RULES
		for (n = 0; n < nRule; n++)
			printf( "%s\n", Rule[n] );
			
		printf( "\n" );
		#endif
	}
	else
		printf( "No rules were found.\n\n" );
	
	return( nRule );
	
} /*ReadRules*/


/*
**	Divide
**
**	Divide a rule into source and target phoneme(s) and environment.
**	That is, for a rule s1/s2/env
**	create the three null-terminated strings s1, s2, and env.
**
**	If this cannot be done, return FALSE.
*/
int Divide( char *Rule, char **s1, char **s2, char **env )
{
	size_t i;
	static char s1_str[20];
	static char s2_str[20];
	static char ev_str[50];
	
	i = strcspn( Rule, "/" );
	if (i == 0 || i > 19)
		return(FALSE);
		
	strncpy( s1_str, Rule, i );
	s1_str[i] = '\0';
	Rule += i + 1;
	
	i = strcspn( Rule, "/" );
	if (i > 19)
		return(FALSE);
	
	if (i)	
		strncpy( s2_str, Rule, i );
	s2_str[i] = '\0';
	Rule += i + 1;
	
	strcpy( ev_str, Rule );

	*s1  = s1_str;
	*s2  = s2_str;
	*env = ev_str;
	
	return(TRUE);
	
} /*Divide*/


/*
**	TryCat
**
**	See if a particular phoneme sequence is part of any category.
**	(We try all the categories.)
**
**	For instance, if we have 'a' in the source word and 'V' in the
**	structural description, and a category V=aeiou, TryCat returns TRUE,
**	and sets *n to the number of characters to skip.
**	
**	If we had 'b' instead, TryCat would return FALSE instead.
**
**	If no category with the given identification (env) can be found,
**	we return TRUE (continue looking), but set *n to 0.
**
**	Warning: For now, we don't have a way to handle digraphs.
**
**	We also return TRUE if 
*/
int TryCat( char *env, char *word, int *n, int *catLoc )
{
	int c;
	char *catdef;
	
	if (*word == '\0')
		return(FALSE);
		
	for (c = 0; c < nCat; c++)
	{
		if (*env == *Cat[c])
		{
			catdef = strchr( Cat[c], '=' );
			
			if (strchr( catdef + 1, word[0] ))
			{
				*n = 1;
				*catLoc = strchr( Cat[c], word[0] ) - Cat[c];
				return(TRUE);
			}
			else
				return(FALSE);
		}
	}

	*n = 0;
	return(TRUE);	
	
} /*TryCat*/

/*
**	TryRule
**
**	See if a rule s1->s2/env applies at position i in the given word.
**
**	If it does, we pass back the index where s1 was found in the
**	word, as well as s1 and s2, and return TRUE.
**
**	Otherwise, we return FALSE, and pass garbage in the output variables.
*/
int TryRule( char *word, int i, char *Rule, int *n, char **s1, char **s2, char *varRep )
{
	int j, m, cont = 0;
	int catLoc;
	char *env;
	int  optional = FALSE;
	*varRep = '\0';
	
	if (!Divide( Rule, s1, s2, &env ) || !strchr( env, '_' ))
		return(FALSE);
	
	for (j = 0, cont = TRUE; cont && j < strlen(env); j++)
	{
		switch( env[j] )
		{
			case '(':
				optional = TRUE;
				break;

			case ')':
				optional = FALSE;
				break;

			case '#':
				cont = j ? (i == strlen(word)) : (i == 0);
				break;
				
			case '_':
				cont = !strncmp( &word[i], *s1, strlen(*s1) );
				if (cont)
				{
					*n = i;
					i += strlen(*s1);
				}
				else
				{
				  cont = TryCat( *s1, &word[i], &m, &catLoc );
				  if (cont && m)
				  {
					int c;
				    *n = i;
				    i += m;

					for (c = 0; c < nCat; c++)
						if ((*s2)[0] == Cat[c][0] && catLoc < strlen(Cat[c]))
							*varRep = Cat[c][catLoc];
				   }
				   else if (cont)
				     cont = FALSE;
				}
				break;
			
			default:
				cont = TryCat( &env[j], &word[i], &m, &catLoc );
				
				if (cont && !m)
				{
					/* no category applied */
					
					cont = i < strlen(word) && word[i] == env[j];

					m = 1;
				}
				if (cont)
					i += m;

				if (!cont && optional)
					cont = TRUE;
		}
	}
	
	if (cont && printRules)
		printf( "   %s->%s /%s applies to %s at %i\n", 
		*s1, *s2, env, word, *n );
	
	return(cont);

} /*TryRule*/

/*
**	Transform
**
**	Apply the rules to a single word and return the result.
**
**	The rules are stated in the form string1/string2/environment, e.g.
**		f/h/#_V
**	which states that f changes to h at the beginning of a word before a 
**	vowel.
*/
char *Transform( char *input )
{
	       char inword[80];
	static char outword[80];

	char instr[10];
	char *s1, *s2;
	int i;
	int r;
	int n;
	
	strcpy( inword, input );
	
	/* Try to apply each rule in turn */ 
	
	for (r = 0; r < nRule; r++)
	{
		/* Initialize output of this rule to null */
		
		memset( outword, 0, 80 );	
		
		/* Check each position of the input word in turn */
		 
		i = 0;
		while (i < strlen(inword))
		{
			char varRep = 0;

			if (TryRule( inword, i, Rule[r], &n, &s1, &s2, &varRep ))
			{
				/* Rule applies at inword[n] */
				
				if (n)
					strncat( outword, &inword[i], n - i );
			
				if (varRep)
					outword[strlen(outword)] = varRep;
				else if (strlen(s2))
					strcat( outword, s2 );
				
				i = n + strlen(s1);
			} 
			else
			{
				/* Rule doesn't apply at this location */
				
				outword[strlen(outword)] = inword[i++];
			}
		}
		
		/* Output of one rule is input to next one */
		
		strcpy( inword, outword );
	}
	
	/* Return the output of the last rule */
	
	return(outword);
	
} /*Transform*/
	
/*
**	DoWords
**
**	Read in each word in turn from the input file,
**	transform it according to the rules,
**	and output it to the output file.
**
**	This algorithm ensures that word files of any size can be processed.
*/
void DoWords( char *lexname, char *outname )
{
	char filename[84];
	char inword[84];
	int  n = 0;
	FILE *f, *g;
	char *outword;
	
	sprintf( filename, "%s.lex", lexname );
	
	f = fopen( filename, "r" );
	if (!f)
	{
		printf( "File %s could not be read in.\n\n", filename );
		return;
	}
	
	sprintf( filename, "%s.out", outname );
	
	g = fopen( filename, "w" );
	if (!g)
	{
		printf( "File %s could not be created.\n\n", filename );
		fclose(f);
		return;
	}
	
	while (fgets( inword, 129, f))
	{
		n++;
		if (strlen(inword))
			inword[strlen(inword) - 1] = '\0';
		
		outword = Transform(inword);

		if (!printSourc)
		{
			if (toScreen)
				printf(     "%s\n", outword );
			fprintf( g, "%s\n", outword );
		}
		else if (bracketOut)
		{
			if (toScreen)
				printf(     "%s \t[%s]\n", outword, inword );
			fprintf( g, "%s \t[%s]\n", outword, inword );
		}
		else
		{
			if (toScreen)
				printf(     "%s --> %s\n", inword, outword );
			fprintf( g, "%s --> %s\n", inword, outword );
		}
	}
	
	fclose(f);
	fclose(g);
	
	printf( "%i word%s processed.\n", n, n == 1 ? "" : "s" );
	
} /*DoWords*/

/*
**	MAIN ROUTINE
**
**	Ask for name of project
**	Read in rules and input words
**	Apply transformations
**	Output words
**
*/
main( int argc, char **argv )
{
	int once = FALSE;
	char lexicon[65] = "\0";
	char rules[65] = "\0";

	/* Read command line arguments */
	int i;
	for (i = 1; i < argc; i++)
	{
		if (argv[i][0] == '-' && strlen(argv[i]) > 1)
		{
			switch (argv[i][1])
			{
			case 'p': case 'P':   printRules = 1;  break;
			case 'b': case 'B':   bracketOut = 1;  break;
			case 'l': case 'L':   printSourc = 0;  break;
			case 'f': case 'F':   toScreen   = 0;  break;				
			}
		}
		else if (!lexicon[0])
			strcpy( lexicon, argv[i] );
		else
			strcpy( rules, argv[i] );
	}

	once = lexicon[0] && rules[0];
	
	printf( "\nSOUND CHANGE APPLIER\n(C) 1992,2000 by Mark Rosenfelder\nFor more information see www.zompist.com\n\n" );
	
	if (once)
	{
		printf( "Applying %s.sc to %s.lex\n\n", lexicon, rules );

    	if (ReadRules( rules ))
    		DoWords( lexicon, rules );
	}
	else
	{
		int done = FALSE;
		while (!done)
		{
			printf( "\nEnter the name of a LEXICON.\n\n" );
			printf( "For example, enter latin to specify latin.lex.\nEnter q to quit the program.\n-->" );
			
			fgets( lexicon, 65, stdin );

			if (strlen(lexicon))
	    		lexicon[strlen(lexicon) - 1] = '\0';

			if (!strcmp( lexicon, "q" ))
				done = TRUE;
			else
			{
				printf( "Enter the name of a RULES FILE.\n\n" );
				printf( "For example, enter french to specify french.sc.\n" );
				printf( "The output words would be stored in french.out.\n-->" );
			
				fgets( rules, 65, stdin );
				
				if (strlen(rules))
	    			rules[strlen(rules) - 1] = '\0';

    			if (ReadRules( rules ))
    				DoWords( lexicon, rules );
			}
		}
	}
	
	printf( "\nThank you for using the SOUND CHANGE APPLIER!\n" );
	
} /*main*/