/* A program for simulating chance resemblances */


/* simmatch.cpp  CDocTemplate
//
// A program used to check for random matches between simulated
// languages.  
// Copyright 1999 by Mark Rosenfelder (markrose@zompist.com)
// May be freely used, reproduced, or modified.
//
// For the logic underlying this model, see my paper
// "How likely are chance resemblances between languages?"
// at http://www.zompist.com/chance.htm
// 
// The program allows the testing the model by creating two
// randomly generated languages, and checking for random matches
// between them.  
// 
// The user can determine the phonological complexity of the
// languages (which indirectly determines the lexicon size), and
// can specify the phonetic and semantic leeway allowed for a match.
//
// The program was written on a Windows system, but I've run it
// with no problems on Unix.
*/

#include "stdio.h"
#include "stdlib.h"
#include "malloc.h"
#include "stdarg.h"
#include "string.h"
#include "time.h"

/******* Simulation parameters *********/

#define nV         5        /* number of vowels */
#define nC        14        /* number of consonants */
#define phonLee    3        /* phonetic leeway */
#define semLee    10        /* semantic leeway */
char    form[] = "CVC";     /* phonological structure */

/******* Global variables *********/

int nLex;                   /* number of words */
int nForm;                  /* number of letters in form[] */
int *fac;                   /* used by PosIndex() for w -> form map */

/* Strings for interpreting 'words'.  
// Adjacent letters represent similar sounds.
// The program doesn't know anything about phonetics;
// its words are just sequences of numbered phonemes,
// with similarity being represented by similar numbers.
// These strings are for representing them to the user.
//
// Add to these if nC or nV is bigger than these strings.
*/
char *vowel = "aeiuoOUIEA";
char *conso = "ptkgdbvzsTDGxflrwmnNy";

/******* Operational parameters *********/

/* Output words as strings not numbers */
#define SHOW_AS_CHAR         
/* Print just matches */
#define PRINT_ONLY_MATCHES
/* Print the entire lexicon */
/* #define PRINT_LANGUAGES  */

#define TRUE  1
#define FALSE 0

/******* Prototypes *********/

int *MakeLanguage( char *name );
void PrintWord( int w, int mng );
int  Compare( char *aname, int *a, char *bname, int *b );
int  Match( int p, int q, int n, int dist );
int  PosIndex( int w, int pos );

/******* main routine *********/

int main(int argc, char* argv[])
{
	/* Declare variables here because (snif) this isn't C++ */
	int i;
	int *a, *b;

	/* Seed the random number generator */
	srand( (unsigned) time( NULL ));

	/* Figure out the number of variations per digit */
	nLex = 1;
	nForm = strlen(form);
	fac = (int *) calloc( nForm, sizeof(int) );
	for (i = nForm - 1; i >= 0; i--)
	{
		int n = form[i] == 'V' ? nV : nC;
		if (i == nForm - 1)
			fac[i] = n;
		else
			fac[i] = n * fac[i+1];
	}

	/* # words is the number of variations from the 1st digit on */
	nLex = fac[0];

	/* Create our two languages */
	a = MakeLanguage( "A" );
	b = MakeLanguage( "B" );

	/* Count the matches */
	Compare( "A", a, "B", b );

	/* Cleanup */
	free(a); 
	free(b);
	free(fac);

	printf( "\nPress return to continue\n" );
	getchar();
	return 1;
}

/******* MakeLanguage *********
//
// Create a fake language according to the current parameters.
//
// A language conceptually consists of a set of nLex words, each
// with a separate meaning.  Meanings are represented simply by
// numbers from 0 to nLex, randomly distributed among the words.
// An array of these meanings is returned by this function.
//
// The words-- their phonetic shapes-- are not actually stored,
// but re-created on demand.  We simply assume that there's a word
// for every possible phonetic shape, listed in a standard order.
// 
// The possible shapes are given by form[], and the order is given
// by the vowel[] and conso[] arrays.
//
// Then we simply need some rules for figuring out the actual
// form for word i.  These are given by the PosIndex routine.
*/

int *MakeLanguage( char *name )
{
	/* Allocate space for the language */
	int *lg = (int *) calloc( nLex, sizeof(int) );
	int w;

	/* To allocate the nLex meanings automatically, we keep a list
	// of unused meanings and use them randomly, one at a time.
	// This list of course starts out with all nLex possibilities. */
	int *meanings = (int *) calloc(nLex, sizeof(int));
	int meanLeft = nLex;
	for (w = 0; w < nLex; w++)
		meanings[w] = w;

	/* Now generate nLex words */
	for (w = 0; w < nLex; w++)
	{
		/* If there's only one meaning left, just grab it */
		if (meanLeft == 1)
			lg[w] = meanings[0];

		/* Otherwise, pick randomly from the remaining meanings */
		else
		{
			int whichMean = rand() % meanLeft;
			lg[w] = meanings[whichMean];

			/* The selected meaning is replaced with the last
			// meaning in the list, which thus shrinks by 1. */
			meanings[whichMean] = meanings[meanLeft - 1];
			meanLeft--;
		}
	}

	free( meanings );

#ifdef PRINT_LANGUAGES
	printf( "\nLanguage %s\n", name );
	for (w = 0; w < nLex; w++)
	{
		PrintWord( w, lg[w] );
		printf( " " );
	}
	printf( "\n" );
#endif

	return lg;
}

/******* PosIndex *********
//
// Given word w, this word will generate its phonetic shape.
// 
// It should be called one character at a time, from 0 to nForm - 1.
// It returns an integer representing which vowel or consonant
// word w has.
// 
// Suppose form[] is CVC.  As we go down the list of words, the 
// last character (nForm - 1) just increments from 0 to nC 
// over and over again-- so it's always w mod nC.
// 
// The character before it starts out at 0 for the first cycle,
// 1 for the next, and so on-- changing every nC words.
// 
// The character before *that* does the same, but changes only as
// the following characters finish their entire cycle-- nC * nV.
// 
// So the general formula for all but the last character is that
// we divide by the size of the following characters' cycle.
// We figured that out in main() and stored it in fac[].
*/

int PosIndex( int w, int pos )
{
	int n = form[pos] == 'C' ? nC : nV;

	if (pos == nForm - 1)
		return w % n;
	else
		return (w / fac[ pos + 1 ]) % n;
}

/******* PrintWord *********
//
// A routine to print out the phonetic shape of word w, as well
// as its meaning.  It uses PosIndex to figure out the phonetics.
*/

void PrintWord( int w, int mng )
{
	int pos;
	for (pos = 0; pos < nForm; pos++)
	{
		int ix = PosIndex( w, pos );
		char c = form[pos] == 'C' ? conso[ix] : vowel[ix] ;

#       ifdef SHOW_AS_CHAR
			printf( "%c", c );
#       else
			printf( "%i", ix );
#       endif
	}

	if (nForm > 2)
		printf( "." );

	printf( "%2i", mng );
}

/******* Match *********
//
// Determine whether two phonetic or semantic values match,
// to within the specified leeway.
// 
// (Remember that both sounds and meanings are represented, in this
// simulation, by simple integers.)
//
// Value q is considered to match value p if it falls within the
// range p - dist/2 ... p + dist/2.  (We adjust the bottom range
// upward one if the leeway is even.)
//
// The values OVERLAP at the ends.  Otherwise, there would be less
// matches than expected values near the ends of their range.
//
// Obviously a more sophisticated calculation could be used-- e.g.
// use a multi-dimensional model of phonological space.  But 
// a model where we just specify how many sounds each sound can 
// match is a good approximation to this.
*/

int Match( int p, int q, int n, int dist )
{
	/* Fast check for exact matches */
	if (p == q)
		return TRUE;
	else
	{
		int i;
		int d2  = dist / 2;
		int odd = dist % 2 == 1;

		/* Check each value in the target range */
		for (i = -d2 + (odd ? 0 : 1); i <= d2; i++)
		{
			int comparand = q + i;

			/* Make the ends of the range of values overlap */
			if (comparand >= n)
				comparand -= n;
			else if (comparand < 0)
				comparand += n;

			if (p == comparand)
				return TRUE;
		}
	}

	return FALSE;
}

/******* Compare *********
//
// Compare two languages, reporting the number of matches.
//
// Two numbers are reported: 
//
// * the "raw" number of matches, corresponding to the 'naive' 
// match probability of phonLee * semLee.  There can be more than
// one match in B for each word in A, this way.
//
// * a number restricted to one per word, corresponding to the
// corrected probability 1 - ((1 - p)^m) discussed later in the paper.
// Here, there is no more than one match per word in A.
//
// This match routine corresponds to the methodology of checking for
// a match in B for each word in language A.
//
// This can lead to using the same word in B more than once.  This
// doesn't bother some of the people who amass apparent coincidences,
// and if we're testing their claims we must use the same methodology.
//
// So, we return two sets of results, allowing and disallowing
// the use of the same word in B more than once.
*/

int Compare( char *aname, int *a, char *bname, int *b )
{
	int w, v, i;
	int nMatch = 0;
	int nTMatch = 0;
	int nMatch1 = 0;
	int nTMatch1 = 0;

	/* Arrays used to store phonetic realizations of words */
	int *apos = (int *) calloc( nForm, sizeof(int) );
	int *bpos = (int *) calloc( nForm, sizeof(int) );

	/* Array that remembers words in B we already used */
	int *used = (int *) calloc( nLex, sizeof(int) );
	for (v = 0; v < nLex; v++)
		used[v] = FALSE;

	printf( "Comparing languages %s and %s\n", aname, bname );
	printf( "Format: word-in-A > phonetic-matches-in-B\n" );
	printf( "Semantic matches indicated by +\n" );
	printf( "Words in B already used marked by X\n" );

	/* For each word in A... */
	for (w = 0; w < nLex; w++)
	{
		int printedA = FALSE;
		int subMatch = 0;
		int subMatch1 = 0;

#ifndef PRINT_ONLY_MATCHES
		printf( "\n" );
		PrintWord( w, a[w] );
		printf( " > " );
		printedA = TRUE;
#endif

		/* Find its phonetic form (as an integer array) */
		for (i = 0; i < nForm; i++)
			apos[i] = PosIndex( w, i );

		/* Check it against each word in B */
		for (v = 0; v < nLex; v++)
		{
			int cont = TRUE;

			/* Check the characters for a phonetic match, 1 at a time */
			for (i = 0; cont && i < nForm; i++)
			{
				int n = form[i] == 'C' ? nC : nV;
				bpos[i] = PosIndex( v, i );

				if (!Match( apos[i], bpos[i], n, phonLee ))
					cont = FALSE;
			}

			/* Did all the characters match (within the leeway)?  */
			if (cont)
			{
				/* Yes, check for a semantic match. */
				int matches = Match( a[w], b[v], nLex, semLee);

				/* Increment #'s of matches
				// There's a separate count of only-use-B-once matches. */
				if (matches)
				{
					if (!used[v])
						subMatch1++;

					subMatch++;
				}

				/* Report all phonetic matches, or just semantic ones */
#ifdef PRINT_ONLY_MATCHES
				if (matches)
#endif
				{
					if (!printedA)
					{
						printf( "\n" );
						PrintWord( w, a[w] );
						printf( " > " );
						printedA = TRUE;
					}

					PrintWord( v, b[v] );
					printf( "+" );
					if (used[v])
						printf( "X" );
					printf( " " );
				}

				/* Mark this word used
				// We simple-mindedly keep this as an unsorted list */
				if (matches)
					used[v] = TRUE;
			}
		}
		/*We're done with word w.  Increment our subtotals */
		nMatch  += subMatch ? 1 : 0;
		nTMatch += subMatch;
		nMatch1  += subMatch1 ? 1 : 0;
		nTMatch1 += subMatch1;
	}

	/* Report the results  */
	printf( "\n\nMatches in %i words:\n", nLex );
	printf( "A matches     B matches allowed:     \n" );
	printf( "allowed:      >1 per word  1 per word\n" );
	printf( "-----------   -----------  ----------\n" );
	printf( ">1 per word    %8i     %8i \n", nTMatch, nTMatch1 );
	printf( " 1 per word    %8i     %8i \n", nMatch,  nMatch1  );

	/* Cleanup */
	free(apos);
	free(bpos);
	free(used);

	return nMatch;
}