/* simmatch.cpp CDocTemplate // // A program used to check for random matches between simulated // languages. // Copyright 1999 by Mark Rosenfelder (markrose@zompist.com) // May be freely used, reproduced, or modified. // // For the logic underlying this model, see my paper // "How likely are chance resemblances between languages?" // at http://www.zompist.com/chance.htm // // The program allows the testing the model by creating two // randomly generated languages, and checking for random matches // between them. // // The user can determine the phonological complexity of the // languages (which indirectly determines the lexicon size), and // can specify the phonetic and semantic leeway allowed for a match. // // The program was written on a Windows system, but I've run it // with no problems on Unix. */ #include "stdio.h" #include "stdlib.h" #include "malloc.h" #include "stdarg.h" #include "string.h" #include "time.h" /******* Simulation parameters *********/ #define nV 5 /* number of vowels */ #define nC 14 /* number of consonants */ #define phonLee 3 /* phonetic leeway */ #define semLee 10 /* semantic leeway */ char form[] = "CVC"; /* phonological structure */ /******* Global variables *********/ int nLex; /* number of words */ int nForm; /* number of letters in form[] */ int *fac; /* used by PosIndex() for w -> form map */ /* Strings for interpreting 'words'. // Adjacent letters represent similar sounds. // The program doesn't know anything about phonetics; // its words are just sequences of numbered phonemes, // with similarity being represented by similar numbers. // These strings are for representing them to the user. // // Add to these if nC or nV is bigger than these strings. */ char *vowel = "aeiuoOUIEA"; char *conso = "ptkgdbvzsTDGxflrwmnNy"; /******* Operational parameters *********/ /* Output words as strings not numbers */ #define SHOW_AS_CHAR /* Print just matches */ #define PRINT_ONLY_MATCHES /* Print the entire lexicon */ /* #define PRINT_LANGUAGES */ #define TRUE 1 #define FALSE 0 /******* Prototypes *********/ int *MakeLanguage( char *name ); void PrintWord( int w, int mng ); int Compare( char *aname, int *a, char *bname, int *b ); int Match( int p, int q, int n, int dist ); int PosIndex( int w, int pos ); /******* main routine *********/ int main(int argc, char* argv[]) { /* Declare variables here because (snif) this isn't C++ */ int i; int *a, *b; /* Seed the random number generator */ srand( (unsigned) time( NULL )); /* Figure out the number of variations per digit */ nLex = 1; nForm = strlen(form); fac = (int *) calloc( nForm, sizeof(int) ); for (i = nForm - 1; i >= 0; i--) { int n = form[i] == 'V' ? nV : nC; if (i == nForm - 1) fac[i] = n; else fac[i] = n * fac[i+1]; } /* # words is the number of variations from the 1st digit on */ nLex = fac[0]; /* Create our two languages */ a = MakeLanguage( "A" ); b = MakeLanguage( "B" ); /* Count the matches */ Compare( "A", a, "B", b ); /* Cleanup */ free(a); free(b); free(fac); printf( "\nPress return to continue\n" ); getchar(); return 1; } /******* MakeLanguage ********* // // Create a fake language according to the current parameters. // // A language conceptually consists of a set of nLex words, each // with a separate meaning. Meanings are represented simply by // numbers from 0 to nLex, randomly distributed among the words. // An array of these meanings is returned by this function. // // The words-- their phonetic shapes-- are not actually stored, // but re-created on demand. We simply assume that there's a word // for every possible phonetic shape, listed in a standard order. // // The possible shapes are given by form[], and the order is given // by the vowel[] and conso[] arrays. // // Then we simply need some rules for figuring out the actual // form for word i. These are given by the PosIndex routine. */ int *MakeLanguage( char *name ) { /* Allocate space for the language */ int *lg = (int *) calloc( nLex, sizeof(int) ); int w; /* To allocate the nLex meanings automatically, we keep a list // of unused meanings and use them randomly, one at a time. // This list of course starts out with all nLex possibilities. */ int *meanings = (int *) calloc(nLex, sizeof(int)); int meanLeft = nLex; for (w = 0; w < nLex; w++) meanings[w] = w; /* Now generate nLex words */ for (w = 0; w < nLex; w++) { /* If there's only one meaning left, just grab it */ if (meanLeft == 1) lg[w] = meanings[0]; /* Otherwise, pick randomly from the remaining meanings */ else { int whichMean = rand() % meanLeft; lg[w] = meanings[whichMean]; /* The selected meaning is replaced with the last // meaning in the list, which thus shrinks by 1. */ meanings[whichMean] = meanings[meanLeft - 1]; meanLeft--; } } free( meanings ); #ifdef PRINT_LANGUAGES printf( "\nLanguage %s\n", name ); for (w = 0; w < nLex; w++) { PrintWord( w, lg[w] ); printf( " " ); } printf( "\n" ); #endif return lg; } /******* PosIndex ********* // // Given word w, this word will generate its phonetic shape. // // It should be called one character at a time, from 0 to nForm - 1. // It returns an integer representing which vowel or consonant // word w has. // // Suppose form[] is CVC. As we go down the list of words, the // last character (nForm - 1) just increments from 0 to nC // over and over again-- so it's always w mod nC. // // The character before it starts out at 0 for the first cycle, // 1 for the next, and so on-- changing every nC words. // // The character before *that* does the same, but changes only as // the following characters finish their entire cycle-- nC * nV. // // So the general formula for all but the last character is that // we divide by the size of the following characters' cycle. // We figured that out in main() and stored it in fac[]. */ int PosIndex( int w, int pos ) { int n = form[pos] == 'C' ? nC : nV; if (pos == nForm - 1) return w % n; else return (w / fac[ pos + 1 ]) % n; } /******* PrintWord ********* // // A routine to print out the phonetic shape of word w, as well // as its meaning. It uses PosIndex to figure out the phonetics. */ void PrintWord( int w, int mng ) { int pos; for (pos = 0; pos < nForm; pos++) { int ix = PosIndex( w, pos ); char c = form[pos] == 'C' ? conso[ix] : vowel[ix] ; # ifdef SHOW_AS_CHAR printf( "%c", c ); # else printf( "%i", ix ); # endif } if (nForm > 2) printf( "." ); printf( "%2i", mng ); } /******* Match ********* // // Determine whether two phonetic or semantic values match, // to within the specified leeway. // // (Remember that both sounds and meanings are represented, in this // simulation, by simple integers.) // // Value q is considered to match value p if it falls within the // range p - dist/2 ... p + dist/2. (We adjust the bottom range // upward one if the leeway is even.) // // The values OVERLAP at the ends. Otherwise, there would be less // matches than expected values near the ends of their range. // // Obviously a more sophisticated calculation could be used-- e.g. // use a multi-dimensional model of phonological space. But // a model where we just specify how many sounds each sound can // match is a good approximation to this. */ int Match( int p, int q, int n, int dist ) { /* Fast check for exact matches */ if (p == q) return TRUE; else { int i; int d2 = dist / 2; int odd = dist % 2 == 1; /* Check each value in the target range */ for (i = -d2 + (odd ? 0 : 1); i <= d2; i++) { int comparand = q + i; /* Make the ends of the range of values overlap */ if (comparand >= n) comparand -= n; else if (comparand < 0) comparand += n; if (p == comparand) return TRUE; } } return FALSE; } /******* Compare ********* // // Compare two languages, reporting the number of matches. // // Two numbers are reported: // // * the "raw" number of matches, corresponding to the 'naive' // match probability of phonLee * semLee. There can be more than // one match in B for each word in A, this way. // // * a number restricted to one per word, corresponding to the // corrected probability 1 - ((1 - p)^m) discussed later in the paper. // Here, there is no more than one match per word in A. // // This match routine corresponds to the methodology of checking for // a match in B for each word in language A. // // This can lead to using the same word in B more than once. This // doesn't bother some of the people who amass apparent coincidences, // and if we're testing their claims we must use the same methodology. // // So, we return two sets of results, allowing and disallowing // the use of the same word in B more than once. */ int Compare( char *aname, int *a, char *bname, int *b ) { int w, v, i; int nMatch = 0; int nTMatch = 0; int nMatch1 = 0; int nTMatch1 = 0; /* Arrays used to store phonetic realizations of words */ int *apos = (int *) calloc( nForm, sizeof(int) ); int *bpos = (int *) calloc( nForm, sizeof(int) ); /* Array that remembers words in B we already used */ int *used = (int *) calloc( nLex, sizeof(int) ); for (v = 0; v < nLex; v++) used[v] = FALSE; printf( "Comparing languages %s and %s\n", aname, bname ); printf( "Format: word-in-A > phonetic-matches-in-B\n" ); printf( "Semantic matches indicated by +\n" ); printf( "Words in B already used marked by X\n" ); /* For each word in A... */ for (w = 0; w < nLex; w++) { int printedA = FALSE; int subMatch = 0; int subMatch1 = 0; #ifndef PRINT_ONLY_MATCHES printf( "\n" ); PrintWord( w, a[w] ); printf( " > " ); printedA = TRUE; #endif /* Find its phonetic form (as an integer array) */ for (i = 0; i < nForm; i++) apos[i] = PosIndex( w, i ); /* Check it against each word in B */ for (v = 0; v < nLex; v++) { int cont = TRUE; /* Check the characters for a phonetic match, 1 at a time */ for (i = 0; cont && i < nForm; i++) { int n = form[i] == 'C' ? nC : nV; bpos[i] = PosIndex( v, i ); if (!Match( apos[i], bpos[i], n, phonLee )) cont = FALSE; } /* Did all the characters match (within the leeway)? */ if (cont) { /* Yes, check for a semantic match. */ int matches = Match( a[w], b[v], nLex, semLee); /* Increment #'s of matches // There's a separate count of only-use-B-once matches. */ if (matches) { if (!used[v]) subMatch1++; subMatch++; } /* Report all phonetic matches, or just semantic ones */ #ifdef PRINT_ONLY_MATCHES if (matches) #endif { if (!printedA) { printf( "\n" ); PrintWord( w, a[w] ); printf( " > " ); printedA = TRUE; } PrintWord( v, b[v] ); printf( "+" ); if (used[v]) printf( "X" ); printf( " " ); } /* Mark this word used // We simple-mindedly keep this as an unsorted list */ if (matches) used[v] = TRUE; } } /*We're done with word w. Increment our subtotals */ nMatch += subMatch ? 1 : 0; nTMatch += subMatch; nMatch1 += subMatch1 ? 1 : 0; nTMatch1 += subMatch1; } /* Report the results */ printf( "\n\nMatches in %i words:\n", nLex ); printf( "A matches B matches allowed: \n" ); printf( "allowed: >1 per word 1 per word\n" ); printf( "----------- ----------- ----------\n" ); printf( ">1 per word %8i %8i \n", nTMatch, nTMatch1 ); printf( " 1 per word %8i %8i \n", nMatch, nMatch1 ); /* Cleanup */ free(apos); free(bpos); free(used); return nMatch; }