/******************************************************************
 *
 *  compareLeafAncestors.c
 *
 *  Geoffrey Sampson
 *
 *  Department of Informatics, Sussex University
 *  www.grsampson.net
 *
 *  23 Mar 2005
 *
 *
 ******************************************************************/

/******************************************************************
 *
 *  [added 14 Mar 2006:]
 *
 *  Since writing this program, I have come to realize that it
 *  is very inefficient.  When comparing alternative lineages for
 *  a word (see below for what that means), it seeks the best-scoring
 *  mapping from some subset of the labels in one lineage into a
 *  same-sized subset of labels in the other lineage, by starting
 *  with all possible size-1 subsets, going on to all possible size-2
 *  subsets, and so on up to the entire shorter lineage.  When trees
 *  are "deep" (many nodes between root and leaves), this can take
 *  a very long time.  Since in realistic applications lineage-pairs
 *  are usually similar even when they are not identical, a much
 *  better approach is to begin with the largest possible subsets
 *  and work downwards, stopping when the score for some mapping
 *  between size-N subsets is greater than or equal to what the 
 *  score would potentially be for a perfect match between 
 *  size-(N-1) subsets -- there is no point in considering smaller
 *  subsets and it saves a very great deal of processing if one does
 *  not do so.
 *
 *  For my private use I have kludged up a modified program that
 *  works that way, and I intend to place it here when I can find 
 *  time to polish it into a state fit for publication.  (That may
 *  not be soon, though!)
 *
 *  [added 15 Nov 2006:]
 *
 *  Although the above remarks remain true, they have now been
 *  overtaken by work done by a user of the program, 
 *  Derrick Higgins of the Educational Testing Service,
 *  Princeton, New Jersey.  Derrick has adapted my software to 
 *  improve efficiency in a different way from the way discussed
 *  above.  His version of the program is (by his kind permission)
 *  also available from this same site, under the name
 *  compareLeafAncestorsDH.c -- I recommend that users would in
 *  future be well advised to use Derrick Higgins's version of the
 *  software in preference to this original version, which I leave
 *  on the ftp server merely because it is the only version for which
 *  I can personally vouch.
 *
 ******************************************************************/


/************************************************************************
 *
 *
 *  This program written in ANSI standard C implements the "leaf-ancestor
 *  assessment" metric for quantifying the accuracy with which a string has
 *  been parsed.  That metric was defined in G.R. Sampson, "A proposal for
 *  improving the measurement of parse accuracy", International Journal of
 *  Corpus Linguistics 5.53-68, 2000; a later paper, G.R. Sampson & Anna
 *  Babarczy, "A test of the leaf-ancestor metric for parse accuracy",
 *  Journal of Natural Language Engineering 9.365-80, 2003, gave empirical
 *  evidence that it is a suitable metric for the purpose, and in particular
 *  that its performance is superior to that of its main rival, the PARSEVAL
 *  or GEIG metric.
 *
 *  Members of the public are very welcome to take copies of this program
 *  and use it for any purpose.  If it is used for published research, then
 *  the program author and his employer, Sussex University, would appreciate
 *  a brief acknowledgement in the relevant publication(s).  Users should
 *  note that no guarantees are offered about absence of bugs, and the
 *  program includes only limited error-trapping for faulty inputs.  
 *  Anyone who finds bugs is warmly encouraged to report these to the
 *  author so that corrections can be made in this file; any such help will
 *  be acknowledged.
 *
 *  The program takes as input two files, one containing a set of
 *  candidate parses (i.e. labelled trees) to be assessed, the other
 *  containing a set of "gold-standard" parses of the same strings.
 *  It produces output in which, for successive strings, a figure is given
 *  for the parse accuracy of each word, and an overall average is given
 *  for the accuracy of the entire parse-tree.  (The output also shows the
 *  candidate and gold-standard label-lineages for the individual words,
 *  to help analysts track down what is going wrong when parse scores are
 *  poor.)  The term "word" is used here for whatever elements of the
 *  parsed strings are associated with the leaf nodes of parse-trees:
 *  these will commonly be words in the ordinary sense, though alternatively
 *  they might be individual characters (if grouping characters into "tokens"
 *  is itself treated as part of the parsing task), and furthermore "parsing"
 *  is relevant to domains other than language, where the concept of "word"
 *  may not apply but the leaf-ancestor metric can still be used.
 *
 *  The command line takes two parameters; if the program is compiled in the current
 *  directory as -compareLeafAncestors-, a typical invocation of it would be:
 *
 *       ./compareLeafAncestors candFile goldFile
 *
 *  where -candFile- and -goldFile- name the files containing candidate and
 *  gold-standard trees respectively.  Each of these files contains a
 *  succession of parse-trees, each tree being shown as a numeral (to
 *  identify the string parsed) followed after whitespace by a sequence in
 *  which each item is one of the following, separated by whitespace:
 *
 *  - a left-square-bracket immediately followed by a bracket-label
 *
 *  - an element of the parsed string
 *
 *  - an (unlabelled) right-square-bracket
 *
 *  For human consumption it is convenient to set files out with separate trees
 *  on separate line(s), beginning with the string number, but this format is not
 *  required; all whitespace is alike to the program (it locates the end of a tree
 *  by finding a closing bracket which balances the opening bracket at the start
 *  of the tree).  There is no requirement for the string-numbers to be continuous
 *  or in ascending order, so long as the sequence of numbers in the
 *  candidate file matches the sequence in the gold-standard file.  Output
 *  is sent to stdout unless redirected.
 *
 *  As an example, the two following short specimen input files produce the output shown
 *  below them.
 *
 *  CANDIDATE FILE:
 *       
 *  1
 *  [S [NP [NP The Fulton ] County Grand Jury ] said Friday
 *  [S [NP an investigation [PP of [NP [NP Atlanta 's ] recent primary election ] ] ]
 *   produced [NP no evidence [S that [NP any irregularities ] took place ] ] ] ]
 * 
 *  2
 *  [S However , [NP the jury ] said
 *  [S it believes [NP these two ]
 *   [S offices should be combined
 *    [VP to
 *     [VP achieve [N1 greater efficiency ]
 *      [VP and reduce [NP the cost [PP of administration ] ] ] ] ] ] ] ]
 *
 *  3
 *  [S [N1 Implementation [PP of [NP [NP Georgia 's ] automobile title law ] ] ] was
 *  also recommended [PP by [NP the outgoing jury ] ] ]
 *  
 *  
 *
 *  GOLD-STANDARD FILE:
 *       
 *  1
 *  [S [NP The [N1 Fulton County ] Grand Jury ] said  Friday  [NP [NP an investigation 
 *  [PP of [N1 [N1  Atlanta  's ] recent primary election ] ] ] produced  
 *  [NP no evidence  [S that [S any irregularities ] took  place  ] ] ] ]  
 *  
 *  2
 *  [S  However  , [NP the jury ] said [S  it  believes  [S [NP these two offices ] 
 *  should be combined [VP to achieve [N1 greater efficiency ] [VP and reduce [NP the cost 
 *  [PP of administration ] ] ] ] ] ]  ]  
 *  
 *  3
 *  [S [N1 Implementation [PP of [NP [NP  Georgia  's ] [N1 automobile title ] law ] ] ]  
 *  was  also  recommended [PP by [NP the outgoing jury ] ] ]  
 *  
 *
 *  
 *  OUTPUT:
 *
 *      0.857	The	 NP NP [ S  :   NP [ S
 *  	0.688	Fulton	 NP ] NP S  :   [ N1 NP S
 *  	0.667	County	 NP S  :   N1 ] NP S
 *  	1.000	Grand	 NP S  :   NP S
 *  	1.000	Jury	 NP ] S  :   NP ] S
 *  	1.000	said	 S  :   S
 *  	1.000	Friday	 S  :   S
 *  	0.750	an	 NP [ S S  :   NP [ NP S
 *  	0.667	investigation	 NP S S  :   NP NP S
 *  	0.800	of	 [ PP NP S S  :   [ PP NP NP S
 *  	0.786	Atlanta	 NP [ NP PP NP S S  :   N1 [ N1 PP NP NP S
 *  	0.786	's	 NP ] NP PP NP S S  :   N1 ] N1 PP NP NP S
 *  	0.750	recent	 NP PP NP S S  :   N1 PP NP NP S
 *  	0.750	primary	 NP PP NP S S  :   N1 PP NP NP S
 *  	0.792	election	 NP PP NP ] S S  :   N1 PP NP ] NP S
 *  	0.500	produced	 S S  :   NP S
 *  	0.750	no	 [ NP S S  :   [ NP NP S
 *  	0.667	evidence	 NP S S  :   NP NP S
 *  	0.800	that	 [ S NP S S  :   [ S NP NP S
 *  	0.667	any	 [ NP S NP S S  :   [ S S NP NP S
 *  	0.667	irregularities	 NP ] S NP S S  :   S ] S NP NP S
 *  	0.750	took	 S NP S S  :   S NP NP S
 *  	0.800	place	 S NP S S ]  :   S NP NP S ]
 *  
 *  Sentence 1: ave. 0.778
 *  
 *  
 *  	1.000	However	 [ S  :   [ S
 *  	1.000	,	 S  :   S
 *  	1.000	the	 [ NP S  :   [ NP S
 *  	1.000	jury	 NP ] S  :   NP ] S
 *  	1.000	said	 S  :   S
 *  	1.000	it	 [ S S  :   [ S S
 *  	1.000	believes	 S S  :   S S
 *  	0.667	these	 [ NP S S  :   NP [ S S S
 *  	0.750	two	 NP ] S S  :   NP S S S
 *  	0.667	offices	 [ S S S  :   NP ] S S S
 *  	1.000	should	 S S S  :   S S S
 *  	1.000	be	 S S S  :   S S S
 *  	1.000	combined	 S S S  :   S S S
 *  	1.000	to	 [ VP S S S  :   [ VP S S S
 *  	0.800	achieve	 [ VP VP S S S  :   VP S S S
 *  	0.923	greater	 [ N1 VP VP S S S  :   [ N1 VP S S S
 *  	0.923	efficiency	 N1 ] VP VP S S S  :   N1 ] VP S S S
 *  	0.923	and	 [ VP VP VP S S S  :   [ VP VP S S S
 *  	0.909	reduce	 VP VP VP S S S  :   VP VP S S S
 *  	0.933	the	 [ NP VP VP VP S S S  :   [ NP VP VP S S S
 *  	0.923	cost	 NP VP VP VP S S S  :   NP VP VP S S S
 *  	0.941	of	 [ PP NP VP VP VP S S S  :   [ PP NP VP VP S S S
 *  	0.941	administration	 PP NP VP VP VP S S S ]  :   PP NP VP VP S S S ]
 *  
 *  Sentence 2: ave. 0.926
 *  
 *  
 *  	1.000	Implementation	 N1 [ S  :   N1 [ S
 *  	1.000	of	 [ PP N1 S  :   [ PP N1 S
 *  	1.000	Georgia	 NP [ NP PP N1 S  :   NP [ NP PP N1 S
 *  	1.000	's	 NP ] NP PP N1 S  :   NP ] NP PP N1 S
 *  	0.800	automobile	 NP PP N1 S  :   [ N1 NP PP N1 S
 *  	0.800	title	 NP PP N1 S  :   N1 ] NP PP N1 S
 *  	1.000	law	 NP PP N1 ] S  :   NP PP N1 ] S
 *  	1.000	was	 S  :   S
 *  	1.000	also	 S  :   S
 *  	1.000	recommended	 S  :   S
 *  	1.000	by	 [ PP S  :   [ PP S
 *  	1.000	the	 [ NP PP S  :   [ NP PP S
 *  	1.000	outgoing	 NP PP S  :   NP PP S
 *  	1.000	jury	 NP PP S ]  :   NP PP S ]
 *  
 *  Sentence 3: ave. 0.971
 *  
 *  
 *
 *  The leaf-ancestor metric assumes (Sampson & Babarczy, p. 370) that one
 *  has chosen a function which specifies the similarity between any pair
 *  of bracket-labels by assigning it a value between 2.0 (the two labels
 *  are identical) and zero (the labels are unrelated).  The simplest 
 *  version of the metric assigns value 2 to identical labels and value 0
 *  to any nonidentical pair of labels; but for many applications it will
 *  be preferable to recognize a concept of partial label-similarity, so
 *  that some label-pairs receive a value intermediate between 0 and 2.
 *  The chosen function is implemented by -matchVal()- (at the end of the
 *  program), and this program uses the very simple partial-similarity metric 
 *  discussed in Sampson & Babarczy, whereby node-labels beginning with
 *  the same character but differing elsewhere are assigned the value 1.5.
 *  It is important to stress that this is only an example.  For the user's
 *  application, some quite different measure of partial label-similarity
 *  might be appropriate -- in which case the user should rewrite
 *  -matchVal()- accordingly.  The only requirement which the rest of the
 *  program imposes on -matchVal()- is that it must always return a value
 *  within the range 2.0 to 0.0, with higher values representing more-
 *  similar pairs of labels.  (In the Sampson & Babarczy paper, the function
 *  was discussed the other way round, as a replacement-cost function, with 
 *  identical labels scoring zero and unrelated labels scoring 2;
 *  and this explains why the value 2 rather than the more obvious 1 is
 *  used -- replacing one label by the other involves deleting one entire 
 *  label plus inserting another entire label.)
 *
 *  A further element of user choice relates to whether the items at either
 *  end of lineages -- root nodes and/or leaf nodes -- are considered
 *  in comparing trees (cf. Sampson, "Improving the measurement ...", p.
 *  57).  In some applications, it may be known in advance that root nodes
 *  are always labelled with the same initial symbol, so that a parse is
 *  entitled to no credit for getting that label correct.  And the data
 *  submitted for parsing will commonly specify the identity of the leaf
 *  nodes, making it inappropriate to give credit for those.  These two
 *  choices are controlled by defining the constants LEAVES_DISREGARDED
 *  and ROOTS_DISREGARDED as true or false.
 *
 ************************************************************************/


/************************************************************************
 *
 *
 *  Data structures:
 *
 *  The lowest node-number in any tree is 1; 0 means no node, e.g.
 *  mother of root node, daughter of leaf node.
 *
 *  A tree is defined by four arrays:
 *    -daughters-, an array of integer sequences terminated by 0, each
 *      sequence is the successive daughter nodes for the respective mother
 *    -mothers-, an array giving the mother node for each node
 *    -labels-, an array of strings showing the words at leaf nodes and
 *      the nonterminal labels for nonterminal nodes
 *    -leafString-, a sequence of integers terminated by 0 giving the
 *      successive leaf nodes.
 *
 *  (Some of these arrays are determined by the others, but it makes the
 *  programming simpler to store them independently.)
 *
 *  In each case, the first dimension in the array identifies which
 *  comparand is represented by the tree, i.e. the GOLD-standard tree
 *  or the CANDidate tree.
 *
 *
 ************************************************************************/

#include <stdio.h>
#include <ctype.h>
#include <string.h>

#define TRUE 1
#define FALSE 0

#define LONGEST_TEXT 150000
#define LAST_S_NO 500
#define LONGEST_CHUNK 50
#define MOST_NODES 500
#define GOLD 0
#define CAND 1
#define DEEPEST_LINEAGE 300

#define LEAVES_DISREGARDED TRUE
#define ROOTS_DISREGARDED FALSE

const char nl = '\n';
const char sp = ' ';
const char tab = '\t';
const char zero = '\0';

char goldText[LONGEST_TEXT], candText[LONGEST_TEXT];
int firstNewGoldCh, firstNewCandCh;
char thChunk[LONGEST_CHUNK];
int nxNode[2];

int daughters[2][MOST_NODES][MOST_NODES];
int mothers[2][MOST_NODES];
char labels[2][MOST_NODES][LONGEST_CHUNK];
int leafString[2][MOST_NODES];
char lineages[2][DEEPEST_LINEAGE][LONGEST_CHUNK];

/************************************************************************
 *
 *  -lineages- holds a pair of candidate and gold lineages, in the
 *  form of an array of node-labels
 *
 ************************************************************************/

void crash(char *s, char *t, int errN)
  {
  fprintf(stderr, "\n\n***\n%s\n%s\n", s, t);
  exit(errN);
  }

int main(int argc, char *argv[])
  {
  int i;
  char c;
  void processTrees(void);
  FILE *fp;

  if (argc != 3)
    crash("pattern:  compareLeafAncestors cand-file gold-file", "", 0);
  fp = fopen(argv[1], "r");
  i = 0;
  while ((c = getc(fp)) != EOF)
    {
    candText[i] = c;
    candText[++i] = zero;
    }
  fclose(fp);
  fp = fopen(argv[2], "r");
  i = 0;
  while ((c = getc(fp)) != EOF)
    {
    goldText[i] = c;
    goldText[++i] = zero;
    }
  fclose(fp);
  processTrees();
  return(TRUE);
  }

void processTrees(void)
  {
  int j = 0;
  double aveVal;
  int getNxChunk(int);
  void initializeTree(int);
  void buildTree(int);
  double compareLineages(void);

  firstNewCandCh = 0;
  while (TRUE)
    {
    if (! getNxChunk(CAND))
      {
      printf("\n\nend of file\n");
      exit(TRUE);
      }
    if (atoi(thChunk) != ++j)
      crash("was expecting next sentence no, not ", thChunk, 0);
    initializeTree(CAND);
    buildTree(CAND);
    if (! getNxChunk(GOLD))
      crash("prematurely exhausted gold trees", "", 0);
    if (atoi(thChunk) != j)
      crash("gold sentence no. does not match candidate sentence no.", 
          thChunk, 0);
    initializeTree(GOLD);
    buildTree(GOLD);
    aveVal = compareLineages();
    printf("\nSentence %d: ave. %.3f\n\n\n", j, aveVal);
    }
  }

int getNxChunk(int comparand)
  {
  int c;
  int i;

  i = 0;
  thChunk[i] = zero;
  while (TRUE)
    {
    if (comparand == GOLD)
      {
      c = goldText[firstNewGoldCh];
      ++firstNewGoldCh;
      }
    else if (comparand == CAND)
      {
      c = candText[firstNewCandCh];
      ++firstNewCandCh;
      }
    else
      crash("unrecognized comparand for getNxChunk()", "", FALSE);
    if (c == zero)
      return(FALSE);
    if (isspace(c))
      {
      if (i > 0)
        return(TRUE);
      }
    else
      {
      thChunk[i] = c;
      thChunk[++i] = zero;
      }
    }
  }

void add2IntString(int *s, int n)
  {
  int i = -1;
  
  while (s[++i] != 0)
    ;
  s[i] = n;
  s[++i] = 0;
  }

void showLineage(int compar)
  {
  int n = 0;

  while (lineages[compar][n][0] != zero)
    {
    printf(" %s", lineages[compar][n]);
    ++n;
    }
  }

double compareLineages(void)
  {
  int i;
  int NWords = 0;
  double thLVals;
  double totalLVals = 0.0;
  void makeLineage(int, int), showLineage(int);
  double compareAtALeaf(void);

  for (i = 0; leafString[CAND][i] != 0; ++i)
    { 
    if (strcmp(labels[GOLD][leafString[GOLD][i]], 
        labels[CAND][leafString[CAND][i]]) != 0)
    crash("gold and candidate words do not match", 
        labels[CAND][leafString[CAND][i]], 0);
    makeLineage(CAND, i);
    makeLineage(GOLD, i);
    totalLVals += (thLVals = compareAtALeaf());
    ++NWords;
    printf("\t%.3f\t%s\t", thLVals, labels[CAND][leafString[CAND][i]]);
    showLineage(CAND);
    printf("  :  ");
    showLineage(GOLD);
    putchar(nl);
    }
  return(totalLVals/NWords);
  }

void makeLineage(int comp, int Nleaf)
  {
  int thNode, mum, seenLB, seenRB;
  int i;
  int isFirstSis(int, int), isLastSis(int, int), 
      isLeaf(int, int), isRoot(int, int);
  int nxLineageEl(int);

  lineages[comp][0][0] = zero;
  seenLB = seenRB = FALSE;
  thNode = leafString[comp][Nleaf];
  while (thNode > 0)
    {
    mum = mothers[comp][thNode];
    if (( isRoot(comp, thNode) || ! isFirstSis(comp, thNode)) && ! seenLB)
      {
      seenLB = TRUE;
      if (! isLeaf(comp, thNode))
        {
        i = nxLineageEl(comp);
        strcpy(lineages[comp][i], "[");
        lineages[comp][i + 1][0] = zero;
        }
      }
    if (!((isLeaf(comp, thNode) && LEAVES_DISREGARDED) ||
        (isRoot(comp, thNode) && ROOTS_DISREGARDED)))
      {
      i = nxLineageEl(comp);
      strcpy(lineages[comp][i], labels[comp][thNode]);
      lineages[comp][i + 1][0] = zero;
      }
    if (( isRoot(comp, thNode) || ! isLastSis(comp, thNode)) && ! seenRB)
      {
      seenRB = TRUE;
      if (! isLeaf(comp, thNode))
        {
        i = nxLineageEl(comp);
        strcpy(lineages[comp][i], "]");
        lineages[comp][i + 1][0] = zero;
        }
      }
    thNode = mum;
    }
  }

int nxLineageEl(int comp)
  {
  int i = 0;

  while (lineages[comp][i][0] != zero)
    ++i;
  if (i >= DEEPEST_LINEAGE)
    crash("lineage overflowed", "", 0);
  return(i);
  }

int isRoot(int comp, int node)
  {
  return(mothers[comp][node] == 0);
  }

int isLeaf(int comp, int node)
  {
  return(daughters[comp][node][0] == 0);
  }

int isFirstSis(int comp, int node)
  {
  int m;

  if ((m = mothers[comp][node]) == 0)
    crash("tried checking sister sequence of root", "", 0);
  return(daughters[comp][m][0] == node);
  }

int isLastSis(int comp, int node)
  {
  int m, i;

  if ((m = mothers[comp][node]) == 0)
    crash("tried checking sister sequence of root", "", 0);
  i = 0;
  while (daughters[comp][m][i] > 0)
    ++i;
  return(daughters[comp][m][--i] == node);
  }

void buildTree(int comp)
  {
  int thNode = 0;
  int lastNode;
  int newLeaf;
  int newNode(int);
  int i;
  char c;
  void add2IntString(int *, int);
  int getNxChunk(int);

  nxNode[comp] = 0;
  do
    {
    if (! getNxChunk(comp))
      crash("unexpectedly run out of file", "", 0);
    if (thChunk[0] == '[')
/***********************************************
 * 
 *  if the chunk starts with '[':
 *    - create a new current node labelled with the rest of the chunk
 *    - make it a daughter of the last node
 *    - make that node its mother
 *
 **********************************************/
      {
      lastNode = thNode;
      thNode = newNode(comp);
/***********************************************
 *  copy rest of -thChunk- into -labels[thNode]-
 ***********************************************/
      for (i = 1; (c = thChunk[i]) != zero; ++i)
        {
        labels[comp][thNode][i - 1] = c;
        labels[comp][thNode][i] = zero;
        }
      add2IntString(daughters[comp][lastNode], thNode);
      mothers[comp][thNode] = lastNode;
      }
    else if (strcmp(thChunk, "]") == 0)
/***********************************************
 * 
 *  if the chunk is ']':
 *    - move up to the current node's mother
 *
 **********************************************/
      thNode = mothers[comp][thNode];
    else
/***********************************************
 * 
 *  if the chunk is not a bracket, it must be a leaf:
 *    - create a daughterless node, labelled with the chunk,
 *      as daughter of the current node
 *    - add it to -leafString-
 *
 **********************************************/
      {
      newLeaf = newNode(comp);
      strcpy(labels[comp][newLeaf], thChunk);
      add2IntString(daughters[comp][thNode], newLeaf);
      mothers[comp][newLeaf] = thNode;
      add2IntString(leafString[comp], newLeaf);
      }
    }
  while (thNode > 0);
  }

int newNode(int comp)
  {
  return(++nxNode[comp]);
  }

void initializeTree(int comp)
  {
  int i;

  for (i = 0; i < MOST_NODES; ++i)
    {
    daughters[comp][i][0] = 0;
    mothers[comp][i] = 0;
    labels[comp][i][0] = zero;
    }
  leafString[comp][0] = 0;
  }


/************************************************************************
 *
 *  To run through all possible sequence-preserving mappings from a
 *  subset of the elements of a string A from el 0 to el -lastAEl-
 *  into elements of a string B from el 0 to el -lastBEl-
 *
 *
 *                   _______________________      
 *    chosenAEls    | 1 | 3 | 4 | 6 |   |   |
 *                   _______________________      
 *                                ^
 *                   _______________________      
 *    chosenBEls    | 2 | 4 | 5 | 8 |   |   |
 *                   _______________________    
 *                                ^
 *                                |
 *                            lastChosenEl = 3
 *                 (i.e. 1 less than no. of matched pairs)
 *
 ************************************************************************/



/************************************************************************
 *
 *  First:  to run through all possible choices of between 1 and -lastAEl-
 *  elements in string A:
 *
 ************************************************************************/

int emptyLineages(void)
  {
  if (lineages[GOLD][0][0] == zero || lineages[CAND][0][0] == zero)
    {
    if (lineages[GOLD][0][0] == zero && lineages[CAND][0][0] == zero)
      return(2);
    else
      return(1);
    }
  return(0);
  }


double compareAtALeaf(void)
  {
  int lastChosenEl, finished, i, j, iVal, lastAEl, lastBEl;
  int k, l, kVal, thisAChoiceFinished;
  int el;
  int nxLineageEl(int);
  int chosenAEls[DEEPEST_LINEAGE], chosenBEls[DEEPEST_LINEAGE];
  int emptyLineages(void);
  double matchVal(int, int);
  double bestVal, thMappingVal;

  lastAEl = nxLineageEl(CAND) - 1;
  lastBEl = nxLineageEl(GOLD) - 1;
  bestVal = 0.0;
  el = emptyLineages();
  if (el == 2)
    bestVal = 1.0;
  finished = (el > 0);
  lastChosenEl = -1;
  while (! finished)
    {
    i = lastChosenEl;
    while (! (i < 0 || ((iVal = chosenAEls[i]) < lastAEl &&
        (i == lastChosenEl || chosenAEls[i + 1] - iVal > 1))))
      --i;
    if (i < 0)
      {
      ++lastChosenEl;
      if (lastChosenEl > lastAEl || lastChosenEl > lastBEl)
        finished = TRUE;
      i = 0;
      chosenAEls[i] = 0;
      iVal = -1;
      }
    if (! finished)
      {
      for (j = i; j <= lastChosenEl; ++j)
        chosenAEls[j] = ++iVal;
/************************************************************************
 *
 *  Now:  to run through all possible choices of -lastChosenEl-
 *  elements in string B to match against the respective choices in string A
 *
 ************************************************************************/
      for (k = 0; k <= lastChosenEl; ++k)
        chosenBEls[k] = k;
      chosenBEls[lastChosenEl] = lastChosenEl - 1;
      thisAChoiceFinished = FALSE;
      while (! thisAChoiceFinished)
        {
        k = lastChosenEl;
        while (! (k < 0 || ((kVal = chosenBEls[k]) < lastBEl &&
            (k == lastChosenEl || chosenBEls[k + 1] - kVal > 1))))
          --k;
        if (k < 0)
          thisAChoiceFinished = TRUE;
        else
          {
          for (l = k; l <= lastChosenEl; ++l)
            chosenBEls[l] = ++kVal;
          thMappingVal = 0.0;
          for (j = 0; j <= lastChosenEl; ++j)
            thMappingVal += matchVal(chosenAEls[j], chosenBEls[j]);
          thMappingVal /= (lastAEl + lastBEl + 2);
          if (thMappingVal > bestVal)
            bestVal = thMappingVal;
          }
        }
      }
    }
  return(bestVal);
  }

double matchVal(int s, int t)
  {
  if (strcmp(lineages[CAND][s], lineages[GOLD][t]) == 0)
    return(2.0);
  else if (lineages[CAND][s][0] == lineages[GOLD][t][0])
    return(1.5);
  else
    return(0.0);
  }