

/******************************************************************
 *
 *  compareLeafAncestorsDH.c
 *
 *  Geoffrey Sampson
 *
 *  Department of Informatics, Sussex University
 *  www.grsampson.net
 *
 *  and
 *
 *  Derrick Higgins
 *
 *  Educational Testing Service, Princeton, New Jersey
 *  DHiggins@ETS.ORG
 *
 *  14 Nov 2006
 *
 *
 ******************************************************************/
 
 /******************************************************************
 *
 *  Introduction by Geoffrey Sampson:
 *
 *  This is a modified version, developed by Derrick Higgins of the
 *  Educational Testing Service, Princeton, New Jersey, of the program
 *  compareLeafAncestors.c which I wrote and have been distributing
 *  from this site since 2005.  Although my original code works,
 *  it is inefficient, and with large input files can be unusable in
 *  practice.  In early 2006 I spotted a way to improve the program
 *  logic (discussed in a comment added to the online version of 
 *  compareLeafAncestors.c),
 *  which I implemented for my private use and planned to
 *  implement more cleanly for the publicly-distributed version of
 *  the software.  However, before I managed to do the latter,
 *  Derrick Higgins offered me his version, which uses other
 *  techniques, notably dynamic programming, to increase efficiency.
 *  He has kindly allowed me to distribute his version of the
 *  software from this site.
 *
 *  Derrick tells me that he has used this version of the program
 *  with real-life data of his own, on a scale that made my original
 *  code fall over, and that this modified version runs successfully
 *  without apparent bugs.  I believe, therefore, that any
 *  third-party users would probably be best advised to use this
 *  version and to ignore my original version.  I leave the original
 *  version publicly available simply because it is the only version
 *  for which I have sole responsibility.  It does not seem right
 *  to offer only a software version whose properties I am not
 *  personally in a position to vouch for; however, as said, I
 *  believe that Derrick's version is in fact superior to my original.
 *
 *  Differences between the two versions are identified below by
 *  comments labelled with the initials DCH.  Other comments
 *  are carried over from my original version.
 *
 ******************************************************************/


/************************************************************************
 *
 *
 *  This program written in ANSI standard C implements the "leaf-ancestor
 *  assessment" metric for quantifying the accuracy with which a string has
 *  been parsed.  That metric was defined in G.R. Sampson, "A proposal for
 *  improving the measurement of parse accuracy", International Journal of
 *  Corpus Linguistics 5.53-68, 2000; a later paper, G.R. Sampson & Anna
 *  Babarczy, "A test of the leaf-ancestor metric for parse accuracy",
 *  Journal of Natural Language Engineering 9.365-80, 2003, gave empirical
 *  evidence that it is a suitable metric for the purpose, and in particular
 *  that its performance is superior to that of its main rival, the PARSEVAL
 *  or GEIG metric.
 *
 *  Members of the public are very welcome to take copies of this program
 *  and use it for any purpose.  If it is used for published research, then
 *  the program author and his employer, Sussex University, would appreciate
 *  a brief acknowledgement in the relevant publication(s).  Users should
 *  note that no guarantees are offered about absence of bugs, and the
 *  program includes only limited error-trapping for faulty inputs.  
 *  Anyone who finds bugs is warmly encouraged to report these to the
 *  author so that corrections can be made in this file; any such help will
 *  be acknowledged.
 *
 *  The program takes as input two files, one containing a set of
 *  candidate parses (i.e. labelled trees) to be assessed, the other
 *  containing a set of "gold-standard" parses of the same strings.
 *  It produces output in which, for successive strings, a figure is given
 *  for the parse accuracy of each word, and an overall average is given
 *  for the accuracy of the entire parse-tree.  (The output also shows the
 *  candidate and gold-standard label-lineages for the individual words,
 *  to help analysts track down what is going wrong when parse scores are
 *  poor.)  The term "word" is used here for whatever elements of the
 *  parsed strings are associated with the leaf nodes of parse-trees:
 *  these will commonly be words in the ordinary sense, though alternatively
 *  they might be individual characters (if grouping characters into "tokens"
 *  is itself treated as part of the parsing task), and furthermore "parsing"
 *  is relevant to domains other than language, where the concept of "word"
 *  may not apply but the leaf-ancestor metric can still be used.
 *
 *  The command line takes two parameters; 
 *  if the program is compiled in the current
 *  directory as -compareLeafAncestors-, a typical invocation of it would be:
 *
 *       ./compareLeafAncestors candFile goldFile
 *
 *  where -candFile- and -goldFile- name the files containing candidate and
 *  gold-standard trees respectively.  Each of these files contains a
 *  succession of parse-trees, each tree being shown as a numeral (to
 *  identify the string parsed) followed after whitespace by a sequence in
 *  which each item is one of the following, separated by whitespace:
 *
 *  - a left-square-bracket immediately followed by a bracket-label
 *
 *  - an element of the parsed string
 *
 *  - an (unlabelled) right-square-bracket
 *
 *  For human consumption it is convenient to set files out with separate trees
 *  on separate line(s), beginning with the string number, 
 *  but this format is not
 *  required; all whitespace is alike to the program 
 *  (it locates the end of a tree
 *  by finding a closing bracket which balances 
 *  the opening bracket at the start
 *  of the tree).  There is no requirement 
 *  for the string-numbers to be continuous
 *  or in ascending order, so long as the sequence of numbers in the
 *  candidate file matches the sequence in the gold-standard file.  Output
 *  is sent to stdout unless redirected.
 *
 *  As an example, the two following short specimen input files 
 *  produce the output shown
 *  below them.
 *
 *  CANDIDATE FILE:
 *       
 *  1
 *  [S [NP [NP The Fulton ] County Grand Jury ] said Friday
 *  [S [NP an investigation [PP of [NP [NP Atlanta 's ] recent primary election ] ] ]
 *   produced [NP no evidence [S that [NP any irregularities ] took place ] ] ] ]
 * 
 *  2
 *  [S However , [NP the jury ] said
 *  [S it believes [NP these two ]
 *   [S offices should be combined
 *    [VP to
 *     [VP achieve [N1 greater efficiency ]
 *      [VP and reduce [NP the cost [PP of administration ] ] ] ] ] ] ] ]
 *
 *  3
 *  [S [N1 Implementation [PP of [NP [NP Georgia 's ] automobile title law ] ] ] was
 *  also recommended [PP by [NP the outgoing jury ] ] ]
 *  
 *  
 *
 *  GOLD-STANDARD FILE:
 *       
 *  1
 *  [S [NP The [N1 Fulton County ] Grand Jury ] said  Friday  [NP [NP an investigation 
 *  [PP of [N1 [N1  Atlanta  's ] recent primary election ] ] ] produced  
 *  [NP no evidence  [S that [S any irregularities ] took  place  ] ] ] ]  
 *  
 *  2
 *  [S  However  , [NP the jury ] said [S  it  believes  [S [NP these two offices ] 
 *  should be combined [VP to achieve [N1 greater efficiency ] [VP and reduce [NP the cost 
 *  [PP of administration ] ] ] ] ] ]  ]  
 *  
 *  3
 *  [S [N1 Implementation [PP of [NP [NP  Georgia  's ] [N1 automobile title ] law ] ] ]  
 *  was  also  recommended [PP by [NP the outgoing jury ] ] ]  
 *  
 *
 *  
 *  OUTPUT:
 *
 *      0.857	The	 NP NP [ S  :   NP [ S
 *  	0.688	Fulton	 NP ] NP S  :   [ N1 NP S
 *  	0.667	County	 NP S  :   N1 ] NP S
 *  	1.000	Grand	 NP S  :   NP S
 *  	1.000	Jury	 NP ] S  :   NP ] S
 *  	1.000	said	 S  :   S
 *  	1.000	Friday	 S  :   S
 *  	0.750	an	 NP [ S S  :   NP [ NP S
 *  	0.667	investigation	 NP S S  :   NP NP S
 *  	0.800	of	 [ PP NP S S  :   [ PP NP NP S
 *  	0.786	Atlanta	 NP [ NP PP NP S S  :   N1 [ N1 PP NP NP S
 *  	0.786	's	 NP ] NP PP NP S S  :   N1 ] N1 PP NP NP S
 *  	0.750	recent	 NP PP NP S S  :   N1 PP NP NP S
 *  	0.750	primary	 NP PP NP S S  :   N1 PP NP NP S
 *  	0.792	election	 NP PP NP ] S S  :   N1 PP NP ] NP S
 *  	0.500	produced	 S S  :   NP S
 *  	0.750	no	 [ NP S S  :   [ NP NP S
 *  	0.667	evidence	 NP S S  :   NP NP S
 *  	0.800	that	 [ S NP S S  :   [ S NP NP S
 *  	0.667	any	 [ NP S NP S S  :   [ S S NP NP S
 *  	0.667	irregularities	 NP ] S NP S S  :   S ] S NP NP S
 *  	0.750	took	 S NP S S  :   S NP NP S
 *  	0.800	place	 S NP S S ]  :   S NP NP S ]
 *  
 *  Sentence 1: ave. 0.778
 *  
 *  
 *  	1.000	However	 [ S  :   [ S
 *  	1.000	,	 S  :   S
 *  	1.000	the	 [ NP S  :   [ NP S
 *  	1.000	jury	 NP ] S  :   NP ] S
 *  	1.000	said	 S  :   S
 *  	1.000	it	 [ S S  :   [ S S
 *  	1.000	believes	 S S  :   S S
 *  	0.667	these	 [ NP S S  :   NP [ S S S
 *  	0.750	two	 NP ] S S  :   NP S S S
 *  	0.667	offices	 [ S S S  :   NP ] S S S
 *  	1.000	should	 S S S  :   S S S
 *  	1.000	be	 S S S  :   S S S
 *  	1.000	combined	 S S S  :   S S S
 *  	1.000	to	 [ VP S S S  :   [ VP S S S
 *  	0.800	achieve	 [ VP VP S S S  :   VP S S S
 *  	0.923	greater	 [ N1 VP VP S S S  :   [ N1 VP S S S
 *  	0.923	efficiency	 N1 ] VP VP S S S  :   N1 ] VP S S S
 *  	0.923	and	 [ VP VP VP S S S  :   [ VP VP S S S
 *  	0.909	reduce	 VP VP VP S S S  :   VP VP S S S
 *  	0.933	the	 [ NP VP VP VP S S S  :   [ NP VP VP S S S
 *  	0.923	cost	 NP VP VP VP S S S  :   NP VP VP S S S
 *  	0.941	of	 [ PP NP VP VP VP S S S  :   [ PP NP VP VP S S S
 *  	0.941	administration	 PP NP VP VP VP S S S ]  :   PP NP VP VP S S S ]
 *  
 *  Sentence 2: ave. 0.926
 *  
 *  
 *  	1.000	Implementation	 N1 [ S  :   N1 [ S
 *  	1.000	of	 [ PP N1 S  :   [ PP N1 S
 *  	1.000	Georgia	 NP [ NP PP N1 S  :   NP [ NP PP N1 S
 *  	1.000	's	 NP ] NP PP N1 S  :   NP ] NP PP N1 S
 *  	0.800	automobile	 NP PP N1 S  :   [ N1 NP PP N1 S
 *  	0.800	title	 NP PP N1 S  :   N1 ] NP PP N1 S
 *  	1.000	law	 NP PP N1 ] S  :   NP PP N1 ] S
 *  	1.000	was	 S  :   S
 *  	1.000	also	 S  :   S
 *  	1.000	recommended	 S  :   S
 *  	1.000	by	 [ PP S  :   [ PP S
 *  	1.000	the	 [ NP PP S  :   [ NP PP S
 *  	1.000	outgoing	 NP PP S  :   NP PP S
 *  	1.000	jury	 NP PP S ]  :   NP PP S ]
 *  
 *  Sentence 3: ave. 0.971
 *  
 *  
 *
 *  The leaf-ancestor metric assumes (Sampson & Babarczy, p. 370) that one
 *  has chosen a function which specifies the similarity between any pair
 *  of bracket-labels by assigning it a value between 2.0 (the two labels
 *  are identical) and zero (the labels are unrelated).  The simplest 
 *  version of the metric assigns value 2 to identical labels and value 0
 *  to any nonidentical pair of labels; but for many applications it will
 *  be preferable to recognize a concept of partial label-similarity, so
 *  that some label-pairs receive a value intermediate between 0 and 2.
 *  The chosen function is implemented by -matchVal()- (at the end of the
 *  program), and this program uses the very simple partial-similarity metric 
 *  discussed in Sampson & Babarczy, whereby node-labels beginning with
 *  the same character but differing elsewhere are assigned the value 1.5.
 *  It is important to stress that this is only an example.  For the user's
 *  application, some quite different measure of partial label-similarity
 *  might be appropriate -- in which case the user should rewrite
 *  -matchVal()- accordingly.  The only requirement which the rest of the
 *  program imposes on -matchVal()- is that it must always return a value
 *  within the range 2.0 to 0.0, with higher values representing more-
 *  similar pairs of labels.  (In the Sampson & Babarczy paper, the function
 *  was discussed the other way round, as a replacement-cost function, with 
 *  identical labels scoring zero and unrelated labels scoring 2;
 *  and this explains why the value 2 rather than the more obvious 1 is
 *  used -- replacing one label by the other involves deleting one entire 
 *  label plus inserting another entire label.)
 *
 *  A further element of user choice relates to whether the items at either
 *  end of lineages -- root nodes and/or leaf nodes -- are considered
 *  in comparing trees (cf. Sampson, "Improving the measurement ...", p.
 *  57).  In some applications, it may be known in advance that root nodes
 *  are always labelled with the same initial symbol, so that a parse is
 *  entitled to no credit for getting that label correct.  And the data
 *  submitted for parsing will commonly specify the identity of the leaf
 *  nodes, making it inappropriate to give credit for those.  These two
 *  choices are controlled by defining the constants LEAVES_DISREGARDED
 *  and ROOTS_DISREGARDED as true or false.
 *
 ************************************************************************/


/************************************************************************
 *
 *
 *  Data structures:
 *
 *  The lowest node-number in any tree is 1; 0 means no node, e.g.
 *  mother of root node, daughter of leaf node.
 *
 *  A tree is defined by four arrays:
 *    -daughters-, an array of integer sequences terminated by 0, each
 *      sequence is the successive daughter nodes for the respective mother
 *    -mothers-, an array giving the mother node for each node
 *    -labels-, an array of strings showing the words at leaf nodes and
 *      the nonterminal labels for nonterminal nodes
 *    -leafString-, a sequence of integers terminated by 0 giving the
 *      successive leaf nodes.
 *
 *  (Some of these arrays are determined by the others, but it makes the
 *  programming simpler to store them independently.)
 *
 *  In each case, the first dimension in the array identifies which
 *  comparand is represented by the tree, i.e. the GOLD-standard tree
 *  or the CANDidate tree.
 *
 *
 ************************************************************************/

#include <stdio.h>
#include <ctype.h>
#include <string.h>

#define TRUE 1
#define FALSE 0

#define LONGEST_TEXT 150000
#define LAST_S_NO 5000
#define LONGEST_CHUNK 50
#define MOST_NODES 500
#define GOLD 0
#define CAND 1
#define DEEPEST_LINEAGE 300

#define LEAVES_DISREGARDED TRUE
#define ROOTS_DISREGARDED FALSE

const char nl = '\n';
const char sp = ' ';
const char tab = '\t';
const char zero = '\0';

/* DCH
char goldText[LONGEST_TEXT], candText[LONGEST_TEXT];
*/

int firstNewGoldCh, firstNewCandCh;
char thChunk[LONGEST_CHUNK];
int nxNode[2];

int daughters[2][MOST_NODES][MOST_NODES];
int mothers[2][MOST_NODES];
char labels[2][MOST_NODES][LONGEST_CHUNK];
int leafString[2][MOST_NODES];
char lineages[2][DEEPEST_LINEAGE][LONGEST_CHUNK];

double scoreMatrix[DEEPEST_LINEAGE][DEEPEST_LINEAGE];

FILE *goldFP, *candFP;

/************************************************************************
 *
 *  -lineages- holds a pair of candidate and gold lineages, in the
 *  form of an array of node-labels
 *
 ************************************************************************/

void crash(char *s, char *t, int errN)
  {
  fprintf(stderr, "\n\n***\n%s\n%s\n", s, t);
  exit(errN);
  }

int main(int argc, char *argv[])
  {
  int i;
  char c;
  void processTrees(void);
  FILE *fp;

  if (argc != 3)
    crash("pattern:  compareLeafAncestors cand-file gold-file", "", 0);

  candFP = fopen(argv[1], "r");
  goldFP = fopen(argv[2], "r");

  /* DCH
  fp = fopen(argv[1], "r");
  i = 0;
  while ((c = getc(fp)) != EOF && i < LONGEST_TEXT-1)
    {
    candText[i] = c;
    candText[++i] = zero;
    }
  fclose(fp);
  fp = fopen(argv[2], "r");
  i = 0;
  while ((c = getc(fp)) != EOF && i < LONGEST_TEXT-1)
    {
    goldText[i] = c;
    goldText[++i] = zero;
    }
  fclose(fp);
  */
  processTrees();

  return(TRUE);
  }

void processTrees(void)
  {
  int j = 0;
  double aveVal;
  int getNxChunk(int);
  void initializeTree(int);
  void buildTree(int);
  double compareLineages(void);

  firstNewCandCh = 0;
  while (TRUE)
    {
    if (! getNxChunk(CAND))
      {
      printf("\n\nend of file\n");
      exit(TRUE);
      }
    if (atoi(thChunk) != ++j)
      crash("was expecting next sentence no, not ", thChunk, 0);
    initializeTree(CAND);
    buildTree(CAND);
    if (! getNxChunk(GOLD))
      crash("prematurely exhausted gold trees", "", 0);
    if (atoi(thChunk) != j)
      crash("gold sentence no. does not match candidate sentence no.", 
          thChunk, 0);
    initializeTree(GOLD);
    buildTree(GOLD);
    aveVal = compareLineages();
    printf("\nSentence %d: ave. %.3f\n\n\n", j, aveVal);
    }
  }

int getNxChunk(int comparand)
  {
  int c;
  int i;

  i = 0;
  thChunk[i] = zero;
  while (TRUE)
    {
    if (comparand == GOLD)
      {
	c = getc(goldFP);
	/*
      c = goldText[firstNewGoldCh];
      ++firstNewGoldCh;
	*/
      }
    else if (comparand == CAND)
      {
	c = getc(candFP);
	/*
      c = candText[firstNewCandCh];
      ++firstNewCandCh;
	*/
      }
    else
      crash("unrecognized comparand for getNxChunk()", "", FALSE);
    if (c == EOF)
      return(FALSE);
    if (c == zero)
      return(FALSE);
    if (isspace(c))
      {
      if (i > 0)
        return(TRUE);
      }
    else
      {
      thChunk[i] = c;
      thChunk[++i] = zero;
      }
    }
  }

void add2IntString(int *s, int n)
  {
  int i = -1;
  
  while (s[++i] != 0)
    ;
  s[i] = n;
  s[++i] = 0;
  }

void showLineage(int compar)
  {
  int n = 0;

  while (lineages[compar][n][0] != zero)
    {
    printf(" %s", lineages[compar][n]);
    ++n;
    }
  }

double compareLineages(void)
  {
  int i;
  int NWords = 0;
  double thLVals;
  double totalLVals = 0.0;
  void makeLineage(int, int), showLineage(int);

  double compareAtALeafDP(void);
  /* DCH
  double compareAtALeaf(void);
  */

  for (i = 0; leafString[CAND][i] != 0; ++i)
    { 
    if (strcmp(labels[GOLD][leafString[GOLD][i]], 
        labels[CAND][leafString[CAND][i]]) != 0)
    crash("gold and candidate words do not match", 
        labels[CAND][leafString[CAND][i]], 0);
    makeLineage(CAND, i);
    makeLineage(GOLD, i);
    /* DCH
    totalLVals += (thLVals = compareAtALeaf());
     */
    totalLVals += (thLVals = compareAtALeafDP());
    ++NWords;
    printf("\t%.3f\t%s\t", thLVals, labels[CAND][leafString[CAND][i]]);
    showLineage(CAND);
    printf("  :  ");
    showLineage(GOLD);
    putchar(nl);
    }
  return(totalLVals/NWords);
  }

void makeLineage(int comp, int Nleaf)
  {
  int thNode, mum, seenLB, seenRB;
  int i;
  int isFirstSis(int, int), isLastSis(int, int), 
      isLeaf(int, int), isRoot(int, int);
  int nxLineageEl(int);

  lineages[comp][0][0] = zero;
  seenLB = seenRB = FALSE;
  thNode = leafString[comp][Nleaf];
  while (thNode > 0)
    {
    mum = mothers[comp][thNode];
    if (( isRoot(comp, thNode) || ! isFirstSis(comp, thNode)) && ! seenLB)
      {
      seenLB = TRUE;
      if (! isLeaf(comp, thNode))
        {
        i = nxLineageEl(comp);
        strcpy(lineages[comp][i], "[");
        lineages[comp][i + 1][0] = zero;
        }
      }
    if (!((isLeaf(comp, thNode) && LEAVES_DISREGARDED) ||
        (isRoot(comp, thNode) && ROOTS_DISREGARDED)))
      {
      i = nxLineageEl(comp);
      strcpy(lineages[comp][i], labels[comp][thNode]);
      lineages[comp][i + 1][0] = zero;
      }
    if (( isRoot(comp, thNode) || ! isLastSis(comp, thNode)) && ! seenRB)
      {
      seenRB = TRUE;
      if (! isLeaf(comp, thNode))
        {
        i = nxLineageEl(comp);
        strcpy(lineages[comp][i], "]");
        lineages[comp][i + 1][0] = zero;
        }
      }
    thNode = mum;
    }
  }

int nxLineageEl(int comp)
  {
  int i = 0;

  while (lineages[comp][i][0] != zero)
    ++i;
  if (i >= DEEPEST_LINEAGE)
    crash("lineage overflowed", "", 0);
  return(i);
  }

int isRoot(int comp, int node)
  {
  return(mothers[comp][node] == 0);
  }

int isLeaf(int comp, int node)
  {
  return(daughters[comp][node][0] == 0);
  }

int isFirstSis(int comp, int node)
  {
  int m;

  if ((m = mothers[comp][node]) == 0)
    crash("tried checking sister sequence of root", "", 0);
  return(daughters[comp][m][0] == node);
  }

int isLastSis(int comp, int node)
  {
  int m, i;

  if ((m = mothers[comp][node]) == 0)
    crash("tried checking sister sequence of root", "", 0);
  i = 0;
  while (daughters[comp][m][i] > 0)
    ++i;
  return(daughters[comp][m][--i] == node);
  }

void buildTree(int comp)
  {
  int thNode = 0;
  int lastNode;
  int newLeaf;
  int newNode(int);
  int i;
  char c;
  void add2IntString(int *, int);
  int getNxChunk(int);

  nxNode[comp] = 0;
  do
    {
      /* DCH
    if (! getNxChunk(comp))
      crash("unexpectedly run out of file", "", 0);
      */
    if (! getNxChunk(comp))
      crash("End of file", "", 0);
    if (thChunk[0] == '[')
/***********************************************
 * 
 *  if the chunk starts with '[':
 *    - create a new current node labelled with the rest of the chunk
 *    - make it a daughter of the last node
 *    - make that node its mother
 *
 **********************************************/
      {
      lastNode = thNode;
      thNode = newNode(comp);
/***********************************************
 *  copy rest of -thChunk- into -labels[thNode]-
 ***********************************************/
      for (i = 1; (c = thChunk[i]) != zero; ++i)
        {
        labels[comp][thNode][i - 1] = c;
        labels[comp][thNode][i] = zero;
        }
      add2IntString(daughters[comp][lastNode], thNode);
      mothers[comp][thNode] = lastNode;
      }
    else if (strcmp(thChunk, "]") == 0)
/***********************************************
 * 
 *  if the chunk is ']':
 *    - move up to the current node's mother
 *
 **********************************************/
      thNode = mothers[comp][thNode];
    else
/***********************************************
 * 
 *  if the chunk is not a bracket, it must be a leaf:
 *    - create a daughterless node, labelled with the chunk,
 *      as daughter of the current node
 *    - add it to -leafString-
 *
 **********************************************/
      {
      newLeaf = newNode(comp);
      strcpy(labels[comp][newLeaf], thChunk);
      add2IntString(daughters[comp][thNode], newLeaf);
      mothers[comp][newLeaf] = thNode;
      add2IntString(leafString[comp], newLeaf);
      }
    }
  while (thNode > 0);
  }

int newNode(int comp)
  {
  return(++nxNode[comp]);
  }

void initializeTree(int comp)
  {
  int i;

  for (i = 0; i < MOST_NODES; ++i)
    {
    daughters[comp][i][0] = 0;
    mothers[comp][i] = 0;
    labels[comp][i][0] = zero;
    }
  leafString[comp][0] = 0;
  }


/************************************************************************
 *
 *  To run through all possible sequence-preserving mappings from a
 *  subset of the elements of a string A from el 0 to el -lastAEl-
 *  into elements of a string B from el 0 to el -lastBEl-
 *
 *
 *                   _______________________      
 *    chosenAEls    | 1 | 3 | 4 | 6 |   |   |
 *                   _______________________      
 *                                ^
 *                   _______________________      
 *    chosenBEls    | 2 | 4 | 5 | 8 |   |   |
 *                   _______________________    
 *                                ^
 *                                |
 *                            lastChosenEl = 3
 *                 (i.e. 1 less than no. of matched pairs)
 *
 ************************************************************************/



/************************************************************************
 *
 *  First:  to run through all possible choices of between 1 and -lastAEl-
 *  elements in string A:
 *
 ************************************************************************/

int emptyLineages(void)
  {
  if (lineages[GOLD][0][0] == zero || lineages[CAND][0][0] == zero)
    {
    if (lineages[GOLD][0][0] == zero && lineages[CAND][0][0] == zero)
      return(2);
    else
      return(1);
    }
  return(0);
  }

/* DCH
 *   New version of compareAtLeaf() uses dynamic programming to store
 *   partial results.
 */
double compareAtALeafDP(void) {
  // Last elements in lineages of A, B
  int lastAEl, lastBEl;
  int i, j;
  double matchVal(int, int);
  double matchScore, candScore;
  int el;

  lastAEl = nxLineageEl(CAND) - 1;
  lastBEl = nxLineageEl(GOLD) - 1;

  if (lastBEl >= DEEPEST_LINEAGE)
    crash("lineage overflowed", "", 0);
  if (lastAEl >= DEEPEST_LINEAGE)
    crash("lineage overflowed", "", 0);

  el = emptyLineages();
  if (el == 2)
    return 1.0;
  if (el == 1)
    return 0.0;

  // Initialize
  for (i=0; i < DEEPEST_LINEAGE; i++) {
    for (j=0; j < DEEPEST_LINEAGE; j++) {
      scoreMatrix[i][j] = 0;
    }
  }
  // Fill in matrix
  for (i=0; i <= lastAEl; i++) {
    for (j=0; j <= lastBEl; j++) {
      matchScore = matchVal(i,j);
      scoreMatrix[i][j] = matchScore;
      // Carry over to the right
      if (i-1 >= 0) {
	candScore = scoreMatrix[i-1][j];
	if (candScore > scoreMatrix[i][j]) {
	  scoreMatrix[i][j] = candScore;
	}
      }
      // Carry over upwards
      if (j-1 >= 0) {
	candScore = scoreMatrix[i][j-1];
	if (candScore > scoreMatrix[i][j]) {
	  scoreMatrix[i][j] = candScore;
	}
      }
      // Carry over up and to the right
      if (i-1 >= 0 && j-1 >= 0) {
	candScore = matchScore + scoreMatrix[i-1][j-1];
	if (candScore > scoreMatrix[i][j]) {
	  scoreMatrix[i][j] = candScore;
	}
      }

    }
  }

  /* DCH
   * for (i=lastAEl; i >= 0; i--) {
   *   for (j=0; j <= lastBEl; j++) {
   *     printf(" %4.1f", scoreMatrix[i][j]);
   *   }
   *   printf("\n");
   * }
   */

  return scoreMatrix[lastAEl][lastBEl] / (lastAEl + lastBEl + 2);
}

double compareAtALeaf(void)
  {
  int lastChosenEl, finished, i, j, iVal, lastAEl, lastBEl;
  int k, l, kVal, thisAChoiceFinished;
  int el;
  int nxLineageEl(int);
  int chosenAEls[DEEPEST_LINEAGE], chosenBEls[DEEPEST_LINEAGE];
  int emptyLineages(void);
  double matchVal(int, int);
  double bestVal, thMappingVal;

  lastAEl = nxLineageEl(CAND) - 1;
  lastBEl = nxLineageEl(GOLD) - 1;
  bestVal = 0.0;
  el = emptyLineages();
  if (el == 2)
    bestVal = 1.0;
  finished = (el > 0);

  lastChosenEl = -1;
  while (! finished)
    {
    i = lastChosenEl;
    while (! (i < 0 || ((iVal = chosenAEls[i]) < lastAEl &&
        (i == lastChosenEl || chosenAEls[i + 1] - iVal > 1))))
      --i;
    if (i < 0)
      {
      ++lastChosenEl;
      if (lastChosenEl > lastAEl || lastChosenEl > lastBEl)
        finished = TRUE;
      i = 0;
      chosenAEls[i] = 0;
      iVal = -1;
      }
    if (! finished)
      {
      for (j = i; j <= lastChosenEl; ++j)
        chosenAEls[j] = ++iVal;
/************************************************************************
 *
 *  Now:  to run through all possible choices of -lastChosenEl-
 *  elements in string B to match against the respective choices in string A
 *
 ************************************************************************/
      for (k = 0; k <= lastChosenEl; ++k)
        chosenBEls[k] = k;
      chosenBEls[lastChosenEl] = lastChosenEl - 1;
      thisAChoiceFinished = FALSE;
      while (! thisAChoiceFinished)
        {
        k = lastChosenEl;
        while (! (k < 0 || ((kVal = chosenBEls[k]) < lastBEl &&
            (k == lastChosenEl || chosenBEls[k + 1] - kVal > 1))))
          --k;
        if (k < 0)
          thisAChoiceFinished = TRUE;
        else
          {
          for (l = k; l <= lastChosenEl; ++l)
            chosenBEls[l] = ++kVal;
          thMappingVal = 0.0;
          for (j = 0; j <= lastChosenEl; ++j)
            thMappingVal += matchVal(chosenAEls[j], chosenBEls[j]);
          thMappingVal /= (lastAEl + lastBEl + 2);
          if (thMappingVal > bestVal)
            bestVal = thMappingVal;
          }
        }
      }
    }
  return(bestVal);
  }

double matchVal(int s, int t)
  {
    double val;
    if (strcmp(lineages[CAND][s], lineages[GOLD][t]) == 0)
      
      val = 2.0;
    else if (lineages[CAND][s][0] == lineages[GOLD][t][0])
      val = 1.5;
    else
      val = 0.0;
    /* DCH
     * printf(" %10s %10s %5.3f\n", lineages[CAND][s], lineages[GOLD][t], val);
     */
    return val;
  }
