Perl software to accompany the paper “How many trade names are there?”


#!/usr/bin/perl

#
#   WordGen25
#
#
#  (c) Geoffrey Sampson
#  University of South Africa
#  November 2015
#
#
#
# Command line takes the form e.g.:  perl wordGen 100 3 2
# meaning:  give me 100 random 3-syllable examples with not more
# than 2 phoneme-pairs violating the CVCV pattern.
# Third argument is "-" for "no limit to CVCV violations".
#

# In this program, phonemes are represented in general by their
# SAMPA spellings, but with single characters used for some cases
# where SAMPA uses digraphs:
#
# C and J represents /tS dZ/ respectively
# W and Y represent the diphthongs /aU OI/ respectively.
#
# The vowel-letters a e i o u each stand for a range of vowels/
# diphthongs which can conventionally be written with that letter:
#
#   a = /&/ or /eI/ 
#   e = /e/ or /i:/
#   i = /I/ or /aI/
#   o = /Q/ or /@U/
#   u = /V/, /U/, /u:/, or /ju:/
#
# The voiced and voiceless interdental fricatives /D T/ are both
# represented indifferently as /T/.


$| = 1;

@initials = qw(0 p t k C J b d g f v T s z S h j w l r m n
    sp st sk sw sl sm sn sf
    pl kl bl gl fl spl skl
    pr tr kr br dr gr fr Tr Sr spr str skr
    tw kw dw gw Tw sw);
    
@genFinals = qw(0 p t k C J b d g f v T s z S l r m n N
    mp
    nt nd nT nC nJ ns nz
    Nk Nks
    st sp sk
    rp rt rk rC rJ rb rd rg rf rv rT rs rz rS rl rm rn rks
    lp lt lk lC lJ lb ld lf lT ls lS lm ln lks
    ks kt
    ps pt);

@YWfinals = qw(0 t k d T s z l r n nt nd ns st);

$attemptsSoFar = 0;
$examplesSoFar = 0;

$targetExamples = $ARGV[0];
$targetSylls = $ARGV[1];
$CVCVthreshold = $ARGV[2];

@genVowels = qw(a e i o u);
@YW = qw(Y W);

while ($examplesSoFar < $targetExamples)
  {
  ++$attemptsSoFar;
  $word = "";
  $fails1stCheck = 0;
  $oldFinal = "";
  SYLLS:  for ($i = 1; $i <= $targetSylls; ++$i)
    {
    $newInit = oneof(@initials);
    if ($newInit eq "0") {$newInit = "";}
    # the following is designed to opt for either the diphthongs /OI aU/, or
    # the vowels and diphthongs which can be spelled with a single vowel letter,
    # in proportion to the numbers of different syllable-finals that can occur
    # with the respective classes of nuclei.  The significance of '11' is that
    # that is the number of different vowels/diphthongs which can regularly be
    # written with one or other of the five vowel letters.
    if (rand(2 * scalar @YWfinals + 11 * scalar @genFinals) < 2 * scalar @YWfinals)
      {
      $newNucleus = oneof(@YW);
      $newFinal = oneof(@YWfinals);
      }
    else
      {
      $newNucleus = oneof(@genVowels);
      $newFinal = oneof(@genFinals);
      }
    if ($newFinal eq "0") {$newFinal = "";}
    if (syllBoundCdBeEarlier($oldFinal, $newInit) or giveGeminate($oldFinal, $newInit))
      {
      $fails1stCheck = 1;
      last SYLLS;
      }
    else
      {
      $word .= $newInit . $newNucleus . $newFinal;
      $oldFinal = $newFinal;
      }
    }
  if ((! $fails1stCheck) and passesRemainingFilters($word) and 
      ($CVCVthreshold eq "-" or CVCVviolations($word) <= $CVCVthreshold))
    {
    ++$examplesSoFar;
    print ($word, "\t", $examplesSoFar/$attemptsSoFar, "\n");
    }
  }
  
sub syllBoundCdBeEarlier
  {
  my ($strA, $strB, $aEquiv, $lastOfA);
  $strA = $_[0];
  $strB = $_[1];
  
  while ($strA ne "")
    {
    $lastOfA = chop $strA;
    $aEquiv = ($strA eq "" ? "0" : $strA);
    $strB = $lastOfA . $strB;
    if (memb($aEquiv, @initials) and memb($strB, @genFinals))
      {return(1);}
    }
  return(0);
  }

sub giveGeminate
  {
  my ($strA, $strB, $lastOfA);
  $strA = $_[0];
  $strB = $_[1];
  
  if ($strA eq "" or $strB eq "")
    {return(0);}
  $lastOfA = chop $strA;
  if (index($strB, $lastOfA) == 0)
    {return(1);}
  if ($lastOfA eq "t" and index($strB, "C") == 0)
    {return(1);}
  if ($lastOfA eq "d" and index($strB, "J") == 0)
    {return(1);}
  return(0);
  }
  
sub passesRemainingFilters
  {
  my $str;
  $str = $_[0];
  
  if ($str =~ /ju|Ww|Yj|[iu]r$|[iu]r[ptkCJbdgfvTszShjwlrmn]/)
    {return(0);}
  return(1);
  }

sub oneof
  {
  my $choice;
  my @array = @_;
  $choice = int rand scalar @array;
  return $array[$choice];
  }

sub memb
  {
  my $cand = shift @_;
  my @club;
  my $element;
  
  @club = @_;
  foreach $element (@club)
    {
    if ($element eq $cand)
      {return(1);}
    }
  return(0);
  }
  
sub vocalic
  {
  return(index("aeiouWY", $_[0]) >= 0);
  }
  
sub isdigraph
  {
  return(index("SCJNTWY", $_[0]) >= 0);
  }


sub CVCVviolations
  {
  my $string = $_[0];
  my $slen = length $string;
  my $violations = 0;
  my ($i, $c, $wasV);
  
  $c = substr($string, 0, 1);
  $wasV = vocalic($c);
  if (isdigraph($c)) {++$violations};
  if ($slen < 2) {return(0);}
  for ($i = 1; $i < $slen; ++$i)
    {
    $c = substr($string, $i, 1);    
    if (isdigraph($c)) {++$violations};
    if (vocalic($c))
      {
      if ($wasV) {++$violations;}
      $wasV = 1;
      }
    else
      {
      if (! $wasV)  {++$violations;}
      $wasV = 0;
      }
    }
  return($violations);
  }