Topic: Language utility: generate random words similar to existing languages (Read 2815 times)

Khym Chanur · « **on:** April 03, 2010, 12:15:03 am »

I wanted to add some new words to the languages files, but didn't want spend that much effort making up four new translations (dwarf, elf, human, goblin) for each word. So I made a little Perl utility to help: you give it the name of a language file, and it will spit out a number of randomly generated words similar to the words found in the language file.

Usage notes and algorithm notes are at the beginning of the script. You need to have Perl installed to use it.

Spoiler: Perl script (click to show/hide)

Code: [Select]

#!/usr/bin/perl -w                                              

# Author: Matthew Cline (matt@nightrealms.com)
# License: GPL 3.0 or later.                  
# Version: 0.01                               

# USAGE: dw.pl LANG num_words
#                            
# Takes language file LANG, and outputs num_words random words that
# should sort of sound like those in the lanugage file.  LANG      
# can be the name of the language, the name of the start of the    
# language file, or the entire name of the language file.          
#                                                                  
# The randomly generated words are guaranteed to not already exist 
# in the input language file.                                      
#                                                                  
# The program can be run from the top-level DF directory, the      
# raw directory, or the raw/objects directory.                     
#                                                                  
# Examples:                                                        
#                                                                  
#   dw.pl DWARF 1                                                  
#   dw.pl language_human 10                                        
#   dw.pl language_GOBLIN.txt 42                                   

# ALGORITHM:
#           
# 1) The program analyzes the frequency with which each unique pair of
# conesuctive characters occurs as the starting pair of letters of all the
# words in the lanuage input file, and uses this weighting to randomly    
# choose the first two letters of each random word.                       
#                                                                         
# 2) For each unique pair of consecutive characters found in the words in 
# the input language file, it computes the frequencies with which various 
# characters form the next letter.  In constructing a random word, the    
# last two letters of the word-so-far are chosen, and the weights of the  
# characters which can be found after that particular character pair are  
# used to randomly pick the next letter in the word.                      

use strict;

my %all_words;
my %pair_tables;
my %start_pairs;

my ($lang, $words_out) = @ARGV;

sub make_word
{            
    # Chose starting pair of letters.
    my $choice = rand();             

    my $weights = $start_pairs{weights};

    my @letters;

    my $index = scalar(@{$weights}) - 1;
    for (my $i = 0; $i < @{$weights}; $i++)
    {                                      
        if ($choice < $weights->[$i])      
        {                                  
            $index = $i;                   
            last;                          
        }                                  
    }                                      

    @letters = split(//, $start_pairs{letters}->[$index]);

    my $i = 0;
    while(1)  
    {         
        my $pair = $letters[$i] . $letters[$i + 1];
        my $curr_table = $pair_tables{$pair};      

        $weights = $curr_table->{weights};

        $choice = rand();

        $index = scalar(@{$weights}) - 1;

        for (my $j = 0; $j < @{$weights}; $j++)
        {                                      
            if ($choice < $weights->[$j])      
            {                                  
                $index = $j;                   
                last;                          
            }                                  
        }                                      
        my $char = $curr_table->{letters}->[$index];

        return join("", @letters) if ($char eq "\0");
        push(@letters, $char);                       
        $i++;                                        
    }                                                
}                                                    

sub make_new_word {
    while (1)      
    {              
        my $word = make_word();
        return $word if (!exists($all_words{$word}));
    }                                                
}                                                    

########################################################################
########################################################################

# Start of main

chdir("raw") if (-d "raw");
chdir("objects") if (-d "objects");
die "No language files\n" unless (-f "language_words.txt");

die "usage: dw.pl LANG num_words" unless (@ARGV == 2);

die "Second argument must be a number" unless ($words_out =~ /^\d+$/);
die "Second argument must be a positive number" unless ($words_out > 0);

if ($lang !~ /_/) {
    $lang  = uc($lang);
    $lang = "language_$lang" if ($lang !~ /_/);
}                                              
$lang .= ".txt"           if ($lang !~ /\.txt$/);

die "'SYM' is not a language\n" if ($lang eq "language_SYM.txt");
die "'words' is not a language\n" if ($lang =~ /language_words.txt/i);

die "No such language file as '$lang'\n" unless (-f $lang);

# Get raw counts.

unless(open(FILE, "<$lang"))
{                           
    die "Unable to open '$lang' for reading: $!\n";
}                                                  

my (%raw_pairs);
my $num_words;  

while(<FILE>)
{            
    next unless (/\[T_WORD:[^:\]]+:([^:\]]+)\]/);
    my $word = $1;                               
    next if (length($word) < 2);                 

    $num_words++;

    $all_words{$word} = 1;

    my @letters = split(//, $word);
    push(@letters, "\0");          

    my $pair = $letters[0] . $letters[1];
    $start_pairs{$pair}++;               

    for (my $i = 0; $i < (@letters - 2); $i++)
    {                                         
        $pair = $letters[$i] . $letters[$i + 1];
        $raw_pairs{$pair} ||= {};               
        my $next_letter = $raw_pairs{$pair};    
        $next_letter->{$letters[$i + 2]}++;     
    }                                           
}                                               

close(FILE);

die "No words in '$lang'\n" if ($num_words == 0);
die "No letter pairs in '$lang'\n" if (keys(%raw_pairs) == 0);

# Turn raw counts into tables.  Normalize total weights to 1.0 so
# that rand() will return a proper weight.                       
my (@pair_list, @weight_list);                                   
my $cummulaitve_weight = 0;                                      

while (my($pair, $count) = each %start_pairs)
{                                            
    push(@pair_list, $pair);                 
    $cummulaitve_weight += $count;           
    push(@weight_list, ($cummulaitve_weight + 0.0) / ($num_words + 0.0));
}                                                                        

%start_pairs = ();

$start_pairs{letters} = [@pair_list];
$start_pairs{weights} = [@weight_list];

while (my($pair, $next_letter) = each %raw_pairs)
{                                                
    my $total_weight = 0;                        
    for my $weight(values %{$next_letter})       
    {                                            
        $total_weight += $weight;                
    }                                            
    my @letter_list;                             
    @weight_list = ();                           

    $cummulaitve_weight = 0;
    while (my ($letter, $weight) = each %{$next_letter})
    {
        push(@letter_list, $letter);
        $cummulaitve_weight += $weight;
        push(@weight_list, ($cummulaitve_weight + 0.0) / ($total_weight + 0.0));
    }

    $pair_tables{$pair} =
        {
            letters => [@letter_list],
            weights => [@weight_list]
        };
}

for (my $i = 0; $i < $words_out; $i++)
{
    my $word = make_new_word();
    print "$word\n";
    $all_words{$word} = 1;
}

News:

Author Topic: Language utility: generate random words similar to existing languages (Read 2815 times)

Khym Chanur

Language utility: generate random words similar to existing languages