p5-colomon

Download the raw code.

use v6;
use Test;

# SuffixTree class based on
# STREE2006.CPP - Suffix tree creation
#
# by Mark Nelson, updated December, 2006
# http://marknelson.us/1996/08/01/suffix-trees/

class SuffixTree {
    has $.string;
    has %.edges;
    has @.nodes;
    has $.node-count is rw = 1;

    class Suffix {
        has SuffixTree $.tree;
        has Int $.origin_node is rw;
        has Int $.first_char_index is rw;
        has Int $.last_char_index is rw;

        method new(SuffixTree $tree, Int $node, Int $start, Int $stop) {
            self.bless(*, :$tree, :origin_node($node), :first_char_index($start), :last_char_index($stop));
        }

        method is-explicit { $.first_char_index > $.last_char_index; }
        method is-implicit { !self.is-explicit; }
        method canonize() {
            if !self.is-explicit {
                my $edge = $.tree.edges{$.tree.hash-tag($.origin_node, $.first_char_index)};
                my $edge_span = $edge.last_char_index - $edge.first_char_index;
                while $edge_span <= $.last_char_index - $.first_char_index {
                    $.first_char_index = $.first_char_index + $edge_span + 1;
                    $.origin_node = $edge.end_node;
                    if ($.first_char_index <= $.last_char_index) {
                        $edge = $.tree.edges{$.tree.hash-tag($edge.end_node, $.first_char_index)};
                        $edge_span = $edge.last_char_index - $edge.first_char_index;
                    }
                }
            }
        }
    }

    class Edge {
        has SuffixTree $.tree;
        has Int $.first_char_index is rw;
        has Int $.last_char_index is rw;
        has Int $.start_node is rw;
        has Int $.end_node is rw;

        multi method new(SuffixTree $tree,
                         Int $first_char_index,
                         Int $last_char_index,
                         Int $parent_node) {
            self.bless(*, :$tree, 
                          :$first_char_index, 
                          :$last_char_index, 
                          :start_node($parent_node), 
                          :end_node($tree.node-count++));
        }

        method hash-tag() {
            $.tree.hash-tag($.start_node, $.first_char_index);
        }
    }

    #################################
    ## SuffixTree methods start here!
    #################################

    method new(Str $s) {
        self.bless(*, :string($s));
    }

    method hash-tag($node, $first_char_index) {
        $.string.substr($first_char_index, 1) ~ $node;
    }

    method split-edge(Edge $edge, Suffix $s) {
        %.edges.delete($edge.hash-tag);
        my $new-edge = Edge.new(self,
                                $edge.first_char_index,
                                $edge.first_char_index + $s.last_char_index - $s.first_char_index,
                                $s.origin_node);
        %.edges{$new-edge.hash-tag} = $new-edge;
        @.nodes[$new-edge.end_node] = $s.origin_node;
        $edge.first_char_index += $s.last_char_index - $s.first_char_index + 1;
        $edge.start_node = $new-edge.end_node;
        %.edges{$edge.hash-tag} = $edge;
        $new-edge.end_node;
    }

    method dump-edges($current-n) {
        say " Start  End  Suf  First Last  String";
        for %.edges.values -> $s {
            my $top;
            if ($current-n > $s.last_char_index ) {
                $top = $s.last_char_index;
            } else {
                $top = $current-n;
            }

            say $s.start_node ~ " "
                ~ $s.end_node ~ " "
                ~ @.nodes[$s.end_node] ~ " "
                ~ $s.first_char_index ~ " "
                ~ $s.last_char_index ~ "     "
                ~ $.string.substr($s.first_char_index, $top - $s.first_char_index + 1);
        }
    }

    method add-prefix(Suffix $active, $last_char_index) {
        my $parent_node;
        my $last_parent_node = -1;

        loop {
            my $edge;
            $parent_node = $active.origin_node;

            # // Step 1 is to try and find a matching edge for the given node.
            # // If a matching edge exists, we are done adding edges, so we break
            # // out of this big loop.

            if $active.is-explicit {
                $edge = %.edges{self.hash-tag($active.origin_node, $last_char_index)};
                last if $edge.defined;
            } else { # implicit node, a little more complicated
                $edge = %.edges{self.hash-tag($active.origin_node, $active.first_char_index)};
                my $span = $active.last_char_index - $active.first_char_index;
                last if $.string.substr($edge.first_char_index + $span + 1, 1)
                        eq $.string.substr($last_char_index, 1);
                $parent_node = self.split-edge($edge, $active);
            }

            # // We didn't find a matching edge, so we create a new one, add
            # // it to the tree at the parent node position, and insert it
            # // into the hash table.  When we create a new node, it also
            # // means we need to create a suffix link to the new node from
            # // the last node we visited.

            my $new-edge = Edge.new(self, $last_char_index, $.string.chars, $parent_node);
            %.edges{$new-edge.hash-tag} = $new-edge;
            if $last_parent_node > 0 {
                @.nodes[$last_parent_node] = $parent_node;
            }
            $last_parent_node = $parent_node;

            # // This final step is where we move to the next smaller suffix

            if $active.origin_node == 0 {
                $active.first_char_index++;
            } else {
                $active.origin_node = @.nodes[$active.origin_node];
            }
            $active.canonize;
        }

        if $last_parent_node > 0 {
            @.nodes[$last_parent_node] = $parent_node;
        }
        $active.last_char_index++;  # Now the endpoint is the next active point
        $active.canonize();
    };

    method make-tree() {
        # // The active point is the first non-leaf suffix in the
        # // tree.  We start by setting this to be the empty string
        # // at node 0.  The AddPrefix() function will update this
        # // value after every new prefix is added.

        my $active = Suffix.new(self, 0, 0, -1);  # The initial active prefix
        for ^($.string.chars) -> $i {
            # say "Character $i";
            self.add-prefix($active, $i);
        }
    }

    has %.node-map;

    method update-node-map() {
        %!node-map = ();
        for %.edges.values -> $edge {
            %!node-map.push($edge.start_node => $edge);
        }
    }

    method get-edge-map-for-node($start-node) {
        my %edge-map;
        if %!node-map{$start-node} {
            for @(%!node-map{$start-node}) -> $edge {
                %edge-map.push($.string.substr($edge.first_char_index, 1) => $edge);
            }
        }
        %edge-map;
    }

    my @good-suffixes;
    my @branch-count;
    my $error-found;

    method validate(:$verbose = Bool::False)
    {
        say "\nValidating:" if $verbose;

        self.update-node-map;
        @good-suffixes = Bool::False xx $.string.chars;
        @branch-count = 0 xx self.node-count;
        $error-found = Bool::False;

        self.walk-tree(0, "");

        my $error = 0;
        for @good-suffixes.kv -> $i, $status {
            if !$status {
                say "Suffix $i count wrong!";
                $error++;
                $error-found = Bool::True;
            }
        }
        say "All Suffixes present!" if $error == 0 && $verbose;

        my $leaf_count = 0;
        my $branch_count = 0;
        for ^self.node-count -> $i {
            if @branch-count[$i] == 0 {
                say "Logic error on node $i, not a leaf or internal node!";
                $error-found = Bool::True;
            } elsif @branch-count[$i] == -1 {
                $leaf_count++;
            } else {
                $branch_count += @branch-count[$i];
            }
        }

        if $verbose {
            say "Leaf count: $leaf_count { $leaf_count == $.string.chars ?? " OK" !! " Error!" }";
            say "Branch count: $branch_count { $branch_count == self.node-count - 1 ?? " OK" !! " Error!" }";
        }
        $error-found ||= $leaf_count != $.string.chars || $branch_count != self.node-count - 1;
        !$error-found;
    }

    method walk-tree($start_node, $string-so-far)
    {
        my $edges = 0;
        if %.node-map{$start_node} {
            for @(%.node-map{$start_node}) -> $edge {
                if @branch-count[$edge.start_node] < 0 {
                    say "Logic error on node { $edge.start_node }";
                    $error-found = Bool::True;
                }
                @branch-count[$edge.start_node]++;
                $edges++;
                my $current-string = $string-so-far;
                $current-string ~= $.string.substr($edge.first_char_index, 
                                                   $edge.last_char_index - $edge.first_char_index + 1);
                if $edge.end_node == $start_node {
                    say "Structure error!!!";
                    $error-found = Bool::True;
                }

                if $edge.end_node != $start_node && self.walk-tree($edge.end_node, $current-string) {
                    if @branch-count[$edge.end_node] > 0 {
                        say "Logic error on node { $edge.end_node }";
                        $error-found = Bool::True;
                    }
                    @branch-count[$edge.end_node]--;
                }
            }
        }

        # // If this node didn't have any child edges, it means we
        # // are at a leaf node, and can check on this suffix.  We
        # // check to see if it matches the input string, then tick
        # // off it's entry in the @good-suffixes list.

        if $edges == 0 {
            # say "Suffix: $string-so-far";
            @good-suffixes[$string-so-far.chars - 1]++;
            my $tail-of-string = $.string.substr($.string.chars - $string-so-far.chars);
            if $tail-of-string ne $string-so-far {
                say "Comparison failure!  Expected $tail-of-string but got $string-so-far";
                $error-found = Bool::True;
            }
            Bool::True;
        } else {
            Bool::False;
        }
    }
}

sub look-for-longest-substring(SuffixTree $st, $length-of-first) 
{
    my @shared-substrings;

    sub walk-tree($start_node, $string-so-far)
    {
        my $edges = 0;
        if $st.node-map{$start_node} {
            my $split = ?@($st.node-map{$start_node}).grep({ $_.first_char_index < $length-of-first })
                     && ?@($st.node-map{$start_node}).grep({ $_.first_char_index > $length-of-first });
            if $split {
                @shared-substrings.push($string-so-far);
            } 

            for @($st.node-map{$start_node}) -> $edge {
                $edges++;
                my $current-string = $string-so-far;
                $current-string ~= $st.string.substr($edge.first_char_index, 
                                                     $edge.last_char_index - $edge.first_char_index + 1);

                walk-tree($edge.end_node, $current-string);
            }
        }
    }

    $st.update-node-map;
    walk-tree(0, "");

    if +@shared-substrings {
        @shared-substrings.max({ $^a.chars <=> $^b.chars });
    } else {
        Any;
    }
}

sub find-longest-substring($a, $b, :$return-suffixtree = Bool::False) {
    my $term1 = "©";
    my $term2 = "®";
    my $st = SuffixTree.new($a ~ $term1 ~ $b ~ $term2);
    $st.make-tree;
    my $longest = look-for-longest-substring($st, $a.chars + 1);

    if $return-suffixtree {
        ($st, $longest);
    } else {
        $longest;
    }
}

multi MAIN() {
    my $s1 = $*IN.get;
    my $s2 = $*IN.get;
    my $substring = find-longest-substring($s1, $s2);
    if $substring.defined {
        say $substring;
    } else {
        say "";
    }
}


multi MAIN("test") {
    plan *;

    my @test-strings = "bananas",
                       "aa",
                       "aaaaabaaabaaaaabaaaabb",
                       "Four score and seven years ago",
                       # <a b c d e>.roll(200).join,
                       ;

    for @test-strings -> $string {
        my $term = "©";
        my $a = SuffixTree.new($string ~ $term);
        isa_ok $a, SuffixTree, "We made a SuffixTree object for '$string'";
        $a.make-tree;
        ok $a.validate, "and it passes validation.";
        my %edge-map = $a.get-edge-map-for-node(0);
        is %edge-map.keys.sort.join, 
           ($string ~ $term).comb.uniq.sort.join, 
           "Correct edges leading away from first node";
    }

    my @substring-tests = (["this", "a", ""],
                           ["this", "is", "is"],
                           ["is", "isn't", "is"],
                           ["this was", "isn't", "is"]);

    for @substring-tests -> $a {
        my ($st, $longest) = find-longest-substring($a[0], $a[1], :return-suffixtree);
        isa_ok $st, SuffixTree, "We made a SuffixTree object for our strings";
        ok $st.validate, "and it passes validation.";
        # say :$longest.perl;
        is $longest, $a[2], "and we found the correct shared substring";

        ($st, $longest) = find-longest-substring($a[1], $a[0], :return-suffixtree);
        isa_ok $st, SuffixTree, "We made a SuffixTree object for our strings";
        ok $st.validate, "and it passes validation.";
        # say :$longest.perl;
        is $longest, $a[2], "and we found the correct shared substring";
    }

    {
        my $a = "Oh... this is the place where the fishermen gather
                 With oil-skins and boots and Cape Anns battened down
                 All sizes of figures with squid lines and jiggers
                 They congregate here on the squid-jiggin' ground.".subst(/\s+/, " ", :g);
        my $b = "Take me back to my Western boat,
                 Let me fish off Cape St. Mary's,
                 Where the hogdowns sail and the foghorns wail,
                 With my friends the Browns and the Clearys,
                 Let me fish off Cape St. Mary's.".subst(/\s+/, " ", :g);
        my ($st, $longest) = find-longest-substring($a, $b, :return-suffixtree);
        isa_ok $st, SuffixTree, "We made a SuffixTree object for our strings";
        ok $st.validate, "and it passes validation.";
        # say :$longest.perl;
        is $longest, "here the ", "and we found the correct shared substring";

        ($st, $longest) = find-longest-substring($b, $a, :return-suffixtree);
        isa_ok $st, SuffixTree, "We made a SuffixTree object for our strings";
        ok $st.validate, "and it passes validation.";
        # say :$longest.perl;
        is $longest, "here the ", "and we found the correct shared substring";
    }

    done;
}

Readability

The code is long and class-y, but not to difficult to take in. A clean indentation and whitespace policy helps here.

Consistency

At line 27, .is-implicit is defined, and then never used. In particular, it's not used two lines down where self.is-explicit is instead negated.

Clarity of intent

In my view, the three .new methods containing only a .bless statement constitute a net decrease in clarity. The out-of-the box .new method uses named arguments instead of positional ones, which makes looking up the order of things in the signature less required.

On line 68, a big comment announces the arrival of the methods of the big class, SuffixTree. It strikes me that the need for such a comment could have been avoided by not nesting Suffix and Edge inside that class; the nested classes are not my-scoped, so it wouldn't widen their visibility.

The method .walk-tree returns a boolean value for no documented reason. The value seems to have something to do with leaf nodes. I find it less than perfectly clear. The subroutine walk-tree returns either an Int or Any, which is also confusing.

The variable $error on line 425 is redundant in the presence of $error-found. It counts the number of errors (and so probably should have been called $errors), but we only ever check whether we got 0 errors (on line 220), and $error-found could've done that.

It seems to be that the defining property of the query method look-for-longest-subtring is not that it looks for the longest substring, but that it returns it. I think it wants to be called longest-substring. A similar argument could be made for the subroutine find-longest-substring. (Also, what's the technical difference between "look for" and "find" here that motivates two different naming schemes?)

Algorithmic efficiency

The algorithm should be O(n1 + n2), where n1 and n1 are the lengths of $s1 and $s2, respectively. I can't honestly say that it is, because my limited brain can't take in the whole file at once. But I have no reason to believe that it isn't, and so this solution definitely wins the moral victory of being the solution that scales the best.

In benchmarking, it doesn't always. Maybe that's Rakudo's fault.

Idiomatic use of Perl 6

The code shows a clear C++ heritage in some cases. It starts with some comments being of the form # //. I'd guess the explicit .new methods are another such loan.

The xx operators on lines 206 and 207 are a nice way to fill up arrays. Rids us of some for loops.

Brevity

Nope.

On the other hand, it's un unknown quantity how small you can make a suffix tree implementation in Perl 6. Someone please try to make a smaller one from scratch!