Download the raw code.
use v6;
use Test;
# SuffixTree class based on
# STREE2006.CPP - Suffix tree creation
#
# by Mark Nelson, updated December, 2006
# http://marknelson.us/1996/08/01/suffix-trees/
class SuffixTree {
has $.string;
has %.edges;
has @.nodes;
has $.node-count is rw = 1;
class Suffix {
has SuffixTree $.tree;
has Int $.origin_node is rw;
has Int $.first_char_index is rw;
has Int $.last_char_index is rw;
method new(SuffixTree $tree, Int $node, Int $start, Int $stop) {
self.bless(*, :$tree, :origin_node($node), :first_char_index($start), :last_char_index($stop));
}
method is-explicit { $.first_char_index > $.last_char_index; }
method is-implicit { !self.is-explicit; }
method canonize() {
if !self.is-explicit {
my $edge = $.tree.edges{$.tree.hash-tag($.origin_node, $.first_char_index)};
my $edge_span = $edge.last_char_index - $edge.first_char_index;
while $edge_span <= $.last_char_index - $.first_char_index {
$.first_char_index = $.first_char_index + $edge_span + 1;
$.origin_node = $edge.end_node;
if ($.first_char_index <= $.last_char_index) {
$edge = $.tree.edges{$.tree.hash-tag($edge.end_node, $.first_char_index)};
$edge_span = $edge.last_char_index - $edge.first_char_index;
}
}
}
}
}
class Edge {
has SuffixTree $.tree;
has Int $.first_char_index is rw;
has Int $.last_char_index is rw;
has Int $.start_node is rw;
has Int $.end_node is rw;
multi method new(SuffixTree $tree,
Int $first_char_index,
Int $last_char_index,
Int $parent_node) {
self.bless(*, :$tree,
:$first_char_index,
:$last_char_index,
:start_node($parent_node),
:end_node($tree.node-count++));
}
method hash-tag() {
$.tree.hash-tag($.start_node, $.first_char_index);
}
}
#################################
## SuffixTree methods start here!
#################################
method new(Str $s) {
self.bless(*, :string($s));
}
method hash-tag($node, $first_char_index) {
$.string.substr($first_char_index, 1) ~ $node;
}
method split-edge(Edge $edge, Suffix $s) {
%.edges.delete($edge.hash-tag);
my $new-edge = Edge.new(self,
$edge.first_char_index,
$edge.first_char_index + $s.last_char_index - $s.first_char_index,
$s.origin_node);
%.edges{$new-edge.hash-tag} = $new-edge;
@.nodes[$new-edge.end_node] = $s.origin_node;
$edge.first_char_index += $s.last_char_index - $s.first_char_index + 1;
$edge.start_node = $new-edge.end_node;
%.edges{$edge.hash-tag} = $edge;
$new-edge.end_node;
}
method dump-edges($current-n) {
say " Start End Suf First Last String";
for %.edges.values -> $s {
my $top;
if ($current-n > $s.last_char_index ) {
$top = $s.last_char_index;
} else {
$top = $current-n;
}
say $s.start_node ~ " "
~ $s.end_node ~ " "
~ @.nodes[$s.end_node] ~ " "
~ $s.first_char_index ~ " "
~ $s.last_char_index ~ " "
~ $.string.substr($s.first_char_index, $top - $s.first_char_index + 1);
}
}
method add-prefix(Suffix $active, $last_char_index) {
my $parent_node;
my $last_parent_node = -1;
loop {
my $edge;
$parent_node = $active.origin_node;
# // Step 1 is to try and find a matching edge for the given node.
# // If a matching edge exists, we are done adding edges, so we break
# // out of this big loop.
if $active.is-explicit {
$edge = %.edges{self.hash-tag($active.origin_node, $last_char_index)};
last if $edge.defined;
} else { # implicit node, a little more complicated
$edge = %.edges{self.hash-tag($active.origin_node, $active.first_char_index)};
my $span = $active.last_char_index - $active.first_char_index;
last if $.string.substr($edge.first_char_index + $span + 1, 1)
eq $.string.substr($last_char_index, 1);
$parent_node = self.split-edge($edge, $active);
}
# // We didn't find a matching edge, so we create a new one, add
# // it to the tree at the parent node position, and insert it
# // into the hash table. When we create a new node, it also
# // means we need to create a suffix link to the new node from
# // the last node we visited.
my $new-edge = Edge.new(self, $last_char_index, $.string.chars, $parent_node);
%.edges{$new-edge.hash-tag} = $new-edge;
if $last_parent_node > 0 {
@.nodes[$last_parent_node] = $parent_node;
}
$last_parent_node = $parent_node;
# // This final step is where we move to the next smaller suffix
if $active.origin_node == 0 {
$active.first_char_index++;
} else {
$active.origin_node = @.nodes[$active.origin_node];
}
$active.canonize;
}
if $last_parent_node > 0 {
@.nodes[$last_parent_node] = $parent_node;
}
$active.last_char_index++; # Now the endpoint is the next active point
$active.canonize();
};
method make-tree() {
# // The active point is the first non-leaf suffix in the
# // tree. We start by setting this to be the empty string
# // at node 0. The AddPrefix() function will update this
# // value after every new prefix is added.
my $active = Suffix.new(self, 0, 0, -1); # The initial active prefix
for ^($.string.chars) -> $i {
# say "Character $i";
self.add-prefix($active, $i);
}
}
has %.node-map;
method update-node-map() {
%!node-map = ();
for %.edges.values -> $edge {
%!node-map.push($edge.start_node => $edge);
}
}
method get-edge-map-for-node($start-node) {
my %edge-map;
if %!node-map{$start-node} {
for @(%!node-map{$start-node}) -> $edge {
%edge-map.push($.string.substr($edge.first_char_index, 1) => $edge);
}
}
%edge-map;
}
my @good-suffixes;
my @branch-count;
my $error-found;
method validate(:$verbose = Bool::False)
{
say "\nValidating:" if $verbose;
self.update-node-map;
@good-suffixes = Bool::False xx $.string.chars;
@branch-count = 0 xx self.node-count;
$error-found = Bool::False;
self.walk-tree(0, "");
my $error = 0;
for @good-suffixes.kv -> $i, $status {
if !$status {
say "Suffix $i count wrong!";
$error++;
$error-found = Bool::True;
}
}
say "All Suffixes present!" if $error == 0 && $verbose;
my $leaf_count = 0;
my $branch_count = 0;
for ^self.node-count -> $i {
if @branch-count[$i] == 0 {
say "Logic error on node $i, not a leaf or internal node!";
$error-found = Bool::True;
} elsif @branch-count[$i] == -1 {
$leaf_count++;
} else {
$branch_count += @branch-count[$i];
}
}
if $verbose {
say "Leaf count: $leaf_count { $leaf_count == $.string.chars ?? " OK" !! " Error!" }";
say "Branch count: $branch_count { $branch_count == self.node-count - 1 ?? " OK" !! " Error!" }";
}
$error-found ||= $leaf_count != $.string.chars || $branch_count != self.node-count - 1;
!$error-found;
}
method walk-tree($start_node, $string-so-far)
{
my $edges = 0;
if %.node-map{$start_node} {
for @(%.node-map{$start_node}) -> $edge {
if @branch-count[$edge.start_node] < 0 {
say "Logic error on node { $edge.start_node }";
$error-found = Bool::True;
}
@branch-count[$edge.start_node]++;
$edges++;
my $current-string = $string-so-far;
$current-string ~= $.string.substr($edge.first_char_index,
$edge.last_char_index - $edge.first_char_index + 1);
if $edge.end_node == $start_node {
say "Structure error!!!";
$error-found = Bool::True;
}
if $edge.end_node != $start_node && self.walk-tree($edge.end_node, $current-string) {
if @branch-count[$edge.end_node] > 0 {
say "Logic error on node { $edge.end_node }";
$error-found = Bool::True;
}
@branch-count[$edge.end_node]--;
}
}
}
# // If this node didn't have any child edges, it means we
# // are at a leaf node, and can check on this suffix. We
# // check to see if it matches the input string, then tick
# // off it's entry in the @good-suffixes list.
if $edges == 0 {
# say "Suffix: $string-so-far";
@good-suffixes[$string-so-far.chars - 1]++;
my $tail-of-string = $.string.substr($.string.chars - $string-so-far.chars);
if $tail-of-string ne $string-so-far {
say "Comparison failure! Expected $tail-of-string but got $string-so-far";
$error-found = Bool::True;
}
Bool::True;
} else {
Bool::False;
}
}
}
sub look-for-longest-substring(SuffixTree $st, $length-of-first)
{
my @shared-substrings;
sub walk-tree($start_node, $string-so-far)
{
my $edges = 0;
if $st.node-map{$start_node} {
my $split = ?@($st.node-map{$start_node}).grep({ $_.first_char_index < $length-of-first })
&& ?@($st.node-map{$start_node}).grep({ $_.first_char_index > $length-of-first });
if $split {
@shared-substrings.push($string-so-far);
}
for @($st.node-map{$start_node}) -> $edge {
$edges++;
my $current-string = $string-so-far;
$current-string ~= $st.string.substr($edge.first_char_index,
$edge.last_char_index - $edge.first_char_index + 1);
walk-tree($edge.end_node, $current-string);
}
}
}
$st.update-node-map;
walk-tree(0, "");
if +@shared-substrings {
@shared-substrings.max({ $^a.chars <=> $^b.chars });
} else {
Any;
}
}
sub find-longest-substring($a, $b, :$return-suffixtree = Bool::False) {
my $term1 = "©";
my $term2 = "®";
my $st = SuffixTree.new($a ~ $term1 ~ $b ~ $term2);
$st.make-tree;
my $longest = look-for-longest-substring($st, $a.chars + 1);
if $return-suffixtree {
($st, $longest);
} else {
$longest;
}
}
multi MAIN() {
my $s1 = $*IN.get;
my $s2 = $*IN.get;
my $substring = find-longest-substring($s1, $s2);
if $substring.defined {
say $substring;
} else {
say "";
}
}
multi MAIN("test") {
plan *;
my @test-strings = "bananas",
"aa",
"aaaaabaaabaaaaabaaaabb",
"Four score and seven years ago",
# <a b c d e>.roll(200).join,
;
for @test-strings -> $string {
my $term = "©";
my $a = SuffixTree.new($string ~ $term);
isa_ok $a, SuffixTree, "We made a SuffixTree object for '$string'";
$a.make-tree;
ok $a.validate, "and it passes validation.";
my %edge-map = $a.get-edge-map-for-node(0);
is %edge-map.keys.sort.join,
($string ~ $term).comb.uniq.sort.join,
"Correct edges leading away from first node";
}
my @substring-tests = (["this", "a", ""],
["this", "is", "is"],
["is", "isn't", "is"],
["this was", "isn't", "is"]);
for @substring-tests -> $a {
my ($st, $longest) = find-longest-substring($a[0], $a[1], :return-suffixtree);
isa_ok $st, SuffixTree, "We made a SuffixTree object for our strings";
ok $st.validate, "and it passes validation.";
# say :$longest.perl;
is $longest, $a[2], "and we found the correct shared substring";
($st, $longest) = find-longest-substring($a[1], $a[0], :return-suffixtree);
isa_ok $st, SuffixTree, "We made a SuffixTree object for our strings";
ok $st.validate, "and it passes validation.";
# say :$longest.perl;
is $longest, $a[2], "and we found the correct shared substring";
}
{
my $a = "Oh... this is the place where the fishermen gather
With oil-skins and boots and Cape Anns battened down
All sizes of figures with squid lines and jiggers
They congregate here on the squid-jiggin' ground.".subst(/\s+/, " ", :g);
my $b = "Take me back to my Western boat,
Let me fish off Cape St. Mary's,
Where the hogdowns sail and the foghorns wail,
With my friends the Browns and the Clearys,
Let me fish off Cape St. Mary's.".subst(/\s+/, " ", :g);
my ($st, $longest) = find-longest-substring($a, $b, :return-suffixtree);
isa_ok $st, SuffixTree, "We made a SuffixTree object for our strings";
ok $st.validate, "and it passes validation.";
# say :$longest.perl;
is $longest, "here the ", "and we found the correct shared substring";
($st, $longest) = find-longest-substring($b, $a, :return-suffixtree);
isa_ok $st, SuffixTree, "We made a SuffixTree object for our strings";
ok $st.validate, "and it passes validation.";
# say :$longest.perl;
is $longest, "here the ", "and we found the correct shared substring";
}
done;
}
The code is long and class-y, but not to difficult to take in. A clean indentation and whitespace policy helps here.
At line 27, .is-implicit
is defined, and then never used. In particular,
it's not used two lines down where self.is-explicit
is instead negated.
In my view, the three .new
methods containing only a .bless
statement
constitute a net decrease in clarity. The out-of-the box .new
method uses
named arguments instead of positional ones, which makes looking up the order of
things in the signature less required.
On line 68, a big comment announces the arrival of the methods of the big
class, SuffixTree
. It strikes me that the need for such a comment could
have been avoided by not nesting Suffix
and Edge
inside that class; the
nested classes are not my
-scoped, so it wouldn't widen their visibility.
The method .walk-tree
returns a boolean value for no documented reason.
The value seems to have something to do with leaf nodes. I find it less than
perfectly clear. The subroutine walk-tree
returns either an Int
or Any
,
which is also confusing.
The variable $error
on line 425 is redundant in the presence of
$error-found
. It counts the number of errors (and so probably should have
been called $errors
), but we only ever check whether we got 0 errors (on line
220), and $error-found
could've done that.
It seems to be that the defining property of the query method
look-for-longest-subtring
is not that it looks for the longest substring, but
that it returns it. I think it wants to be called longest-substring
. A
similar argument could be made for the subroutine find-longest-substring
.
(Also, what's the technical difference between "look for" and "find" here
that motivates two different naming schemes?)
The algorithm should be O(n1 + n2), where
n1 and n1 are the lengths of $s1
and $s2
,
respectively. I can't honestly say that it is, because my limited brain can't
take in the whole file at once. But I have no reason to believe that it isn't,
and so this solution definitely wins the moral victory of being the
solution that scales the best.
In benchmarking, it doesn't always. Maybe that's Rakudo's fault.
The code shows a clear C++ heritage in some cases. It starts with some comments
being of the form # //
. I'd guess the explicit .new
methods are another
such loan.
The xx
operators on lines 206 and 207 are a nice way to fill up arrays.
Rids us of some for
loops.
Nope.
On the other hand, it's un unknown quantity how small you can make a suffix tree implementation in Perl 6. Someone please try to make a smaller one from scratch!