use v6; use Test; # SuffixTree class based on # STREE2006.CPP - Suffix tree creation # # by Mark Nelson, updated December, 2006 # http://marknelson.us/1996/08/01/suffix-trees/ class SuffixTree { has $.string; has %.edges; has @.nodes; has $.node-count is rw = 1; class Suffix { has SuffixTree $.tree; has Int $.origin_node is rw; has Int $.first_char_index is rw; has Int $.last_char_index is rw; method new(SuffixTree $tree, Int $node, Int $start, Int $stop) { self.bless(*, :$tree, :origin_node($node), :first_char_index($start), :last_char_index($stop)); } method is-explicit { $.first_char_index > $.last_char_index; } method is-implicit { !self.is-explicit; } method canonize() { if !self.is-explicit { my $edge = $.tree.edges{$.tree.hash-tag($.origin_node, $.first_char_index)}; my $edge_span = $edge.last_char_index - $edge.first_char_index; while $edge_span <= $.last_char_index - $.first_char_index { $.first_char_index = $.first_char_index + $edge_span + 1; $.origin_node = $edge.end_node; if ($.first_char_index <= $.last_char_index) { $edge = $.tree.edges{$.tree.hash-tag($edge.end_node, $.first_char_index)}; $edge_span = $edge.last_char_index - $edge.first_char_index; } } } } } class Edge { has SuffixTree $.tree; has Int $.first_char_index is rw; has Int $.last_char_index is rw; has Int $.start_node is rw; has Int $.end_node is rw; multi method new(SuffixTree $tree, Int $first_char_index, Int $last_char_index, Int $parent_node) { self.bless(*, :$tree, :$first_char_index, :$last_char_index, :start_node($parent_node), :end_node($tree.node-count++)); } method hash-tag() { $.tree.hash-tag($.start_node, $.first_char_index); } } ################################# ## SuffixTree methods start here! ################################# method new(Str $s) { self.bless(*, :string($s)); } method hash-tag($node, $first_char_index) { $.string.substr($first_char_index, 1) ~ $node; } method split-edge(Edge $edge, Suffix $s) { %.edges.delete($edge.hash-tag); my $new-edge = Edge.new(self, $edge.first_char_index, $edge.first_char_index + $s.last_char_index - $s.first_char_index, $s.origin_node); %.edges{$new-edge.hash-tag} = $new-edge; @.nodes[$new-edge.end_node] = $s.origin_node; $edge.first_char_index += $s.last_char_index - $s.first_char_index + 1; $edge.start_node = $new-edge.end_node; %.edges{$edge.hash-tag} = $edge; $new-edge.end_node; } method dump-edges($current-n) { say " Start End Suf First Last String"; for %.edges.values -> $s { my $top; if ($current-n > $s.last_char_index ) { $top = $s.last_char_index; } else { $top = $current-n; } say $s.start_node ~ " " ~ $s.end_node ~ " " ~ @.nodes[$s.end_node] ~ " " ~ $s.first_char_index ~ " " ~ $s.last_char_index ~ " " ~ $.string.substr($s.first_char_index, $top - $s.first_char_index + 1); } } method add-prefix(Suffix $active, $last_char_index) { my $parent_node; my $last_parent_node = -1; loop { my $edge; $parent_node = $active.origin_node; # // Step 1 is to try and find a matching edge for the given node. # // If a matching edge exists, we are done adding edges, so we break # // out of this big loop. if $active.is-explicit { $edge = %.edges{self.hash-tag($active.origin_node, $last_char_index)}; last if $edge.defined; } else { # implicit node, a little more complicated $edge = %.edges{self.hash-tag($active.origin_node, $active.first_char_index)}; my $span = $active.last_char_index - $active.first_char_index; last if $.string.substr($edge.first_char_index + $span + 1, 1) eq $.string.substr($last_char_index, 1); $parent_node = self.split-edge($edge, $active); } # // We didn't find a matching edge, so we create a new one, add # // it to the tree at the parent node position, and insert it # // into the hash table. When we create a new node, it also # // means we need to create a suffix link to the new node from # // the last node we visited. my $new-edge = Edge.new(self, $last_char_index, $.string.chars, $parent_node); %.edges{$new-edge.hash-tag} = $new-edge; if $last_parent_node > 0 { @.nodes[$last_parent_node] = $parent_node; } $last_parent_node = $parent_node; # // This final step is where we move to the next smaller suffix if $active.origin_node == 0 { $active.first_char_index++; } else { $active.origin_node = @.nodes[$active.origin_node]; } $active.canonize; } if $last_parent_node > 0 { @.nodes[$last_parent_node] = $parent_node; } $active.last_char_index++; # Now the endpoint is the next active point $active.canonize(); }; method make-tree() { # // The active point is the first non-leaf suffix in the # // tree. We start by setting this to be the empty string # // at node 0. The AddPrefix() function will update this # // value after every new prefix is added. my $active = Suffix.new(self, 0, 0, -1); # The initial active prefix for ^($.string.chars) -> $i { # say "Character $i"; self.add-prefix($active, $i); } } has %.node-map; method update-node-map() { %!node-map = (); for %.edges.values -> $edge { %!node-map.push($edge.start_node => $edge); } } method get-edge-map-for-node($start-node) { my %edge-map; if %!node-map{$start-node} { for @(%!node-map{$start-node}) -> $edge { %edge-map.push($.string.substr($edge.first_char_index, 1) => $edge); } } %edge-map; } my @good-suffixes; my @branch-count; my $error-found; method validate(:$verbose = Bool::False) { say "\nValidating:" if $verbose; self.update-node-map; @good-suffixes = Bool::False xx $.string.chars; @branch-count = 0 xx self.node-count; $error-found = Bool::False; self.walk-tree(0, ""); my $error = 0; for @good-suffixes.kv -> $i, $status { if !$status { say "Suffix $i count wrong!"; $error++; $error-found = Bool::True; } } say "All Suffixes present!" if $error == 0 && $verbose; my $leaf_count = 0; my $branch_count = 0; for ^self.node-count -> $i { if @branch-count[$i] == 0 { say "Logic error on node $i, not a leaf or internal node!"; $error-found = Bool::True; } elsif @branch-count[$i] == -1 { $leaf_count++; } else { $branch_count += @branch-count[$i]; } } if $verbose { say "Leaf count: $leaf_count { $leaf_count == $.string.chars ?? " OK" !! " Error!" }"; say "Branch count: $branch_count { $branch_count == self.node-count - 1 ?? " OK" !! " Error!" }"; } $error-found ||= $leaf_count != $.string.chars || $branch_count != self.node-count - 1; !$error-found; } method walk-tree($start_node, $string-so-far) { my $edges = 0; if %.node-map{$start_node} { for @(%.node-map{$start_node}) -> $edge { if @branch-count[$edge.start_node] < 0 { say "Logic error on node { $edge.start_node }"; $error-found = Bool::True; } @branch-count[$edge.start_node]++; $edges++; my $current-string = $string-so-far; $current-string ~= $.string.substr($edge.first_char_index, $edge.last_char_index - $edge.first_char_index + 1); if $edge.end_node == $start_node { say "Structure error!!!"; $error-found = Bool::True; } if $edge.end_node != $start_node && self.walk-tree($edge.end_node, $current-string) { if @branch-count[$edge.end_node] > 0 { say "Logic error on node { $edge.end_node }"; $error-found = Bool::True; } @branch-count[$edge.end_node]--; } } } # // If this node didn't have any child edges, it means we # // are at a leaf node, and can check on this suffix. We # // check to see if it matches the input string, then tick # // off it's entry in the @good-suffixes list. if $edges == 0 { # say "Suffix: $string-so-far"; @good-suffixes[$string-so-far.chars - 1]++; my $tail-of-string = $.string.substr($.string.chars - $string-so-far.chars); if $tail-of-string ne $string-so-far { say "Comparison failure! Expected $tail-of-string but got $string-so-far"; $error-found = Bool::True; } Bool::True; } else { Bool::False; } } } sub look-for-longest-substring(SuffixTree $st, $length-of-first) { my @shared-substrings; sub walk-tree($start_node, $string-so-far) { my $edges = 0; if $st.node-map{$start_node} { my $split = ?@($st.node-map{$start_node}).grep({ $_.first_char_index < $length-of-first }) && ?@($st.node-map{$start_node}).grep({ $_.first_char_index > $length-of-first }); if $split { @shared-substrings.push($string-so-far); } for @($st.node-map{$start_node}) -> $edge { $edges++; my $current-string = $string-so-far; $current-string ~= $st.string.substr($edge.first_char_index, $edge.last_char_index - $edge.first_char_index + 1); walk-tree($edge.end_node, $current-string); } } } $st.update-node-map; walk-tree(0, ""); if +@shared-substrings { @shared-substrings.max({ $^a.chars <=> $^b.chars }); } else { Any; } } sub find-longest-substring($a, $b, :$return-suffixtree = Bool::False) { my $term1 = "©"; my $term2 = "®"; my $st = SuffixTree.new($a ~ $term1 ~ $b ~ $term2); $st.make-tree; my $longest = look-for-longest-substring($st, $a.chars + 1); if $return-suffixtree { ($st, $longest); } else { $longest; } } multi MAIN() { my $s1 = $*IN.get; my $s2 = $*IN.get; my $substring = find-longest-substring($s1, $s2); if $substring.defined { say $substring; } else { say ""; } } multi MAIN("test") { plan *; my @test-strings = "bananas", "aa", "aaaaabaaabaaaaabaaaabb", "Four score and seven years ago", # .roll(200).join, ; for @test-strings -> $string { my $term = "©"; my $a = SuffixTree.new($string ~ $term); isa_ok $a, SuffixTree, "We made a SuffixTree object for '$string'"; $a.make-tree; ok $a.validate, "and it passes validation."; my %edge-map = $a.get-edge-map-for-node(0); is %edge-map.keys.sort.join, ($string ~ $term).comb.uniq.sort.join, "Correct edges leading away from first node"; } my @substring-tests = (["this", "a", ""], ["this", "is", "is"], ["is", "isn't", "is"], ["this was", "isn't", "is"]); for @substring-tests -> $a { my ($st, $longest) = find-longest-substring($a[0], $a[1], :return-suffixtree); isa_ok $st, SuffixTree, "We made a SuffixTree object for our strings"; ok $st.validate, "and it passes validation."; # say :$longest.perl; is $longest, $a[2], "and we found the correct shared substring"; ($st, $longest) = find-longest-substring($a[1], $a[0], :return-suffixtree); isa_ok $st, SuffixTree, "We made a SuffixTree object for our strings"; ok $st.validate, "and it passes validation."; # say :$longest.perl; is $longest, $a[2], "and we found the correct shared substring"; } { my $a = "Oh... this is the place where the fishermen gather With oil-skins and boots and Cape Anns battened down All sizes of figures with squid lines and jiggers They congregate here on the squid-jiggin' ground.".subst(/\s+/, " ", :g); my $b = "Take me back to my Western boat, Let me fish off Cape St. Mary's, Where the hogdowns sail and the foghorns wail, With my friends the Browns and the Clearys, Let me fish off Cape St. Mary's.".subst(/\s+/, " ", :g); my ($st, $longest) = find-longest-substring($a, $b, :return-suffixtree); isa_ok $st, SuffixTree, "We made a SuffixTree object for our strings"; ok $st.validate, "and it passes validation."; # say :$longest.perl; is $longest, "here the ", "and we found the correct shared substring"; ($st, $longest) = find-longest-substring($b, $a, :return-suffixtree); isa_ok $st, SuffixTree, "We made a SuffixTree object for our strings"; ok $st.validate, "and it passes validation."; # say :$longest.perl; is $longest, "here the ", "and we found the correct shared substring"; } done; }