Re: Levenshtein distance in XSLT 2.0

Oh well here’s the O(M*N) one, pretty similar to yours I think, and hits the same limits, I don’t think it should really as it’s tail recursive. Of course, normally I would have fully documented the code, but as I was so hurt by your remarks in the regex post, this is just unadorned xslt….

<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:d="data:,dpc">

  <xsl:param name="debug"/>

  <xsl:template name="test">

    <xsl:sequence select="d:levenshteintest('0123456789001234567890','01234567890ABCDEFGHIJ')"/>
    <xsl:sequence select="d:levenshteintest('GUMBO','GAMBOL')"/>
    <xsl:sequence select="d:levenshteintest('GUMBO','GAMBOL')"/>

    <xsl:sequence select="d:levenshteintest('GUMBO','')"/>
    <xsl:sequence select="d:levenshteintest('','GAMBOL')"/>

  </xsl:template>


  <xsl:function name="d:levenshteintest">
    <xsl:param name="s" as="xs:string"/>
    <xsl:param name="t" as="xs:string"/>
    <xsl:text>&#10;distance( "</xsl:text>
    <xsl:value-of select="$s"/>
    <xsl:text>" , "</xsl:text>
    <xsl:value-of select="$t"/>
    <xsl:text>" ) = </xsl:text>
    <xsl:value-of select="d:levenshtein($s,$t)"/>
  </xsl:function>


  <xsl:function name="d:levenshtein">
    <xsl:param name="s" as="xs:string"/>
    <xsl:param name="t" as="xs:string"/>
    <xsl:variable name="n" select="string-length($s)"/>
    <xsl:variable name="m" select="string-length($t)"/>
    <xsl:variable name="ss" select="string-to-codepoints($s)"/>
    <xsl:variable name="tt" select="string-to-codepoints($t)"/>
    <xsl:sequence
    select="if ($n=0) 
        then
        $m
        else
                if ($m=0)
                then
                $n
                else
        d:levenshtein2(
        $m,
        $n,
        (for $i in (0 to $n) return $i,
                 for $j in (1 to $m) return 
                   ($j, for $i in (1 to $n) return 
                      (if($ss[$i]=$tt[$j]) then 0 else 1))),
        $n+3)
"/>
  </xsl:function>

  <xsl:function name="d:levenshtein2">
    <xsl:param name="m" as="xs:integer"/>
    <xsl:param name="n" as="xs:integer"/>
    <xsl:param name="a" as="xs:integer+"/>
    <xsl:param name="p" as="xs:integer"/>
<xsl:if test="$debug='yes'">
<xsl:value-of select="$m,$n"/>
======================================
<xsl:for-each select="0 to ($m)">
<xsl:text>&#10;</xsl:text>
<xsl:value-of select="for $x in subsequence($a,1+ . * ($n+1),$n+1) return format-number($x,'00')"/>
</xsl:for-each>
======================================
</xsl:if>
<xsl:choose>
  <xsl:when test="$p=2+($n+1)*($m+1)">
    <xsl:sequence select="$a[last()]"/>
  </xsl:when>
  <xsl:otherwise>
  <xsl:sequence select="d:levenshtein2($m,$n,
            ($a[position()&lt;$p],  
                        min((
$a[position()=$p - ($n+1)]+1,
$a[position()=$p - 1] + 1,
$a[position()=$p - ($n+2)]+$a[position()=$p]
                        )),
            $a[position()&gt;$p]),
            if($p mod ($n+1) = 0) then $p+2 else $p+1)
    "/>
  </xsl:otherwise>
</xsl:choose>
  </xsl:function>




</xsl:stylesheet>

$  saxon8 -it test levenshtein.xsl 

distance( "0123456789001234567890" , "01234567890ABCDEFGHIJ" ) = 11
distance( "GUMBO" , "GAMBOL" ) = 2
distance( "GUMBO" , "GAMBOL" ) = 2
distance( "GUMBO" , "" ) = 5
distance( "" , "GAMBOL" ) = 6

David

Reply

The content of this field is kept private and will not be shown publicly.