001 /*
002 * CDDL HEADER START
003 *
004 * The contents of this file are subject to the terms of the
005 * Common Development and Distribution License, Version 1.0 only
006 * (the "License"). You may not use this file except in compliance
007 * with the License.
008 *
009 * You can obtain a copy of the license at
010 * trunk/opends/resource/legal-notices/OpenDS.LICENSE
011 * or https://OpenDS.dev.java.net/OpenDS.LICENSE.
012 * See the License for the specific language governing permissions
013 * and limitations under the License.
014 *
015 * When distributing Covered Code, include this CDDL HEADER in each
016 * file and include the License file at
017 * trunk/opends/resource/legal-notices/OpenDS.LICENSE. If applicable,
018 * add the following below this CDDL HEADER, with the fields enclosed
019 * by brackets "[]" replaced with your own identifying information:
020 * Portions Copyright [yyyy] [name of copyright owner]
021 *
022 * CDDL HEADER END
023 *
024 *
025 * Copyright 2008 Sun Microsystems, Inc.
026 */
027 package org.opends.server.util;
028
029
030
031 import static org.opends.server.util.Validator.*;
032
033
034
035 /**
036 * This class provides an implementation of the Levenshtein distance algorithm,
037 * which may be used to determine the minimum number of changes required to
038 * transform one string into another. For the purpose of this algorithm, a
039 * change is defined as replacing one character with another, inserting a new
040 * character, or removing an existing character.
041 * <BR><BR>
042 * The basic algorithm works as follows for a source string S of length X and
043 * a target string T of length Y:
044 * <OL>
045 * <LI>Create a matrix M with dimensions of X+1, Y+1.</LI>
046 * <LI>Fill the first row with sequentially-increasing values 0 through
047 * X.</LI>
048 * <LI>Fill the first column with sequentially-increasing values 0 through
049 * Y.</LI>
050 * <LI>Create a nested loop iterating over the characters in the strings. In
051 * the outer loop, iterate through characters in S from 0 to X-1 using an
052 * iteration counter I. In the inner loop, iterate through the characters
053 * in T from 0 to Y-1 using an iterator counter J. Calculate the
054 * following three things and place the smallest value in the matrix in
055 * row I+1 column J+1:
056 * <UL>
057 * <LI>One more than the value in the matrix position immediately to the
058 * left (i.e., 1 + M[I][J+1]).</LI>
059 * <LI>One more than the value in the matrix position immediately above
060 * (i.e., 1 + M[I+1][J]).</LI>
061 * <LI>Define a value V to be zero if S[I] is the same as T[J], or one if
062 * they are different. Add that value to the value in the matrix
063 * position immediately above and to the left (i.e.,
064 * V + M[I][J]).</LI>
065 * </UL>
066 * </LI>
067 * <LI>The Levenshtein distance value, which is the least number of changes
068 * needed to transform the source string into the target string, will be
069 * the value in the bottom right corner of the matrix (i.e.,
070 * M[X][Y]).</LI>
071 * </OL>
072 * <BR><BR>
073 * Note that this is a completely "clean room" implementation, developed from a
074 * description of the algorithm, rather than copying an existing implementation.
075 * Doing it in this way eliminates copyright and licensing concerns associated
076 * with using an existing implementation.
077 */
078 @org.opends.server.types.PublicAPI(
079 stability=org.opends.server.types.StabilityLevel.UNCOMMITTED,
080 mayInstantiate=false,
081 mayExtend=false,
082 mayInvoke=true)
083 public final class LevenshteinDistance
084 {
085 /**
086 * Calculates the Levenshtein distance between the provided string values.
087 *
088 * @param source The source string to compare. It must not be {@code null}.
089 * @param target The target string to compare. It must not be {@code null}.
090 *
091 * @return The minimum number of changes required to turn the source string
092 * into the target string.
093 */
094 public static int calculate(String source, String target)
095 {
096 ensureNotNull(source, target);
097
098 // sl == source length; tl == target length
099 int sl = source.length();
100 int tl = target.length();
101
102
103 // If either of the lengths is zero, then the distance is the length of the
104 // other string.
105 if (sl == 0)
106 {
107 return tl;
108 }
109 else if (tl == 0)
110 {
111 return sl;
112 }
113
114
115 // w == matrix width; h == matrix height
116 int w = sl+1;
117 int h = tl+1;
118
119
120 // m == matrix array
121 // Create the array and fill it with values 0..sl in the first dimension and
122 // 0..tl in the second dimension.
123 int[][] m = new int[w][h];
124 for (int i=0; i < w; i++)
125 {
126 m[i][0] = i;
127 }
128
129 for (int i=1; i < h; i++)
130 {
131 m[0][i] = i;
132 }
133
134 for (int i=0,x=1; i < sl; i++,x++)
135 {
136 char s = source.charAt(i);
137
138 for (int j=0,y=1; j < tl; j++,y++)
139 {
140 char t = target.charAt(j);
141
142
143 // Figure out what to put in the appropriate matrix slot. It should be
144 // the lowest of:
145 // - One more than the value to the left
146 // - One more than the value to the top
147 // - If the characters are equal, the value to the upper left, otherwise
148 // one more than the value to the upper left.
149 m[x][y] = Math.min(Math.min((m[i][y] + 1), (m[x][j] + 1)),
150 (m[i][j] + ((s == t) ? 0 : 1)));
151 }
152 }
153
154 // The Levenshtein distance should now be the value in the lower right
155 // corner of the matrix.
156 return m[sl][tl];
157 }
158 }
159