1 /*
2  * Copyright (C) 2010 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.quicksearchbox.util;
18 
19 /**
20  * This class represents the matrix used in the Levenshtein distance algorithm, together
21  * with the algorithm itself which operates on the matrix.
22  *
23  * We also track of the individual operations applied to transform the source string into the
24  * target string so we can trace the path taken through the matrix afterwards, in order to
25  * perform the formatting as required.
26  */
27 public class LevenshteinDistance {
28     public static final int EDIT_DELETE = 0;
29     public static final int EDIT_INSERT = 1;
30     public static final int EDIT_REPLACE = 2;
31     public static final int EDIT_UNCHANGED = 3;
32 
33     private final Token[] mSource;
34     private final Token[] mTarget;
35     private final int[][] mEditTypeTable;
36     private final int[][] mDistanceTable;
37 
LevenshteinDistance(Token[] source, Token[] target)38     public LevenshteinDistance(Token[] source, Token[] target) {
39         final int sourceSize = source.length;
40         final int targetSize = target.length;
41         final int[][] editTab = new int[sourceSize+1][targetSize+1];
42         final int[][] distTab = new int[sourceSize+1][targetSize+1];
43         editTab[0][0] = EDIT_UNCHANGED;
44         distTab[0][0] = 0;
45         for (int i = 1; i <= sourceSize; ++i) {
46             editTab[i][0] = EDIT_DELETE;
47             distTab[i][0] = i;
48         }
49         for (int i = 1; i <= targetSize; ++i) {
50             editTab[0][i] = EDIT_INSERT;
51             distTab[0][i] = i;
52         }
53         mEditTypeTable = editTab;
54         mDistanceTable = distTab;
55         mSource = source;
56         mTarget = target;
57     }
58 
59     /**
60      * Implementation of Levenshtein distance algorithm.
61      *
62      * @return The Levenshtein distance.
63      */
calculate()64     public int calculate() {
65         final Token[] src = mSource;
66         final Token[] trg = mTarget;
67         final int sourceLen = src.length;
68         final int targetLen = trg.length;
69         final int[][] distTab = mDistanceTable;
70         final int[][] editTab = mEditTypeTable;
71         for (int s = 1; s <= sourceLen; ++s) {
72             Token sourceToken = src[s-1];
73             for (int t = 1; t <= targetLen; ++t) {
74                 Token targetToken = trg[t-1];
75                 int cost = sourceToken.prefixOf(targetToken) ? 0 : 1;
76 
77                 int distance = distTab[s-1][t] + 1;
78                 int type = EDIT_DELETE;
79 
80                 int d = distTab[s][t - 1];
81                 if (d + 1 < distance ) {
82                     distance = d + 1;
83                     type = EDIT_INSERT;
84                 }
85 
86                 d = distTab[s - 1][t - 1];
87                 if (d + cost < distance) {
88                     distance = d + cost;
89                     type = cost == 0 ? EDIT_UNCHANGED : EDIT_REPLACE;
90                 }
91                 distTab[s][t] = distance;
92                 editTab[s][t] = type;
93             }
94         }
95         return distTab[sourceLen][targetLen];
96     }
97 
98     /**
99      * Gets the list of operations which were applied to each target token; {@link #calculate} must
100      * have been called on this object before using this method.
101      * @return A list of {@link EditOperation}s indicating the origin of each token in the target
102      *      string. The position of the token indicates the position in the source string of the
103      *      token that was unchanged/replaced, or the position in the source after which a target
104      *      token was inserted.
105      */
getTargetOperations()106     public EditOperation[] getTargetOperations() {
107         final int trgLen = mTarget.length;
108         final EditOperation[] ops = new EditOperation[trgLen];
109         int targetPos = trgLen;
110         int sourcePos = mSource.length;
111         final int[][] editTab = mEditTypeTable;
112         while (targetPos > 0) {
113             int editType = editTab[sourcePos][targetPos];
114             switch (editType) {
115                 case LevenshteinDistance.EDIT_DELETE:
116                     sourcePos--;
117                     break;
118                 case LevenshteinDistance.EDIT_INSERT:
119                     targetPos--;
120                     ops[targetPos] = new EditOperation(editType, sourcePos);
121                     break;
122                 case LevenshteinDistance.EDIT_UNCHANGED:
123                 case LevenshteinDistance.EDIT_REPLACE:
124                     targetPos--;
125                     sourcePos--;
126                     ops[targetPos] = new EditOperation(editType, sourcePos);
127                     break;
128             }
129         }
130 
131         return ops;
132     }
133 
134     public static final class EditOperation {
135         private final int mType;
136         private final int mPosition;
EditOperation(int type, int position)137         public EditOperation(int type, int position) {
138             mType = type;
139             mPosition = position;
140         }
getType()141         public int getType() {
142             return mType;
143         }
getPosition()144         public int getPosition() {
145             return mPosition;
146         }
147     }
148 
149     public static final class Token implements CharSequence {
150         private final char[] mContainer;
151         public final int mStart;
152         public final int mEnd;
153 
Token(char[] container, int start, int end)154         public Token(char[] container, int start, int end) {
155             mContainer = container;
156             mStart = start;
157             mEnd = end;
158         }
159 
length()160         public int length() {
161             return mEnd - mStart;
162         }
163 
164         @Override
toString()165         public String toString() {
166             // used in tests only.
167             return subSequence(0, length());
168         }
169 
prefixOf(final Token that)170         public boolean prefixOf(final Token that) {
171             final int len = length();
172             if (len > that.length()) return false;
173             final int thisStart = mStart;
174             final int thatStart = that.mStart;
175             final char[] thisContainer = mContainer;
176             final char[] thatContainer = that.mContainer;
177             for (int i = 0; i < len; ++i) {
178                 if (thisContainer[thisStart + i] != thatContainer[thatStart + i]) {
179                     return false;
180                 }
181             }
182             return true;
183         }
184 
charAt(int index)185         public char charAt(int index) {
186             return mContainer[index + mStart];
187         }
188 
subSequence(int start, int end)189         public String subSequence(int start, int end) {
190             return new String(mContainer, mStart + start, length());
191         }
192 
193     }
194 }
195