001 /*
002 * CDDL HEADER START
003 *
004 * The contents of this file are subject to the terms of the
005 * Common Development and Distribution License, Version 1.0 only
006 * (the "License"). You may not use this file except in compliance
007 * with the License.
008 *
009 * You can obtain a copy of the license at
010 * trunk/opends/resource/legal-notices/OpenDS.LICENSE
011 * or https://OpenDS.dev.java.net/OpenDS.LICENSE.
012 * See the License for the specific language governing permissions
013 * and limitations under the License.
014 *
015 * When distributing Covered Code, include this CDDL HEADER in each
016 * file and include the License file at
017 * trunk/opends/resource/legal-notices/OpenDS.LICENSE. If applicable,
018 * add the following below this CDDL HEADER, with the fields enclosed
019 * by brackets "[]" replaced with your own identifying information:
020 * Portions Copyright [yyyy] [name of copyright owner]
021 *
022 * CDDL HEADER END
023 *
024 *
025 * Copyright 2006-2008 Sun Microsystems, Inc.
026 */
027 package org.opends.server.schema;
028
029
030
031 import java.util.Arrays;
032
033 import org.opends.server.admin.std.server.ApproximateMatchingRuleCfg;
034 import org.opends.server.api.ApproximateMatchingRule;
035 import org.opends.server.config.ConfigException;
036 import org.opends.server.protocols.asn1.ASN1OctetString;
037 import org.opends.server.types.ByteString;
038 import org.opends.server.types.DirectoryException;
039 import org.opends.server.types.InitializationException;
040 import org.opends.server.types.DebugLogLevel;
041
042 import static org.opends.server.loggers.debug.DebugLogger.*;
043 import org.opends.server.loggers.debug.DebugTracer;
044 import static org.opends.server.schema.SchemaConstants.*;
045
046
047
048 /**
049 * This class defines an approximate matching rule based on the Double Metaphone
050 * algorithm. The Metaphone and Double Metaphone algorithms were originally
051 * devised by Lawrence Philips (published in the December 1990 issue of
052 * <I>Computer Language</I> and the
053 * <A HREF="http://www.cuj.com/documents/s=8038/cuj0006philips/">June 2000 issue
054 * of <I>C/C++ Users Journal</I></A>, respectively), and this version of the
055 * algorithm is based on a version modified by Kevin Atkinson to include
056 * bugfixes and additional functionality (source is available
057 * <A HREF="http://aspell.net/metaphone/dmetaph.cpp">here</A> and additional
058 * Metaphone and Double Metaphone information is available at
059 * <A HREF="http://aspell.net/metaphone/">http://aspell.net/metaphone/</A>).
060 * This implementation is largely the same as the one provided by Kevin
061 * Atkinson, but it has been re-written for better readability, for more
062 * efficiency, to get rid of checks for conditions that can't possibly happen,
063 * and to get rid of redundant checks that aren't needed. It has also been
064 * updated to always only generate a single value rather than one or possibly
065 * two values.
066 */
067 public class DoubleMetaphoneApproximateMatchingRule
068 extends ApproximateMatchingRule
069 {
070 /**
071 * The tracer object for the debug logger.
072 */
073 private static final DebugTracer TRACER = getTracer();
074
075
076
077 /**
078 * Creates a new instance of this double metaphone approximate matching rule.
079 */
080 public DoubleMetaphoneApproximateMatchingRule()
081 {
082 super();
083 }
084
085
086
087 /**
088 * {@inheritDoc}
089 */
090 public void initializeMatchingRule(ApproximateMatchingRuleCfg configuration)
091 throws ConfigException, InitializationException
092 {
093 // No initialization is required.
094 }
095
096
097
098 /**
099 * Retrieves the common name for this matching rule.
100 *
101 * @return The common name for this matching rule, or <CODE>null</CODE> if
102 * it does not have a name.
103 */
104 public String getName()
105 {
106 return AMR_DOUBLE_METAPHONE_NAME;
107 }
108
109
110
111 /**
112 * Retrieves the OID for this matching rule.
113 *
114 * @return The OID for this matching rule.
115 */
116 public String getOID()
117 {
118 return AMR_DOUBLE_METAPHONE_OID;
119 }
120
121
122
123 /**
124 * Retrieves the description for this matching rule.
125 *
126 * @return The description for this matching rule, or <CODE>null</CODE> if
127 * there is none.
128 */
129 public String getDescription()
130 {
131 // There is no standard description for this matching rule.
132 return AMR_DOUBLE_METAPHONE_DESCRIPTION;
133 }
134
135
136
137 /**
138 * Retrieves the OID of the syntax with which this matching rule is
139 * associated.
140 *
141 * @return The OID of the syntax with which this matching rule is associated.
142 */
143 public String getSyntaxOID()
144 {
145 // Approximate matching is really only appropriate for DirectoryString
146 // values.
147 return SYNTAX_DIRECTORY_STRING_OID;
148 }
149
150
151
152 /**
153 * Retrieves the normalized form of the provided value, which is best suited
154 * for efficiently performing matching operations on that value.
155 *
156 * @param value The value to be normalized.
157 *
158 * @return The normalized version of the provided value.
159 *
160 * @throws DirectoryException If the provided value is invalid according to
161 * the associated attribute syntax.
162 */
163 public ByteString normalizeValue(ByteString value)
164 throws DirectoryException
165 {
166 String valueString = value.stringValue();
167 int length = valueString.length();
168 if (length == 0)
169 {
170 // The value is empty, so it is already normalized.
171 return new ASN1OctetString();
172 }
173
174 int last = length - 1;
175
176
177 // Pad the value to allow for checks to go past the end of the value.
178 valueString = valueString.toUpperCase() + " ";
179
180
181 // The metaphone value that is being constructed.
182 StringBuilder metaphone = new StringBuilder(4);
183
184
185 // Skip over GN, KN, PN, WR, and PS at the beginning of a word.
186 int pos = 0;
187 String substring = valueString.substring(0, 2);
188 if (substring.equals("GN") || substring.equals("KN") ||
189 substring.equals("PN") || substring.equals("WR") ||
190 substring.equals("PS"))
191 {
192 pos++;
193 }
194
195
196 // 'X' at the beginning of a word will sound like Z, but Z will always be
197 // mapped to S.
198 else if (valueString.charAt(0) == 'X')
199 {
200 metaphone.append("S");
201 pos++;
202 }
203
204
205 // Loop until we have at least four metaphone characters or have reached the
206 // end of the string.
207 while ((metaphone.length() < 4) && (pos < length))
208 {
209 // Check the character at the current position against various targets.
210 char posMinusFour;
211 char posMinusThree;
212 char posMinusTwo;
213 char posMinusOne;
214 char posPlusOne;
215 char posPlusTwo;
216 switch (valueString.charAt(pos))
217 {
218 case 'A':
219 case 'E':
220 case 'I':
221 case 'O':
222 case 'U':
223 case 'Y':
224 // All initial vowels map to 'A'. All others will be ignored.
225 if (pos == 0)
226 {
227 metaphone.append("A");
228 }
229
230 pos++;
231 break;
232
233
234 case 'B':
235 // B and BB will be mapped to P, with the exception of "MB" as in
236 // "crumb", but that will be handled elsewhere.
237 metaphone.append("P");
238
239 if (valueString.charAt(++pos) == 'B')
240 {
241 pos++;
242 }
243
244 break;
245
246
247 case 'C':
248 // Check for various Germanic sequences, which will be mapped to 'K'.
249 // This basically includes all occurrences of "ACH" where the
250 // preceding character is not a vowel and the following character is
251 // neither an 'E' nor an 'I' except in "BACHER" and "MACHER".
252 if ((pos > 1) &&
253 (! isVowel(posMinusTwo = valueString.charAt(pos-2))) &&
254 hasSubstring(valueString, pos-1, "ACH") &&
255 ((posPlusTwo = valueString.charAt(pos+2)) != 'I') &&
256 ((posPlusTwo != 'E') ||
257 ((valueString.charAt(pos+3) == 'R') &&
258 ((posMinusTwo == 'B') || (posMinusTwo == 'M')))))
259 {
260 metaphone.append("K");
261 pos += 2;
262 break;
263 }
264
265
266 // Check for a special case of "caesar", which will be maped to 'S'.
267 if ((pos == 0) && hasSubstring(valueString, pos+1, "AESAR"))
268 {
269 metaphone.append("S");
270 pos += 2;
271 break;
272 }
273
274
275 // CH can be treated in lots of different ways.
276 if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
277 {
278 // Check for "chia" as in "chianti" and map to 'K'.
279 if (hasSubstring(valueString, pos+2, "IA"))
280 {
281 metaphone.append("K");
282 pos += 2;
283 break;
284 }
285
286 // Check for "chae" as in "michael" and map to 'K'.
287 if (hasSubstring(valueString, pos+2, "AE"))
288 {
289 metaphone.append("K");
290 pos += 2;
291 break;
292 }
293
294 // Check for a Greek root at the beginning of the value like
295 // chemistry or chorus and map to 'K'.
296 if ((pos == 0) && (! hasSubstring(valueString, 2, "ORE")) &&
297 (hasSubstring(valueString, 2, "ARAC") ||
298 hasSubstring(valueString, 2, "ARIS") ||
299 hasSubstring(valueString, 2, "OR") ||
300 hasSubstring(valueString, 2, "YM") ||
301 hasSubstring(valueString, 2, "IA") ||
302 hasSubstring(valueString, 2, "EM")))
303 {
304 metaphone.append("K");
305 pos += 2;
306 break;
307 }
308
309
310 // Check for "CH" values that produce a "KH" sound that will be
311 // mapped to 'K'.
312 if (isGermanic(valueString) ||
313 hasSubstring(valueString, pos-2, "ORCHES") ||
314 hasSubstring(valueString, pos-2, "ARCHIT") ||
315 hasSubstring(valueString, pos-2, "ORCHID") ||
316 ((posPlusTwo = valueString.charAt(pos+2)) == 'T') ||
317 (posPlusTwo == 'S') ||
318 (((pos == 0) ||
319 (((posMinusOne = valueString.charAt(pos-1)) == 'A') ||
320 (posMinusOne == 'O') || (posMinusOne == 'U') ||
321 (posMinusOne == 'E'))) &&
322 ((posPlusTwo == 'L') || (posPlusTwo == 'R') ||
323 (posPlusTwo == 'N')|| (posPlusTwo == 'M') ||
324 (posPlusTwo == 'B')|| (posPlusTwo == 'H') ||
325 (posPlusTwo == 'F')|| (posPlusTwo == 'V') ||
326 (posPlusTwo == 'W'))))
327 {
328 metaphone.append("K");
329 pos += 2;
330 break;
331 }
332
333
334 // All other "CH" values.
335 if (pos > 0)
336 {
337 if (hasSubstring(valueString, 0, "MC"))
338 {
339 metaphone.append("K");
340 }
341 else
342 {
343 metaphone.append("X");
344 }
345 }
346 else
347 {
348 metaphone.append("X");
349 }
350
351 pos += 2;
352 break;
353 }
354
355
356 // Check for "CZ" as in "czerny" but not "wicz" and map to 'S'.
357 if ((posPlusOne == 'Z') &&
358 (! hasSubstring(valueString, pos-2, "WI")))
359 {
360 metaphone.append("S");
361 pos += 2;
362 break;
363 }
364
365
366 // Check for "CIA" as in "focaccia" and map to 'X'.
367 if ((posPlusOne == 'I') && (valueString.charAt(pos+2) == 'A'))
368 {
369 metaphone.append("X");
370 pos += 3;
371 break;
372 }
373
374
375 // Check for a double C but not in values that start with "McC"
376 if ((posPlusOne == 'C') &&
377 (! ((pos == 1) && valueString.charAt(0) == 'M')))
378 {
379 if ((((posPlusTwo = valueString.charAt(pos+2)) == 'I') ||
380 (posPlusTwo == 'E') || (posPlusTwo == 'H')) &&
381 (! ((posPlusTwo == 'H') && valueString.charAt(pos+3) == 'U')))
382 {
383 if (((pos == 1) && (valueString.charAt(pos-1) == 'A')) ||
384 hasSubstring(valueString, pos-1, "UCCEE") ||
385 hasSubstring(valueString, pos-1, "UCCES"))
386 {
387 // Values like "accident", "accede", and "succeed".
388 metaphone.append("K");
389 pos += 2;
390 break;
391 }
392 else
393 {
394 // Values like "bacci" or "bertucci".
395 metaphone.append("X");
396 pos += 3;
397 break;
398 }
399 }
400 else
401 {
402 // This is Pierce's Rule, whatever that means.
403 metaphone.append("K");
404 pos += 2;
405 break;
406 }
407 }
408
409
410 // Check for CK, CG, or CQ and map to 'K'. Check for CI, CE, and CY
411 // and map to "S".
412 if (((posPlusOne = valueString.charAt(pos+1)) == 'K') ||
413 (posPlusOne == 'G') || (posPlusOne == 'Q'))
414 {
415 metaphone.append("K");
416 pos += 2;
417 break;
418 }
419
420
421 // Check for CI, CE, or CY and map to 'S'.
422 if ((posPlusOne == 'I') || (posPlusOne == 'E') || (posPlusOne == 'Y'))
423 {
424 metaphone.append("S");
425 pos += 2;
426 break;
427 }
428
429
430 // All other cases of "C" will be mapped to 'K'. However, the number
431 // of positions that we skip ahead may vary. If there is a value that
432 // consists of two words like "mac caffrey", then skip ahead three.
433 // For the character combinations of "CK" and "CQ", then skip ahead
434 // two. For the character combinations of "CC" except "CCE" and
435 // "CCI", then skip ahead two. For all other cases, skip ahead one.
436 metaphone.append("K");
437 switch (valueString.charAt(pos+1))
438 {
439 case ' ':
440 switch (valueString.charAt(pos+2))
441 {
442 case 'C':
443 case 'Q':
444 case 'G':
445 pos += 3;
446 break;
447 default:
448 pos++;
449 break;
450 }
451 break;
452
453 case 'K':
454 case 'Q':
455 pos += 2;
456 break;
457
458 case 'C':
459 switch (valueString.charAt(pos+2))
460 {
461 case 'E':
462 case 'I':
463 pos++;
464 break;
465 default:
466 pos += 2;
467 break;
468 }
469 break;
470 default:
471 pos++;
472 }
473 break;
474
475
476 case 'D':
477 // DG will be mapped to either 'J' (in cases like edge) or 'TK' (in
478 // cases like Edgar).
479 if ((posPlusOne = valueString.charAt(pos+1)) == 'G')
480 {
481 if (((posPlusTwo = valueString.charAt(pos+2)) == 'I') ||
482 (posPlusTwo == 'E') || (posPlusTwo == 'Y'))
483 {
484 metaphone.append("J");
485 pos += 3;
486 break;
487 }
488 else
489 {
490 metaphone.append("TK");
491 pos += 2;
492 break;
493 }
494 }
495
496
497 // DT and DD will be mapped to 'T'.
498 if ((posPlusOne == 'T') || (posPlusOne == 'D'))
499 {
500 metaphone.append("T");
501 pos += 2;
502 break;
503 }
504
505
506 // All other cases will be mapped to 'T'.
507 metaphone.append("T");
508 pos++;
509 break;
510
511
512 case 'F':
513 // F always maps to F. If there is a double F, then skip the second
514 // one.
515 metaphone.append("F");
516 pos++;
517 if (valueString.charAt(pos) == 'F')
518 {
519 pos++;
520 }
521 break;
522
523
524 case 'G':
525 if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
526 {
527 // A "GH" that is not preceded by a vowel will be mapped to 'K'.
528 if ((pos > 0) && (! isVowel(valueString.charAt(pos-1))))
529 {
530 metaphone.append("K");
531 pos += 2;
532 break;
533 }
534
535 if (pos == 0)
536 {
537 if (valueString.charAt(pos+2) == 'I')
538 {
539 // Words like ghislane or ghiradelli
540 metaphone.append("J");
541 }
542 else
543 {
544 metaphone.append("K");
545 }
546
547 pos += 2;
548 break;
549 }
550
551 // A refined version of Parker's Rule.
552 if (((pos > 1) &&
553 (((posMinusTwo = valueString.charAt(pos-2)) == 'B') ||
554 (posMinusTwo == 'H') || (posMinusTwo == 'D'))) ||
555 ((pos > 2) &&
556 (((posMinusThree = valueString.charAt(pos-3)) == 'B') ||
557 (posMinusThree == 'H') || (posMinusThree == 'D'))) ||
558 ((pos > 3) &&
559 (((posMinusFour = valueString.charAt(pos-4)) == 'B') ||
560 (posMinusFour == 'H'))))
561 {
562 pos += 2;
563 break;
564 }
565 else
566 {
567 if ((pos > 2) && (valueString.charAt(pos-1) == 'U') &&
568 (((posMinusThree = valueString.charAt(pos-3)) == 'C') ||
569 (posMinusThree == 'G') || (posMinusThree == 'L') ||
570 (posMinusThree == 'R') || (posMinusThree == 'T')))
571 {
572 // Words like laugh, McLaughlin, cough, rough are mapped to 'F'.
573 metaphone.append("F");
574 }
575 else if ((pos > 0) && (valueString.charAt(pos-1) != 'I'))
576 {
577 metaphone.append("K");
578 }
579
580 pos += 2;
581 break;
582 }
583 }
584
585
586 if (posPlusOne == 'N')
587 {
588 if ((pos == 1) && isVowel(valueString.charAt(0)) &&
589 (! isSlavoGermanic(valueString)))
590 {
591 metaphone.append("KN");
592 pos += 2;
593 break;
594 }
595 else
596 {
597 if ((! hasSubstring(valueString, pos+2, "EY")) &&
598 (! isSlavoGermanic(valueString)))
599 {
600 metaphone.append("N");
601 }
602 else
603 {
604 metaphone.append("KN");
605 }
606
607 pos += 2;
608 break;
609 }
610 }
611
612
613 // GLI as in tagliaro will be mapped to "KL".
614 if ((posPlusOne == 'L') && (valueString.charAt(pos+2) == 'I'))
615 {
616 metaphone.append("KL");
617 pos += 2;
618 break;
619 }
620
621
622 // Forms of GY, GE, and GI at the beginning of a word will map to 'K'.
623 if ((pos == 0) &&
624 ((posPlusOne == 'Y') ||
625 (substring = valueString.substring(pos+1,pos+3)).equals("ES") ||
626 substring.equals("EP") || substring.equals("EB") ||
627 substring.equals("EL") || substring.equals("EY") ||
628 substring.equals("IB") || substring.equals("IL") ||
629 substring.equals("IN") || substring.equals("IE") ||
630 substring.equals("EI") || substring.equals("ER")))
631 {
632 metaphone.append("K");
633 pos += 2;
634 break;
635 }
636
637
638 // Some occurrences of GER and GY in a word will be mapped to 'K'.
639 posPlusTwo = valueString.charAt(pos+2);
640 if ((((posPlusOne == 'E') && (posPlusTwo == 'R')) ||
641 (posPlusOne == 'Y')) &&
642 ((posMinusOne = valueString.charAt(pos-1)) != 'E') &&
643 (posMinusOne != 'I') &&
644 (! hasSubstring(valueString, 0, "DANGER")) &&
645 (! hasSubstring(valueString, 0, "RANGER")) &&
646 (! hasSubstring(valueString, 0, "MANGER")) &&
647 (! hasSubstring(valueString, pos-1, "RGY")) &&
648 (! hasSubstring(valueString, pos-1, "OGY")))
649 {
650 metaphone.append("K");
651 pos += 2;
652 break;
653 }
654
655
656 // Check for Italian uses like 'biaggi" and map to 'J'.
657 if ((posPlusOne == 'E') || (posPlusOne == 'I') ||
658 (posPlusOne == 'Y') ||
659 hasSubstring(valueString, pos-1, "AGGI") ||
660 hasSubstring(valueString, pos-1, "OGGI"))
661 {
662 // Germanic uses will be mapped to 'K'.
663 if (isGermanic(valueString) ||
664 hasSubstring(valueString, pos+1, "ET"))
665 {
666 metaphone.append("K");
667 }
668 else
669 {
670 metaphone.append("J");
671 }
672
673 pos += 2;
674 break;
675 }
676
677
678 // All other cases will be mapped to 'K'. If there is a double G,
679 // then skip two. Otherwise, just skip one.
680 metaphone.append("K");
681 pos++;
682
683 if (posPlusOne == 'G')
684 {
685 pos++;
686 }
687
688 break;
689
690
691 case 'H':
692 // The letter 'H' will only be processed if it is immediately followed
693 // by a vowel and is either the start of the word or preceded by a
694 // vowel.
695 if (isVowel(valueString.charAt(pos+1)))
696 {
697 if ((pos == 0) || isVowel(valueString.charAt(pos-1)))
698 {
699 metaphone.append("H");
700 pos++;
701 }
702 }
703
704 pos++;
705 break;
706
707
708 case 'J':
709 // Take care of obvious Spanish uses that should map to 'H'.
710 if (hasSubstring(valueString, 0, "SAN "))
711 {
712 metaphone.append("H");
713 pos++;
714 break;
715 }
716
717 if (hasSubstring(valueString, pos, "JOSE"))
718 {
719 if ((pos == 0) && (valueString.charAt(pos+4) == ' '))
720 {
721 metaphone.append("H");
722 }
723 else
724 {
725 metaphone.append("J");
726 }
727
728 pos++;
729 break;
730 }
731
732
733 // All other cases will be mapped to 'J'.
734 metaphone.append("J");
735
736 if (valueString.charAt(pos+1) == 'J')
737 {
738 pos++;
739 }
740
741 pos++;
742 break;
743
744
745 case 'K':
746 // 'K' will always be mapped to 'K'. KK will be treated like K.
747 metaphone.append("K");
748
749 if (valueString.charAt(pos+1) == 'K')
750 {
751 pos++;
752 }
753
754 pos++;
755 break;
756
757
758 case 'L':
759 // 'L' will always be mapped to 'L'. LL will be treated like L, even
760 // for potential Spanish uses.
761 metaphone.append("L");
762
763 if (valueString.charAt(pos+1) == 'L')
764 {
765 pos++;
766 }
767
768 pos++;
769 break;
770
771
772 case 'M':
773 // 'M' will always be mapped to 'M'. MM will be treated like M.
774 // UMB in cases like "dumb" and "thumb" will be treated like M.
775 metaphone.append("M");
776
777 if (valueString.charAt(pos+1) == 'M')
778 {
779 pos++;
780 }
781 else if (hasSubstring(valueString, pos-1, "UMB"))
782 {
783 if (((pos+1) == last) ||
784 hasSubstring(valueString, pos+2, "ER"))
785 {
786 pos++;
787 }
788 }
789
790 pos++;
791 break;
792
793
794 case 'N':
795 // 'N' will always be mapped to 'N'. NN will be treated like N.
796 metaphone.append("N");
797
798 if (valueString.charAt(pos+1) == 'N')
799 {
800 pos++;
801 }
802
803 pos++;
804 break;
805
806
807 case 'P':
808 // PH will be mapped to 'F'.
809 if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
810 {
811 metaphone.append("F");
812 pos += 2;
813 break;
814 }
815
816
817 // All other cases will be mapped to 'P', with PP and PB being treated
818 // like P.
819 metaphone.append("P");
820
821 if ((posPlusOne == 'P') || (posPlusOne == 'B'))
822 {
823 pos++;
824 }
825
826 pos++;
827 break;
828
829
830 case 'Q':
831 // 'Q' will always be mapped to 'K'. QQ will be treated like Q.
832 metaphone.append("K");
833
834 if (valueString.charAt(pos+1) == 'Q')
835 {
836 pos++;
837 }
838
839 pos++;
840 break;
841
842
843 case 'R':
844 // Ignore R at the end of French words.
845 if ((pos == last) && (! isSlavoGermanic(valueString)) &&
846 hasSubstring(valueString, pos-2, "IE") &&
847 (! hasSubstring(valueString, pos-4, "ME")) &&
848 (! hasSubstring(valueString, pos-4, "MA")))
849 {
850 pos++;
851 break;
852 }
853
854
855 // All other cases will be mapped to 'R', with RR treated like R.
856 metaphone.append("R");
857
858 if (valueString.charAt(pos+1) == 'R')
859 {
860 pos++;
861 }
862
863 pos++;
864 break;
865
866
867 case 'S':
868 // Special cases like isle and carlysle will be silent.
869 if (hasSubstring(valueString, pos-1, "ISL") ||
870 hasSubstring(valueString, pos-1, "YSL"))
871 {
872 pos++;
873 break;
874 }
875
876
877 // Special case of sugar mapped to 'X'.
878 if (hasSubstring(valueString, pos+1, "UGAR"))
879 {
880 metaphone.append("X");
881 pos++;
882 break;
883 }
884
885
886 // SH is generally mapped to 'X', but not in Germanic cases.
887 if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
888 {
889 if (hasSubstring(valueString, pos+1, "HEIM") ||
890 hasSubstring(valueString, pos+1, "HOEK") ||
891 hasSubstring(valueString, pos+1, "HOLM") ||
892 hasSubstring(valueString, pos+1, "HOLZ"))
893 {
894 metaphone.append("S");
895 }
896 else
897 {
898 metaphone.append("X");
899 }
900
901 pos += 2;
902 break;
903 }
904
905
906 // Italian and Armenian cases will map to "S".
907 if (hasSubstring(valueString, pos+1, "IO") ||
908 hasSubstring(valueString, pos+1, "IA"))
909 {
910 metaphone.append("S");
911 pos += 3;
912 break;
913 }
914
915
916 // SZ should be mapped to 'S'.
917 if (posPlusOne == 'Z')
918 {
919 metaphone.append("S");
920 pos += 2;
921 break;
922 }
923
924
925 // Various combinations at the beginning of words will be mapped to
926 // 'S'.
927 if ((pos == 0) &&
928 ((posPlusOne == 'M') || (posPlusOne == 'N') ||
929 (posPlusOne == 'L') || (posPlusOne == 'W')))
930 {
931 metaphone.append("S");
932 pos++;
933 break;
934 }
935
936
937 // SC should be mapped to either SK, X, or S.
938 if (posPlusOne == 'C')
939 {
940 if ((posPlusTwo = valueString.charAt(pos+2)) == 'H')
941 {
942 if (hasSubstring(valueString, pos+3, "OO") ||
943 hasSubstring(valueString, pos+3, "UY") ||
944 hasSubstring(valueString, pos+3, "ED") ||
945 hasSubstring(valueString, pos+3, "EM"))
946 {
947 metaphone.append("SK");
948 }
949 else
950 {
951 metaphone.append("X");
952 }
953
954 pos += 3;
955 break;
956 }
957
958 if ((posPlusTwo == 'I') || (posPlusTwo == 'E') ||
959 (posPlusTwo == 'Y'))
960 {
961 metaphone.append("S");
962 pos += 3;
963 break;
964 }
965
966 metaphone.append("SK");
967 pos += 3;
968 break;
969 }
970
971
972 // Ignore a trailing S in French words. All others will be mapped to
973 // 'S'.
974 if (! ((pos == last) &&
975 (hasSubstring(valueString, pos-2, "AI") ||
976 hasSubstring(valueString, pos-2, "OI"))))
977 {
978 metaphone.append("S");
979 }
980
981 if ((posPlusOne == 'S') || (posPlusOne == 'Z'))
982 {
983 pos++;
984 }
985
986 pos++;
987 break;
988
989
990 case 'T':
991 // "TION", "TIA", and "TCH" will be mapped to 'X'.
992 if (hasSubstring(valueString, pos, "TION") ||
993 hasSubstring(valueString, pos, "TIA") ||
994 hasSubstring(valueString, pos, "TCH"))
995 {
996 metaphone.append("X");
997 pos += 3;
998 break;
999 }
1000
1001
1002 // TH or TTH will be mapped to either T (for Germanic cases) or
1003 // 0 (zero) for the rest.
1004 if (((posPlusOne = valueString.charAt(pos+1)) == 'H') ||
1005 ((posPlusOne == 'T') && (valueString.charAt(pos+2) == 'H')))
1006 {
1007 if (isGermanic(valueString) ||
1008 hasSubstring(valueString, pos+2, "OM") ||
1009 hasSubstring(valueString, pos+2, "AM"))
1010 {
1011 metaphone.append("T");
1012 }
1013 else
1014 {
1015 metaphone.append("0");
1016 }
1017
1018 pos += 2;
1019 break;
1020 }
1021
1022
1023 // All other cases will map to T, with TT and TD being treated like T.
1024 metaphone.append("T");
1025
1026 if ((posPlusOne == 'T') || (posPlusOne == 'D'))
1027 {
1028 pos++;
1029 }
1030
1031 pos++;
1032 break;
1033
1034
1035 case 'V':
1036 // 'V' will always be mapped to 'F', with VV treated like V.
1037 metaphone.append("F");
1038
1039 if (valueString.charAt(pos+1) == 'V')
1040 {
1041 pos++;
1042 }
1043
1044 pos++;
1045 break;
1046
1047
1048 case 'W':
1049 // WR should always map to R.
1050 if ((posPlusOne = valueString.charAt(pos+1)) == 'R')
1051 {
1052 metaphone.append("R");
1053 pos += 2;
1054 break;
1055 }
1056
1057
1058 // W[AEIOUYH] at the beginning of the word should be mapped to A.
1059 if ((pos == 0) && (isVowel(posPlusOne) || (posPlusOne == 'H')))
1060 {
1061 metaphone.append("A");
1062
1063 // FIXME -- This isn't in the algorithm as written. Should it be?
1064 pos += 2;
1065 break;
1066 }
1067
1068
1069 // A Polish value like WICZ or WITZ should be mapped to TS.
1070 if (hasSubstring(valueString, pos+1, "WICZ") ||
1071 hasSubstring(valueString, pos+1, "WITZ"))
1072 {
1073 metaphone.append("TS");
1074 pos += 4;
1075 break;
1076 }
1077
1078
1079 // Otherwise, we'll just skip it.
1080 pos++;
1081 break;
1082
1083
1084 case 'X':
1085 // X maps to KS except at the end of French words.
1086 if (! ((pos == last) &&
1087 (hasSubstring(valueString, pos-3, "IAU") ||
1088 hasSubstring(valueString, pos-3, "EAU") ||
1089 hasSubstring(valueString, pos-2, "AU") ||
1090 hasSubstring(valueString, pos-2, "OU"))))
1091 {
1092 metaphone.append("KS");
1093 }
1094
1095 if (((posPlusOne = valueString.charAt(pos+1)) == 'C') ||
1096 (posPlusOne == 'X'))
1097 {
1098 pos++;
1099 }
1100
1101 pos++;
1102 break;
1103
1104
1105 case 'Z':
1106 // Chinese usages like zhao will map to J.
1107 if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
1108 {
1109 metaphone.append("J");
1110 pos += 2;
1111 break;
1112 }
1113
1114
1115 // All other cases map to "S". ZZ will be treated like Z.
1116 metaphone.append("S");
1117
1118 if (posPlusOne == 'Z')
1119 {
1120 pos++;
1121 }
1122
1123 pos++;
1124 break;
1125
1126
1127 case '\u00C7': // C with a cedilla
1128 // This will always be mapped to 'S'.
1129 metaphone.append("S");
1130 pos++;
1131 break;
1132
1133
1134 case '\u00D1': // N with a tilde
1135 // This will always be mapped to 'N'.
1136 metaphone.append("N");
1137 pos++;
1138 break;
1139
1140
1141 default:
1142 // We don't have any special treatment for this character, so skip it.
1143 pos++;
1144 break;
1145 }
1146 }
1147
1148
1149 return new ASN1OctetString(metaphone.toString());
1150 }
1151
1152
1153
1154 /**
1155 * Indicates whether the two provided normalized values are approximately
1156 * equal to each other.
1157 *
1158 * @param value1 The normalized form of the first value to compare.
1159 * @param value2 The normalized form of the second value to compare.
1160 *
1161 * @return <CODE>true</CODE> if the provided values are approximately equal,
1162 * or <CODE>false</CODE> if not.
1163 */
1164 public boolean approximatelyMatch(ByteString value1, ByteString value2)
1165 {
1166 // If the values have been normalized, then we just need to compare their
1167 // byte arrays.
1168 return Arrays.equals(value1.value(), value2.value());
1169 }
1170
1171
1172
1173 /**
1174 * Indicates whether the provided value has the given substring at the
1175 * specified position.
1176 *
1177 * @param value The value containing the range for which to make the
1178 * determination.
1179 * @param start The position in the value at which to start the
1180 * comparison.
1181 * @param substring The substring to compare against the specified value
1182 * range.
1183 *
1184 * @return <CODE>true</CODE> if the specified portion of the value matches
1185 * the given substring, or <CODE>false</CODE> if it does not.
1186 */
1187 private boolean hasSubstring(String value, int start,
1188 String substring)
1189 {
1190 try
1191 {
1192 // This can happen since a lot of the rules "look behind" and
1193 // rightfully don't check if it's the first character
1194 if (start < 0) {
1195 return false;
1196 }
1197
1198 int end = start + substring.length();
1199
1200 // value isn't big enough to do the comparison
1201 if (end > value.length())
1202 {
1203 return false;
1204 }
1205
1206 for (int i=0,pos=start; pos < end; i++,pos++)
1207 {
1208 if (value.charAt(pos) != substring.charAt(i))
1209 {
1210 return false;
1211 }
1212 }
1213
1214 return true;
1215 }
1216 catch (Exception e)
1217 {
1218 if (debugEnabled())
1219 {
1220 TRACER.debugCaught(DebugLogLevel.ERROR, e);
1221 }
1222
1223 return false;
1224 }
1225 }
1226
1227
1228
1229 /**
1230 * Indicates whether the provided character is a vowel (including "Y").
1231 *
1232 * @param c The character for which to make the determination.
1233 *
1234 * @return <CODE>true</CODE> if the provided character is a vowel, or
1235 * <CODE>false</CODE> if not.
1236 */
1237 private boolean isVowel(char c)
1238 {
1239 switch (c)
1240 {
1241 case 'A':
1242 case 'E':
1243 case 'I':
1244 case 'O':
1245 case 'U':
1246 case 'Y':
1247 return true;
1248
1249 default:
1250 return false;
1251 }
1252 }
1253
1254
1255
1256 /**
1257 * Indicates whether the provided string appears to be Slavo-Germanic.
1258 *
1259 * @param s The string for which to make the determination.
1260 *
1261 * @return <CODE>true</CODE> if the provided string appears to be
1262 * Slavo-Germanic, or <CODE>false</CODE> if not.
1263 */
1264 private boolean isSlavoGermanic(String s)
1265 {
1266 return (s.contains("W") || s.contains("K") || s.contains("CZ") ||
1267 s.contains("WITZ"));
1268 }
1269
1270
1271
1272 /**
1273 * Indicates whether the provided string appears Germanic (starts with "VAN ",
1274 * "VON ", or "SCH").
1275 *
1276 * @param s The string for which to make the determination.
1277 *
1278 * @return <CODE>true</CODE> if the provided string appears Germanic, or
1279 * <CODE>false</CODE> if not.
1280 */
1281 private boolean isGermanic(String s)
1282 {
1283 return (s.startsWith("VAN ") || s.startsWith("VON ") ||
1284 s.startsWith("SCH"));
1285 }
1286 }
1287