DiacriticStripper.java

1
/*******************************************************************************
2
 * Copyright (C) 2026, Leo Galambos
3
 * All rights reserved.
4
 * 
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions are met:
7
 * 
8
 * 1. Redistributions of source code must retain the above copyright notice,
9
 *    this list of conditions and the following disclaimer.
10
 * 
11
 * 2. Redistributions in binary form must reproduce the above copyright notice,
12
 *    this list of conditions and the following disclaimer in the documentation
13
 *    and/or other materials provided with the distribution.
14
 * 
15
 * 3. Neither the name of the copyright holder nor the names of its contributors
16
 *    may be used to endorse or promote products derived from this software
17
 *    without specific prior written permission.
18
 * 
19
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29
 * POSSIBILITY OF SUCH DAMAGE.
30
 ******************************************************************************/
31
package org.egothor.stemmer;
32
33
import java.text.Normalizer;
34
import java.text.Normalizer.Form;
35
36
/**
37
 * Utility that strips diacritics from text for diacritic-insensitive trie
38
 * storage and lookup.
39
 */
40
final class DiacriticStripper {
41
42
    /**
43
     * Direct single-character replacement table, indexed by the source
     * {@code char} value. An entry of {@code '\0'} (the array default) means that
     * no direct replacement is registered for that character.
44
     */
45
    private static final char[] DIRECT_REPLACEMENTS = new char[Character.MAX_VALUE + 1];
46
47
    // Populates DIRECT_REPLACEMENTS: every char of each string literal below is
    // mapped to the ASCII letter passed as the second argument.
    static {
48
        registerSingle("áàâäãåāăąǎȁȃạảấầẩẫậắằẳẵặ", 'a');
49
        registerSingle("ÁÀÂÄÃÅĀĂĄǍȀȂẠẢẤẦẨẪẬẮẰẲẴẶ", 'A');
50
        registerSingle("çćĉċč", 'c');
51
        registerSingle("ÇĆĈĊČ", 'C');
52
        registerSingle("ďđḍ", 'd');
53
        registerSingle("ĎĐḌ", 'D');
54
        registerSingle("éèêëēĕėęěȅȇẹẻẽếềểễệ", 'e');
55
        registerSingle("ÉÈÊËĒĔĖĘĚȄȆẸẺẼẾỀỂỄỆ", 'E');
56
        registerSingle("ğĝġģǧ", 'g');
57
        registerSingle("ĞĜĠĢǦ", 'G');
58
        registerSingle("ĥħ", 'h');
59
        registerSingle("ĤĦ", 'H');
60
        registerSingle("íìîïĩīĭįıǐȉȋịỉ", 'i');
61
        registerSingle("ÍÌÎÏĨĪĬĮİǏȈȊỊỈ", 'I');
62
        registerSingle("ĵ", 'j');
63
        registerSingle("Ĵ", 'J');
64
        registerSingle("ķǩ", 'k');
65
        registerSingle("ĶǨ", 'K');
66
        registerSingle("ĺļľŀł", 'l');
67
        registerSingle("ĹĻĽĿŁ", 'L');
68
        registerSingle("ñńņňʼnŋ", 'n'); // NOTE(review): confirm the entry after ň is the single code point U+0149 and not U+02BC followed by a plain n; the latter would map the modifier apostrophe to n
69
        registerSingle("ÑŃŅŇŊ", 'N');
70
        registerSingle("óòôöõōŏőǒȍȏọỏốồổỗộớờởỡợø", 'o');
71
        registerSingle("ÓÒÔÖÕŌŎŐǑȌȎỌỎỐỒỔỖỘỚỜỞỠỢØ", 'O');
72
        registerSingle("ŕŗř", 'r');
73
        registerSingle("ŔŖŘ", 'R');
74
        registerSingle("śŝşšș", 's');
75
        registerSingle("ŚŜŞŠȘ", 'S');
76
        registerSingle("ťţŧț", 't');
77
        registerSingle("ŤŢŦȚ", 'T');
78
        registerSingle("úùûüũūŭůűųǔȕȗụủứừửữự", 'u');
79
        registerSingle("ÚÙÛÜŨŪŬŮŰŲǓȔȖỤỦỨỪỬỮỰ", 'U');
80
        registerSingle("ýÿŷỳỵỷỹ", 'y');
81
        registerSingle("ÝŶŸỲỴỶỸ", 'Y');
82
        registerSingle("źżž", 'z');
83
        registerSingle("ŹŻŽ", 'Z');
84
        registerSingle("þ", 't'); // thorn is folded to the single letter t
85
        registerSingle("Þ", 'T'); // capital thorn is folded to the single letter T
86
    }
87
88
    /**
89
     * Not instantiable: the constructor is private and always throws.
     *
     * @throws AssertionError always
90
     */
91
    private DiacriticStripper() {
92
        throw new AssertionError("No instances.");
93
    }
94
95
    /**
96
     * Removes supported diacritic marks and common Latin ligatures from the supplied
97
     * text. The input is processed one {@code char} at a time; a character with no
     * replacement is copied to the result unchanged.
98
     *
99
     * <p>
100
     * The method returns the original {@link String} instance when no replacement is
101
     * required, avoiding an unnecessary allocation on the common ASCII path.
102
     * </p>
103
     *
104
     * @param input text to normalize; must not be {@code null}
105
     * @return normalized text, or {@code input} itself when no character required a replacement
     * @throws NullPointerException if {@code input} is {@code null}
106
     */
107
    /* default */ static String strip(final String input) {
108
        StringBuilder normalized = null; // created lazily, on the first character that needs replacing
109
110 2 1. strip : negated conditional → KILLED
2. strip : changed conditional boundary → KILLED
        for (int index = 0; index < input.length(); index++) {
111
            final char source = input.charAt(index);
112
            final String replacement = replacementFor(source);
113
114 1 1. strip : negated conditional → KILLED
            if (replacement == null) {
115 1 1. strip : negated conditional → KILLED
                if (normalized != null) {
116
                    normalized.append(source); // copying has started, so keep the unchanged character
117
                }
118
                continue;
119
            }
120
121 1 1. strip : negated conditional → KILLED
            if (normalized == null) {
122
                normalized = new StringBuilder(input.length()); // NOPMD - invariant: only once
123
                normalized.append(input, 0, index); // carry over the untouched prefix
124
            }
125
            normalized.append(replacement);
126
        }
127
128 1 1. strip : negated conditional → KILLED
        if (normalized == null) {
129 1 1. strip : replaced return value with "" for org/egothor/stemmer/DiacriticStripper::strip → KILLED
            return input;
130
        }
131 1 1. strip : replaced return value with "" for org/egothor/stemmer/DiacriticStripper::strip → KILLED
        return normalized.toString();
132
    }
133
134
    /**
135
     * Returns the replacement text for one character.
     *
     * <p>
     * Lookup order: ASCII characters (up to {@code 0x007F}) are never replaced;
     * then the {@code DIRECT_REPLACEMENTS} table is consulted; then the fixed
     * multi-character cases (sharp s and the AE/OE ligatures); finally the
     * character is decomposed with {@link Form#NFD}, non-spacing marks are
     * dropped and only the ASCII parts of the decomposition are kept.
     * </p>
136
     *
137
     * @param source source character
138
     * @return replacement text, or {@code null} when the character should be kept
139
     *         unchanged (ASCII input, or nothing ASCII remains after decomposition)
140
     */
141
    @SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
142
    private static String replacementFor(final char source) {
143 2 1. replacementFor : changed conditional boundary → SURVIVED
2. replacementFor : negated conditional → KILLED
        if (source <= 0x007F) {
144 1 1. replacementFor : replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED
            return null;
145
        }
146
147
        final char mapped = DIRECT_REPLACEMENTS[source];
148 1 1. replacementFor : negated conditional → KILLED
        if (mapped != '\0') {
149 1 1. replacementFor : replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED
            return String.valueOf(mapped);
150
        }
151
152 1 1. replacementFor : negated conditional → KILLED
        if (source == 'ß') {
153 1 1. replacementFor : replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED
            return "ss";
154
        }
155 1 1. replacementFor : negated conditional → KILLED
        if (source == 'Æ') {
156 1 1. replacementFor : replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED
            return "AE";
157
        }
158 1 1. replacementFor : negated conditional → KILLED
        if (source == 'æ') {
159 1 1. replacementFor : replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED
            return "ae";
160
        }
161 1 1. replacementFor : negated conditional → KILLED
        if (source == 'Œ') {
162 1 1. replacementFor : replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED
            return "OE";
163
        }
164 1 1. replacementFor : negated conditional → KILLED
        if (source == 'œ') {
165 1 1. replacementFor : replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED
            return "oe";
166
        }
167
168
        final String decomposed = Normalizer.normalize(String.valueOf(source), Form.NFD); // fallback: canonical decomposition of this single character
169
        final StringBuilder ascii = new StringBuilder(decomposed.length());
170 2 1. replacementFor : negated conditional → SURVIVED
2. replacementFor : changed conditional boundary → KILLED
        for (int index = 0; index < decomposed.length(); index++) {
171
            final char part = decomposed.charAt(index);
172 1 1. replacementFor : negated conditional → SURVIVED
            if (Character.getType(part) == Character.NON_SPACING_MARK) {
173
                continue; // drop non-spacing marks
174
            }
175 2 1. replacementFor : negated conditional → SURVIVED
2. replacementFor : changed conditional boundary → SURVIVED
            if (part <= 0x007F) {
176
                ascii.append(part); // parts outside ASCII are discarded
177
            }
178
        }
179
180 1 1. replacementFor : negated conditional → KILLED
        if (ascii.length() == 0) {
181 1 1. replacementFor : replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED
            return null;
182
        }
183 1 1. replacementFor : replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → NO_COVERAGE
        return ascii.toString();
184
    }
185
186
    /**
187
     * Registers one-character replacements for a set of source characters.
     * Each {@code char} of {@code sourceCharacters} is used as an index into
     * {@code DIRECT_REPLACEMENTS}; registering the same character again
     * overwrites the earlier entry.
188
     *
189
     * @param sourceCharacters characters to replace
190
     * @param replacement      replacement character
191
     */
192
    private static void registerSingle(final String sourceCharacters, final char replacement) {
193
        for (int index = 0; index < sourceCharacters.length(); index++) {
194
            DIRECT_REPLACEMENTS[sourceCharacters.charAt(index)] = replacement;
195
        }
196
    }
197
}

Mutations

110

1.1
Location : strip
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:mixedInputPreservesUntouchedCharactersAfterNormalizationStarts()]
negated conditional → KILLED

2.2
Location : strip
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:mixedInputPreservesUntouchedCharactersAfterNormalizationStarts()]
changed conditional boundary → KILLED

114

1.1
Location : strip
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:mixedInputPreservesUntouchedCharactersAfterNormalizationStarts()]
negated conditional → KILLED

115

1.1
Location : strip
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:mixedInputPreservesUntouchedCharactersAfterNormalizationStarts()]
negated conditional → KILLED

121

1.1
Location : strip
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:mixedInputPreservesUntouchedCharactersAfterNormalizationStarts()]
negated conditional → KILLED

128

1.1
Location : strip
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:mixedInputPreservesUntouchedCharactersAfterNormalizationStarts()]
negated conditional → KILLED

129

1.1
Location : strip
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:asciiInputIsReturnedAsIs()]
replaced return value with "" for org/egothor/stemmer/DiacriticStripper::strip → KILLED

131

1.1
Location : strip
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:mixedInputPreservesUntouchedCharactersAfterNormalizationStarts()]
replaced return value with "" for org/egothor/stemmer/DiacriticStripper::strip → KILLED

143

1.1
Location : replacementFor
Killed by : none
changed conditional boundary → SURVIVED
Covering tests

2.2
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:mixedInputPreservesUntouchedCharactersAfterNormalizationStarts()]
negated conditional → KILLED

144

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:mixedInputPreservesUntouchedCharactersAfterNormalizationStarts()]
replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED

148

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:specialReplacementsSupportMultiCharacterAsciiOutput()]
negated conditional → KILLED

149

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:mixedInputPreservesUntouchedCharactersAfterNormalizationStarts()]
replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED

152

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:specialReplacementsSupportMultiCharacterAsciiOutput()]
negated conditional → KILLED

153

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:specialReplacementsSupportMultiCharacterAsciiOutput()]
replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED

155

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:specialReplacementsSupportMultiCharacterAsciiOutput()]
negated conditional → KILLED

156

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:specialReplacementsSupportMultiCharacterAsciiOutput()]
replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED

158

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:specialReplacementsSupportMultiCharacterAsciiOutput()]
negated conditional → KILLED

159

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:specialReplacementsSupportMultiCharacterAsciiOutput()]
replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED

161

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:specialReplacementsSupportMultiCharacterAsciiOutput()]
negated conditional → KILLED

162

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:specialReplacementsSupportMultiCharacterAsciiOutput()]
replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED

164

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:specialReplacementsSupportMultiCharacterAsciiOutput()]
negated conditional → KILLED

165

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:specialReplacementsSupportMultiCharacterAsciiOutput()]
replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED

170

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:unmappableNonLatinCharactersRemainUnchanged()]
changed conditional boundary → KILLED

2.2
Location : replacementFor
Killed by : none
negated conditional → SURVIVED
Covering tests

172

1.1
Location : replacementFor
Killed by : none
negated conditional → SURVIVED
Covering tests

175

1.1
Location : replacementFor
Killed by : none
negated conditional → SURVIVED
Covering tests

2.2
Location : replacementFor
Killed by : none
changed conditional boundary → SURVIVED
Covering tests

180

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:unmappableNonLatinCharactersRemainUnchanged()]
negated conditional → KILLED

181

1.1
Location : replacementFor
Killed by : org.egothor.stemmer.DiacriticStripperTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.DiacriticStripperTest]/[method:unmappableNonLatinCharactersRemainUnchanged()]
replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → KILLED

183

1.1
Location : replacementFor
Killed by : none
replaced return value with "" for org/egothor/stemmer/DiacriticStripper::replacementFor → NO_COVERAGE

Active mutators

Tests examined


Report generated by PIT 1.22.1