StemmerDictionaryParser.java

1
/*******************************************************************************
2
 * Copyright (C) 2026, Leo Galambos
3
 * All rights reserved.
4
 * 
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions are met:
7
 * 
8
 * 1. Redistributions of source code must retain the above copyright notice,
9
 *    this list of conditions and the following disclaimer.
10
 * 
11
 * 2. Redistributions in binary form must reproduce the above copyright notice,
12
 *    this list of conditions and the following disclaimer in the documentation
13
 *    and/or other materials provided with the distribution.
14
 * 
15
 * 3. All advertising materials mentioning features or use of this software must
16
 *    display the following acknowledgement:
17
 *    This product includes software developed by the Egothor project.
18
 * 
19
 * 4. Neither the name of the copyright holder nor the names of its contributors
20
 *    may be used to endorse or promote products derived from this software
21
 *    without specific prior written permission.
22
 * 
23
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
27
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33
 * POSSIBILITY OF SUCH DAMAGE.
34
 ******************************************************************************/
35
package org.egothor.stemmer;
36
37
import java.io.BufferedReader;
38
import java.io.IOException;
39
import java.io.Reader;
40
import java.nio.charset.StandardCharsets;
41
import java.nio.file.Files;
42
import java.nio.file.Path;
43
import java.util.Locale;
44
import java.util.Objects;
45
import java.util.StringTokenizer;
46
import java.util.logging.Level;
47
import java.util.logging.Logger;
48
49
/**
 * Parser of line-oriented stemmer dictionary files.
 *
 * <p>
 * Each non-empty logical line consists of a stem followed by zero or more known
 * word variants separated by whitespace. The first token is interpreted as the
 * canonical stem, and every following token on the same line is interpreted as
 * a variant belonging to that stem.
 *
 * <p>
 * Input lines are normalized to lower case using {@link Locale#ROOT}. Leading
 * and trailing whitespace is ignored. A Unicode byte order mark (U+FEFF) at the
 * very beginning of the first line is discarded, because the UTF-8 decoder used
 * for file input passes it through as an ordinary character.
 *
 * <p>
 * The parser supports line remarks and trailing remarks. The remark markers
 * {@code #} and {@code //} terminate the logical content of the line, and the
 * remainder of that line is ignored.
 *
 * <p>
 * This class holds no mutable state of its own; every {@code parse} call works
 * only on its arguments and local variables.
 */
public final class StemmerDictionaryParser {

    /**
     * Logger of this class.
     */
    private static final Logger LOGGER = Logger.getLogger(StemmerDictionaryParser.class.getName());

    /**
     * Unicode byte order mark (U+FEFF) that some editors prepend to UTF-8 files.
     */
    private static final char BYTE_ORDER_MARK = '\uFEFF';

    /**
     * Utility class.
     */
    private StemmerDictionaryParser() {
        throw new AssertionError("No instances.");
    }

    /**
     * Callback receiving one parsed dictionary line.
     */
    @FunctionalInterface
    public interface EntryHandler {

        /**
         * Accepts one parsed dictionary entry.
         *
         * @param stem       canonical stem, never {@code null}
         * @param variants   variants in encounter order, never {@code null}; a
         *                   fresh array is allocated for every entry
         * @param lineNumber original physical line number in the parsed source,
         *                   starting at 1
         * @throws IOException if processing fails
         */
        void onEntry(String stem, String[] variants, int lineNumber) throws IOException;
    }

    /**
     * Parses a dictionary file from a filesystem path.
     *
     * <p>
     * The file is decoded as UTF-8 and closed before this method returns. The
     * absolute path is used as the source description of the returned statistics.
     *
     * @param path         dictionary file path
     * @param entryHandler handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading or handler processing fails
     */
    public static ParseStatistics parse(final Path path, final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(path, "path");
        Objects.requireNonNull(entryHandler, "entryHandler");

        try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
            return parse(reader, path.toAbsolutePath().toString(), entryHandler);
        }
    }

    /**
     * Parses a dictionary file from a path string.
     *
     * @param fileName     dictionary file name or path string
     * @param entryHandler handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading or handler processing fails
     */
    public static ParseStatistics parse(final String fileName, final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(fileName, "fileName");
        return parse(Path.of(fileName), entryHandler);
    }

    /**
     * Parses a dictionary from a reader.
     *
     * <p>
     * The reader is consumed until end of input but is not closed; closing it
     * remains the responsibility of the caller.
     *
     * @param reader            source reader
     * @param sourceDescription logical source description for diagnostics
     * @param entryHandler      handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading or handler processing fails
     */
    public static ParseStatistics parse(final Reader reader, final String sourceDescription,
            final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(reader, "reader");
        Objects.requireNonNull(sourceDescription, "sourceDescription");
        Objects.requireNonNull(entryHandler, "entryHandler");

        // Reuse the caller's buffering when it is already present.
        final BufferedReader bufferedReader = reader instanceof BufferedReader alreadyBuffered ? alreadyBuffered
                : new BufferedReader(reader);

        int lineNumber = 0;
        int logicalEntryCount = 0;
        int ignoredLineCount = 0;

        for (String line = bufferedReader.readLine(); line != null; line = bufferedReader.readLine()) {
            lineNumber++;

            // A byte order mark can only occur at the very start of the source.
            final String content = lineNumber == 1 ? stripByteOrderMark(line) : line;

            final String normalizedLine = stripRemark(content).trim().toLowerCase(Locale.ROOT);
            if (normalizedLine.isEmpty()) {
                ignoredLineCount++;
                continue;
            }

            // trim() has removed every default StringTokenizer delimiter from both
            // ends, so a non-empty normalized line always yields at least one token.
            final StringTokenizer tokenizer = new StringTokenizer(normalizedLine); // NOPMD

            final String stem = tokenizer.nextToken();
            final String[] variants = new String[tokenizer.countTokens()]; // NOPMD

            for (int index = 0; index < variants.length; index++) {
                variants[index] = tokenizer.nextToken();
            }

            entryHandler.onEntry(stem, variants, lineNumber);
            logicalEntryCount++;
        }

        final ParseStatistics statistics = new ParseStatistics(sourceDescription, lineNumber, logicalEntryCount,
                ignoredLineCount);

        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.log(Level.FINE, "Parsed dictionary source {0}: lines={1}, entries={2}, ignoredLines={3}.",
                    new Object[] { statistics.sourceDescription(), statistics.lineCount(), statistics.entryCount(),
                            statistics.ignoredLineCount() });
        }

        return statistics;
    }

    /**
     * Removes a leading byte order mark from a line.
     *
     * @param line physical line
     * @return the line without a leading U+FEFF character, or the same line if it
     *         does not start with one
     */
    private static String stripByteOrderMark(final String line) {
        if (!line.isEmpty() && line.charAt(0) == BYTE_ORDER_MARK) {
            return line.substring(1);
        }
        return line;
    }

    /**
     * Removes a trailing remark from one physical line.
     *
     * <p>
     * The earliest occurrence of either supported remark marker terminates the
     * logical line content.
     *
     * @param line physical line
     * @return line content without a trailing remark
     */
    private static String stripRemark(final String line) {
        final int hashIndex = line.indexOf('#');
        final int slashIndex = line.indexOf("//");

        final int remarkIndex;
        if (hashIndex < 0) {
            // Only "//" can be present; stays negative when neither marker exists.
            remarkIndex = slashIndex;
        } else if (slashIndex < 0) {
            remarkIndex = hashIndex;
        } else {
            remarkIndex = Math.min(hashIndex, slashIndex);
        }

        return remarkIndex < 0 ? line : line.substring(0, remarkIndex);
    }

    /**
     * Immutable parsing statistics.
     *
     * @param sourceDescription logical source description
     * @param lineCount         number of physical lines read
     * @param entryCount        number of logical dictionary entries emitted
     * @param ignoredLineCount  number of ignored empty or remark-only lines
     */
    public record ParseStatistics(String sourceDescription, int lineCount, int entryCount, int ignoredLineCount) {

        /**
         * Creates parsing statistics.
         *
         * @param sourceDescription logical source description
         * @param lineCount         number of physical lines read
         * @param entryCount        number of logical dictionary entries emitted
         * @param ignoredLineCount  number of ignored empty or remark-only lines
         * @throws NullPointerException     if {@code sourceDescription} is {@code null}
         * @throws IllegalArgumentException if any numeric value is negative
         */
        public ParseStatistics {
            Objects.requireNonNull(sourceDescription, "sourceDescription");
            if (lineCount < 0) {
                throw new IllegalArgumentException("lineCount must not be negative.");
            }
            if (entryCount < 0) {
                throw new IllegalArgumentException("entryCount must not be negative.");
            }
            if (ignoredLineCount < 0) {
                throw new IllegalArgumentException("ignoredLineCount must not be negative.");
            }
        }
    }
}

Mutations

116

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:FileParsingTests]/[method:shouldParseSameContentThroughPathAndStringOverloads()]
replaced return value with null for org/egothor/stemmer/StemmerDictionaryParser::parse → KILLED

131

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:FileParsingTests]/[method:shouldParseSameContentThroughPathAndStringOverloads()]
replaced return value with null for org/egothor/stemmer/StemmerDictionaryParser::parse → KILLED

150

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
negated conditional → KILLED

157

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
negated conditional → KILLED

158

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldParseNormalizedEntriesAndCollectAccurateStatistics()]
Changed increment from 1 to -1 → KILLED

161

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
negated conditional → KILLED

162

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldParseNormalizedEntriesAndCollectAccurateStatistics()]
Changed increment from 1 to -1 → KILLED

167

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
negated conditional → KILLED

168

1.1
Location : parse
Killed by : none
Changed increment from 1 to -1 → NO_COVERAGE

175

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
changed conditional boundary → KILLED

2.2
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldParseNormalizedEntriesAndCollectAccurateStatistics()]
negated conditional → KILLED

179

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
removed call to org/egothor/stemmer/StemmerDictionaryParser$EntryHandler::onEntry → KILLED

180

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldParseNormalizedEntriesAndCollectAccurateStatistics()]
Changed increment from 1 to -1 → KILLED

192

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldParseNormalizedEntriesAndCollectAccurateStatistics()]
replaced return value with null for org/egothor/stemmer/StemmerDictionaryParser::parse → KILLED

210

1.1
Location : stripRemark
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldParseNormalizedEntriesAndCollectAccurateStatistics()]
changed conditional boundary → KILLED

2.2
Location : stripRemark
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldParseNormalizedEntriesAndCollectAccurateStatistics()]
negated conditional → KILLED

212

1.1
Location : stripRemark
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldParseNormalizedEntriesAndCollectAccurateStatistics()]
negated conditional → KILLED

2.2
Location : stripRemark
Killed by : none
changed conditional boundary → SURVIVED
Covering tests

218

1.1
Location : stripRemark
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldParseNormalizedEntriesAndCollectAccurateStatistics()]
changed conditional boundary → KILLED

2.2
Location : stripRemark
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
negated conditional → KILLED

219

1.1
Location : stripRemark
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
replaced return value with "" for org/egothor/stemmer/StemmerDictionaryParser::stripRemark → KILLED

221

1.1
Location : stripRemark
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldParseNormalizedEntriesAndCollectAccurateStatistics()]
replaced return value with "" for org/egothor/stemmer/StemmerDictionaryParser::stripRemark → KILLED

Active mutators

Tests examined


Report generated by PIT 1.22.1