StemmerDictionaryParser.java

1
/*******************************************************************************
2
 * Copyright (C) 2026, Leo Galambos
3
 * All rights reserved.
4
 * 
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions are met:
7
 * 
8
 * 1. Redistributions of source code must retain the above copyright notice,
9
 *    this list of conditions and the following disclaimer.
10
 * 
11
 * 2. Redistributions in binary form must reproduce the above copyright notice,
12
 *    this list of conditions and the following disclaimer in the documentation
13
 *    and/or other materials provided with the distribution.
14
 * 
15
 * 3. Neither the name of the copyright holder nor the names of its contributors
16
 *    may be used to endorse or promote products derived from this software
17
 *    without specific prior written permission.
18
 * 
19
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29
 * POSSIBILITY OF SUCH DAMAGE.
30
 ******************************************************************************/
31
package org.egothor.stemmer;
32
33
import java.io.BufferedReader;
34
import java.io.IOException;
35
import java.io.Reader;
36
import java.nio.charset.StandardCharsets;
37
import java.nio.file.Files;
38
import java.nio.file.Path;
39
import java.util.Locale;
40
import java.util.Objects;
41
import java.util.StringTokenizer;
42
import java.util.logging.Level;
43
import java.util.logging.Logger;
44
45
/**
 * Parser of line-oriented stemmer dictionary files.
 *
 * <p>
 * Each non-empty logical line consists of a stem followed by zero or more known
 * word variants separated by whitespace. The first token is interpreted as the
 * canonical stem, and every following token on the same line is interpreted as
 * a variant belonging to that stem.
 *
 * <p>
 * Input lines are normalized to lower case using {@link Locale#ROOT}. Leading
 * and trailing whitespace is ignored.
 *
 * <p>
 * The parser supports line remarks and trailing remarks. The remark markers
 * {@code #} and {@code //} terminate the logical content of the line, and the
 * remainder of that line is ignored. Markers are recognized anywhere in the
 * line, including inside a token, so {@code a//b} yields the single token
 * {@code a}.
 *
 * <p>
 * Tokens are separated using the default {@link StringTokenizer} delimiters
 * (space, tab, line feed, carriage return and form feed).
 *
 * <p>
 * This class is intentionally stateless and allocation-light so it can be used
 * both by runtime loading and by offline compilation tooling.
 */
public final class StemmerDictionaryParser {

    /**
     * Logger of this class.
     */
    private static final Logger LOGGER = Logger.getLogger(StemmerDictionaryParser.class.getName());

    /**
     * Utility class; instantiation is rejected even through reflection.
     */
    private StemmerDictionaryParser() {
        throw new AssertionError("No instances.");
    }

    /**
     * Callback receiving one parsed dictionary line.
     */
    @FunctionalInterface
    public interface EntryHandler {

        /**
         * Accepts one parsed dictionary entry.
         *
         * @param stem       canonical stem, never {@code null}
         * @param variants   variants in encounter order, never {@code null}; empty
         *                   when the line contains only the stem
         * @param lineNumber original physical line number in the parsed source,
         *                   starting at 1
         * @throws IOException if processing fails
         */
        void onEntry(String stem, String[] variants, int lineNumber) throws IOException;
    }

    /**
     * Parses a dictionary file from a filesystem path.
     *
     * <p>
     * The file is read as UTF-8 and closed before this method returns. The
     * absolute path is used as the source description of the returned statistics.
     *
     * @param path         dictionary file path
     * @param entryHandler handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading or handler processing fails
     */
    public static ParseStatistics parse(final Path path, final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(path, "path");
        Objects.requireNonNull(entryHandler, "entryHandler");

        try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
            return parse(reader, path.toAbsolutePath().toString(), entryHandler);
        }
    }

    /**
     * Parses a dictionary file from a path string.
     *
     * @param fileName     dictionary file name or path string
     * @param entryHandler handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading or handler processing fails
     */
    public static ParseStatistics parse(final String fileName, final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(fileName, "fileName");
        return parse(Path.of(fileName), entryHandler);
    }

    /**
     * Parses a dictionary from a reader.
     *
     * <p>
     * The reader is consumed until its end but is not closed by this method; the
     * caller remains responsible for closing it. Every physical line is counted;
     * a line that is empty after remark removal and trimming is counted as ignored
     * and is not reported to the handler.
     *
     * @param reader            source reader
     * @param sourceDescription logical source description for diagnostics
     * @param entryHandler      handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading or handler processing fails
     */
    public static ParseStatistics parse(final Reader reader, final String sourceDescription,
            final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(reader, "reader");
        Objects.requireNonNull(sourceDescription, "sourceDescription");
        Objects.requireNonNull(entryHandler, "entryHandler");

        // Reuse the caller's buffering when present instead of stacking a second buffer.
        final BufferedReader bufferedReader = reader instanceof BufferedReader alreadyBuffered ? alreadyBuffered
                : new BufferedReader(reader);

        int lineNumber = 0;
        int logicalEntryCount = 0;
        int ignoredLineCount = 0;

        for (String line = bufferedReader.readLine(); line != null; line = bufferedReader.readLine()) {
            lineNumber++;

            final String normalizedLine = stripRemark(line).trim().toLowerCase(Locale.ROOT);
            if (normalizedLine.isEmpty()) {
                ignoredLineCount++;
                continue;
            }

            // The line is trimmed and non-empty, so its first character is above
            // U+0020 and therefore not a StringTokenizer delimiter: at least one
            // token always exists and no further emptiness check is required.
            final StringTokenizer tokenizer = new StringTokenizer(normalizedLine); // NOPMD
            final String stem = tokenizer.nextToken();
            final String[] variants = new String[tokenizer.countTokens()]; // NOPMD

            for (int index = 0; index < variants.length; index++) {
                variants[index] = tokenizer.nextToken();
            }

            entryHandler.onEntry(stem, variants, lineNumber);
            logicalEntryCount++;
        }

        final ParseStatistics statistics = new ParseStatistics(sourceDescription, lineNumber, logicalEntryCount,
                ignoredLineCount);

        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.log(Level.FINE, "Parsed dictionary source {0}: lines={1}, entries={2}, ignoredLines={3}.",
                    new Object[] { statistics.sourceDescription(), statistics.lineCount(), statistics.entryCount(),
                            statistics.ignoredLineCount() });
        }

        return statistics;
    }

    /**
     * Removes a trailing remark from one physical line.
     *
     * <p>
     * The earliest occurrence of either supported remark marker ({@code #} or
     * {@code //}) terminates the logical line content.
     *
     * @param line physical line
     * @return line content without a trailing remark; the same string when no
     *         remark marker is present
     */
    private static String stripRemark(final String line) {
        int remarkIndex = line.indexOf('#');
        final int slashIndex = line.indexOf("//");

        // Prefer the slash marker when it is the only marker or the earlier one.
        if (slashIndex >= 0 && (remarkIndex < 0 || slashIndex < remarkIndex)) {
            remarkIndex = slashIndex;
        }

        return remarkIndex < 0 ? line : line.substring(0, remarkIndex);
    }

    /**
     * Immutable parsing statistics.
     *
     * @param sourceDescription logical source description
     * @param lineCount         number of physical lines read
     * @param entryCount        number of logical dictionary entries emitted
     * @param ignoredLineCount  number of ignored empty or remark-only lines
     */
    public record ParseStatistics(String sourceDescription, int lineCount, int entryCount, int ignoredLineCount) {

        /**
         * Creates parsing statistics.
         *
         * @param sourceDescription logical source description
         * @param lineCount         number of physical lines read
         * @param entryCount        number of logical dictionary entries emitted
         * @param ignoredLineCount  number of ignored empty or remark-only lines
         * @throws NullPointerException     if {@code sourceDescription} is {@code null}
         * @throws IllegalArgumentException if any numeric value is negative
         */
        public ParseStatistics {
            Objects.requireNonNull(sourceDescription, "sourceDescription");
            if (lineCount < 0) {
                throw new IllegalArgumentException("lineCount must not be negative.");
            }
            if (entryCount < 0) {
                throw new IllegalArgumentException("entryCount must not be negative.");
            }
            if (ignoredLineCount < 0) {
                throw new IllegalArgumentException("ignoredLineCount must not be negative.");
            }
        }
    }
}

Mutations

112

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:FileParsingTests]/[method:shouldParseSameContentThroughPathAndStringOverloads()]
replaced return value with null for org/egothor/stemmer/StemmerDictionaryParser::parse → KILLED

127

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:FileParsingTests]/[method:shouldParseSameContentThroughPathAndStringOverloads()]
replaced return value with null for org/egothor/stemmer/StemmerDictionaryParser::parse → KILLED

146

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
negated conditional → KILLED

153

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
negated conditional → KILLED

154

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPreferEarliestRemarkMarkerRegardlessOfMarkerType()]
Changed increment from 1 to -1 → KILLED

157

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
negated conditional → KILLED

158

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldParseNormalizedEntriesAndCollectAccurateStatistics()]
Changed increment from 1 to -1 → KILLED

163

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
negated conditional → KILLED

164

1.1
Location : parse
Killed by : none
Changed increment from 1 to -1 → NO_COVERAGE

171

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
changed conditional boundary → KILLED

2.2
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPreferEarliestRemarkMarkerRegardlessOfMarkerType()]
negated conditional → KILLED

175

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
removed call to org/egothor/stemmer/StemmerDictionaryParser$EntryHandler::onEntry → KILLED

176

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPreferEarliestRemarkMarkerRegardlessOfMarkerType()]
Changed increment from 1 to -1 → KILLED

188

1.1
Location : parse
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPreferEarliestRemarkMarkerRegardlessOfMarkerType()]
replaced return value with null for org/egothor/stemmer/StemmerDictionaryParser::parse → KILLED

206

1.1
Location : stripRemark
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldParseNormalizedEntriesAndCollectAccurateStatistics()]
changed conditional boundary → KILLED

2.2
Location : stripRemark
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPreferEarliestRemarkMarkerRegardlessOfMarkerType()]
negated conditional → KILLED

208

1.1
Location : stripRemark
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPreferEarliestRemarkMarkerRegardlessOfMarkerType()]
negated conditional → KILLED

2.2
Location : stripRemark
Killed by : none
changed conditional boundary → SURVIVED
Covering tests

214

1.1
Location : stripRemark
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldParseNormalizedEntriesAndCollectAccurateStatistics()]
changed conditional boundary → KILLED

2.2
Location : stripRemark
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
negated conditional → KILLED

215

1.1
Location : stripRemark
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPropagateHandlerIOExceptionWithoutSwallowingIt()]
replaced return value with "" for org/egothor/stemmer/StemmerDictionaryParser::stripRemark → KILLED

217

1.1
Location : stripRemark
Killed by : org.egothor.stemmer.StemmerDictionaryParserTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.StemmerDictionaryParserTest]/[nested-class:ReaderParsingTests]/[method:shouldPreferEarliestRemarkMarkerRegardlessOfMarkerType()]
replaced return value with "" for org/egothor/stemmer/StemmerDictionaryParser::stripRemark → KILLED

Active mutators

Tests examined


Report generated by PIT 1.22.1