Compile.java

1
/*******************************************************************************
2
 * Copyright (C) 2026, Leo Galambos
3
 * All rights reserved.
4
 * 
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions are met:
7
 * 
8
 * 1. Redistributions of source code must retain the above copyright notice,
9
 *    this list of conditions and the following disclaimer.
10
 * 
11
 * 2. Redistributions in binary form must reproduce the above copyright notice,
12
 *    this list of conditions and the following disclaimer in the documentation
13
 *    and/or other materials provided with the distribution.
14
 * 
15
 * 3. All advertising materials mentioning features or use of this software must
16
 *    display the following acknowledgement:
17
 *    This product includes software developed by the Egothor project.
18
 * 
19
 * 4. Neither the name of the copyright holder nor the names of its contributors
20
 *    may be used to endorse or promote products derived from this software
21
 *    without specific prior written permission.
22
 * 
23
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
27
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33
 * POSSIBILITY OF SUCH DAMAGE.
34
 ******************************************************************************/
35
package org.egothor.stemmer;
36
37
import java.io.IOException;
38
import java.nio.file.Files;
39
import java.nio.file.Path;
40
import java.util.Locale;
41
import java.util.Objects;
42
import java.util.logging.Level;
43
import java.util.logging.Logger;
44
45
/**
46
 * Command-line compiler of stemmer dictionary files into compressed binary
47
 * {@link FrequencyTrie} artifacts.
48
 *
49
 * <p>
50
 * The CLI reads an input file in the same syntax as the project's stemmer
51
 * resource files, compiles it into a read-only {@link FrequencyTrie} of patch
52
 * commands, applies the selected subtree reduction strategy, and writes the
53
 * resulting trie in the project binary format under GZip compression.
54
 *
55
 * <p>
56
 * Remarks introduced by {@code #} or {@code //} are supported through
57
 * {@link StemmerDictionaryParser}.
58
 *
59
 * <p>
60
 * Supported arguments:
61
 * </p>
62
 *
63
 * <pre>
64
 * --input &lt;file&gt;
65
 * --output &lt;file&gt;
66
 * --reduction-mode &lt;mode&gt;
67
 * [--store-original]
68
 * [--dominant-winner-min-percent &lt;1..100&gt;]
69
 * [--dominant-winner-over-second-ratio &lt;1..n&gt;]
70
 * [--overwrite]
71
 * [--help]
72
 * </pre>
73
 */
74
public final class Compile {
75
76
    /**
77
     * Logger of this class.
78
     */
79
    private static final Logger LOGGER = Logger.getLogger(Compile.class.getName());
80
81
    /**
82
     * Exit status indicating success.
83
     */
84
    private static final int EXIT_SUCCESS = 0;
85
86
    /**
87
     * Exit status indicating invalid command-line usage.
88
     */
89
    private static final int EXIT_USAGE_ERROR = 2;
90
91
    /**
92
     * Exit status indicating processing failure.
93
     */
94
    private static final int EXIT_PROCESSING_ERROR = 1;
95
96
    /**
     * Prevents instantiation; every member of this class is static.
     *
     * @throws AssertionError always, whenever the constructor is invoked
     */
    private Compile() {
        throw new AssertionError("No instances.");
    }
102
103
    /**
104
     * CLI entry point.
105
     *
106
     * @param arguments command-line arguments
107
     */
108
    public static void main(final String[] arguments) {
109
        final int exitCode = run(arguments);
110
        if (exitCode != EXIT_SUCCESS) {
111
            System.exit(exitCode);
112
        }
113
    }
114
115
    /**
116
     * Executes the CLI.
117
     *
118
     * @param arguments command-line arguments
119
     * @return process exit code
120
     */
121
    /* default */ static int run(final String... arguments) {
122
        try {
123
            final Arguments parsedArguments = Arguments.parse(arguments);
124
            if (parsedArguments.help()) {
125
                printUsage();
126
                return EXIT_SUCCESS;
127
            }
128
129
            compile(parsedArguments);
130
            return EXIT_SUCCESS;
131
        } catch (IllegalArgumentException exception) {
132
            System.err.println(exception.getMessage());
133
            System.err.println();
134
            printUsage();
135
            return EXIT_USAGE_ERROR;
136
        } catch (IOException exception) {
137
            if (LOGGER.isLoggable(Level.SEVERE)) {
138
                LOGGER.log(Level.SEVERE, "CLI compilation failed for input {0} and output {1}.",
139
                        new Object[] { safeInput(arguments), safeOutput(arguments) });
140
            }
141
            System.err.println("Compilation failed: " + exception.getMessage());
142
            return EXIT_PROCESSING_ERROR;
143
        }
144
    }
145
146
    /**
147
     * Compiles the input dictionary and writes the compressed binary trie.
148
     *
149
     * @param arguments parsed command-line arguments
150
     * @throws IOException if compilation or output writing fails
151
     */
152
    private static void compile(final Arguments arguments) throws IOException {
153
        final ReductionSettings reductionSettings = new ReductionSettings(arguments.reductionMode(),
154
                arguments.dominantWinnerMinPercent(), arguments.dominantWinnerOverSecondRatio());
155
156
        final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(arguments.inputFile(), arguments.storeOriginal(),
157
                reductionSettings);
158
159
        final Path outputFile = arguments.outputFile();
160
        final Path parent = outputFile.toAbsolutePath().getParent();
161
        if (parent != null) {
162
            Files.createDirectories(parent);
163
        }
164
165
        if (Files.exists(outputFile) && !arguments.overwrite()) {
166
            throw new IOException("Output file already exists: " + outputFile.toAbsolutePath());
167
        }
168
169
        StemmerPatchTrieBinaryIO.write(trie, outputFile);
170
171
        if (LOGGER.isLoggable(Level.INFO)) {
172
            LOGGER.log(Level.INFO,
173
                    "Compiled dictionary {0} to {1} using mode {2}, storeOriginal={3}, dominantWinnerMinPercent={4}, dominantWinnerOverSecondRatio={5}.",
174
                    new Object[] { arguments.inputFile().toAbsolutePath().toString(),
175
                            arguments.outputFile().toAbsolutePath().toString(), arguments.reductionMode().name(),
176
                            arguments.storeOriginal(), arguments.dominantWinnerMinPercent(),
177
                            arguments.dominantWinnerOverSecondRatio() });
178
        }
179
    }
180
181
    /**
182
     * Prints CLI usage help.
183
     */
184
    private static void printUsage() {
185
        System.err.println("Usage:");
186
        System.err.println("  java org.egothor.stemmer.Compile \\");
187
        System.err.println("      --input <file> \\");
188
        System.err.println("      --output <file> \\");
189
        System.err.println("      --reduction-mode <mode> \\");
190
        System.err.println("      [--store-original] \\");
191
        System.err.println("      [--dominant-winner-min-percent <1..100>] \\");
192
        System.err.println("      [--dominant-winner-over-second-ratio <1..n>] \\");
193
        System.err.println("      [--overwrite]");
194
        System.err.println();
195
        System.err.println("Supported reduction modes:");
196
        for (ReductionMode mode : ReductionMode.values()) {
197
            System.err.println("  " + mode.name());
198
        }
199
    }
200
201
    /**
202
     * Returns a best-effort input value for diagnostic logging.
203
     *
204
     * @param arguments raw command-line arguments
205
     * @return input value if present, otherwise {@code "<unknown>"}
206
     */
207
    private static String safeInput(final String... arguments) {
208
        return safeOptionValue(arguments, "--input");
209
    }
210
211
    /**
212
     * Returns a best-effort output value for diagnostic logging.
213
     *
214
     * @param arguments raw command-line arguments
215
     * @return output value if present, otherwise {@code "<unknown>"}
216
     */
217
    private static String safeOutput(final String... arguments) {
218
        return safeOptionValue(arguments, "--output");
219
    }
220
221
    /**
222
     * Returns a best-effort option value from raw arguments.
223
     *
224
     * @param arguments raw command-line arguments
225
     * @param option    option name
226
     * @return option value if present, otherwise {@code "<unknown>"}
227
     */
228
    private static String safeOptionValue(final String[] arguments, final String option) {
229
        if (arguments == null) {
230
            return "<unknown>";
231
        }
232
        for (int index = 0; index < arguments.length - 1; index++) {
233
            if (option.equals(arguments[index])) {
234
                return arguments[index + 1];
235
            }
236
        }
237
        return "<unknown>";
238
    }
239
240
    /**
241
     * Immutable parsed CLI arguments.
242
     *
243
     * @param inputFile                     input dictionary file
244
     * @param outputFile                    output compressed trie file
245
     * @param reductionMode                 subtree reduction mode
246
     * @param storeOriginal                 whether original stems are stored
247
     * @param dominantWinnerMinPercent      dominant winner minimum percent
248
     * @param dominantWinnerOverSecondRatio dominant winner over second ratio
249
     * @param overwrite                     whether an existing output may be
250
     *                                      replaced
251
     * @param help                          whether usage help was requested
252
     */
253
    @SuppressWarnings("PMD.LongVariable")
254
    private record Arguments(Path inputFile, Path outputFile, ReductionMode reductionMode, boolean storeOriginal,
255
            int dominantWinnerMinPercent, int dominantWinnerOverSecondRatio, boolean overwrite, boolean help) {
256
257
        /**
258
         * Parses raw command-line arguments.
259
         *
260
         * @param arguments raw command-line arguments
261
         * @return parsed arguments
262
         */
263
        @SuppressWarnings({ "PMD.AvoidReassigningLoopVariables", "PMD.CyclomaticComplexity" })
264
        private static Arguments parse(final String... arguments) {
265
            Objects.requireNonNull(arguments, "arguments");
266
267
            Path inputFile = null;
268
            Path outputFile = null;
269
            ReductionMode reductionMode = null;
270
            boolean storeOriginal = false;
271
            boolean overwrite = false;
272
            boolean help = false;
273
            int dominantWinnerMinPercent = ReductionSettings.DEFAULT_DOMINANT_WINNER_MIN_PERCENT;
274
            int dominantWinnerOverSecondRatio = ReductionSettings.DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO;
275
276 2 1. parse : negated conditional → KILLED
2. parse : changed conditional boundary → KILLED
            for (int index = 0; index < arguments.length; index++) {
277
                final String argument = arguments[index];
278
279
                switch (argument) {
280
                    case "--help":
281
                    case "-h":
282
                        help = true;
283
                        break;
284
285
                    case "--store-original":
286
                        storeOriginal = true;
287
                        break;
288
289
                    case "--overwrite":
290
                        overwrite = true;
291
                        break;
292
293
                    case "--input":
294 1 1. parse : Changed increment from 1 to -1 → KILLED
                        inputFile = Path.of(requireValue(arguments, ++index, "--input"));
295
                        break;
296
297
                    case "--output":
298 1 1. parse : Changed increment from 1 to -1 → KILLED
                        outputFile = Path.of(requireValue(arguments, ++index, "--output"));
299
                        break;
300
301
                    case "--reduction-mode":
302 1 1. parse : Changed increment from 1 to -1 → KILLED
                        reductionMode = ReductionMode
303
                                .valueOf(requireValue(arguments, ++index, "--reduction-mode").toUpperCase(Locale.ROOT));
304
                        break;
305
306
                    case "--dominant-winner-min-percent":
307 1 1. parse : Changed increment from 1 to -1 → KILLED
                        dominantWinnerMinPercent = parseInteger(
308
                                requireValue(arguments, ++index, "--dominant-winner-min-percent"),
309
                                "--dominant-winner-min-percent");
310
                        break;
311
312
                    case "--dominant-winner-over-second-ratio":
313 1 1. parse : Changed increment from 1 to -1 → KILLED
                        dominantWinnerOverSecondRatio = parseInteger(
314
                                requireValue(arguments, ++index, "--dominant-winner-over-second-ratio"),
315
                                "--dominant-winner-over-second-ratio");
316
                        break;
317
318
                    default:
319
                        throw new IllegalArgumentException("Unknown argument: " + argument);
320
                }
321
            }
322
323 1 1. parse : negated conditional → KILLED
            if (help) {
324 1 1. parse : replaced return value with null for org/egothor/stemmer/Compile$Arguments::parse → KILLED
                return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, dominantWinnerMinPercent,
325
                        dominantWinnerOverSecondRatio, overwrite, true);
326
            }
327
328 1 1. parse : negated conditional → KILLED
            if (inputFile == null) {
329
                throw new IllegalArgumentException("Missing required argument --input.");
330
            }
331 1 1. parse : negated conditional → KILLED
            if (outputFile == null) {
332
                throw new IllegalArgumentException("Missing required argument --output.");
333
            }
334 1 1. parse : negated conditional → KILLED
            if (reductionMode == null) {
335
                throw new IllegalArgumentException("Missing required argument --reduction-mode.");
336
            }
337
338 1 1. parse : replaced return value with null for org/egothor/stemmer/Compile$Arguments::parse → KILLED
            return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, dominantWinnerMinPercent,
339
                    dominantWinnerOverSecondRatio, overwrite, false);
340
        }
341
342
        /**
343
         * Returns the required value of an option.
344
         *
345
         * @param arguments raw arguments
346
         * @param index     value index
347
         * @param option    option name
348
         * @return option value
349
         */
350
        private static String requireValue(final String[] arguments, final int index, final String option) {
351 2 1. requireValue : changed conditional boundary → KILLED
2. requireValue : negated conditional → KILLED
            if (index >= arguments.length) {
352
                throw new IllegalArgumentException("Missing value for " + option + ".");
353
            }
354 1 1. requireValue : replaced return value with "" for org/egothor/stemmer/Compile$Arguments::requireValue → KILLED
            return arguments[index];
355
        }
356
357
        /**
358
         * Parses an integer option value.
359
         *
360
         * @param value      raw value
361
         * @param optionName option name
362
         * @return parsed integer
363
         */
364
        private static int parseInteger(final String value, final String optionName) {
365
            try {
366 1 1. parseInteger : replaced int return with 0 for org/egothor/stemmer/Compile$Arguments::parseInteger → NO_COVERAGE
                return Integer.parseInt(value);
367
            } catch (NumberFormatException exception) {
368
                throw new IllegalArgumentException("Invalid integer for " + optionName + ": " + value, exception);
369
            }
370
        }
371
    }
372
}

Mutations

276

1.1
Location : parse
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[nested-class:ArgumentValidationTest]/[method:shouldFailWithUsageErrorForUnknownArgument()]
negated conditional → KILLED

2.2
Location : parse
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[nested-class:ArgumentValidationTest]/[method:shouldFailWithUsageErrorWhenInputIsMissing()]
changed conditional boundary → KILLED

294

1.1
Location : parse
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[nested-class:ArgumentValidationTest]/[method:shouldFailWithUsageErrorWhenOptionValueIsMissing()]
Changed increment from 1 to -1 → KILLED

298

1.1
Location : parse
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[nested-class:ArgumentValidationTest]/[method:shouldFailWithUsageErrorWhenInputIsMissing()]
Changed increment from 1 to -1 → KILLED

302

1.1
Location : parse
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[nested-class:ArgumentValidationTest]/[method:shouldFailWithUsageErrorForInvalidDominantWinnerMinPercent()]
Changed increment from 1 to -1 → KILLED

307

1.1
Location : parse
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[nested-class:ArgumentValidationTest]/[method:shouldFailWithUsageErrorForInvalidDominantWinnerMinPercent()]
Changed increment from 1 to -1 → KILLED

313

1.1
Location : parse
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[nested-class:ArgumentValidationTest]/[method:shouldFailWithUsageErrorForInvalidDominantWinnerOverSecondRatio()]
Changed increment from 1 to -1 → KILLED

323

1.1
Location : parse
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[nested-class:ArgumentValidationTest]/[method:shouldFailWithUsageErrorWhenInputIsMissing()]
negated conditional → KILLED

324

1.1
Location : parse
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[method:shouldReturnSuccessAndPrintUsageWhenHelpIsRequested()]
replaced return value with null for org/egothor/stemmer/Compile$Arguments::parse → KILLED

328

1.1
Location : parse
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[nested-class:ArgumentValidationTest]/[method:shouldFailWithUsageErrorWhenInputIsMissing()]
negated conditional → KILLED

331

1.1
Location : parse
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[nested-class:ArgumentValidationTest]/[method:shouldFailWithUsageErrorWhenOutputIsMissing()]
negated conditional → KILLED

334

1.1
Location : parse
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[nested-class:ArgumentValidationTest]/[method:shouldFailWithUsageErrorWhenReductionModeIsMissing()]
negated conditional → KILLED

338

1.1
Location : parse
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[method:shouldFailWithProcessingErrorWhenInputFileDoesNotExist()]
replaced return value with null for org/egothor/stemmer/Compile$Arguments::parse → KILLED

351

1.1
Location : requireValue
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[nested-class:ArgumentValidationTest]/[method:shouldFailWithUsageErrorWhenOptionValueIsMissing()]
changed conditional boundary → KILLED

2.2
Location : requireValue
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[nested-class:ArgumentValidationTest]/[method:shouldFailWithUsageErrorWhenOptionValueIsMissing()]
negated conditional → KILLED

354

1.1
Location : requireValue
Killed by : org.egothor.stemmer.CompileTest.[engine:junit-jupiter]/[class:org.egothor.stemmer.CompileTest]/[nested-class:ArgumentValidationTest]/[method:shouldFailWithUsageErrorForInvalidDominantWinnerMinPercent()]
replaced return value with "" for org/egothor/stemmer/Compile$Arguments::requireValue → KILLED

366

1.1
Location : parseInteger
Killed by : none
replaced int return with 0 for org/egothor/stemmer/Compile$Arguments::parseInteger → NO_COVERAGE

Active mutators

Tests examined


Report generated by PIT 1.22.1