// StemmerDictionaryParser.java

/*******************************************************************************
 * Copyright (C) 2026, Leo Galambos
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 
 * 3. Neither the name of the copyright holder nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************/
package org.egothor.stemmer;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import java.util.Objects;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Parser of line-oriented stemmer dictionary files.
 *
 * <p>
 * Each non-empty logical line consists of a stem followed by zero or more known
 * word variants separated by whitespace. The first token is interpreted as the
 * canonical stem, and every following token on the same line is interpreted as
 * a variant belonging to that stem. Tokens are split on the default
 * {@link StringTokenizer} delimiters (space, tab, newline, carriage return and
 * form feed).
 *
 * <p>
 * Input lines are normalized to lower case using {@link Locale#ROOT}. Leading
 * and trailing whitespace is ignored. A single Unicode byte order mark (U+FEFF)
 * at the very beginning of the source is skipped, so UTF-8 files saved with a
 * BOM do not leak it into the first stem.
 *
 * <p>
 * The parser supports line remarks and trailing remarks. The remark markers
 * {@code #} and {@code //} terminate the logical content of the line, and the
 * remainder of that line is ignored.
 *
 * <p>
 * This class is intentionally stateless and allocation-light so it can be used
 * both by runtime loading and by offline compilation tooling.
 */
public final class StemmerDictionaryParser {

    /**
     * Logger of this class.
     */
    private static final Logger LOGGER = Logger.getLogger(StemmerDictionaryParser.class.getName());

    /**
     * Unicode byte order mark. The UTF-8 decoder of the JDK passes it through as
     * a regular character, and {@link String#trim()} does not remove it.
     */
    private static final char BYTE_ORDER_MARK = '\uFEFF';

    /**
     * Utility class.
     */
    private StemmerDictionaryParser() {
        throw new AssertionError("No instances.");
    }

    /**
     * Callback receiving one parsed dictionary line.
     */
    @FunctionalInterface
    public interface EntryHandler {

        /**
         * Accepts one parsed dictionary entry.
         *
         * @param stem       canonical stem, never {@code null}
         * @param variants   variants in encounter order, never {@code null}; empty
         *                   when the line contains the stem only
         * @param lineNumber original physical line number in the parsed source,
         *                   starting at 1
         * @throws IOException if processing fails
         */
        void onEntry(String stem, String[] variants, int lineNumber) throws IOException;
    }

    /**
     * Parses a dictionary file from a filesystem path.
     *
     * <p>
     * The file is decoded as UTF-8 and is closed before this method returns. The
     * absolute path is used as the source description of the returned
     * statistics.
     *
     * @param path         dictionary file path
     * @param entryHandler handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading or handler processing fails
     */
    public static ParseStatistics parse(final Path path, final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(path, "path");
        Objects.requireNonNull(entryHandler, "entryHandler");

        try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
            return parse(reader, path.toAbsolutePath().toString(), entryHandler);
        }
    }

    /**
     * Parses a dictionary file from a path string.
     *
     * @param fileName     dictionary file name or path string
     * @param entryHandler handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading or handler processing fails
     */
    public static ParseStatistics parse(final String fileName, final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(fileName, "fileName");
        return parse(Path.of(fileName), entryHandler);
    }

    /**
     * Parses a dictionary from a reader.
     *
     * <p>
     * The reader is consumed until its end but is not closed by this method;
     * closing remains the responsibility of the caller. Every physical line is
     * counted, and each line is either reported to the handler as one entry or
     * counted as ignored when it is empty after remark removal and trimming.
     *
     * @param reader            source reader
     * @param sourceDescription logical source description for diagnostics
     * @param entryHandler      handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading or handler processing fails
     */
    public static ParseStatistics parse(final Reader reader, final String sourceDescription,
            final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(reader, "reader");
        Objects.requireNonNull(sourceDescription, "sourceDescription");
        Objects.requireNonNull(entryHandler, "entryHandler");

        // Reuse the caller's buffering when present instead of stacking a second buffer.
        final BufferedReader bufferedReader = reader instanceof BufferedReader alreadyBuffered ? alreadyBuffered
                : new BufferedReader(reader);

        int lineNumber = 0;
        int logicalEntryCount = 0;
        int ignoredLineCount = 0;

        String line;
        while ((line = bufferedReader.readLine()) != null) {
            lineNumber++;

            // Only the very first line of a source can carry a byte order mark.
            final String physicalLine = lineNumber == 1 ? stripByteOrderMark(line) : line;

            final String normalizedLine = stripRemark(physicalLine).trim().toLowerCase(Locale.ROOT);
            if (normalizedLine.isEmpty()) {
                ignoredLineCount++;
                continue;
            }

            // A non-empty trimmed line starts with a character above U+0020, which is
            // never a tokenizer delimiter, so at least one token (the stem) exists.
            final StringTokenizer tokenizer = new StringTokenizer(normalizedLine); // NOPMD
            final String stem = tokenizer.nextToken();
            final String[] variants = new String[tokenizer.countTokens()]; // NOPMD

            for (int index = 0; index < variants.length; index++) {
                variants[index] = tokenizer.nextToken();
            }

            entryHandler.onEntry(stem, variants, lineNumber);
            logicalEntryCount++;
        }

        final ParseStatistics statistics = new ParseStatistics(sourceDescription, lineNumber, logicalEntryCount,
                ignoredLineCount);

        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.log(Level.FINE, "Parsed dictionary source {0}: lines={1}, entries={2}, ignoredLines={3}.",
                    new Object[] { statistics.sourceDescription(), statistics.lineCount(), statistics.entryCount(),
                            statistics.ignoredLineCount() });
        }

        return statistics;
    }

    /**
     * Removes one leading byte order mark from a line, if present.
     *
     * @param line physical line
     * @return the line without a leading U+FEFF character
     */
    private static String stripByteOrderMark(final String line) {
        if (!line.isEmpty() && line.charAt(0) == BYTE_ORDER_MARK) {
            return line.substring(1);
        }
        return line;
    }

    /**
     * Removes a trailing remark from one physical line.
     *
     * <p>
     * The earliest occurrence of either supported remark marker terminates the
     * logical line content.
     *
     * @param line physical line
     * @return line content without a trailing remark
     */
    private static String stripRemark(final String line) {
        final int hashIndex = line.indexOf('#');
        final int slashIndex = line.indexOf("//");

        final int remarkIndex;
        if (hashIndex < 0) {
            // No hash marker: the slash marker decides (may itself be absent, i.e. -1).
            remarkIndex = slashIndex;
        } else if (slashIndex < 0) {
            remarkIndex = hashIndex;
        } else {
            remarkIndex = Math.min(hashIndex, slashIndex);
        }

        if (remarkIndex < 0) {
            return line;
        }
        return line.substring(0, remarkIndex);
    }

    /**
     * Immutable parsing statistics.
     *
     * @param sourceDescription logical source description
     * @param lineCount         number of physical lines read
     * @param entryCount        number of logical dictionary entries emitted
     * @param ignoredLineCount  number of ignored empty or remark-only lines
     */
    public record ParseStatistics(String sourceDescription, int lineCount, int entryCount, int ignoredLineCount) {

        /**
         * Creates parsing statistics.
         *
         * @param sourceDescription logical source description
         * @param lineCount         number of physical lines read
         * @param entryCount        number of logical dictionary entries emitted
         * @param ignoredLineCount  number of ignored empty or remark-only lines
         * @throws NullPointerException     if {@code sourceDescription} is {@code null}
         * @throws IllegalArgumentException if any numeric value is negative
         */
        public ParseStatistics {
            Objects.requireNonNull(sourceDescription, "sourceDescription");
            if (lineCount < 0) {
                throw new IllegalArgumentException("lineCount must not be negative.");
            }
            if (entryCount < 0) {
                throw new IllegalArgumentException("entryCount must not be negative.");
            }
            if (ignoredLineCount < 0) {
                throw new IllegalArgumentException("ignoredLineCount must not be negative.");
            }
        }
    }
}