TrieMetadata.java
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
/**
* Immutable metadata persisted together with a compiled trie artifact.
*
* <p>
* The metadata captures the semantic build configuration required to interpret
* the compiled trie correctly after it is reloaded. Persisting the metadata as
* part of the artifact makes the binary format self-describing and avoids
* coupling runtime consumers to external side-channel configuration.
* </p>
*
* <p>
* The record is intentionally extensible. It already models traversal
* direction, reduction settings, and diacritic processing strategy, even though
* not every field necessarily influences all current code paths yet.
* </p>
*
* @param formatVersion persisted binary format version of the trie
* artifact
* @param traversalDirection logical key traversal direction
* @param reductionSettings reduction settings used during compilation
* @param diacriticProcessingMode diacritic processing strategy associated with
* the artifact
* @param caseProcessingMode case processing strategy associated with the
* artifact
*/
public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDirection,
ReductionSettings reductionSettings, DiacriticProcessingMode diacriticProcessingMode,
CaseProcessingMode caseProcessingMode) {
/**
* Header identifying the human-readable metadata block layout.
*/
private static final String TEXT_BLOCK_HEADER = "radixor.metadata.v1";
/**
* Creates a new metadata instance.
*
* @param formatVersion persisted binary format version, must be at
* least {@code 1}
* @param traversalDirection logical key traversal direction
* @param reductionSettings reduction settings used during compilation
* @param diacriticProcessingMode diacritic processing strategy
* @param caseProcessingMode case processing strategy
*/
public TrieMetadata(final int formatVersion, final WordTraversalDirection traversalDirection,
final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode,
final CaseProcessingMode caseProcessingMode) {
if (formatVersion < 1) { // NOPMD
throw new IllegalArgumentException("formatVersion must be at least 1.");
}
this.formatVersion = formatVersion;
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
}
/**
* Creates metadata populated with current-format defaults for freshly compiled
* tries.
*
* @param formatVersion persisted binary format version
* @param traversalDirection logical key traversal direction
* @param reductionSettings reduction settings used during compilation
* @return metadata initialized with current defaults
*/
public static TrieMetadata current(final int formatVersion, final WordTraversalDirection traversalDirection,
final ReductionSettings reductionSettings) {
return new TrieMetadata(formatVersion, traversalDirection, reductionSettings, DiacriticProcessingMode.AS_IS,
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
}
/**
* Creates metadata for a newly compiled trie using the currently persisted
* binary stream format version.
*
* @param traversalDirection logical key traversal direction
* @param reductionSettings reduction settings used during compilation
* @param diacriticProcessingMode diacritic processing strategy
* @param caseProcessingMode case processing strategy
* @return metadata aligned with the current persisted stream format
*/
public static TrieMetadata forCompilation(final WordTraversalDirection traversalDirection,
final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode,
final CaseProcessingMode caseProcessingMode) {
return new TrieMetadata(FrequencyTrie.currentFormatVersion(), traversalDirection, reductionSettings,
diacriticProcessingMode, caseProcessingMode);
}
/**
* Creates metadata compatible with a legacy artifact version that did not store
* the full configuration explicitly.
*
* @param formatVersion legacy persisted binary format version
* @param traversalDirection logical key traversal direction reconstructed from
* the legacy stream
* @return metadata reconstructed with conservative compatibility defaults
*/
public static TrieMetadata legacy(final int formatVersion, final WordTraversalDirection traversalDirection) {
return new TrieMetadata(formatVersion, traversalDirection,
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
DiacriticProcessingMode.AS_IS, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
}
/**
* Returns metadata encoded as a deterministic human-readable text block.
*
* <p>
* The format intentionally uses plain {@code key=value} lines so users can
* inspect metadata quickly from a decompressed trie payload without additional
* dependencies.
* </p>
*
* @return persisted metadata text block
*/
@SuppressWarnings("PMD.ConsecutiveLiteralAppends")
public String toTextBlock() {
final StringBuilder textBlockBuilder = new StringBuilder(1024);
textBlockBuilder.append(TEXT_BLOCK_HEADER).append('\n')
//
.append("formatVersion=").append(this.formatVersion).append('\n')
//
.append("traversalDirection=").append(this.traversalDirection.name()).append('\n')
//
.append("rightToLeft=").append(this.traversalDirection == WordTraversalDirection.FORWARD).append('\n')
//
.append("reductionMode=").append(this.reductionSettings.reductionMode().name()).append('\n')
//
.append("dominantWinnerMinPercent=").append(this.reductionSettings.dominantWinnerMinPercent())
.append('\n')
//
.append("dominantWinnerOverSecondRatio=").append(this.reductionSettings.dominantWinnerOverSecondRatio())
.append('\n')
//
.append("diacriticProcessingMode=").append(this.diacriticProcessingMode.name()).append('\n')
//
.append("caseProcessingMode=").append(this.caseProcessingMode.name()).append('\n');
return textBlockBuilder.toString();
}
/**
* Parses metadata from a text block produced by {@link #toTextBlock()}.
*
* @param formatVersion persisted binary format version
* @param textBlock metadata text block
* @return parsed metadata
*/
public static TrieMetadata fromTextBlock(final int formatVersion, final String textBlock) {
Objects.requireNonNull(textBlock, "textBlock");
final String[] lines = textBlock.split("\\R");
if (lines.length == 0 || !TEXT_BLOCK_HEADER.equals(lines[0])) {
throw new IllegalArgumentException("Unsupported metadata block header.");
}
final Map<String, String> entries = new HashMap<>();
for (int index = 1; index < lines.length; index++) {
final String line = lines[index];
if (line.isBlank()) {
continue;
}
final int delimiterIndex = line.indexOf('=');
if (delimiterIndex <= 0 || delimiterIndex == line.length() - 1) {
throw new IllegalArgumentException("Invalid metadata line: " + line);
}
entries.put(line.substring(0, delimiterIndex), line.substring(delimiterIndex + 1));
}
final WordTraversalDirection traversalDirection = WordTraversalDirection
.valueOf(requireEntry(entries, "traversalDirection"));
final ReductionMode reductionMode = ReductionMode.valueOf(requireEntry(entries, "reductionMode"));
final int dominantWinnerMinPercent = Integer.parseInt(requireEntry(entries, "dominantWinnerMinPercent"));
final int dominantWinnerOverSecondRatio = Integer // NOPMD
.parseInt(requireEntry(entries, "dominantWinnerOverSecondRatio"));
final DiacriticProcessingMode diacriticProcessingMode = DiacriticProcessingMode
.valueOf(requireEntry(entries, "diacriticProcessingMode"));
final CaseProcessingMode caseProcessingMode = CaseProcessingMode
.valueOf(requireEntry(entries, "caseProcessingMode"));
return new TrieMetadata(formatVersion, traversalDirection,
new ReductionSettings(reductionMode, dominantWinnerMinPercent, dominantWinnerOverSecondRatio),
diacriticProcessingMode, caseProcessingMode);
}
/**
* Returns a required metadata entry from a parsed text block.
*
* @param entries parsed metadata entries
* @param key required entry key
* @return non-blank entry value
* @throws IllegalArgumentException if the entry is absent or blank
*/
private static String requireEntry(final Map<String, String> entries, final String key) {
final String value = entries.get(key);
if (value == null || value.isBlank()) {
throw new IllegalArgumentException("Missing metadata entry: " + key);
}
return value;
}
}