| 1 | /******************************************************************************* | |
| 2 | * Copyright (C) 2026, Leo Galambos | |
| 3 | * All rights reserved. | |
| 4 | * | |
| 5 | * Redistribution and use in source and binary forms, with or without | |
| 6 | * modification, are permitted provided that the following conditions are met: | |
| 7 | * | |
| 8 | * 1. Redistributions of source code must retain the above copyright notice, | |
| 9 | * this list of conditions and the following disclaimer. | |
| 10 | * | |
| 11 | * 2. Redistributions in binary form must reproduce the above copyright notice, | |
| 12 | * this list of conditions and the following disclaimer in the documentation | |
| 13 | * and/or other materials provided with the distribution. | |
| 14 | * | |
| 15 | * 3. Neither the name of the copyright holder nor the names of its contributors | |
| 16 | * may be used to endorse or promote products derived from this software | |
| 17 | * without specific prior written permission. | |
| 18 | * | |
| 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
| 20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | |
| 23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
| 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
| 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
| 26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
| 27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
| 28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
| 29 | * POSSIBILITY OF SUCH DAMAGE. | |
| 30 | ******************************************************************************/ | |
| 31 | package org.egothor.stemmer; | |
| 32 | ||
| 33 | import java.io.DataInputStream; | |
| 34 | import java.io.DataOutputStream; | |
| 35 | import java.io.IOException; | |
| 36 | import java.io.InputStream; | |
| 37 | import java.io.OutputStream; | |
| 38 | import java.util.ArrayList; | |
| 39 | import java.util.Arrays; | |
| 40 | import java.util.Collections; | |
| 41 | import java.util.IdentityHashMap; | |
| 42 | import java.util.LinkedHashMap; | |
| 43 | import java.util.List; | |
| 44 | import java.util.Locale; | |
| 45 | import java.util.Map; | |
| 46 | import java.util.Objects; | |
| 47 | import java.util.function.IntFunction; | |
| 48 | import java.util.logging.Level; | |
| 49 | import java.util.logging.Logger; | |
| 50 | ||
| 51 | import org.egothor.stemmer.trie.CompiledNode; | |
| 52 | import org.egothor.stemmer.trie.LocalValueSummary; | |
| 53 | import org.egothor.stemmer.trie.MutableNode; | |
| 54 | import org.egothor.stemmer.trie.NodeData; | |
| 55 | import org.egothor.stemmer.trie.ReducedNode; | |
| 56 | import org.egothor.stemmer.trie.ReductionContext; | |
| 57 | import org.egothor.stemmer.trie.ReductionSignature; | |
| 58 | ||
| 59 | /** | |
| 60 | * Read-only trie mapping {@link String} keys to one or more values with | |
| 61 | * frequency tracking. | |
| 62 | * | |
| 63 | * <p> | |
| 64 | * A key may be associated with multiple values. Each value keeps the number of | |
| 65 | * times it was inserted during the build phase. The method {@link #get(String)} | |
| 66 | * returns the locally most frequent value stored at the terminal node of the | |
| 67 | * supplied key, while {@link #getAll(String)} returns all locally stored values | |
| 68 | * ordered by descending frequency. | |
| 69 | * | |
| 70 | * <p> | |
| 71 | * If multiple values have the same local frequency, their ordering is | |
| 72 | * deterministic. The preferred value is selected by the following tie-breaking | |
| 73 | * rules, in order: | |
| 74 | * <ol> | |
| 75 | * <li>shorter {@link String} representation wins, based on | |
| 76 | * {@code value.toString()}</li> | |
| 77 | * <li>if the lengths are equal, lexicographically lower {@link String} | |
| 78 | * representation wins</li> | |
| 79 | * <li>if the textual representations are still equal, first-seen insertion | |
| 80 | * order remains stable</li> | |
| 81 | * </ol> | |
| 82 | * | |
| 83 | * <p> | |
| 84 | * Values may be stored at any trie node, including internal nodes and leaf | |
| 85 | * nodes. Therefore, reduction and canonicalization always operate on both the | |
| 86 | * node-local terminal values and the structure of all descendant edges. | |
| 87 | * | |
| 88 | * @param <V> value type | |
| 89 | */ | |
| 90 | @SuppressWarnings("PMD.CyclomaticComplexity") | |
| 91 | public final class FrequencyTrie<V> { | |
| 92 | ||
| 93 | /** | |
| 94 | * Logger of this class. | |
| 95 | */ | |
| 96 | private static final Logger LOGGER = Logger.getLogger(FrequencyTrie.class.getName()); | |
| 97 | ||
| 98 | /** | |
| 99 | * Factory used to create correctly typed arrays for {@link #getAll(String)}. | |
| 100 | */ | |
| 101 | private final IntFunction<V[]> arrayFactory; | |
| 102 | ||
| 103 | /** | |
| 104 | * Root node of the compiled read-only trie. | |
| 105 | */ | |
| 106 | private final CompiledNode<V> root; | |
| 107 | ||
| 108 | /** | |
| 109 | * Metadata persisted together with this trie. | |
| 110 | */ | |
| 111 | private final TrieMetadata metadata; | |
| 112 | ||
| 113 | /** | |
| 114 | * Binary format magic header. | |
| 115 | */ | |
| 116 | private static final int STREAM_MAGIC = 0x45475452; | |
| 117 | ||
| 118 | /** | |
| 119 | * Binary format version. | |
| 120 | */ | |
| 121 | private static final int STREAM_VERSION = 5; | |
| 122 | ||
| 123 | /** | |
| 124 | * Returns the current persisted binary stream format version. | |
| 125 | * | |
| 126 | * <p> | |
| 127 | * This method exists so other components can construct {@link TrieMetadata} | |
| 128 | * instances aligned with the currently written binary format without | |
| 129 | * duplicating constants. | |
| 130 | * </p> | |
| 131 | * | |
| 132 | * @return current trie stream format version | |
| 133 | */ | |
| 134 | public static int currentFormatVersion() { | |
| 135 |
1
1. currentFormatVersion : replaced int return with 0 for org/egothor/stemmer/FrequencyTrie::currentFormatVersion → KILLED |
return STREAM_VERSION; |
| 136 | } | |
| 137 | ||
| 138 | /** | |
| 139 | * Creates a new compiled trie instance. | |
| 140 | * | |
| 141 | * @param arrayFactory array factory | |
| 142 | * @param root compiled root node | |
| 143 | * @param metadata trie metadata describing lookup and persistence semantics | |
| 144 | * @throws NullPointerException if any argument is {@code null} | |
| 145 | */ | |
| 146 | private FrequencyTrie(final IntFunction<V[]> arrayFactory, final CompiledNode<V> root, | |
| 147 | final TrieMetadata metadata) { | |
| 148 | this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory"); | |
| 149 | this.root = Objects.requireNonNull(root, "root"); | |
| 150 | this.metadata = Objects.requireNonNull(metadata, "metadata"); | |
| 151 | } | |
| 152 | ||
| 153 | /** | |
| 154 | * Returns the most frequent value stored at the node addressed by the supplied | |
| 155 | * key. | |
| 156 | * | |
| 157 | * <p> | |
| 158 | * If multiple values have the same local frequency, the returned value is | |
| 159 | * selected deterministically by shorter {@code toString()} value first, then by | |
| 160 | * lexicographically lower {@code toString()}, and finally by stable first-seen | |
| 161 | * order. | |
| 162 | * | |
| 163 | * <p> | |
| 164 | * The supplied key is normalized according to persisted | |
| 165 | * {@link TrieMetadata#caseProcessingMode()} before traversal. | |
| 166 | * | |
| 167 | * @param key key to resolve | |
| 168 | * @return most frequent value, or {@code null} if the key does not exist or no | |
| 169 | * value is stored at the addressed node | |
| 170 | * @throws NullPointerException if {@code key} is {@code null} | |
| 171 | */ | |
| 172 | public V get(final String key) { | |
| 173 | Objects.requireNonNull(key, "key"); | |
| 174 | final CompiledNode<V> node = findNode(normalizeLookupKey(key)); | |
| 175 |
2
1. get : negated conditional → KILLED 2. get : negated conditional → KILLED |
if (node == null || node.orderedValues().length == 0) { |
| 176 | return null; | |
| 177 | } | |
| 178 |
1
1. get : replaced return value with null for org/egothor/stemmer/FrequencyTrie::get → KILLED |
return node.orderedValues()[0]; |
| 179 | } | |
| 180 | ||
| 181 | /** | |
| 182 | * Returns all values stored at the node addressed by the supplied key, ordered | |
| 183 | * by descending frequency. | |
| 184 | * | |
| 185 | * <p> | |
| 186 | * If multiple values have the same local frequency, the ordering is | |
| 187 | * deterministic by shorter {@code toString()} value first, then by | |
| 188 | * lexicographically lower {@code toString()}, and finally by stable first-seen | |
| 189 | * order. | |
| 190 | * | |
| 191 | * <p> | |
| 192 | * The returned array is a defensive copy. | |
| 193 | * | |
| 194 | * <p> | |
| 195 | * The supplied key is normalized according to persisted | |
| 196 | * {@link TrieMetadata#caseProcessingMode()} before traversal. | |
| 197 | * | |
| 198 | * @param key key to resolve | |
| 199 | * @return all values stored at the addressed node, ordered by descending | |
| 200 | * frequency; returns an empty array if the key does not exist or no | |
| 201 | * value is stored at the addressed node | |
| 202 | * @throws NullPointerException if {@code key} is {@code null} | |
| 203 | */ | |
| 204 | public V[] getAll(final String key) { | |
| 205 | Objects.requireNonNull(key, "key"); | |
| 206 | final CompiledNode<V> node = findNode(normalizeLookupKey(key)); | |
| 207 |
2
1. getAll : negated conditional → KILLED 2. getAll : negated conditional → KILLED |
if (node == null || node.orderedValues().length == 0) { |
| 208 |
1
1. getAll : replaced return value with null for org/egothor/stemmer/FrequencyTrie::getAll → KILLED |
return this.arrayFactory.apply(0); |
| 209 | } | |
| 210 |
1
1. getAll : replaced return value with null for org/egothor/stemmer/FrequencyTrie::getAll → KILLED |
return Arrays.copyOf(node.orderedValues(), node.orderedValues().length); |
| 211 | } | |
| 212 | ||
| 213 | /** | |
| 214 | * Returns all values stored at the node addressed by the supplied key together | |
| 215 | * with their occurrence counts, ordered by the same rules as | |
| 216 | * {@link #getAll(String)}. | |
| 217 | * | |
| 218 | * <p> | |
| 219 | * The returned list is aligned with the arrays returned by | |
| 220 | * {@link #getAll(String)} and the internal compiled count representation. | |
| 221 | * | |
| 222 | * <p> | |
| 223 | * The returned list is immutable. | |
| 224 | * | |
| 225 | * <p> | |
| 226 | * In reduction modes that merge semantically equivalent subtrees, the returned | |
| 227 | * counts may be aggregated across multiple original build-time nodes that were | |
| 228 | * reduced into the same canonical compiled node. | |
| 229 | * | |
| 230 | * @param key key to resolve | |
| 231 | * @return immutable ordered list of value-count entries; returns an empty list | |
| 232 | * if the key does not exist or no value is stored at the addressed node | |
| 233 | * @throws NullPointerException if {@code key} is {@code null} | |
| 234 | */ | |
| 235 | public List<ValueCount<V>> getEntries(final String key) { | |
| 236 | Objects.requireNonNull(key, "key"); | |
| 237 | final CompiledNode<V> node = findNode(normalizeLookupKey(key)); | |
| 238 |
2
1. getEntries : negated conditional → KILLED 2. getEntries : negated conditional → KILLED |
if (node == null || node.orderedValues().length == 0) { |
| 239 | return List.of(); | |
| 240 | } | |
| 241 | ||
| 242 | final List<ValueCount<V>> entries = new ArrayList<>(node.orderedValues().length); | |
| 243 |
2
1. getEntries : changed conditional boundary → KILLED 2. getEntries : negated conditional → KILLED |
for (int index = 0; index < node.orderedValues().length; index++) { |
| 244 | entries.add(new ValueCount<>(node.orderedValues()[index], node.orderedCounts()[index])); | |
| 245 | } | |
| 246 |
1
1. getEntries : replaced return value with Collections.emptyList for org/egothor/stemmer/FrequencyTrie::getEntries → KILLED |
return Collections.unmodifiableList(entries); |
| 247 | } | |
| 248 | ||
| 249 | /** | |
| 250 | * Returns the logical key traversal direction used by this trie. | |
| 251 | * | |
| 252 | * <p> | |
| 253 | * The same direction must be used when reconstructing mutable builders or when | |
| 254 | * applying patch commands that were generated against keys stored in this trie. | |
| 255 | * </p> | |
| 256 | * | |
| 257 | * @return logical key traversal direction | |
| 258 | */ | |
| 259 | public WordTraversalDirection traversalDirection() { | |
| 260 |
1
1. traversalDirection : replaced return value with null for org/egothor/stemmer/FrequencyTrie::traversalDirection → KILLED |
return this.metadata.traversalDirection(); |
| 261 | } | |
| 262 | ||
| 263 | /** | |
| 264 | * Returns immutable persisted metadata associated with this trie. | |
| 265 | * | |
| 266 | * @return trie metadata | |
| 267 | */ | |
| 268 | public TrieMetadata metadata() { | |
| 269 |
1
1. metadata : replaced return value with null for org/egothor/stemmer/FrequencyTrie::metadata → KILLED |
return this.metadata; |
| 270 | } | |
| 271 | ||
| 272 | /** | |
| 273 | * Returns the root node mainly for diagnostics and tests within the package. | |
| 274 | * | |
| 275 | * @return compiled root node | |
| 276 | */ | |
| 277 | /* default */ CompiledNode<V> root() { | |
| 278 |
1
1. root : replaced return value with null for org/egothor/stemmer/FrequencyTrie::root → KILLED |
return this.root; |
| 279 | } | |
| 280 | ||
| 281 | /** | |
| 282 | * Writes this compiled trie to the supplied output stream. | |
| 283 | * | |
| 284 | * <p> | |
| 285 | * The binary format is versioned and preserves canonical shared compiled nodes, | |
| 286 | * therefore the serialized representation remains compact even for tries | |
| 287 | * reduced by subtree merging. | |
| 288 | * | |
| 289 | * <p> | |
| 290 | * The supplied codec is responsible for persisting individual values of type | |
| 291 | * {@code V}. | |
| 292 | * | |
| 293 | * @param outputStream target output stream | |
| 294 | * @param valueCodec codec used to write values | |
| 295 | * @throws NullPointerException if any argument is {@code null} | |
| 296 | * @throws IOException if writing fails | |
| 297 | */ | |
| 298 | public void writeTo(final OutputStream outputStream, final ValueStreamCodec<V> valueCodec) throws IOException { | |
| 299 | Objects.requireNonNull(outputStream, "outputStream"); | |
| 300 | Objects.requireNonNull(valueCodec, "valueCodec"); | |
| 301 | ||
| 302 | final DataOutputStream dataOutput; // NOPMD | |
| 303 |
1
1. writeTo : negated conditional → KILLED |
if (outputStream instanceof DataOutputStream) { |
| 304 | dataOutput = (DataOutputStream) outputStream; | |
| 305 | } else { | |
| 306 | dataOutput = new DataOutputStream(outputStream); | |
| 307 | } | |
| 308 | ||
| 309 | final Map<CompiledNode<V>, Integer> nodeIds = new IdentityHashMap<>(); | |
| 310 | final List<CompiledNode<V>> orderedNodes = new ArrayList<>(); | |
| 311 |
1
1. writeTo : removed call to org/egothor/stemmer/FrequencyTrie::assignNodeIds → KILLED |
assignNodeIds(this.root, nodeIds, orderedNodes); |
| 312 | ||
| 313 | if (LOGGER.isLoggable(Level.FINE)) { | |
| 314 | LOGGER.log(Level.FINE, "Writing compiled trie with {0} canonical nodes.", orderedNodes.size()); | |
| 315 | } | |
| 316 | ||
| 317 |
1
1. writeTo : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(STREAM_MAGIC); |
| 318 |
1
1. writeTo : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(STREAM_VERSION); |
| 319 |
1
1. writeTo : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(orderedNodes.size()); |
| 320 |
1
1. writeTo : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(nodeIds.get(this.root)); |
| 321 |
1
1. writeTo : removed call to org/egothor/stemmer/FrequencyTrie::writeMetadata → KILLED |
writeMetadata(dataOutput, this.metadata); |
| 322 | ||
| 323 | for (CompiledNode<V> node : orderedNodes) { | |
| 324 |
1
1. writeTo : removed call to org/egothor/stemmer/FrequencyTrie::writeNode → KILLED |
writeNode(dataOutput, valueCodec, node, nodeIds); |
| 325 | } | |
| 326 | ||
| 327 |
1
1. writeTo : removed call to java/io/DataOutputStream::flush → SURVIVED |
dataOutput.flush(); |
| 328 | } | |
| 329 | ||
| 330 | /** | |
| 331 | * Reads a compiled trie from the supplied input stream. | |
| 332 | * | |
| 333 | * <p> | |
| 334 | * The caller must provide the same value codec semantics that were used during | |
| 335 | * persistence as well as the array factory required for typed result arrays. | |
| 336 | * | |
| 337 | * @param inputStream source input stream | |
| 338 | * @param arrayFactory factory used to create typed arrays | |
| 339 | * @param valueCodec codec used to read values | |
| 340 | * @param <V> value type | |
| 341 | * @return deserialized compiled trie | |
| 342 | * @throws NullPointerException if any argument is {@code null} | |
| 343 | * @throws IOException if reading fails or the binary format is invalid | |
| 344 | */ | |
| 345 | public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory, | |
| 346 | final ValueStreamCodec<V> valueCodec) throws IOException { | |
| 347 | Objects.requireNonNull(inputStream, "inputStream"); | |
| 348 | Objects.requireNonNull(arrayFactory, "arrayFactory"); | |
| 349 | Objects.requireNonNull(valueCodec, "valueCodec"); | |
| 350 | ||
| 351 | final DataInputStream dataInput; // NOPMD | |
| 352 |
1
1. readFrom : negated conditional → KILLED |
if (inputStream instanceof DataInputStream) { |
| 353 | dataInput = (DataInputStream) inputStream; | |
| 354 | } else { | |
| 355 | dataInput = new DataInputStream(inputStream); | |
| 356 | } | |
| 357 | ||
| 358 | final int magic = dataInput.readInt(); | |
| 359 |
1
1. readFrom : negated conditional → KILLED |
if (magic != STREAM_MAGIC) { |
| 360 | throw new IOException("Unsupported trie stream header: " + Integer.toHexString(magic)); | |
| 361 | } | |
| 362 | ||
| 363 | final int version = dataInput.readInt(); | |
| 364 |
4
1. readFrom : negated conditional → KILLED 2. readFrom : negated conditional → KILLED 3. readFrom : negated conditional → KILLED 4. readFrom : negated conditional → KILLED |
if (version != 1 && version != 3 && version != 4 && version != STREAM_VERSION) { |
| 365 | throw new IOException("Unsupported trie stream version: " + version); | |
| 366 | } | |
| 367 | ||
| 368 | final int nodeCount = dataInput.readInt(); | |
| 369 |
2
1. readFrom : changed conditional boundary → SURVIVED 2. readFrom : negated conditional → KILLED |
if (nodeCount < 0) { |
| 370 | throw new IOException("Negative node count: " + nodeCount); | |
| 371 | } | |
| 372 | ||
| 373 | final int rootNodeId = dataInput.readInt(); | |
| 374 |
4
1. readFrom : negated conditional → KILLED 2. readFrom : changed conditional boundary → KILLED 3. readFrom : negated conditional → KILLED 4. readFrom : changed conditional boundary → KILLED |
if (rootNodeId < 0 || rootNodeId >= nodeCount) { |
| 375 | throw new IOException("Invalid root node id: " + rootNodeId); | |
| 376 | } | |
| 377 | ||
| 378 | final TrieMetadata metadata = readMetadata(dataInput, version); | |
| 379 | ||
| 380 | final CompiledNode<V>[] nodes = readNodes(dataInput, arrayFactory, valueCodec, nodeCount); | |
| 381 | final CompiledNode<V> rootNode = nodes[rootNodeId]; | |
| 382 | ||
| 383 | if (LOGGER.isLoggable(Level.FINE)) { | |
| 384 | LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount); | |
| 385 | } | |
| 386 | ||
| 387 |
1
1. readFrom : replaced return value with null for org/egothor/stemmer/FrequencyTrie::readFrom → KILLED |
return new FrequencyTrie<>(arrayFactory, rootNode, metadata); |
| 388 | } | |
| 389 | ||
| 390 | /** | |
| 391 | * Writes persisted trie metadata. | |
| 392 | * | |
| 393 | * @param dataOutput output stream | |
| 394 | * @param metadata metadata to serialize | |
| 395 | * @throws IOException if writing fails | |
| 396 | */ | |
| 397 | private static void writeMetadata(final DataOutputStream dataOutput, final TrieMetadata metadata) | |
| 398 | throws IOException { | |
| 399 |
1
1. writeMetadata : removed call to java/io/DataOutputStream::writeUTF → KILLED |
dataOutput.writeUTF(metadata.toTextBlock()); |
| 400 | } | |
| 401 | ||
| 402 | /** | |
| 403 | * Reads persisted trie metadata while remaining backward compatible with | |
| 404 | * earlier stream versions. | |
| 405 | * | |
| 406 | * @param dataInput input stream | |
| 407 | * @param version persisted stream version | |
| 408 | * @return deserialized metadata | |
| 409 | * @throws IOException if the metadata section is invalid | |
| 410 | */ | |
| 411 | private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException { | |
| 412 |
2
1. readMetadata : negated conditional → KILLED 2. readMetadata : changed conditional boundary → KILLED |
if (version >= 5) { // NOPMD |
| 413 | try { | |
| 414 |
1
1. readMetadata : replaced return value with null for org/egothor/stemmer/FrequencyTrie::readMetadata → KILLED |
return TrieMetadata.fromTextBlock(version, dataInput.readUTF()); |
| 415 | } catch (IllegalArgumentException exception) { | |
| 416 | throw new IOException("Invalid metadata block.", exception); | |
| 417 | } | |
| 418 | } | |
| 419 | ||
| 420 | final WordTraversalDirection traversalDirection; | |
| 421 |
2
1. readMetadata : changed conditional boundary → SURVIVED 2. readMetadata : negated conditional → KILLED |
if (version >= 2) { // NOPMD |
| 422 | final int traversalDirectionOrdinal = dataInput.readInt(); | |
| 423 | final WordTraversalDirection[] traversalDirections = WordTraversalDirection.values(); | |
| 424 |
4
1. readMetadata : changed conditional boundary → NO_COVERAGE 2. readMetadata : negated conditional → NO_COVERAGE 3. readMetadata : changed conditional boundary → NO_COVERAGE 4. readMetadata : negated conditional → NO_COVERAGE |
if (traversalDirectionOrdinal < 0 || traversalDirectionOrdinal >= traversalDirections.length) { |
| 425 | throw new IOException("Invalid traversal direction ordinal: " + traversalDirectionOrdinal); | |
| 426 | } | |
| 427 | traversalDirection = traversalDirections[traversalDirectionOrdinal]; | |
| 428 | } else { | |
| 429 | traversalDirection = WordTraversalDirection.BACKWARD; | |
| 430 | } | |
| 431 | ||
| 432 |
2
1. readMetadata : changed conditional boundary → SURVIVED 2. readMetadata : negated conditional → KILLED |
if (version < 3) { // NOPMD |
| 433 |
1
1. readMetadata : replaced return value with null for org/egothor/stemmer/FrequencyTrie::readMetadata → SURVIVED |
return TrieMetadata.legacy(version, traversalDirection); |
| 434 | } | |
| 435 | ||
| 436 | final ReductionMode[] reductionModes = ReductionMode.values(); | |
| 437 | final int reductionModeOrdinal = dataInput.readInt(); | |
| 438 |
4
1. readMetadata : changed conditional boundary → NO_COVERAGE 2. readMetadata : negated conditional → NO_COVERAGE 3. readMetadata : changed conditional boundary → NO_COVERAGE 4. readMetadata : negated conditional → NO_COVERAGE |
if (reductionModeOrdinal < 0 || reductionModeOrdinal >= reductionModes.length) { |
| 439 | throw new IOException("Invalid reduction mode ordinal: " + reductionModeOrdinal); | |
| 440 | } | |
| 441 | ||
| 442 | final int dominantWinnerMinPercent = dataInput.readInt(); | |
| 443 | final int dominantWinnerOverSecondRatio = dataInput.readInt(); // NOPMD | |
| 444 | ||
| 445 | final DiacriticProcessingMode[] diacriticProcessingModes = DiacriticProcessingMode.values(); | |
| 446 | final int diacriticProcessingModeOrdinal = dataInput.readInt(); // NOPMD | |
| 447 |
4
1. readMetadata : changed conditional boundary → NO_COVERAGE 2. readMetadata : negated conditional → NO_COVERAGE 3. readMetadata : changed conditional boundary → NO_COVERAGE 4. readMetadata : negated conditional → NO_COVERAGE |
if (diacriticProcessingModeOrdinal < 0 || diacriticProcessingModeOrdinal >= diacriticProcessingModes.length) { |
| 448 | throw new IOException("Invalid diacritic processing mode ordinal: " + diacriticProcessingModeOrdinal); | |
| 449 | } | |
| 450 | ||
| 451 | final CaseProcessingMode caseProcessingMode; | |
| 452 |
2
1. readMetadata : negated conditional → NO_COVERAGE 2. readMetadata : changed conditional boundary → NO_COVERAGE |
if (version >= 4) { // NOPMD |
| 453 | final CaseProcessingMode[] caseProcessingModes = CaseProcessingMode.values(); | |
| 454 | final int caseProcessingModeOrdinal = dataInput.readInt(); | |
| 455 |
4
1. readMetadata : negated conditional → NO_COVERAGE 2. readMetadata : changed conditional boundary → NO_COVERAGE 3. readMetadata : negated conditional → NO_COVERAGE 4. readMetadata : changed conditional boundary → NO_COVERAGE |
if (caseProcessingModeOrdinal < 0 || caseProcessingModeOrdinal >= caseProcessingModes.length) { |
| 456 | throw new IOException("Invalid case processing mode ordinal: " + caseProcessingModeOrdinal); | |
| 457 | } | |
| 458 | caseProcessingMode = caseProcessingModes[caseProcessingModeOrdinal]; | |
| 459 | } else { | |
| 460 | caseProcessingMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT; | |
| 461 | } | |
| 462 | ||
| 463 |
1
1. readMetadata : replaced return value with null for org/egothor/stemmer/FrequencyTrie::readMetadata → NO_COVERAGE |
return new TrieMetadata(version, traversalDirection, |
| 464 | new ReductionSettings(reductionModes[reductionModeOrdinal], dominantWinnerMinPercent, | |
| 465 | dominantWinnerOverSecondRatio), | |
| 466 | diacriticProcessingModes[diacriticProcessingModeOrdinal], caseProcessingMode); | |
| 467 | } | |
| 468 | ||
| 469 | /** | |
| 470 | * Returns the number of canonical compiled nodes reachable from the root. | |
| 471 | * | |
| 472 | * <p> | |
| 473 | * The returned value reflects the size of the final reduced immutable trie, not | |
| 474 | * the number of mutable build-time nodes inserted before reduction. Shared | |
| 475 | * canonical subtrees are counted only once. | |
| 476 | * | |
| 477 | * @return number of canonical compiled nodes in this trie | |
| 478 | */ | |
| 479 | public int size() { | |
| 480 | final Map<CompiledNode<V>, Integer> nodeIds = new IdentityHashMap<>(); | |
| 481 | final List<CompiledNode<V>> orderedNodes = new ArrayList<>(); | |
| 482 |
1
1. size : removed call to org/egothor/stemmer/FrequencyTrie::assignNodeIds → KILLED |
assignNodeIds(this.root, nodeIds, orderedNodes); |
| 483 |
1
1. size : replaced int return with 0 for org/egothor/stemmer/FrequencyTrie::size → KILLED |
return orderedNodes.size(); |
| 484 | } | |
| 485 | ||
| 486 | /** | |
| 487 | * Assigns deterministic identifiers to all canonical compiled nodes reachable | |
| 488 | * from the supplied root. | |
| 489 | * | |
| 490 | * @param node current node | |
| 491 | * @param nodeIds assigned node identifiers | |
| 492 | * @param orderedNodes ordered nodes in identifier order | |
| 493 | */ | |
| 494 | private static <V> void assignNodeIds(final CompiledNode<V> node, final Map<CompiledNode<V>, Integer> nodeIds, | |
| 495 | final List<CompiledNode<V>> orderedNodes) { | |
| 496 |
1
1. assignNodeIds : negated conditional → KILLED |
if (nodeIds.containsKey(node)) { |
| 497 | return; | |
| 498 | } | |
| 499 | ||
| 500 | final int nodeId = orderedNodes.size(); | |
| 501 | nodeIds.put(node, nodeId); | |
| 502 | orderedNodes.add(node); | |
| 503 | ||
| 504 | for (CompiledNode<V> child : node.children()) { | |
| 505 |
1
1. assignNodeIds : removed call to org/egothor/stemmer/FrequencyTrie::assignNodeIds → KILLED |
assignNodeIds(child, nodeIds, orderedNodes); |
| 506 | } | |
| 507 | } | |
| 508 | ||
| 509 | /** | |
| 510 | * Writes one compiled node. | |
| 511 | * | |
| 512 | * @param dataOutput output | |
| 513 | * @param valueCodec value codec | |
| 514 | * @param node node to write | |
| 515 | * @param nodeIds node identifiers | |
| 516 | * @throws IOException if writing fails | |
| 517 | */ | |
| 518 | private static <V> void writeNode(final DataOutputStream dataOutput, final ValueStreamCodec<V> valueCodec, | |
| 519 | final CompiledNode<V> node, final Map<CompiledNode<V>, Integer> nodeIds) throws IOException { | |
| 520 |
1
1. writeNode : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(node.edgeLabels().length); |
| 521 |
2
1. writeNode : changed conditional boundary → KILLED 2. writeNode : negated conditional → KILLED |
for (int index = 0; index < node.edgeLabels().length; index++) { |
| 522 |
1
1. writeNode : removed call to java/io/DataOutputStream::writeChar → KILLED |
dataOutput.writeChar(node.edgeLabels()[index]); |
| 523 | final Integer childNodeId = nodeIds.get(node.children()[index]); | |
| 524 |
1
1. writeNode : negated conditional → KILLED |
if (childNodeId == null) { |
| 525 | throw new IOException("Missing child node identifier during serialization."); | |
| 526 | } | |
| 527 |
1
1. writeNode : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(childNodeId); |
| 528 | } | |
| 529 | ||
| 530 |
1
1. writeNode : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(node.orderedValues().length); |
| 531 |
2
1. writeNode : changed conditional boundary → KILLED 2. writeNode : negated conditional → KILLED |
for (int index = 0; index < node.orderedValues().length; index++) { |
| 532 |
1
1. writeNode : removed call to org/egothor/stemmer/FrequencyTrie$ValueStreamCodec::write → KILLED |
valueCodec.write(dataOutput, node.orderedValues()[index]); |
| 533 |
1
1. writeNode : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(node.orderedCounts()[index]); |
| 534 | } | |
| 535 | } | |
| 536 | ||
| 537 | /** | |
| 538 | * Reads all compiled nodes and resolves child references. | |
| 539 | * | |
| 540 | * @param dataInput input | |
| 541 | * @param arrayFactory array factory | |
| 542 | * @param valueCodec value codec | |
| 543 | * @param nodeCount number of nodes | |
| 544 | * @param <V> value type | |
| 545 | * @return array of nodes indexed by serialized node identifier | |
| 546 | * @throws IOException if reading fails or the stream is invalid | |
| 547 | */ | |
| 548 | @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") | |
| 549 | private static <V> CompiledNode<V>[] readNodes(final DataInputStream dataInput, final IntFunction<V[]> arrayFactory, | |
| 550 | final ValueStreamCodec<V> valueCodec, final int nodeCount) throws IOException { | |
| 551 | final List<NodeData<V>> nodeDataList = new ArrayList<>(nodeCount); | |
| 552 | ||
| 553 |
2
1. readNodes : changed conditional boundary → KILLED 2. readNodes : negated conditional → KILLED |
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) { |
| 554 | final int edgeCount = dataInput.readInt(); | |
| 555 |
2
1. readNodes : changed conditional boundary → KILLED 2. readNodes : negated conditional → KILLED |
if (edgeCount < 0) { |
| 556 | throw new IOException("Negative edge count at node " + nodeIndex + ": " + edgeCount); | |
| 557 | } | |
| 558 | ||
| 559 | final char[] edgeLabels = new char[edgeCount]; | |
| 560 | final int[] childNodeIds = new int[edgeCount]; | |
| 561 | ||
| 562 |
2
1. readNodes : changed conditional boundary → KILLED 2. readNodes : negated conditional → KILLED |
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) { |
| 563 | edgeLabels[edgeIndex] = dataInput.readChar(); | |
| 564 | childNodeIds[edgeIndex] = dataInput.readInt(); | |
| 565 | } | |
| 566 | ||
| 567 |
1
1. readNodes : removed call to org/egothor/stemmer/FrequencyTrie::validateSerializedEdges → KILLED |
validateSerializedEdges(nodeIndex, edgeLabels); |
| 568 | ||
| 569 | final int valueCount = dataInput.readInt(); | |
| 570 |
2
1. readNodes : changed conditional boundary → KILLED 2. readNodes : negated conditional → KILLED |
if (valueCount < 0) { |
| 571 | throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount); | |
| 572 | } | |
| 573 | ||
| 574 | final V[] orderedValues = arrayFactory.apply(valueCount); | |
| 575 | final int[] orderedCounts = new int[valueCount]; | |
| 576 | ||
| 577 |
2
1. readNodes : negated conditional → KILLED 2. readNodes : changed conditional boundary → KILLED |
for (int valueIndex = 0; valueIndex < valueCount; valueIndex++) { |
| 578 | orderedValues[valueIndex] = valueCodec.read(dataInput); | |
| 579 | orderedCounts[valueIndex] = dataInput.readInt(); | |
| 580 |
2
1. readNodes : negated conditional → KILLED 2. readNodes : changed conditional boundary → KILLED |
if (orderedCounts[valueIndex] <= 0) { |
| 581 | throw new IOException("Non-positive stored count at node " + nodeIndex + ", value index " | |
| 582 | + valueIndex + ": " + orderedCounts[valueIndex]); | |
| 583 | } | |
| 584 | } | |
| 585 | ||
| 586 | nodeDataList.add(new NodeData<>(edgeLabels, childNodeIds, orderedValues, orderedCounts)); | |
| 587 | } | |
| 588 | ||
| 589 | @SuppressWarnings("unchecked") | |
| 590 | final CompiledNode<V>[] nodes = new CompiledNode[nodeCount]; | |
| 591 | ||
| 592 |
2
1. readNodes : negated conditional → KILLED 2. readNodes : changed conditional boundary → KILLED |
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) { |
| 593 | final NodeData<V> nodeData = nodeDataList.get(nodeIndex); | |
| 594 | @SuppressWarnings("unchecked") | |
| 595 | final CompiledNode<V>[] children = new CompiledNode[nodeData.childNodeIds().length]; | |
| 596 | nodes[nodeIndex] = new CompiledNode<>(nodeData.edgeLabels(), children, nodeData.orderedValues(), | |
| 597 | nodeData.orderedCounts()); | |
| 598 | } | |
| 599 | ||
| 600 |
2
1. readNodes : negated conditional → KILLED 2. readNodes : changed conditional boundary → KILLED |
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) { |
| 601 | final NodeData<V> nodeData = nodeDataList.get(nodeIndex); | |
| 602 | final CompiledNode<V> node = nodes[nodeIndex]; | |
| 603 | ||
| 604 |
2
1. readNodes : changed conditional boundary → KILLED 2. readNodes : negated conditional → KILLED |
for (int edgeIndex = 0; edgeIndex < nodeData.childNodeIds().length; edgeIndex++) { |
| 605 | final int childNodeId = nodeData.childNodeIds()[edgeIndex]; | |
| 606 |
4
1. readNodes : changed conditional boundary → SURVIVED 2. readNodes : changed conditional boundary → SURVIVED 3. readNodes : negated conditional → KILLED 4. readNodes : negated conditional → KILLED |
if (childNodeId < 0 || childNodeId >= nodeCount) { |
| 607 | throw new IOException("Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex | |
| 608 | + ": " + childNodeId); | |
| 609 | } | |
| 610 | node.children()[edgeIndex] = nodes[childNodeId]; | |
| 611 | } | |
| 612 | } | |
| 613 | ||
| 614 |
1
1. readNodes : replaced return value with null for org/egothor/stemmer/FrequencyTrie::readNodes → KILLED |
return nodes; |
| 615 | } | |
| 616 | ||
| 617 | /** | |
| 618 | * Validates the serialized edge-label sequence for one node. | |
| 619 | * | |
| 620 | * <p> | |
| 621 | * Compiled nodes rely on binary search for child lookup and therefore require | |
| 622 | * edge labels to be stored in strict ascending order without duplicates. | |
| 623 | * Rejecting malformed streams here keeps lookup semantics deterministic and | |
| 624 | * avoids silently constructing a trie whose search behavior would be undefined. | |
| 625 | * | |
| 626 | * @param nodeIndex serialized node identifier | |
| 627 | * @param edgeLabels serialized edge labels | |
| 628 | * @throws IOException if the edge labels are not strictly ascending | |
| 629 | */ | |
| 630 | private static void validateSerializedEdges(final int nodeIndex, final char... edgeLabels) throws IOException { | |
| 631 |
2
1. validateSerializedEdges : negated conditional → KILLED 2. validateSerializedEdges : changed conditional boundary → KILLED |
for (int edgeIndex = 1; edgeIndex < edgeLabels.length; edgeIndex++) { |
| 632 |
3
1. validateSerializedEdges : changed conditional boundary → SURVIVED 2. validateSerializedEdges : negated conditional → KILLED 3. validateSerializedEdges : Replaced integer subtraction with addition → KILLED |
if (edgeLabels[edgeIndex - 1] >= edgeLabels[edgeIndex]) { |
| 633 |
1
1. validateSerializedEdges : Replaced integer subtraction with addition → KILLED |
throw new IOException("Edge labels must be strictly ascending at node " + nodeIndex + ", edge index " |
| 634 | + edgeIndex + ": '" + edgeLabels[edgeIndex - 1] + "' then '" + edgeLabels[edgeIndex] + "'."); | |
| 635 | } | |
| 636 | } | |
| 637 | } | |
| 638 | ||
| 639 | /** | |
| 640 | * Locates the compiled node for the supplied key. | |
| 641 | * | |
| 642 | * @param key already-normalized key to resolve | |
| 643 | * @return compiled node, or {@code null} if the path does not exist | |
| 644 | */ | |
| 645 | private CompiledNode<V> findNode(final String key) { | |
| 646 | CompiledNode<V> current = this.root; | |
| 647 |
2
1. findNode : changed conditional boundary → KILLED 2. findNode : negated conditional → KILLED |
for (int traversalOffset = 0; traversalOffset < key.length(); traversalOffset++) { |
| 648 | current = current.findChild( | |
| 649 | key.charAt(this.metadata.traversalDirection().logicalIndex(key.length(), traversalOffset))); | |
| 650 |
1
1. findNode : negated conditional → KILLED |
if (current == null) { |
| 651 | return null; | |
| 652 | } | |
| 653 | } | |
| 654 |
1
1. findNode : replaced return value with null for org/egothor/stemmer/FrequencyTrie::findNode → KILLED |
return current; |
| 655 | } | |
| 656 | ||
| 657 | /** | |
| 658 | * Applies lookup-time case normalization according to persisted metadata. | |
| 659 | * | |
| 660 | * @param key lookup key | |
| 661 | * @return normalized key for trie traversal | |
| 662 | */ | |
| 663 | private String normalizeLookupKey(final String key) { | |
| 664 | String normalized = key; | |
| 665 | ||
| 666 |
1
1. normalizeLookupKey : negated conditional → KILLED |
if (this.metadata.caseProcessingMode() == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) { |
| 667 | normalized = normalized.toLowerCase(Locale.ROOT); | |
| 668 | } | |
| 669 | ||
| 670 |
1
1. normalizeLookupKey : negated conditional → KILLED |
if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.REMOVE) { |
| 671 | normalized = DiacriticStripper.strip(normalized); | |
| 672 |
1
1. normalizeLookupKey : negated conditional → KILLED |
} else if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) { |
| 673 | throw new UnsupportedOperationException( | |
| 674 | "Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet."); | |
| 675 | } | |
| 676 | ||
| 677 |
1
1. normalizeLookupKey : replaced return value with "" for org/egothor/stemmer/FrequencyTrie::normalizeLookupKey → KILLED |
return normalized; |
| 678 | } | |
| 679 | ||
| 680 | /** | |
| 681 | * Builder of {@link FrequencyTrie}. | |
| 682 | * | |
| 683 | * <p> | |
| 684 | * The builder is intentionally mutable and optimized for repeated | |
| 685 | * {@link #put(String, Object)} calls. The final trie is created by | |
| 686 | * {@link #build()}, which performs bottom-up subtree reduction and converts the | |
| 687 | * structure to a compact immutable representation optimized for read | |
| 688 | * operations. | |
| 689 | * | |
| 690 | * @param <V> value type | |
| 691 | */ | |
| 692 | public static final class Builder<V> { | |
| 693 | ||
| 694 | /** | |
| 695 | * Logger of this class. | |
| 696 | */ | |
| 697 | private static final Logger LOGGER = Logger.getLogger(Builder.class.getName()); | |
| 698 | ||
| 699 | /** | |
| 700 | * Factory used to create typed arrays. | |
| 701 | */ | |
| 702 | private final IntFunction<V[]> arrayFactory; | |
| 703 | ||
| 704 | /** | |
| 705 | * Reduction configuration. | |
| 706 | */ | |
| 707 | private final ReductionSettings reductionSettings; | |
| 708 | ||
| 709 | /** | |
| 710 | * Logical key traversal direction used by this builder. | |
| 711 | */ | |
| 712 | private final WordTraversalDirection traversalDirection; | |
| 713 | ||
| 714 | /** | |
| 715 | * Dictionary case processing mode associated with this builder. | |
| 716 | */ | |
| 717 | private final CaseProcessingMode caseProcessingMode; | |
| 718 | ||
| 719 | /** | |
| 720 | * Dictionary diacritic processing mode associated with this builder. | |
| 721 | */ | |
| 722 | private final DiacriticProcessingMode diacriticProcessingMode; | |
| 723 | ||
| 724 | /** | |
| 725 | * Mutable root node. | |
| 726 | */ | |
| 727 | private final MutableNode<V> root; | |
| 728 | ||
| 729 | /** | |
| 730 | * Creates a new builder with the provided settings. | |
| 731 | * | |
| 732 | * <p> | |
| 733 | * This constructor preserves the historical Egothor behavior and therefore | |
| 734 | * traverses logical keys from their end toward their beginning. | |
| 735 | * </p> | |
| 736 | * | |
| 737 | * @param arrayFactory array factory | |
| 738 | * @param reductionSettings reduction configuration | |
| 739 | * @throws NullPointerException if any argument is {@code null} | |
| 740 | */ | |
| 741 | public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings) { | |
| 742 | this(arrayFactory, reductionSettings, WordTraversalDirection.BACKWARD); | |
| 743 | } | |
| 744 | ||
| 745 | /** | |
| 746 | * Creates a new builder with the provided settings and explicit traversal | |
| 747 | * direction. | |
| 748 | * | |
| 749 | * @param arrayFactory array factory | |
| 750 | * @param reductionSettings reduction configuration | |
| 751 | * @param traversalDirection logical key traversal direction | |
| 752 | * @throws NullPointerException if any argument is {@code null} | |
| 753 | */ | |
| 754 | public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings, | |
| 755 | final WordTraversalDirection traversalDirection) { | |
| 756 | this(arrayFactory, reductionSettings, traversalDirection, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); | |
| 757 | } | |
| 758 | ||
| 759 | /** | |
| 760 | * Creates a new builder with the provided settings, explicit traversal | |
| 761 | * direction, and explicit case processing mode. | |
| 762 | * | |
| 763 | * @param arrayFactory array factory | |
| 764 | * @param reductionSettings reduction configuration | |
| 765 | * @param traversalDirection logical key traversal direction | |
| 766 | * @param caseProcessingMode dictionary case processing mode | |
| 767 | * @throws NullPointerException if any argument is {@code null} | |
| 768 | */ | |
| 769 | public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings, | |
| 770 | final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) { | |
| 771 | this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, | |
| 772 | DiacriticProcessingMode.AS_IS); | |
| 773 | } | |
| 774 | ||
| 775 | /** | |
| 776 | * Creates a new builder with the provided settings, explicit traversal | |
| 777 | * direction, explicit case processing mode, and explicit diacritic processing | |
| 778 | * mode. | |
| 779 | * | |
| 780 | * @param arrayFactory array factory | |
| 781 | * @param reductionSettings reduction configuration | |
| 782 | * @param traversalDirection logical key traversal direction | |
| 783 | * @param caseProcessingMode dictionary case processing mode | |
| 784 | * @param diacriticProcessingMode dictionary diacritic processing mode | |
| 785 | * @throws NullPointerException if any argument is {@code null} | |
| 786 | */ | |
| 787 | public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings, | |
| 788 | final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode, | |
| 789 | final DiacriticProcessingMode diacriticProcessingMode) { | |
| 790 | this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory"); | |
| 791 | this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings"); | |
| 792 | this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection"); | |
| 793 | this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode"); | |
| 794 | this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode"); | |
| 795 | this.root = new MutableNode<>(); | |
| 796 | } | |
| 797 | ||
| 798 | /** | |
| 799 | * Creates a new builder using default thresholds for the supplied reduction | |
| 800 | * mode. | |
| 801 | * | |
| 802 | * <p> | |
| 803 | * This constructor preserves the historical Egothor behavior and therefore | |
| 804 | * traverses logical keys from their end toward their beginning. | |
| 805 | * </p> | |
| 806 | * | |
| 807 | * @param arrayFactory array factory | |
| 808 | * @param reductionMode reduction mode | |
| 809 | * @throws NullPointerException if any argument is {@code null} | |
| 810 | */ | |
| 811 | public Builder(final IntFunction<V[]> arrayFactory, final ReductionMode reductionMode) { | |
| 812 | this(arrayFactory, ReductionSettings.withDefaults(reductionMode), WordTraversalDirection.BACKWARD); | |
| 813 | } | |
| 814 | ||
| 815 | /** | |
| 816 | * Creates a new builder using default thresholds for the supplied reduction | |
| 817 | * mode and explicit traversal direction. | |
| 818 | * | |
| 819 | * @param arrayFactory array factory | |
| 820 | * @param reductionMode reduction mode | |
| 821 | * @param traversalDirection logical key traversal direction | |
| 822 | * @throws NullPointerException if any argument is {@code null} | |
| 823 | */ | |
| 824 | public Builder(final IntFunction<V[]> arrayFactory, final ReductionMode reductionMode, | |
| 825 | final WordTraversalDirection traversalDirection) { | |
| 826 | this(arrayFactory, ReductionSettings.withDefaults(reductionMode), traversalDirection); | |
| 827 | } | |
| 828 | ||
| 829 | /** | |
| 830 | * Stores a value for the supplied key and increments its local frequency. | |
| 831 | * | |
| 832 | * <p> | |
| 833 | * Values are stored at the node addressed by the full key. Since trie values | |
| 834 | * may also appear on internal nodes, an empty key is valid and stores a value | |
| 835 | * directly at the root. | |
| 836 | * | |
| 837 | * @param key key | |
| 838 | * @param value value | |
| 839 | * @return this builder | |
| 840 | * @throws NullPointerException if {@code key} or {@code value} is {@code null} | |
| 841 | */ | |
| 842 | public Builder<V> put(final String key, final V value) { | |
| 843 |
1
1. put : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::put → SURVIVED |
return put(key, value, 1); |
| 844 | } | |
| 845 | ||
| 846 | /** | |
| 847 | * Builds a compiled read-only trie. | |
| 848 | * | |
| 849 | * @return compiled trie | |
| 850 | */ | |
| 851 | public FrequencyTrie<V> build() { | |
| 852 | if (LOGGER.isLoggable(Level.FINE)) { | |
| 853 | LOGGER.log(Level.FINE, "Starting trie compilation with reduction mode {0}.", | |
| 854 | this.reductionSettings.reductionMode()); | |
| 855 | } | |
| 856 | ||
| 857 | final ReductionContext<V> reductionContext = new ReductionContext<>(this.reductionSettings); | |
| 858 | final ReducedNode<V> reducedRoot = reduce(this.root, reductionContext); | |
| 859 | final CompiledNode<V> compiledRoot = freeze(reducedRoot, new IdentityHashMap<>()); | |
| 860 | ||
| 861 | if (LOGGER.isLoggable(Level.FINE)) { | |
| 862 | LOGGER.log(Level.FINE, "Trie compilation finished. Canonical node count: {0}.", | |
| 863 | reductionContext.canonicalNodeCount()); | |
| 864 | } | |
| 865 | ||
| 866 | final TrieMetadata metadata = TrieMetadata.forCompilation(this.traversalDirection, this.reductionSettings, | |
| 867 | this.diacriticProcessingMode, this.caseProcessingMode); | |
| 868 |
1
1. build : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::build → KILLED |
return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata); |
| 869 | } | |
| 870 | ||
| 871 | /** | |
| 872 | * Stores a value for the supplied key and increments its local frequency by the | |
| 873 | * specified positive count. | |
| 874 | * | |
| 875 | * <p> | |
| 876 | * Values are stored at the node addressed by the full key. Since trie values | |
| 877 | * may also appear on internal nodes, an empty key is valid and stores a value | |
| 878 | * directly at the root. | |
| 879 | * | |
| 880 | * <p> | |
| 881 | * This method is functionally equivalent to calling | |
| 882 | * {@link #put(String, Object)} repeatedly {@code count} times, but it avoids | |
| 883 | * unnecessary repeated map updates and is therefore preferable for bulk | |
| 884 | * reconstruction from compiled tries or other aggregated sources. | |
| 885 | * | |
| 886 | * @param key key | |
| 887 | * @param value value | |
| 888 | * @param count positive frequency increment | |
| 889 | * @return this builder | |
| 890 | * @throws NullPointerException if {@code key} or {@code value} is | |
| 891 | * {@code null} | |
| 892 | * @throws IllegalArgumentException if {@code count} is less than {@code 1} | |
| 893 | */ | |
| 894 | public Builder<V> put(final String key, final V value, final int count) { | |
| 895 | Objects.requireNonNull(key, "key"); | |
| 896 | Objects.requireNonNull(value, "value"); | |
| 897 | ||
| 898 |
2
1. put : changed conditional boundary → KILLED 2. put : negated conditional → KILLED |
if (count < 1) { // NOPMD |
| 899 | throw new IllegalArgumentException("count must be at least 1."); | |
| 900 | } | |
| 901 | ||
| 902 | final String normalizedKey = normalizeDictionaryKey(key); | |
| 903 | ||
| 904 | MutableNode<V> current = this.root; | |
| 905 |
2
1. put : changed conditional boundary → KILLED 2. put : negated conditional → KILLED |
for (int traversalOffset = 0; traversalOffset < normalizedKey.length(); traversalOffset++) { |
| 906 | final Character edge = normalizedKey | |
| 907 | .charAt(this.traversalDirection.logicalIndex(normalizedKey.length(), traversalOffset)); | |
| 908 | MutableNode<V> child = current.children().get(edge); | |
| 909 |
1
1. put : negated conditional → KILLED |
if (child == null) { |
| 910 | child = new MutableNode<>(); // NOPMD | |
| 911 | current.children().put(edge, child); | |
| 912 | } | |
| 913 | current = child; | |
| 914 | } | |
| 915 | ||
| 916 | final Integer previous = current.valueCounts().get(value); | |
| 917 |
1
1. put : negated conditional → KILLED |
if (previous == null) { |
| 918 | current.valueCounts().put(value, count); | |
| 919 | } else { | |
| 920 |
1
1. put : Replaced integer addition with subtraction → KILLED |
current.valueCounts().put(value, previous + count); |
| 921 | } | |
| 922 |
1
1. put : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::put → SURVIVED |
return this; |
| 923 | } | |
| 924 | ||
| 925 | /** | |
| 926 | * Applies build-time dictionary-key normalization according to the builder | |
| 927 | * configuration. | |
| 928 | * | |
| 929 | * @param key dictionary key | |
| 930 | * @return normalized key for trie insertion | |
| 931 | */ | |
| 932 | private String normalizeDictionaryKey(final String key) { | |
| 933 | String normalized = key; | |
| 934 | ||
| 935 |
1
1. normalizeDictionaryKey : negated conditional → KILLED |
if (this.caseProcessingMode == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) { |
| 936 | normalized = normalized.toLowerCase(Locale.ROOT); | |
| 937 | } | |
| 938 | ||
| 939 |
1
1. normalizeDictionaryKey : negated conditional → KILLED |
if (this.diacriticProcessingMode == DiacriticProcessingMode.REMOVE) { |
| 940 | normalized = DiacriticStripper.strip(normalized); | |
| 941 |
1
1. normalizeDictionaryKey : negated conditional → KILLED |
} else if (this.diacriticProcessingMode == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) { |
| 942 | throw new UnsupportedOperationException( | |
| 943 | "Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet."); | |
| 944 | } | |
| 945 | ||
| 946 |
1
1. normalizeDictionaryKey : replaced return value with "" for org/egothor/stemmer/FrequencyTrie$Builder::normalizeDictionaryKey → KILLED |
return normalized; |
| 947 | } | |
| 948 | ||
| 949 | /** | |
| 950 | * Returns the number of mutable build-time nodes currently reachable from the | |
| 951 | * builder root. | |
| 952 | * | |
| 953 | * <p> | |
| 954 | * This metric is intended mainly for diagnostics and tests that compare the | |
| 955 | * unreduced build-time structure with the final reduced compiled trie. | |
| 956 | * | |
| 957 | * @return number of mutable build-time nodes | |
| 958 | */ | |
| 959 | /* default */ int buildTimeSize() { | |
| 960 |
1
1. buildTimeSize : replaced int return with 0 for org/egothor/stemmer/FrequencyTrie$Builder::buildTimeSize → KILLED |
return countMutableNodes(this.root); |
| 961 | } | |
| 962 | ||
| 963 | /** | |
| 964 | * Returns the logical key traversal direction used by this builder. | |
| 965 | * | |
| 966 | * @return logical key traversal direction | |
| 967 | */ | |
| 968 | /* default */ WordTraversalDirection traversalDirection() { | |
| 969 |
1
1. traversalDirection : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::traversalDirection → NO_COVERAGE |
return this.traversalDirection; |
| 970 | } | |
| 971 | ||
| 972 | /** | |
| 973 | * Counts mutable nodes recursively. | |
| 974 | * | |
| 975 | * @param node current node | |
| 976 | * @return reachable mutable node count | |
| 977 | */ | |
| 978 | private int countMutableNodes(final MutableNode<V> node) { | |
| 979 | int count = 1; | |
| 980 | for (MutableNode<V> child : node.children().values()) { | |
| 981 |
1
1. countMutableNodes : Replaced integer addition with subtraction → KILLED |
count += countMutableNodes(child); |
| 982 | } | |
| 983 |
1
1. countMutableNodes : replaced int return with 0 for org/egothor/stemmer/FrequencyTrie$Builder::countMutableNodes → KILLED |
return count; |
| 984 | } | |
| 985 | ||
| 986 | /** | |
| 987 | * Reduces a mutable node to a canonical reduced node. | |
| 988 | * | |
| 989 | * @param source source mutable node | |
| 990 | * @param context reduction context | |
| 991 | * @return canonical reduced node | |
| 992 | */ | |
| 993 | private ReducedNode<V> reduce(final MutableNode<V> source, final ReductionContext<V> context) { | |
| 994 | final Map<Character, ReducedNode<V>> reducedChildren = new LinkedHashMap<>(); | |
| 995 | ||
| 996 | for (Map.Entry<Character, MutableNode<V>> childEntry : source.children().entrySet()) { | |
| 997 | final ReducedNode<V> reducedChild = reduce(childEntry.getValue(), context); | |
| 998 | reducedChildren.put(childEntry.getKey(), reducedChild); | |
| 999 | } | |
| 1000 | ||
| 1001 | final Map<V, Integer> localCounts = copyCounts(source.valueCounts()); | |
| 1002 | final LocalValueSummary<V> localSummary = LocalValueSummary.of(localCounts, this.arrayFactory); | |
| 1003 | final ReductionSignature<V> signature = ReductionSignature.create(localSummary, reducedChildren, | |
| 1004 | context.settings()); | |
| 1005 | ||
| 1006 | ReducedNode<V> canonical = context.lookup(signature); | |
| 1007 |
1
1. reduce : negated conditional → KILLED |
if (canonical == null) { |
| 1008 | canonical = new ReducedNode<>(signature, localCounts, reducedChildren); | |
| 1009 |
1
1. reduce : removed call to org/egothor/stemmer/trie/ReductionContext::register → KILLED |
context.register(signature, canonical); |
| 1010 |
1
1. reduce : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::reduce → KILLED |
return canonical; |
| 1011 | } | |
| 1012 | ||
| 1013 |
1
1. reduce : removed call to org/egothor/stemmer/trie/ReducedNode::mergeLocalCounts → KILLED |
canonical.mergeLocalCounts(localCounts); |
| 1014 |
1
1. reduce : removed call to org/egothor/stemmer/trie/ReducedNode::mergeChildren → SURVIVED |
canonical.mergeChildren(reducedChildren); |
| 1015 | ||
| 1016 |
1
1. reduce : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::reduce → KILLED |
return canonical; |
| 1017 | } | |
| 1018 | ||
| 1019 | /** | |
| 1020 | * Freezes a reduced node into an immutable compiled node. | |
| 1021 | * | |
| 1022 | * @param reducedNode reduced node | |
| 1023 | * @param cache already frozen nodes | |
| 1024 | * @return immutable compiled node | |
| 1025 | */ | |
| 1026 | private CompiledNode<V> freeze(final ReducedNode<V> reducedNode, | |
| 1027 | final Map<ReducedNode<V>, CompiledNode<V>> cache) { | |
| 1028 | final CompiledNode<V> existing = cache.get(reducedNode); | |
| 1029 |
1
1. freeze : negated conditional → KILLED |
if (existing != null) { |
| 1030 |
1
1. freeze : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::freeze → KILLED |
return existing; |
| 1031 | } | |
| 1032 | ||
| 1033 | final LocalValueSummary<V> localSummary = LocalValueSummary.of(reducedNode.localCounts(), | |
| 1034 | this.arrayFactory); | |
| 1035 | ||
| 1036 | final List<Map.Entry<Character, ReducedNode<V>>> childEntries = new ArrayList<>( | |
| 1037 | reducedNode.children().entrySet()); | |
| 1038 |
1
1. freeze : removed call to java/util/List::sort → KILLED |
childEntries.sort(Map.Entry.comparingByKey()); |
| 1039 | ||
| 1040 | final char[] edges = new char[childEntries.size()]; | |
| 1041 | @SuppressWarnings("unchecked") | |
| 1042 | final CompiledNode<V>[] childNodes = new CompiledNode[childEntries.size()]; | |
| 1043 | ||
| 1044 |
2
1. freeze : negated conditional → KILLED 2. freeze : changed conditional boundary → KILLED |
for (int index = 0; index < childEntries.size(); index++) { |
| 1045 | final Map.Entry<Character, ReducedNode<V>> entry = childEntries.get(index); | |
| 1046 | edges[index] = entry.getKey(); | |
| 1047 | childNodes[index] = freeze(entry.getValue(), cache); | |
| 1048 | } | |
| 1049 | ||
| 1050 | final CompiledNode<V> frozen = new CompiledNode<>(edges, childNodes, localSummary.orderedValues(), | |
| 1051 | localSummary.orderedCounts()); | |
| 1052 | cache.put(reducedNode, frozen); | |
| 1053 |
1
1. freeze : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::freeze → KILLED |
return frozen; |
| 1054 | } | |
| 1055 | ||
| 1056 | /** | |
| 1057 | * Creates a shallow frequency copy preserving deterministic insertion order of | |
| 1058 | * first occurrence. | |
| 1059 | * | |
| 1060 | * @param source source counts | |
| 1061 | * @return copied counts | |
| 1062 | */ | |
| 1063 | private Map<V, Integer> copyCounts(final Map<V, Integer> source) { | |
| 1064 |
1
1. copyCounts : replaced return value with Collections.emptyMap for org/egothor/stemmer/FrequencyTrie$Builder::copyCounts → KILLED |
return new LinkedHashMap<>(source); |
| 1065 | } | |
| 1066 | } | |
| 1067 | ||
| 1068 | /** | |
| 1069 | * Codec used to persist values stored in the trie. | |
| 1070 | * | |
| 1071 | * @param <V> value type | |
| 1072 | */ | |
| 1073 | public interface ValueStreamCodec<V> { | |
| 1074 | ||
| 1075 | /** | |
| 1076 | * Writes one value to the supplied data output. | |
| 1077 | * | |
| 1078 | * @param dataOutput target data output | |
| 1079 | * @param value value to write | |
| 1080 | * @throws IOException if writing fails | |
| 1081 | */ | |
| 1082 | void write(DataOutputStream dataOutput, V value) throws IOException; | |
| 1083 | ||
| 1084 | /** | |
| 1085 | * Reads one value from the supplied data input. | |
| 1086 | * | |
| 1087 | * @param dataInput source data input | |
| 1088 | * @return read value | |
| 1089 | * @throws IOException if reading fails | |
| 1090 | */ | |
| 1091 | V read(DataInputStream dataInput) throws IOException; | |
| 1092 | } | |
| 1093 | ||
| 1094 | } | |
Mutations | ||
| 135 |
1.1 |
|
| 175 |
1.1 2.2 |
|
| 178 |
1.1 |
|
| 207 |
1.1 2.2 |
|
| 208 |
1.1 |
|
| 210 |
1.1 |
|
| 238 |
1.1 2.2 |
|
| 243 |
1.1 2.2 |
|
| 246 |
1.1 |
|
| 260 |
1.1 |
|
| 269 |
1.1 |
|
| 278 |
1.1 |
|
| 303 |
1.1 |
|
| 311 |
1.1 |
|
| 317 |
1.1 |
|
| 318 |
1.1 |
|
| 319 |
1.1 |
|
| 320 |
1.1 |
|
| 321 |
1.1 |
|
| 324 |
1.1 |
|
| 327 |
1.1 |
|
| 352 |
1.1 |
|
| 359 |
1.1 |
|
| 364 |
1.1 2.2 3.3 4.4 |
|
| 369 |
1.1 2.2 |
|
| 374 |
1.1 2.2 3.3 4.4 |
|
| 387 |
1.1 |
|
| 399 |
1.1 |
|
| 412 |
1.1 2.2 |
|
| 414 |
1.1 |
|
| 421 |
1.1 2.2 |
|
| 424 |
1.1 2.2 3.3 4.4 |
|
| 432 |
1.1 2.2 |
|
| 433 |
1.1 |
|
| 438 |
1.1 2.2 3.3 4.4 |
|
| 447 |
1.1 2.2 3.3 4.4 |
|
| 452 |
1.1 2.2 |
|
| 455 |
1.1 2.2 3.3 4.4 |
|
| 463 |
1.1 |
|
| 482 |
1.1 |
|
| 483 |
1.1 |
|
| 496 |
1.1 |
|
| 505 |
1.1 |
|
| 520 |
1.1 |
|
| 521 |
1.1 2.2 |
|
| 522 |
1.1 |
|
| 524 |
1.1 |
|
| 527 |
1.1 |
|
| 530 |
1.1 |
|
| 531 |
1.1 2.2 |
|
| 532 |
1.1 |
|
| 533 |
1.1 |
|
| 553 |
1.1 2.2 |
|
| 555 |
1.1 2.2 |
|
| 562 |
1.1 2.2 |
|
| 567 |
1.1 |
|
| 570 |
1.1 2.2 |
|
| 577 |
1.1 2.2 |
|
| 580 |
1.1 2.2 |
|
| 592 |
1.1 2.2 |
|
| 600 |
1.1 2.2 |
|
| 604 |
1.1 2.2 |
|
| 606 |
1.1 2.2 3.3 4.4 |
|
| 614 |
1.1 |
|
| 631 |
1.1 2.2 |
|
| 632 |
1.1 2.2 3.3 |
|
| 633 |
1.1 |
|
| 647 |
1.1 2.2 |
|
| 650 |
1.1 |
|
| 654 |
1.1 |
|
| 666 |
1.1 |
|
| 670 |
1.1 |
|
| 672 |
1.1 |
|
| 677 |
1.1 |
|
| 843 |
1.1 |
|
| 868 |
1.1 |
|
| 898 |
1.1 2.2 |
|
| 905 |
1.1 2.2 |
|
| 909 |
1.1 |
|
| 917 |
1.1 |
|
| 920 |
1.1 |
|
| 922 |
1.1 |
|
| 935 |
1.1 |
|
| 939 |
1.1 |
|
| 941 |
1.1 |
|
| 946 |
1.1 |
|
| 960 |
1.1 |
|
| 969 |
1.1 |
|
| 981 |
1.1 |
|
| 983 |
1.1 |
|
| 1007 |
1.1 |
|
| 1009 |
1.1 |
|
| 1010 |
1.1 |
|
| 1013 |
1.1 |
|
| 1014 |
1.1 |
|
| 1016 |
1.1 |
|
| 1029 |
1.1 |
|
| 1030 |
1.1 |
|
| 1038 |
1.1 |
|
| 1044 |
1.1 2.2 |
|
| 1053 |
1.1 |
|
| 1064 |
1.1 |