| 1 | /******************************************************************************* | |
| 2 | * Copyright (C) 2026, Leo Galambos | |
| 3 | * All rights reserved. | |
| 4 | * | |
| 5 | * Redistribution and use in source and binary forms, with or without | |
| 6 | * modification, are permitted provided that the following conditions are met: | |
| 7 | * | |
| 8 | * 1. Redistributions of source code must retain the above copyright notice, | |
| 9 | * this list of conditions and the following disclaimer. | |
| 10 | * | |
| 11 | * 2. Redistributions in binary form must reproduce the above copyright notice, | |
| 12 | * this list of conditions and the following disclaimer in the documentation | |
| 13 | * and/or other materials provided with the distribution. | |
| 14 | * | |
| 15 | * 3. All advertising materials mentioning features or use of this software must | |
| 16 | * display the following acknowledgement: | |
| 17 | * This product includes software developed by the Egothor project. | |
| 18 | * | |
| 19 | * 4. Neither the name of the copyright holder nor the names of its contributors | |
| 20 | * may be used to endorse or promote products derived from this software | |
| 21 | * without specific prior written permission. | |
| 22 | * | |
| 23 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
| 24 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 25 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 26 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | |
| 27 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
| 28 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
| 29 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
| 30 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
| 31 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
| 32 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
| 33 | * POSSIBILITY OF SUCH DAMAGE. | |
| 34 | ******************************************************************************/ | |
| 35 | package org.egothor.stemmer; | |
| 36 | ||
| 37 | import java.io.DataInputStream; | |
| 38 | import java.io.DataOutputStream; | |
| 39 | import java.io.IOException; | |
| 40 | import java.io.InputStream; | |
| 41 | import java.io.OutputStream; | |
| 42 | import java.util.ArrayList; | |
| 43 | import java.util.Arrays; | |
| 44 | import java.util.Collections; | |
| 45 | import java.util.IdentityHashMap; | |
| 46 | import java.util.LinkedHashMap; | |
| 47 | import java.util.List; | |
| 48 | import java.util.Map; | |
| 49 | import java.util.Objects; | |
| 50 | import java.util.function.IntFunction; | |
| 51 | import java.util.logging.Level; | |
| 52 | import java.util.logging.Logger; | |
| 53 | ||
| 54 | import org.egothor.stemmer.trie.CompiledNode; | |
| 55 | import org.egothor.stemmer.trie.LocalValueSummary; | |
| 56 | import org.egothor.stemmer.trie.MutableNode; | |
| 57 | import org.egothor.stemmer.trie.NodeData; | |
| 58 | import org.egothor.stemmer.trie.ReducedNode; | |
| 59 | import org.egothor.stemmer.trie.ReductionContext; | |
| 60 | import org.egothor.stemmer.trie.ReductionSignature; | |
| 61 | ||
| 62 | /** | |
| 63 | * Read-only trie mapping {@link String} keys to one or more values with | |
| 64 | * frequency tracking. | |
| 65 | * | |
| 66 | * <p> | |
| 67 | * A key may be associated with multiple values. Each value keeps the number of | |
| 68 | * times it was inserted during the build phase. The method {@link #get(String)} | |
| 69 | * returns the locally most frequent value stored at the terminal node of the | |
| 70 | * supplied key, while {@link #getAll(String)} returns all locally stored values | |
| 71 | * ordered by descending frequency. | |
| 72 | * | |
| 73 | * <p> | |
| 74 | * If multiple values have the same local frequency, their ordering is | |
| 75 | * deterministic. The preferred value is selected by the following tie-breaking | |
| 76 | * rules, in order: | |
| 77 | * <ol> | |
| 78 | * <li>shorter {@link String} representation wins, based on | |
| 79 | * {@code value.toString()}</li> | |
| 80 | * <li>if the lengths are equal, lexicographically lower {@link String} | |
| 81 | * representation wins</li> | |
| 82 | * <li>if the textual representations are still equal, first-seen insertion | |
| 83 | * order remains stable</li> | |
| 84 | * </ol> | |
| 85 | * | |
| 86 | * <p> | |
| 87 | * Values may be stored at any trie node, including internal nodes and leaf | |
| 88 | * nodes. Therefore, reduction and canonicalization always operate on both the | |
| 89 | * node-local terminal values and the structure of all descendant edges. | |
| 90 | * | |
| 91 | * @param <V> value type | |
| 92 | */ | |
| 93 | public final class FrequencyTrie<V> { | |
| 94 | ||
| 95 | /** | |
| 96 | * Logger of this class. | |
| 97 | */ | |
| 98 | private static final Logger LOGGER = Logger.getLogger(FrequencyTrie.class.getName()); | |
| 99 | ||
| 100 | /** | |
| 101 | * Binary format magic header. | |
| 102 | */ | |
| 103 | private static final int STREAM_MAGIC = 0x45475452; | |
| 104 | ||
| 105 | /** | |
| 106 | * Binary format version. | |
| 107 | */ | |
| 108 | private static final int STREAM_VERSION = 1; | |
| 109 | ||
| 110 | /** | |
| 111 | * Factory used to create correctly typed arrays for {@link #getAll(String)}. | |
| 112 | */ | |
| 113 | private final IntFunction<V[]> arrayFactory; | |
| 114 | ||
| 115 | /** | |
| 116 | * Root node of the compiled read-only trie. | |
| 117 | */ | |
| 118 | private final CompiledNode<V> root; | |
| 119 | ||
| 120 | /** | |
| 121 | * Creates a new compiled trie instance. | |
| 122 | * | |
| 123 | * @param arrayFactory array factory | |
| 124 | * @param root compiled root node | |
| 125 | * @throws NullPointerException if any argument is {@code null} | |
| 126 | */ | |
| 127 | private FrequencyTrie(final IntFunction<V[]> arrayFactory, final CompiledNode<V> root) { | |
| 128 | this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory"); | |
| 129 | this.root = Objects.requireNonNull(root, "root"); | |
| 130 | } | |
| 131 | ||
| 132 | /** | |
| 133 | * Returns the most frequent value stored at the node addressed by the supplied | |
| 134 | * key. | |
| 135 | * | |
| 136 | * <p> | |
| 137 | * If multiple values have the same local frequency, the returned value is | |
| 138 | * selected deterministically by shorter {@code toString()} value first, then by | |
| 139 | * lexicographically lower {@code toString()}, and finally by stable first-seen | |
| 140 | * order. | |
| 141 | * | |
| 142 | * @param key key to resolve | |
| 143 | * @return most frequent value, or {@code null} if the key does not exist or no | |
| 144 | * value is stored at the addressed node | |
| 145 | * @throws NullPointerException if {@code key} is {@code null} | |
| 146 | */ | |
| 147 | public V get(final String key) { | |
| 148 | Objects.requireNonNull(key, "key"); | |
| 149 | final CompiledNode<V> node = findNode(key); | |
| 150 |
2
1. get : negated conditional → KILLED 2. get : negated conditional → KILLED |
if (node == null || node.orderedValues().length == 0) { |
| 151 | return null; | |
| 152 | } | |
| 153 |
1
1. get : replaced return value with null for org/egothor/stemmer/FrequencyTrie::get → KILLED |
return node.orderedValues()[0]; |
| 154 | } | |
| 155 | ||
| 156 | /** | |
| 157 | * Returns all values stored at the node addressed by the supplied key, ordered | |
| 158 | * by descending frequency. | |
| 159 | * | |
| 160 | * <p> | |
| 161 | * If multiple values have the same local frequency, the ordering is | |
| 162 | * deterministic by shorter {@code toString()} value first, then by | |
| 163 | * lexicographically lower {@code toString()}, and finally by stable first-seen | |
| 164 | * order. | |
| 165 | * | |
| 166 | * <p> | |
| 167 | * The returned array is a defensive copy. | |
| 168 | * | |
| 169 | * @param key key to resolve | |
| 170 | * @return all values stored at the addressed node, ordered by descending | |
| 171 | * frequency; returns an empty array if the key does not exist or no | |
| 172 | * value is stored at the addressed node | |
| 173 | * @throws NullPointerException if {@code key} is {@code null} | |
| 174 | */ | |
| 175 | public V[] getAll(final String key) { | |
| 176 | Objects.requireNonNull(key, "key"); | |
| 177 | final CompiledNode<V> node = findNode(key); | |
| 178 |
2
1. getAll : negated conditional → KILLED 2. getAll : negated conditional → KILLED |
if (node == null || node.orderedValues().length == 0) { |
| 179 |
1
1. getAll : replaced return value with null for org/egothor/stemmer/FrequencyTrie::getAll → KILLED |
return this.arrayFactory.apply(0); |
| 180 | } | |
| 181 |
1
1. getAll : replaced return value with null for org/egothor/stemmer/FrequencyTrie::getAll → KILLED |
return Arrays.copyOf(node.orderedValues(), node.orderedValues().length); |
| 182 | } | |
| 183 | ||
| 184 | /** | |
| 185 | * Returns all values stored at the node addressed by the supplied key together | |
| 186 | * with their occurrence counts, ordered by the same rules as | |
| 187 | * {@link #getAll(String)}. | |
| 188 | * | |
| 189 | * <p> | |
| 190 | * The returned list is aligned with the arrays returned by | |
| 191 | * {@link #getAll(String)} and the internal compiled count representation. | |
| 192 | * | |
| 193 | * <p> | |
| 194 | * The returned list is immutable. | |
| 195 | * | |
| 196 | * <p> | |
| 197 | * In reduction modes that merge semantically equivalent subtrees, the returned | |
| 198 | * counts may be aggregated across multiple original build-time nodes that were | |
| 199 | * reduced into the same canonical compiled node. | |
| 200 | * | |
| 201 | * @param key key to resolve | |
| 202 | * @return immutable ordered list of value-count entries; returns an empty list | |
| 203 | * if the key does not exist or no value is stored at the addressed node | |
| 204 | * @throws NullPointerException if {@code key} is {@code null} | |
| 205 | */ | |
| 206 | public List<ValueCount<V>> getEntries(final String key) { | |
| 207 | Objects.requireNonNull(key, "key"); | |
| 208 | final CompiledNode<V> node = findNode(key); | |
| 209 |
2
1. getEntries : negated conditional → KILLED 2. getEntries : negated conditional → KILLED |
if (node == null || node.orderedValues().length == 0) { |
| 210 | return List.of(); | |
| 211 | } | |
| 212 | ||
| 213 | final List<ValueCount<V>> entries = new ArrayList<>(node.orderedValues().length); | |
| 214 |
2
1. getEntries : changed conditional boundary → KILLED 2. getEntries : negated conditional → KILLED |
for (int index = 0; index < node.orderedValues().length; index++) { |
| 215 | entries.add(new ValueCount<>(node.orderedValues()[index], node.orderedCounts()[index])); | |
| 216 | } | |
| 217 |
1
1. getEntries : replaced return value with Collections.emptyList for org/egothor/stemmer/FrequencyTrie::getEntries → KILLED |
return Collections.unmodifiableList(entries); |
| 218 | } | |
| 219 | ||
| 220 | /** | |
| 221 | * Returns the root node mainly for diagnostics and tests within the package. | |
| 222 | * | |
| 223 | * @return compiled root node | |
| 224 | */ | |
| 225 | /* default */ CompiledNode<V> root() { | |
| 226 |
1
1. root : replaced return value with null for org/egothor/stemmer/FrequencyTrie::root → KILLED |
return this.root; |
| 227 | } | |
| 228 | ||
| 229 | /** | |
| 230 | * Writes this compiled trie to the supplied output stream. | |
| 231 | * | |
| 232 | * <p> | |
| 233 | * The binary format is versioned and preserves canonical shared compiled nodes, | |
| 234 | * therefore the serialized representation remains compact even for tries | |
| 235 | * reduced by subtree merging. | |
| 236 | * | |
| 237 | * <p> | |
| 238 | * The supplied codec is responsible for persisting individual values of type | |
| 239 | * {@code V}. | |
| 240 | * | |
| 241 | * @param outputStream target output stream | |
| 242 | * @param valueCodec codec used to write values | |
| 243 | * @throws NullPointerException if any argument is {@code null} | |
| 244 | * @throws IOException if writing fails | |
| 245 | */ | |
| 246 | public void writeTo(final OutputStream outputStream, final ValueStreamCodec<V> valueCodec) throws IOException { | |
| 247 | Objects.requireNonNull(outputStream, "outputStream"); | |
| 248 | Objects.requireNonNull(valueCodec, "valueCodec"); | |
| 249 | ||
| 250 | final DataOutputStream dataOutput; // NOPMD | |
| 251 |
1
1. writeTo : negated conditional → KILLED |
if (outputStream instanceof DataOutputStream) { |
| 252 | dataOutput = (DataOutputStream) outputStream; | |
| 253 | } else { | |
| 254 | dataOutput = new DataOutputStream(outputStream); | |
| 255 | } | |
| 256 | ||
| 257 | final Map<CompiledNode<V>, Integer> nodeIds = new IdentityHashMap<>(); | |
| 258 | final List<CompiledNode<V>> orderedNodes = new ArrayList<>(); | |
| 259 |
1
1. writeTo : removed call to org/egothor/stemmer/FrequencyTrie::assignNodeIds → KILLED |
assignNodeIds(this.root, nodeIds, orderedNodes); |
| 260 | ||
| 261 | if (LOGGER.isLoggable(Level.FINE)) { | |
| 262 | LOGGER.log(Level.FINE, "Writing compiled trie with {0} canonical nodes.", orderedNodes.size()); | |
| 263 | } | |
| 264 | ||
| 265 |
1
1. writeTo : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(STREAM_MAGIC); |
| 266 |
1
1. writeTo : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(STREAM_VERSION); |
| 267 |
1
1. writeTo : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(orderedNodes.size()); |
| 268 |
1
1. writeTo : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(nodeIds.get(this.root)); |
| 269 | ||
| 270 | for (CompiledNode<V> node : orderedNodes) { | |
| 271 |
1
1. writeTo : removed call to org/egothor/stemmer/FrequencyTrie::writeNode → KILLED |
writeNode(dataOutput, valueCodec, node, nodeIds); |
| 272 | } | |
| 273 | ||
| 274 |
1
1. writeTo : removed call to java/io/DataOutputStream::flush → SURVIVED |
dataOutput.flush(); |
| 275 | } | |
| 276 | ||
| 277 | /** | |
| 278 | * Reads a compiled trie from the supplied input stream. | |
| 279 | * | |
| 280 | * <p> | |
| 281 | * The caller must provide the same value codec semantics that were used during | |
| 282 | * persistence as well as the array factory required for typed result arrays. | |
| 283 | * | |
| 284 | * @param inputStream source input stream | |
| 285 | * @param arrayFactory factory used to create typed arrays | |
| 286 | * @param valueCodec codec used to read values | |
| 287 | * @param <V> value type | |
| 288 | * @return deserialized compiled trie | |
| 289 | * @throws NullPointerException if any argument is {@code null} | |
| 290 | * @throws IOException if reading fails or the binary format is invalid | |
| 291 | */ | |
| 292 | public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory, | |
| 293 | final ValueStreamCodec<V> valueCodec) throws IOException { | |
| 294 | Objects.requireNonNull(inputStream, "inputStream"); | |
| 295 | Objects.requireNonNull(arrayFactory, "arrayFactory"); | |
| 296 | Objects.requireNonNull(valueCodec, "valueCodec"); | |
| 297 | ||
| 298 | final DataInputStream dataInput; // NOPMD | |
| 299 |
1
1. readFrom : negated conditional → KILLED |
if (inputStream instanceof DataInputStream) { |
| 300 | dataInput = (DataInputStream) inputStream; | |
| 301 | } else { | |
| 302 | dataInput = new DataInputStream(inputStream); | |
| 303 | } | |
| 304 | ||
| 305 | final int magic = dataInput.readInt(); | |
| 306 |
1
1. readFrom : negated conditional → KILLED |
if (magic != STREAM_MAGIC) { |
| 307 | throw new IOException("Unsupported trie stream header: " + Integer.toHexString(magic)); | |
| 308 | } | |
| 309 | ||
| 310 | final int version = dataInput.readInt(); | |
| 311 |
1
1. readFrom : negated conditional → KILLED |
if (version != STREAM_VERSION) { |
| 312 | throw new IOException("Unsupported trie stream version: " + version); | |
| 313 | } | |
| 314 | ||
| 315 | final int nodeCount = dataInput.readInt(); | |
| 316 |
2
1. readFrom : changed conditional boundary → SURVIVED 2. readFrom : negated conditional → KILLED |
if (nodeCount < 0) { |
| 317 | throw new IOException("Negative node count: " + nodeCount); | |
| 318 | } | |
| 319 | ||
| 320 | final int rootNodeId = dataInput.readInt(); | |
| 321 |
4
1. readFrom : changed conditional boundary → KILLED 2. readFrom : negated conditional → KILLED 3. readFrom : negated conditional → KILLED 4. readFrom : changed conditional boundary → KILLED |
if (rootNodeId < 0 || rootNodeId >= nodeCount) { |
| 322 | throw new IOException("Invalid root node id: " + rootNodeId); | |
| 323 | } | |
| 324 | ||
| 325 | final CompiledNode<V>[] nodes = readNodes(dataInput, arrayFactory, valueCodec, nodeCount); | |
| 326 | final CompiledNode<V> rootNode = nodes[rootNodeId]; | |
| 327 | ||
| 328 | if (LOGGER.isLoggable(Level.FINE)) { | |
| 329 | LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount); | |
| 330 | } | |
| 331 | ||
| 332 |
1
1. readFrom : replaced return value with null for org/egothor/stemmer/FrequencyTrie::readFrom → KILLED |
return new FrequencyTrie<>(arrayFactory, rootNode); |
| 333 | } | |
| 334 | ||
| 335 | /** | |
| 336 | * Returns the number of canonical compiled nodes reachable from the root. | |
| 337 | * | |
| 338 | * <p> | |
| 339 | * The returned value reflects the size of the final reduced immutable trie, not | |
| 340 | * the number of mutable build-time nodes inserted before reduction. Shared | |
| 341 | * canonical subtrees are counted only once. | |
| 342 | * | |
| 343 | * @return number of canonical compiled nodes in this trie | |
| 344 | */ | |
| 345 | public int size() { | |
| 346 | final Map<CompiledNode<V>, Integer> nodeIds = new IdentityHashMap<>(); | |
| 347 | final List<CompiledNode<V>> orderedNodes = new ArrayList<>(); | |
| 348 |
1
1. size : removed call to org/egothor/stemmer/FrequencyTrie::assignNodeIds → KILLED |
assignNodeIds(this.root, nodeIds, orderedNodes); |
| 349 |
1
1. size : replaced int return with 0 for org/egothor/stemmer/FrequencyTrie::size → KILLED |
return orderedNodes.size(); |
| 350 | } | |
| 351 | ||
| 352 | /** | |
| 353 | * Assigns deterministic identifiers to all canonical compiled nodes reachable | |
| 354 | * from the supplied root. | |
| 355 | * | |
| 356 | * @param node current node | |
| 357 | * @param nodeIds assigned node identifiers | |
| 358 | * @param orderedNodes ordered nodes in identifier order | |
| 359 | */ | |
| 360 | private static <V> void assignNodeIds(final CompiledNode<V> node, final Map<CompiledNode<V>, Integer> nodeIds, | |
| 361 | final List<CompiledNode<V>> orderedNodes) { | |
| 362 |
1
1. assignNodeIds : negated conditional → KILLED |
if (nodeIds.containsKey(node)) { |
| 363 | return; | |
| 364 | } | |
| 365 | ||
| 366 | final int nodeId = orderedNodes.size(); | |
| 367 | nodeIds.put(node, nodeId); | |
| 368 | orderedNodes.add(node); | |
| 369 | ||
| 370 | for (CompiledNode<V> child : node.children()) { | |
| 371 |
1
1. assignNodeIds : removed call to org/egothor/stemmer/FrequencyTrie::assignNodeIds → KILLED |
assignNodeIds(child, nodeIds, orderedNodes); |
| 372 | } | |
| 373 | } | |
| 374 | ||
| 375 | /** | |
| 376 | * Writes one compiled node. | |
| 377 | * | |
| 378 | * @param dataOutput output | |
| 379 | * @param valueCodec value codec | |
| 380 | * @param node node to write | |
| 381 | * @param nodeIds node identifiers | |
| 382 | * @throws IOException if writing fails | |
| 383 | */ | |
| 384 | private static <V> void writeNode(final DataOutputStream dataOutput, final ValueStreamCodec<V> valueCodec, | |
| 385 | final CompiledNode<V> node, final Map<CompiledNode<V>, Integer> nodeIds) throws IOException { | |
| 386 |
1
1. writeNode : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(node.edgeLabels().length); |
| 387 |
2
1. writeNode : changed conditional boundary → KILLED 2. writeNode : negated conditional → KILLED |
for (int index = 0; index < node.edgeLabels().length; index++) { |
| 388 |
1
1. writeNode : removed call to java/io/DataOutputStream::writeChar → KILLED |
dataOutput.writeChar(node.edgeLabels()[index]); |
| 389 | final Integer childNodeId = nodeIds.get(node.children()[index]); | |
| 390 |
1
1. writeNode : negated conditional → KILLED |
if (childNodeId == null) { |
| 391 | throw new IOException("Missing child node identifier during serialization."); | |
| 392 | } | |
| 393 |
1
1. writeNode : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(childNodeId); |
| 394 | } | |
| 395 | ||
| 396 |
1
1. writeNode : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(node.orderedValues().length); |
| 397 |
2
1. writeNode : changed conditional boundary → KILLED 2. writeNode : negated conditional → KILLED |
for (int index = 0; index < node.orderedValues().length; index++) { |
| 398 |
1
1. writeNode : removed call to org/egothor/stemmer/FrequencyTrie$ValueStreamCodec::write → KILLED |
valueCodec.write(dataOutput, node.orderedValues()[index]); |
| 399 |
1
1. writeNode : removed call to java/io/DataOutputStream::writeInt → KILLED |
dataOutput.writeInt(node.orderedCounts()[index]); |
| 400 | } | |
| 401 | } | |
| 402 | ||
| 403 | /** | |
| 404 | * Reads all compiled nodes and resolves child references. | |
| 405 | * | |
| 406 | * @param dataInput input | |
| 407 | * @param arrayFactory array factory | |
| 408 | * @param valueCodec value codec | |
| 409 | * @param nodeCount number of nodes | |
| 410 | * @param <V> value type | |
| 411 | * @return array of nodes indexed by serialized node identifier | |
| 412 | * @throws IOException if reading fails or the stream is invalid | |
| 413 | */ | |
| 414 | @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") | |
| 415 | private static <V> CompiledNode<V>[] readNodes(final DataInputStream dataInput, final IntFunction<V[]> arrayFactory, | |
| 416 | final ValueStreamCodec<V> valueCodec, final int nodeCount) throws IOException { | |
| 417 | final List<NodeData<V>> nodeDataList = new ArrayList<>(nodeCount); | |
| 418 | ||
| 419 |
2
1. readNodes : changed conditional boundary → KILLED 2. readNodes : negated conditional → KILLED |
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) { |
| 420 | final int edgeCount = dataInput.readInt(); | |
| 421 |
2
1. readNodes : changed conditional boundary → KILLED 2. readNodes : negated conditional → KILLED |
if (edgeCount < 0) { |
| 422 | throw new IOException("Negative edge count at node " + nodeIndex + ": " + edgeCount); | |
| 423 | } | |
| 424 | ||
| 425 | final char[] edgeLabels = new char[edgeCount]; | |
| 426 | final int[] childNodeIds = new int[edgeCount]; | |
| 427 | ||
| 428 |
2
1. readNodes : changed conditional boundary → KILLED 2. readNodes : negated conditional → KILLED |
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) { |
| 429 | edgeLabels[edgeIndex] = dataInput.readChar(); | |
| 430 | childNodeIds[edgeIndex] = dataInput.readInt(); | |
| 431 | } | |
| 432 | ||
| 433 | final int valueCount = dataInput.readInt(); | |
| 434 |
2
1. readNodes : changed conditional boundary → KILLED 2. readNodes : negated conditional → KILLED |
if (valueCount < 0) { |
| 435 | throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount); | |
| 436 | } | |
| 437 | ||
| 438 | final V[] orderedValues = arrayFactory.apply(valueCount); | |
| 439 | final int[] orderedCounts = new int[valueCount]; | |
| 440 | ||
| 441 |
2
1. readNodes : negated conditional → KILLED 2. readNodes : changed conditional boundary → KILLED |
for (int valueIndex = 0; valueIndex < valueCount; valueIndex++) { |
| 442 | orderedValues[valueIndex] = valueCodec.read(dataInput); | |
| 443 | orderedCounts[valueIndex] = dataInput.readInt(); | |
| 444 |
2
1. readNodes : negated conditional → KILLED 2. readNodes : changed conditional boundary → KILLED |
if (orderedCounts[valueIndex] <= 0) { |
| 445 | throw new IOException("Non-positive stored count at node " + nodeIndex + ", value index " | |
| 446 | + valueIndex + ": " + orderedCounts[valueIndex]); | |
| 447 | } | |
| 448 | } | |
| 449 | ||
| 450 | nodeDataList.add(new NodeData<>(edgeLabels, childNodeIds, orderedValues, orderedCounts)); | |
| 451 | } | |
| 452 | ||
| 453 | @SuppressWarnings("unchecked") | |
| 454 | final CompiledNode<V>[] nodes = new CompiledNode[nodeCount]; | |
| 455 | ||
| 456 |
2
1. readNodes : changed conditional boundary → KILLED 2. readNodes : negated conditional → KILLED |
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) { |
| 457 | final NodeData<V> nodeData = nodeDataList.get(nodeIndex); | |
| 458 | @SuppressWarnings("unchecked") | |
| 459 | final CompiledNode<V>[] children = new CompiledNode[nodeData.childNodeIds().length]; | |
| 460 | nodes[nodeIndex] = new CompiledNode<>(nodeData.edgeLabels(), children, nodeData.orderedValues(), | |
| 461 | nodeData.orderedCounts()); | |
| 462 | } | |
| 463 | ||
| 464 |
2
1. readNodes : changed conditional boundary → KILLED 2. readNodes : negated conditional → KILLED |
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) { |
| 465 | final NodeData<V> nodeData = nodeDataList.get(nodeIndex); | |
| 466 | final CompiledNode<V> node = nodes[nodeIndex]; | |
| 467 | ||
| 468 |
2
1. readNodes : changed conditional boundary → KILLED 2. readNodes : negated conditional → KILLED |
for (int edgeIndex = 0; edgeIndex < nodeData.childNodeIds().length; edgeIndex++) { |
| 469 | final int childNodeId = nodeData.childNodeIds()[edgeIndex]; | |
| 470 |
4
1. readNodes : changed conditional boundary → SURVIVED 2. readNodes : changed conditional boundary → SURVIVED 3. readNodes : negated conditional → KILLED 4. readNodes : negated conditional → KILLED |
if (childNodeId < 0 || childNodeId >= nodeCount) { |
| 471 | throw new IOException("Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex | |
| 472 | + ": " + childNodeId); | |
| 473 | } | |
| 474 | node.children()[edgeIndex] = nodes[childNodeId]; | |
| 475 | } | |
| 476 | } | |
| 477 | ||
| 478 |
1
1. readNodes : replaced return value with null for org/egothor/stemmer/FrequencyTrie::readNodes → KILLED |
return nodes; |
| 479 | } | |
| 480 | ||
| 481 | /** | |
| 482 | * Locates the compiled node for the supplied key. | |
| 483 | * | |
| 484 | * @param key key to resolve | |
| 485 | * @return compiled node, or {@code null} if the path does not exist | |
| 486 | */ | |
| 487 | private CompiledNode<V> findNode(final String key) { | |
| 488 | CompiledNode<V> current = this.root; | |
| 489 |
2
1. findNode : changed conditional boundary → KILLED 2. findNode : negated conditional → KILLED |
for (int index = 0; index < key.length(); index++) { |
| 490 | current = current.findChild(key.charAt(index)); | |
| 491 |
1
1. findNode : negated conditional → KILLED |
if (current == null) { |
| 492 | return null; | |
| 493 | } | |
| 494 | } | |
| 495 |
1
1. findNode : replaced return value with null for org/egothor/stemmer/FrequencyTrie::findNode → KILLED |
return current; |
| 496 | } | |
| 497 | ||
| 498 | /** | |
| 499 | * Builder of {@link FrequencyTrie}. | |
| 500 | * | |
| 501 | * <p> | |
| 502 | * The builder is intentionally mutable and optimized for repeated | |
| 503 | * {@link #put(String, Object)} calls. The final trie is created by | |
| 504 | * {@link #build()}, which performs bottom-up subtree reduction and converts the | |
| 505 | * structure to a compact immutable representation optimized for read | |
| 506 | * operations. | |
| 507 | * | |
| 508 | * @param <V> value type | |
| 509 | */ | |
| 510 | public static final class Builder<V> { | |
| 511 | ||
| 512 | /** | |
| 513 | * Logger of this class. | |
| 514 | */ | |
| 515 | private static final Logger LOGGER = Logger.getLogger(Builder.class.getName()); | |
| 516 | ||
| 517 | /** | |
| 518 | * Factory used to create typed arrays. | |
| 519 | */ | |
| 520 | private final IntFunction<V[]> arrayFactory; | |
| 521 | ||
| 522 | /** | |
| 523 | * Reduction configuration. | |
| 524 | */ | |
| 525 | private final ReductionSettings reductionSettings; | |
| 526 | ||
| 527 | /** | |
| 528 | * Mutable root node. | |
| 529 | */ | |
| 530 | private final MutableNode<V> root; | |
| 531 | ||
| 532 | /** | |
| 533 | * Creates a new builder with the provided settings. | |
| 534 | * | |
| 535 | * @param arrayFactory array factory | |
| 536 | * @param reductionSettings reduction configuration | |
| 537 | * @throws NullPointerException if any argument is {@code null} | |
| 538 | */ | |
| 539 | public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings) { | |
| 540 | this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory"); | |
| 541 | this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings"); | |
| 542 | this.root = new MutableNode<>(); | |
| 543 | } | |
| 544 | ||
| 545 | /** | |
| 546 | * Creates a new builder using default thresholds for the supplied reduction | |
| 547 | * mode. | |
| 548 | * | |
| 549 | * @param arrayFactory array factory | |
| 550 | * @param reductionMode reduction mode | |
| 551 | * @throws NullPointerException if any argument is {@code null} | |
| 552 | */ | |
| 553 | public Builder(final IntFunction<V[]> arrayFactory, final ReductionMode reductionMode) { | |
| 554 | this(arrayFactory, ReductionSettings.withDefaults(reductionMode)); | |
| 555 | } | |
| 556 | ||
| 557 | /** | |
| 558 | * Stores a value for the supplied key and increments its local frequency. | |
| 559 | * | |
| 560 | * <p> | |
| 561 | * Values are stored at the node addressed by the full key. Since trie values | |
| 562 | * may also appear on internal nodes, an empty key is valid and stores a value | |
| 563 | * directly at the root. | |
| 564 | * | |
| 565 | * @param key key | |
| 566 | * @param value value | |
| 567 | * @return this builder | |
| 568 | * @throws NullPointerException if {@code key} or {@code value} is {@code null} | |
| 569 | */ | |
| 570 | public Builder<V> put(final String key, final V value) { | |
| 571 |
1
1. put : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::put → KILLED |
return put(key, value, 1); |
| 572 | } | |
| 573 | ||
| 574 | /** | |
| 575 | * Builds a compiled read-only trie. | |
| 576 | * | |
| 577 | * @return compiled trie | |
| 578 | */ | |
| 579 | public FrequencyTrie<V> build() { | |
| 580 | if (LOGGER.isLoggable(Level.FINE)) { | |
| 581 | LOGGER.log(Level.FINE, "Starting trie compilation with reduction mode {0}.", | |
| 582 | this.reductionSettings.reductionMode()); | |
| 583 | } | |
| 584 | ||
| 585 | final ReductionContext<V> reductionContext = new ReductionContext<>(this.reductionSettings); | |
| 586 | final ReducedNode<V> reducedRoot = reduce(this.root, reductionContext); | |
| 587 | final CompiledNode<V> compiledRoot = freeze(reducedRoot, new IdentityHashMap<>()); | |
| 588 | ||
| 589 | if (LOGGER.isLoggable(Level.FINE)) { | |
| 590 | LOGGER.log(Level.FINE, "Trie compilation finished. Canonical node count: {0}.", | |
| 591 | reductionContext.canonicalNodeCount()); | |
| 592 | } | |
| 593 | ||
| 594 |
1
1. build : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::build → KILLED |
return new FrequencyTrie<>(this.arrayFactory, compiledRoot); |
| 595 | } | |
| 596 | ||
| 597 | /** | |
| 598 | * Stores a value for the supplied key and increments its local frequency by the | |
| 599 | * specified positive count. | |
| 600 | * | |
| 601 | * <p> | |
| 602 | * Values are stored at the node addressed by the full key. Since trie values | |
| 603 | * may also appear on internal nodes, an empty key is valid and stores a value | |
| 604 | * directly at the root. | |
| 605 | * | |
| 606 | * <p> | |
| 607 | * This method is functionally equivalent to calling | |
| 608 | * {@link #put(String, Object)} repeatedly {@code count} times, but it avoids | |
| 609 | * unnecessary repeated map updates and is therefore preferable for bulk | |
| 610 | * reconstruction from compiled tries or other aggregated sources. | |
| 611 | * | |
| 612 | * @param key key | |
| 613 | * @param value value | |
| 614 | * @param count positive frequency increment | |
| 615 | * @return this builder | |
| 616 | * @throws NullPointerException if {@code key} or {@code value} is | |
| 617 | * {@code null} | |
| 618 | * @throws IllegalArgumentException if {@code count} is less than {@code 1} | |
| 619 | */ | |
| 620 | public Builder<V> put(final String key, final V value, final int count) { | |
| 621 | Objects.requireNonNull(key, "key"); | |
| 622 | Objects.requireNonNull(value, "value"); | |
| 623 | ||
| 624 |
2
1. put : changed conditional boundary → KILLED 2. put : negated conditional → KILLED |
if (count < 1) { // NOPMD |
| 625 | throw new IllegalArgumentException("count must be at least 1."); | |
| 626 | } | |
| 627 | ||
| 628 | MutableNode<V> current = this.root; | |
| 629 |
2
1. put : negated conditional → KILLED 2. put : changed conditional boundary → KILLED |
for (int index = 0; index < key.length(); index++) { |
| 630 | final Character edge = key.charAt(index); | |
| 631 | MutableNode<V> child = current.children().get(edge); | |
| 632 |
1
1. put : negated conditional → KILLED |
if (child == null) { |
| 633 | child = new MutableNode<>(); // NOPMD | |
| 634 | current.children().put(edge, child); | |
| 635 | } | |
| 636 | current = child; | |
| 637 | } | |
| 638 | ||
| 639 | final Integer previous = current.valueCounts().get(value); | |
| 640 |
1
1. put : negated conditional → KILLED |
if (previous == null) { |
| 641 | current.valueCounts().put(value, count); | |
| 642 | } else { | |
| 643 |
1
1. put : Replaced integer addition with subtraction → KILLED |
current.valueCounts().put(value, previous + count); |
| 644 | } | |
| 645 |
1
1. put : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::put → KILLED |
return this; |
| 646 | } | |
| 647 | ||
| 648 | /** | |
| 649 | * Returns the number of mutable build-time nodes currently reachable from the | |
| 650 | * builder root. | |
| 651 | * | |
| 652 | * <p> | |
| 653 | * This metric is intended mainly for diagnostics and tests that compare the | |
| 654 | * unreduced build-time structure with the final reduced compiled trie. | |
| 655 | * | |
| 656 | * @return number of mutable build-time nodes | |
| 657 | */ | |
| 658 | /* default */ int buildTimeSize() { | |
| 659 |
1
1. buildTimeSize : replaced int return with 0 for org/egothor/stemmer/FrequencyTrie$Builder::buildTimeSize → KILLED |
return countMutableNodes(this.root); |
| 660 | } | |
| 661 | ||
| 662 | /** | |
| 663 | * Counts mutable nodes recursively. | |
| 664 | * | |
| 665 | * @param node current node | |
| 666 | * @return reachable mutable node count | |
| 667 | */ | |
| 668 | private int countMutableNodes(final MutableNode<V> node) { | |
| 669 | int count = 1; | |
| 670 | for (MutableNode<V> child : node.children().values()) { | |
| 671 |
1
1. countMutableNodes : Replaced integer addition with subtraction → KILLED |
count += countMutableNodes(child); |
| 672 | } | |
| 673 |
1
1. countMutableNodes : replaced int return with 0 for org/egothor/stemmer/FrequencyTrie$Builder::countMutableNodes → KILLED |
return count; |
| 674 | } | |
| 675 | ||
| 676 | /** | |
| 677 | * Reduces a mutable node to a canonical reduced node. | |
| 678 | * | |
| 679 | * @param source source mutable node | |
| 680 | * @param context reduction context | |
| 681 | * @return canonical reduced node | |
| 682 | */ | |
| 683 | private ReducedNode<V> reduce(final MutableNode<V> source, final ReductionContext<V> context) { | |
| 684 | final Map<Character, ReducedNode<V>> reducedChildren = new LinkedHashMap<>(); | |
| 685 | ||
| 686 | for (Map.Entry<Character, MutableNode<V>> childEntry : source.children().entrySet()) { | |
| 687 | final ReducedNode<V> reducedChild = reduce(childEntry.getValue(), context); | |
| 688 | reducedChildren.put(childEntry.getKey(), reducedChild); | |
| 689 | } | |
| 690 | ||
| 691 | final Map<V, Integer> localCounts = copyCounts(source.valueCounts()); | |
| 692 | final LocalValueSummary<V> localSummary = LocalValueSummary.of(localCounts, this.arrayFactory); | |
| 693 | final ReductionSignature<V> signature = ReductionSignature.create(localSummary, reducedChildren, | |
| 694 | context.settings()); | |
| 695 | ||
| 696 | ReducedNode<V> canonical = context.lookup(signature); | |
| 697 |
1
1. reduce : negated conditional → KILLED |
if (canonical == null) { |
| 698 | canonical = new ReducedNode<>(signature, localCounts, reducedChildren); | |
| 699 |
1
1. reduce : removed call to org/egothor/stemmer/trie/ReductionContext::register → KILLED |
context.register(signature, canonical); |
| 700 |
1
1. reduce : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::reduce → KILLED |
return canonical; |
| 701 | } | |
| 702 | ||
| 703 |
1
1. reduce : removed call to org/egothor/stemmer/trie/ReducedNode::mergeLocalCounts → KILLED |
canonical.mergeLocalCounts(localCounts); |
| 704 |
1
1. reduce : removed call to org/egothor/stemmer/trie/ReducedNode::mergeChildren → SURVIVED |
canonical.mergeChildren(reducedChildren); |
| 705 | ||
| 706 |
1
1. reduce : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::reduce → KILLED |
return canonical; |
| 707 | } | |
| 708 | ||
| 709 | /** | |
| 710 | * Freezes a reduced node into an immutable compiled node. | |
| 711 | * | |
| 712 | * @param reducedNode reduced node | |
| 713 | * @param cache already frozen nodes | |
| 714 | * @return immutable compiled node | |
| 715 | */ | |
| 716 | private CompiledNode<V> freeze(final ReducedNode<V> reducedNode, | |
| 717 | final Map<ReducedNode<V>, CompiledNode<V>> cache) { | |
| 718 | final CompiledNode<V> existing = cache.get(reducedNode); | |
| 719 |
1
1. freeze : negated conditional → KILLED |
if (existing != null) { |
| 720 |
1
1. freeze : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::freeze → KILLED |
return existing; |
| 721 | } | |
| 722 | ||
| 723 | final LocalValueSummary<V> localSummary = LocalValueSummary.of(reducedNode.localCounts(), | |
| 724 | this.arrayFactory); | |
| 725 | ||
| 726 | final List<Map.Entry<Character, ReducedNode<V>>> childEntries = new ArrayList<>( | |
| 727 | reducedNode.children().entrySet()); | |
| 728 |
1
1. freeze : removed call to java/util/List::sort → KILLED |
childEntries.sort(Map.Entry.comparingByKey()); |
| 729 | ||
| 730 | final char[] edges = new char[childEntries.size()]; | |
| 731 | @SuppressWarnings("unchecked") | |
| 732 | final CompiledNode<V>[] childNodes = new CompiledNode[childEntries.size()]; | |
| 733 | ||
| 734 |
2
1. freeze : negated conditional → KILLED 2. freeze : changed conditional boundary → KILLED |
for (int index = 0; index < childEntries.size(); index++) { |
| 735 | final Map.Entry<Character, ReducedNode<V>> entry = childEntries.get(index); | |
| 736 | edges[index] = entry.getKey(); | |
| 737 | childNodes[index] = freeze(entry.getValue(), cache); | |
| 738 | } | |
| 739 | ||
| 740 | final CompiledNode<V> frozen = new CompiledNode<>(edges, childNodes, localSummary.orderedValues(), | |
| 741 | localSummary.orderedCounts()); | |
| 742 | cache.put(reducedNode, frozen); | |
| 743 |
1
1. freeze : replaced return value with null for org/egothor/stemmer/FrequencyTrie$Builder::freeze → KILLED |
return frozen; |
| 744 | } | |
| 745 | ||
| 746 | /** | |
| 747 | * Creates a shallow frequency copy preserving deterministic insertion order of | |
| 748 | * first occurrence. | |
| 749 | * | |
| 750 | * @param source source counts | |
| 751 | * @return copied counts | |
| 752 | */ | |
| 753 | private Map<V, Integer> copyCounts(final Map<V, Integer> source) { | |
| 754 |
1
1. copyCounts : replaced return value with Collections.emptyMap for org/egothor/stemmer/FrequencyTrie$Builder::copyCounts → KILLED |
return new LinkedHashMap<>(source); |
| 755 | } | |
| 756 | } | |
| 757 | ||
| 758 | /** | |
| 759 | * Codec used to persist values stored in the trie. | |
| 760 | * | |
| 761 | * @param <V> value type | |
| 762 | */ | |
| 763 | public interface ValueStreamCodec<V> { | |
| 764 | ||
| 765 | /** | |
| 766 | * Writes one value to the supplied data output. | |
| 767 | * | |
| 768 | * @param dataOutput target data output | |
| 769 | * @param value value to write | |
| 770 | * @throws IOException if writing fails | |
| 771 | */ | |
| 772 | void write(DataOutputStream dataOutput, V value) throws IOException; | |
| 773 | ||
| 774 | /** | |
| 775 | * Reads one value from the supplied data input. | |
| 776 | * | |
| 777 | * @param dataInput source data input | |
| 778 | * @return read value | |
| 779 | * @throws IOException if reading fails | |
| 780 | */ | |
| 781 | V read(DataInputStream dataInput) throws IOException; | |
| 782 | } | |
| 783 | ||
| 784 | } | |
Mutations | ||
| 150 |
1.1 2.2 |
|
| 153 |
1.1 |
|
| 178 |
1.1 2.2 |
|
| 179 |
1.1 |
|
| 181 |
1.1 |
|
| 209 |
1.1 2.2 |
|
| 214 |
1.1 2.2 |
|
| 217 |
1.1 |
|
| 226 |
1.1 |
|
| 251 |
1.1 |
|
| 259 |
1.1 |
|
| 265 |
1.1 |
|
| 266 |
1.1 |
|
| 267 |
1.1 |
|
| 268 |
1.1 |
|
| 271 |
1.1 |
|
| 274 |
1.1 |
|
| 299 |
1.1 |
|
| 306 |
1.1 |
|
| 311 |
1.1 |
|
| 316 |
1.1 2.2 |
|
| 321 |
1.1 2.2 3.3 4.4 |
|
| 332 |
1.1 |
|
| 348 |
1.1 |
|
| 349 |
1.1 |
|
| 362 |
1.1 |
|
| 371 |
1.1 |
|
| 386 |
1.1 |
|
| 387 |
1.1 2.2 |
|
| 388 |
1.1 |
|
| 390 |
1.1 |
|
| 393 |
1.1 |
|
| 396 |
1.1 |
|
| 397 |
1.1 2.2 |
|
| 398 |
1.1 |
|
| 399 |
1.1 |
|
| 419 |
1.1 2.2 |
|
| 421 |
1.1 2.2 |
|
| 428 |
1.1 2.2 |
|
| 434 |
1.1 2.2 |
|
| 441 |
1.1 2.2 |
|
| 444 |
1.1 2.2 |
|
| 456 |
1.1 2.2 |
|
| 464 |
1.1 2.2 |
|
| 468 |
1.1 2.2 |
|
| 470 |
1.1 2.2 3.3 4.4 |
|
| 478 |
1.1 |
|
| 489 |
1.1 2.2 |
|
| 491 |
1.1 |
|
| 495 |
1.1 |
|
| 571 |
1.1 |
|
| 594 |
1.1 |
|
| 624 |
1.1 2.2 |
|
| 629 |
1.1 2.2 |
|
| 632 |
1.1 |
|
| 640 |
1.1 |
|
| 643 |
1.1 |
|
| 645 |
1.1 |
|
| 659 |
1.1 |
|
| 671 |
1.1 |
|
| 673 |
1.1 |
|
| 697 |
1.1 |
|
| 699 |
1.1 |
|
| 700 |
1.1 |
|
| 703 |
1.1 |
|
| 704 |
1.1 |
|
| 706 |
1.1 |
|
| 719 |
1.1 |
|
| 720 |
1.1 |
|
| 728 |
1.1 |
|
| 734 |
1.1 2.2 |
|
| 743 |
1.1 |
|
| 754 |
1.1 |